JosuaKugler 4 лет назад
Родитель
Сommit
5f9343bf7f
3 измененных файлов: 40 добавлений и 33 удалений
  1. +4
    -16
      R/parse.R
  2. +35
    -16
      R/repair.R
  3. +1
    -1
      vignettes/funwithdata.Rmd

+ 4
- 16
R/parse.R Просмотреть файл

@@ -13,6 +13,9 @@ read_all <- function(path="records/") {
available_protocols <- list.files(path) available_protocols <- list.files(path)
res <- pblapply(available_protocols, read_one, path=path) res <- pblapply(available_protocols, read_one, path=path)


if (length(available_protocols) == 0)
stop("The given directory is empty or does not exist.")

lapply(res, `[[`, "speaker") %>% lapply(res, `[[`, "speaker") %>%
bind_rows() %>% bind_rows() %>%
distinct() -> distinct() ->
@@ -34,9 +37,6 @@ read_all <- function(path="records/") {
distinct() -> distinct() ->
commentsandapplause commentsandapplause


if (length(available_protocols) == 0)
warning("The given directory is empty or does not exist.")

filter(commentsandapplause, type == "comment") %>% filter(commentsandapplause, type == "comment") %>%
select(-type) -> select(-type) ->
comments comments
@@ -46,7 +46,7 @@ read_all <- function(path="records/") {
"SPD" = str_detect(fraction, "SPD"), "SPD" = str_detect(fraction, "SPD"),
"FDP" = str_detect(fraction, "FDP"), "FDP" = str_detect(fraction, "FDP"),
"DIE_LINKE" = str_detect(fraction, "DIE LINKE"), "DIE_LINKE" = str_detect(fraction, "DIE LINKE"),
"BÜNDNIS_90_DIE_GRÜNEN" = str_detect(fraction, "BÜNDNIS 90/DIE GRÜNEN"),
"BUENDNIS_90_DIE_GRUENEN" = str_detect(fraction, "BÜNDNIS 90/DIE GRÜNEN"),
"AfD" = str_detect(fraction, "AfD")) %>% "AfD" = str_detect(fraction, "AfD")) %>%
select(-fraction) -> select(-fraction) ->
applause applause
@@ -227,15 +227,3 @@ read_from_csv <- function(path="csv/") {
comments = tibble %$% read.table(str_c(path, "comments.csv")), comments = tibble %$% read.table(str_c(path, "comments.csv")),
applause = tibble %$% read.table(str_c(path, "applause.csv"))) applause = tibble %$% read.table(str_c(path, "applause.csv")))
} }

# -------------------------------
# EXAMPLE USE

# make sure data ist downloaded via fetch.R
# res <- read_one("records/19126-data.xml")
#
# res$speaker
# res$speeches
# res$talks

# -------------------------------

+ 35
- 16
R/repair.R Просмотреть файл

@@ -1,4 +1,5 @@
fractions <- c("AFD" = "AfD", fractions <- c("AFD" = "AfD",
"AFD&FRAKTIONSLOS" = "AfD&Fraktionslos",
"BÜNDNIS90/" = "BÜNDNIS 90 / DIE GRÜNEN", "BÜNDNIS90/" = "BÜNDNIS 90 / DIE GRÜNEN",
"BÜNDNIS90/DIEGRÜNEN" = "BÜNDNIS 90 / DIE GRÜNEN", "BÜNDNIS90/DIEGRÜNEN" = "BÜNDNIS 90 / DIE GRÜNEN",
"FRAKTIONSLOS" = "Fraktionslos", "FRAKTIONSLOS" = "Fraktionslos",
@@ -51,11 +52,20 @@ repair_talks <- function(talks) {
filter(talks, str_length(content) > 0) filter(talks, str_length(content) > 0)
} }


# tries to find the correct speaker id given a name
# this is sufficient since every prename lastname combination in the bundestag is
# unique (luckily :D)
# returns a lookup table
lookup_speaker <- function(comments, speaker) {
#' Lookup name in speakers table
#'
#' Tries to find the correct speaker id given a name.
#' This is sufficient since every prename lastname combination in the bundestag is
#' unique (luckily :D)
#'
#' @param tb tibble
#' @param redner tibble
#' @param name_variable name
#'
#' Tries to match the name_variable column with speaker names
#'
#' returns a lookup table
lookup_speaker <- function(tb, speaker, name_variable) {
tobereplaced <- "[-–—‑­­-­­­ ]" tobereplaced <- "[-–—‑­­-­­­ ]"
speaker %>% speaker %>%
unite(name, vorname, nachname, sep=".*") %>% unite(name, vorname, nachname, sep=".*") %>%
@@ -68,29 +78,38 @@ lookup_speaker <- function(comments, speaker) {
if (length(matches) == 0) return(NA_character_) if (length(matches) == 0) return(NA_character_)
rs[head(matches, 1), ]$id rs[head(matches, 1), ]$id
} }
comments %>%
distinct(kommentator) %>%
mutate(speaker = Vectorize(find_match)(str_replace_all(kommentator, tobereplaced, "")))
tb %>%
distinct({{name_variable}}) %>%
mutate(speaker = Vectorize(find_match)(str_replace_all({{name_variable}}, tobereplaced, "")))
} }


repair_comments <- function(comments, speaker) {
# try to find a speaker id for each actual comment
repair_comments <- function(comments, redner) {
cat(paste0("Looking up speaker id's for names in comments. This may take a while ...\n",
"Use repair(, repair_commments = FALSE) to skip this.\n"))
# try to find a redner id for each actual comment
comments %>% comments %>%
filter(!is.na(kommentator)) %>% filter(!is.na(kommentator)) %>%
lookup_speaker(speaker) %>%
lookup_redner(redner, kommentator) %>%
left_join(comments, ., by="kommentator") %>% left_join(comments, ., by="kommentator") %>%
select(-kommentator) select(-kommentator)
} }


#' Repair parsed tables #' Repair parsed tables
#' #'
#' TODO: Explain repair_comments argument
#' (if TRUE, we try to lookup redner names in redner table)
#'
#' Possible test: check identical(repair(res), repair(repair(res))) == TRUE
#' Since repaired tables should be a fixpoint of repair.
#' @export #' @export
repair <- function(parse_output) {
list(speaker = repair_speaker(parse_output$speaker),
speeches = repair_speeches(parse_output$speeches),
repair <- function(parse_output, repair_comments = FALSE) {

list(redner = repair_speaker(parse_output$speaker),
reden = repair_speeches(parse_output$speeches),
talks = repair_talks(parse_output$talks), talks = repair_talks(parse_output$talks),
#comments = repair_comments(parse_output$comments)
comments = parse_output$comments,
comments = if(repair_comments) repair_comments(parse_output$comments,
parse_output$speaker)
else parse_output$comments,
applause = parse_output$applause applause = parse_output$applause
) )
} }

+ 1
- 1
vignettes/funwithdata.Rmd Просмотреть файл

@@ -107,7 +107,7 @@ res$applause %>%
group_by(on_fraction) %>% group_by(on_fraction) %>%
arrange(on_fraction) %>% arrange(on_fraction) %>%
summarize("AfD" = sum(`AfD`), summarize("AfD" = sum(`AfD`),
"BÜNDNIS 90 / DIE GRÜNEN" = sum(`BÜNDNIS_90_DIE_GRÜNEN`),
"BÜNDNIS 90 / DIE GRÜNEN" = sum(`BUENDNIS_90_DIE_GRUENEN`),
"CDU/CSU" = sum(`CDU_CSU`), "CDU/CSU" = sum(`CDU_CSU`),
"DIE LINKE" = sum(`DIE_LINKE`), "DIE LINKE" = sum(`DIE_LINKE`),
"FDP" = sum(`FDP`), "FDP" = sum(`FDP`),


Загрузка…
Отмена
Сохранить