diff --git a/R/parse.R b/R/parse.R index 53f73c6..db99eb5 100644 --- a/R/parse.R +++ b/R/parse.R @@ -13,6 +13,9 @@ read_all <- function(path="records/") { available_protocols <- list.files(path) res <- pblapply(available_protocols, read_one, path=path) + if (length(available_protocols) == 0) + stop("The given directory is empty or does not exist.") + lapply(res, `[[`, "speaker") %>% bind_rows() %>% distinct() -> @@ -34,9 +37,6 @@ read_all <- function(path="records/") { distinct() -> commentsandapplause - if (length(available_protocols) == 0) - warning("The given directory is empty or does not exist.") - filter(commentsandapplause, type == "comment") %>% select(-type) -> comments @@ -46,7 +46,7 @@ read_all <- function(path="records/") { "SPD" = str_detect(fraction, "SPD"), "FDP" = str_detect(fraction, "FDP"), "DIE_LINKE" = str_detect(fraction, "DIE LINKE"), - "BÜNDNIS_90_DIE_GRÜNEN" = str_detect(fraction, "BÜNDNIS 90/DIE GRÜNEN"), + "BUENDNIS_90_DIE_GRUENEN" = str_detect(fraction, "BÜNDNIS 90/DIE GRÜNEN"), "AfD" = str_detect(fraction, "AfD")) %>% select(-fraction) -> applause @@ -227,15 +227,3 @@ read_from_csv <- function(path="csv/") { comments = tibble %$% read.table(str_c(path, "comments.csv")), applause = tibble %$% read.table(str_c(path, "applause.csv"))) } - -# ------------------------------- -# EXAMPLE USE - -# make sure data ist downloaded via fetch.R -# res <- read_one("records/19126-data.xml") -# -# res$speaker -# res$speeches -# res$talks - -# ------------------------------- diff --git a/R/repair.R b/R/repair.R index cd7bb37..3dfdece 100644 --- a/R/repair.R +++ b/R/repair.R @@ -1,4 +1,5 @@ fractions <- c("AFD" = "AfD", + "AFD&FRAKTIONSLOS" = "AfD&Fraktionslos", "BÜNDNIS90/" = "BÜNDNIS 90 / DIE GRÜNEN", "BÜNDNIS90/DIEGRÜNEN" = "BÜNDNIS 90 / DIE GRÜNEN", "FRAKTIONSLOS" = "Fraktionslos", @@ -51,11 +52,20 @@ repair_talks <- function(talks) { filter(talks, str_length(content) > 0) } -# tries to find the correct speaker id given a name -# this is sufficient since every prename lastname combination in the bundestag is -# unique (luckily :D) -# returns a lookup table -lookup_speaker <- function(comments, speaker) { +#' Lookup name in speakers table +#' +#' Tries to find the correct speaker id given a name. +#' This is sufficient since every prename lastname combination in the bundestag is +#' unique (luckily :D) +#' +#' @param tb tibble +#' @param redner tibble +#' @param name_variable name +#' +#' Tries to match the name_variable column with speaker names +#' +#' returns a lookup table +lookup_speaker <- function(tb, speaker, name_variable) { tobereplaced <- "[-–—‑­­-­­­ ]" speaker %>% unite(name, vorname, nachname, sep=".*") %>% @@ -68,29 +78,38 @@ lookup_speaker <- function(comments, speaker) { if (length(matches) == 0) return(NA_character_) rs[head(matches, 1), ]$id } - comments %>% - distinct(kommentator) %>% - mutate(speaker = Vectorize(find_match)(str_replace_all(kommentator, tobereplaced, ""))) + tb %>% + distinct({{name_variable}}) %>% + mutate(speaker = Vectorize(find_match)(str_replace_all({{name_variable}}, tobereplaced, ""))) } -repair_comments <- function(comments, speaker) { - # try to find a speaker id for each actual comment +repair_comments <- function(comments, redner) { + cat(paste0("Looking up speaker id's for names in comments. This may take a while ...\n", + "Use repair(, repair_commments = FALSE) to skip this.\n")) + # try to find a redner id for each actual comment comments %>% filter(!is.na(kommentator)) %>% - lookup_speaker(speaker) %>% + lookup_redner(redner, kommentator) %>% left_join(comments, ., by="kommentator") %>% select(-kommentator) } #' Repair parsed tables #' +#' TODO: Explain repair_comments argument +#' (if TRUE, we try to lookup redner names in redner table) +#' +#' Possible test: check identical(repair(res), repair(repair(res))) == TRUE +#' Since repaired tables should be a fixpoint of repair. #' @export -repair <- function(parse_output) { - list(speaker = repair_speaker(parse_output$speaker), - speeches = repair_speeches(parse_output$speeches), +repair <- function(parse_output, repair_comments = FALSE) { + + list(redner = repair_speaker(parse_output$speaker), + reden = repair_speeches(parse_output$speeches), talks = repair_talks(parse_output$talks), - #comments = repair_comments(parse_output$comments) - comments = parse_output$comments, + comments = if(repair_comments) repair_comments(parse_output$comments, + parse_output$speaker) + else parse_output$comments, applause = parse_output$applause ) } diff --git a/vignettes/funwithdata.Rmd b/vignettes/funwithdata.Rmd index 6b6c3e1..b9a2e69 100644 --- a/vignettes/funwithdata.Rmd +++ b/vignettes/funwithdata.Rmd @@ -107,7 +107,7 @@ res$applause %>% group_by(on_fraction) %>% arrange(on_fraction) %>% summarize("AfD" = sum(`AfD`), - "BÜNDNIS 90 / DIE GRÜNEN" = sum(`BÜNDNIS_90_DIE_GRÜNEN`), + "BÜNDNIS 90 / DIE GRÜNEN" = sum(`BUENDNIS_90_DIE_GRUENEN`), "CDU/CSU" = sum(`CDU_CSU`), "DIE LINKE" = sum(`DIE_LINKE`), "FDP" = sum(`FDP`),