From 64ede5c757ae12ec4332c303c1517c9bf41e21c3 Mon Sep 17 00:00:00 2001 From: flavis Date: Tue, 3 Aug 2021 12:48:01 +0200 Subject: [PATCH 1/4] =?UTF-8?q?replace=20=C3=9C=20with=20UE=20in=20colname?= =?UTF-8?q?s=20in=20applause,=20move=20check=20of=20empty=20directory=20to?= =?UTF-8?q?=20beginning=20in=20read=5Fall?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- R/parse.R | 8 ++++---- vignettes/funwithdata.Rmd | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/R/parse.R b/R/parse.R index 40b6045..603c358 100644 --- a/R/parse.R +++ b/R/parse.R @@ -13,6 +13,9 @@ read_all <- function(path="records/") { available_protocols <- list.files(path) res <- pblapply(available_protocols, read_one, path=path) + if (length(available_protocols) == 0) + stop("The given directory is empty or does not exist.") + lapply(res, `[[`, "redner") %>% bind_rows() %>% distinct() -> @@ -34,9 +37,6 @@ read_all <- function(path="records/") { distinct() -> commentsandapplause - if (length(available_protocols) == 0) - warning("The given directory is empty or does not exist.") - filter(commentsandapplause, type == "comment") %>% select(-type) -> comments @@ -46,7 +46,7 @@ read_all <- function(path="records/") { "SPD" = str_detect(fraktion, "SPD"), "FDP" = str_detect(fraktion, "FDP"), "DIE_LINKE" = str_detect(fraktion, "DIE LINKE"), - "BÜNDNIS_90_DIE_GRÜNEN" = str_detect(fraktion, "BÜNDNIS 90/DIE GRÜNEN"), + "BUENDNIS_90_DIE_GRUENEN" = str_detect(fraktion, "BÜNDNIS 90/DIE GRÜNEN"), "AfD" = str_detect(fraktion, "AfD")) %>% select(-fraktion) -> applause diff --git a/vignettes/funwithdata.Rmd b/vignettes/funwithdata.Rmd index d5dded1..fbfe4af 100644 --- a/vignettes/funwithdata.Rmd +++ b/vignettes/funwithdata.Rmd @@ -107,7 +107,7 @@ res$applause %>% group_by(on_fraktion) %>% arrange(on_fraktion) %>% summarize("AfD" = sum(`AfD`), - "BÜNDNIS 90 / DIE GRÜNEN" = sum(`BÜNDNIS_90_DIE_GRÜNEN`), + "BÜNDNIS 90 / DIE GRÜNEN" = sum(`BUENDNIS_90_DIE_GRUENEN`), "CDU/CSU" = sum(`CDU_CSU`), "DIE LINKE" = sum(`DIE_LINKE`), "FDP" = sum(`FDP`), From f91456116fa1a06d689ec938e2c9990681d35a07 Mon Sep 17 00:00:00 2001 From: flavis Date: Tue, 3 Aug 2021 12:48:32 +0200 Subject: [PATCH 2/4] remove outdated example code --- R/parse.R | 12 ------------ 1 file changed, 12 deletions(-) diff --git a/R/parse.R b/R/parse.R index 603c358..ca428c3 100644 --- a/R/parse.R +++ b/R/parse.R @@ -227,15 +227,3 @@ read_from_csv <- function(path="csv/") { comments = tibble %$% read.table(str_c(path, "comments.csv")), applause = tibble %$% read.table(str_c(path, "applause.csv"))) } - -# ------------------------------- -# EXAMPLE USE - -# make sure data ist downloaded via fetch.R -# res <- read_one("records/19126-data.xml") -# -# res$redner -# res$reden -# res$talks - -# ------------------------------- From 7771b2ebd8678e19a3c397392840c3cb43719b87 Mon Sep 17 00:00:00 2001 From: flavis Date: Tue, 3 Aug 2021 13:26:39 +0200 Subject: [PATCH 3/4] add option to lookup redner in repair and fix fraction list to ensure repaired tables are fixpoints of repair(, repair_comments = FALSE) --- R/repair.R | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/R/repair.R b/R/repair.R index 21d5738..ba62357 100644 --- a/R/repair.R +++ b/R/repair.R @@ -1,4 +1,5 @@ fraktionen <- c("AFD" = "AfD", + "AFD&FRAKTIONSLOS" = "AfD&Fraktionslos", "BÜNDNIS90/" = "BÜNDNIS 90 / DIE GRÜNEN", "BÜNDNIS90/DIEGRÜNEN" = "BÜNDNIS 90 / DIE GRÜNEN", "FRAKTIONSLOS" = "Fraktionslos", @@ -84,13 +85,20 @@ repair_comments <- function(comments, redner) { #' Repair parsed tables #' +#' TODO: Explain repair_comments argument +#' (if TRUE, we try to lookup redner names in redner table) +#' +#' Possible test: check identical(repair(res), repair(repair(res))) == TRUE +#' Since repaired tables should be a fixpoint of repair. #' @export -repair <- function(parse_output) { +repair <- function(parse_output, repair_comments = FALSE) { + list(redner = repair_redner(parse_output$redner), reden = repair_reden(parse_output$reden), talks = repair_talks(parse_output$talks), - #comments = repair_comments(parse_output$comments) - comments = parse_output$comments, + comments = if(repair_comments) repair_comments(parse_output$comments, + parse_output$redner) + else parse_output$comments, applause = parse_output$applause ) } From 5dc308c16e1fcfeab0c65681345191c8d620d7a3 Mon Sep 17 00:00:00 2001 From: flavis Date: Tue, 3 Aug 2021 13:40:02 +0200 Subject: [PATCH 4/4] generalize lookup_redner helper to allow lookup of names in arbitrary tables, add info text --- R/repair.R | 29 ++++++++++++++++++++--------- 1 file changed, 20 insertions(+), 9 deletions(-) diff --git a/R/repair.R b/R/repair.R index ba62357..e1bfdb9 100644 --- a/R/repair.R +++ b/R/repair.R @@ -52,11 +52,20 @@ repair_talks <- function(talks) { filter(talks, str_length(content) > 0) } -# tries to find the correct redner id given a name -# this is sufficient since every prename lastname combination in the bundestag is -# unique (luckily :D) -# returns a lookup table -lookup_redner <- function(comments, redner) { +#' Lookup name in speakers table +#' +#' Tries to find the correct speaker id given a name. +#' This is sufficient since every prename lastname combination in the bundestag is +#' unique (luckily :D) +#' +#' @param tb tibble +#' @param redner tibble +#' @param name_variable name +#' +#' Tries to match the name_variable column with speaker names +#' +#' returns a lookup table +lookup_redner <- function(tb, redner, name_variable) { tobereplaced <- "[-–—‑­­-­­­ ]" redner %>% unite(name, vorname, nachname, sep=".*") %>% @@ -69,16 +78,18 @@ lookup_redner <- function(comments, redner) { if (length(matches) == 0) return(NA_character_) rs[head(matches, 1), ]$id } - comments %>% - distinct(kommentator) %>% - mutate(redner = Vectorize(find_match)(str_replace_all(kommentator, tobereplaced, ""))) + tb %>% + distinct({{name_variable}}) %>% + mutate(redner = Vectorize(find_match)(str_replace_all({{name_variable}}, tobereplaced, ""))) } repair_comments <- function(comments, redner) { + cat(paste0("Looking up speaker id's for names in comments. This may take a while ...\n", + "Use repair(, repair_commments = FALSE) to skip this.\n")) # try to find a redner id for each actual comment comments %>% filter(!is.na(kommentator)) %>% - lookup_redner(redner) %>% + lookup_redner(redner, kommentator) %>% left_join(comments, ., by="kommentator") %>% select(-kommentator) }