From 77f79f0d8602f9422324911a50a361e1ba5beb43 Mon Sep 17 00:00:00 2001 From: flavis Date: Tue, 10 Aug 2021 15:40:21 +0200 Subject: [PATCH] change spelling of greens fraction, fix parsing issue in comments table --- R/analyze.R | 4 ++-- R/parse.R | 7 +++++-- R/repair.R | 42 +++++++++++++++++++++------------------ vignettes/interaction.Rmd | 4 ++-- 4 files changed, 32 insertions(+), 25 deletions(-) diff --git a/R/analyze.R b/R/analyze.R index 29d3399..84e7b19 100644 --- a/R/analyze.R +++ b/R/analyze.R @@ -48,14 +48,14 @@ party_colors <- c( FDP="#FEEB34", "CDU/CSU"="#000000", SPD="#DF0B25", - "BÜNDNIS 90 / DIE GRÜNEN"="#4A932B", + "BÜNDNIS 90/DIE GRÜNEN"="#4A932B", "DIE LINKE"="#BC3475", "AfD&Fraktionslos"="#AAAAFF", Fraktionslos="#AAAAAA" ) party_order <- factor(c("Fraktionslos", "AfD&Fraktionslos", - "DIE LINKE", "BÜNDNIS 90 / DIE GRÜNEN", "SPD", "CDU/CSU", + "DIE LINKE", "BÜNDNIS 90/DIE GRÜNEN", "SPD", "CDU/CSU", "FDP", "AfD", NA_character_)) #' Bar chart visualizing fraction based data diff --git a/R/parse.R b/R/parse.R index 58cc3df..d586aac 100644 --- a/R/parse.R +++ b/R/parse.R @@ -173,7 +173,8 @@ parse_speech <- function(speech_xml, date) { } fractionpattern <- "BÜNDNIS(SES)?\\W*90/DIE\\W*GRÜNEN|CDU/CSU|AfD|SPD|DIE LINKE|FDP|LINKEN" -fractionnames <- c("BÜNDNIS 90/DIE GRÜNEN", "CDU/CSU", "AfD", "SPD", "DIE LINKE", "FDP") +fractionnames <- c("BÜNDNIS 90/DIE GRÜNEN", "CDU/CSU", "AfD", "SPD", "DIE LINKE", "FDP", + "Fraktionslos") parse_comment <- function(comment, speech_id, on_speaker) { base <- c(speech_id = speech_id, on_speaker = on_speaker) @@ -187,7 +188,9 @@ parse_comment <- function(comment, speech_id, on_speaker) { c(base, type = "applause", fraction = by, commenter = NA_character_, content = comment) } else { ps <- str_match(comment, "(.*) \\[(.*?)\\]: (.*)")[1,] - c(base, type = "comment", fraction = ps[3], commenter = ps[2], content = ps[4]) + fraction <- agrep(ps[3], fractionnames, max=0.2, value=T) + if (all(is.na(fraction)) || length(fraction) == 0) fraction <- NA_character_ + c(base, type = "comment", fraction = fraction, commenter = ps[2], content = ps[4]) } } diff --git a/R/repair.R b/R/repair.R index b1fb8b8..b7ca3fc 100644 --- a/R/repair.R +++ b/R/repair.R @@ -1,7 +1,7 @@ fractions <- c("AFD" = "AfD", "AFD&FRAKTIONSLOS" = "AfD&Fraktionslos", - "BÜNDNIS90/" = "BÜNDNIS 90 / DIE GRÜNEN", - "BÜNDNIS90/DIEGRÜNEN" = "BÜNDNIS 90 / DIE GRÜNEN", + "BÜNDNIS90/" = "BÜNDNIS 90/DIE GRÜNEN", + "BÜNDNIS90/DIEGRÜNEN" = "BÜNDNIS 90/DIE GRÜNEN", "FRAKTIONSLOS" = "Fraktionslos", "DIELINKE" = "DIE LINKE", "SPD" = "SPD", @@ -81,35 +81,39 @@ lookup_speaker <- function(tb, speaker, name_variable) { mutate(speaker = Vectorize(find_match)(str_replace_all({{name_variable}}, tobereplaced, ""))) } -repair_comments <- function(comments, speaker) { - cat(paste0("Looking up speaker id's for names in comments. This may take a while ...\n", - "Use repair(, repair_commments = FALSE) to skip this.\n")) - # try to find a speaker id for each actual comment +repair_comments <- function(comments, speaker, lookup_speaker=F) { comments %>% - filter(!is.na(commenter)) %>% - lookup_speaker(speaker, commenter) %>% - left_join(comments, ., by="commenter") %>% - select(-commenter) + filter(!is.na(commenter) | !is.na(content) | !is.na(fraction)) -> + tb + if (lookup_speaker) { + cat(paste0("Looking up speaker id's for names in comments. This may take a while ...\n", + "Use repair(, lookup_speaker = FALSE) to skip this.\n")) + # try to find a speaker id for each actual comment + tb %>% + filter(!is.na(commenter)) %>% + lookup_speaker(speaker, commenter) %>% + left_join(tb, ., by="commenter") + } else tb } #' Repair parsed tables #' #' @param parse_output tibble -#' @param repair_comments bool +#' @param lookup_speaker bool #' -#' If repair_comments is TRUE, members of the parliament mentioned in comments are looked up in speaker table. +#' If lookup_speaker is TRUE, members of the parliament mentioned in comments are looked up in speaker table. #' #' Possible test: check identical(repair(res), repair(repair(res))) == TRUE #' Since repaired tables should be a fixpoint of repair. #' @export -repair <- function(parse_output, repair_comments = FALSE) { - +repair <- function(parse_output, lookup_speaker = FALSE) { + is_valid_res(parse_output) + stopifnot("lookup_speaker must be of type logical" = is.logical(lookup_speaker)) list(speaker = repair_speaker(parse_output$speaker), speeches = repair_speeches(parse_output$speeches), talks = repair_talks(parse_output$talks), - comments = if(repair_comments) repair_comments(parse_output$comments, - parse_output$speaker) - else parse_output$comments, - applause = parse_output$applause - ) + comments = repair_comments(parse_output$comments, + parse_output$speaker, + lookup_speaker), + applause = parse_output$applause) } diff --git a/vignettes/interaction.Rmd b/vignettes/interaction.Rmd index ed3cac0..99bbfa5 100644 --- a/vignettes/interaction.Rmd +++ b/vignettes/interaction.Rmd @@ -52,7 +52,7 @@ res$applause %>% group_by(on_fraction) %>% arrange(on_fraction) %>% summarize("AfD" = sum(`AfD`), - "BÜNDNIS 90 / DIE GRÜNEN" = sum(`BUENDNIS_90_DIE_GRUENEN`), + "BÜNDNIS 90/DIE GRÜNEN" = sum(`BUENDNIS_90_DIE_GRUENEN`), "CDU/CSU" = sum(`CDU_CSU`), "DIE LINKE" = sum(`DIE_LINKE`), "FDP" = sum(`FDP`), @@ -84,7 +84,7 @@ res$comments %>% select(by_fraction = fraction.x, on_fraction = fraction.y) %>% group_by(on_fraction) %>% summarize(`AfD` = sum(str_detect(by_fraction, "AfD"), na.rm=T), - `BÜNDNIS 90 / DIE GRÜNEN` = sum(str_detect(by_fraction, "BÜNDNIS 90/DIE GRÜNEN"), na.rm=T), + `BÜNDNIS 90/DIE GRÜNEN` = sum(str_detect(by_fraction, "BÜNDNIS 90/DIE GRÜNEN"), na.rm=T), `CDU/CSU` = sum(str_detect(by_fraction, "CDU/CSU"), na.rm = T), `DIE LINKE` = sum(str_detect(by_fraction, "DIE LINKE"), na.rm=T), `FDP` = sum(str_detect(by_fraction, "FDP"), na.rm=T),