From 77f79f0d8602f9422324911a50a361e1ba5beb43 Mon Sep 17 00:00:00 2001
From: flavis <christian@flavigny.de>
Date: Tue, 10 Aug 2021 15:40:21 +0200
Subject: [PATCH] change spelling of greens fraction, fix parsing issue in
 comments table

---
 R/analyze.R               |  4 ++--
 R/parse.R                 |  7 +++++--
 R/repair.R                | 42 +++++++++++++++++++++------------------
 vignettes/interaction.Rmd |  4 ++--
 4 files changed, 32 insertions(+), 25 deletions(-)

diff --git a/R/analyze.R b/R/analyze.R
index 29d3399..84e7b19 100644
--- a/R/analyze.R
+++ b/R/analyze.R
@@ -48,14 +48,14 @@ party_colors <- c(
   FDP="#FEEB34",
   "CDU/CSU"="#000000", 
   SPD="#DF0B25",
-  "BÜNDNIS 90 / DIE GRÜNEN"="#4A932B",
+  "BÜNDNIS 90/DIE GRÜNEN"="#4A932B",
   "DIE LINKE"="#BC3475",
   "AfD&Fraktionslos"="#AAAAFF",
   Fraktionslos="#AAAAAA"
 )
 
 party_order <- factor(c("Fraktionslos", "AfD&Fraktionslos",
-                        "DIE LINKE", "BÜNDNIS 90 / DIE GRÜNEN", "SPD", "CDU/CSU",
+                        "DIE LINKE", "BÜNDNIS 90/DIE GRÜNEN", "SPD", "CDU/CSU",
                         "FDP", "AfD", NA_character_))
 
 #' Bar chart visualizing fraction based data
diff --git a/R/parse.R b/R/parse.R
index 58cc3df..d586aac 100644
--- a/R/parse.R
+++ b/R/parse.R
@@ -173,7 +173,8 @@ parse_speech <- function(speech_xml, date) {
 }
 
 fractionpattern <- "BÜNDNIS(SES)?\\W*90/DIE\\W*GRÜNEN|CDU/CSU|AfD|SPD|DIE LINKE|FDP|LINKEN"
-fractionnames <- c("BÜNDNIS 90/DIE GRÜNEN", "CDU/CSU", "AfD", "SPD", "DIE LINKE", "FDP")
+fractionnames <- c("BÜNDNIS 90/DIE GRÜNEN", "CDU/CSU", "AfD", "SPD", "DIE LINKE", "FDP",
+                   "Fraktionslos")
 
 parse_comment <- function(comment, speech_id, on_speaker) {
     base <- c(speech_id = speech_id, on_speaker = on_speaker)
@@ -187,7 +188,9 @@ parse_comment <- function(comment, speech_id, on_speaker) {
         c(base, type = "applause", fraction = by, commenter = NA_character_, content = comment)
     } else {
         ps <- str_match(comment, "(.*) \\[(.*?)\\]: (.*)")[1,]
-        c(base, type = "comment", fraction = ps[3], commenter = ps[2], content = ps[4])
+        fraction <- agrep(ps[3], fractionnames, max=0.2, value=T)
+        if (all(is.na(fraction)) || length(fraction) == 0) fraction <- NA_character_
+        c(base, type = "comment", fraction = fraction, commenter = ps[2], content = ps[4])
     }
 }
 
diff --git a/R/repair.R b/R/repair.R
index b1fb8b8..b7ca3fc 100644
--- a/R/repair.R
+++ b/R/repair.R
@@ -1,7 +1,7 @@
 fractions <- c("AFD" = "AfD",
                 "AFD&FRAKTIONSLOS" = "AfD&Fraktionslos",
-                "BÜNDNIS90/" = "BÜNDNIS 90 / DIE GRÜNEN",
-                "BÜNDNIS90/DIEGRÜNEN" = "BÜNDNIS 90 / DIE GRÜNEN",
+                "BÜNDNIS90/" = "BÜNDNIS 90/DIE GRÜNEN",
+                "BÜNDNIS90/DIEGRÜNEN" = "BÜNDNIS 90/DIE GRÜNEN",
                 "FRAKTIONSLOS" = "Fraktionslos",
                 "DIELINKE" = "DIE LINKE",
                 "SPD" = "SPD",
@@ -81,35 +81,39 @@ lookup_speaker <- function(tb, speaker, name_variable) {
         mutate(speaker = Vectorize(find_match)(str_replace_all({{name_variable}}, tobereplaced, "")))
 }
 
-repair_comments <- function(comments, speaker) {
-    cat(paste0("Looking up speaker id's for names in comments. This may take a while ...\n",
-               "Use repair(, repair_commments = FALSE) to skip this.\n"))
-    # try to find a speaker id for each actual comment
+repair_comments <- function(comments, speaker, lookup_speaker=F) {
     comments %>%
-        filter(!is.na(commenter)) %>%
-        lookup_speaker(speaker, commenter) %>%
-        left_join(comments, ., by="commenter") %>%
-        select(-commenter)
+        filter(!is.na(commenter) | !is.na(content) | !is.na(fraction)) ->
+        tb
+    if (lookup_speaker) {
+        cat(paste0("Looking up speaker id's for names in comments. This may take a while ...\n",
+                   "Use repair(, lookup_speaker = FALSE) to skip this.\n"))
+        # try to find a speaker id for each actual comment
+        tb %>%
+            filter(!is.na(commenter)) %>%
+            lookup_speaker(speaker, commenter) %>%
+            left_join(tb, ., by="commenter")
+    } else tb
 }
 
 #' Repair parsed tables
 #'
 #' @param parse_output tibble
-#' @param repair_comments bool
+#' @param lookup_speaker bool
 #' 
-#' If repair_comments is TRUE, members of the parliament mentioned in comments are looked up in speaker table.
+#' If lookup_speaker is TRUE, members of the parliament mentioned in comments are looked up in speaker table.
 #'
 #' Possible test: check identical(repair(res), repair(repair(res))) == TRUE
 #' Since repaired tables should be a fixpoint of repair.
 #' @export
-repair <- function(parse_output, repair_comments = FALSE) {
-
+repair <- function(parse_output, lookup_speaker = FALSE) {
+    is_valid_res(parse_output)
+    stopifnot("lookup_speaker must be of type logical" = is.logical(lookup_speaker))
     list(speaker = repair_speaker(parse_output$speaker),
          speeches = repair_speeches(parse_output$speeches),
          talks = repair_talks(parse_output$talks),
-         comments = if(repair_comments) repair_comments(parse_output$comments,
-                                                        parse_output$speaker)
-                    else parse_output$comments,
-         applause = parse_output$applause
-         )
+         comments = repair_comments(parse_output$comments,
+                                    parse_output$speaker,
+                                    lookup_speaker),
+         applause = parse_output$applause)
 }
diff --git a/vignettes/interaction.Rmd b/vignettes/interaction.Rmd
index ed3cac0..99bbfa5 100644
--- a/vignettes/interaction.Rmd
+++ b/vignettes/interaction.Rmd
@@ -52,7 +52,7 @@ res$applause %>%
     group_by(on_fraction) %>%
     arrange(on_fraction) %>%
     summarize("AfD" = sum(`AfD`),
-              "BÜNDNIS 90 / DIE GRÜNEN" = sum(`BUENDNIS_90_DIE_GRUENEN`),
+              "BÜNDNIS 90/DIE GRÜNEN" = sum(`BUENDNIS_90_DIE_GRUENEN`),
               "CDU/CSU" = sum(`CDU_CSU`),
               "DIE LINKE" = sum(`DIE_LINKE`),
               "FDP" = sum(`FDP`),
@@ -84,7 +84,7 @@ res$comments %>%
     select(by_fraction = fraction.x, on_fraction = fraction.y) %>%
     group_by(on_fraction) %>%
     summarize(`AfD` = sum(str_detect(by_fraction, "AfD"), na.rm=T),
-              `BÜNDNIS 90 / DIE GRÜNEN` = sum(str_detect(by_fraction, "BÜNDNIS 90/DIE GRÜNEN"), na.rm=T),
+              `BÜNDNIS 90/DIE GRÜNEN` = sum(str_detect(by_fraction, "BÜNDNIS 90/DIE GRÜNEN"), na.rm=T),
               `CDU/CSU` = sum(str_detect(by_fraction, "CDU/CSU"), na.rm = T),
               `DIE LINKE` = sum(str_detect(by_fraction, "DIE LINKE"), na.rm=T),
               `FDP` = sum(str_detect(by_fraction, "FDP"), na.rm=T),