From f09cbac172311b9cc158d18b34cd4744ebcabc02 Mon Sep 17 00:00:00 2001
From: flavis <christian@flavigny.de>
Date: Thu, 1 Jul 2021 11:05:50 +0200
Subject: [PATCH] improve comment parsing and add comment commentator redner
 matching

---
 R/parse.R  | 20 ++++++++++++++------
 R/repair.R | 46 ++++++++++++++++++++++++++++++++++++++++++----
 2 files changed, 56 insertions(+), 10 deletions(-)

diff --git a/R/parse.R b/R/parse.R
index de4de17..655c36b 100644
--- a/R/parse.R
+++ b/R/parse.R
@@ -28,9 +28,14 @@ read_all <- function(path="records/") {
         distinct() ->
         talks
 
+    lapply(res, `[[`, "comments") %>%
+        bind_rows() %>%
+        distinct() ->
+        comments
+
     if (length(available_protocols) == 0)
         warning("The given directory is empty or does not exist.")
-    list(redner = redner, reden = reden, talks = talks)
+    list(redner = redner, reden = reden, talks = talks, comments = comments)
 }
 
 # this reads all currently parseable data from one xml
@@ -99,7 +104,8 @@ parse_rede <- function(rede_xml) {
                               content = cur_content)
                     reden <- c(reden, list(rede))
                     cur_content <- ""
-                } else {
+                } 
+                if (is.na(principal_redner) && xml_name(node) != "name") {
                     principal_redner <- xml_child(node) %>% xml_attr("id")
                 }
                 if (xml_name(node) == "name") {
@@ -144,9 +150,10 @@ parse_comment <- function(comment, rede_id, on_redner) {
     # - actually separate content properly
     # - differentiate between [AfD] and AfD in by
     if(str_detect(comment, "Beifall")) {
-        c(base, type = "applause", by = by, content = comment)
+        c(base, type = "applause", fraktion = by, kommentator = NA_character_, content = comment)
     } else {
-        c(base, type = "comment", by = by, content = comment)
+        ps <- str_match(comment, "(.*) \\[(.*?)\\]: (.*)")[1,]
+        c(base, type = "comment", fraktion = ps[3], kommentator = ps[2], content = ps[4])
     }
 }
 
@@ -163,7 +170,8 @@ parse_redenliste <- function(redenliste_xml) {
          comments = tibble(rede_id = comments["rede_id",],
                            on_redner = comments["on_redner",],
                            type = comments["type",],
-                           by = comments["by",],
+                           fraktion = comments["fraktion",],
+                           kommentator = comments["kommentator",],
                            content = comments["content", ]))
 }
 
@@ -183,7 +191,7 @@ parse_rednerliste <- function(rednerliste_xml) {
 # EXAMPLE USE
 
 # make sure data ist downloaded via fetch.R
-res <- read_one("records/19126-data.xml")
+# res <- read_one("records/19126-data.xml")
 # 
 # res$redner
 # res$reden
diff --git a/R/repair.R b/R/repair.R
index e36f99b..b079f37 100644
--- a/R/repair.R
+++ b/R/repair.R
@@ -24,12 +24,19 @@ collect_unique <- function(xs) xs %>% clear_na() %>% unique() %>% str_c(collapse
 # expects a tibble of redner and repairs
 repair_redner <- function(redner) {
     if (nrow(redner) == 0) return(redner)
-    redner %>% mutate(fraktion = Vectorize(repair_fraktion)(fraktion)) %>% # fix fraktion
-        group_by(id, vorname, nachname) %>%
-        summarize(fraktion = collect_unique(fraktion),
+    redner %>%
+        filter(id != "10000") %>% # invalid id's
+        mutate(fraktion = Vectorize(repair_fraktion)(fraktion)) %>% # fix fraktion
+        group_by(id) %>%
+        summarize(vorname = head(vorname, 1),
+                  nachname = head(nachname, 1),
+                  fraktion = collect_unique(fraktion),
                   titel = longest_titel(titel),
                   rolle_kurz = collect_unique(str_squish(rolle_kurz)),
-                  rolle_lang = collect_unique(str_squish(rolle_lang)))
+                  rolle_lang = collect_unique(str_squish(rolle_lang))) %>%
+        ungroup() #%>%
+        # arrange(id) %>%
+        # distinct(vorname, nachname, fraktion, titel)
 }
 
 repair_reden <- function(reden) {
@@ -44,6 +51,37 @@ repair_talks <- function(talks) {
     talks
 }
 
+# tries to find the correct redner id given a name
+# this is sufficient since every prename lastname combination in the bundestag is
+# unique (luckily :D)
+# returns a lookup table
+lookup_redner <- function(comments, redner) {
+    tobereplaced <- "[-–—‑­­-­­­ ]"
+    redner %>%
+        unite(name, vorname, nachname, sep=".*") %>%
+        mutate(name = str_replace_all(name, tobereplaced, ".*")) ->
+        rs
+    find_match <- function(komm) {
+        if (komm == "") return (NA_character_)
+        # I tried with agrep (levensthein distance) but results are better that way
+        matches <- str_which(komm, rs$name)
+        if (length(matches) == 0) return(NA_character_)
+        rs[head(matches, 1), ]$id
+    }
+    comments %>%
+        distinct(kommentator) %>%
+        mutate(redner = Vectorize(find_match)(str_replace_all(kommentator, tobereplaced, "")))
+}
+
+repair_comments <- function(comments, redner) {
+    # try to find a redner id for each actual comment
+    comments %>%
+        filter(!is.na(kommentator)) %>%
+        lookup_redner(redner) %>%
+        left_join(comments, ., by="kommentator") %>%
+        select(-kommentator)
+}
+
 #' Repair parsed tables
 #'
 #' @export