From f09cbac172311b9cc158d18b34cd4744ebcabc02 Mon Sep 17 00:00:00 2001 From: flavis Date: Thu, 1 Jul 2021 11:05:50 +0200 Subject: [PATCH] improve comment parsing and add comment commentator redner matching --- R/parse.R | 20 ++++++++++++++------ R/repair.R | 46 ++++++++++++++++++++++++++++++++++++++++++---- 2 files changed, 56 insertions(+), 10 deletions(-) diff --git a/R/parse.R b/R/parse.R index de4de17..655c36b 100644 --- a/R/parse.R +++ b/R/parse.R @@ -28,9 +28,14 @@ read_all <- function(path="records/") { distinct() -> talks + lapply(res, `[[`, "comments") %>% + bind_rows() %>% + distinct() -> + comments + if (length(available_protocols) == 0) warning("The given directory is empty or does not exist.") - list(redner = redner, reden = reden, talks = talks) + list(redner = redner, reden = reden, talks = talks, comments = comments) } # this reads all currently parseable data from one xml @@ -99,7 +104,8 @@ parse_rede <- function(rede_xml) { content = cur_content) reden <- c(reden, list(rede)) cur_content <- "" - } else { + } + if (is.na(principal_redner) && xml_name(node) != "name") { principal_redner <- xml_child(node) %>% xml_attr("id") } if (xml_name(node) == "name") { @@ -144,9 +150,10 @@ parse_comment <- function(comment, rede_id, on_redner) { # - actually separate content properly # - differentiate between [AfD] and AfD in by if(str_detect(comment, "Beifall")) { - c(base, type = "applause", by = by, content = comment) + c(base, type = "applause", fraktion = by, kommentator = NA_character_, content = comment) } else { - c(base, type = "comment", by = by, content = comment) + ps <- str_match(comment, "(.*) \\[(.*?)\\]: (.*)")[1,] + c(base, type = "comment", fraktion = ps[3], kommentator = ps[2], content = ps[4]) } } @@ -163,7 +170,8 @@ parse_redenliste <- function(redenliste_xml) { comments = tibble(rede_id = comments["rede_id",], on_redner = comments["on_redner",], type = comments["type",], - by = comments["by",], + fraktion = comments["fraktion",], + kommentator = comments["kommentator",], content = comments["content", ])) } @@ -183,7 +191,7 @@ parse_rednerliste <- function(rednerliste_xml) { # EXAMPLE USE # make sure data ist downloaded via fetch.R -res <- read_one("records/19126-data.xml") +# res <- read_one("records/19126-data.xml") # # res$redner # res$reden diff --git a/R/repair.R b/R/repair.R index e36f99b..b079f37 100644 --- a/R/repair.R +++ b/R/repair.R @@ -24,12 +24,19 @@ collect_unique <- function(xs) xs %>% clear_na() %>% unique() %>% str_c(collapse # expects a tibble of redner and repairs repair_redner <- function(redner) { if (nrow(redner) == 0) return(redner) - redner %>% mutate(fraktion = Vectorize(repair_fraktion)(fraktion)) %>% # fix fraktion - group_by(id, vorname, nachname) %>% - summarize(fraktion = collect_unique(fraktion), + redner %>% + filter(id != "10000") %>% # invalid id's + mutate(fraktion = Vectorize(repair_fraktion)(fraktion)) %>% # fix fraktion + group_by(id) %>% + summarize(vorname = head(vorname, 1), + nachname = head(nachname, 1), + fraktion = collect_unique(fraktion), titel = longest_titel(titel), rolle_kurz = collect_unique(str_squish(rolle_kurz)), - rolle_lang = collect_unique(str_squish(rolle_lang))) + rolle_lang = collect_unique(str_squish(rolle_lang))) %>% + ungroup() #%>% + # arrange(id) %>% + # distinct(vorname, nachname, fraktion, titel) } repair_reden <- function(reden) { @@ -44,6 +51,37 @@ repair_talks <- function(talks) { talks } +# tries to find the correct redner id given a name +# this is sufficient since every prename lastname combination in the bundestag is +# unique (luckily :D) +# returns a lookup table +lookup_redner <- function(comments, redner) { + tobereplaced <- "[-–—‑­­-­­­ ]" + redner %>% + unite(name, vorname, nachname, sep=".*") %>% + mutate(name = str_replace_all(name, tobereplaced, ".*")) -> + rs + find_match <- function(komm) { + if (komm == "") return (NA_character_) + # I tried with agrep (levensthein distance) but results are better that way + matches <- str_which(komm, rs$name) + if (length(matches) == 0) return(NA_character_) + rs[head(matches, 1), ]$id + } + comments %>% + distinct(kommentator) %>% + mutate(redner = Vectorize(find_match)(str_replace_all(kommentator, tobereplaced, ""))) +} + +repair_comments <- function(comments, redner) { + # try to find a redner id for each actual comment + comments %>% + filter(!is.na(kommentator)) %>% + lookup_redner(redner) %>% + left_join(comments, ., by="kommentator") %>% + select(-kommentator) +} + #' Repair parsed tables #' #' @export