| @@ -28,9 +28,14 @@ read_all <- function(path="records/") { | |||
| distinct() -> | |||
| talks | |||
| lapply(res, `[[`, "comments") %>% | |||
| bind_rows() %>% | |||
| distinct() -> | |||
| comments | |||
| if (length(available_protocols) == 0) | |||
| warning("The given directory is empty or does not exist.") | |||
| list(redner = redner, reden = reden, talks = talks) | |||
| list(redner = redner, reden = reden, talks = talks, comments = comments) | |||
| } | |||
| # this reads all currently parseable data from one xml | |||
| @@ -99,7 +104,8 @@ parse_rede <- function(rede_xml) { | |||
| content = cur_content) | |||
| reden <- c(reden, list(rede)) | |||
| cur_content <- "" | |||
| } else { | |||
| } | |||
| if (is.na(principal_redner) && xml_name(node) != "name") { | |||
| principal_redner <- xml_child(node) %>% xml_attr("id") | |||
| } | |||
| if (xml_name(node) == "name") { | |||
| @@ -144,9 +150,10 @@ parse_comment <- function(comment, rede_id, on_redner) { | |||
| # - actually separate content properly | |||
| # - differentiate between [AfD] and AfD in by | |||
| if(str_detect(comment, "Beifall")) { | |||
| c(base, type = "applause", by = by, content = comment) | |||
| c(base, type = "applause", fraktion = by, kommentator = NA_character_, content = comment) | |||
| } else { | |||
| c(base, type = "comment", by = by, content = comment) | |||
| ps <- str_match(comment, "(.*) \\[(.*?)\\]: (.*)")[1,] | |||
| c(base, type = "comment", fraktion = ps[3], kommentator = ps[2], content = ps[4]) | |||
| } | |||
| } | |||
| @@ -163,7 +170,8 @@ parse_redenliste <- function(redenliste_xml) { | |||
| comments = tibble(rede_id = comments["rede_id",], | |||
| on_redner = comments["on_redner",], | |||
| type = comments["type",], | |||
| by = comments["by",], | |||
| fraktion = comments["fraktion",], | |||
| kommentator = comments["kommentator",], | |||
| content = comments["content", ])) | |||
| } | |||
| @@ -183,7 +191,7 @@ parse_rednerliste <- function(rednerliste_xml) { | |||
| # EXAMPLE USE | |||
| # make sure data ist downloaded via fetch.R | |||
| res <- read_one("records/19126-data.xml") | |||
| # res <- read_one("records/19126-data.xml") | |||
| # | |||
| # res$redner | |||
| # res$reden | |||
| @@ -24,12 +24,19 @@ collect_unique <- function(xs) xs %>% clear_na() %>% unique() %>% str_c(collapse | |||
| # expects a tibble of redner and repairs | |||
| repair_redner <- function(redner) { | |||
| if (nrow(redner) == 0) return(redner) | |||
| redner %>% mutate(fraktion = Vectorize(repair_fraktion)(fraktion)) %>% # fix fraktion | |||
| group_by(id, vorname, nachname) %>% | |||
| summarize(fraktion = collect_unique(fraktion), | |||
| redner %>% | |||
| filter(id != "10000") %>% # invalid id's | |||
| mutate(fraktion = Vectorize(repair_fraktion)(fraktion)) %>% # fix fraktion | |||
| group_by(id) %>% | |||
| summarize(vorname = head(vorname, 1), | |||
| nachname = head(nachname, 1), | |||
| fraktion = collect_unique(fraktion), | |||
| titel = longest_titel(titel), | |||
| rolle_kurz = collect_unique(str_squish(rolle_kurz)), | |||
| rolle_lang = collect_unique(str_squish(rolle_lang))) | |||
| rolle_lang = collect_unique(str_squish(rolle_lang))) %>% | |||
| ungroup() #%>% | |||
| # arrange(id) %>% | |||
| # distinct(vorname, nachname, fraktion, titel) | |||
| } | |||
| repair_reden <- function(reden) { | |||
| @@ -44,6 +51,37 @@ repair_talks <- function(talks) { | |||
| talks | |||
| } | |||
| # tries to find the correct redner id given a name | |||
| # this is sufficient since every prename lastname combination in the bundestag is | |||
| # unique (luckily :D) | |||
| # returns a lookup table | |||
| lookup_redner <- function(comments, redner) { | |||
| tobereplaced <- "[-–—‑- ]" | |||
| redner %>% | |||
| unite(name, vorname, nachname, sep=".*") %>% | |||
| mutate(name = str_replace_all(name, tobereplaced, ".*")) -> | |||
| rs | |||
| find_match <- function(komm) { | |||
| if (komm == "") return (NA_character_) | |||
| # I tried with agrep (levensthein distance) but results are better that way | |||
| matches <- str_which(komm, rs$name) | |||
| if (length(matches) == 0) return(NA_character_) | |||
| rs[head(matches, 1), ]$id | |||
| } | |||
| comments %>% | |||
| distinct(kommentator) %>% | |||
| mutate(redner = Vectorize(find_match)(str_replace_all(kommentator, tobereplaced, ""))) | |||
| } | |||
| repair_comments <- function(comments, redner) { | |||
| # try to find a redner id for each actual comment | |||
| comments %>% | |||
| filter(!is.na(kommentator)) %>% | |||
| lookup_redner(redner) %>% | |||
| left_join(comments, ., by="kommentator") %>% | |||
| select(-kommentator) | |||
| } | |||
| #' Repair parsed tables | |||
| #' | |||
| #' @export | |||