fraktionen <- c("AFD" = "AfD", "BÜNDNIS90/" = "BÜNDNIS 90 / DIE GRÜNEN", "BÜNDNIS90/DIEGRÜNEN" = "BÜNDNIS 90 / DIE GRÜNEN", "FRAKTIONSLOS" = "Fraktionslos", "DIELINKE" = "DIE LINKE", "SPD" = "SPD", "CDU/CSU" = "CDU/CSU", "FDP" = "FDP") repair_fraktion <- function(fraktion) { cleaned <- str_to_upper %$% str_replace_all(fraktion, "\\s", "") fraktionen[cleaned] } # takes vector of titel and keeps longest longest_titel <- function(titel) { if (all(is.na(titel))) NA_character_ else titel[which.max %$% str_length(titel)] } # takes character vector, removes duplicates and collapses collect_unique <- function(xs) xs %>% clear_na() %>% unique() %>% str_c(collapse="&") %>% na_if("") # expects a tibble of redner and repairs repair_redner <- function(redner) { if (nrow(redner) == 0) return(redner) redner %>% filter(id != "10000") %>% # invalid id's mutate(fraktion = Vectorize(repair_fraktion)(fraktion)) %>% # fix fraktion group_by(id) %>% summarize(vorname = head(vorname, 1), nachname = head(nachname, 1), fraktion = collect_unique(fraktion), titel = longest_titel(titel), rolle_kurz = collect_unique(str_squish(rolle_kurz)), rolle_lang = collect_unique(str_squish(rolle_lang))) %>% ungroup() #%>% # arrange(id) %>% # distinct(vorname, nachname, fraktion, titel) } repair_reden <- function(reden) { if (nrow(reden) == 0) return(reden) # TODO: fill with content reden } repair_talks <- function(talks) { if (nrow(talks) == 0) return(talks) # ignore all talks which have empty content filter(talks, str_length(content) > 0) } # tries to find the correct redner id given a name # this is sufficient since every prename lastname combination in the bundestag is # unique (luckily :D) # returns a lookup table lookup_redner <- function(comments, redner) { tobereplaced <- "[-–—‑­­-­­­ ]" redner %>% unite(name, vorname, nachname, sep=".*") %>% mutate(name = str_replace_all(name, tobereplaced, ".*")) -> rs find_match <- function(komm) { if (komm == "") return (NA_character_) # I tried with agrep (levensthein distance) but results are better that way matches <- str_which(komm, rs$name) if (length(matches) == 0) return(NA_character_) rs[head(matches, 1), ]$id } comments %>% distinct(kommentator) %>% mutate(redner = Vectorize(find_match)(str_replace_all(kommentator, tobereplaced, ""))) } repair_comments <- function(comments, redner) { # try to find a redner id for each actual comment comments %>% filter(!is.na(kommentator)) %>% lookup_redner(redner) %>% left_join(comments, ., by="kommentator") %>% select(-kommentator) } #' Repair parsed tables #' #' @export repair <- function(parse_output) { list(redner = repair_redner(parse_output$redner), reden = repair_reden(parse_output$reden), talks = repair_talks(parse_output$talks), #comments = repair_comments(parse_output$comments) comments = parse_output$comments, applause = parse_output$applause ) }