fractions <- c("AFD" = "AfD", "AFD&FRAKTIONSLOS" = "AfD&Fraktionslos", "BÜNDNIS90/" = "BÜNDNIS 90 / DIE GRÜNEN", "BÜNDNIS90/DIEGRÜNEN" = "BÜNDNIS 90 / DIE GRÜNEN", "FRAKTIONSLOS" = "Fraktionslos", "DIELINKE" = "DIE LINKE", "SPD" = "SPD", "CDU/CSU" = "CDU/CSU", "FDP" = "FDP") repair_fraction <- function(fraction) { cleaned <- str_to_upper %$% str_replace_all(fraction, "\\s", "") fractions[cleaned] } # takes vector of titel and keeps longest longest_titel <- function(titel) { if (all(is.na(titel))) NA_character_ else titel[which.max %$% str_length(titel)] } # takes character vector, removes duplicates and collapses collect_unique <- function(xs) xs %>% clear_na() %>% unique() %>% str_c(collapse="&") %>% na_if("") # expects a tibble of speaker and repairs repair_speaker <- function(speaker) { if (nrow(speaker) == 0) return(speaker) speaker %>% filter(id != "10000") %>% # invalid id's mutate(fraction = Vectorize(repair_fraction)(fraction)) %>% # fix fraction group_by(id) %>% summarize(vorname = head(vorname, 1), nachname = head(nachname, 1), fraction = collect_unique(fraction), titel = longest_titel(titel), rolle_kurz = collect_unique(str_squish(rolle_kurz)), rolle_lang = collect_unique(str_squish(rolle_lang))) %>% ungroup() #%>% # arrange(id) %>% # distinct(vorname, nachname, fraction, titel) } repair_speeches <- function(speeches) { if (nrow(speeches) == 0) return(speeches) # TODO: fill with content speeches } repair_talks <- function(talks) { if (nrow(talks) == 0) return(talks) # ignore all talks which have empty content filter(talks, str_length(content) > 0) } #' Lookup name in speakers table #' #' Tries to find the correct speaker id given a name. #' This is sufficient since every prename lastname combination in the bundestag is #' unique (luckily :D) #' #' @param tb tibble #' @param speaker tibble #' @param name_variable name #' #' Tries to match the name_variable column with speaker names #' #' returns a lookup table lookup_speaker <- function(tb, speaker, name_variable) { tobereplaced <- "[-–—‑­­-­­­ ]" speaker %>% unite(name, vorname, nachname, sep=".*") %>% mutate(name = str_replace_all(name, tobereplaced, ".*")) -> rs find_match <- function(komm) { if (komm == "") return (NA_character_) # I tried with agrep (levensthein distance) but results are better that way matches <- str_which(komm, rs$name) if (length(matches) == 0) return(NA_character_) rs[head(matches, 1), ]$id } tb %>% distinct({{name_variable}}) %>% mutate(speaker = Vectorize(find_match)(str_replace_all({{name_variable}}, tobereplaced, ""))) } repair_comments <- function(comments, speaker) { cat(paste0("Looking up speaker id's for names in comments. This may take a while ...\n", "Use repair(, repair_commments = FALSE) to skip this.\n")) # try to find a speaker id for each actual comment comments %>% filter(!is.na(kommentator)) %>% lookup_speaker(speaker, kommentator) %>% left_join(comments, ., by="kommentator") %>% select(-kommentator) } #' Repair parsed tables #' #' @param parse_output tibble #' @param repair_comments bool #' #' If repair_comments is TRUE, members of the parliament mentioned in comments are looked up in speaker table. #' #' Possible test: check identical(repair(res), repair(repair(res))) == TRUE #' Since repaired tables should be a fixpoint of repair. #' @export repair <- function(parse_output, repair_comments = FALSE) { list(speaker = repair_speaker(parse_output$speaker), speeches = repair_speeches(parse_output$speeches), talks = repair_talks(parse_output$talks), comments = if(repair_comments) repair_comments(parse_output$comments, parse_output$speaker) else parse_output$comments, applause = parse_output$applause ) }