fractions <- c("AFD" = "AfD", "AFD&FRAKTIONSLOS" = "AfD&Fraktionslos", "B\u00DCNDNIS90/" = "B\u00DCNDNIS 90/DIE GR\u00DCNEN", "B\u00DCNDNIS90/DIEGR\u00DCNEN" = "B\u00DCNDNIS 90/DIE GR\u00DCNEN", "FRAKTIONSLOS" = "Fraktionslos", "DIELINKE" = "DIE LINKE", "SPD" = "SPD", "CDU/CSU" = "CDU/CSU", "FDP" = "FDP") repair_fraction <- function(fraction) { cleaned <- str_to_upper %$% str_replace_all(fraction, "\\s", "") fractions[cleaned] } # takes vector of titel and keeps longest longest_titel <- function(titel) { if (all(is.na(titel))) NA_character_ else titel[which.max %$% str_length(titel)] } # takes character vector, removes duplicates and collapses collect_unique <- function(xs) xs %>% clear_na() %>% unique() %>% str_c(collapse="&") %>% na_if("") # expects a tibble of speaker and repairs repair_speaker <- function(speaker) { if (nrow(speaker) == 0) return(speaker) speaker %>% filter(id != "10000") %>% # invalid id's mutate(fraction = Vectorize(repair_fraction)(fraction)) %>% # fix fraction group_by(id) %>% summarize(prename = head(prename, 1), lastname = head(lastname, 1), fraction = collect_unique(fraction), title = longest_titel(title), role_short = collect_unique(str_squish(role_short)), role_long = collect_unique(str_squish(role_long))) %>% ungroup() #%>% } repair_speeches <- function(speeches) { if (nrow(speeches) == 0) return(speeches) # TODO: fill with content speeches } repair_talks <- function(talks) { if (nrow(talks) == 0) return(talks) # ignore all talks which have empty content filter(talks, str_length(content) > 0) } #' Lookup name in speakers table #' #' Tries to find the correct speaker id given a name. #' This is sufficient since every prename lastname combination in the bundestag is #' unique (luckily :D) #' #' @param tb tibble #' @param speaker tibble #' @param name_variable name #' #' Tries to match the name_variable column with speaker names #' #' returns a lookup table lookup_speaker <- function(tb, speaker, name_variable) { tobereplaced <- "[\u002D\u2013\u2014\u2011\u00AD ]" speaker %>% unite(name, prename, lastname, sep=".*") %>% mutate(name = str_replace_all(name, tobereplaced, ".*")) -> rs find_match <- function(komm) { if (komm == "") return (NA_character_) # I tried with agrep (levensthein distance) but results are better that way matches <- str_which(komm, rs$name) if (length(matches) == 0) return(NA_character_) rs[head(matches, 1), ]$id } tb %>% distinct({{name_variable}}) %>% mutate(speaker = Vectorize(find_match)(str_replace_all({{name_variable}}, tobereplaced, ""))) } repair_comments <- function(comments, speaker, lookup_speaker=F) { comments %>% filter(!is.na(commenter) | !is.na(content) | !is.na(fraction)) -> tb if (lookup_speaker) { cat(paste0("Looking up speaker id's for names in comments. This may take a while ...\n", "Use repair(, lookup_speaker = FALSE) to skip this.\n")) # try to find a speaker id for each actual comment tb %>% filter(!is.na(commenter)) %>% lookup_speaker(speaker, commenter) %>% left_join(tb, ., by="commenter") } else tb } #' Repair parsed tables #' #' @param parse_output tibble #' @param lookup_speaker bool #' #' If lookup_speaker is TRUE, members of the parliament mentioned in comments are looked up in speaker table. #' #' Possible test: check identical(repair(res), repair(repair(res))) == TRUE #' Since repaired tables should be a fixpoint of repair. #' @export repair <- function(parse_output, lookup_speaker = FALSE) { is_valid_res(parse_output) stopifnot("lookup_speaker must be of type logical" = is.logical(lookup_speaker)) list(speaker = repair_speaker(parse_output$speaker), speeches = repair_speeches(parse_output$speeches), talks = repair_talks(parse_output$talks), comments = repair_comments(parse_output$comments, parse_output$speaker, lookup_speaker), applause = parse_output$applause) }