|
- fraktionen <- c("AFD" = "AfD",
- "AFD&FRAKTIONSLOS" = "AfD&Fraktionslos",
- "BÜNDNIS90/" = "BÜNDNIS 90 / DIE GRÜNEN",
- "BÜNDNIS90/DIEGRÜNEN" = "BÜNDNIS 90 / DIE GRÜNEN",
- "FRAKTIONSLOS" = "Fraktionslos",
- "DIELINKE" = "DIE LINKE",
- "SPD" = "SPD",
- "CDU/CSU" = "CDU/CSU",
- "FDP" = "FDP")
-
- repair_fraktion <- function(fraktion) {
- cleaned <- str_to_upper %$% str_replace_all(fraktion, "\\s", "")
- fraktionen[cleaned]
- }
-
- # takes vector of titel and keeps longest
- longest_titel <- function(titel) {
- if (all(is.na(titel))) NA_character_
- else titel[which.max %$% str_length(titel)]
- }
-
- # takes character vector, removes duplicates and collapses
- collect_unique <- function(xs) xs %>% clear_na() %>% unique() %>% str_c(collapse="&") %>% na_if("")
-
- # expects a tibble of redner and repairs
- repair_redner <- function(redner) {
- if (nrow(redner) == 0) return(redner)
- redner %>%
- filter(id != "10000") %>% # invalid id's
- mutate(fraktion = Vectorize(repair_fraktion)(fraktion)) %>% # fix fraktion
- group_by(id) %>%
- summarize(vorname = head(vorname, 1),
- nachname = head(nachname, 1),
- fraktion = collect_unique(fraktion),
- titel = longest_titel(titel),
- rolle_kurz = collect_unique(str_squish(rolle_kurz)),
- rolle_lang = collect_unique(str_squish(rolle_lang))) %>%
- ungroup() #%>%
- # arrange(id) %>%
- # distinct(vorname, nachname, fraktion, titel)
- }
-
- repair_reden <- function(reden) {
- if (nrow(reden) == 0) return(reden)
- # TODO: fill with content
- reden
- }
-
- repair_talks <- function(talks) {
- if (nrow(talks) == 0) return(talks)
- # ignore all talks which have empty content
- filter(talks, str_length(content) > 0)
- }
-
- #' Lookup name in speakers table
- #'
- #' Tries to find the correct speaker id given a name.
- #' This is sufficient since every prename lastname combination in the bundestag is
- #' unique (luckily :D)
- #'
- #' @param tb tibble
- #' @param redner tibble
- #' @param name_variable name
- #'
- #' Tries to match the name_variable column with speaker names
- #'
- #' returns a lookup table
- lookup_redner <- function(tb, redner, name_variable) {
- tobereplaced <- "[-–—‑- ]"
- redner %>%
- unite(name, vorname, nachname, sep=".*") %>%
- mutate(name = str_replace_all(name, tobereplaced, ".*")) ->
- rs
- find_match <- function(komm) {
- if (komm == "") return (NA_character_)
- # I tried with agrep (levensthein distance) but results are better that way
- matches <- str_which(komm, rs$name)
- if (length(matches) == 0) return(NA_character_)
- rs[head(matches, 1), ]$id
- }
- tb %>%
- distinct({{name_variable}}) %>%
- mutate(redner = Vectorize(find_match)(str_replace_all({{name_variable}}, tobereplaced, "")))
- }
-
- repair_comments <- function(comments, redner) {
- cat(paste0("Looking up speaker id's for names in comments. This may take a while ...\n",
- "Use repair(, repair_commments = FALSE) to skip this.\n"))
- # try to find a redner id for each actual comment
- comments %>%
- filter(!is.na(kommentator)) %>%
- lookup_redner(redner, kommentator) %>%
- left_join(comments, ., by="kommentator") %>%
- select(-kommentator)
- }
-
- #' Repair parsed tables
- #'
- #' TODO: Explain repair_comments argument
- #' (if TRUE, we try to lookup redner names in redner table)
- #'
- #' Possible test: check identical(repair(res), repair(repair(res))) == TRUE
- #' Since repaired tables should be a fixpoint of repair.
- #' @export
- repair <- function(parse_output, repair_comments = FALSE) {
-
- list(redner = repair_redner(parse_output$redner),
- reden = repair_reden(parse_output$reden),
- talks = repair_talks(parse_output$talks),
- comments = if(repair_comments) repair_comments(parse_output$comments,
- parse_output$redner)
- else parse_output$comments,
- applause = parse_output$applause
- )
- }
|