|
- fractions <- c("AFD" = "AfD",
- "AFD&FRAKTIONSLOS" = "AfD&Fraktionslos",
- "BÜNDNIS90/" = "BÜNDNIS 90/DIE GRÜNEN",
- "BÜNDNIS90/DIEGRÜNEN" = "BÜNDNIS 90/DIE GRÜNEN",
- "FRAKTIONSLOS" = "Fraktionslos",
- "DIELINKE" = "DIE LINKE",
- "SPD" = "SPD",
- "CDU/CSU" = "CDU/CSU",
- "FDP" = "FDP")
-
- repair_fraction <- function(fraction) {
- cleaned <- str_to_upper %$% str_replace_all(fraction, "\\s", "")
- fractions[cleaned]
- }
-
- # takes vector of titel and keeps longest
- longest_titel <- function(titel) {
- if (all(is.na(titel))) NA_character_
- else titel[which.max %$% str_length(titel)]
- }
-
- # takes character vector, removes duplicates and collapses
- collect_unique <- function(xs) xs %>% clear_na() %>% unique() %>% str_c(collapse="&") %>% na_if("")
-
- # expects a tibble of speaker and repairs
- repair_speaker <- function(speaker) {
- if (nrow(speaker) == 0) return(speaker)
- speaker %>%
- filter(id != "10000") %>% # invalid id's
- mutate(fraction = Vectorize(repair_fraction)(fraction)) %>% # fix fraction
- group_by(id) %>%
- summarize(prename = head(prename, 1),
- lastname = head(lastname, 1),
- fraction = collect_unique(fraction),
- title = longest_titel(title),
- role_short = collect_unique(str_squish(role_short)),
- role_long = collect_unique(str_squish(role_long))) %>%
- ungroup() #%>%
- }
-
- repair_speeches <- function(speeches) {
- if (nrow(speeches) == 0) return(speeches)
- # TODO: fill with content
- speeches
- }
-
- repair_talks <- function(talks) {
- if (nrow(talks) == 0) return(talks)
- # ignore all talks which have empty content
- filter(talks, str_length(content) > 0)
- }
-
- #' Lookup name in speakers table
- #'
- #' Tries to find the correct speaker id given a name.
- #' This is sufficient since every prename lastname combination in the bundestag is
- #' unique (luckily :D)
- #'
- #' @param tb tibble
- #' @param speaker tibble
- #' @param name_variable name
- #'
- #' Tries to match the name_variable column with speaker names
- #'
- #' returns a lookup table
- lookup_speaker <- function(tb, speaker, name_variable) {
- tobereplaced <- "[-–—‑- ]"
- speaker %>%
- unite(name, prename, lastname, sep=".*") %>%
- mutate(name = str_replace_all(name, tobereplaced, ".*")) ->
- rs
- find_match <- function(komm) {
- if (komm == "") return (NA_character_)
- # I tried with agrep (levensthein distance) but results are better that way
- matches <- str_which(komm, rs$name)
- if (length(matches) == 0) return(NA_character_)
- rs[head(matches, 1), ]$id
- }
- tb %>%
- distinct({{name_variable}}) %>%
- mutate(speaker = Vectorize(find_match)(str_replace_all({{name_variable}}, tobereplaced, "")))
- }
-
- repair_comments <- function(comments, speaker, lookup_speaker=F) {
- comments %>%
- filter(!is.na(commenter) | !is.na(content) | !is.na(fraction)) ->
- tb
- if (lookup_speaker) {
- cat(paste0("Looking up speaker id's for names in comments. This may take a while ...\n",
- "Use repair(, lookup_speaker = FALSE) to skip this.\n"))
- # try to find a speaker id for each actual comment
- tb %>%
- filter(!is.na(commenter)) %>%
- lookup_speaker(speaker, commenter) %>%
- left_join(tb, ., by="commenter")
- } else tb
- }
-
- #' Repair parsed tables
- #'
- #' @param parse_output tibble
- #' @param lookup_speaker bool
- #'
- #' If lookup_speaker is TRUE, members of the parliament mentioned in comments are looked up in speaker table.
- #'
- #' Possible test: check identical(repair(res), repair(repair(res))) == TRUE
- #' Since repaired tables should be a fixpoint of repair.
- #' @export
- repair <- function(parse_output, lookup_speaker = FALSE) {
- is_valid_res(parse_output)
- stopifnot("lookup_speaker must be of type logical" = is.logical(lookup_speaker))
- list(speaker = repair_speaker(parse_output$speaker),
- speeches = repair_speeches(parse_output$speeches),
- talks = repair_talks(parse_output$talks),
- comments = repair_comments(parse_output$comments,
- parse_output$speaker,
- lookup_speaker),
- applause = parse_output$applause)
- }
|