christian
/
hateimparlament


			
				
					
						
						
							
							fraktionen <- c("AFD" = "AfD",
                "BÜNDNIS90/" = "BÜNDNIS 90 / DIE GRÜNEN",
                "BÜNDNIS90/DIEGRÜNEN" = "BÜNDNIS 90 / DIE GRÜNEN",
                "FRAKTIONSLOS" = "Fraktionslos",
                "DIELINKE" = "DIE LINKE",
                "SPD" = "SPD",
                "CDU/CSU" = "CDU/CSU",
                "FDP" = "FDP")

repair_fraktion <- function(fraktion) {
    cleaned <- str_to_upper %$% str_replace_all(fraktion, "\\s", "")
    fraktionen[cleaned]
}

# takes vector of titel and keeps longest
longest_titel <- function(titel) {
    if (all(is.na(titel))) NA_character_
    else titel[which.max %$% str_length(titel)]
}

# takes character vector, removes duplicates and collapses
collect_unique <- function(xs) xs %>% clear_na() %>% unique() %>% str_c(collapse="&") %>% na_if("")

# expects a tibble of speaker and repairs
repair_speaker <- function(speaker) {
    if (nrow(speaker) == 0) return(speaker)
    speaker %>%
        filter(id != "10000") %>% # invalid id's
        mutate(fraktion = Vectorize(repair_fraktion)(fraktion)) %>% # fix fraktion
        group_by(id) %>%
        summarize(vorname = head(vorname, 1),
                  nachname = head(nachname, 1),
                  fraktion = collect_unique(fraktion),
                  titel = longest_titel(titel),
                  rolle_kurz = collect_unique(str_squish(rolle_kurz)),
                  rolle_lang = collect_unique(str_squish(rolle_lang))) %>%
        ungroup() #%>%
        # arrange(id) %>%
        # distinct(vorname, nachname, fraktion, titel)
}

repair_speeches <- function(speeches) {
    if (nrow(speeches) == 0) return(speeches)
    # TODO: fill with content
    speeches
}

repair_talks <- function(talks) {
    if (nrow(talks) == 0) return(talks)
    # ignore all talks which have empty content
    filter(talks, str_length(content) > 0)
}

# tries to find the correct speaker id given a name
# this is sufficient since every prename lastname combination in the bundestag is
# unique (luckily :D)
# returns a lookup table
lookup_speaker <- function(comments, speaker) {
    tobereplaced <- "[-–—‑- ]"
    speaker %>%
        unite(name, vorname, nachname, sep=".*") %>%
        mutate(name = str_replace_all(name, tobereplaced, ".*")) ->
        rs
    find_match <- function(komm) {
        if (komm == "") return (NA_character_)
        # I tried with agrep (levensthein distance) but results are better that way
        matches <- str_which(komm, rs$name)
        if (length(matches) == 0) return(NA_character_)
        rs[head(matches, 1), ]$id
    }
    comments %>%
        distinct(kommentator) %>%
        mutate(speaker = Vectorize(find_match)(str_replace_all(kommentator, tobereplaced, "")))
}

repair_comments <- function(comments, speaker) {
    # try to find a speaker id for each actual comment
    comments %>%
        filter(!is.na(kommentator)) %>%
        lookup_speaker(speaker) %>%
        left_join(comments, ., by="kommentator") %>%
        select(-kommentator)
}

#' Repair parsed tables
#'
#' @export
repair <- function(parse_output) {
    list(speaker = repair_speaker(parse_output$speaker),
         speeches = repair_speeches(parse_output$speeches),
         talks = repair_talks(parse_output$talks),
         #comments = repair_comments(parse_output$comments)
         comments = parse_output$comments,
         applause = parse_output$applause
         )
}