improve comment parsing and add comment commentator redner matching

4 år sedan · f09cbac172
--- a/R/parse.R
+++ b/R/parse.R
@@ -28,9 +28,14 @@ read_all <- function(path="records/") {
        distinct() ->
        talks

    lapply(res, `[[`, "comments") %>%
        bind_rows() %>%
        distinct() ->
        comments

    if (length(available_protocols) == 0)
        warning("The given directory is empty or does not exist.")
    list(redner = redner, reden = reden, talks = talks)
    list(redner = redner, reden = reden, talks = talks, comments = comments)
 }

 # this reads all currently parseable data from one xml
@@ -99,7 +104,8 @@ parse_rede <- function(rede_xml) {
                              content = cur_content)
                    reden <- c(reden, list(rede))
                    cur_content <- ""
                } else {
                } 
                if (is.na(principal_redner) && xml_name(node) != "name") {
                    principal_redner <- xml_child(node) %>% xml_attr("id")
                }
                if (xml_name(node) == "name") {
@@ -144,9 +150,10 @@ parse_comment <- function(comment, rede_id, on_redner) {
    # - actually separate content properly
    # - differentiate between [AfD] and AfD in by
    if(str_detect(comment, "Beifall")) {
        c(base, type = "applause", by = by, content = comment)
        c(base, type = "applause", fraktion = by, kommentator = NA_character_, content = comment)
    } else {
        c(base, type = "comment", by = by, content = comment)
        ps <- str_match(comment, "(.*) \\[(.*?)\\]: (.*)")[1,]
        c(base, type = "comment", fraktion = ps[3], kommentator = ps[2], content = ps[4])
    }
 }

@@ -163,7 +170,8 @@ parse_redenliste <- function(redenliste_xml) {
         comments = tibble(rede_id = comments["rede_id",],
                           on_redner = comments["on_redner",],
                           type = comments["type",],
                           by = comments["by",],
                           fraktion = comments["fraktion",],
                           kommentator = comments["kommentator",],
                           content = comments["content", ]))
 }

@@ -183,7 +191,7 @@ parse_rednerliste <- function(rednerliste_xml) {
 # EXAMPLE USE

 # make sure data ist downloaded via fetch.R
 res <- read_one("records/19126-data.xml")
 # res <- read_one("records/19126-data.xml")
 # 
 # res$redner
 # res$reden
--- a/R/repair.R
+++ b/R/repair.R
@@ -24,12 +24,19 @@ collect_unique <- function(xs) xs %>% clear_na() %>% unique() %>% str_c(collapse
 # expects a tibble of redner and repairs
 repair_redner <- function(redner) {
    if (nrow(redner) == 0) return(redner)
    redner %>% mutate(fraktion = Vectorize(repair_fraktion)(fraktion)) %>% # fix fraktion
        group_by(id, vorname, nachname) %>%
        summarize(fraktion = collect_unique(fraktion),
    redner %>%
        filter(id != "10000") %>% # invalid id's
        mutate(fraktion = Vectorize(repair_fraktion)(fraktion)) %>% # fix fraktion
        group_by(id) %>%
        summarize(vorname = head(vorname, 1),
                  nachname = head(nachname, 1),
                  fraktion = collect_unique(fraktion),
                  titel = longest_titel(titel),
                  rolle_kurz = collect_unique(str_squish(rolle_kurz)),
                  rolle_lang = collect_unique(str_squish(rolle_lang)))
                  rolle_lang = collect_unique(str_squish(rolle_lang))) %>%
        ungroup() #%>%
        # arrange(id) %>%
        # distinct(vorname, nachname, fraktion, titel)
 }

 repair_reden <- function(reden) {
@@ -44,6 +51,37 @@ repair_talks <- function(talks) {
    talks
 }

 # tries to find the correct redner id given a name
 # this is sufficient since every prename lastname combination in the bundestag is
 # unique (luckily :D)
 # returns a lookup table
 lookup_redner <- function(comments, redner) {
    tobereplaced <- "[-–—‑- ]"
    redner %>%
        unite(name, vorname, nachname, sep=".*") %>%
        mutate(name = str_replace_all(name, tobereplaced, ".*")) ->
        rs
    find_match <- function(komm) {
        if (komm == "") return (NA_character_)
        # I tried with agrep (levensthein distance) but results are better that way
        matches <- str_which(komm, rs$name)
        if (length(matches) == 0) return(NA_character_)
        rs[head(matches, 1), ]$id
    }
    comments %>%
        distinct(kommentator) %>%
        mutate(redner = Vectorize(find_match)(str_replace_all(kommentator, tobereplaced, "")))
 }

 repair_comments <- function(comments, redner) {
    # try to find a redner id for each actual comment
    comments %>%
        filter(!is.na(kommentator)) %>%
        lookup_redner(redner) %>%
        left_join(comments, ., by="kommentator") %>%
        select(-kommentator)
 }

 #' Repair parsed tables
 #'
 #' @export