Merge branch 'master' of https://git.flavigny.de/christian/hateimparlament

4 年之前 · 5f9343bf7f
--- a/R/parse.R
+++ b/R/parse.R
@@ -13,6 +13,9 @@ read_all <- function(path="records/") {
    available_protocols <- list.files(path)
    res <- pblapply(available_protocols, read_one, path=path)

    if (length(available_protocols) == 0)
        stop("The given directory is empty or does not exist.")

    lapply(res, `[[`, "speaker") %>%
        bind_rows() %>%
        distinct() ->
@@ -34,9 +37,6 @@ read_all <- function(path="records/") {
        distinct() ->
        commentsandapplause

    if (length(available_protocols) == 0)
        warning("The given directory is empty or does not exist.")

    filter(commentsandapplause, type == "comment") %>%
        select(-type) ->
        comments
@@ -46,7 +46,7 @@ read_all <- function(path="records/") {
               "SPD" = str_detect(fraction, "SPD"),
               "FDP" = str_detect(fraction, "FDP"),
               "DIE_LINKE" = str_detect(fraction, "DIE LINKE"),
               "BÜNDNIS_90_DIE_GRÜNEN" = str_detect(fraction, "BÜNDNIS 90/DIE GRÜNEN"),
               "BUENDNIS_90_DIE_GRUENEN" = str_detect(fraction, "BÜNDNIS 90/DIE GRÜNEN"),
               "AfD" = str_detect(fraction, "AfD")) %>%
        select(-fraction) ->
        applause
@@ -227,15 +227,3 @@ read_from_csv <- function(path="csv/") {
         comments = tibble %$% read.table(str_c(path, "comments.csv")),
         applause = tibble %$% read.table(str_c(path, "applause.csv")))
 }

 # -------------------------------
 # EXAMPLE USE

 # make sure data ist downloaded via fetch.R
 # res <- read_one("records/19126-data.xml")
 # 
 # res$speaker
 # res$speeches
 # res$talks

 # -------------------------------
--- a/R/repair.R
+++ b/R/repair.R
@@ -1,4 +1,5 @@
 fractions <- c("AFD" = "AfD",
                "AFD&FRAKTIONSLOS" = "AfD&Fraktionslos",
                "BÜNDNIS90/" = "BÜNDNIS 90 / DIE GRÜNEN",
                "BÜNDNIS90/DIEGRÜNEN" = "BÜNDNIS 90 / DIE GRÜNEN",
                "FRAKTIONSLOS" = "Fraktionslos",
@@ -51,11 +52,20 @@ repair_talks <- function(talks) {
    filter(talks, str_length(content) > 0)
 }

 # tries to find the correct speaker id given a name
 # this is sufficient since every prename lastname combination in the bundestag is
 # unique (luckily :D)
 # returns a lookup table
 lookup_speaker <- function(comments, speaker) {
 #' Lookup name in speakers table
 #'
 #' Tries to find the correct speaker id given a name.
 #' This is sufficient since every prename lastname combination in the bundestag is
 #' unique (luckily :D)
 #' 
 #' @param tb tibble
 #' @param redner tibble
 #' @param name_variable name
 #' 
 #' Tries to match the name_variable column with speaker names
 #' 
 #' returns a lookup table
 lookup_speaker <- function(tb, speaker, name_variable) {
    tobereplaced <- "[-–—‑- ]"
    speaker %>%
        unite(name, vorname, nachname, sep=".*") %>%
@@ -68,29 +78,38 @@ lookup_speaker <- function(comments, speaker) {
        if (length(matches) == 0) return(NA_character_)
        rs[head(matches, 1), ]$id
    }
    comments %>%
        distinct(kommentator) %>%
        mutate(speaker = Vectorize(find_match)(str_replace_all(kommentator, tobereplaced, "")))
    tb %>%
        distinct({{name_variable}}) %>%
        mutate(speaker = Vectorize(find_match)(str_replace_all({{name_variable}}, tobereplaced, "")))
 }

 repair_comments <- function(comments, speaker) {
    # try to find a speaker id for each actual comment
 repair_comments <- function(comments, redner) {
    cat(paste0("Looking up speaker id's for names in comments. This may take a while ...\n",
               "Use repair(, repair_commments = FALSE) to skip this.\n"))
    # try to find a redner id for each actual comment
    comments %>%
        filter(!is.na(kommentator)) %>%
        lookup_speaker(speaker) %>%
        lookup_redner(redner, kommentator) %>%
        left_join(comments, ., by="kommentator") %>%
        select(-kommentator)
 }

 #' Repair parsed tables
 #'
 #' TODO: Explain repair_comments argument
 #' (if TRUE, we try to lookup redner names in redner table)
 #'
 #' Possible test: check identical(repair(res), repair(repair(res))) == TRUE
 #' Since repaired tables should be a fixpoint of repair.
 #' @export
 repair <- function(parse_output) {
    list(speaker = repair_speaker(parse_output$speaker),
         speeches = repair_speeches(parse_output$speeches),
 repair <- function(parse_output, repair_comments = FALSE) {

    list(redner = repair_speaker(parse_output$speaker),
         reden = repair_speeches(parse_output$speeches),
         talks = repair_talks(parse_output$talks),
         #comments = repair_comments(parse_output$comments)
         comments = parse_output$comments,
         comments = if(repair_comments) repair_comments(parse_output$comments,
                                                        parse_output$speaker)
                    else parse_output$comments,
         applause = parse_output$applause
         )
 }
--- a/vignettes/funwithdata.Rmd
+++ b/vignettes/funwithdata.Rmd
@@ -107,7 +107,7 @@ res$applause %>%
    group_by(on_fraction) %>%
    arrange(on_fraction) %>%
    summarize("AfD" = sum(`AfD`),
              "BÜNDNIS 90 / DIE GRÜNEN" = sum(`BÜNDNIS_90_DIE_GRÜNEN`),
              "BÜNDNIS 90 / DIE GRÜNEN" = sum(`BUENDNIS_90_DIE_GRUENEN`),
              "CDU/CSU" = sum(`CDU_CSU`),
              "DIE LINKE" = sum(`DIE_LINKE`),
              "FDP" = sum(`FDP`),