solve most of predefined challenges

преди 4 години · 4fe45feec9
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -21,6 +21,7 @@ Imports:
    rvest,
    stringr,
    tibble,
    tidyr,
    xml2
 Suggests: 
    rmarkdown,
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -1,16 +1,19 @@
 # Generated by roxygen2: do not edit by hand

 export(bar_plot_fraktionen)
 export(fetch_all)
 export(find_word)
 export(join_redner)
 export(read_all)
 export(read_from_csv)
 export(repair)
 export(word_usage_by_date)
 export(write_to_csv)
 import(dplyr)
 import(pbapply)
 import(purrr)
 import(stringr)
 import(tibble)
 import(tidyr)
 import(utils)
 import(xml2)
--- a/R/analyze.R
+++ b/R/analyze.R
@@ -29,3 +29,22 @@ bar_plot_fraktionen <- function(tb) {
    scale_fill_manual(values = party_colors) +
    geom_bar(stat = "identity")
 }

 # Counts how many talks do match a given pattern and summarises by date
 #
 #' @export
 word_usage_by_date <- function(res, patterns, name, tidy=F) {
    tb <- res$talks
    nms <- names(patterns)
    for (i in seq_along(patterns)) {
        if (!is.null(nms)) name <- nms[[i]]
        else name <- patterns[[i]]
        tb <- mutate(tb, {{name}} := str_count(content, patterns[[i]]))
    }
    left_join(tb, res$reden, by=c("rede_id" = "id")) %>%
        group_by(date) %>%
        summarize(across(where(is.numeric), sum)) %>%
        arrange(date) -> tb
    if (!tidy) pivot_longer(tb, where(is.numeric) , names_to = "pattern", values_to="count")
    else tb
 }
--- a/R/hateimparlament-package.R
+++ b/R/hateimparlament-package.R
@@ -6,6 +6,7 @@
 #' @import stringr
 #' @import xml2
 #' @import utils
 #' @import tidyr
 #' @import purrr
 #' @keywords internal
 "_PACKAGE"
--- a/R/parse.R
+++ b/R/parse.R
@@ -20,7 +20,8 @@ read_all <- function(path="records/") {
    
    lapply(res, `[[`, "reden") %>%
        bind_rows() %>%
        distinct() ->
        distinct() %>%
        mutate(date = as.Date(date, format="%d.%m.%Y")) ->
        reden

    lapply(res, `[[`, "talks") %>%
@@ -31,11 +32,26 @@ read_all <- function(path="records/") {
    lapply(res, `[[`, "comments") %>%
        bind_rows() %>%
        distinct() ->
        comments
        commentsandapplause

    if (length(available_protocols) == 0)
        warning("The given directory is empty or does not exist.")
    list(redner = redner, reden = reden, talks = talks, comments = comments)

    filter(commentsandapplause, type == "comment") %>%
        select(-type) ->
        comments
    filter(commentsandapplause, type == "applause") %>%
        select(-type, -kommentator, -content) %>%
        mutate("CDU_CSU" = str_detect(fraktion, "CDU/CSU"),
               "SPD" = str_detect(fraktion, "SPD"),
               "FDP" = str_detect(fraktion, "FDP"),
               "DIE_LINKE" = str_detect(fraktion, "DIE LINKE"),
               "BÜNDNIS_90_DIE_GRÜNEN" = str_detect(fraktion, "BÜNDNIS 90/DIE GRÜNEN"),
               "AfD" = str_detect(fraktion, "AfD")) %>%
        select(-fraktion) ->
        applause

    list(redner = redner, reden = reden, talks = talks, comments = comments, applause = applause)
 }

 # this reads all currently parseable data from one xml
@@ -43,6 +59,8 @@ read_one <- function(name, path) {
    x <- tryCatch(read_xml(paste0(path, name)),
                  error = function(c) NULL)
    if (is.null(x)) return(NULL)
    # extract date of session
    date <- xml_attr(x, "sitzung-datum")
    cs <- xml_children(x)

    verlauf <- xml_find_first(x, "sitzungsverlauf")
@@ -54,7 +72,7 @@ read_one <- function(name, path) {

    xml_children(verlauf) %>%
        xml_find_all("rede") %>%
        parse_redenliste() ->
        parse_redenliste(date) ->
        res

    list(redner = redner, reden = res$reden, talks = res$talks, comments = res$comments)
@@ -86,7 +104,7 @@ parse_redner <- function(redner_xml) {
 # parse one rede
 # returns: - a rede (with rede id and redner id)
 #          - all talks appearing in the rede (with corresponding content)
 parse_rede <- function(rede_xml) {
 parse_rede <- function(rede_xml, date) {
    rede_id <- xml_attr(rede_xml, "id")
    cs <- xml_children(rede_xml)
    cur_redner <- NA_character_
@@ -132,7 +150,7 @@ parse_rede <- function(rede_xml) {
              redner = cur_redner,
              content = cur_content)
    reden <- c(reden, list(rede))
    list(rede = c(id = rede_id, redner = principal_redner),
    list(rede = c(id = rede_id, redner = principal_redner, date = date),
         parts = reden,
         comments = comments)
 }
@@ -142,16 +160,13 @@ fraktionsnames <- c("BÜNDNIS 90/DIE GRÜNEN", "CDU/CSU", "AfD", "SPD", "DIE LIN

 parse_comment <- function(comment, rede_id, on_redner) {
    base <- c(rede_id = rede_id, on_redner = on_redner)
    str_extract_all(comment, fraktionspattern) %>%
        `[[`(1) %>%
        sapply(partial(flip(head), 1) %.% agrep, x=fraktionsnames, max=0.2, value=T) %>%
        str_c(collapse=",") ->
        by
    # classify comment
    # TODO:
    # - actually separate content properly
    # - differentiate between [AfD] and AfD in by
    if(str_detect(comment, "Beifall")) {
        str_extract_all(comment, fraktionspattern) %>%
            `[[`(1) %>%
            sapply(partial(flip(head), 1) %.% agrep, x=fraktionsnames, max=0.2, value=T) %>%
            str_c(collapse=",") ->
            by
        c(base, type = "applause", fraktion = by, kommentator = NA_character_, content = comment)
    } else {
        ps <- str_match(comment, "(.*) \\[(.*?)\\]: (.*)")[1,]
@@ -160,12 +175,13 @@ parse_comment <- function(comment, rede_id, on_redner) {
 }

 # creates a tibble of reden and a tibble of talks from a list of xml nodes representing reden
 parse_redenliste <- function(redenliste_xml) {
    d <- sapply(redenliste_xml, parse_rede)
 parse_redenliste <- function(redenliste_xml, date) {
    d <- sapply(redenliste_xml, parse_rede, date = date)
    reden <- simplify2array(d["rede", ])
    parts <- simplify2array %$% unlist(d["parts", ], recursive=FALSE)
    comments <- simplify2array %$% unlist(d["comments", ], recursive=FALSE)
    list(reden = tibble(id = reden["id",], redner = reden["redner",]),
    list(reden = tibble(id = reden["id",], redner = reden["redner",],
                        date = reden["date",]),
         talks = tibble(rede_id = parts["rede_id", ],
                        redner = parts["redner", ],
                        content = parts["content", ]),
@@ -196,6 +212,7 @@ write_to_csv <- function(tables, path="csv/", create=F) {
    write.table(tables$reden, str_c(path, "reden.csv"))
    write.table(tables$talks, str_c(path, "talks.csv"))
    write.table(tables$comments, str_c(path, "comments.csv"))
    write.table(tables$applause, str_c(path, "applause.csv"))
 }

 #' @export
@@ -207,7 +224,8 @@ read_from_csv <- function(path="csv/") {
             tibble() %>%
             mutate(redner = as.character(redner)),
         talks = tibble %$% read.table(str_c(path, "talks.csv")),
         comments = tibble %$% read.table(str_c(path, "comments.csv")))
         comments = tibble %$% read.table(str_c(path, "comments.csv")),
         applause = tibble %$% read.table(str_c(path, "applause.csv")))
 }

 # -------------------------------
--- a/R/repair.R
+++ b/R/repair.R
@@ -47,8 +47,8 @@ repair_reden <- function(reden) {

 repair_talks <- function(talks) {
    if (nrow(talks) == 0) return(talks)
    # TODO: fill with content
    talks
    # ignore all talks which have empty content
    filter(talks, str_length(content) > 0)
 }

 # tries to find the correct redner id given a name
@@ -90,6 +90,7 @@ repair <- function(parse_output) {
         reden = repair_reden(parse_output$reden),
         talks = repair_talks(parse_output$talks),
         #comments = repair_comments(parse_output$comments)
         comments = parse_output$comments
         comments = parse_output$comments,
         applause = parse_output$applause
         )
 }
--- a/vignettes/funwithdata.Rmd
+++ b/vignettes/funwithdata.Rmd
@@ -48,8 +48,8 @@ talks <- res$talks
 ## Analysis

 Now we can start analysing our parsed dataset, e.g. find out which party gives the most talks:
 ```{r, fig.width=10}
 join_redner(reden, res) %>%
 ```{r, fig.width=8}
 join_redner(res$reden, res) %>%
    group_by(fraktion) %>%
    summarize(n = n()) %>%
    arrange(n) %>%
@@ -58,7 +58,7 @@ join_redner(reden, res) %>%

 ### Count a word occurence

 ```{r, fig.width=10}
 ```{r, fig.width=8}
 find_word(res, "hitler") %>%
    filter(occurences > 0) %>%
    join_redner(res) %>%
@@ -68,3 +68,74 @@ find_word(res, "hitler") %>%
    arrange(desc(n)) %>%
    bar_plot_fraktionen()
 ```

 ### Who gives the most speeches?

 ```{r}
 res$reden %>%
    group_by(redner) %>%
    summarize(n = n()) %>%
    arrange(-n) %>%
    left_join(res$redner, by=c("redner" = "id")) %>%
    head(10)
 ```

 ### Who talks the longest?

 ```{r}
 res$talks %>%
    mutate(content_len = str_length(content)) %>%
    group_by(redner) %>%
    summarize(avg_content_len = mean(content_len)) %>%
    arrange(-avg_content_len) %>%
    left_join(res$redner, by=c("redner" = "id")) %>%
    head(10)
 ```

 ### Which party gives the most applause to which parties?

 ```{r}
 res$applause %>%
    left_join(res$redner, by=c("on_redner" = "id")) %>%
    select(on_fraktion = fraktion, where(is.logical)) %>%
    group_by(on_fraktion) %>%
    arrange(on_fraktion) %>%
    summarize("AfD" = sum(`AfD`),
              "BÜNDNIS 90 / DIE GRÜNEN" = sum(`BÜNDNIS_90_DIE_GRÜNEN`),
              "CDU/CSU" = sum(`CDU_CSU`),
              "DIE LINKE" = sum(`DIE_LINKE`),
              "FDP" = sum(`FDP`),
              "SPD" = sum(`SPD`))
 ```

 ### Which party comments the most on which parties?

 ```{r}
 res$comments %>%
    left_join(res$redner, by=c("on_redner" = "id")) %>%
    select(by_fraktion = fraktion.x, on_fraktion = fraktion.y) %>%
    group_by(on_fraktion) %>%
    summarize(`AfD` = sum(str_detect(by_fraktion, "AfD"), na.rm=T),
              `BÜNDNIS 90 / DIE GRÜNEN` = sum(str_detect(by_fraktion, "BÜNDNIS 90/DIE GRÜNEN"), na.rm=T),
              `CDU/CSU` = sum(str_detect(by_fraktion, "CDU/CSU"), na.rm = T),
              `DIE LINKE` = sum(str_detect(by_fraktion, "DIE LINKE"), na.rm=T),
              `FDP` = sum(str_detect(by_fraktion, "FDP"), na.rm=T),
              `SPD` = sum(str_detect(by_fraktion, "SPD"), na.rm=T))
 ```

 ### When are which topics discussed the most?

 ```{r, fig.width=8}
 pandemic_pattern <- "(?i)virus|corona|covid|lockdown"
 climate_pattern <- "(?i)klimawandel|erderwärmung|co2|treibhaus|methan|kyoto-protokoll|klimaabkommen"
 pension_pattern <- "(?i)rente|pension|altersarmut"

 word_usage_by_date(res, c(pandemic = pandemic_pattern,
                          climate = climate_pattern,
                          pension = pension_pattern)) %>%
    ggplot(aes(x = date, y = count, color = pattern)) +
    xlab("date of session") +
    ylab("occurence of word per session") +
    labs(color = "Topic") +
    geom_point()
 ```