From 4fe45feec9233cfa39b3d801ba1e3b106f6828b0 Mon Sep 17 00:00:00 2001 From: flavis Date: Mon, 2 Aug 2021 16:12:18 +0200 Subject: [PATCH] solve most of predefined challenges --- DESCRIPTION | 1 + NAMESPACE | 3 ++ R/analyze.R | 19 +++++++++ R/hateimparlament-package.R | 1 + R/parse.R | 54 +++++++++++++++++--------- R/repair.R | 7 ++-- vignettes/funwithdata.Rmd | 77 +++++++++++++++++++++++++++++++++++-- 7 files changed, 138 insertions(+), 24 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index 71133cb..a0170ad 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -21,6 +21,7 @@ Imports: rvest, stringr, tibble, + tidyr, xml2 Suggests: rmarkdown, diff --git a/NAMESPACE b/NAMESPACE index 9b21836..d99fd6f 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -1,16 +1,19 @@ # Generated by roxygen2: do not edit by hand +export(bar_plot_fraktionen) export(fetch_all) export(find_word) export(join_redner) export(read_all) export(read_from_csv) export(repair) +export(word_usage_by_date) export(write_to_csv) import(dplyr) import(pbapply) import(purrr) import(stringr) import(tibble) +import(tidyr) import(utils) import(xml2) diff --git a/R/analyze.R b/R/analyze.R index dfb12b7..18dd453 100644 --- a/R/analyze.R +++ b/R/analyze.R @@ -29,3 +29,22 @@ bar_plot_fraktionen <- function(tb) { scale_fill_manual(values = party_colors) + geom_bar(stat = "identity") } + +# Counts how many talks match a given pattern and summarises by date + # +#' @export +word_usage_by_date <- function(res, patterns, name, tidy=F) { + tb <- res$talks + nms <- names(patterns) + for (i in seq_along(patterns)) { + if (!is.null(nms)) name <- nms[[i]] + else name <- patterns[[i]] + tb <- mutate(tb, {{name}} := str_count(content, patterns[[i]])) + } + left_join(tb, res$reden, by=c("rede_id" = "id")) %>% + group_by(date) %>% + summarize(across(where(is.numeric), sum)) %>% + arrange(date) -> tb + if (!tidy) pivot_longer(tb, where(is.numeric) , names_to = "pattern", values_to="count") + else tb +} diff --git 
a/R/hateimparlament-package.R b/R/hateimparlament-package.R index 542609b..eab8891 100644 --- a/R/hateimparlament-package.R +++ b/R/hateimparlament-package.R @@ -6,6 +6,7 @@ #' @import stringr #' @import xml2 #' @import utils +#' @import tidyr #' @import purrr #' @keywords internal "_PACKAGE" diff --git a/R/parse.R b/R/parse.R index d3d2476..40b6045 100644 --- a/R/parse.R +++ b/R/parse.R @@ -20,7 +20,8 @@ read_all <- function(path="records/") { lapply(res, `[[`, "reden") %>% bind_rows() %>% - distinct() -> + distinct() %>% + mutate(date = as.Date(date, format="%d.%m.%Y")) -> reden lapply(res, `[[`, "talks") %>% @@ -31,11 +32,26 @@ read_all <- function(path="records/") { lapply(res, `[[`, "comments") %>% bind_rows() %>% distinct() -> - comments + commentsandapplause if (length(available_protocols) == 0) warning("The given directory is empty or does not exist.") - list(redner = redner, reden = reden, talks = talks, comments = comments) + + filter(commentsandapplause, type == "comment") %>% + select(-type) -> + comments + filter(commentsandapplause, type == "applause") %>% + select(-type, -kommentator, -content) %>% + mutate("CDU_CSU" = str_detect(fraktion, "CDU/CSU"), + "SPD" = str_detect(fraktion, "SPD"), + "FDP" = str_detect(fraktion, "FDP"), + "DIE_LINKE" = str_detect(fraktion, "DIE LINKE"), + "BÜNDNIS_90_DIE_GRÜNEN" = str_detect(fraktion, "BÜNDNIS 90/DIE GRÜNEN"), + "AfD" = str_detect(fraktion, "AfD")) %>% + select(-fraktion) -> + applause + + list(redner = redner, reden = reden, talks = talks, comments = comments, applause = applause) } # this reads all currently parseable data from one xml @@ -43,6 +59,8 @@ read_one <- function(name, path) { x <- tryCatch(read_xml(paste0(path, name)), error = function(c) NULL) if (is.null(x)) return(NULL) + # extract date of session + date <- xml_attr(x, "sitzung-datum") cs <- xml_children(x) verlauf <- xml_find_first(x, "sitzungsverlauf") @@ -54,7 +72,7 @@ read_one <- function(name, path) { xml_children(verlauf) %>% 
xml_find_all("rede") %>% - parse_redenliste() -> + parse_redenliste(date) -> res list(redner = redner, reden = res$reden, talks = res$talks, comments = res$comments) @@ -86,7 +104,7 @@ parse_redner <- function(redner_xml) { # parse one rede # returns: - a rede (with rede id and redner id) # - all talks appearing in the rede (with corresponding content) -parse_rede <- function(rede_xml) { +parse_rede <- function(rede_xml, date) { rede_id <- xml_attr(rede_xml, "id") cs <- xml_children(rede_xml) cur_redner <- NA_character_ @@ -132,7 +150,7 @@ parse_rede <- function(rede_xml) { redner = cur_redner, content = cur_content) reden <- c(reden, list(rede)) - list(rede = c(id = rede_id, redner = principal_redner), + list(rede = c(id = rede_id, redner = principal_redner, date = date), parts = reden, comments = comments) } @@ -142,16 +160,13 @@ fraktionsnames <- c("BÜNDNIS 90/DIE GRÜNEN", "CDU/CSU", "AfD", "SPD", "DIE LIN parse_comment <- function(comment, rede_id, on_redner) { base <- c(rede_id = rede_id, on_redner = on_redner) - str_extract_all(comment, fraktionspattern) %>% - `[[`(1) %>% - sapply(partial(flip(head), 1) %.% agrep, x=fraktionsnames, max=0.2, value=T) %>% - str_c(collapse=",") -> - by # classify comment - # TODO: - # - actually separate content properly - # - differentiate between [AfD] and AfD in by if(str_detect(comment, "Beifall")) { + str_extract_all(comment, fraktionspattern) %>% + `[[`(1) %>% + sapply(partial(flip(head), 1) %.% agrep, x=fraktionsnames, max=0.2, value=T) %>% + str_c(collapse=",") -> + by c(base, type = "applause", fraktion = by, kommentator = NA_character_, content = comment) } else { ps <- str_match(comment, "(.*) \\[(.*?)\\]: (.*)")[1,] @@ -160,12 +175,13 @@ parse_comment <- function(comment, rede_id, on_redner) { } # creates a tibble of reden and a tibble of talks from a list of xml nodes representing reden -parse_redenliste <- function(redenliste_xml) { - d <- sapply(redenliste_xml, parse_rede) +parse_redenliste <- 
function(redenliste_xml, date) { + d <- sapply(redenliste_xml, parse_rede, date = date) reden <- simplify2array(d["rede", ]) parts <- simplify2array %$% unlist(d["parts", ], recursive=FALSE) comments <- simplify2array %$% unlist(d["comments", ], recursive=FALSE) - list(reden = tibble(id = reden["id",], redner = reden["redner",]), + list(reden = tibble(id = reden["id",], redner = reden["redner",], + date = reden["date",]), talks = tibble(rede_id = parts["rede_id", ], redner = parts["redner", ], content = parts["content", ]), @@ -196,6 +212,7 @@ write_to_csv <- function(tables, path="csv/", create=F) { write.table(tables$reden, str_c(path, "reden.csv")) write.table(tables$talks, str_c(path, "talks.csv")) write.table(tables$comments, str_c(path, "comments.csv")) + write.table(tables$applause, str_c(path, "applause.csv")) } #' @export @@ -207,7 +224,8 @@ read_from_csv <- function(path="csv/") { tibble() %>% mutate(redner = as.character(redner)), talks = tibble %$% read.table(str_c(path, "talks.csv")), - comments = tibble %$% read.table(str_c(path, "comments.csv"))) + comments = tibble %$% read.table(str_c(path, "comments.csv")), + applause = tibble %$% read.table(str_c(path, "applause.csv"))) } # ------------------------------- diff --git a/R/repair.R b/R/repair.R index 53ff264..21d5738 100644 --- a/R/repair.R +++ b/R/repair.R @@ -47,8 +47,8 @@ repair_reden <- function(reden) { repair_talks <- function(talks) { if (nrow(talks) == 0) return(talks) - # TODO: fill with content - talks + # ignore all talks which have empty content + filter(talks, str_length(content) > 0) } # tries to find the correct redner id given a name @@ -90,6 +90,7 @@ repair <- function(parse_output) { reden = repair_reden(parse_output$reden), talks = repair_talks(parse_output$talks), #comments = repair_comments(parse_output$comments) - comments = parse_output$comments + comments = parse_output$comments, + applause = parse_output$applause ) } diff --git a/vignettes/funwithdata.Rmd 
b/vignettes/funwithdata.Rmd index 2e6bc28..6fc9370 100644 --- a/vignettes/funwithdata.Rmd +++ b/vignettes/funwithdata.Rmd @@ -48,8 +48,8 @@ talks <- res$talks ## Analysis Now we can start analysing our parsed dataset, e.g. find out which party gives the most talks: -```{r, fig.width=10} -join_redner(reden, res) %>% +```{r, fig.width=8} +join_redner(res$reden, res) %>% group_by(fraktion) %>% summarize(n = n()) %>% arrange(n) %>% @@ -58,7 +58,7 @@ join_redner(reden, res) %>% ### Count a word occurence -```{r, fig.width=10} +```{r, fig.width=8} find_word(res, "hitler") %>% filter(occurences > 0) %>% join_redner(res) %>% @@ -68,3 +68,74 @@ find_word(res, "hitler") %>% arrange(desc(n)) %>% bar_plot_fraktionen() ``` + +### Who gives the most speeches? + +```{r} +res$reden %>% + group_by(redner) %>% + summarize(n = n()) %>% + arrange(-n) %>% + left_join(res$redner, by=c("redner" = "id")) %>% + head(10) +``` + +### Who talks the longest? + +```{r} +res$talks %>% + mutate(content_len = str_length(content)) %>% + group_by(redner) %>% + summarize(avg_content_len = mean(content_len)) %>% + arrange(-avg_content_len) %>% + left_join(res$redner, by=c("redner" = "id")) %>% + head(10) +``` + +### Which party gives the most applause to which parties? + +```{r} +res$applause %>% + left_join(res$redner, by=c("on_redner" = "id")) %>% + select(on_fraktion = fraktion, where(is.logical)) %>% + group_by(on_fraktion) %>% + arrange(on_fraktion) %>% + summarize("AfD" = sum(`AfD`), + "BÜNDNIS 90 / DIE GRÜNEN" = sum(`BÜNDNIS_90_DIE_GRÜNEN`), + "CDU/CSU" = sum(`CDU_CSU`), + "DIE LINKE" = sum(`DIE_LINKE`), + "FDP" = sum(`FDP`), + "SPD" = sum(`SPD`)) +``` + +### Which party comments the most on which parties? 
+ +```{r} +res$comments %>% + left_join(res$redner, by=c("on_redner" = "id")) %>% + select(by_fraktion = fraktion.x, on_fraktion = fraktion.y) %>% + group_by(on_fraktion) %>% + summarize(`AfD` = sum(str_detect(by_fraktion, "AfD"), na.rm=T), + `BÜNDNIS 90 / DIE GRÜNEN` = sum(str_detect(by_fraktion, "BÜNDNIS 90/DIE GRÜNEN"), na.rm=T), + `CDU/CSU` = sum(str_detect(by_fraktion, "CDU/CSU"), na.rm = T), + `DIE LINKE` = sum(str_detect(by_fraktion, "DIE LINKE"), na.rm=T), + `FDP` = sum(str_detect(by_fraktion, "FDP"), na.rm=T), + `SPD` = sum(str_detect(by_fraktion, "SPD"), na.rm=T)) +``` + +### When are which topics discussed the most? + +```{r, fig.width=8} +pandemic_pattern <- "(?i)virus|corona|covid|lockdown" +climate_pattern <- "(?i)klimawandel|erderwärmung|co2|treibhaus|methan|kyoto-protokoll|klimaabkommen" +pension_pattern <- "(?i)rente|pension|altersarmut" + +word_usage_by_date(res, c(pandemic = pandemic_pattern, + climate = climate_pattern, + pension = pension_pattern)) %>% + ggplot(aes(x = date, y = count, color = pattern)) + + xlab("date of session") + + ylab("occurence of word per session") + + labs(color = "Topic") + + geom_point() +```