From 5490f9fed6398840215ac05faf2b6b0a3a0afc1c Mon Sep 17 00:00:00 2001
From: flavis <christian@flavigny.de>
Date: Mon, 9 Aug 2021 16:08:33 +0200
Subject: [PATCH] remove funwithdata, add some text, improve some fig heights

---
 vignettes/explicittopic.Rmd    |  26 +++--
 vignettes/funwithdata.Rmd      | 181 ---------------------------------
 vignettes/generalquestions.Rmd |  18 ++--
 vignettes/interaction.Rmd      |  14 +--
 4 files changed, 22 insertions(+), 217 deletions(-)
 delete mode 100644 vignettes/funwithdata.Rmd

diff --git a/vignettes/explicittopic.Rmd b/vignettes/explicittopic.Rmd
index 5149ecf..c571164 100644
--- a/vignettes/explicittopic.Rmd
+++ b/vignettes/explicittopic.Rmd
@@ -32,20 +32,12 @@ Second, those `.xml` files, need to be parsed into `R` `tibbles`. This is accomp
 ```r
 read_all("../inst/records/") %>% repair() -> res
 ```
-We also used `repair` to fix a bunch of formatting issues in the records and unpacked
-the result into more descriptive variables.
+Here, `repair` fixes a number of formatting issues in the records.
 
 For development purposes, we load the tables from csv files.
 ```{r}
 res <- read_from_csv('../inst/csv/')
 ```
-and unpack our tibbles
-```{r}
-comments <- res$comments
-speeches <- res$speeches
-speaker <- res$speaker
-talks <- res$talks
-```
 
 ## Analysis
 
@@ -53,7 +45,7 @@ Now we can start analysing our parsed dataset:
 
 ### Counting the occurences of a given word:
 
-```{r, fig.width=7}
+```{r, fig.width=7, fig.height=7}
 find_word(res, "Kohleausstieg") %>%
     filter(occurences > 0) %>%
     join_speaker(res) %>%
@@ -63,17 +55,22 @@ find_word(res, "Kohleausstieg") %>%
     summarize(n = n()) %>%
     arrange(desc(n)) %>%
     bar_plot_fractions(title = "Parties using the word 'Kohleausstieg' the most (absolutely)",
-                        ylab = "Number of uses of 'Kohleausstieg'",
-                        flipped = F)
+                       ylab = "Number of uses of 'Kohleausstieg'",
+                       flipped = F,
+                       rotatelab = T)
 ```
 
 ### When are which topics discussed the most?
 
-```{r, fig.width=7}
+First, we define some search patterns corresponding to common political topics.
+```{r}
 pandemic_pattern <- "(?i)virus|corona|covid|lockdown"
 climate_pattern <- "(?i)klimawandel|erderwärmung|co2|treibhaus|methan|kyoto-protokoll|klimaabkommen"
 pension_pattern <- "(?i)rente|pension|altersarmut"
-
+```
+Then we use the analysis helper `word_usage_by_date` to generate a tibble counting the
+occurrences of our search patterns per date. We can then plot the results:
+```{r, fig.width=7, fig.height=6}
 word_usage_by_date(res, c(pandemic = pandemic_pattern,
                           climate = climate_pattern,
                           pension = pension_pattern)) %>%
@@ -83,3 +80,4 @@ word_usage_by_date(res, c(pandemic = pandemic_pattern,
     labs(color = "Topic") +
     geom_point()
 ```
+
diff --git a/vignettes/funwithdata.Rmd b/vignettes/funwithdata.Rmd
deleted file mode 100644
index 16bd7b9..0000000
--- a/vignettes/funwithdata.Rmd
+++ /dev/null
@@ -1,181 +0,0 @@
----
-title: "funwithdata"
-output: rmarkdown::html_vignette
-vignette: >
-  %\VignetteIndexEntry{funwithdata}
-  %\VignetteEngine{knitr::rmarkdown}
-  %\VignetteEncoding{UTF-8}
----
-
-```{r, include = FALSE}
-knitr::opts_chunk$set(
-  collapse = TRUE,
-  comment = "#>"
-)
-```
-
-```{r setup}
-library(hateimparlament)
-library(dplyr)
-library(ggplot2)
-library(stringr)
-library(tidyr)
-```
-
-## Preparation of data
-
-First, you need to download all records of the current legislative period.
-```r
-fetch_all("../inst/records/") # path to directory where records should be stored
-```
-Second, those `.xml` files, need to be parsed into `R` `tibbles`. This is accomplished by:
-```r
-read_all("../inst/records/") %>% repair() -> res
-```
-We also used `repair` to fix a bunch of formatting issues in the records and unpacked
-the result into more descriptive variables.
-
-For development purposes, we load the tables from csv files.
-```{r}
-res <- read_from_csv('../inst/csv/')
-```
-and unpack our tibbles
-```{r}
-comments <- res$comments
-speeches <- res$speeches
-speaker <- res$speaker
-talks <- res$talks
-applause <- res$applause
-```
-
-## Analysis
-
-Now we can start analysing our parsed dataset, e.g. find out which party gives the most talks:
-```{r, fig.width=7}
-join_speaker(res$speeches, res) %>%
-    group_by(fraction) %>%
-    summarize(n = n()) %>%
-    arrange(n) %>%
-    bar_plot_fractions(title="Number of speeches given by fraction",
-                        ylab="Number of speeches")
-```
-
-or counting the occurences of a given word:
-
-```{r, fig.width=7}
-find_word(res, "Kohleausstieg") %>%
-    filter(occurences > 0) %>%
-    join_speaker(res) %>%
-    select(content, fraction) %>%
-    filter(!is.na(fraction)) %>%
-    group_by(fraction) %>%
-    summarize(n = n()) %>%
-    arrange(desc(n)) %>%
-    bar_plot_fractions(title = "Parties using the word 'Kohleausstieg' the most (absolutely)",
-                        ylab = "Number of uses of 'Kohleausstieg'",
-                        flipped = F,
-                        rotatelab = TRUE)
-```
-
-### Who gives the most speeches?
-
-```{r}
-res$speeches %>%
-    group_by(speaker) %>%
-    summarize(n = n()) %>%
-    arrange(-n) %>%
-    left_join(res$speaker, by=c("speaker" = "id")) %>%
-    head(10)
-```
-
-### Who talks the longest?
-
-```{r}
-res$talks %>%
-    mutate(content_len = str_length(content)) %>%
-    group_by(speaker) %>%
-    summarize(avg_content_len = mean(content_len)) %>%
-    arrange(-avg_content_len) %>%
-    left_join(res$speaker, by=c("speaker" = "id")) %>%
-    head(10)
-```
-
-### Which party gives the most applause to which parties?
-
-```{r}
-res$applause %>%
-    left_join(res$speaker, by=c("on_speaker" = "id")) %>%
-    select(on_fraction = fraction, where(is.logical)) %>%
-    group_by(on_fraction) %>%
-    arrange(on_fraction) %>%
-    summarize("AfD" = sum(`AfD`),
-              "BÜNDNIS 90 / DIE GRÜNEN" = sum(`BUENDNIS_90_DIE_GRUENEN`),
-              "CDU/CSU" = sum(`CDU_CSU`),
-              "DIE LINKE" = sum(`DIE_LINKE`),
-              "FDP" = sum(`FDP`),
-              "SPD" = sum(`SPD`)) -> tb
-```
-
-For plotting our results we reorganize them a bit and produce a bar plot:
-
-```{r, fig.width=7}
-pivot_longer(tb, where(is.numeric), "by_fraction", "count") %>%
-    filter(!is.na(on_fraction)) %>%
-    bar_plot_fractions(x_variable = on_fraction,
-                        y_variable = value,
-                        fill = by_fraction,
-                        title = "Number of rounds of applauses from fractions to fractions",
-                        xlab = "Applauded fraction",
-                        ylab = "Rounds of applauses",
-                        filllab = "Applauding fraction",
-                        flipped = FALSE,
-                        rotatelab = TRUE)
-```
-
-
-### Which party comments the most on which parties?
-
-```{r}
-res$comments %>%
-    left_join(res$speaker, by=c("on_speaker" = "id")) %>%
-    select(by_fraction = fraction.x, on_fraction = fraction.y) %>%
-    group_by(on_fraction) %>%
-    summarize(`AfD` = sum(str_detect(by_fraction, "AfD"), na.rm=T),
-              `BÜNDNIS 90 / DIE GRÜNEN` = sum(str_detect(by_fraction, "BÜNDNIS 90/DIE GRÜNEN"), na.rm=T),
-              `CDU/CSU` = sum(str_detect(by_fraction, "CDU/CSU"), na.rm = T),
-              `DIE LINKE` = sum(str_detect(by_fraction, "DIE LINKE"), na.rm=T),
-              `FDP` = sum(str_detect(by_fraction, "FDP"), na.rm=T),
-              `SPD` = sum(str_detect(by_fraction, "SPD"), na.rm=T)) -> tb
-```
-Analogously we plot the results:
-
-```{r, fig.width=7}
-pivot_longer(tb, where(is.numeric), "by_fraction", "count") %>%
-    filter(!is.na(on_fraction)) %>%
-    bar_plot_fractions(x_variable = on_fraction,
-                        y_variable = value,
-                        fill = by_fraction,
-                        title = "Number of comments from fractions to fractions",
-                        xlab = "Commented fraction",
-                        ylab = "Number of comments",
-                        filllab = "Commenting fraction",
-                        flipped = FALSE,
-                        rotatelab = TRUE)
-```
-
-### When are which topics discussed the most?
-
-```{r, fig.width=7}
-pandemic_pattern <- "(?i)virus|corona|covid|lockdown"
-climate_pattern <- "(?i)klimawandel|erderwärmung|co2|treibhaus|methan|kyoto-protokoll|klimaabkommen"
-pension_pattern <- "(?i)rente|pension|altersarmut"
-
-word_usage_by_date(res, c(pandemic = pandemic_pattern,
-                          climate = climate_pattern,
-                          pension = pension_pattern)) %>%
-    ggplot(aes(x = date, y = count, color = pattern)) +
-    xlab("date of session") +
-    ylab("occurence of word per session") +
-    labs(color = "Topic") +
-    geom_point()
-```
diff --git a/vignettes/generalquestions.Rmd b/vignettes/generalquestions.Rmd
index 68241d2..a2b05fc 100644
--- a/vignettes/generalquestions.Rmd
+++ b/vignettes/generalquestions.Rmd
@@ -32,26 +32,18 @@ Second, those `.xml` files, need to be parsed into `R` `tibbles`. This is accomp
 ```r
 read_all("../inst/records/") %>% repair() -> res
 ```
-We also used `repair` to fix a bunch of formatting issues in the records and unpacked
-the result into more descriptive variables.
+Here, `repair` fixes a number of formatting issues in the records.
 
 For development purposes, we load the tables from csv files.
 ```{r}
 res <- read_from_csv('../inst/csv/')
 ```
-and unpack our tibbles
-```{r}
-comments <- res$comments
-speeches <- res$speeches
-speaker <- res$speaker
-talks <- res$talks
-```
 
 ## Analysis
 
 Now we can start analysing our parsed dataset:
 
-### Which partie gives the most talkes?
+### Which party gives the most talks?
 
 ```{r, fig.width=7}
 join_speaker(res$speeches, res) %>%
@@ -59,9 +51,11 @@ join_speaker(res$speeches, res) %>%
     summarize(n = n()) %>%
     arrange(n) %>%
     bar_plot_fractions(title="Number of speeches given by fraction",
-                        ylab="Number of speeches")
+                       ylab="Number of speeches")
 ```
 
+Note that `NA` signifies speeches given by speakers who are not members of parliament.
+
 ### Who gives the most speeches?
 
 ```{r}
@@ -75,6 +69,8 @@ res$speeches %>%
 
 ### Who talks the longest?
 
+We calculate the average length (in characters) of the talks given by each speaker:
+
 ```{r}
 res$talks %>%
     mutate(content_len = str_length(content)) %>%
diff --git a/vignettes/interaction.Rmd b/vignettes/interaction.Rmd
index 6c1768a..ed3cac0 100644
--- a/vignettes/interaction.Rmd
+++ b/vignettes/interaction.Rmd
@@ -32,20 +32,12 @@ Second, those `.xml` files, need to be parsed into `R` `tibbles`. This is accomp
 ```r
 read_all("../inst/records/") %>% repair() -> res
 ```
-We also used `repair` to fix a bunch of formatting issues in the records and unpacked
-the result into more descriptive variables.
+Here, `repair` fixes a number of formatting issues in the records.
 
 For development purposes, we load the tables from csv files.
 ```{r}
 res <- read_from_csv('../inst/csv/')
 ```
-and unpack our tibbles
-```{r}
-comments <- res$comments
-speeches <- res$speeches
-speaker <- res$speaker
-talks <- res$talks
-```
 
 ## Analysis
 
@@ -69,7 +61,7 @@ res$applause %>%
 
 For plotting our results we reorganize them a bit and produce a bar plot:
 
-```{r, fig.width=7}
+```{r, fig.width=7, fig.height=6}
 pivot_longer(tb, where(is.numeric), "by_fraction", "count") %>%
     filter(!is.na(on_fraction)) %>%
     bar_plot_fractions(x_variable = on_fraction,
@@ -100,7 +92,7 @@ res$comments %>%
 ```
 Analogously we plot the results:
 
-```{r, fig.width=7}
+```{r, fig.width=7, fig.height=6}
 pivot_longer(tb, where(is.numeric), "by_fraction", "count") %>%
     filter(!is.na(on_fraction)) %>%
     bar_plot_fractions(x_variable = on_fraction,