From 5490f9fed6398840215ac05faf2b6b0a3a0afc1c Mon Sep 17 00:00:00 2001 From: flavis Date: Mon, 9 Aug 2021 16:08:33 +0200 Subject: [PATCH] remove funwithdata, add some text, improve some fig heights --- vignettes/explicittopic.Rmd | 26 +++-- vignettes/funwithdata.Rmd | 181 --------------------------------- vignettes/generalquestions.Rmd | 18 ++-- vignettes/interaction.Rmd | 14 +-- 4 files changed, 22 insertions(+), 217 deletions(-) delete mode 100644 vignettes/funwithdata.Rmd diff --git a/vignettes/explicittopic.Rmd b/vignettes/explicittopic.Rmd index 5149ecf..c571164 100644 --- a/vignettes/explicittopic.Rmd +++ b/vignettes/explicittopic.Rmd @@ -32,20 +32,12 @@ Second, those `.xml` files, need to be parsed into `R` `tibbles`. This is accomp ```r read_all("../inst/records/") %>% repair() -> res ``` -We also used `repair` to fix a bunch of formatting issues in the records and unpacked -the result into more descriptive variables. +We also used `repair` to fix a bunch of formatting issues in the records. For development purposes, we load the tables from csv files. ```{r} res <- read_from_csv('../inst/csv/') ``` -and unpack our tibbles -```{r} -comments <- res$comments -speeches <- res$speeches -speaker <- res$speaker -talks <- res$talks -``` ## Analysis @@ -53,7 +45,7 @@ Now we can start analysing our parsed dataset: ### Counting the occurences of a given word: -```{r, fig.width=7} +```{r, fig.width=7, fig.height=7} find_word(res, "Kohleausstieg") %>% filter(occurences > 0) %>% join_speaker(res) %>% @@ -63,17 +55,22 @@ find_word(res, "Kohleausstieg") %>% summarize(n = n()) %>% arrange(desc(n)) %>% bar_plot_fractions(title = "Parties using the word 'Kohleausstieg' the most (absolutely)", - ylab = "Number of uses of 'Kohleausstieg'", - flipped = F) + ylab = "Number of uses of 'Kohleausstieg'", + flipped = F, + rotatelab = T) ``` ### When are which topics discussed the most? 
-```{r, fig.width=7} +First we define some search patterns, according to some common political topics. +```{r} pandemic_pattern <- "(?i)virus|corona|covid|lockdown" climate_pattern <- "(?i)klimawandel|erderwärmung|co2|treibhaus|methan|kyoto-protokoll|klimaabkommen" pension_pattern <- "(?i)rente|pension|altersarmut" - +``` +Then we use the analysis helper `word_usage_by_date` to generate a tibble counting the +occurrences of our search patterns per date. We can then plot the results: +```{r, fig.width=7, fig.height=6} word_usage_by_date(res, c(pandemic = pandemic_pattern, climate = climate_pattern, pension = pension_pattern)) %>% @@ -83,3 +80,4 @@ word_usage_by_date(res, c(pandemic = pandemic_pattern, labs(color = "Topic") + geom_point() ``` + diff --git a/vignettes/funwithdata.Rmd b/vignettes/funwithdata.Rmd deleted file mode 100644 index 16bd7b9..0000000 --- a/vignettes/funwithdata.Rmd +++ /dev/null @@ -1,181 +0,0 @@ ---- -title: "funwithdata" -output: rmarkdown::html_vignette -vignette: > - %\VignetteIndexEntry{funwithdata} - %\VignetteEngine{knitr::rmarkdown} - %\VignetteEncoding{UTF-8} ---- - -```{r, include = FALSE} -knitr::opts_chunk$set( - collapse = TRUE, - comment = "#>" -) -``` - -```{r setup} -library(hateimparlament) -library(dplyr) -library(ggplot2) -library(stringr) -library(tidyr) -``` - -## Preparation of data - -First, you need to download all records of the current legislative period. -```r -fetch_all("../inst/records/") # path to directory where records should be stored -``` -Second, those `.xml` files, need to be parsed into `R` `tibbles`. This is accomplished by: -```r -read_all("../inst/records/") %>% repair() -> res -``` -We also used `repair` to fix a bunch of formatting issues in the records and unpacked -the result into more descriptive variables. - -For development purposes, we load the tables from csv files. 
-```{r} -res <- read_from_csv('../inst/csv/') -``` -and unpack our tibbles -```{r} -comments <- res$comments -speeches <- res$speeches -speaker <- res$speaker -talks <- res$talks -applause <- res$applause -``` - -## Analysis - -Now we can start analysing our parsed dataset, e.g. find out which party gives the most talks: -```{r, fig.width=7} -join_speaker(res$speeches, res) %>% - group_by(fraction) %>% - summarize(n = n()) %>% - arrange(n) %>% - bar_plot_fractions(title="Number of speeches given by fraction", - ylab="Number of speeches") -``` - -or counting the occurences of a given word: - -```{r, fig.width=7} -find_word(res, "Kohleausstieg") %>% - filter(occurences > 0) %>% - join_speaker(res) %>% - select(content, fraction) %>% - filter(!is.na(fraction)) %>% - group_by(fraction) %>% - summarize(n = n()) %>% - arrange(desc(n)) %>% - bar_plot_fractions(title = "Parties using the word 'Kohleausstieg' the most (absolutely)", - ylab = "Number of uses of 'Kohleausstieg'", - flipped = F, - rotatelab = TRUE) -``` - -### Who gives the most speeches? - -```{r} -res$speeches %>% - group_by(speaker) %>% - summarize(n = n()) %>% - arrange(-n) %>% - left_join(res$speaker, by=c("speaker" = "id")) %>% - head(10) -``` - -### Who talks the longest? - -```{r} -res$talks %>% - mutate(content_len = str_length(content)) %>% - group_by(speaker) %>% - summarize(avg_content_len = mean(content_len)) %>% - arrange(-avg_content_len) %>% - left_join(res$speaker, by=c("speaker" = "id")) %>% - head(10) -``` - -### Which party gives the most applause to which parties? 
- -```{r} -res$applause %>% - left_join(res$speaker, by=c("on_speaker" = "id")) %>% - select(on_fraction = fraction, where(is.logical)) %>% - group_by(on_fraction) %>% - arrange(on_fraction) %>% - summarize("AfD" = sum(`AfD`), - "BÜNDNIS 90 / DIE GRÜNEN" = sum(`BUENDNIS_90_DIE_GRUENEN`), - "CDU/CSU" = sum(`CDU_CSU`), - "DIE LINKE" = sum(`DIE_LINKE`), - "FDP" = sum(`FDP`), - "SPD" = sum(`SPD`)) -> tb -``` - -For plotting our results we reorganize them a bit and produce a bar plot: - -```{r, fig.width=7} -pivot_longer(tb, where(is.numeric), "by_fraction", "count") %>% - filter(!is.na(on_fraction)) %>% - bar_plot_fractions(x_variable = on_fraction, - y_variable = value, - fill = by_fraction, - title = "Number of rounds of applauses from fractions to fractions", - xlab = "Applauded fraction", - ylab = "Rounds of applauses", - filllab = "Applauding fraction", - flipped = FALSE, - rotatelab = TRUE) -``` - - -### Which party comments the most on which parties? - -```{r} -res$comments %>% - left_join(res$speaker, by=c("on_speaker" = "id")) %>% - select(by_fraction = fraction.x, on_fraction = fraction.y) %>% - group_by(on_fraction) %>% - summarize(`AfD` = sum(str_detect(by_fraction, "AfD"), na.rm=T), - `BÜNDNIS 90 / DIE GRÜNEN` = sum(str_detect(by_fraction, "BÜNDNIS 90/DIE GRÜNEN"), na.rm=T), - `CDU/CSU` = sum(str_detect(by_fraction, "CDU/CSU"), na.rm = T), - `DIE LINKE` = sum(str_detect(by_fraction, "DIE LINKE"), na.rm=T), - `FDP` = sum(str_detect(by_fraction, "FDP"), na.rm=T), - `SPD` = sum(str_detect(by_fraction, "SPD"), na.rm=T)) -> tb -``` -Analogously we plot the results: - -```{r, fig.width=7} -pivot_longer(tb, where(is.numeric), "by_fraction", "count") %>% - filter(!is.na(on_fraction)) %>% - bar_plot_fractions(x_variable = on_fraction, - y_variable = value, - fill = by_fraction, - title = "Number of comments from fractions to fractions", - xlab = "Commented fraction", - ylab = "Number of comments", - filllab = "Commenting fraction", - flipped = FALSE, - rotatelab = 
TRUE) -``` - -### When are which topics discussed the most? - -```{r, fig.width=7} -pandemic_pattern <- "(?i)virus|corona|covid|lockdown" -climate_pattern <- "(?i)klimawandel|erderwärmung|co2|treibhaus|methan|kyoto-protokoll|klimaabkommen" -pension_pattern <- "(?i)rente|pension|altersarmut" - -word_usage_by_date(res, c(pandemic = pandemic_pattern, - climate = climate_pattern, - pension = pension_pattern)) %>% - ggplot(aes(x = date, y = count, color = pattern)) + - xlab("date of session") + - ylab("occurence of word per session") + - labs(color = "Topic") + - geom_point() -``` diff --git a/vignettes/generalquestions.Rmd b/vignettes/generalquestions.Rmd index 68241d2..a2b05fc 100644 --- a/vignettes/generalquestions.Rmd +++ b/vignettes/generalquestions.Rmd @@ -32,26 +32,18 @@ Second, those `.xml` files, need to be parsed into `R` `tibbles`. This is accomp ```r read_all("../inst/records/") %>% repair() -> res ``` -We also used `repair` to fix a bunch of formatting issues in the records and unpacked -the result into more descriptive variables. +We also used `repair` to fix a bunch of formatting issues in the records. For development purposes, we load the tables from csv files. ```{r} res <- read_from_csv('../inst/csv/') ``` -and unpack our tibbles -```{r} -comments <- res$comments -speeches <- res$speeches -speaker <- res$speaker -talks <- res$talks -``` ## Analysis Now we can start analysing our parsed dataset: -### Which partie gives the most talkes? +### Which party gives the most talks? ```{r, fig.width=7} join_speaker(res$speeches, res) %>% @@ -59,9 +51,11 @@ join_speaker(res$speeches, res) %>% summarize(n = n()) %>% arrange(n) %>% bar_plot_fractions(title="Number of speeches given by fraction", - ylab="Number of speeches") + ylab="Number of speeches") ``` +Note that `NA` signifies speeches given by speakers who are not members of parliament. + ### Who gives the most speeches? ```{r} @@ -75,6 +69,8 @@ res$speeches %>% ### Who talks the longest? 
+Calculate the average character length of talks given by speakers: + ```{r} res$talks %>% mutate(content_len = str_length(content)) %>% diff --git a/vignettes/interaction.Rmd b/vignettes/interaction.Rmd index 6c1768a..ed3cac0 100644 --- a/vignettes/interaction.Rmd +++ b/vignettes/interaction.Rmd @@ -32,20 +32,12 @@ Second, those `.xml` files, need to be parsed into `R` `tibbles`. This is accomp ```r read_all("../inst/records/") %>% repair() -> res ``` -We also used `repair` to fix a bunch of formatting issues in the records and unpacked -the result into more descriptive variables. +We also used `repair` to fix a bunch of formatting issues in the records. For development purposes, we load the tables from csv files. ```{r} res <- read_from_csv('../inst/csv/') ``` -and unpack our tibbles -```{r} -comments <- res$comments -speeches <- res$speeches -speaker <- res$speaker -talks <- res$talks -``` ## Analysis @@ -69,7 +61,7 @@ res$applause %>% For plotting our results we reorganize them a bit and produce a bar plot: -```{r, fig.width=7} +```{r, fig.width=7, fig.height=6} pivot_longer(tb, where(is.numeric), "by_fraction", "count") %>% filter(!is.na(on_fraction)) %>% bar_plot_fractions(x_variable = on_fraction, @@ -100,7 +92,7 @@ res$comments %>% ``` Analogously we plot the results: -```{r, fig.width=7} +```{r, fig.width=7, fig.height=6} pivot_longer(tb, where(is.numeric), "by_fraction", "count") %>% filter(!is.na(on_fraction)) %>% bar_plot_fractions(x_variable = on_fraction,