|
|
|
@@ -1,181 +0,0 @@ |
|
|
|
--- |
|
|
|
title: "funwithdata" |
|
|
|
output: rmarkdown::html_vignette |
|
|
|
vignette: > |
|
|
|
%\VignetteIndexEntry{funwithdata} |
|
|
|
%\VignetteEngine{knitr::rmarkdown} |
|
|
|
%\VignetteEncoding{UTF-8} |
|
|
|
--- |
|
|
|
|
|
|
|
```{r, include = FALSE} |
|
|
|
# Global chunk options for this vignette: merge source and output into a
# single block and prefix every output line with "#>".
knitr::opts_chunk$set(collapse = TRUE, comment = "#>")
|
|
|
``` |
|
|
|
|
|
|
|
```{r setup} |
|
|
|
library(hateimparlament) |
|
|
|
library(dplyr) |
|
|
|
library(ggplot2) |
|
|
|
library(stringr) |
|
|
|
library(tidyr) |
|
|
|
``` |
|
|
|
|
|
|
|
## Preparation of data |
|
|
|
|
|
|
|
First, you need to download all records of the current legislative period. |
|
|
|
```r |
|
|
|
# The argument is the path to the directory where records should be stored.
fetch_all("../inst/records/")
|
|
|
``` |
|
|
|
Second, those `.xml` files need to be parsed into `R` `tibbles`. This is accomplished by: |
|
|
|
```r |
|
|
|
# Parse every record found in the directory, then pass the parsed result
# through `repair()`; the outcome is kept in `res`.
res <- read_all("../inst/records/") %>%
  repair()
|
|
|
``` |
|
|
|
We also used `repair` to fix a bunch of formatting issues in the records and unpacked |
|
|
|
the result into more descriptive variables. |
|
|
|
|
|
|
|
For development purposes, we load the tables from csv files. |
|
|
|
```{r} |
|
|
|
# Load the tables from the csv directory shipped with the package sources.
res <- read_from_csv("../inst/csv/")
|
|
|
``` |
|
|
|
We then unpack our tibbles into separate variables: |
|
|
|
```{r} |
|
|
|
# Give each table stored in `res` a short top-level name of its own.
applause <- res$applause
comments <- res$comments
speaker <- res$speaker
speeches <- res$speeches
talks <- res$talks
|
|
|
``` |
|
|
|
|
|
|
|
## Analysis |
|
|
|
|
|
|
|
Now we can start analysing our parsed dataset, e.g. find out which party gives the most talks: |
|
|
|
```{r, fig.width=7} |
|
|
|
# Attach speaker information to every speech, count the speeches per
# fraction (ascending) and pass the tally to the package's bar plot helper.
res$speeches %>%
  join_speaker(res) %>%
  group_by(fraction) %>%
  summarise(n = n()) %>%
  arrange(n) %>%
  bar_plot_fractions(
    title = "Number of speeches given by fraction",
    ylab = "Number of speeches"
  )
|
|
|
``` |
|
|
|
|
|
|
|
or count the occurrences of a given word: |
|
|
|
|
|
|
|
```{r, fig.width=7} |
|
|
|
# Keep only the rows in which "Kohleausstieg" occurs at least once, attach the
# speaker information and count those rows per fraction (most frequent first).
# Rows without a fraction are dropped before counting.
# NOTE: the column really is spelled `occurences` in this pipeline; do not
# "correct" it here without renaming it at its source as well.
find_word(res, "Kohleausstieg") %>%
  filter(occurences > 0) %>%
  join_speaker(res) %>%
  select(content, fraction) %>%
  filter(!is.na(fraction)) %>%
  group_by(fraction) %>%
  summarize(n = n()) %>%
  arrange(desc(n)) %>%
  bar_plot_fractions(
    title = "Parties using the word 'Kohleausstieg' the most (absolutely)",
    ylab = "Number of uses of 'Kohleausstieg'",
    # Spelled out as FALSE: `F` is an ordinary variable that can be reassigned.
    flipped = FALSE,
    rotatelab = TRUE
  )
|
|
|
``` |
|
|
|
|
|
|
|
### Who gives the most speeches? |
|
|
|
|
|
|
|
```{r} |
|
|
|
# Rank speakers by the number of speeches they gave, add the matching rows of
# the speaker table (speeches$speaker == speaker$id) and show the top ten.
res$speeches %>%
  group_by(speaker) %>%
  summarise(n = n()) %>%
  arrange(desc(n)) %>%
  left_join(res$speaker, by = c(speaker = "id")) %>%
  head(n = 10)
|
|
|
``` |
|
|
|
|
|
|
|
### Who talks the longest? |
|
|
|
|
|
|
|
```{r} |
|
|
|
# Average talk length (in characters of `content`) per speaker, longest
# first; the ten leading speakers are shown together with their speaker data.
res$talks %>%
  group_by(speaker) %>%
  summarise(avg_content_len = mean(str_length(content))) %>%
  arrange(desc(avg_content_len)) %>%
  left_join(res$speaker, by = c(speaker = "id")) %>%
  head(n = 10)
|
|
|
``` |
|
|
|
|
|
|
|
### Which party gives the most applause to which parties? |
|
|
|
|
|
|
|
```{r} |
|
|
|
# For every applauded fraction (the fraction of `on_speaker`), add up the
# logical per-fraction columns of the applause table.
# NOTE(review): presumably each logical column flags whether that fraction
# joined the applause -- confirm against the parser.
tb <- res$applause %>%
  left_join(res$speaker, by = c(on_speaker = "id")) %>%
  select(on_fraction = fraction, where(is.logical)) %>%
  group_by(on_fraction) %>%
  arrange(on_fraction) %>%
  summarise(
    AfD = sum(AfD),
    `BÜNDNIS 90 / DIE GRÜNEN` = sum(BUENDNIS_90_DIE_GRUENEN),
    `CDU/CSU` = sum(CDU_CSU),
    `DIE LINKE` = sum(DIE_LINKE),
    FDP = sum(FDP),
    SPD = sum(SPD)
  )
|
|
|
``` |
|
|
|
|
|
|
|
For plotting our results we reorganize them a bit and produce a bar plot: |
|
|
|
|
|
|
|
```{r, fig.width=7} |
|
|
|
# Reshape the summary to long format: one row per (on_fraction, by_fraction)
# pair, with the former column names in `by_fraction` and the counts in
# `value`.
# The pivot arguments are passed by name on purpose: since tidyr 1.2.0 `...`
# directly follows `cols`, so positional strings no longer reach `names_to`
# and the `by_fraction` column would be missing.
tb %>%
  pivot_longer(
    where(is.numeric),
    names_to = "by_fraction",
    values_to = "value"
  ) %>%
  # Drop the group of rows whose applauded fraction is unknown.
  filter(!is.na(on_fraction)) %>%
  bar_plot_fractions(
    x_variable = on_fraction,
    y_variable = value,
    fill = by_fraction,
    title = "Number of rounds of applauses from fractions to fractions",
    xlab = "Applauded fraction",
    ylab = "Rounds of applauses",
    filllab = "Applauding fraction",
    flipped = FALSE,
    rotatelab = TRUE
  )
|
|
|
``` |
|
|
|
|
|
|
|
|
|
|
|
### Which party comments the most on which parties? |
|
|
|
|
|
|
|
```{r} |
|
|
|
# Count, for every commented fraction, how many comments mention each
# commenting fraction.
# Both joined tables carry a `fraction` column, so the join suffixes them:
# `fraction.x` comes from the comments table, `fraction.y` from the speaker
# table (the fraction of `on_speaker`).
# `str_detect()` yields NA for a missing `by_fraction`; `na.rm = TRUE` keeps
# those rows from turning the whole sum into NA. TRUE is spelled out because
# `T` is an ordinary variable that can be reassigned.
tb <- res$comments %>%
  left_join(res$speaker, by = c("on_speaker" = "id")) %>%
  select(by_fraction = fraction.x, on_fraction = fraction.y) %>%
  group_by(on_fraction) %>%
  summarize(
    `AfD` = sum(str_detect(by_fraction, "AfD"), na.rm = TRUE),
    `BÜNDNIS 90 / DIE GRÜNEN` = sum(
      str_detect(by_fraction, "BÜNDNIS 90/DIE GRÜNEN"),
      na.rm = TRUE
    ),
    `CDU/CSU` = sum(str_detect(by_fraction, "CDU/CSU"), na.rm = TRUE),
    `DIE LINKE` = sum(str_detect(by_fraction, "DIE LINKE"), na.rm = TRUE),
    `FDP` = sum(str_detect(by_fraction, "FDP"), na.rm = TRUE),
    `SPD` = sum(str_detect(by_fraction, "SPD"), na.rm = TRUE)
  )
|
|
|
``` |
|
|
|
Analogously we plot the results: |
|
|
|
|
|
|
|
```{r, fig.width=7} |
|
|
|
# Reshape the summary to long format: one row per (on_fraction, by_fraction)
# pair, with the former column names in `by_fraction` and the counts in
# `value`.
# The pivot arguments are passed by name on purpose: since tidyr 1.2.0 `...`
# directly follows `cols`, so positional strings no longer reach `names_to`
# and the `by_fraction` column would be missing.
tb %>%
  pivot_longer(
    where(is.numeric),
    names_to = "by_fraction",
    values_to = "value"
  ) %>%
  # Drop the group of rows whose commented fraction is unknown.
  filter(!is.na(on_fraction)) %>%
  bar_plot_fractions(
    x_variable = on_fraction,
    y_variable = value,
    fill = by_fraction,
    title = "Number of comments from fractions to fractions",
    xlab = "Commented fraction",
    ylab = "Number of comments",
    filllab = "Commenting fraction",
    flipped = FALSE,
    rotatelab = TRUE
  )
|
|
|
``` |
|
|
|
|
|
|
|
### When are which topics discussed the most? |
|
|
|
|
|
|
|
```{r, fig.width=7} |
|
|
|
# One regular expression per topic; the leading "(?i)" makes the match
# case-insensitive.
pandemic_pattern <- "(?i)virus|corona|covid|lockdown"
climate_pattern <- "(?i)klimawandel|erderwärmung|co2|treibhaus|methan|kyoto-protokoll|klimaabkommen"
pension_pattern <- "(?i)rente|pension|altersarmut"

# Plot the per-date counts as points, one colour per named pattern.
word_usage_by_date(
  res,
  c(
    pandemic = pandemic_pattern,
    climate = climate_pattern,
    pension = pension_pattern
  )
) %>%
  ggplot(aes(x = date, y = count, color = pattern)) +
  geom_point() +
  labs(
    x = "date of session",
    y = "occurence of word per session",
    color = "Topic"
  )
|
|
|
``` |