|
- ---
- title: "funwithdata"
- output: rmarkdown::html_vignette
- vignette: >
- %\VignetteIndexEntry{funwithdata}
- %\VignetteEngine{knitr::rmarkdown}
- %\VignetteEncoding{UTF-8}
- ---
-
- ```{r, include = FALSE}
- # Global knitr chunk options used by every chunk in this vignette.
- knitr::opts_chunk$set(collapse = TRUE, comment = "#>")
- ```
-
- ```{r setup}
- library(hateimparlament)
- library(dplyr)
- library(ggplot2)
- library(stringr)
- library(tidyr)
- ```
-
- ## Preparation of data
-
- First, you need to download all records of the current legislative period.
- ```r
- # The argument is the path to the directory where the records should be stored.
- fetch_all("../inst/records/")
- ```
- Second, those `.xml` files need to be parsed into `R` tibbles. This is accomplished by:
- ```r
- # Read the downloaded records, pass them through `repair()`, and keep the
- # result as `res`. Left-assignment keeps the target visible at the start.
- res <- read_all("../inst/records/") %>%
-   repair()
- ```
- We also used `repair` to fix a bunch of formatting issues in the records and unpacked
- the result into more descriptive variables.
-
- For development purposes, we load the tables from csv files.
- ```{r}
- # Load the tables from the csv directory instead of parsing the records.
- res <- read_from_csv("../inst/csv/")
- ```
- and unpack our tibbles
- ```{r}
- # Pull the individual tables out of `res` into their own variables.
- speaker  <- res$speaker
- speeches <- res$speeches
- talks    <- res$talks
- comments <- res$comments
- applause <- res$applause
- ```
-
- ## Analysis
-
- Now we can start analysing our parsed dataset, e.g. find out which party gives the most speeches:
- ```{r, fig.width=7}
- # Attach speaker information to each speech, count the speeches per
- # fraction, sort ascending by that count, and hand the table to the plot helper.
- res$speeches %>%
-   join_speaker(res) %>%
-   group_by(fraction) %>%
-   summarise(n = n()) %>%
-   arrange(n) %>%
-   bar_plot_fractions(
-     title = "Number of speeches given by fraction",
-     ylab = "Number of speeches"
-   )
- ```
-
- or count the occurrences of a given word:
-
- ```{r, fig.width=7}
- # Keep only rows in which the word occurs, attach speaker information,
- # drop rows without a fraction, and count the remaining rows per fraction.
- find_word(res, "Kohleausstieg") %>%
-   filter(occurences > 0) %>%
-   join_speaker(res) %>%
-   select(content, fraction) %>%
-   filter(!is.na(fraction)) %>%
-   group_by(fraction) %>%
-   summarize(n = n()) %>%
-   arrange(desc(n)) %>%
-   bar_plot_fractions(
-     title = "Parties using the word 'Kohleausstieg' the most (absolutely)",
-     ylab = "Number of uses of 'Kohleausstieg'",
-     # Spelled out: `F` is an ordinary variable that can be reassigned.
-     flipped = FALSE,
-     rotatelab = TRUE
-   )
- ```
-
- ### Who gives the most speeches?
-
- ```{r}
- # Count speeches per speaker id, sort descending, attach the speaker
- # details, and show the first ten rows.
- res$speeches %>%
-   group_by(speaker) %>%
-   summarise(n = n()) %>%
-   arrange(desc(n)) %>%
-   left_join(res$speaker, by = c("speaker" = "id")) %>%
-   head(10)
- ```
-
- ### Who talks the longest?
-
- ```{r}
- # Mean string length of `content` per speaker id, sorted descending, with
- # the speaker details attached; show the first ten rows.
- res$talks %>%
-   group_by(speaker) %>%
-   summarise(avg_content_len = mean(str_length(content))) %>%
-   arrange(desc(avg_content_len)) %>%
-   left_join(res$speaker, by = c("speaker" = "id")) %>%
-   head(10)
- ```
-
- ### Which party gives the most applause to which parties?
-
- ```{r}
- # Attach the applauded speaker's fraction to every applause row, keep that
- # fraction plus the logical columns, and sum each per-fraction column for
- # every applauded fraction. The result is stored in `tb` for the plot below.
- tb <- res$applause %>%
-   left_join(res$speaker, by = c("on_speaker" = "id")) %>%
-   select(on_fraction = fraction, where(is.logical)) %>%
-   group_by(on_fraction) %>%
-   arrange(on_fraction) %>%
-   summarize(
-     "AfD" = sum(AfD),
-     "BÜNDNIS 90 / DIE GRÜNEN" = sum(BUENDNIS_90_DIE_GRUENEN),
-     "CDU/CSU" = sum(CDU_CSU),
-     "DIE LINKE" = sum(DIE_LINKE),
-     "FDP" = sum(FDP),
-     "SPD" = sum(SPD)
-   )
- ```
-
- For plotting our results we reorganize them a bit and produce a bar plot:
-
- ```{r, fig.width=7}
- # Reshape `tb` to long format: the numeric columns become rows, with the
- # former column name in `by_fraction` and the number in `value`.
- # `names_to` and `values_to` are passed by name. Passed positionally,
- # "count" is not bound to `values_to`, and the call depends on the tidyr
- # version. The value column is kept as "value" because the plot reads it.
- pivot_longer(
-   tb,
-   cols = where(is.numeric),
-   names_to = "by_fraction",
-   values_to = "value"
- ) %>%
-   filter(!is.na(on_fraction)) %>%
-   bar_plot_fractions(
-     x_variable = on_fraction,
-     y_variable = value,
-     fill = by_fraction,
-     title = "Number of rounds of applauses from fractions to fractions",
-     xlab = "Applauded fraction",
-     ylab = "Rounds of applauses",
-     filllab = "Applauding fraction",
-     flipped = FALSE,
-     rotatelab = TRUE
-   )
- ```
-
-
- ### Which party comments the most on which parties?
-
- ```{r}
- # Attach the commented-on speaker's fraction to every comment. The join
- # yields `fraction.x` (from the comments table) and `fraction.y` (from the
- # speaker table). For every commented-on fraction, count the comments whose
- # `by_fraction` matches each fraction name; NA matches are ignored.
- # The result is stored in `tb` for the plot below.
- tb <- res$comments %>%
-   left_join(res$speaker, by = c("on_speaker" = "id")) %>%
-   select(by_fraction = fraction.x, on_fraction = fraction.y) %>%
-   group_by(on_fraction) %>%
-   summarize(
-     `AfD` = sum(str_detect(by_fraction, "AfD"), na.rm = TRUE),
-     `BÜNDNIS 90 / DIE GRÜNEN` = sum(
-       str_detect(by_fraction, "BÜNDNIS 90/DIE GRÜNEN"),
-       na.rm = TRUE
-     ),
-     `CDU/CSU` = sum(str_detect(by_fraction, "CDU/CSU"), na.rm = TRUE),
-     `DIE LINKE` = sum(str_detect(by_fraction, "DIE LINKE"), na.rm = TRUE),
-     `FDP` = sum(str_detect(by_fraction, "FDP"), na.rm = TRUE),
-     `SPD` = sum(str_detect(by_fraction, "SPD"), na.rm = TRUE)
-   )
- ```
- Analogously we plot the results:
-
- ```{r, fig.width=7}
- # Reshape `tb` to long format: the numeric columns become rows, with the
- # former column name in `by_fraction` and the number in `value`.
- # `names_to` and `values_to` are passed by name. Passed positionally,
- # "count" is not bound to `values_to`, and the call depends on the tidyr
- # version. The value column is kept as "value" because the plot reads it.
- pivot_longer(
-   tb,
-   cols = where(is.numeric),
-   names_to = "by_fraction",
-   values_to = "value"
- ) %>%
-   filter(!is.na(on_fraction)) %>%
-   bar_plot_fractions(
-     x_variable = on_fraction,
-     y_variable = value,
-     fill = by_fraction,
-     title = "Number of comments from fractions to fractions",
-     xlab = "Commented fraction",
-     ylab = "Number of comments",
-     filllab = "Commenting fraction",
-     flipped = FALSE,
-     rotatelab = TRUE
-   )
- ```
-
- ### When are which topics discussed the most?
-
- ```{r, fig.width=7}
- # One regular expression per topic; `(?i)` makes the match case-insensitive.
- pandemic_pattern <- "(?i)virus|corona|covid|lockdown"
- climate_pattern <- "(?i)klimawandel|erderwärmung|co2|treibhaus|methan|kyoto-protokoll|klimaabkommen"
- pension_pattern <- "(?i)rente|pension|altersarmut"
-
- # Plot the result of `word_usage_by_date()` for the three named patterns:
- # `date` on the x axis, `count` on the y axis, coloured by `pattern`.
- word_usage_by_date(
-   res,
-   c(
-     pandemic = pandemic_pattern,
-     climate = climate_pattern,
-     pension = pension_pattern
-   )
- ) %>%
-   ggplot(aes(x = date, y = count, color = pattern)) +
-   geom_point() +
-   labs(
-     x = "date of session",
-     y = "occurence of word per session",
-     color = "Topic"
-   )
- ```
|