An R package to analyze the parliamentary records of the 19th legislative period of the Bundestag, the German parliament.
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

175 line
5.2KB

  1. ---
  2. title: "funwithdata"
  3. output: rmarkdown::html_vignette
  4. vignette: >
  5. %\VignetteIndexEntry{funwithdata}
  6. %\VignetteEngine{knitr::rmarkdown}
  7. %\VignetteEncoding{UTF-8}
  8. ---
  9. ```{r, include = FALSE}
  10. knitr::opts_chunk$set(
  11. collapse = TRUE,
  12. comment = "#>"
  13. )
  14. ```
  15. ```{r setup}
  16. library(hateimparlament)
  17. library(dplyr)
  18. library(ggplot2)
  19. library(stringr)
  20. library(tidyr)
  21. ```
  22. ## Preparation of data
  23. First, you need to download all records of the current legislative period.
  24. ```r
  25. fetch_all("../records/") # path to directory where records should be stored
  26. ```
  27. Second, those `.xml` files need to be parsed into `R` tibbles. This is accomplished by:
  28. ```r
  29. res <- read_all("../records/") %>% repair()
  30. ```
  31. We also used `repair` to fix a bunch of formatting issues in the records and unpacked
  32. the result into more descriptive variables.
  33. For development purposes, we load the tables from csv files.
  34. ```{r}
  35. res <- read_from_csv("../csv/")  # development shortcut: load the tables from csv files
  36. ```
  37. and unpack the individual tibbles into their own variables:
  38. ```{r}
  39. comments <- res$comments  # comments table; has `on_redner` and `fraktion` columns
  40. reden <- res$reden  # speeches table; `redner` matches `id` in the redner table
  41. redner <- res$redner  # speaker table; has `id` and `fraktion` columns
  42. talks <- res$talks  # talks table; has `redner` and the text in `content`
  43. ```
  44. ## Analysis
  45. Now we can start analysing our parsed dataset, e.g. find out which party gives the most talks:
  46. ```{r, fig.width=7}
  47. res$reden %>% join_redner(res) %>%  # same call as join_redner(res$reden, res)
  48. group_by(fraktion) %>%
  49. summarise(n = n()) %>%  # number of rows (speeches) per fraktion
  50. arrange(n) %>%
  51. bar_plot_fraktionen(n, fill = fraktion, ylab = "Number of speeches", title = "Number of speeches given by fraction")
  52. ```
  53. or count the occurrences of a given word:
  54. ```{r, fig.width=7}
  55. res %>% find_word("Kohleausstieg") %>%
  56. filter(occurences > 0) %>%  # keep only rows in which the word appears
  57. join_redner(res) %>%
  58. select(content, fraktion) %>%
  59. filter(!is.na(fraktion)) %>%
  60. group_by(fraktion) %>%
  61. tally() %>%  # n = rows per fraktion; NOTE(review): counts rows, not sum(occurences)
  62. arrange(-n) %>%
  63. bar_plot_fraktionen(n, fill = fraktion,
  64. ylab = "Number of uses of 'Kohleausstieg'",
  65. title = "Parties using the word 'Kohleausstieg' the most (absolutely)")
  66. ```
  67. ### Who gives the most speeches?
  68. ```{r}
  69. res$reden %>%
  70. group_by(redner) %>%
  71. summarise(n = n()) %>%  # number of rows per speaker id
  72. arrange(desc(n)) %>%
  73. left_join(res$redner, by = c(redner = "id")) %>%  # attach the speaker's details
  74. slice_head(n = 10)  # top ten
  75. ```
  76. ### Who talks the longest?
  77. ```{r}
  78. res$talks %>%
  79. mutate(n_chars = str_length(content)) %>%  # length of each talk in characters
  80. group_by(redner) %>%
  81. summarise(avg_content_len = mean(n_chars)) %>%  # mean length per speaker id
  82. arrange(desc(avg_content_len)) %>%
  83. left_join(res$redner, by = c(redner = "id")) %>%  # attach the speaker's details
  84. slice_head(n = 10)  # top ten
  85. ```
  86. ### Which party gives the most applause to which parties?
  87. ```{r}
  88. tb <- res$applause %>%
  89. left_join(res$redner, by = c(on_redner = "id")) %>%  # fraktion of the applauded speaker
  90. select(on_fraktion = fraktion, where(is.logical)) %>%  # keep the logical columns summed below
  91. arrange(on_fraktion) %>%
  92. group_by(on_fraktion) %>%
  93. summarise(AfD = sum(AfD),
  94. `BÜNDNIS 90 / DIE GRÜNEN` = sum(`BÜNDNIS_90_DIE_GRÜNEN`),
  95. `CDU/CSU` = sum(CDU_CSU),
  96. `DIE LINKE` = sum(DIE_LINKE),
  97. FDP = sum(FDP),
  98. SPD = sum(SPD))  # each sum counts the TRUE entries of that column
  99. ```
  100. For plotting our results we reorganize them a bit and produce a bar plot:
  101. ```{r, fig.width=7}
  102. pivot_longer(tb, where(is.numeric), names_to = "by_fraktion", values_to = "value") %>%
  103. filter(!is.na(on_fraktion)) %>%  # drop the group without a known fraktion
  104. rename(fraktion = on_fraktion) %>%
  105. bar_plot_fraktionen(value,  # `value` is the count column created by pivot_longer()
  106. fill = by_fraktion,  # `by_fraktion` holds the former column names of tb
  107. title = "Number of rounds of applauses from fractions to fractions",
  108. xlab = "Applauded fraction",
  109. ylab = "Rounds of applauses",
  110. filllab = "Applauding fraction")
  111. ```
  112. ### Which party comments the most on which parties?
  113. ```{r}
  114. tb <- res$comments %>%
  115. left_join(res$redner, by = c("on_redner" = "id")) %>%  # both tables carry `fraktion`, hence .x / .y
  116. select(by_fraktion = fraktion.x, on_fraktion = fraktion.y) %>%  # .x: comments table, .y: speaker table
  117. group_by(on_fraktion) %>%
  118. summarize(`AfD` = sum(str_detect(by_fraktion, "AfD"), na.rm = TRUE),
  119. `BÜNDNIS 90 / DIE GRÜNEN` = sum(str_detect(by_fraktion, "BÜNDNIS 90/DIE GRÜNEN"), na.rm = TRUE),
  120. `CDU/CSU` = sum(str_detect(by_fraktion, "CDU/CSU"), na.rm = TRUE),
  121. `DIE LINKE` = sum(str_detect(by_fraktion, "DIE LINKE"), na.rm = TRUE),
  122. `FDP` = sum(str_detect(by_fraktion, "FDP"), na.rm = TRUE),
  123. `SPD` = sum(str_detect(by_fraktion, "SPD"), na.rm = TRUE))  # na.rm: skip rows with NA by_fraktion
  124. ```
  125. Analogously we plot the results:
  126. ```{r, fig.width=7}
  127. pivot_longer(tb, where(is.numeric), names_to = "by_fraktion", values_to = "value") %>%
  128. filter(!is.na(on_fraktion)) %>%  # drop the group without a known fraktion
  129. rename(fraktion = on_fraktion) %>%
  130. bar_plot_fraktionen(value,  # `value` is the count column created by pivot_longer()
  131. fill = by_fraktion,  # `by_fraktion` holds the former column names of tb
  132. title = "Number of comments from fractions to fractions",
  133. xlab = "Commented fraction",
  134. ylab = "Number of comments",
  135. filllab = "Commenting fraction")
  136. ```
  137. ### When are which topics discussed the most?
  138. ```{r, fig.width=7}
  139. pandemic_pattern <- "(?i)virus|corona|covid|lockdown"  # one alternation pattern per topic
  140. climate_pattern <- "(?i)klimawandel|erderwärmung|co2|treibhaus|methan|kyoto-protokoll|klimaabkommen"
  141. pension_pattern <- "(?i)rente|pension|altersarmut"
  142. res %>% word_usage_by_date(c(pandemic = pandemic_pattern,  # vector names label the topics
  143. climate = climate_pattern,
  144. pension = pension_pattern)) %>%
  145. ggplot(aes(date, count, colour = pattern)) +
  146. geom_point() +
  147. labs(x = "date of session",
  148. y = "occurence of word per session",
  149. color = "Topic")
  150. ```