An R package to analyze the parliamentary records of the 19th legislative period of the Bundestag, the German parliament.
Вы не можете выбрать более 25 тем Темы должны начинаться с буквы или цифры, могут содержать дефисы(-) и должны содержать не более 35 символов.

182 строки
5.5KB

  1. ---
  2. title: "funwithdata"
  3. output: rmarkdown::html_vignette
  4. vignette: >
  5. %\VignetteIndexEntry{funwithdata}
  6. %\VignetteEngine{knitr::rmarkdown}
  7. %\VignetteEncoding{UTF-8}
  8. ---
  9. ```{r, include = FALSE}
  10. knitr::opts_chunk$set(
  11. collapse = TRUE, # knitr option: merge a chunk's source and its output into one block
  12. comment = "#>" # knitr option: prefix every line of printed output with "#>"
  13. )
  14. ```
  15. ```{r setup}
  16. library(hateimparlament)
  17. library(dplyr)
  18. library(ggplot2)
  19. library(stringr)
  20. library(tidyr)
  21. ```
  22. ## Preparation of data
  23. First, you need to download all records of the current legislative period.
  24. ```r
  25. fetch_all("../inst/records/") # path to directory where records should be stored
  26. ```
  27. Second, those `.xml` files need to be parsed into `R` `tibbles`. This is accomplished by:
  28. ```r
  29. res <- read_all("../inst/records/") %>% repair() # parse the records, then repair them
  30. ```
  31. We also used `repair` to fix a bunch of formatting issues in the records and unpacked
  32. the result into more descriptive variables.
  33. For development purposes, we load the tables from csv files.
  34. ```{r}
  35. res <- read_from_csv("../inst/csv/") # load the tables from the csv files instead of re-parsing
  36. ```
  37. and unpack our tibbles
  38. ```{r}
  39. comments <- res[["comments"]] # pull each table out of `res` by its exact name
  40. speeches <- res[["speeches"]]
  41. speaker <- res[["speaker"]]
  42. talks <- res[["talks"]]
  43. applause <- res[["applause"]]
  44. ```
  45. ## Analysis
  46. Now we can start analysing our parsed dataset, e.g. find out which party gives the most talks:
  47. ```{r, fig.width=7}
  48. res$speeches %>%
  49.   join_speaker(res) %>%
  50.   count(fraction, name = "n") %>% # one row per fraction, n = number of speeches
  51.   arrange(n) %>%
  52.   bar_plot_fractions(title = "Number of speeches given by fraction",
  53.                      ylab = "Number of speeches")
  54. ```
  55. or count the occurrences of a given word:
  56. ```{r, fig.width=7}
  57. find_word(res, "Kohleausstieg") %>%
  58.   filter(occurences > 0) %>% # keep only rows with at least one hit (column name spelled as in the data)
  59.   join_speaker(res) %>%
  60.   select(content, fraction) %>%
  61.   filter(!is.na(fraction)) %>% # drop rows that have no fraction
  62.   group_by(fraction) %>%
  63.   summarize(n = n()) %>% # number of matching rows per fraction
  64.   arrange(desc(n)) %>%
  65.   bar_plot_fractions(title = "Parties using the word 'Kohleausstieg' the most (absolutely)",
  66.                      ylab = "Number of uses of 'Kohleausstieg'",
  67.                      flipped = FALSE, # spelled out: `F` is an ordinary variable that can be reassigned
  68.                      rotatelab = TRUE)
  69. ```
  70. ### Who gives the most speeches?
  71. ```{r}
  72. res$speeches %>%
  73.   group_by(speaker) %>%
  74.   summarise(n = n()) %>% # speeches per speaker id
  75.   arrange(desc(n)) %>% # most speeches first
  76.   left_join(res$speaker, by = c(speaker = "id")) %>%
  77.   head(n = 10)
  78. ```
  79. ### Who talks the longest?
  80. ```{r}
  81. res$talks %>%
  82.   group_by(speaker) %>%
  83.   summarise(avg_content_len = mean(str_length(content))) %>% # mean characters per talk
  84.   arrange(desc(avg_content_len)) %>% # longest average first
  85.   left_join(res$speaker,
  86.             by = c(speaker = "id")) %>%
  87.   head(n = 10)
  88. ```
  89. ### Which party gives the most applause to which parties?
  90. ```{r}
  91. tb <- res$applause %>%
  92.   left_join(res$speaker, by = c(on_speaker = "id")) %>%
  93.   select(on_fraction = fraction, where(is.logical)) %>% # applauded fraction plus the logical columns
  94.   group_by(on_fraction) %>%
  95.   arrange(on_fraction) %>%
  96.   summarise(AfD = sum(AfD), # summing a logical column counts its TRUE entries
  97.             `BÜNDNIS 90 / DIE GRÜNEN` = sum(BUENDNIS_90_DIE_GRUENEN),
  98.             `CDU/CSU` = sum(CDU_CSU),
  99.             `DIE LINKE` = sum(DIE_LINKE),
  100.             FDP = sum(FDP),
  101.             SPD = sum(SPD))
  102. ```
  103. For plotting our results we reorganize them a bit and produce a bar plot:
  104. ```{r, fig.width=7}
  105. pivot_longer(tb, where(is.numeric), names_to = "by_fraction", values_to = "value") %>% # named args: no positional matching
  106.   filter(!is.na(on_fraction)) %>% # drop rows without an applauded fraction
  107.   bar_plot_fractions(x_variable = on_fraction,
  108.                      y_variable = value, # the values_to column created by pivot_longer() above
  109.                      fill = by_fraction, # the names_to column created by pivot_longer() above
  110.                      title = "Number of rounds of applauses from fractions to fractions",
  111.                      xlab = "Applauded fraction",
  112.                      ylab = "Rounds of applauses",
  113.                      filllab = "Applauding fraction",
  114.                      flipped = FALSE,
  115.                      rotatelab = TRUE)
  116. ```
  117. ### Which party comments the most on which parties?
  118. ```{r}
  119. tb <- res$comments %>%
  120.   left_join(res$speaker, by = c("on_speaker" = "id")) %>%
  121.   select(by_fraction = fraction.x, on_fraction = fraction.y) %>% # .x comes from comments, .y from the joined speaker table
  122.   group_by(on_fraction) %>%
  123.   summarize(`AfD` = sum(str_detect(by_fraction, "AfD"), na.rm = TRUE), # count matches; NA fractions are ignored
  124.             `BÜNDNIS 90 / DIE GRÜNEN` = sum(str_detect(by_fraction, "BÜNDNIS 90/DIE GRÜNEN"), na.rm = TRUE),
  125.             `CDU/CSU` = sum(str_detect(by_fraction, "CDU/CSU"), na.rm = TRUE),
  126.             `DIE LINKE` = sum(str_detect(by_fraction, "DIE LINKE"), na.rm = TRUE),
  127.             `FDP` = sum(str_detect(by_fraction, "FDP"), na.rm = TRUE),
  128.             `SPD` = sum(str_detect(by_fraction, "SPD"), na.rm = TRUE))
  129. ```
  130. Analogously we plot the results:
  131. ```{r, fig.width=7}
  132. pivot_longer(tb, where(is.numeric), names_to = "by_fraction", values_to = "value") %>% # named args: no positional matching
  133.   filter(!is.na(on_fraction)) %>% # drop rows without a commented-on fraction
  134.   bar_plot_fractions(x_variable = on_fraction,
  135.                      y_variable = value, # the values_to column created by pivot_longer() above
  136.                      fill = by_fraction, # the names_to column created by pivot_longer() above
  137.                      title = "Number of comments from fractions to fractions",
  138.                      xlab = "Commented fraction",
  139.                      ylab = "Number of comments",
  140.                      filllab = "Commenting fraction",
  141.                      flipped = FALSE,
  142.                      rotatelab = TRUE)
  143. ```
  144. ### When are which topics discussed the most?
  145. ```{r, fig.width=7}
  146. pandemic_pattern <- "(?i)virus|corona|covid|lockdown" # (?i) makes the match case-insensitive
  147. climate_pattern <- "(?i)klimawandel|erderwärmung|co2|treibhaus|methan|kyoto-protokoll|klimaabkommen"
  148. pension_pattern <- "(?i)rente|pension|altersarmut"
  149. word_usage_by_date(
  150.   res,
  151.   c(pandemic = pandemic_pattern, climate = climate_pattern, pension = pension_pattern)
  152. ) %>%
  153.   ggplot(aes(x = date, y = count, color = pattern)) +
  154.   geom_point() +
  155.   labs(x = "date of session", y = "occurence of word per session",
  156.        color = "Topic")
  157. ```