An R package to analyze the parliamentary records of the 19th legislative period of the Bundestag, the German parliament.
Você não pode selecionar mais de 25 tópicos Os tópicos devem começar com uma letra ou um número, podem incluir traços ('-') e podem ter até 35 caracteres.

178 linhas
5.4KB

  1. ---
  2. title: "funwithdata"
  3. output: rmarkdown::html_vignette
  4. vignette: >
  5. %\VignetteIndexEntry{funwithdata}
  6. %\VignetteEngine{knitr::rmarkdown}
  7. %\VignetteEncoding{UTF-8}
  8. ---
  9. ```{r, include = FALSE}
  10. knitr::opts_chunk$set(
  11. comment = "#>", # prefix for printed output lines
  12. collapse = TRUE # show a chunk's code and its output in one block
  13. )
  14. ```
  15. ```{r setup}
  16. library(hateimparlament)
  17. library(dplyr)
  18. library(ggplot2)
  19. library(stringr)
  20. library(tidyr)
  21. ```
  22. ## Preparation of data
  23. First, you need to download all records of the current legislative period.
  24. ```r
  25. fetch_all("../records/") # argument: path to the directory where the downloaded records should be stored
  26. ```
  27. Second, those `.xml` files need to be parsed into `R` `tibbles`. This is accomplished by:
  28. ```r
  29. res <- read_all("../records/") %>% repair() # parse the .xml records, then fix their formatting issues
  30. ```
  31. We also used `repair` to fix a bunch of formatting issues in the records and unpacked
  32. the result into more descriptive variables.
  33. For development purposes, we load the tables from csv files.
  34. ```{r}
  35. res <- read_from_csv("../csv/") # development shortcut: load the tables from csv files
  36. ```
  37. and unpack our tibbles:
  38. ```{r}
  39. comments <- res$comments # shorthand names for the tables held in `res`
  40. speeches <- res$speeches
  41. speaker <- res$speaker
  42. talks <- res$talks # NOTE(review): the chunks visible below still read res$... directly; confirm these shorthands are used
  43. ```
  44. ## Analysis
  45. Now we can start analysing our parsed dataset, e.g. find out which party gives the most talks:
  46. ```{r, fig.width=7}
  47. res$speeches %>%
  48. join_speaker(res) %>%
  49. count(fraction, name = "n") %>% # one row per fraction with its number of speeches
  50. arrange(n) %>%
  51. bar_plot_fractions(title = "Number of speeches given by fraction",
  52. ylab = "Number of speeches")
  53. ```
  54. or count the occurrences of a given word:
  55. ```{r, fig.width=7}
  56. find_word(res, "Kohleausstieg") %>%
  57. filter(occurences > 0) %>% # keep only rows with at least one match; `occurences` is presumably the column find_word() adds
  58. join_speaker(res) %>%
  59. select(content, fraction) %>%
  60. filter(!is.na(fraction)) %>% # drop rows that have no fraction
  61. group_by(fraction) %>%
  62. summarize(n = n()) %>% # n = number of matching rows per fraction
  63. arrange(desc(n)) %>%
  64. bar_plot_fractions(title = "Parties using the word 'Kohleausstieg' the most (absolutely)",
  65. ylab = "Number of uses of 'Kohleausstieg'",
  66. flipped = FALSE) # FALSE spelled out: `F` is an ordinary variable that can be reassigned
  67. ```
  68. ### Who gives the most speeches?
  69. ```{r}
  70. res$speeches %>%
  71. count(speaker, name = "n") %>% # number of speeches per speaker id
  72. arrange(desc(n)) %>%
  73. left_join(res$speaker,
  74. by = c("speaker" = "id")) %>% # attach the speaker details to each id
  75. head(n = 10)
  76. ```
  77. ### Who talks the longest?
  78. ```{r}
  79. res$talks %>%
  80. group_by(speaker) %>%
  81. summarise(avg_content_len = mean(str_length(content))) %>% # mean number of characters per talk
  82. arrange(desc(avg_content_len)) %>%
  83. left_join(res$speaker,
  84. by = c("speaker" = "id")) %>% # attach the speaker details to each id
  85. head(n = 10)
  86. ```
  87. ### Which party gives the most applause to which parties?
  88. ```{r}
  89. tb <- res$applause %>%
  90. left_join(res$speaker, by = c("on_speaker" = "id")) %>%
  91. select(on_fraction = fraction, where(is.logical)) %>% # fraction of the applauded speaker plus the logical flag columns
  92. group_by(on_fraction) %>%
  93. summarise( # one row per applauded fraction, one count column per applauding fraction
  94. `AfD` = sum(AfD),
  95. `BÜNDNIS 90 / DIE GRÜNEN` = sum(BUENDNIS_90_DIE_GRUENEN),
  96. `CDU/CSU` = sum(CDU_CSU),
  97. `DIE LINKE` = sum(DIE_LINKE),
  98. `FDP` = sum(FDP),
  99. `SPD` = sum(SPD))
  100. ```
  101. For plotting our results we reorganize them a bit and produce a bar plot:
  102. ```{r, fig.width=7}
  103. pivot_longer(tb, where(is.numeric), names_to = "by_fraction", values_to = "value") %>% # options named: passed positionally they do not reach names_to/values_to reliably
  104. filter(!is.na(on_fraction)) %>% # drop the row for speakers without a fraction
  105. bar_plot_fractions(x_variable = on_fraction,
  106. y_variable = value, # the values_to column created above
  107. fill = by_fraction, # the names_to column created above
  108. title = "Number of rounds of applauses from fractions to fractions",
  109. xlab = "Applauded fraction",
  110. ylab = "Rounds of applauses",
  111. filllab = "Applauding fraction",
  112. flipped = FALSE)
  113. ```
  114. ### Which party comments the most on which parties?
  115. ```{r}
  116. tb <- res$comments %>%
  117. left_join(res$speaker, by = c("on_speaker" = "id")) %>%
  118. select(by_fraction = fraction.x, on_fraction = fraction.y) %>% # .x comes from res$comments, .y from the joined speaker table
  119. group_by(on_fraction) %>%
  120. summarize(`AfD` = sum(str_detect(by_fraction, "AfD"), na.rm = TRUE), # TRUE spelled out: `T` can be reassigned
  121. `BÜNDNIS 90 / DIE GRÜNEN` = sum(str_detect(by_fraction, "BÜNDNIS 90/DIE GRÜNEN"), na.rm = TRUE),
  122. `CDU/CSU` = sum(str_detect(by_fraction, "CDU/CSU"), na.rm = TRUE),
  123. `DIE LINKE` = sum(str_detect(by_fraction, "DIE LINKE"), na.rm = TRUE),
  124. `FDP` = sum(str_detect(by_fraction, "FDP"), na.rm = TRUE),
  125. `SPD` = sum(str_detect(by_fraction, "SPD"), na.rm = TRUE)) # na.rm skips comments whose by_fraction is NA
  126. ```
  127. Analogously we plot the results:
  128. ```{r, fig.width=7}
  129. pivot_longer(tb, where(is.numeric), names_to = "by_fraction", values_to = "value") %>% # options named: passed positionally they do not reach names_to/values_to reliably
  130. filter(!is.na(on_fraction)) %>% # drop the row for speakers without a fraction
  131. bar_plot_fractions(x_variable = on_fraction,
  132. y_variable = value, # the values_to column created above
  133. fill = by_fraction, # the names_to column created above
  134. title = "Number of comments from fractions to fractions",
  135. xlab = "Commented fraction",
  136. ylab = "Number of comments",
  137. filllab = "Commenting fraction",
  138. flipped = FALSE)
  139. ```
  140. ### When are which topics discussed the most?
  141. ```{r, fig.width=7}
  142. pandemic_pattern <- "(?i)virus|corona|covid|lockdown" # (?i) makes each pattern case-insensitive
  143. climate_pattern <- "(?i)klimawandel|erderwärmung|co2|treibhaus|methan|kyoto-protokoll|klimaabkommen"
  144. pension_pattern <- "(?i)rente|pension|altersarmut"
  145. word_usage_by_date(res, c(pandemic = pandemic_pattern,
  146. climate = climate_pattern,
  147. pension = pension_pattern)) %>%
  148. ggplot(mapping = aes(x = date, y = count, color = pattern)) +
  149. geom_point() +
  150. labs(x = "date of session",
  151. y = "occurence of word per session",
  152. color = "Topic")
  153. ```