An R package to analyze the parliamentary records of the 19th legislative period of the Bundestag, the German parliament.
Nie możesz wybrać więcej, niż 25 tematów Tematy muszą się zaczynać od litery lub cyfry, mogą zawierać myślniki ('-') i mogą mieć do 35 znaków.

123 wiersze
4.4KB

  1. ---
  2. title: "funwithdata"
  3. output: rmarkdown::html_vignette
  4. vignette: >
  5. %\VignetteIndexEntry{funwithdata}
  6. %\VignetteEngine{knitr::rmarkdown}
  7. %\VignetteEncoding{UTF-8}
  8. ---
  9. ```{r, include = FALSE}
  # Chunk defaults for this vignette: collapse source and output of a chunk into
  # one block and prefix every line of printed output with "#>".
  knitr::opts_chunk$set(collapse = TRUE, comment = "#>")
  14. ```
  15. ```{r setup}
  16. library(hateimparlament)
  17. library(dplyr)
  18. library(stringr)
  19. library(ggplot2)
  20. ```
  21. ## Preparation of data
  22. First, you need to download all records of the current legislative period.
  23. ```r
  # The argument is the path to the directory where the records should be stored.
  fetch_all("../records/")
  25. ```
  26. Second, those `.xml` files need to be parsed into `R` tibbles. This is accomplished by:
  27. ```r
  # Parse the downloaded records, pass them through repair(), and unpack the
  # individual tables from the result.
  res <- repair(read_all("../records/"))
  talks <- res$talks
  redner <- res$redner
  reden <- res$reden
  32. ```
  33. We also used `repair` to fix a bunch of formatting issues in the records and unpacked
  34. the result into more descriptive variables.
  35. For development purposes, we load the tables from csv files.
  36. ```{r}
  # Load the tables from the CSV directory and unpack them into separate
  # variables.
  tables <- read_from_csv("../csv/")
  talks <- tables$talks
  redner <- tables$redner
  reden <- tables$reden
  comments <- tables$comments
  42. ```
  43. Further, we need to load a list of words that were used by Hitler but do not occur in standard German texts.
  44. ```{r}
  # Read the word list, one word per line, into a one-column tibble `Worte`.
  fil <- file('../hitler_texts/hitler_words')
  Worte <- readLines(fil)
  # readLines() opens an unopened connection only for the duration of the call
  # and does not destroy it afterwards; close it explicitly so it does not
  # linger until garbage collection ("closing unused connection" warning).
  close(fil)
  hitlerwords <- tibble(Worte)
  48. ```
  49. ## Analysis
  50. Now we extract the words that one party used with a higher frequency than parliament as a whole and compare them with `hitlerwords`.
  51. ```{r}
  # Attach the speaker data to every talk, then concatenate the contents of all
  # talks of one fraktion into a single newline-separated string.
  talks_by_fraktion <- talks %>%
    left_join(redner, by = c(redner = "id")) %>%
    group_by(fraktion) %>%
    summarize(full_text = str_c(content, collapse = "\n"))
  talks_by_fraktion
  57. ```
  58. For each party, we want to get a tibble of its words together with their frequencies.
  59. ```{r}
  # Tabulate the words of a single text.
  #
  # `text` is one character string. It is split into runs of ASCII letters,
  # German umlauts and "ß"; the result has one row per distinct word with its
  # count `n` and its share `freq` of all words found in `text`.
  #
  # The result deliberately stays grouped by `Worte` (count() on a grouped
  # tibble keeps the grouping), because the later transmute() on `afd_words`
  # relies on the grouping column being carried along.
  count_words <- function(text) {
    words <- str_extract_all(text, "\\b[a-zA-ZäöüÄÖÜß]+\\b")[[1]]
    n_words <- length(words)
    tibble(Worte = words) %>%
      group_by(Worte) %>%
      count() %>%
      mutate(freq = n / n_words)
  }

  # One word table per row of `talks_by_fraktion`. The indices follow the row
  # order of that table; the label next to each call names the fraktion found
  # at that position.
  fraktion_texts <- talks_by_fraktion$full_text
  afd_words <- count_words(fraktion_texts[[1]]) # AfD
  afdundfraktionslos_words <- count_words(fraktion_texts[[2]]) # AfD&Fraktionslos
  grüne_words <- count_words(fraktion_texts[[3]]) # BÜNDNIS 90 / DIE GRÜNEN
  cdu_words <- count_words(fraktion_texts[[4]]) # CDU/CSU
  linke_words <- count_words(fraktion_texts[[5]]) # DIE LINKE
  fdp_words <- count_words(fraktion_texts[[6]]) # FDP
  fraktionslos_words <- count_words(fraktion_texts[[7]]) # Fraktionslos
  spd_words <- count_words(fraktion_texts[[8]]) # SPD
  na_words <- count_words(fraktion_texts[[9]]) # NA (no fraktion recorded)

  # All fraktionen combined: total count per word and its share `part` of all
  # words spoken.
  all_words <- bind_rows(
    afd_words,
    afdundfraktionslos_words,
    grüne_words,
    cdu_words,
    linke_words,
    fdp_words,
    fraktionslos_words,
    spd_words,
    na_words
  )
  total <- sum(all_words$n)
  all_words <- all_words %>%
    group_by(Worte) %>%
    summarize(n = sum(n)) %>%
    mutate(part = n / total)
  100. ```
  101. Now we want to extract the words that are more frequently used by a specific `fraktion`.
  102. ```{r}
  # Compare the AfD word statistics with the statistics over all speeches and
  # keep the words whose relative frequency within the AfD exceeds the overall
  # relative frequency (rel_quotient > 1).
  #
  # `Worte` is selected explicitly instead of relying on transmute() keeping a
  # grouping column, and the join key is spelled out. The result is ungrouped,
  # so later steps may freely modify `Worte`.
  afd_high_frequent <- afd_words %>%
    ungroup() %>%
    transmute(Worte, freq, fraktion_n = n) %>%
    left_join(all_words, by = "Worte") %>%
    transmute(
      Worte,
      fraktion_freq = freq,
      total_freq = part,
      fraktion_n,
      total_n = n,
      rel_quotient = fraktion_freq / total_freq,
      abs_quotient = fraktion_n / total_n
    ) %>%
    arrange(-abs_quotient, -fraktion_n) %>%
    filter(rel_quotient > 1)
  104. ```
  105. We compare these words with `hitlerwords`.
  106. ```{r}
  # Lower-case the words and keep those that also appear in `hitlerwords`.
  # mutate() refuses to modify a grouping column, so any grouping by `Worte` is
  # dropped first; ungroup() is a no-op on an ungrouped table.
  afd_high_frequent %>%
    ungroup() %>%
    mutate(Worte = str_to_lower(Worte)) %>%
    inner_join(hitlerwords, by = "Worte")
  108. ```