An R package to analyze the parliamentary records of the 19th legislative period of the Bundestag, the German parliament.
Nie możesz wybrać więcej niż 25 tematów. Tematy muszą się zaczynać od litery lub cyfry, mogą zawierać myślniki ('-') i mogą mieć do 35 znaków.

189 wiersze
9.9KB

  1. ---
  2. title: "Analysis of vocabulary"
  3. output: rmarkdown::html_vignette
  4. vignette: >
  5. %\VignetteIndexEntry{Analysis of vocabulary}
  6. %\VignetteEngine{knitr::rmarkdown}
  7. %\VignetteEncoding{UTF-8}
  8. ---
  9. ```{r, include = FALSE}
  10. knitr::opts_chunk$set(  # default chunk options for the rest of the vignette
  11. collapse = TRUE,  # show source and output of a chunk in one block
  12. comment = "#>"  # prefix put in front of every line of printed output
  13. )
  14. ```
  15. ```{r setup}
  16. library(hateimparlament)
  17. library(dplyr)
  18. library(stringr)
  19. library(ggplot2)
  20. ```
  21. ## Preparation of data
  22. First, you need to download all records of the current legislative period.
  23. ```r
  24. fetch_all("../inst/records/") # path to directory where records should be stored
  25. ```
  26. Second, those `.xml` files need to be parsed into `R` tibbles. This is accomplished by:
  27. ```r
  28. res <- read_all("../inst/records/") %>% repair()  # parse the records, then repair them
  29. talks <- res$talks
  30. speaker <- res$speaker
  31. speeches <- res$speeches
  32. ```
  33. We also used `repair` to fix a bunch of formatting issues in the records and unpacked
  34. the result into more descriptive variables.
  35. For development purposes, we only fetch records if they are not already
  36. stored as csv files:
  37. ```{r}
  38. tables <- read_from_csv_or_fetch('../inst/')  # per the text above: records are only fetched if no csv files are stored yet
  39. comments <- tables$comments  # unpack the individual tables into their own variables
  40. speeches <- tables$speeches
  41. speaker <- tables$speaker
  42. talks <- tables$talks
  43. ```
  44. Further, we need to load a list of words that were used by Hitler but do not occur in standard German texts.
  45. ```{r}
  46. # Read the word list (one entry per line) straight from its path, so that no file connection is left open.
  47. Worte <- readLines('../inst/hitler_texts/hitler_words')
  48. hitlerwords <- tibble(Worte)  # one-column tibble; the column name Worte matches the word tables used in the joins
  49. ```
  50. ## Analysis
  51. Now we extract the words that one party uses with a higher frequency than parliament as a whole and compare them with `hitlerwords`.
  52. ```{r}
  53. talks_by_fraction <- talks %>%
  54. left_join(speaker, by = c("speaker" = "id")) %>%  # attach the speaker record to every talk
  55. group_by(fraction) %>%
  56. summarise(full_text = str_c(content, collapse = "\n"))  # one newline-joined text per fraction
  57. ```
  58. For each party, we want to get a tibble of its words together with their frequencies.
  59. ```{r}
  60. # AfD -- NOTE(review): rows of talks_by_fraction are addressed by position (1-9); this assumes the sorted group order matches the labels below -- verify
  61. Worte <- str_extract_all(talks_by_fraction$full_text[[1]], "\\b[a-zA-ZäöüÄÖÜß]+\\b")[[1]]  # every run of letters (incl. umlauts and ß) is one word
  62. afdtotal = length(Worte)  # number of word tokens of this fraction
  63. tibble(Worte) %>% group_by(Worte) %>% count() %>% mutate(freq =n/afdtotal) -> afd_words  # count n and relative frequency per word; the result stays grouped by Worte
  64. # AfD&Fraktionslos
  65. Worte <- str_extract_all(talks_by_fraction$full_text[[2]], "\\b[a-zA-ZäöüÄÖÜß]+\\b")[[1]]
  66. afdundfraktionslostotal = length(Worte)
  67. tibble(Worte) %>% group_by(Worte) %>% count() %>% mutate(freq =n/afdundfraktionslostotal) -> afdundfraktionslos_words
  68. # BÜNDNIS 90 / DIE GRÜNEN
  69. Worte <- str_extract_all(talks_by_fraction$full_text[[3]], "\\b[a-zA-ZäöüÄÖÜß]+\\b")[[1]]
  70. grünetotal = length(Worte)
  71. tibble(Worte) %>% group_by(Worte) %>% count() %>% mutate(freq =n/grünetotal) -> grüne_words
  72. # CDU/CSU
  73. Worte <- str_extract_all(talks_by_fraction$full_text[[4]], "\\b[a-zA-ZäöüÄÖÜß]+\\b")[[1]]
  74. cdutotal = length(Worte)
  75. tibble(Worte) %>% group_by(Worte) %>% count() %>% mutate(freq =n/cdutotal) -> cdu_words
  76. # DIE LINKE
  77. Worte <- str_extract_all(talks_by_fraction$full_text[[5]], "\\b[a-zA-ZäöüÄÖÜß]+\\b")[[1]]
  78. linketotal = length(Worte)
  79. tibble(Worte) %>% group_by(Worte) %>% count() %>% mutate(freq =n/linketotal) -> linke_words
  80. # FDP
  81. Worte <- str_extract_all(talks_by_fraction$full_text[[6]], "\\b[a-zA-ZäöüÄÖÜß]+\\b")[[1]]
  82. fdptotal = length(Worte)
  83. tibble(Worte) %>% group_by(Worte) %>% count() %>% mutate(freq =n/fdptotal) -> fdp_words
  84. # Fraktionslos (members without a fraction)
  85. Worte <- str_extract_all(talks_by_fraction$full_text[[7]], "\\b[a-zA-ZäöüÄÖÜß]+\\b")[[1]]
  86. fraktionslostotal = length(Worte)
  87. tibble(Worte) %>% group_by(Worte) %>% count() %>% mutate(freq =n/fraktionslostotal) -> fraktionslos_words
  88. # SPD
  89. Worte <- str_extract_all(talks_by_fraction$full_text[[8]], "\\b[a-zA-ZäöüÄÖÜß]+\\b")[[1]]
  90. spdtotal = length(Worte)
  91. tibble(Worte) %>% group_by(Worte) %>% count() %>% mutate(freq =n/spdtotal) -> spd_words
  92. # NA -- presumably talks for which the join found no fraction; verify
  93. Worte <- str_extract_all(talks_by_fraction$full_text[[9]], "\\b[a-zA-ZäöüÄÖÜß]+\\b")[[1]]
  94. natotal = length(Worte)
  95. tibble(Worte) %>% group_by(Worte) %>% count() %>% mutate(freq =n/natotal) -> na_words
  96. # all fractions combined
  97. all_words <- bind_rows(afd_words, afdundfraktionslos_words, grüne_words, cdu_words, linke_words, fdp_words, fraktionslos_words, spd_words, na_words)
  98. total <- sum(all_words$n)  # number of word tokens over all nine tables
  99. all_words %>% group_by(Worte) %>% summarize(n = sum(n), part= sum(n)/total) -> all_words  # part is evaluated after n was replaced by the per-word sum, so part = n / total
  100. ```
  100. ```
  101. Now we want to extract the words that are more frequently used by a specific fraction.
  102. ```{r}
  103. afd_words %>%  # compare the fraction's word table with the overall table
  104. transmute(freq, fraction_n = n) %>%  # rename n so it cannot clash with all_words$n; the grouping column Worte is kept
  105. left_join(all_words) %>%  # no `by` given: joins on the shared column Worte
  106. transmute(
  107. fraction_freq = freq,
  108. total_freq = part,
  109. fraction_n,
  110. total_n = n,
  111. rel_quotient = fraction_freq/total_freq,  # > 1: relative frequency in this fraction exceeds the overall one
  112. abs_quotient = fraction_n/total_n) %>%  # share of all occurrences of the word that come from this fraction
  113. arrange(-abs_quotient, -fraction_n) %>%  # largest share first, ties broken by the fraction's count
  114. filter(rel_quotient > 1) ->  # keep only words the fraction uses more often than the overall average
  115. afd_high_frequent
  116. select(afd_high_frequent, fraction_n, total_n, abs_quotient, rel_quotient) %>%  # printed overview ...
  117. filter(total_n > 80)  # ... restricted to words with more than 80 occurrences overall
  118. afdundfraktionslos_words %>% transmute(freq, fraction_n = n) %>% left_join(all_words) %>% transmute(fraction_freq = freq, total_freq = part, fraction_n, total_n = n, rel_quotient = fraction_freq/total_freq, abs_quotient = fraction_n/total_n) %>% arrange(-abs_quotient, -fraction_n) %>% filter(rel_quotient > 1) -> afdundfraktionslos_high_frequent  # same pipeline as for afd_words, repeated for each remaining fraction
  119. select(afdundfraktionslos_high_frequent, fraction_n, total_n, abs_quotient, rel_quotient) %>% filter(total_n > 80)
  120. grüne_words %>% transmute(freq, fraction_n = n) %>% left_join(all_words) %>% transmute(fraction_freq = freq, total_freq = part, fraction_n, total_n = n, rel_quotient = fraction_freq/total_freq, abs_quotient = fraction_n/total_n) %>% arrange(-abs_quotient, -fraction_n) %>% filter(rel_quotient > 1) -> grüne_high_frequent
  121. select(grüne_high_frequent, fraction_n, total_n, abs_quotient, rel_quotient) %>% filter(total_n > 80)
  122. cdu_words %>% transmute(freq, fraction_n = n) %>% left_join(all_words) %>% transmute(fraction_freq = freq, total_freq = part, fraction_n, total_n = n, rel_quotient = fraction_freq/total_freq, abs_quotient = fraction_n/total_n) %>% arrange(-abs_quotient, -fraction_n) %>% filter(rel_quotient > 1) -> cdu_high_frequent
  123. select(cdu_high_frequent, fraction_n, total_n, abs_quotient, rel_quotient) %>% filter(total_n > 80)
  124. linke_words %>% transmute(freq, fraction_n = n) %>% left_join(all_words) %>% transmute(fraction_freq = freq, total_freq = part, fraction_n, total_n = n, rel_quotient = fraction_freq/total_freq, abs_quotient = fraction_n/total_n) %>% arrange(-abs_quotient, -fraction_n) %>% filter(rel_quotient > 1) -> linke_high_frequent
  125. select(linke_high_frequent, fraction_n, total_n, abs_quotient, rel_quotient) %>% filter(total_n > 80)
  126. fdp_words %>% transmute(freq, fraction_n = n) %>% left_join(all_words) %>% transmute(fraction_freq = freq, total_freq = part, fraction_n, total_n = n, rel_quotient = fraction_freq/total_freq, abs_quotient = fraction_n/total_n) %>% arrange(-abs_quotient, -fraction_n) %>% filter(rel_quotient > 1) -> fdp_high_frequent
  127. select(fdp_high_frequent, fraction_n, total_n, abs_quotient, rel_quotient) %>% filter(total_n > 80)
  128. fraktionslos_words %>% transmute(freq, fraction_n = n) %>% left_join(all_words) %>% transmute(fraction_freq = freq, total_freq = part, fraction_n, total_n = n, rel_quotient = fraction_freq/total_freq, abs_quotient = fraction_n/total_n) %>% arrange(-abs_quotient, -fraction_n) %>% filter(rel_quotient > 1) -> fraktionslos_high_frequent
  129. select(fraktionslos_high_frequent, fraction_n, total_n, abs_quotient, rel_quotient) %>% filter(total_n > 80)
  130. spd_words %>% transmute(freq, fraction_n = n) %>% left_join(all_words) %>% transmute(fraction_freq = freq, total_freq = part, fraction_n, total_n = n, rel_quotient = fraction_freq/total_freq, abs_quotient = fraction_n/total_n) %>% arrange(-abs_quotient, -fraction_n) %>% filter(rel_quotient > 1) -> spd_high_frequent
  131. select(spd_high_frequent, fraction_n, total_n, abs_quotient, rel_quotient) %>% filter(total_n > 80)
  132. na_words %>% transmute(freq, fraction_n = n) %>% left_join(all_words) %>% transmute(fraction_freq = freq, total_freq = part, fraction_n, total_n = n, rel_quotient = fraction_freq/total_freq, abs_quotient = fraction_n/total_n) %>% arrange(-abs_quotient, -fraction_n) %>% filter(rel_quotient > 1) -> na_high_frequent
  133. select(na_high_frequent, fraction_n, total_n, abs_quotient, rel_quotient) %>% filter(total_n > 80)
  134. ```
  135. We compare these words with `hitlerwords`.
  136. ```{r}
  137. afd_high_frequent %>% mutate(Worte = str_to_lower(Worte)) %>% inner_join(hitlerwords) -> afd_hitler_comparison  # lower-case, then keep only words that also appear in hitlerwords; assumes that list is lower-case -- TODO confirm
  138. afdundfraktionslos_high_frequent %>% mutate(Worte = str_to_lower(Worte)) %>% inner_join(hitlerwords) -> afdundfraktionslos_hitler_comparison
  139. grüne_high_frequent %>% mutate(Worte = str_to_lower(Worte)) %>% inner_join(hitlerwords) -> grüne_hitler_comparison
  140. cdu_high_frequent %>% mutate(Worte = str_to_lower(Worte)) %>% inner_join(hitlerwords) -> cdu_hitler_comparison
  141. linke_high_frequent %>% mutate(Worte = str_to_lower(Worte)) %>% inner_join(hitlerwords) -> linke_hitler_comparison
  142. fdp_high_frequent %>% mutate(Worte = str_to_lower(Worte)) %>% inner_join(hitlerwords) -> fdp_hitler_comparison
  143. fraktionslos_high_frequent %>% mutate(Worte = str_to_lower(Worte)) %>% inner_join(hitlerwords) -> fraktionslos_hitler_comparison
  144. spd_high_frequent %>% mutate(Worte = str_to_lower(Worte)) %>% inner_join(hitlerwords) -> spd_hitler_comparison
  145. na_high_frequent %>% mutate(Worte = str_to_lower(Worte)) %>% inner_join(hitlerwords) -> na_hitler_comparison  # computed, but not included in the summary table below
  146. # NOTE: the row counts below are not counts of unique words -- words were counted case-sensitively, so lower-casing can turn several rows into the same entry
  147. tibble(fraction = c("AfD", "AfD&Fraktionslos", "BÜNDNIS 90 / DIE GRÜNEN", "CDU/CSU", "DIE LINKE", "FDP", "Fraktionslos", "SPD"),
  148. absolute = c(nrow(afd_hitler_comparison), nrow(afdundfraktionslos_hitler_comparison), nrow(grüne_hitler_comparison), nrow(cdu_hitler_comparison), nrow(linke_hitler_comparison), nrow(fdp_hitler_comparison), nrow(fraktionslos_hitler_comparison), nrow(spd_hitler_comparison)),  # matched rows per fraction
  149. total = c(nrow(afd_words), nrow(afdundfraktionslos_words), nrow(grüne_words), nrow(cdu_words), nrow(linke_words), nrow(fdp_words), nrow(fraktionslos_words), nrow(spd_words))  # distinct (case-sensitive) words per fraction
  150. ) %>% mutate(percent = 100*absolute/total) -> hitler_comparison  # matched rows as a percentage of the fraction's distinct words
  151. hitler_comparison
  152. ```
  153. Finally, we want to plot our results:
  154. ```{r, fig.width=7}
  155. bar_plot_fractions(hitler_comparison, y_variable = percent, title="Coincidence of party vocabulary with nazi vocabulary", ylab="unique 'nazi' words per total (unique) fraction words [%]")  # plots the percent column of hitler_comparison
  156. ```