An R package to analyze the parliamentary records of the 19th legislative period of the Bundestag, the German parliament.
No puede seleccionar más de 25 temas Los temas deben comenzar con una letra o número, pueden incluir guiones ('-') y pueden tener hasta 35 caracteres de largo.

174 líneas
9.8KB

  1. ---
  2. title: "Analysis of vocabulary"
  3. output: rmarkdown::html_vignette
  4. vignette: >
  5. %\VignetteIndexEntry{Analysis of vocabulary}
  6. %\VignetteEngine{knitr::rmarkdown}
  7. %\VignetteEncoding{UTF-8}
  8. ---
  9. ```{r, include = FALSE}
  10. knitr::opts_chunk$set(
  11. collapse = TRUE,
  12. comment = "#>"
  13. )
  14. ```
  15. ```{r setup}
  16. library(hateimparlament)
  17. library(dplyr)
  18. library(stringr)
  19. library(ggplot2)
  20. ```
  21. ## Preparation of data
  22. First, you need to download all records of the current legislative period.
  23. ```r
  24. fetch_all("../records/") # path to directory where records should be stored
  25. ```
  26. Second, those `.xml` files, need to be parsed into `R` `tibbles`. This is accomplished by:
  27. ```r
  28. read_all("../records/") %>% repair() -> res
  29. speeches <- res$speeches
  30. speaker <- res$speaker
  31. talks <- res$talks
  32. ```
  33. We also used `repair` to fix a bunch of formatting issues in the records and unpacked
  34. the result into more descriptive variables.
  35. For development purposes, we load the tables from csv files.
  36. ```{r}
  37. tables <- read_from_csv('../csv/')
  38. comments <- tables$comments
  39. speeches <- tables$speeches
  40. speaker <- tables$speaker
  41. talks <- tables$talks
  42. ```
  43. Further, we need to load a list of words that were used by Hitler but not by standard German texts.
  44. ```{r}
  45. fil <- file('../hitler_texts/hitler_words')
  46. Worte <- readLines(fil)
  47. hitlerwords <- tibble(Worte)
  48. ```
  49. ## Analysis
  50. Now we extract the words that were used with higher frequency by one party and compare them with `hitlerwords`.
  51. ```{r}
  52. talks %>%
  53. left_join(speaker, by=c(speaker='id')) %>%
  54. group_by(fraktion) %>%
  55. summarize(full_text=str_c(content, collapse="\n")) -> talks_by_fraktion
  56. ```
  57. For each party, we want to get a tibble of words with frequency.
  58. ```{r}
  59. #AfD
  60. Worte <- str_extract_all(talks_by_fraktion$full_text[[1]], "\\b[a-zA-ZäöüÄÖÜß]+\\b")[[1]]
  61. afdtotal = length(Worte)
  62. tibble(Worte) %>% group_by(Worte) %>% count() %>% mutate(freq =n/afdtotal) -> afd_words
  63. #AfD&Fraktionslos
  64. Worte <- str_extract_all(talks_by_fraktion$full_text[[2]], "\\b[a-zA-ZäöüÄÖÜß]+\\b")[[1]]
  65. afdundfraktionslostotal = length(Worte)
  66. tibble(Worte) %>% group_by(Worte) %>% count() %>% mutate(freq =n/afdundfraktionslostotal) -> afdundfraktionslos_words
  67. #BÜNDNIS 90 / DIE GRÜNEN
  68. Worte <- str_extract_all(talks_by_fraktion$full_text[[3]], "\\b[a-zA-ZäöüÄÖÜß]+\\b")[[1]]
  69. grünetotal = length(Worte)
  70. tibble(Worte) %>% group_by(Worte) %>% count() %>% mutate(freq =n/grünetotal) -> grüne_words
  71. #CDU/CSU
  72. Worte <- str_extract_all(talks_by_fraktion$full_text[[4]], "\\b[a-zA-ZäöüÄÖÜß]+\\b")[[1]]
  73. cdutotal = length(Worte)
  74. tibble(Worte) %>% group_by(Worte) %>% count() %>% mutate(freq =n/cdutotal) -> cdu_words
  75. #DIE LINKE
  76. Worte <- str_extract_all(talks_by_fraktion$full_text[[5]], "\\b[a-zA-ZäöüÄÖÜß]+\\b")[[1]]
  77. linketotal = length(Worte)
  78. tibble(Worte) %>% group_by(Worte) %>% count() %>% mutate(freq =n/linketotal) -> linke_words
  79. #FDP
  80. Worte <- str_extract_all(talks_by_fraktion$full_text[[6]], "\\b[a-zA-ZäöüÄÖÜß]+\\b")[[1]]
  81. fdptotal = length(Worte)
  82. tibble(Worte) %>% group_by(Worte) %>% count() %>% mutate(freq =n/fdptotal) -> fdp_words
  83. #Fraktionslos
  84. Worte <- str_extract_all(talks_by_fraktion$full_text[[7]], "\\b[a-zA-ZäöüÄÖÜß]+\\b")[[1]]
  85. fraktionslostotal = length(Worte)
  86. tibble(Worte) %>% group_by(Worte) %>% count() %>% mutate(freq =n/fraktionslostotal) -> fraktionslos_words
  87. #SPD
  88. Worte <- str_extract_all(talks_by_fraktion$full_text[[8]], "\\b[a-zA-ZäöüÄÖÜß]+\\b")[[1]]
  89. spdtotal = length(Worte)
  90. tibble(Worte) %>% group_by(Worte) %>% count() %>% mutate(freq =n/spdtotal) -> spd_words
  91. #NA
  92. Worte <- str_extract_all(talks_by_fraktion$full_text[[9]], "\\b[a-zA-ZäöüÄÖÜß]+\\b")[[1]]
  93. natotal = length(Worte)
  94. tibble(Worte) %>% group_by(Worte) %>% count() %>% mutate(freq =n/natotal) -> na_words
  95. #alle
  96. all_words <- bind_rows(afd_words, afdundfraktionslos_words, grüne_words, cdu_words, linke_words, fdp_words, fraktionslos_words, spd_words, na_words)
  97. total <- sum(all_words$n)
  98. all_words %>% group_by(Worte) %>% summarize(n = sum(n), part= sum(n)/total) -> all_words
  99. ```
  100. Now we want to extract the words that are more frequently used by a specific `fraktion`.
  101. ```{r}
  102. afd_words %>% transmute(freq, fraktion_n = n) %>% left_join(all_words) %>% transmute(fraktion_freq = freq, total_freq = part, fraktion_n, total_n = n, rel_quotient = fraktion_freq/total_freq, abs_quotient = fraktion_n/total_n) %>% arrange(-abs_quotient, -fraktion_n) %>% filter(rel_quotient > 1) -> afd_high_frequent
  103. select(afd_high_frequent, fraktion_n, total_n, abs_quotient, rel_quotient) %>% filter(total_n > 80)
  104. afdundfraktionslos_words %>% transmute(freq, fraktion_n = n) %>% left_join(all_words) %>% transmute(fraktion_freq = freq, total_freq = part, fraktion_n, total_n = n, rel_quotient = fraktion_freq/total_freq, abs_quotient = fraktion_n/total_n) %>% arrange(-abs_quotient, -fraktion_n) %>% filter(rel_quotient > 1) -> afdundfraktionslos_high_frequent
  105. select(afdundfraktionslos_high_frequent, fraktion_n, total_n, abs_quotient, rel_quotient) %>% filter(total_n > 80)
  106. grüne_words %>% transmute(freq, fraktion_n = n) %>% left_join(all_words) %>% transmute(fraktion_freq = freq, total_freq = part, fraktion_n, total_n = n, rel_quotient = fraktion_freq/total_freq, abs_quotient = fraktion_n/total_n) %>% arrange(-abs_quotient, -fraktion_n) %>% filter(rel_quotient > 1) -> grüne_high_frequent
  107. select(grüne_high_frequent, fraktion_n, total_n, abs_quotient, rel_quotient) %>% filter(total_n > 80)
  108. cdu_words %>% transmute(freq, fraktion_n = n) %>% left_join(all_words) %>% transmute(fraktion_freq = freq, total_freq = part, fraktion_n, total_n = n, rel_quotient = fraktion_freq/total_freq, abs_quotient = fraktion_n/total_n) %>% arrange(-abs_quotient, -fraktion_n) %>% filter(rel_quotient > 1) -> cdu_high_frequent
  109. select(cdu_high_frequent, fraktion_n, total_n, abs_quotient, rel_quotient) %>% filter(total_n > 80)
  110. linke_words %>% transmute(freq, fraktion_n = n) %>% left_join(all_words) %>% transmute(fraktion_freq = freq, total_freq = part, fraktion_n, total_n = n, rel_quotient = fraktion_freq/total_freq, abs_quotient = fraktion_n/total_n) %>% arrange(-abs_quotient, -fraktion_n) %>% filter(rel_quotient > 1) -> linke_high_frequent
  111. select(linke_high_frequent, fraktion_n, total_n, abs_quotient, rel_quotient) %>% filter(total_n > 80)
  112. fdp_words %>% transmute(freq, fraktion_n = n) %>% left_join(all_words) %>% transmute(fraktion_freq = freq, total_freq = part, fraktion_n, total_n = n, rel_quotient = fraktion_freq/total_freq, abs_quotient = fraktion_n/total_n) %>% arrange(-abs_quotient, -fraktion_n) %>% filter(rel_quotient > 1) -> fdp_high_frequent
  113. select(fdp_high_frequent, fraktion_n, total_n, abs_quotient, rel_quotient) %>% filter(total_n > 80)
  114. fraktionslos_words %>% transmute(freq, fraktion_n = n) %>% left_join(all_words) %>% transmute(fraktion_freq = freq, total_freq = part, fraktion_n, total_n = n, rel_quotient = fraktion_freq/total_freq, abs_quotient = fraktion_n/total_n) %>% arrange(-abs_quotient, -fraktion_n) %>% filter(rel_quotient > 1) -> fraktionslos_high_frequent
  115. select(fraktionslos_high_frequent, fraktion_n, total_n, abs_quotient, rel_quotient) %>% filter(total_n > 80)
  116. spd_words %>% transmute(freq, fraktion_n = n) %>% left_join(all_words) %>% transmute(fraktion_freq = freq, total_freq = part, fraktion_n, total_n = n, rel_quotient = fraktion_freq/total_freq, abs_quotient = fraktion_n/total_n) %>% arrange(-abs_quotient, -fraktion_n) %>% filter(rel_quotient > 1) -> spd_high_frequent
  117. select(spd_high_frequent, fraktion_n, total_n, abs_quotient, rel_quotient) %>% filter(total_n > 80)
  118. na_words %>% transmute(freq, fraktion_n = n) %>% left_join(all_words) %>% transmute(fraktion_freq = freq, total_freq = part, fraktion_n, total_n = n, rel_quotient = fraktion_freq/total_freq, abs_quotient = fraktion_n/total_n) %>% arrange(-abs_quotient, -fraktion_n) %>% filter(rel_quotient > 1) -> na_high_frequent
  119. select(na_high_frequent, fraktion_n, total_n, abs_quotient, rel_quotient) %>% filter(total_n > 80)
  120. ```
  121. We compare these words with `hitlerwords`.
  122. ```{r}
  123. afd_high_frequent %>% mutate(Worte = str_to_lower(Worte)) %>% inner_join(hitlerwords) -> afd_hitler_comparison
  124. afdundfraktionslos_high_frequent %>% mutate(Worte = str_to_lower(Worte)) %>% inner_join(hitlerwords) -> afdundfraktionslos_hitler_comparison
  125. grüne_high_frequent %>% mutate(Worte = str_to_lower(Worte)) %>% inner_join(hitlerwords) -> grüne_hitler_comparison
  126. cdu_high_frequent %>% mutate(Worte = str_to_lower(Worte)) %>% inner_join(hitlerwords) -> cdu_hitler_comparison
  127. linke_high_frequent %>% mutate(Worte = str_to_lower(Worte)) %>% inner_join(hitlerwords) -> linke_hitler_comparison
  128. fdp_high_frequent %>% mutate(Worte = str_to_lower(Worte)) %>% inner_join(hitlerwords) -> fdp_hitler_comparison
  129. fraktionslos_high_frequent %>% mutate(Worte = str_to_lower(Worte)) %>% inner_join(hitlerwords) -> fraktionslos_hitler_comparison
  130. spd_high_frequent %>% mutate(Worte = str_to_lower(Worte)) %>% inner_join(hitlerwords) -> spd_hitler_comparison
  131. na_high_frequent %>% mutate(Worte = str_to_lower(Worte)) %>% inner_join(hitlerwords) -> na_hitler_comparison
  132. #not unique
  133. tibble(fraktion = c("AfD", "AfD&Fraktionslos", "BÜNDNIS 90 / DIE GRÜNEN", "CDU/CSU", "DIE LINKE", "FDP", "Fraktionslos", "SPD"),
  134. absolute = c(nrow(afd_hitler_comparison), nrow(afdundfraktionslos_hitler_comparison), nrow(grüne_hitler_comparison), nrow(cdu_hitler_comparison), nrow(linke_hitler_comparison), nrow(fdp_hitler_comparison), nrow(fraktionslos_hitler_comparison), nrow(spd_hitler_comparison)),
  135. total = c(nrow(afd_words), nrow(afdundfraktionslos_words), nrow(grüne_words), nrow(cdu_words), nrow(linke_words), nrow(fdp_words), nrow(fraktionslos_words), nrow(spd_words))
  136. ) %>% mutate(percent = 100*absolute/total) -> hitler_comparison
  137. hitler_comparison
  138. ```
  139. Finally, we want to plot our results:
  140. ```{r, fig.width=7}
  141. bar_plot_fractions(hitler_comparison, y_variable = percent, title="Coincidence of party vocabulary with nazi vocabulary", ylab="unique 'nazi' words per total (unique) fraction words [%]")
  142. ```