An R package to analyze the parliamentary records of the 19th legislative period of the Bundestag, the German parliament.
No puede seleccionar más de 25 temas Los temas deben comenzar con una letra o número, pueden incluir guiones ('-') y pueden tener hasta 35 caracteres de largo.

185 líneas
9.7KB

  1. ---
  2. title: "funwithdata"
  3. output: rmarkdown::html_vignette
  4. vignette: >
  5. %\VignetteIndexEntry{funwithdata}
  6. %\VignetteEngine{knitr::rmarkdown}
  7. %\VignetteEncoding{UTF-8}
  8. ---
  9. ```{r, include = FALSE}
  # Set the `collapse` and `comment` knitr options for every chunk of this
  # vignette.
  knitr::opts_chunk$set(collapse = TRUE, comment = "#>")
  14. ```
  15. ```{r setup}
  16. library(hateimparlament)
  17. library(dplyr)
  18. library(stringr)
  19. library(ggplot2)
  20. ```
  21. ## Preparation of data
  22. First, you need to download all records of the current legislative period.
  23. ```r
  # The argument is the path to the directory where the records should be stored.
  fetch_all("../records/")
  25. ```
  26. Second, those `.xml` files need to be parsed into `R` tibbles. This is accomplished by:
  27. ```r
  # Parse the stored records and pass the parsed result through `repair()`.
  res <- repair(read_all("../records/"))

  # Unpack the individual tables into their own variables.
  talks <- res$talks
  redner <- res$redner
  reden <- res$reden
  32. ```
  33. We also used `repair` to fix a bunch of formatting issues in the records and unpacked
  34. the result into more descriptive variables.
  35. For development purposes, we load the tables from csv files.
  36. ```{r}
  # Load all tables from the csv directory in one go ...
  tables <- read_from_csv('../csv/')

  # ... and bind each table to a variable of the same name.
  talks <- tables$talks
  redner <- tables$redner
  reden <- tables$reden
  comments <- tables$comments
  42. ```
  43. Further, we need to load a list of words that occur in Hitler's texts but not in standard German texts.
  44. ```{r}
  # Read the word list line by line through an explicit connection and close
  # the connection afterwards so that no connection object is left behind.
  fil <- file('../hitler_texts/hitler_words')
  Worte <- readLines(fil)
  close(fil)

  # One-column tibble whose column is named `Worte`; the later comparison
  # joins against the per-fraktion tables on that column.
  hitlerwords <- tibble(Worte)
  48. ```
  49. ## Analysis
  50. Now we extract the words that were used with higher frequency by one party and compare them with `hitlerwords`.
  51. ```{r}
  # Attach the speaker table to every talk (`talks$redner` matches `redner$id`),
  # then glue all talk contents of one fraktion into a single string with one
  # talk per line.
  talks_by_fraktion <- talks %>%
    left_join(redner, by = c(redner = 'id')) %>%
    group_by(fraktion) %>%
    summarize(full_text = str_c(content, collapse = "\n"))

  # Print the result so the row order of the fraktionen is visible.
  talks_by_fraktion
  57. ```
  58. For each party, we want to get a tibble of its words together with their frequencies.
  59. ```{r}
  # Extract and count the words of one text.
  #
  # `text` is a single character string. The result is a tibble with one row
  # per distinct (case-sensitive) word and the columns
  #   Worte - the word,
  #   n     - how often it occurs in `text`,
  #   freq  - `n` divided by the total number of extracted words.
  # The tibble stays grouped by `Worte` (as produced by `group_by()` followed
  # by `count()`); the chunk that builds the `*_high_frequent` tables relies
  # on that grouping to keep the `Worte` column.
  count_words <- function(text) {
    # A word is a run of ASCII letters, German umlauts and "ß" between word
    # boundaries.
    words <- str_extract_all(text, "\\b[a-zA-ZäöüÄÖÜß]+\\b")[[1]]
    n_words <- length(words)
    tibble(Worte = words) %>%
      group_by(Worte) %>%
      count() %>%
      mutate(freq = n / n_words)
  }

  # The positions used below follow the row order of `talks_by_fraktion`.
  # NOTE(review): the indices are hard-coded; check them against the printed
  # `talks_by_fraktion` whenever the set of fraktionen changes.
  fraktion_texts <- talks_by_fraktion$full_text

  # Each `*total` is the number of words extracted for that fraktion, i.e. the
  # sum of the per-word counts.

  # AfD
  afd_words <- count_words(fraktion_texts[[1]])
  afdtotal <- sum(afd_words$n)

  # AfD&Fraktionslos
  afdundfraktionslos_words <- count_words(fraktion_texts[[2]])
  afdundfraktionslostotal <- sum(afdundfraktionslos_words$n)

  # BÜNDNIS 90 / DIE GRÜNEN
  grüne_words <- count_words(fraktion_texts[[3]])
  grünetotal <- sum(grüne_words$n)

  # CDU/CSU
  cdu_words <- count_words(fraktion_texts[[4]])
  cdutotal <- sum(cdu_words$n)

  # DIE LINKE
  linke_words <- count_words(fraktion_texts[[5]])
  linketotal <- sum(linke_words$n)

  # FDP
  fdp_words <- count_words(fraktion_texts[[6]])
  fdptotal <- sum(fdp_words$n)

  # Fraktionslos
  fraktionslos_words <- count_words(fraktion_texts[[7]])
  fraktionslostotal <- sum(fraktionslos_words$n)

  # SPD
  spd_words <- count_words(fraktion_texts[[8]])
  spdtotal <- sum(spd_words$n)

  # NA (talks without a fraktion)
  na_words <- count_words(fraktion_texts[[9]])
  natotal <- sum(na_words$n)

  # All fraktionen combined: stack the per-fraktion tables, then add up the
  # counts of each word.
  all_words <- bind_rows(
    afd_words,
    afdundfraktionslos_words,
    grüne_words,
    cdu_words,
    linke_words,
    fdp_words,
    fraktionslos_words,
    spd_words,
    na_words
  )
  total <- sum(all_words$n)
  all_words <- all_words %>%
    group_by(Worte) %>%
    # Inside `summarize()`, `n` on the second line already refers to the summed
    # count created on the first, so `part` is that word's share of all words.
    summarize(
      n = sum(n),
      part = n / total
    )
  100. ```
  101. Now we want to extract the words that a specific `fraktion` uses more frequently than all speakers combined (`rel_quotient > 1`).
  102. ```{r}
  # Select the words one fraktion uses with a higher relative frequency than
  # all speakers combined.
  #
  # `fraktion_words` is a per-fraktion word table with the columns `Worte`,
  # `n` and `freq`; `reference` is the overall table with the columns `Worte`,
  # `n` and `part`. The result has one row per retained word with
  #   fraktion_freq / total_freq - relative frequency in the fraktion / overall,
  #   fraktion_n / total_n       - absolute count in the fraktion / overall,
  #   rel_quotient               - fraktion_freq / total_freq,
  #   abs_quotient               - fraktion_n / total_n,
  # sorted by descending `abs_quotient`, then descending `fraktion_n`, and
  # restricted to `rel_quotient > 1`.
  high_frequent <- function(fraktion_words, reference) {
    fraktion_words %>%
      # Drop any grouping and select columns explicitly, so `Worte` is kept
      # whether or not the input arrives grouped.
      ungroup() %>%
      select(Worte, fraktion_freq = freq, fraktion_n = n) %>%
      # Explicit key: avoids the "Joining with ..." message and accidental
      # matches on other shared column names.
      left_join(reference, by = "Worte") %>%
      transmute(
        Worte,
        fraktion_freq,
        total_freq = part,
        fraktion_n,
        total_n = n,
        rel_quotient = fraktion_freq / total_freq,
        abs_quotient = fraktion_n / total_n
      ) %>%
      arrange(desc(abs_quotient), desc(fraktion_n)) %>%
      filter(rel_quotient > 1)
  }

  afd_high_frequent <- high_frequent(afd_words, all_words)
  afdundfraktionslos_high_frequent <- high_frequent(afdundfraktionslos_words, all_words)
  grüne_high_frequent <- high_frequent(grüne_words, all_words)
  cdu_high_frequent <- high_frequent(cdu_words, all_words)
  linke_high_frequent <- high_frequent(linke_words, all_words)
  fdp_high_frequent <- high_frequent(fdp_words, all_words)
  fraktionslos_high_frequent <- high_frequent(fraktionslos_words, all_words)
  spd_high_frequent <- high_frequent(spd_words, all_words)
  na_high_frequent <- high_frequent(na_words, all_words)
  112. ```
  113. We compare these words with `hitlerwords`.
  114. ```{r}
  # Keep the rows of a `*_high_frequent` table whose lower-cased word also
  # occurs in `reference_words` (a tibble with a `Worte` column).
  hitler_overlap <- function(high_frequent_words, reference_words) {
    high_frequent_words %>%
      # Lower-casing is done on the ungrouped table; the join only needs the
      # `Worte` values.
      ungroup() %>%
      mutate(Worte = str_to_lower(Worte)) %>%
      # Explicit key instead of relying on the shared column name.
      inner_join(reference_words, by = "Worte")
  }

  afd_hitler_comparison <- hitler_overlap(afd_high_frequent, hitlerwords)
  afdundfraktionslos_hitler_comparison <- hitler_overlap(afdundfraktionslos_high_frequent, hitlerwords)
  grüne_hitler_comparison <- hitler_overlap(grüne_high_frequent, hitlerwords)
  cdu_hitler_comparison <- hitler_overlap(cdu_high_frequent, hitlerwords)
  linke_hitler_comparison <- hitler_overlap(linke_high_frequent, hitlerwords)
  fdp_hitler_comparison <- hitler_overlap(fdp_high_frequent, hitlerwords)
  fraktionslos_hitler_comparison <- hitler_overlap(fraktionslos_high_frequent, hitlerwords)
  spd_hitler_comparison <- hitler_overlap(spd_high_frequent, hitlerwords)
  na_hitler_comparison <- hitler_overlap(na_high_frequent, hitlerwords)

  # NOTE: the comparison rows are not unique. The word tables are
  # case-sensitive, so words that differ only in case can end up as several
  # rows with the same lower-cased `Worte`; each such row is counted in
  # `absolute` below.
  hitler_compare <- tibble(
    fraktion = c(
      "AfD", "AfD und Fraktionslose", "Grüne", "CDU", "Linke", "FDP",
      "Fraktionslos", "SPD", "NA"
    ),
    # Number of matching rows per fraktion.
    absolute = c(
      nrow(afd_hitler_comparison),
      nrow(afdundfraktionslos_hitler_comparison),
      nrow(grüne_hitler_comparison),
      nrow(cdu_hitler_comparison),
      nrow(linke_hitler_comparison),
      nrow(fdp_hitler_comparison),
      nrow(fraktionslos_hitler_comparison),
      nrow(spd_hitler_comparison),
      nrow(na_hitler_comparison)
    ),
    # Number of distinct words per fraktion.
    total = c(
      nrow(afd_words),
      nrow(afdundfraktionslos_words),
      nrow(grüne_words),
      nrow(cdu_words),
      nrow(linke_words),
      nrow(fdp_words),
      nrow(fraktionslos_words),
      nrow(spd_words),
      nrow(na_words)
    )
  ) %>%
    mutate(relative = absolute / total)
  129. ```
  130. Dead code:
  131. ```r
  # Express `part` as a per-mille share of `whole`.
  per_mille <- function(part, whole) {
    1000 * part / whole
  }

  # Matching rows per thousand distinct words of each fraktion.
  per_mille(nrow(afd_hitler_comparison), nrow(afd_words))
  per_mille(nrow(afdundfraktionslos_hitler_comparison), nrow(afdundfraktionslos_words))
  per_mille(nrow(grüne_hitler_comparison), nrow(grüne_words))
  per_mille(nrow(cdu_hitler_comparison), nrow(cdu_words))
  per_mille(nrow(linke_hitler_comparison), nrow(linke_words))
  per_mille(nrow(fdp_hitler_comparison), nrow(fdp_words))
  per_mille(nrow(fraktionslos_hitler_comparison), nrow(fraktionslos_words))
  per_mille(nrow(spd_hitler_comparison), nrow(spd_words))
  per_mille(nrow(na_hitler_comparison), nrow(na_words))

  # Occurrences of matching words per thousand words spoken by each fraktion.
  per_mille(sum(afd_hitler_comparison$fraktion_n), afdtotal)
  per_mille(sum(afdundfraktionslos_hitler_comparison$fraktion_n), afdundfraktionslostotal)
  per_mille(sum(grüne_hitler_comparison$fraktion_n), grünetotal)
  per_mille(sum(cdu_hitler_comparison$fraktion_n), cdutotal)
  per_mille(sum(linke_hitler_comparison$fraktion_n), linketotal)
  per_mille(sum(fdp_hitler_comparison$fraktion_n), fdptotal)
  per_mille(sum(fraktionslos_hitler_comparison$fraktion_n), fraktionslostotal)
  per_mille(sum(spd_hitler_comparison$fraktion_n), spdtotal)
  per_mille(sum(na_hitler_comparison$fraktion_n), natotal)
  150. ```