An R package to analyze the parliamentary records of the 19th legislative period of the Bundestag, the German parliament.
25'ten fazla konu seçemezsiniz Konular bir harf veya rakamla başlamalı, kısa çizgiler ('-') içerebilir ve en fazla 35 karakter uzunluğunda olabilir.

120 satır
4.3KB

  1. fractions <- c("AFD" = "AfD",
  2. "AFD&FRAKTIONSLOS" = "AfD&Fraktionslos",
  3. "BÜNDNIS90/" = "BÜNDNIS 90/DIE GRÜNEN",
  4. "BÜNDNIS90/DIEGRÜNEN" = "BÜNDNIS 90/DIE GRÜNEN",
  5. "FRAKTIONSLOS" = "Fraktionslos",
  6. "DIELINKE" = "DIE LINKE",
  7. "SPD" = "SPD",
  8. "CDU/CSU" = "CDU/CSU",
  9. "FDP" = "FDP")
  10. repair_fraction <- function(fraction) {
  11. cleaned <- str_to_upper %$% str_replace_all(fraction, "\\s", "")
  12. fractions[cleaned]
  13. }
  14. # takes vector of titel and keeps longest
  15. longest_titel <- function(titel) {
  16. if (all(is.na(titel))) NA_character_
  17. else titel[which.max %$% str_length(titel)]
  18. }
  19. # takes character vector, removes duplicates and collapses
  20. collect_unique <- function(xs) xs %>% clear_na() %>% unique() %>% str_c(collapse="&") %>% na_if("")
  21. # expects a tibble of speaker and repairs
  22. repair_speaker <- function(speaker) {
  23. if (nrow(speaker) == 0) return(speaker)
  24. speaker %>%
  25. filter(id != "10000") %>% # invalid id's
  26. mutate(fraction = Vectorize(repair_fraction)(fraction)) %>% # fix fraction
  27. group_by(id) %>%
  28. summarize(prename = head(prename, 1),
  29. lastname = head(lastname, 1),
  30. fraction = collect_unique(fraction),
  31. title = longest_titel(title),
  32. role_short = collect_unique(str_squish(role_short)),
  33. role_long = collect_unique(str_squish(role_long))) %>%
  34. ungroup() #%>%
  35. }
  36. repair_speeches <- function(speeches) {
  37. if (nrow(speeches) == 0) return(speeches)
  38. # TODO: fill with content
  39. speeches
  40. }
  41. repair_talks <- function(talks) {
  42. if (nrow(talks) == 0) return(talks)
  43. # ignore all talks which have empty content
  44. filter(talks, str_length(content) > 0)
  45. }
  46. #' Lookup name in speakers table
  47. #'
  48. #' Tries to find the correct speaker id given a name.
  49. #' This is sufficient since every prename lastname combination in the bundestag is
  50. #' unique (luckily :D)
  51. #'
  52. #' @param tb tibble
  53. #' @param speaker tibble
  54. #' @param name_variable name
  55. #'
  56. #' Tries to match the name_variable column with speaker names
  57. #'
  58. #' returns a lookup table
  59. lookup_speaker <- function(tb, speaker, name_variable) {
  60. tobereplaced <- "[-–—‑­­-­­­ ]"
  61. speaker %>%
  62. unite(name, prename, lastname, sep=".*") %>%
  63. mutate(name = str_replace_all(name, tobereplaced, ".*")) ->
  64. rs
  65. find_match <- function(komm) {
  66. if (komm == "") return (NA_character_)
  67. # I tried with agrep (levensthein distance) but results are better that way
  68. matches <- str_which(komm, rs$name)
  69. if (length(matches) == 0) return(NA_character_)
  70. rs[head(matches, 1), ]$id
  71. }
  72. tb %>%
  73. distinct({{name_variable}}) %>%
  74. mutate(speaker = Vectorize(find_match)(str_replace_all({{name_variable}}, tobereplaced, "")))
  75. }
  76. repair_comments <- function(comments, speaker, lookup_speaker=F) {
  77. comments %>%
  78. filter(!is.na(commenter) | !is.na(content) | !is.na(fraction)) ->
  79. tb
  80. if (lookup_speaker) {
  81. cat(paste0("Looking up speaker id's for names in comments. This may take a while ...\n",
  82. "Use repair(, lookup_speaker = FALSE) to skip this.\n"))
  83. # try to find a speaker id for each actual comment
  84. tb %>%
  85. filter(!is.na(commenter)) %>%
  86. lookup_speaker(speaker, commenter) %>%
  87. left_join(tb, ., by="commenter")
  88. } else tb
  89. }
  90. #' Repair parsed tables
  91. #'
  92. #' @param parse_output tibble
  93. #' @param lookup_speaker bool
  94. #'
  95. #' If lookup_speaker is TRUE, members of the parliament mentioned in comments are looked up in speaker table.
  96. #'
  97. #' Possible test: check identical(repair(res), repair(repair(res))) == TRUE
  98. #' Since repaired tables should be a fixpoint of repair.
  99. #' @export
  100. repair <- function(parse_output, lookup_speaker = FALSE) {
  101. is_valid_res(parse_output)
  102. stopifnot("lookup_speaker must be of type logical" = is.logical(lookup_speaker))
  103. list(speaker = repair_speaker(parse_output$speaker),
  104. speeches = repair_speeches(parse_output$speeches),
  105. talks = repair_talks(parse_output$talks),
  106. comments = repair_comments(parse_output$comments,
  107. parse_output$speaker,
  108. lookup_speaker),
  109. applause = parse_output$applause)
  110. }