An R package to analyze the parliamentary records of the 19th legislative period of the Bundestag, the German parliament.
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

105 lines
3.7KB

  1. fraktionen <- c("AFD" = "AfD",
  2. "AFD&FRAKTIONSLOS" = "AfD&Fraktionslos",
  3. "BÜNDNIS90/" = "BÜNDNIS 90 / DIE GRÜNEN",
  4. "BÜNDNIS90/DIEGRÜNEN" = "BÜNDNIS 90 / DIE GRÜNEN",
  5. "FRAKTIONSLOS" = "Fraktionslos",
  6. "DIELINKE" = "DIE LINKE",
  7. "SPD" = "SPD",
  8. "CDU/CSU" = "CDU/CSU",
  9. "FDP" = "FDP")
  10. repair_fraktion <- function(fraktion) {
  11. cleaned <- str_to_upper %$% str_replace_all(fraktion, "\\s", "")
  12. fraktionen[cleaned]
  13. }
  14. # takes vector of titel and keeps longest
  15. longest_titel <- function(titel) {
  16. if (all(is.na(titel))) NA_character_
  17. else titel[which.max %$% str_length(titel)]
  18. }
  19. # takes character vector, removes duplicates and collapses
  20. collect_unique <- function(xs) xs %>% clear_na() %>% unique() %>% str_c(collapse="&") %>% na_if("")
  21. # expects a tibble of redner and repairs
  22. repair_redner <- function(redner) {
  23. if (nrow(redner) == 0) return(redner)
  24. redner %>%
  25. filter(id != "10000") %>% # invalid id's
  26. mutate(fraktion = Vectorize(repair_fraktion)(fraktion)) %>% # fix fraktion
  27. group_by(id) %>%
  28. summarize(vorname = head(vorname, 1),
  29. nachname = head(nachname, 1),
  30. fraktion = collect_unique(fraktion),
  31. titel = longest_titel(titel),
  32. rolle_kurz = collect_unique(str_squish(rolle_kurz)),
  33. rolle_lang = collect_unique(str_squish(rolle_lang))) %>%
  34. ungroup() #%>%
  35. # arrange(id) %>%
  36. # distinct(vorname, nachname, fraktion, titel)
  37. }
  38. repair_reden <- function(reden) {
  39. if (nrow(reden) == 0) return(reden)
  40. # TODO: fill with content
  41. reden
  42. }
  43. repair_talks <- function(talks) {
  44. if (nrow(talks) == 0) return(talks)
  45. # ignore all talks which have empty content
  46. filter(talks, str_length(content) > 0)
  47. }
  48. # tries to find the correct redner id given a name
  49. # this is sufficient since every prename lastname combination in the bundestag is
  50. # unique (luckily :D)
  51. # returns a lookup table
  52. lookup_redner <- function(comments, redner) {
  53. tobereplaced <- "[-–—‑­­-­­­ ]"
  54. redner %>%
  55. unite(name, vorname, nachname, sep=".*") %>%
  56. mutate(name = str_replace_all(name, tobereplaced, ".*")) ->
  57. rs
  58. find_match <- function(komm) {
  59. if (komm == "") return (NA_character_)
  60. # I tried with agrep (levensthein distance) but results are better that way
  61. matches <- str_which(komm, rs$name)
  62. if (length(matches) == 0) return(NA_character_)
  63. rs[head(matches, 1), ]$id
  64. }
  65. comments %>%
  66. distinct(kommentator) %>%
  67. mutate(redner = Vectorize(find_match)(str_replace_all(kommentator, tobereplaced, "")))
  68. }
  69. repair_comments <- function(comments, redner) {
  70. # try to find a redner id for each actual comment
  71. comments %>%
  72. filter(!is.na(kommentator)) %>%
  73. lookup_redner(redner) %>%
  74. left_join(comments, ., by="kommentator") %>%
  75. select(-kommentator)
  76. }
  77. #' Repair parsed tables
  78. #'
  79. #' TODO: Explain repair_comments argument
  80. #' (if TRUE, we try to lookup redner names in redner table)
  81. #'
  82. #' Possible test: check identical(repair(res), repair(repair(res))) == TRUE
  83. #' Since repaired tables should be a fixpoint of repair.
  84. #' @export
  85. repair <- function(parse_output, repair_comments = FALSE) {
  86. list(redner = repair_redner(parse_output$redner),
  87. reden = repair_reden(parse_output$reden),
  88. talks = repair_talks(parse_output$talks),
  89. comments = if(repair_comments) repair_comments(parse_output$comments,
  90. parse_output$redner)
  91. else parse_output$comments,
  92. applause = parse_output$applause
  93. )
  94. }