An R package to analyze the parliamentary records of the 19th legislative period of the Bundestag, the German parliament.
Du kan inte välja fler än 25 ämnen Ämnen måste starta med en bokstav eller siffra, kan innehålla bindestreck ('-') och vara max 35 tecken långa.

97 lines
3.3KB

  1. fraktionen <- c("AFD" = "AfD",
  2. "BÜNDNIS90/" = "BÜNDNIS 90 / DIE GRÜNEN",
  3. "BÜNDNIS90/DIEGRÜNEN" = "BÜNDNIS 90 / DIE GRÜNEN",
  4. "FRAKTIONSLOS" = "Fraktionslos",
  5. "DIELINKE" = "DIE LINKE",
  6. "SPD" = "SPD",
  7. "CDU/CSU" = "CDU/CSU",
  8. "FDP" = "FDP")
  9. repair_fraktion <- function(fraktion) {
  10. cleaned <- str_to_upper %$% str_replace_all(fraktion, "\\s", "")
  11. fraktionen[cleaned]
  12. }
  13. # takes vector of titel and keeps longest
  14. longest_titel <- function(titel) {
  15. if (all(is.na(titel))) NA_character_
  16. else titel[which.max %$% str_length(titel)]
  17. }
  18. # takes character vector, removes duplicates and collapses
  19. collect_unique <- function(xs) xs %>% clear_na() %>% unique() %>% str_c(collapse="&") %>% na_if("")
  20. # expects a tibble of redner and repairs
  21. repair_redner <- function(redner) {
  22. if (nrow(redner) == 0) return(redner)
  23. redner %>%
  24. filter(id != "10000") %>% # invalid id's
  25. mutate(fraktion = Vectorize(repair_fraktion)(fraktion)) %>% # fix fraktion
  26. group_by(id) %>%
  27. summarize(vorname = head(vorname, 1),
  28. nachname = head(nachname, 1),
  29. fraktion = collect_unique(fraktion),
  30. titel = longest_titel(titel),
  31. rolle_kurz = collect_unique(str_squish(rolle_kurz)),
  32. rolle_lang = collect_unique(str_squish(rolle_lang))) %>%
  33. ungroup() #%>%
  34. # arrange(id) %>%
  35. # distinct(vorname, nachname, fraktion, titel)
  36. }
  37. repair_reden <- function(reden) {
  38. if (nrow(reden) == 0) return(reden)
  39. # TODO: fill with content
  40. reden
  41. }
  42. repair_talks <- function(talks) {
  43. if (nrow(talks) == 0) return(talks)
  44. # ignore all talks which have empty content
  45. filter(talks, str_length(content) > 0)
  46. }
  47. # tries to find the correct redner id given a name
  48. # this is sufficient since every prename lastname combination in the bundestag is
  49. # unique (luckily :D)
  50. # returns a lookup table
  51. lookup_redner <- function(comments, redner) {
  52. tobereplaced <- "[-–—‑­­-­­­ ]"
  53. redner %>%
  54. unite(name, vorname, nachname, sep=".*") %>%
  55. mutate(name = str_replace_all(name, tobereplaced, ".*")) ->
  56. rs
  57. find_match <- function(komm) {
  58. if (komm == "") return (NA_character_)
  59. # I tried with agrep (levensthein distance) but results are better that way
  60. matches <- str_which(komm, rs$name)
  61. if (length(matches) == 0) return(NA_character_)
  62. rs[head(matches, 1), ]$id
  63. }
  64. comments %>%
  65. distinct(kommentator) %>%
  66. mutate(redner = Vectorize(find_match)(str_replace_all(kommentator, tobereplaced, "")))
  67. }
  68. repair_comments <- function(comments, redner) {
  69. # try to find a redner id for each actual comment
  70. comments %>%
  71. filter(!is.na(kommentator)) %>%
  72. lookup_redner(redner) %>%
  73. left_join(comments, ., by="kommentator") %>%
  74. select(-kommentator)
  75. }
  76. #' Repair parsed tables
  77. #'
  78. #' @export
  79. repair <- function(parse_output) {
  80. list(redner = repair_redner(parse_output$redner),
  81. reden = repair_reden(parse_output$reden),
  82. talks = repair_talks(parse_output$talks),
  83. #comments = repair_comments(parse_output$comments)
  84. comments = parse_output$comments,
  85. applause = parse_output$applause
  86. )
  87. }