An R package to analyze the parliamentary records of the 19th legislative period of the Bundestag, the German parliament.
Du kannst nicht mehr als 25 Themen auswählen Themen müssen entweder mit einem Buchstaben oder einer Ziffer beginnen. Sie können Bindestriche („-“) enthalten und bis zu 35 Zeichen lang sein.

193 Zeilen
6.3KB

  1. # for usage see the example at the end
  2. #' Parse xml records
  3. #'
  4. #' Creates a list of tibbles containing relevant information from all records
  5. #' stored in the input directory.
  6. #'
  7. #' @param path character
  8. #'
  9. #' @export
  10. read_all <- function(path="records/") {
  11. cat("Reading all records from", path, "\n")
  12. available_protocols <- list.files(path)
  13. res <- pblapply(available_protocols, read_one, path=path)
  14. lapply(res, `[[`, "redner") %>%
  15. bind_rows() %>%
  16. distinct() ->
  17. redner
  18. lapply(res, `[[`, "reden") %>%
  19. bind_rows() %>%
  20. distinct() ->
  21. reden
  22. lapply(res, `[[`, "talks") %>%
  23. bind_rows() %>%
  24. distinct() ->
  25. talks
  26. if (length(available_protocols) == 0)
  27. warning("The given directory is empty or does not exist.")
  28. list(redner = redner, reden = reden, talks = talks)
  29. }
  30. # this reads all currently parseable data from one xml
  31. read_one <- function(name, path) {
  32. x <- tryCatch(read_xml(paste0(path, name)),
  33. error = function(c) NULL)
  34. if (is.null(x)) return(NULL)
  35. cs <- xml_children(x)
  36. verlauf <- xml_find_first(x, "sitzungsverlauf")
  37. rednerl <- xml_find_first(x, "rednerliste")
  38. xml_children(rednerl) %>%
  39. parse_rednerliste() ->
  40. redner
  41. xml_children(verlauf) %>%
  42. xml_find_all("rede") %>%
  43. parse_redenliste() ->
  44. res
  45. list(redner = redner, reden = res$reden, talks = res$talks, comments = res$comments)
  46. }
  47. xml_get <- function(node, name) {
  48. res <- xml_text %$% xml_find_all(node, name)
  49. if (length(res) == 0) NA_character_
  50. else res
  51. }
  52. # parse one redner
  53. parse_redner <- function(redner_xml) {
  54. redner_id <- xml_attr(redner_xml, "id")
  55. nm <- xml_child(redner_xml)
  56. vorname <- xml_get(nm, "vorname")
  57. nachname <- xml_get(nm, "nachname")
  58. fraktion <- xml_get(nm, "fraktion")
  59. titel <- xml_get(nm, "titel")
  60. rolle <- xml_find_all(nm, "rolle")
  61. if (length(rolle) > 0) {
  62. rolle_lang <- xml_get(rolle, "rolle_lang")
  63. rolle_kurz <- xml_get(rolle, "rolle_kurz")
  64. } else rolle_kurz <- rolle_lang <- NA_character_
  65. c(id = redner_id, vorname = vorname, nachname = nachname, fraktion = fraktion, titel = titel,
  66. rolle_kurz = rolle_kurz, rolle_lang = rolle_lang)
  67. }
  68. # parse one rede
  69. # returns: - a rede (with rede id and redner id)
  70. # - all talks appearing in the rede (with corresponding content)
  71. parse_rede <- function(rede_xml) {
  72. rede_id <- xml_attr(rede_xml, "id")
  73. cs <- xml_children(rede_xml)
  74. cur_redner <- NA_character_
  75. principal_redner <- NA_character_
  76. cur_content <- ""
  77. reden <- list()
  78. comments <- list()
  79. for (node in cs) {
  80. if (xml_name(node) == "p" || xml_name(node) == "name") {
  81. klasse <- xml_attr(node, "klasse")
  82. if ((!is.na(klasse) && klasse == "redner") || xml_name(node) == "name") {
  83. if (!is.na(cur_redner)) {
  84. rede <- c(rede_id = rede_id,
  85. redner = cur_redner,
  86. content = cur_content)
  87. reden <- c(reden, list(rede))
  88. cur_content <- ""
  89. } else {
  90. principal_redner <- xml_child(node) %>% xml_attr("id")
  91. }
  92. if (xml_name(node) == "name") {
  93. cur_redner <- "BTP"
  94. } else {
  95. cur_redner <- xml_child(node) %>% xml_attr("id")
  96. }
  97. } else {
  98. cur_content <- paste0(cur_content, xml_text(node), sep="\n")
  99. }
  100. } else if (xml_name(node) == "kommentar") {
  101. # comments are of the form
  102. # <kommentar>(blabla [Fraktion] – blabla liasdf – bla)</kommentar>
  103. xml_text(node) %>%
  104. str_sub(2, -2) %>%
  105. str_split("–") %>%
  106. `[[`(1) %>%
  107. lapply(parse_comment, rede_id = rede_id, on_redner = cur_redner) ->
  108. cs
  109. comments <- c(comments, cs)
  110. }
  111. }
  112. rede <- c(rede_id = rede_id,
  113. redner = cur_redner,
  114. content = cur_content)
  115. reden <- c(reden, list(rede))
  116. list(rede = c(id = rede_id, redner = principal_redner),
  117. parts = reden,
  118. comments = comments)
  119. }
  120. fraktionspattern <- "BÜNDNIS 90/DIE GRÜNEN|CDU/CSU|AfD|SPD|DIE LINKE|FDP"
  121. parse_comment <- function(comment, rede_id, on_redner) {
  122. base <- c(rede_id = rede_id, on_redner = on_redner)
  123. str_extract_all(comment, fraktionspattern) %>%
  124. `[[`(1) %>%
  125. str_c(collapse=",") ->
  126. by
  127. # classify comment
  128. # TODO:
  129. # - actually separate content properly
  130. # - differentiate between [AfD] and AfD in by
  131. if(str_detect(comment, "Beifall")) {
  132. c(base, type = "applause", by = by, content = comment)
  133. } else {
  134. c(base, type = "comment", by = by, content = comment)
  135. }
  136. }
  137. # creates a tibble of reden and a tibble of talks from a list of xml nodes representing reden
  138. parse_redenliste <- function(redenliste_xml) {
  139. d <- sapply(redenliste_xml, parse_rede)
  140. reden <- simplify2array(d["rede", ])
  141. parts <- simplify2array %$% unlist(d["parts", ], recursive=FALSE)
  142. comments <- simplify2array %$% unlist(d["comments", ], recursive=FALSE)
  143. list(reden = tibble(id = reden["id",], redner = reden["redner",]),
  144. talks = tibble(rede_id = parts["rede_id", ],
  145. redner = parts["redner", ],
  146. content = parts["content", ]),
  147. comments = tibble(rede_id = comments["rede_id",],
  148. on_redner = comments["on_redner",],
  149. type = comments["type",],
  150. by = comments["by",],
  151. content = comments["content", ]))
  152. }
  153. # create a tibble of redner from a list of xml nodes representing redner
  154. parse_rednerliste <- function(rednerliste_xml) {
  155. d <- sapply(rednerliste_xml, parse_redner)
  156. tibble(id = d["id",],
  157. vorname = d["vorname",],
  158. nachname = d["nachname",],
  159. fraktion = d["fraktion",],
  160. titel = d["titel",],
  161. rolle_kurz = d["rolle_kurz",],
  162. rolle_lang = d["rolle_lang",])
  163. }
  164. # -------------------------------
  165. # EXAMPLE USE
  166. # make sure data ist downloaded via fetch.R
  167. res <- read_one("records/19126-data.xml")
  168. #
  169. # res$redner
  170. # res$reden
  171. # res$talks
  172. # -------------------------------