An R package to analyze the parliamentary records of the 19th legislative period of the Bundestag, the German parliament.
Du kannst nicht mehr als 25 Themen auswählen Themen müssen entweder mit einem Buchstaben oder einer Ziffer beginnen. Sie können Bindestriche („-“) enthalten und bis zu 35 Zeichen lang sein.

224 Zeilen
7.7KB

  1. # for usage see the example at the end
  2. #' Parse xml records
  3. #'
  4. #' Creates a list of tibbles containing relevant information from all records
  5. #' stored in the input directory.
  6. #'
  7. #' @param path character
  8. #'
  9. #' @export
  10. read_all <- function(path="records/") {
  11. cat("Reading all records from", path, "\n")
  12. available_protocols <- list.files(path)
  13. res <- pblapply(available_protocols, read_one, path=path)
  14. lapply(res, `[[`, "redner") %>%
  15. bind_rows() %>%
  16. distinct() ->
  17. redner
  18. lapply(res, `[[`, "reden") %>%
  19. bind_rows() %>%
  20. distinct() ->
  21. reden
  22. lapply(res, `[[`, "talks") %>%
  23. bind_rows() %>%
  24. distinct() ->
  25. talks
  26. lapply(res, `[[`, "comments") %>%
  27. bind_rows() %>%
  28. distinct() ->
  29. comments
  30. if (length(available_protocols) == 0)
  31. warning("The given directory is empty or does not exist.")
  32. list(redner = redner, reden = reden, talks = talks, comments = comments)
  33. }
  34. # this reads all currently parseable data from one xml
  35. read_one <- function(name, path) {
  36. x <- tryCatch(read_xml(paste0(path, name)),
  37. error = function(c) NULL)
  38. if (is.null(x)) return(NULL)
  39. cs <- xml_children(x)
  40. verlauf <- xml_find_first(x, "sitzungsverlauf")
  41. rednerl <- xml_find_first(x, "rednerliste")
  42. xml_children(rednerl) %>%
  43. parse_rednerliste() ->
  44. redner
  45. xml_children(verlauf) %>%
  46. xml_find_all("rede") %>%
  47. parse_redenliste() ->
  48. res
  49. list(redner = redner, reden = res$reden, talks = res$talks, comments = res$comments)
  50. }
  51. xml_get <- function(node, name) {
  52. res <- xml_text %$% xml_find_all(node, name)
  53. if (length(res) == 0) NA_character_
  54. else res
  55. }
  56. # parse one redner
  57. parse_redner <- function(redner_xml) {
  58. redner_id <- xml_attr(redner_xml, "id")
  59. nm <- xml_child(redner_xml)
  60. vorname <- xml_get(nm, "vorname")
  61. nachname <- xml_get(nm, "nachname")
  62. fraktion <- xml_get(nm, "fraktion")
  63. titel <- xml_get(nm, "titel")
  64. rolle <- xml_find_all(nm, "rolle")
  65. if (length(rolle) > 0) {
  66. rolle_lang <- xml_get(rolle, "rolle_lang")
  67. rolle_kurz <- xml_get(rolle, "rolle_kurz")
  68. } else rolle_kurz <- rolle_lang <- NA_character_
  69. c(id = redner_id, vorname = vorname, nachname = nachname, fraktion = fraktion, titel = titel,
  70. rolle_kurz = rolle_kurz, rolle_lang = rolle_lang)
  71. }
  72. # parse one rede
  73. # returns: - a rede (with rede id and redner id)
  74. # - all talks appearing in the rede (with corresponding content)
  75. parse_rede <- function(rede_xml) {
  76. rede_id <- xml_attr(rede_xml, "id")
  77. cs <- xml_children(rede_xml)
  78. cur_redner <- NA_character_
  79. principal_redner <- NA_character_
  80. cur_content <- ""
  81. reden <- list()
  82. comments <- list()
  83. for (node in cs) {
  84. if (xml_name(node) == "p" || xml_name(node) == "name") {
  85. klasse <- xml_attr(node, "klasse")
  86. if ((!is.na(klasse) && klasse == "redner") || xml_name(node) == "name") {
  87. if (!is.na(cur_redner)) {
  88. rede <- c(rede_id = rede_id,
  89. redner = cur_redner,
  90. content = cur_content)
  91. reden <- c(reden, list(rede))
  92. cur_content <- ""
  93. }
  94. if (is.na(principal_redner) && xml_name(node) != "name") {
  95. principal_redner <- xml_child(node) %>% xml_attr("id")
  96. }
  97. if (xml_name(node) == "name") {
  98. cur_redner <- "BTP"
  99. } else {
  100. cur_redner <- xml_child(node) %>% xml_attr("id")
  101. }
  102. } else {
  103. cur_content <- paste0(cur_content, xml_text(node), sep="\n")
  104. }
  105. } else if (xml_name(node) == "kommentar") {
  106. # comments are of the form
  107. # <kommentar>(blabla [Fraktion] – blabla liasdf – bla)</kommentar>
  108. xml_text(node) %>%
  109. str_sub(2, -2) %>%
  110. str_split("–") %>%
  111. `[[`(1) %>%
  112. lapply(parse_comment, rede_id = rede_id, on_redner = cur_redner) ->
  113. cs
  114. comments <- c(comments, cs)
  115. }
  116. }
  117. rede <- c(rede_id = rede_id,
  118. redner = cur_redner,
  119. content = cur_content)
  120. reden <- c(reden, list(rede))
  121. list(rede = c(id = rede_id, redner = principal_redner),
  122. parts = reden,
  123. comments = comments)
  124. }
  125. fraktionspattern <- "BÜNDNIS(SES)?\\W*90/DIE\\W*GRÜNEN|CDU/CSU|AfD|SPD|DIE LINKE|FDP|LINKEN"
  126. fraktionsnames <- c("BÜNDNIS 90/DIE GRÜNEN", "CDU/CSU", "AfD", "SPD", "DIE LINKE", "FDP")
  127. parse_comment <- function(comment, rede_id, on_redner) {
  128. base <- c(rede_id = rede_id, on_redner = on_redner)
  129. str_extract_all(comment, fraktionspattern) %>%
  130. `[[`(1) %>%
  131. sapply(partial(flip(head), 1) %.% agrep, x=fraktionsnames, max=0.2, value=T) %>%
  132. str_c(collapse=",") ->
  133. by
  134. # classify comment
  135. # TODO:
  136. # - actually separate content properly
  137. # - differentiate between [AfD] and AfD in by
  138. if(str_detect(comment, "Beifall")) {
  139. c(base, type = "applause", fraktion = by, kommentator = NA_character_, content = comment)
  140. } else {
  141. ps <- str_match(comment, "(.*) \\[(.*?)\\]: (.*)")[1,]
  142. c(base, type = "comment", fraktion = ps[3], kommentator = ps[2], content = ps[4])
  143. }
  144. }
  145. # creates a tibble of reden and a tibble of talks from a list of xml nodes representing reden
  146. parse_redenliste <- function(redenliste_xml) {
  147. d <- sapply(redenliste_xml, parse_rede)
  148. reden <- simplify2array(d["rede", ])
  149. parts <- simplify2array %$% unlist(d["parts", ], recursive=FALSE)
  150. comments <- simplify2array %$% unlist(d["comments", ], recursive=FALSE)
  151. list(reden = tibble(id = reden["id",], redner = reden["redner",]),
  152. talks = tibble(rede_id = parts["rede_id", ],
  153. redner = parts["redner", ],
  154. content = parts["content", ]),
  155. comments = tibble(rede_id = comments["rede_id",],
  156. on_redner = comments["on_redner",],
  157. type = comments["type",],
  158. fraktion = comments["fraktion",],
  159. kommentator = comments["kommentator",],
  160. content = comments["content", ]))
  161. }
  162. # create a tibble of redner from a list of xml nodes representing redner
  163. parse_rednerliste <- function(rednerliste_xml) {
  164. d <- sapply(rednerliste_xml, parse_redner)
  165. tibble(id = d["id",],
  166. vorname = d["vorname",],
  167. nachname = d["nachname",],
  168. fraktion = d["fraktion",],
  169. titel = d["titel",],
  170. rolle_kurz = d["rolle_kurz",],
  171. rolle_lang = d["rolle_lang",])
  172. }
  173. #' @export
  174. write_to_csv <- function(tables, path="csv/", create=F) {
  175. check_directory(path, create)
  176. write.table(tables$redner, str_c(path, "redner.csv"))
  177. write.table(tables$reden, str_c(path, "reden.csv"))
  178. write.table(tables$talks, str_c(path, "talks.csv"))
  179. write.table(tables$comments, str_c(path, "comments.csv"))
  180. }
  181. #' @export
  182. read_from_csv <- function(path="csv/") {
  183. list(redner = read.table(str_c(path, "redner.csv")) %>%
  184. tibble() %>%
  185. mutate(id = as.character(id)),
  186. reden = read.table(str_c(path, "reden.csv")) %>%
  187. tibble() %>%
  188. mutate(redner = as.character(redner)),
  189. talks = tibble %$% read.table(str_c(path, "talks.csv")),
  190. comments = tibble %$% read.table(str_c(path, "comments.csv")))
  191. }
  192. # -------------------------------
  193. # EXAMPLE USE
  194. # make sure data ist downloaded via fetch.R
  195. # res <- read_one("records/19126-data.xml")
  196. #
  197. # res$redner
  198. # res$reden
  199. # res$talks
  200. # -------------------------------