An R package to analyze the parliamentary records of the 19th legislative period of the Bundestag, the German parliament.
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

224 lines
7.7KB

  1. # for usage see the example at the end
  2. #' Parse xml records
  3. #'
  4. #' Creates a list of tibbles containing relevant information from all records
  5. #' stored in the input directory.
  6. #'
  7. #' @param path character
  8. #'
  9. #' @export
  10. read_all <- function(path="records/") {
  11. cat("Reading all records from", path, "\n")
  12. available_protocols <- list.files(path)
  13. res <- pblapply(available_protocols, read_one, path=path)
  14. lapply(res, `[[`, "redner") %>%
  15. bind_rows() %>%
  16. distinct() ->
  17. redner
  18. lapply(res, `[[`, "reden") %>%
  19. bind_rows() %>%
  20. distinct() ->
  21. reden
  22. lapply(res, `[[`, "talks") %>%
  23. bind_rows() %>%
  24. distinct() ->
  25. talks
  26. lapply(res, `[[`, "comments") %>%
  27. bind_rows() %>%
  28. distinct() ->
  29. comments
  30. if (length(available_protocols) == 0)
  31. warning("The given directory is empty or does not exist.")
  32. list(redner = redner, reden = reden, talks = talks, comments = comments)
  33. }
  34. # this reads all currently parseable data from one xml
  35. read_one <- function(name, path) {
  36. x <- tryCatch(read_xml(paste0(path, name)),
  37. error = function(c) NULL)
  38. if (is.null(x)) return(NULL)
  39. cs <- xml_children(x)
  40. verlauf <- xml_find_first(x, "sitzungsverlauf")
  41. rednerl <- xml_find_first(x, "rednerliste")
  42. xml_children(rednerl) %>%
  43. parse_rednerliste() ->
  44. redner
  45. xml_children(verlauf) %>%
  46. xml_find_all("rede") %>%
  47. parse_redenliste() ->
  48. res
  49. list(redner = redner, reden = res$reden, talks = res$talks, comments = res$comments)
  50. }
  51. xml_get <- function(node, name) {
  52. res <- xml_text %$% xml_find_all(node, name)
  53. if (length(res) == 0) NA_character_
  54. else res
  55. }
  56. # parse one redner
  57. parse_redner <- function(redner_xml) {
  58. redner_id <- xml_attr(redner_xml, "id")
  59. nm <- xml_child(redner_xml)
  60. vorname <- xml_get(nm, "vorname")
  61. nachname <- xml_get(nm, "nachname")
  62. fraktion <- xml_get(nm, "fraktion")
  63. titel <- xml_get(nm, "titel")
  64. rolle <- xml_find_all(nm, "rolle")
  65. if (length(rolle) > 0) {
  66. rolle_lang <- xml_get(rolle, "rolle_lang")
  67. rolle_kurz <- xml_get(rolle, "rolle_kurz")
  68. } else rolle_kurz <- rolle_lang <- NA_character_
  69. c(id = redner_id, vorname = vorname, nachname = nachname, fraktion = fraktion, titel = titel,
  70. rolle_kurz = rolle_kurz, rolle_lang = rolle_lang)
  71. }
  72. # parse one rede
  73. # returns: - a rede (with rede id and redner id)
  74. # - all talks appearing in the rede (with corresponding content)
  75. parse_rede <- function(rede_xml) {
  76. rede_id <- xml_attr(rede_xml, "id")
  77. cs <- xml_children(rede_xml)
  78. cur_redner <- NA_character_
  79. principal_redner <- NA_character_
  80. cur_content <- ""
  81. reden <- list()
  82. comments <- list()
  83. for (node in cs) {
  84. if (xml_name(node) == "p" || xml_name(node) == "name") {
  85. klasse <- xml_attr(node, "klasse")
  86. if ((!is.na(klasse) && klasse == "redner") || xml_name(node) == "name") {
  87. if (!is.na(cur_redner)) {
  88. rede <- c(rede_id = rede_id,
  89. redner = cur_redner,
  90. content = cur_content)
  91. reden <- c(reden, list(rede))
  92. cur_content <- ""
  93. }
  94. if (is.na(principal_redner) && xml_name(node) != "name") {
  95. principal_redner <- xml_child(node) %>% xml_attr("id")
  96. }
  97. if (xml_name(node) == "name") {
  98. cur_redner <- "BTP"
  99. } else {
  100. cur_redner <- xml_child(node) %>% xml_attr("id")
  101. }
  102. } else {
  103. cur_content <- paste0(cur_content, xml_text(node), sep="\n")
  104. }
  105. } else if (xml_name(node) == "kommentar") {
  106. # comments are of the form
  107. # <kommentar>(blabla [Fraktion] – blabla liasdf – bla)</kommentar>
  108. xml_text(node) %>%
  109. str_sub(2, -2) %>%
  110. str_split("–") %>%
  111. `[[`(1) %>%
  112. lapply(parse_comment, rede_id = rede_id, on_redner = cur_redner) ->
  113. cs
  114. comments <- c(comments, cs)
  115. }
  116. }
  117. rede <- c(rede_id = rede_id,
  118. redner = cur_redner,
  119. content = cur_content)
  120. reden <- c(reden, list(rede))
  121. list(rede = c(id = rede_id, redner = principal_redner),
  122. parts = reden,
  123. comments = comments)
  124. }
  125. fraktionspattern <- "BÜNDNIS(SES)?\\W*90/DIE\\W*GRÜNEN|CDU/CSU|AfD|SPD|DIE LINKE|FDP|LINKEN"
  126. fraktionsnames <- c("BÜNDNIS 90/DIE GRÜNEN", "CDU/CSU", "AfD", "SPD", "DIE LINKE", "FDP")
  127. parse_comment <- function(comment, rede_id, on_redner) {
  128. base <- c(rede_id = rede_id, on_redner = on_redner)
  129. str_extract_all(comment, fraktionspattern) %>%
  130. `[[`(1) %>%
  131. sapply(partial(flip(head), 1) %.% agrep, x=fraktionsnames, max=0.2, value=T) %>%
  132. str_c(collapse=",") ->
  133. by
  134. # classify comment
  135. # TODO:
  136. # - actually separate content properly
  137. # - differentiate between [AfD] and AfD in by
  138. if(str_detect(comment, "Beifall")) {
  139. c(base, type = "applause", fraktion = by, kommentator = NA_character_, content = comment)
  140. } else {
  141. ps <- str_match(comment, "(.*) \\[(.*?)\\]: (.*)")[1,]
  142. c(base, type = "comment", fraktion = ps[3], kommentator = ps[2], content = ps[4])
  143. }
  144. }
  145. # creates a tibble of reden and a tibble of talks from a list of xml nodes representing reden
  146. parse_redenliste <- function(redenliste_xml) {
  147. d <- sapply(redenliste_xml, parse_rede)
  148. reden <- simplify2array(d["rede", ])
  149. parts <- simplify2array %$% unlist(d["parts", ], recursive=FALSE)
  150. comments <- simplify2array %$% unlist(d["comments", ], recursive=FALSE)
  151. list(reden = tibble(id = reden["id",], redner = reden["redner",]),
  152. talks = tibble(rede_id = parts["rede_id", ],
  153. redner = parts["redner", ],
  154. content = parts["content", ]),
  155. comments = tibble(rede_id = comments["rede_id",],
  156. on_redner = comments["on_redner",],
  157. type = comments["type",],
  158. fraktion = comments["fraktion",],
  159. kommentator = comments["kommentator",],
  160. content = comments["content", ]))
  161. }
  162. # create a tibble of redner from a list of xml nodes representing redner
  163. parse_rednerliste <- function(rednerliste_xml) {
  164. d <- sapply(rednerliste_xml, parse_redner)
  165. tibble(id = d["id",],
  166. vorname = d["vorname",],
  167. nachname = d["nachname",],
  168. fraktion = d["fraktion",],
  169. titel = d["titel",],
  170. rolle_kurz = d["rolle_kurz",],
  171. rolle_lang = d["rolle_lang",])
  172. }
  173. #' @export
  174. write_to_csv <- function(tables, path="csv/", create=F) {
  175. check_directory(path, create)
  176. write.table(tables$redner, str_c(path, "redner.csv"))
  177. write.table(tables$reden, str_c(path, "reden.csv"))
  178. write.table(tables$talks, str_c(path, "talks.csv"))
  179. write.table(tables$comments, str_c(path, "comments.csv"))
  180. }
  181. #' @export
  182. read_from_csv <- function(path="csv/") {
  183. list(redner = read.table(str_c(path, "redner.csv")) %>%
  184. tibble() %>%
  185. mutate(id = as.character(id)),
  186. reden = read.table(str_c(path, "reden.csv")) %>%
  187. tibble() %>%
  188. mutate(redner = as.character(redner)),
  189. talks = tibble %$% read.table(str_c(path, "talks.csv")),
  190. comments = tibble %$% read.table(str_c(path, "comments.csv")))
  191. }
  192. # -------------------------------
  193. # EXAMPLE USE
  194. # make sure data ist downloaded via fetch.R
  195. # res <- read_one("records/19126-data.xml")
  196. #
  197. # res$redner
  198. # res$reden
  199. # res$talks
  200. # -------------------------------