An R package to analyze the parliamentary records of the 19th legislative period of the Bundestag, the German parliament.
Du kan inte välja fler än 25 ämnen Ämnen måste starta med en bokstav eller siffra, kan innehålla bindestreck ('-') och vara max 35 tecken långa.

222 lines
7.7KB

  1. # for usage see the example at the end
  2. #' Parse xml records
  3. #'
  4. #' Creates a list of tibbles containing relevant information from all records
  5. #' stored in the input directory.
  6. #'
  7. #' @param path character
  8. #'
  9. #' @export
  10. read_all <- function(path="records/") {
  11. cat("Reading all records from", path, "\n")
  12. available_protocols <- list.files(path)
  13. res <- pblapply(available_protocols, read_one, path=path)
  14. lapply(res, `[[`, "redner") %>%
  15. bind_rows() %>%
  16. distinct() ->
  17. redner
  18. lapply(res, `[[`, "reden") %>%
  19. bind_rows() %>%
  20. distinct() ->
  21. reden
  22. lapply(res, `[[`, "talks") %>%
  23. bind_rows() %>%
  24. distinct() ->
  25. talks
  26. lapply(res, `[[`, "comments") %>%
  27. bind_rows() %>%
  28. distinct() ->
  29. comments
  30. if (length(available_protocols) == 0)
  31. warning("The given directory is empty or does not exist.")
  32. list(redner = redner, reden = reden, talks = talks, comments = comments)
  33. }
  34. # this reads all currently parseable data from one xml
  35. read_one <- function(name, path) {
  36. x <- tryCatch(read_xml(paste0(path, name)),
  37. error = function(c) NULL)
  38. if (is.null(x)) return(NULL)
  39. cs <- xml_children(x)
  40. verlauf <- xml_find_first(x, "sitzungsverlauf")
  41. rednerl <- xml_find_first(x, "rednerliste")
  42. xml_children(rednerl) %>%
  43. parse_rednerliste() ->
  44. redner
  45. xml_children(verlauf) %>%
  46. xml_find_all("rede") %>%
  47. parse_redenliste() ->
  48. res
  49. list(redner = redner, reden = res$reden, talks = res$talks, comments = res$comments)
  50. }
  51. xml_get <- function(node, name) {
  52. res <- xml_text %$% xml_find_all(node, name)
  53. if (length(res) == 0) NA_character_
  54. else res
  55. }
  56. # parse one redner
  57. parse_redner <- function(redner_xml) {
  58. redner_id <- xml_attr(redner_xml, "id")
  59. nm <- xml_child(redner_xml)
  60. vorname <- xml_get(nm, "vorname")
  61. nachname <- xml_get(nm, "nachname")
  62. fraktion <- xml_get(nm, "fraktion")
  63. titel <- xml_get(nm, "titel")
  64. rolle <- xml_find_all(nm, "rolle")
  65. if (length(rolle) > 0) {
  66. rolle_lang <- xml_get(rolle, "rolle_lang")
  67. rolle_kurz <- xml_get(rolle, "rolle_kurz")
  68. } else rolle_kurz <- rolle_lang <- NA_character_
  69. c(id = redner_id, vorname = vorname, nachname = nachname, fraktion = fraktion, titel = titel,
  70. rolle_kurz = rolle_kurz, rolle_lang = rolle_lang)
  71. }
  72. # parse one rede
  73. # returns: - a rede (with rede id and redner id)
  74. # - all talks appearing in the rede (with corresponding content)
  75. parse_rede <- function(rede_xml) {
  76. rede_id <- xml_attr(rede_xml, "id")
  77. cs <- xml_children(rede_xml)
  78. cur_redner <- NA_character_
  79. principal_redner <- NA_character_
  80. cur_content <- ""
  81. reden <- list()
  82. comments <- list()
  83. for (node in cs) {
  84. if (xml_name(node) == "p" || xml_name(node) == "name") {
  85. klasse <- xml_attr(node, "klasse")
  86. if ((!is.na(klasse) && klasse == "redner") || xml_name(node) == "name") {
  87. if (!is.na(cur_redner)) {
  88. rede <- c(rede_id = rede_id,
  89. redner = cur_redner,
  90. content = cur_content)
  91. reden <- c(reden, list(rede))
  92. cur_content <- ""
  93. }
  94. if (is.na(principal_redner) && xml_name(node) != "name") {
  95. principal_redner <- xml_child(node) %>% xml_attr("id")
  96. }
  97. if (xml_name(node) == "name") {
  98. cur_redner <- "BTP"
  99. } else {
  100. cur_redner <- xml_child(node) %>% xml_attr("id")
  101. }
  102. } else {
  103. cur_content <- paste0(cur_content, xml_text(node), sep="\n")
  104. }
  105. } else if (xml_name(node) == "kommentar") {
  106. # comments are of the form
  107. # <kommentar>(blabla [Fraktion] – blabla liasdf – bla)</kommentar>
  108. xml_text(node) %>%
  109. str_sub(2, -2) %>%
  110. str_split("–") %>%
  111. `[[`(1) %>%
  112. lapply(parse_comment, rede_id = rede_id, on_redner = cur_redner) ->
  113. cs
  114. comments <- c(comments, cs)
  115. }
  116. }
  117. rede <- c(rede_id = rede_id,
  118. redner = cur_redner,
  119. content = cur_content)
  120. reden <- c(reden, list(rede))
  121. list(rede = c(id = rede_id, redner = principal_redner),
  122. parts = reden,
  123. comments = comments)
  124. }
  125. fraktionspattern <- "BÜNDNIS(SES)?\\W*90/DIE\\W*GRÜNEN|CDU/CSU|AfD|SPD|DIE LINKE|FDP|LINKEN"
  126. fraktionsnames <- c("BÜNDNIS 90/DIE GRÜNEN", "CDU/CSU", "AfD", "SPD", "DIE LINKE", "FDP")
  127. parse_comment <- function(comment, rede_id, on_redner) {
  128. base <- c(rede_id = rede_id, on_redner = on_redner)
  129. str_extract_all(comment, fraktionspattern) %>%
  130. `[[`(1) %>%
  131. sapply(partial(flip(head), 1) %.% agrep, x=fraktionsnames, max=0.2, value=T) %>%
  132. str_c(collapse=",") ->
  133. by
  134. # classify comment
  135. # TODO:
  136. # - actually separate content properly
  137. # - differentiate between [AfD] and AfD in by
  138. if(str_detect(comment, "Beifall")) {
  139. c(base, type = "applause", fraktion = by, kommentator = NA_character_, content = comment)
  140. } else {
  141. ps <- str_match(comment, "(.*) \\[(.*?)\\]: (.*)")[1,]
  142. c(base, type = "comment", fraktion = ps[3], kommentator = ps[2], content = ps[4])
  143. }
  144. }
  145. # creates a tibble of reden and a tibble of talks from a list of xml nodes representing reden
  146. parse_redenliste <- function(redenliste_xml) {
  147. d <- sapply(redenliste_xml, parse_rede)
  148. reden <- simplify2array(d["rede", ])
  149. parts <- simplify2array %$% unlist(d["parts", ], recursive=FALSE)
  150. comments <- simplify2array %$% unlist(d["comments", ], recursive=FALSE)
  151. list(reden = tibble(id = reden["id",], redner = reden["redner",]),
  152. talks = tibble(rede_id = parts["rede_id", ],
  153. redner = parts["redner", ],
  154. content = parts["content", ]),
  155. comments = tibble(rede_id = comments["rede_id",],
  156. on_redner = comments["on_redner",],
  157. type = comments["type",],
  158. fraktion = comments["fraktion",],
  159. kommentator = comments["kommentator",],
  160. content = comments["content", ]))
  161. }
  162. # create a tibble of redner from a list of xml nodes representing redner
  163. parse_rednerliste <- function(rednerliste_xml) {
  164. d <- sapply(rednerliste_xml, parse_redner)
  165. tibble(id = d["id",],
  166. vorname = d["vorname",],
  167. nachname = d["nachname",],
  168. fraktion = d["fraktion",],
  169. titel = d["titel",],
  170. rolle_kurz = d["rolle_kurz",],
  171. rolle_lang = d["rolle_lang",])
  172. }
  173. write_to_csv <- function(tables, path="csv/", create=F) {
  174. check_directory(path, create)
  175. write.table(tables$redner, str_c(path, "redner.csv"))
  176. write.table(tables$reden, str_c(path, "reden.csv"))
  177. write.table(tables$talks, str_c(path, "talks.csv"))
  178. write.table(tables$comments, str_c(path, "comments.csv"))
  179. }
  180. read_from_csv <- function(path="csv/") {
  181. list(redner = read.table(str_c(path, "redner.csv")) %>%
  182. tibble() %>%
  183. mutate(id = as.character(id)),
  184. reden = read.table(str_c(path, "reden.csv")) %>%
  185. tibble() %>%
  186. mutate(redner = as.character(redner)),
  187. talks = tibble %$% read.table(str_c(path, "talks.csv")),
  188. comments = tibble %$% read.table(str_c(path, "comments.csv")))
  189. }
  190. # -------------------------------
  191. # EXAMPLE USE
  192. # make sure data ist downloaded via fetch.R
  193. # res <- read_one("records/19126-data.xml")
  194. #
  195. # res$redner
  196. # res$reden
  197. # res$talks
  198. # -------------------------------