An R package to analyze the parliamentary records of the 19th legislative period of the Bundestag, the German parliament.
Du kannst nicht mehr als 25 Themen auswählen Themen müssen entweder mit einem Buchstaben oder einer Ziffer beginnen. Sie können Bindestriche („-“) enthalten und bis zu 35 Zeichen lang sein.

242 Zeilen
8.6KB

  1. # for usage see the example at the end
  2. #' Parse xml records
  3. #'
  4. #' Creates a list of tibbles containing relevant information from all records
  5. #' stored in the input directory.
  6. #'
  7. #' @param path character
  8. #'
  9. #' @export
  10. read_all <- function(path="records/") {
  11. cat("Reading all records from", path, "\n")
  12. available_protocols <- list.files(path)
  13. res <- pblapply(available_protocols, read_one, path=path)
  14. if (length(available_protocols) == 0)
  15. stop("The given directory is empty or does not exist.")
  16. lapply(res, `[[`, "redner") %>%
  17. bind_rows() %>%
  18. distinct() ->
  19. redner
  20. lapply(res, `[[`, "reden") %>%
  21. bind_rows() %>%
  22. distinct() %>%
  23. mutate(date = as.Date(date, format="%d.%m.%Y")) ->
  24. reden
  25. lapply(res, `[[`, "talks") %>%
  26. bind_rows() %>%
  27. distinct() ->
  28. talks
  29. lapply(res, `[[`, "comments") %>%
  30. bind_rows() %>%
  31. distinct() ->
  32. commentsandapplause
  33. filter(commentsandapplause, type == "comment") %>%
  34. select(-type) ->
  35. comments
  36. filter(commentsandapplause, type == "applause") %>%
  37. select(-type, -kommentator, -content) %>%
  38. mutate("CDU_CSU" = str_detect(fraktion, "CDU/CSU"),
  39. "SPD" = str_detect(fraktion, "SPD"),
  40. "FDP" = str_detect(fraktion, "FDP"),
  41. "DIE_LINKE" = str_detect(fraktion, "DIE LINKE"),
  42. "BUENDNIS_90_DIE_GRUENEN" = str_detect(fraktion, "BÜNDNIS 90/DIE GRÜNEN"),
  43. "AfD" = str_detect(fraktion, "AfD")) %>%
  44. select(-fraktion) ->
  45. applause
  46. list(redner = redner, reden = reden, talks = talks, comments = comments, applause = applause)
  47. }
  48. # this reads all currently parseable data from one xml
  49. read_one <- function(name, path) {
  50. x <- tryCatch(read_xml(paste0(path, name)),
  51. error = function(c) NULL)
  52. if (is.null(x)) return(NULL)
  53. # extract date of session
  54. date <- xml_attr(x, "sitzung-datum")
  55. cs <- xml_children(x)
  56. verlauf <- xml_find_first(x, "sitzungsverlauf")
  57. rednerl <- xml_find_first(x, "rednerliste")
  58. xml_children(rednerl) %>%
  59. parse_rednerliste() ->
  60. redner
  61. xml_children(verlauf) %>%
  62. xml_find_all("rede") %>%
  63. parse_redenliste(date) ->
  64. res
  65. list(redner = redner, reden = res$reden, talks = res$talks, comments = res$comments)
  66. }
  67. xml_get <- function(node, name) {
  68. res <- xml_text %$% xml_find_all(node, name)
  69. if (length(res) == 0) NA_character_
  70. else res
  71. }
  72. # parse one redner
  73. parse_redner <- function(redner_xml) {
  74. redner_id <- xml_attr(redner_xml, "id")
  75. nm <- xml_child(redner_xml)
  76. vorname <- xml_get(nm, "vorname")
  77. nachname <- xml_get(nm, "nachname")
  78. fraktion <- xml_get(nm, "fraktion")
  79. titel <- xml_get(nm, "titel")
  80. rolle <- xml_find_all(nm, "rolle")
  81. if (length(rolle) > 0) {
  82. rolle_lang <- xml_get(rolle, "rolle_lang")
  83. rolle_kurz <- xml_get(rolle, "rolle_kurz")
  84. } else rolle_kurz <- rolle_lang <- NA_character_
  85. c(id = redner_id, vorname = vorname, nachname = nachname, fraktion = fraktion, titel = titel,
  86. rolle_kurz = rolle_kurz, rolle_lang = rolle_lang)
  87. }
  88. # parse one rede
  89. # returns: - a rede (with rede id and redner id)
  90. # - all talks appearing in the rede (with corresponding content)
  91. parse_rede <- function(rede_xml, date) {
  92. rede_id <- xml_attr(rede_xml, "id")
  93. cs <- xml_children(rede_xml)
  94. cur_redner <- NA_character_
  95. principal_redner <- NA_character_
  96. cur_content <- ""
  97. reden <- list()
  98. comments <- list()
  99. for (node in cs) {
  100. if (xml_name(node) == "p" || xml_name(node) == "name") {
  101. klasse <- xml_attr(node, "klasse")
  102. if ((!is.na(klasse) && klasse == "redner") || xml_name(node) == "name") {
  103. if (!is.na(cur_redner)) {
  104. rede <- c(rede_id = rede_id,
  105. redner = cur_redner,
  106. content = cur_content)
  107. reden <- c(reden, list(rede))
  108. cur_content <- ""
  109. }
  110. if (is.na(principal_redner) && xml_name(node) != "name") {
  111. principal_redner <- xml_child(node) %>% xml_attr("id")
  112. }
  113. if (xml_name(node) == "name") {
  114. cur_redner <- "BTP"
  115. } else {
  116. cur_redner <- xml_child(node) %>% xml_attr("id")
  117. }
  118. } else {
  119. cur_content <- paste0(cur_content, xml_text(node), sep="\n")
  120. }
  121. } else if (xml_name(node) == "kommentar") {
  122. # comments are of the form
  123. # <kommentar>(blabla [Fraktion] – blabla liasdf – bla)</kommentar>
  124. xml_text(node) %>%
  125. str_sub(2, -2) %>%
  126. str_split("–") %>%
  127. `[[`(1) %>%
  128. lapply(parse_comment, rede_id = rede_id, on_redner = cur_redner) ->
  129. cs
  130. comments <- c(comments, cs)
  131. }
  132. }
  133. rede <- c(rede_id = rede_id,
  134. redner = cur_redner,
  135. content = cur_content)
  136. reden <- c(reden, list(rede))
  137. list(rede = c(id = rede_id, redner = principal_redner, date = date),
  138. parts = reden,
  139. comments = comments)
  140. }
  141. fraktionspattern <- "BÜNDNIS(SES)?\\W*90/DIE\\W*GRÜNEN|CDU/CSU|AfD|SPD|DIE LINKE|FDP|LINKEN"
  142. fraktionsnames <- c("BÜNDNIS 90/DIE GRÜNEN", "CDU/CSU", "AfD", "SPD", "DIE LINKE", "FDP")
  143. parse_comment <- function(comment, rede_id, on_redner) {
  144. base <- c(rede_id = rede_id, on_redner = on_redner)
  145. # classify comment
  146. if(str_detect(comment, "Beifall")) {
  147. str_extract_all(comment, fraktionspattern) %>%
  148. `[[`(1) %>%
  149. sapply(partial(flip(head), 1) %.% agrep, x=fraktionsnames, max=0.2, value=T) %>%
  150. str_c(collapse=",") ->
  151. by
  152. c(base, type = "applause", fraktion = by, kommentator = NA_character_, content = comment)
  153. } else {
  154. ps <- str_match(comment, "(.*) \\[(.*?)\\]: (.*)")[1,]
  155. c(base, type = "comment", fraktion = ps[3], kommentator = ps[2], content = ps[4])
  156. }
  157. }
  158. # creates a tibble of reden and a tibble of talks from a list of xml nodes representing reden
  159. parse_redenliste <- function(redenliste_xml, date) {
  160. d <- sapply(redenliste_xml, parse_rede, date = date)
  161. reden <- simplify2array(d["rede", ])
  162. parts <- simplify2array %$% unlist(d["parts", ], recursive=FALSE)
  163. comments <- simplify2array %$% unlist(d["comments", ], recursive=FALSE)
  164. list(reden = tibble(id = reden["id",], redner = reden["redner",],
  165. date = reden["date",]),
  166. talks = tibble(rede_id = parts["rede_id", ],
  167. redner = parts["redner", ],
  168. content = parts["content", ]),
  169. comments = tibble(rede_id = comments["rede_id",],
  170. on_redner = comments["on_redner",],
  171. type = comments["type",],
  172. fraktion = comments["fraktion",],
  173. kommentator = comments["kommentator",],
  174. content = comments["content", ]))
  175. }
  176. # create a tibble of redner from a list of xml nodes representing redner
  177. parse_rednerliste <- function(rednerliste_xml) {
  178. d <- sapply(rednerliste_xml, parse_redner)
  179. tibble(id = d["id",],
  180. vorname = d["vorname",],
  181. nachname = d["nachname",],
  182. fraktion = d["fraktion",],
  183. titel = d["titel",],
  184. rolle_kurz = d["rolle_kurz",],
  185. rolle_lang = d["rolle_lang",])
  186. }
  187. #' @export
  188. write_to_csv <- function(tables, path="csv/", create=F) {
  189. check_directory(path, create)
  190. write.table(tables$redner, str_c(path, "redner.csv"))
  191. write.table(tables$reden, str_c(path, "reden.csv"))
  192. write.table(tables$talks, str_c(path, "talks.csv"))
  193. write.table(tables$comments, str_c(path, "comments.csv"))
  194. write.table(tables$applause, str_c(path, "applause.csv"))
  195. }
  196. #' @export
  197. read_from_csv <- function(path="csv/") {
  198. list(redner = read.table(str_c(path, "redner.csv")) %>%
  199. tibble() %>%
  200. mutate(id = as.character(id)),
  201. reden = read.table(str_c(path, "reden.csv")) %>%
  202. tibble() %>%
  203. mutate(redner = as.character(redner)),
  204. talks = tibble %$% read.table(str_c(path, "talks.csv")),
  205. comments = tibble %$% read.table(str_c(path, "comments.csv")),
  206. applause = tibble %$% read.table(str_c(path, "applause.csv")))
  207. }
  208. # -------------------------------
  209. # EXAMPLE USE
  210. # make sure data ist downloaded via fetch.R
  211. # res <- read_one("records/19126-data.xml")
  212. #
  213. # res$redner
  214. # res$reden
  215. # res$talks
  216. # -------------------------------