An R package to analyze the parliamentary records of the 19th legislative period of the Bundestag, the German parliament.
Ви не можете вибрати більше 25 тем Теми мають розпочинатися з літери або цифри, можуть містити дефіси (-) і не повинні перевищувати 35 символів.

242 рядки
8.6KB

  1. # for usage see the example at the end
  2. #' Parse xml records
  3. #'
  4. #' Creates a list of tibbles containing relevant information from all records
  5. #' stored in the input directory.
  6. #'
  7. #' @param path character
  8. #'
  9. #' @export
  10. read_all <- function(path="records/") {
  11. cat("Reading all records from", path, "\n")
  12. available_protocols <- list.files(path)
  13. res <- pblapply(available_protocols, read_one, path=path)
  14. if (length(available_protocols) == 0)
  15. stop("The given directory is empty or does not exist.")
  16. lapply(res, `[[`, "redner") %>%
  17. bind_rows() %>%
  18. distinct() ->
  19. redner
  20. lapply(res, `[[`, "reden") %>%
  21. bind_rows() %>%
  22. distinct() %>%
  23. mutate(date = as.Date(date, format="%d.%m.%Y")) ->
  24. reden
  25. lapply(res, `[[`, "talks") %>%
  26. bind_rows() %>%
  27. distinct() ->
  28. talks
  29. lapply(res, `[[`, "comments") %>%
  30. bind_rows() %>%
  31. distinct() ->
  32. commentsandapplause
  33. filter(commentsandapplause, type == "comment") %>%
  34. select(-type) ->
  35. comments
  36. filter(commentsandapplause, type == "applause") %>%
  37. select(-type, -kommentator, -content) %>%
  38. mutate("CDU_CSU" = str_detect(fraktion, "CDU/CSU"),
  39. "SPD" = str_detect(fraktion, "SPD"),
  40. "FDP" = str_detect(fraktion, "FDP"),
  41. "DIE_LINKE" = str_detect(fraktion, "DIE LINKE"),
  42. "BUENDNIS_90_DIE_GRUENEN" = str_detect(fraktion, "BÜNDNIS 90/DIE GRÜNEN"),
  43. "AfD" = str_detect(fraktion, "AfD")) %>%
  44. select(-fraktion) ->
  45. applause
  46. list(redner = redner, reden = reden, talks = talks, comments = comments, applause = applause)
  47. }
  48. # this reads all currently parseable data from one xml
  49. read_one <- function(name, path) {
  50. x <- tryCatch(read_xml(paste0(path, name)),
  51. error = function(c) NULL)
  52. if (is.null(x)) return(NULL)
  53. # extract date of session
  54. date <- xml_attr(x, "sitzung-datum")
  55. cs <- xml_children(x)
  56. verlauf <- xml_find_first(x, "sitzungsverlauf")
  57. rednerl <- xml_find_first(x, "rednerliste")
  58. xml_children(rednerl) %>%
  59. parse_rednerliste() ->
  60. redner
  61. xml_children(verlauf) %>%
  62. xml_find_all("rede") %>%
  63. parse_redenliste(date) ->
  64. res
  65. list(redner = redner, reden = res$reden, talks = res$talks, comments = res$comments)
  66. }
  67. xml_get <- function(node, name) {
  68. res <- xml_text %$% xml_find_all(node, name)
  69. if (length(res) == 0) NA_character_
  70. else res
  71. }
  72. # parse one redner
  73. parse_redner <- function(redner_xml) {
  74. redner_id <- xml_attr(redner_xml, "id")
  75. nm <- xml_child(redner_xml)
  76. vorname <- xml_get(nm, "vorname")
  77. nachname <- xml_get(nm, "nachname")
  78. fraktion <- xml_get(nm, "fraktion")
  79. titel <- xml_get(nm, "titel")
  80. rolle <- xml_find_all(nm, "rolle")
  81. if (length(rolle) > 0) {
  82. rolle_lang <- xml_get(rolle, "rolle_lang")
  83. rolle_kurz <- xml_get(rolle, "rolle_kurz")
  84. } else rolle_kurz <- rolle_lang <- NA_character_
  85. c(id = redner_id, vorname = vorname, nachname = nachname, fraktion = fraktion, titel = titel,
  86. rolle_kurz = rolle_kurz, rolle_lang = rolle_lang)
  87. }
  88. # parse one rede
  89. # returns: - a rede (with rede id and redner id)
  90. # - all talks appearing in the rede (with corresponding content)
  91. parse_rede <- function(rede_xml, date) {
  92. rede_id <- xml_attr(rede_xml, "id")
  93. cs <- xml_children(rede_xml)
  94. cur_redner <- NA_character_
  95. principal_redner <- NA_character_
  96. cur_content <- ""
  97. reden <- list()
  98. comments <- list()
  99. for (node in cs) {
  100. if (xml_name(node) == "p" || xml_name(node) == "name") {
  101. klasse <- xml_attr(node, "klasse")
  102. if ((!is.na(klasse) && klasse == "redner") || xml_name(node) == "name") {
  103. if (!is.na(cur_redner)) {
  104. rede <- c(rede_id = rede_id,
  105. redner = cur_redner,
  106. content = cur_content)
  107. reden <- c(reden, list(rede))
  108. cur_content <- ""
  109. }
  110. if (is.na(principal_redner) && xml_name(node) != "name") {
  111. principal_redner <- xml_child(node) %>% xml_attr("id")
  112. }
  113. if (xml_name(node) == "name") {
  114. cur_redner <- "BTP"
  115. } else {
  116. cur_redner <- xml_child(node) %>% xml_attr("id")
  117. }
  118. } else {
  119. cur_content <- paste0(cur_content, xml_text(node), sep="\n")
  120. }
  121. } else if (xml_name(node) == "kommentar") {
  122. # comments are of the form
  123. # <kommentar>(blabla [Fraktion] – blabla liasdf – bla)</kommentar>
  124. xml_text(node) %>%
  125. str_sub(2, -2) %>%
  126. str_split("–") %>%
  127. `[[`(1) %>%
  128. lapply(parse_comment, rede_id = rede_id, on_redner = cur_redner) ->
  129. cs
  130. comments <- c(comments, cs)
  131. }
  132. }
  133. rede <- c(rede_id = rede_id,
  134. redner = cur_redner,
  135. content = cur_content)
  136. reden <- c(reden, list(rede))
  137. list(rede = c(id = rede_id, redner = principal_redner, date = date),
  138. parts = reden,
  139. comments = comments)
  140. }
  141. fraktionspattern <- "BÜNDNIS(SES)?\\W*90/DIE\\W*GRÜNEN|CDU/CSU|AfD|SPD|DIE LINKE|FDP|LINKEN"
  142. fraktionsnames <- c("BÜNDNIS 90/DIE GRÜNEN", "CDU/CSU", "AfD", "SPD", "DIE LINKE", "FDP")
  143. parse_comment <- function(comment, rede_id, on_redner) {
  144. base <- c(rede_id = rede_id, on_redner = on_redner)
  145. # classify comment
  146. if(str_detect(comment, "Beifall")) {
  147. str_extract_all(comment, fraktionspattern) %>%
  148. `[[`(1) %>%
  149. sapply(partial(flip(head), 1) %.% agrep, x=fraktionsnames, max=0.2, value=T) %>%
  150. str_c(collapse=",") ->
  151. by
  152. c(base, type = "applause", fraktion = by, kommentator = NA_character_, content = comment)
  153. } else {
  154. ps <- str_match(comment, "(.*) \\[(.*?)\\]: (.*)")[1,]
  155. c(base, type = "comment", fraktion = ps[3], kommentator = ps[2], content = ps[4])
  156. }
  157. }
  158. # creates a tibble of reden and a tibble of talks from a list of xml nodes representing reden
  159. parse_redenliste <- function(redenliste_xml, date) {
  160. d <- sapply(redenliste_xml, parse_rede, date = date)
  161. reden <- simplify2array(d["rede", ])
  162. parts <- simplify2array %$% unlist(d["parts", ], recursive=FALSE)
  163. comments <- simplify2array %$% unlist(d["comments", ], recursive=FALSE)
  164. list(reden = tibble(id = reden["id",], redner = reden["redner",],
  165. date = reden["date",]),
  166. talks = tibble(rede_id = parts["rede_id", ],
  167. redner = parts["redner", ],
  168. content = parts["content", ]),
  169. comments = tibble(rede_id = comments["rede_id",],
  170. on_redner = comments["on_redner",],
  171. type = comments["type",],
  172. fraktion = comments["fraktion",],
  173. kommentator = comments["kommentator",],
  174. content = comments["content", ]))
  175. }
  176. # create a tibble of redner from a list of xml nodes representing redner
  177. parse_rednerliste <- function(rednerliste_xml) {
  178. d <- sapply(rednerliste_xml, parse_redner)
  179. tibble(id = d["id",],
  180. vorname = d["vorname",],
  181. nachname = d["nachname",],
  182. fraktion = d["fraktion",],
  183. titel = d["titel",],
  184. rolle_kurz = d["rolle_kurz",],
  185. rolle_lang = d["rolle_lang",])
  186. }
  187. #' @export
  188. write_to_csv <- function(tables, path="csv/", create=F) {
  189. check_directory(path, create)
  190. write.table(tables$redner, str_c(path, "redner.csv"))
  191. write.table(tables$reden, str_c(path, "reden.csv"))
  192. write.table(tables$talks, str_c(path, "talks.csv"))
  193. write.table(tables$comments, str_c(path, "comments.csv"))
  194. write.table(tables$applause, str_c(path, "applause.csv"))
  195. }
  196. #' @export
  197. read_from_csv <- function(path="csv/") {
  198. list(redner = read.table(str_c(path, "redner.csv")) %>%
  199. tibble() %>%
  200. mutate(id = as.character(id)),
  201. reden = read.table(str_c(path, "reden.csv")) %>%
  202. tibble() %>%
  203. mutate(redner = as.character(redner)),
  204. talks = tibble %$% read.table(str_c(path, "talks.csv")),
  205. comments = tibble %$% read.table(str_c(path, "comments.csv")),
  206. applause = tibble %$% read.table(str_c(path, "applause.csv")))
  207. }
  208. # -------------------------------
  209. # EXAMPLE USE
  210. # make sure data ist downloaded via fetch.R
  211. # res <- read_one("records/19126-data.xml")
  212. #
  213. # res$redner
  214. # res$reden
  215. # res$talks
  216. # -------------------------------