An R package to analyze the parliamentary records of the 19th legislative period of the Bundestag, the German parliament.
Вы не можете выбрать более 25 тем Темы должны начинаться с буквы или цифры, могут содержать дефисы(-) и должны содержать не более 35 символов.

150 строки
4.5KB

  1. source("config.R")
  2. source("../utils/helpers.R")
  3. library("xml2")
  4. library(tibble)
  5. library(dplyr)
  6. library(magrittr)
  7. library(pbapply)
  8. # for usage see the example at the end
  9. read_all <- function() {
  10. cat("Reading all protocols from", DOWNLOAD_DIR, "\n")
  11. available_protocols <- list.files(DOWNLOAD_DIR)
  12. res <- pblapply(available_protocols, read_one)
  13. lapply(res, `[[`, "redner") %>%
  14. bind_rows() %>%
  15. distinct() ->
  16. redner
  17. lapply(res, `[[`, "reden") %>%
  18. bind_rows() %>%
  19. distinct() ->
  20. reden
  21. lapply(res, `[[`, "talks") %>%
  22. bind_rows() %>%
  23. distinct() ->
  24. talks
  25. list(redner = redner, reden = reden, talks = talks)
  26. }
  27. # this reads all currently parseable data from one xml
  28. read_one <- function(name) {
  29. x <- tryCatch(read_xml(paste0(DOWNLOAD_DIR, name)),
  30. error = function(c) NULL)
  31. if (is.null(x)) return(NULL)
  32. cs <- xml_children(x)
  33. verlauf <- xml_find_first(x, "sitzungsverlauf")
  34. rednerl <- xml_find_first(x, "rednerliste")
  35. xml_children(rednerl) %>%
  36. parse_rednerliste() ->
  37. redner
  38. xml_children(verlauf) %>%
  39. xml_find_all("rede") %>%
  40. parse_redenliste() ->
  41. res
  42. list(redner = redner, reden = res$reden, talks = res$talks)
  43. }
  44. xml_get <- function(node, name) {
  45. res <- xml_text %$% xml_find_all(node, name)
  46. if (length(res) == 0) NA_character_
  47. else res
  48. }
  49. # parse one redner
  50. parse_redner <- function(redner_xml) {
  51. redner_id <- xml_attr(redner_xml, "id")
  52. nm <- xml_child(redner_xml)
  53. vorname <- xml_get(nm, "vorname")
  54. nachname <- xml_get(nm, "nachname")
  55. fraktion <- xml_get(nm, "fraktion")
  56. titel <- xml_get(nm, "titel")
  57. rolle <- xml_find_all(nm, "rolle")
  58. if (length(rolle) > 0) {
  59. rolle_lang <- xml_get(rolle, "rolle_lang")
  60. rolle_kurz <- xml_get(rolle, "rolle_kurz")
  61. } else rolle_kurz <- rolle_lang <- NA_character_
  62. c(id = redner_id, vorname = vorname, nachname = nachname, fraktion = fraktion, titel = titel,
  63. rolle_kurz = rolle_kurz, rolle_lang = rolle_lang)
  64. }
  65. # parse one rede
  66. # returns: - a rede (with rede id and redner id)
  67. # - all talks appearing in the rede (with corresponding content)
  68. parse_rede <- function(rede_xml) {
  69. rede_id <- xml_attr(rede_xml, "id")
  70. cs <- xml_children(rede_xml)
  71. cur_redner <- NA_character_
  72. principal_redner <- NA_character_
  73. cur_content <- ""
  74. reden <- list()
  75. for (node in cs) {
  76. if (xml_name(node) == "p") {
  77. klasse <- xml_attr(node, "klasse")
  78. if (!is.na(klasse) && klasse == "redner") {
  79. if (!is.na(cur_redner)) {
  80. rede <- c(rede_id = rede_id,
  81. redner = cur_redner,
  82. content = cur_content)
  83. reden <- c(reden, list(rede))
  84. cur_content <- ""
  85. } else {
  86. principal_redner <- xml_child(node) %>% xml_attr("id")
  87. }
  88. cur_redner <- xml_child(node) %>% xml_attr("id")
  89. } else {
  90. cur_content <- paste0(cur_content, xml_text(node), sep="\n")
  91. }
  92. }
  93. }
  94. rede <- c(rede_id = rede_id,
  95. redner = cur_redner,
  96. content = cur_content)
  97. reden <- c(reden, list(rede))
  98. list(rede = c(id = rede_id, redner = principal_redner),
  99. parts = reden)
  100. }
  101. # creates a tibble of reden and a tibble of talks from a list of xml nodes representing reden
  102. parse_redenliste <- function(redenliste_xml) {
  103. d <- sapply(redenliste_xml, parse_rede)
  104. reden <- simplify2array(d["rede", ])
  105. parts <- simplify2array %$% unlist(d["parts", ], recursive=FALSE)
  106. list(reden = tibble(id = reden["id",], redner = reden["redner",]),
  107. talks = tibble(rede_id = parts["rede_id", ],
  108. redner = parts["redner", ],
  109. content = parts["content", ]))
  110. }
  111. # create a tibble of redner from a list of xml nodes representing redner
  112. parse_rednerliste <- function(rednerliste_xml) {
  113. d <- sapply(rednerliste_xml, parse_redner)
  114. tibble(id = d["id",],
  115. vorname = d["vorname",],
  116. nachname = d["nachname",],
  117. fraktion = d["fraktion",],
  118. titel = d["titel",],
  119. rolle_kurz = d["rolle_kurz",],
  120. rolle_lang = d["rolle_lang",])
  121. }
  122. # -------------------------------
  123. # EXAMPLE USE
  124. # make sure data ist downloaded via fetch.R
  125. # res <- read_one("19126-data.xml")
  126. #
  127. # res$redner
  128. # res$reden
  129. # res$talks
  130. # -------------------------------