An R package to analyze the parliamentary records of the 19th legislative period of the Bundestag, the German parliament.
Você não pode selecionar mais de 25 tópicos Os tópicos devem começar com uma letra ou um número, podem incluir traços ('-') e podem ter até 35 caracteres.

150 linhas
4.5KB

  1. source("config.R")
  2. source("../utils/helpers.R")
  3. library("xml2")
  4. library(tibble)
  5. library(dplyr)
  6. library(magrittr)
  7. library(pbapply)
  8. # for usage see the example at the end
  9. read_all <- function() {
  10. cat("Reading all protocols from", DOWNLOAD_DIR, "\n")
  11. available_protocols <- list.files(DOWNLOAD_DIR)
  12. res <- pblapply(available_protocols, read_one)
  13. lapply(res, `[[`, "redner") %>%
  14. bind_rows() %>%
  15. distinct() ->
  16. redner
  17. lapply(res, `[[`, "reden") %>%
  18. bind_rows() %>%
  19. distinct() ->
  20. reden
  21. lapply(res, `[[`, "talks") %>%
  22. bind_rows() %>%
  23. distinct() ->
  24. talks
  25. list(redner = redner, reden = reden, talks = talks)
  26. }
  27. # this reads all currently parseable data from one xml
  28. read_one <- function(name) {
  29. x <- tryCatch(read_xml(paste0(DOWNLOAD_DIR, name)),
  30. error = function(c) NULL)
  31. if (is.null(x)) return(NULL)
  32. cs <- xml_children(x)
  33. verlauf <- xml_find_first(x, "sitzungsverlauf")
  34. rednerl <- xml_find_first(x, "rednerliste")
  35. xml_children(rednerl) %>%
  36. parse_rednerliste() ->
  37. redner
  38. xml_children(verlauf) %>%
  39. xml_find_all("rede") %>%
  40. parse_redenliste() ->
  41. res
  42. list(redner = redner, reden = res$reden, talks = res$talks)
  43. }
  44. xml_get <- function(node, name) {
  45. res <- xml_text %$% xml_find_all(node, name)
  46. if (length(res) == 0) NA_character_
  47. else res
  48. }
  49. # parse one redner
  50. parse_redner <- function(redner_xml) {
  51. redner_id <- xml_attr(redner_xml, "id")
  52. nm <- xml_child(redner_xml)
  53. vorname <- xml_get(nm, "vorname")
  54. nachname <- xml_get(nm, "nachname")
  55. fraktion <- xml_get(nm, "fraktion")
  56. titel <- xml_get(nm, "titel")
  57. rolle <- xml_find_all(nm, "rolle")
  58. if (length(rolle) > 0) {
  59. rolle_lang <- xml_get(rolle, "rolle_lang")
  60. rolle_kurz <- xml_get(rolle, "rolle_kurz")
  61. } else rolle_kurz <- rolle_lang <- NA_character_
  62. c(id = redner_id, vorname = vorname, nachname = nachname, fraktion = fraktion, titel = titel,
  63. rolle_kurz = rolle_kurz, rolle_lang = rolle_lang)
  64. }
  65. # parse one rede
  66. # returns: - a rede (with rede id and redner id)
  67. # - all talks appearing in the rede (with corresponding content)
  68. parse_rede <- function(rede_xml) {
  69. rede_id <- xml_attr(rede_xml, "id")
  70. cs <- xml_children(rede_xml)
  71. cur_redner <- NA_character_
  72. principal_redner <- NA_character_
  73. cur_content <- ""
  74. reden <- list()
  75. for (node in cs) {
  76. if (xml_name(node) == "p") {
  77. klasse <- xml_attr(node, "klasse")
  78. if (!is.na(klasse) && klasse == "redner") {
  79. if (!is.na(cur_redner)) {
  80. rede <- c(rede_id = rede_id,
  81. redner = cur_redner,
  82. content = cur_content)
  83. reden <- c(reden, list(rede))
  84. cur_content <- ""
  85. } else {
  86. principal_redner <- xml_child(node) %>% xml_attr("id")
  87. }
  88. cur_redner <- xml_child(node) %>% xml_attr("id")
  89. } else {
  90. cur_content <- paste0(cur_content, xml_text(node), sep="\n")
  91. }
  92. }
  93. }
  94. rede <- c(rede_id = rede_id,
  95. redner = cur_redner,
  96. content = cur_content)
  97. reden <- c(reden, list(rede))
  98. list(rede = c(id = rede_id, redner = principal_redner),
  99. parts = reden)
  100. }
  101. # creates a tibble of reden and a tibble of talks from a list of xml nodes representing reden
  102. parse_redenliste <- function(redenliste_xml) {
  103. d <- sapply(redenliste_xml, parse_rede)
  104. reden <- simplify2array(d["rede", ])
  105. parts <- simplify2array %$% unlist(d["parts", ], recursive=FALSE)
  106. list(reden = tibble(id = reden["id",], redner = reden["redner",]),
  107. talks = tibble(rede_id = parts["rede_id", ],
  108. redner = parts["redner", ],
  109. content = parts["content", ]))
  110. }
  111. # create a tibble of redner from a list of xml nodes representing redner
  112. parse_rednerliste <- function(rednerliste_xml) {
  113. d <- sapply(rednerliste_xml, parse_redner)
  114. tibble(id = d["id",],
  115. vorname = d["vorname",],
  116. nachname = d["nachname",],
  117. fraktion = d["fraktion",],
  118. titel = d["titel",],
  119. rolle_kurz = d["rolle_kurz",],
  120. rolle_lang = d["rolle_lang",])
  121. }
  122. # -------------------------------
  123. # EXAMPLE USE
  124. # make sure data ist downloaded via fetch.R
  125. # res <- read_one("19126-data.xml")
  126. #
  127. # res$redner
  128. # res$reden
  129. # res$talks
  130. # -------------------------------