An R package to analyze the parliamentary records of the 19th legislative period of the Bundestag, the German parliament.
Nie możesz wybrać więcej niż 25 tematów. Tematy muszą się zaczynać od litery lub cyfry, mogą zawierać myślniki ('-') i mogą mieć do 35 znaków.

147 wiersze
4.5KB

  1. source("config.R")
  2. source("../utils/helpers.R")
  3. library("xml2")
  4. library(tibble)
  5. library(dplyr)
  6. library(magrittr)
  7. # for usage see the example at the end
  # Read every file listed in DOWNLOAD_DIR and merge the per-file results.
  #
  # Returns a list with three tibbles -- `redner`, `reden` and `talks` -- each
  # row-bound over all files and de-duplicated with distinct().
  # Files for which read_one() returned NULL (unreadable XML) are skipped.
  read_all <- function() {
    available_protocols <- list.files(DOWNLOAD_DIR)
    res <- lapply(available_protocols, read_one)
    res <- Filter(Negate(is.null), res)

    # lapply(), not sapply(): when every tibble has the same number of columns
    # sapply() "simplifies" the list of tibbles into a list-matrix of columns,
    # which bind_rows() cannot stitch back together.
    combine <- function(field) {
      res %>%
        lapply(`[[`, field) %>%
        bind_rows() %>%
        distinct()
    }

    list(
      redner = combine("redner"),
      reden = combine("reden"),
      talks = combine("talks")
    )
  }
  # Read everything that is currently parseable from one protocol XML file.
  #
  # name: file name of the protocol; it is appended to DOWNLOAD_DIR as-is, so
  #       DOWNLOAD_DIR has to end with a path separator.
  #
  # Returns list(redner =, reden =, talks =) of tibbles, or NULL (after a
  # warning) when the file cannot be read as XML.
  read_one <- function(name) {
    print(paste("reading", name))
    path <- paste0(DOWNLOAD_DIR, name)
    x <- tryCatch(
      read_xml(path),
      error = function(e) {
        # Still best-effort (NULL), but tell the user which file was skipped.
        warning("could not read ", path, ": ", conditionMessage(e),
                call. = FALSE)
        NULL
      }
    )
    if (is.null(x)) {
      return(NULL)
    }

    verlauf <- xml_find_first(x, "sitzungsverlauf")
    rednerl <- xml_find_first(x, "rednerliste")

    redner <- parse_rednerliste(xml_children(rednerl))

    # <rede> nodes are looked up one level below the children of
    # <sitzungsverlauf>.
    res <- xml_children(verlauf) %>%
      xml_find_all("rede") %>%
      parse_redenliste()

    list(redner = redner, reden = res$reden, talks = res$talks)
  }
  # Text content of every match of `name` (an xml2 search expression) below
  # `node`, or NA_character_ when nothing matches.
  xml_get <- function(node, name) {
    # Plain nested call instead of `xml_text %$% xml_find_all(...)`: magrittr,
    # which this file attaches, exports `%$%` as its exposition pipe, so the
    # operator form only works if some other definition of `%$%` shadows it.
    res <- xml_text(xml_find_all(node, name))
    if (length(res) == 0) {
      NA_character_
    } else {
      res
    }
  }
  # Parse a single redner node.
  #
  # redner_xml: xml node with an "id" attribute; its first child is searched
  #             for vorname, nachname, fraktion, titel and rolle.
  #
  # Returns a named character vector with the entries id, vorname, nachname,
  # fraktion, titel, rolle_kurz and rolle_lang; entries that are not found are
  # NA_character_.
  parse_redner <- function(redner_xml) {
    name_node <- xml_child(redner_xml)
    role_nodes <- xml_find_all(name_node, "rolle")

    # Both role fields are only looked up when a <rolle> node exists.
    role <- if (length(role_nodes) > 0) {
      list(
        lang = xml_get(role_nodes, "rolle_lang"),
        kurz = xml_get(role_nodes, "rolle_kurz")
      )
    } else {
      list(lang = NA_character_, kurz = NA_character_)
    }

    c(
      id = xml_attr(redner_xml, "id"),
      vorname = xml_get(name_node, "vorname"),
      nachname = xml_get(name_node, "nachname"),
      fraktion = xml_get(name_node, "fraktion"),
      titel = xml_get(name_node, "titel"),
      rolle_kurz = role$kurz,
      rolle_lang = role$lang
    )
  }
  # Parse one rede node.
  #
  # Walks the direct children of `rede_xml` in document order and only looks
  # at <p> elements:
  #   - <p klasse="redner"> marks a change of redner; the redner id is the "id"
  #     attribute of that paragraph's first child.
  #   - every other <p> appends its text plus a newline to the current part.
  #
  # Returns a list with
  #   rede  - c(id =, redner =): the rede id and the id of the first redner
  #   parts - list of c(rede_id =, redner =, content =), one entry per stretch
  #           of paragraphs between two redner markers (always at least one)
  parse_rede <- function(rede_xml) {
    rede_id <- xml_attr(rede_xml, "id")
    cur_redner <- NA_character_
    principal_redner <- NA_character_
    cur_content <- ""
    reden <- list()

    for (node in xml_children(rede_xml)) {
      if (xml_name(node) != "p") {
        next
      }
      klasse <- xml_attr(node, "klasse")
      if (!is.na(klasse) && klasse == "redner") {
        next_redner <- xml_attr(xml_child(node), "id")
        if (is.na(cur_redner)) {
          # No redner so far: this one is the principal redner of the rede.
          principal_redner <- next_redner
        } else {
          # Close the part of the previous redner before switching.
          reden <- c(reden, list(c(rede_id = rede_id,
                                   redner = cur_redner,
                                   content = cur_content)))
          cur_content <- ""
        }
        cur_redner <- next_redner
      } else {
        # paste0() has no `sep` argument; the newline is an ordinary piece.
        cur_content <- paste0(cur_content, xml_text(node), "\n")
      }
    }

    # Flush whatever was collected for the last redner.
    reden <- c(reden, list(c(rede_id = rede_id,
                             redner = cur_redner,
                             content = cur_content)))

    list(rede = c(id = rede_id, redner = principal_redner),
         parts = reden)
  }
  # Build a tibble of reden and a tibble of talks from a list of xml nodes
  # representing reden.
  #
  # Returns list(reden =, talks =):
  #   reden - tibble(id, redner), one row per rede node
  #   talks - tibble(rede_id, redner, content), one row per part of any rede
  # An empty input yields two zero-row tibbles.
  parse_redenliste <- function(redenliste_xml) {
    parsed <- lapply(redenliste_xml, parse_rede)

    heads <- lapply(parsed, `[[`, "rede")
    # Flatten exactly one level: a list of part lists -> a list of parts.
    parts <- unlist(lapply(parsed, `[[`, "parts"), recursive = FALSE)

    # vapply() keeps the result a character vector even for empty input,
    # where sapply() plus matrix indexing would fail.
    pick <- function(rows, key) {
      vapply(rows, `[[`, character(1), key, USE.NAMES = FALSE)
    }

    list(
      reden = tibble(
        id = pick(heads, "id"),
        redner = pick(heads, "redner")
      ),
      talks = tibble(
        rede_id = pick(parts, "rede_id"),
        redner = pick(parts, "redner"),
        content = pick(parts, "content")
      )
    )
  }
  # Build a tibble of redner from a list of xml nodes representing redner.
  #
  # Returns a tibble with the character columns id, vorname, nachname,
  # fraktion, titel, rolle_kurz and rolle_lang, one row per input node.
  # An empty input yields a zero-row tibble.
  parse_rednerliste <- function(rednerliste_xml) {
    rows <- lapply(rednerliste_xml, parse_redner)

    # One typed column per field; unlike sapply() + matrix indexing this also
    # works for zero redner nodes.
    pick <- function(key) {
      vapply(rows, `[[`, character(1), key, USE.NAMES = FALSE)
    }

    tibble(
      id = pick("id"),
      vorname = pick("vorname"),
      nachname = pick("nachname"),
      fraktion = pick("fraktion"),
      titel = pick("titel"),
      rolle_kurz = pick("rolle_kurz"),
      rolle_lang = pick("rolle_lang")
    )
  }
  # -------------------------------
  # EXAMPLE USE
  # Make sure the data has been downloaded via fetch.R first.
  res <- read_one("19126-data.xml")
  res[["redner"]]
  res[["reden"]]
  res[["talks"]]
  # -------------------------------