An R package to analyze the parliamentary records of the 19th legislative period of the Bundestag, the German parliament.
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

121 line
3.8KB

  1. source("config.R")
  2. source("../utils/helpers.R")
  3. library("xml2")
  4. library(tibble)
  5. library(magrittr)
  6. # for usage see the example at the end
  # Read everything that is currently parseable from one protocol XML file.
  #
  # name: file name of the protocol; it is appended as-is to DOWNLOAD_DIR
  #       (DOWNLOAD_DIR is not defined in this file - presumably it comes
  #       from config.R, so it has to end with a path separator).
  # Returns a list with three elements:
  #   redner - result of parse_rednerliste() on the children of <rednerliste>
  #   reden  - the `reden` element returned by parse_redenliste()
  #   talks  - the `talks` element returned by parse_redenliste()
  read_one <- function(name) {
    doc <- read_xml(paste0(DOWNLOAD_DIR, name))

    speaker_list <- xml_find_first(doc, "rednerliste")
    redner <- parse_rednerliste(xml_children(speaker_list))

    # <rede> elements are searched relative to each child of
    # <sitzungsverlauf>, not relative to <sitzungsverlauf> itself.
    proceedings <- xml_find_first(doc, "sitzungsverlauf")
    speeches <- xml_find_all(xml_children(proceedings), "rede")
    res <- parse_redenliste(speeches)

    list(redner = redner, reden = res$reden, talks = res$talks)
  }
  # Text content of all nodes matching `name` below `node`.
  #
  # node: xml2 node or node set to search in.
  # name: XPath expression handed to xml_find_all().
  # Returns a character vector with one entry per match, or NA_character_
  # when nothing matches.
  xml_get <- function(node, name) {
    # NOTE(review): this used to be spelled `xml_text %$% xml_find_all(...)`.
    # `%$%` is not defined in this file (presumably a function-application
    # helper from ../utils/helpers.R) and magrittr exports an operator of the
    # same name with different semantics, so xml_text() is called directly.
    res <- xml_text(xml_find_all(node, name))
    if (length(res) == 0) {
      NA_character_
    } else {
      res
    }
  }
  # Parse a single <redner> node into a named character vector.
  #
  # redner_xml: xml2 node of one speaker; its first child is used as the
  #             name node that holds the individual fields.
  # Returns c(id, vorname, nachname, fraktion, titel, rolle_kurz, rolle_lang).
  # Fields that are not present are NA_character_; both rolle_* fields are NA
  # when the name node has no <rolle> element.
  parse_redner <- function(redner_xml) {
    name_node <- xml_child(redner_xml)
    role_nodes <- xml_find_all(name_node, "rolle")
    has_role <- length(role_nodes) > 0

    role_long <- if (has_role) xml_get(role_nodes, "rolle_lang") else NA_character_
    role_short <- if (has_role) xml_get(role_nodes, "rolle_kurz") else NA_character_

    c(
      id = xml_attr(redner_xml, "id"),
      vorname = xml_get(name_node, "vorname"),
      nachname = xml_get(name_node, "nachname"),
      fraktion = xml_get(name_node, "fraktion"),
      titel = xml_get(name_node, "titel"),
      rolle_kurz = role_short,
      rolle_lang = role_long
    )
  }
  # Parse one <rede> node.
  #
  # rede_xml: xml2 node of a single <rede>.
  # Returns a list with
  #   rede  - c(id = <id of the rede>, redner = <id taken from the first
  #           <p klasse="redner"> paragraph, NA if there is none>)
  #   parts - list of c(rede_id, redner, content) vectors; a new part starts
  #           at every <p klasse="redner"> after the first one, and the text
  #           of all other <p> children is appended to the current part,
  #           each followed by a newline.
  # Children that are not <p> elements are ignored.
  parse_rede <- function(rede_xml) {
    rede_id <- xml_attr(rede_xml, "id")
    cur_redner <- NA_character_
    principal_redner <- NA_character_
    cur_content <- ""
    reden <- list()

    for (node in xml_children(rede_xml)) {
      if (xml_name(node) != "p") {
        next
      }
      # xml_attr() yields NA for a <p> without a "klasse" attribute; isTRUE()
      # treats that as an ordinary content paragraph instead of failing in if().
      if (isTRUE(xml_attr(node, "klasse") == "redner")) {
        redner_id <- xml_attr(xml_child(node), "id")
        if (is.na(cur_redner)) {
          # first speaker seen: this is the principal speaker of the rede
          principal_redner <- redner_id
        } else {
          # speaker change: close the part of the previous speaker
          reden <- c(reden, list(c(rede_id = rede_id,
                                   redner = cur_redner,
                                   content = cur_content)))
          cur_content <- ""
        }
        cur_redner <- redner_id
      } else {
        # paste0() has no `sep` argument, so the newline is passed as a
        # regular trailing piece.
        cur_content <- paste0(cur_content, xml_text(node), "\n")
      }
    }

    # close the part that is still open when the children are exhausted
    reden <- c(reden, list(c(rede_id = rede_id,
                             redner = cur_redner,
                             content = cur_content)))
    list(rede = c(id = rede_id, redner = principal_redner),
         parts = reden)
  }
  # Build a tibble of reden and a tibble of talks from a list of <rede> nodes.
  #
  # redenliste_xml: list / node set of <rede> nodes, each handed to parse_rede().
  # Returns a list with
  #   reden - tibble(id, redner), one row per rede
  #   talks - tibble(rede_id, redner, content), one row per part of any rede
  # Empty input yields two zero-row tibbles with the same columns.
  parse_redenliste <- function(redenliste_xml) {
    # lapply()/vapply() keep the result shape fixed; sapply() collapses to a
    # plain list() for empty input, which cannot be indexed like a matrix.
    parsed <- lapply(redenliste_xml, parse_rede)
    reden <- lapply(parsed, function(p) p[["rede"]])
    parts <- unlist(lapply(parsed, function(p) p[["parts"]]), recursive = FALSE)

    # pull one named field out of every character vector in `items`
    field <- function(items, key) {
      vapply(items, function(item) item[[key]], character(1), USE.NAMES = FALSE)
    }

    list(
      reden = tibble(id = field(reden, "id"),
                     redner = field(reden, "redner")),
      talks = tibble(rede_id = field(parts, "rede_id"),
                     redner = field(parts, "redner"),
                     content = field(parts, "content"))
    )
  }
  # Build a tibble of speakers from a list of <redner> nodes.
  #
  # rednerliste_xml: list / node set of <redner> nodes, each handed to
  #                  parse_redner().
  # Returns tibble(id, vorname, nachname, fraktion, titel, rolle_kurz,
  # rolle_lang) with one row per node; empty input yields a zero-row tibble.
  parse_rednerliste <- function(rednerliste_xml) {
    # lapply()/vapply() instead of sapply(): sapply() returns a list rather
    # than a matrix for empty or ragged input, and row indexing then fails.
    parsed <- lapply(rednerliste_xml, parse_redner)

    # one column: the given field of every parsed speaker, exactly one string each
    column <- function(key) {
      vapply(parsed, function(r) r[[key]], character(1), USE.NAMES = FALSE)
    }

    tibble(id = column("id"),
           vorname = column("vorname"),
           nachname = column("nachname"),
           fraktion = column("fraktion"),
           titel = column("titel"),
           rolle_kurz = column("rolle_kurz"),
           rolle_lang = column("rolle_lang"))
  }
  99. # -------------------------------
  100. # EXAMPLE USE
  101. # make sure the data has been downloaded via fetch.R first; read_one() reads the file from DOWNLOAD_DIR
  102. res <- read_one("19038-data.xml") # list with the elements redner, reden and talks
  103. res$redner # speakers, as built by parse_rednerliste()
  104. res$reden # reden tibble, as built by parse_redenliste()
  105. res$talks # talks tibble, as built by parse_redenliste()
  106. # -------------------------------