An R package to analyze the parliamentary records of the 19th legislative period of the Bundestag, the German parliament.
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

142 lines
4.4KB

  1. # for usage see the example at the end
  2. read_all <- function(path="records/") {
  3. cat("Reading all records from", path, "\n")
  4. available_protocols <- list.files(path)
  5. res <- pblapply(available_protocols, read_one, path=path)
  6. lapply(res, `[[`, "redner") %>%
  7. bind_rows() %>%
  8. distinct() ->
  9. redner
  10. lapply(res, `[[`, "reden") %>%
  11. bind_rows() %>%
  12. distinct() ->
  13. reden
  14. lapply(res, `[[`, "talks") %>%
  15. bind_rows() %>%
  16. distinct() ->
  17. talks
  18. list(redner = redner, reden = reden, talks = talks)
  19. }
  20. # this reads all currently parseable data from one xml
  21. read_one <- function(name, path) {
  22. x <- tryCatch(read_xml(paste0(path, name)),
  23. error = function(c) NULL)
  24. if (is.null(x)) return(NULL)
  25. cs <- xml_children(x)
  26. verlauf <- xml_find_first(x, "sitzungsverlauf")
  27. rednerl <- xml_find_first(x, "rednerliste")
  28. xml_children(rednerl) %>%
  29. parse_rednerliste() ->
  30. redner
  31. xml_children(verlauf) %>%
  32. xml_find_all("rede") %>%
  33. parse_redenliste() ->
  34. res
  35. list(redner = redner, reden = res$reden, talks = res$talks)
  36. }
  37. xml_get <- function(node, name) {
  38. res <- xml_text %$% xml_find_all(node, name)
  39. if (length(res) == 0) NA_character_
  40. else res
  41. }
  42. # parse one redner
  43. parse_redner <- function(redner_xml) {
  44. redner_id <- xml_attr(redner_xml, "id")
  45. nm <- xml_child(redner_xml)
  46. vorname <- xml_get(nm, "vorname")
  47. nachname <- xml_get(nm, "nachname")
  48. fraktion <- xml_get(nm, "fraktion")
  49. titel <- xml_get(nm, "titel")
  50. rolle <- xml_find_all(nm, "rolle")
  51. if (length(rolle) > 0) {
  52. rolle_lang <- xml_get(rolle, "rolle_lang")
  53. rolle_kurz <- xml_get(rolle, "rolle_kurz")
  54. } else rolle_kurz <- rolle_lang <- NA_character_
  55. c(id = redner_id, vorname = vorname, nachname = nachname, fraktion = fraktion, titel = titel,
  56. rolle_kurz = rolle_kurz, rolle_lang = rolle_lang)
  57. }
  58. # parse one rede
  59. # returns: - a rede (with rede id and redner id)
  60. # - all talks appearing in the rede (with corresponding content)
  61. parse_rede <- function(rede_xml) {
  62. rede_id <- xml_attr(rede_xml, "id")
  63. cs <- xml_children(rede_xml)
  64. cur_redner <- NA_character_
  65. principal_redner <- NA_character_
  66. cur_content <- ""
  67. reden <- list()
  68. for (node in cs) {
  69. if (xml_name(node) == "p") {
  70. klasse <- xml_attr(node, "klasse")
  71. if (!is.na(klasse) && klasse == "redner") {
  72. if (!is.na(cur_redner)) {
  73. rede <- c(rede_id = rede_id,
  74. redner = cur_redner,
  75. content = cur_content)
  76. reden <- c(reden, list(rede))
  77. cur_content <- ""
  78. } else {
  79. principal_redner <- xml_child(node) %>% xml_attr("id")
  80. }
  81. cur_redner <- xml_child(node) %>% xml_attr("id")
  82. } else {
  83. cur_content <- paste0(cur_content, xml_text(node), sep="\n")
  84. }
  85. }
  86. }
  87. rede <- c(rede_id = rede_id,
  88. redner = cur_redner,
  89. content = cur_content)
  90. reden <- c(reden, list(rede))
  91. list(rede = c(id = rede_id, redner = principal_redner),
  92. parts = reden)
  93. }
  94. # creates a tibble of reden and a tibble of talks from a list of xml nodes representing reden
  95. parse_redenliste <- function(redenliste_xml) {
  96. d <- sapply(redenliste_xml, parse_rede)
  97. reden <- simplify2array(d["rede", ])
  98. parts <- simplify2array %$% unlist(d["parts", ], recursive=FALSE)
  99. list(reden = tibble(id = reden["id",], redner = reden["redner",]),
  100. talks = tibble(rede_id = parts["rede_id", ],
  101. redner = parts["redner", ],
  102. content = parts["content", ]))
  103. }
  104. # create a tibble of redner from a list of xml nodes representing redner
  105. parse_rednerliste <- function(rednerliste_xml) {
  106. d <- sapply(rednerliste_xml, parse_redner)
  107. tibble(id = d["id",],
  108. vorname = d["vorname",],
  109. nachname = d["nachname",],
  110. fraktion = d["fraktion",],
  111. titel = d["titel",],
  112. rolle_kurz = d["rolle_kurz",],
  113. rolle_lang = d["rolle_lang",])
  114. }
  115. # -------------------------------
  116. # EXAMPLE USE
  117. # make sure data ist downloaded via fetch.R
  118. # res <- read_one("19126-data.xml")
  119. #
  120. # res$redner
  121. # res$reden
  122. # res$talks
  123. # -------------------------------