An R package to analyze the parliamentary records of the 19th legislative period of the Bundestag, the German parliament.
Nelze vybrat více než 25 témat Téma musí začínat písmenem nebo číslem, může obsahovat pomlčky („-“) a může být dlouhé až 35 znaků.

142 řádky
4.4KB

  1. # for usage see the example at the end
  2. read_all <- function(path="records/") {
  3. cat("Reading all records from", path, "\n")
  4. available_protocols <- list.files(path)
  5. res <- pblapply(available_protocols, read_one, path=path)
  6. lapply(res, `[[`, "redner") %>%
  7. bind_rows() %>%
  8. distinct() ->
  9. redner
  10. lapply(res, `[[`, "reden") %>%
  11. bind_rows() %>%
  12. distinct() ->
  13. reden
  14. lapply(res, `[[`, "talks") %>%
  15. bind_rows() %>%
  16. distinct() ->
  17. talks
  18. list(redner = redner, reden = reden, talks = talks)
  19. }
  20. # this reads all currently parseable data from one xml
  21. read_one <- function(name, path) {
  22. x <- tryCatch(read_xml(paste0(path, name)),
  23. error = function(c) NULL)
  24. if (is.null(x)) return(NULL)
  25. cs <- xml_children(x)
  26. verlauf <- xml_find_first(x, "sitzungsverlauf")
  27. rednerl <- xml_find_first(x, "rednerliste")
  28. xml_children(rednerl) %>%
  29. parse_rednerliste() ->
  30. redner
  31. xml_children(verlauf) %>%
  32. xml_find_all("rede") %>%
  33. parse_redenliste() ->
  34. res
  35. list(redner = redner, reden = res$reden, talks = res$talks)
  36. }
  37. xml_get <- function(node, name) {
  38. res <- xml_text %$% xml_find_all(node, name)
  39. if (length(res) == 0) NA_character_
  40. else res
  41. }
  42. # parse one redner
  43. parse_redner <- function(redner_xml) {
  44. redner_id <- xml_attr(redner_xml, "id")
  45. nm <- xml_child(redner_xml)
  46. vorname <- xml_get(nm, "vorname")
  47. nachname <- xml_get(nm, "nachname")
  48. fraktion <- xml_get(nm, "fraktion")
  49. titel <- xml_get(nm, "titel")
  50. rolle <- xml_find_all(nm, "rolle")
  51. if (length(rolle) > 0) {
  52. rolle_lang <- xml_get(rolle, "rolle_lang")
  53. rolle_kurz <- xml_get(rolle, "rolle_kurz")
  54. } else rolle_kurz <- rolle_lang <- NA_character_
  55. c(id = redner_id, vorname = vorname, nachname = nachname, fraktion = fraktion, titel = titel,
  56. rolle_kurz = rolle_kurz, rolle_lang = rolle_lang)
  57. }
  58. # parse one rede
  59. # returns: - a rede (with rede id and redner id)
  60. # - all talks appearing in the rede (with corresponding content)
  61. parse_rede <- function(rede_xml) {
  62. rede_id <- xml_attr(rede_xml, "id")
  63. cs <- xml_children(rede_xml)
  64. cur_redner <- NA_character_
  65. principal_redner <- NA_character_
  66. cur_content <- ""
  67. reden <- list()
  68. for (node in cs) {
  69. if (xml_name(node) == "p") {
  70. klasse <- xml_attr(node, "klasse")
  71. if (!is.na(klasse) && klasse == "redner") {
  72. if (!is.na(cur_redner)) {
  73. rede <- c(rede_id = rede_id,
  74. redner = cur_redner,
  75. content = cur_content)
  76. reden <- c(reden, list(rede))
  77. cur_content <- ""
  78. } else {
  79. principal_redner <- xml_child(node) %>% xml_attr("id")
  80. }
  81. cur_redner <- xml_child(node) %>% xml_attr("id")
  82. } else {
  83. cur_content <- paste0(cur_content, xml_text(node), sep="\n")
  84. }
  85. }
  86. }
  87. rede <- c(rede_id = rede_id,
  88. redner = cur_redner,
  89. content = cur_content)
  90. reden <- c(reden, list(rede))
  91. list(rede = c(id = rede_id, redner = principal_redner),
  92. parts = reden)
  93. }
  94. # creates a tibble of reden and a tibble of talks from a list of xml nodes representing reden
  95. parse_redenliste <- function(redenliste_xml) {
  96. d <- sapply(redenliste_xml, parse_rede)
  97. reden <- simplify2array(d["rede", ])
  98. parts <- simplify2array %$% unlist(d["parts", ], recursive=FALSE)
  99. list(reden = tibble(id = reden["id",], redner = reden["redner",]),
  100. talks = tibble(rede_id = parts["rede_id", ],
  101. redner = parts["redner", ],
  102. content = parts["content", ]))
  103. }
  104. # create a tibble of redner from a list of xml nodes representing redner
  105. parse_rednerliste <- function(rednerliste_xml) {
  106. d <- sapply(rednerliste_xml, parse_redner)
  107. tibble(id = d["id",],
  108. vorname = d["vorname",],
  109. nachname = d["nachname",],
  110. fraktion = d["fraktion",],
  111. titel = d["titel",],
  112. rolle_kurz = d["rolle_kurz",],
  113. rolle_lang = d["rolle_lang",])
  114. }
  115. # -------------------------------
  116. # EXAMPLE USE
  117. # make sure data ist downloaded via fetch.R
  118. # res <- read_one("19126-data.xml")
  119. #
  120. # res$redner
  121. # res$reden
  122. # res$talks
  123. # -------------------------------