An R package to analyze the parliamentary records of the 19th legislative period of the Bundestag, the German parliament.
Nelze vybrat více než 25 témat Téma musí začínat písmenem nebo číslem, může obsahovat pomlčky („-“) a může být dlouhé až 35 znaků.

152 řádky
4.7KB

  1. # for usage see the example at the end
  2. #' Parse xml records
  3. #'
  4. #' Creates a list of tibbles containing relevant information from all records
  5. #' stored in the input directory.
  6. #'
  7. #' @param path character
  8. #'
  9. #' @export
  10. read_all <- function(path="records/") {
  11. cat("Reading all records from", path, "\n")
  12. available_protocols <- list.files(path)
  13. res <- pblapply(available_protocols, read_one, path=path)
  14. lapply(res, `[[`, "redner") %>%
  15. bind_rows() %>%
  16. distinct() ->
  17. redner
  18. lapply(res, `[[`, "reden") %>%
  19. bind_rows() %>%
  20. distinct() ->
  21. reden
  22. lapply(res, `[[`, "talks") %>%
  23. bind_rows() %>%
  24. distinct() ->
  25. talks
  26. if (length(available_protocols) == 0)
  27. warning("The given directory is empty or does not exist.")
  28. list(redner = redner, reden = reden, talks = talks)
  29. }
  30. # this reads all currently parseable data from one xml
  31. read_one <- function(name, path) {
  32. x <- tryCatch(read_xml(paste0(path, name)),
  33. error = function(c) NULL)
  34. if (is.null(x)) return(NULL)
  35. cs <- xml_children(x)
  36. verlauf <- xml_find_first(x, "sitzungsverlauf")
  37. rednerl <- xml_find_first(x, "rednerliste")
  38. xml_children(rednerl) %>%
  39. parse_rednerliste() ->
  40. redner
  41. xml_children(verlauf) %>%
  42. xml_find_all("rede") %>%
  43. parse_redenliste() ->
  44. res
  45. list(redner = redner, reden = res$reden, talks = res$talks)
  46. }
  47. xml_get <- function(node, name) {
  48. res <- xml_text %$% xml_find_all(node, name)
  49. if (length(res) == 0) NA_character_
  50. else res
  51. }
  52. # parse one redner
  53. parse_redner <- function(redner_xml) {
  54. redner_id <- xml_attr(redner_xml, "id")
  55. nm <- xml_child(redner_xml)
  56. vorname <- xml_get(nm, "vorname")
  57. nachname <- xml_get(nm, "nachname")
  58. fraktion <- xml_get(nm, "fraktion")
  59. titel <- xml_get(nm, "titel")
  60. rolle <- xml_find_all(nm, "rolle")
  61. if (length(rolle) > 0) {
  62. rolle_lang <- xml_get(rolle, "rolle_lang")
  63. rolle_kurz <- xml_get(rolle, "rolle_kurz")
  64. } else rolle_kurz <- rolle_lang <- NA_character_
  65. c(id = redner_id, vorname = vorname, nachname = nachname, fraktion = fraktion, titel = titel,
  66. rolle_kurz = rolle_kurz, rolle_lang = rolle_lang)
  67. }
  68. # parse one rede
  69. # returns: - a rede (with rede id and redner id)
  70. # - all talks appearing in the rede (with corresponding content)
  71. parse_rede <- function(rede_xml) {
  72. rede_id <- xml_attr(rede_xml, "id")
  73. cs <- xml_children(rede_xml)
  74. cur_redner <- NA_character_
  75. principal_redner <- NA_character_
  76. cur_content <- ""
  77. reden <- list()
  78. for (node in cs) {
  79. if (xml_name(node) == "p") {
  80. klasse <- xml_attr(node, "klasse")
  81. if (!is.na(klasse) && klasse == "redner") {
  82. if (!is.na(cur_redner)) {
  83. rede <- c(rede_id = rede_id,
  84. redner = cur_redner,
  85. content = cur_content)
  86. reden <- c(reden, list(rede))
  87. cur_content <- ""
  88. } else {
  89. principal_redner <- xml_child(node) %>% xml_attr("id")
  90. }
  91. cur_redner <- xml_child(node) %>% xml_attr("id")
  92. } else {
  93. cur_content <- paste0(cur_content, xml_text(node), sep="\n")
  94. }
  95. }
  96. }
  97. rede <- c(rede_id = rede_id,
  98. redner = cur_redner,
  99. content = cur_content)
  100. reden <- c(reden, list(rede))
  101. list(rede = c(id = rede_id, redner = principal_redner),
  102. parts = reden)
  103. }
  104. # creates a tibble of reden and a tibble of talks from a list of xml nodes representing reden
  105. parse_redenliste <- function(redenliste_xml) {
  106. d <- sapply(redenliste_xml, parse_rede)
  107. reden <- simplify2array(d["rede", ])
  108. parts <- simplify2array %$% unlist(d["parts", ], recursive=FALSE)
  109. list(reden = tibble(id = reden["id",], redner = reden["redner",]),
  110. talks = tibble(rede_id = parts["rede_id", ],
  111. redner = parts["redner", ],
  112. content = parts["content", ]))
  113. }
  114. # create a tibble of redner from a list of xml nodes representing redner
  115. parse_rednerliste <- function(rednerliste_xml) {
  116. d <- sapply(rednerliste_xml, parse_redner)
  117. tibble(id = d["id",],
  118. vorname = d["vorname",],
  119. nachname = d["nachname",],
  120. fraktion = d["fraktion",],
  121. titel = d["titel",],
  122. rolle_kurz = d["rolle_kurz",],
  123. rolle_lang = d["rolle_lang",])
  124. }
  125. # -------------------------------
  126. # EXAMPLE USE
  127. # make sure data ist downloaded via fetch.R
  128. # res <- read_one("19126-data.xml")
  129. #
  130. # res$redner
  131. # res$reden
  132. # res$talks
  133. # -------------------------------