An R package to analyze the parliamentary records of the 19th legislative period of the Bundestag, the German parliament.
Du kan inte välja fler än 25 ämnen Ämnen måste starta med en bokstav eller siffra, kan innehålla bindestreck ('-') och vara max 35 tecken långa.

150 lines
4.6KB

  1. # for usage see the example at the end
  #' Parse all XML records in a directory
  #'
  #' Runs `read_one()` on every file listed in `path` and stacks the per-file
  #' results into one de-duplicated table per kind.
  #'
  #' @param path character, directory holding the records; it is passed on to
  #'   `read_one()` together with each file name.
  #'
  #' @return A list with the elements `redner`, `reden` and `talks`, each the
  #'   row-bound and de-duplicated combination of the matching element of all
  #'   per-file results.
  #'
  #' @export
  read_all <- function(path = "records/") {
    cat("Reading all records from", path, "\n")
    per_file <- pblapply(list.files(path), read_one, path = path)

    # Pull one named table out of every per-file result and stack the pieces.
    stack_table <- function(table_name) {
      pieces <- lapply(per_file, `[[`, table_name)
      distinct(bind_rows(pieces))
    }

    list(
      redner = stack_table("redner"),
      reden = stack_table("reden"),
      talks = stack_table("talks")
    )
  }
  # Read every currently parseable table from a single XML record.
  #
  # Arguments:
  #   name  file name of the record
  #   path  directory holding the record; a separator is inserted when the
  #         value does not already end in one
  #
  # Returns a list with the elements `redner`, `reden` and `talks`, or NULL
  # (after a warning) when the file cannot be read as XML.
  read_one <- function(name, path = "records/") {
    # Pasting `name` straight onto a directory without a trailing separator
    # would yield e.g. "recordsfoo.xml", so plain concatenation is only used
    # when a separator is already there (or `path` is empty).
    ends_in_sep <- !nzchar(path) || grepl("[/\\\\]$", path)
    file <- if (ends_in_sep) paste0(path, name) else file.path(path, name)

    doc <- tryCatch(
      read_xml(file),
      error = function(e) {
        # Best effort: skip the record, but report which one failed and why.
        warning("Could not read ", file, ": ", conditionMessage(e),
                call. = FALSE)
        NULL
      }
    )
    if (is.null(doc)) {
      return(NULL)
    }

    rednerliste <- xml_find_first(doc, "rednerliste")
    redner <- parse_rednerliste(xml_children(rednerliste))

    sitzungsverlauf <- xml_find_first(doc, "sitzungsverlauf")
    rede_nodes <- xml_find_all(xml_children(sitzungsverlauf), "rede")
    parsed <- parse_redenliste(rede_nodes)

    list(redner = redner, reden = parsed$reden, talks = parsed$talks)
  }
  # Look up `name` below `node` with xml_find_all() and hand the matches to
  # xml_text(); yields NA_character_ when the result is empty.
  # NOTE(review): `%$%` is not defined in this part of the file; it appears to
  # apply the function on its left to the value on its right - confirm.
  xml_get <- function(node, name) {
    texts <- xml_text %$% xml_find_all(node, name)
    if (length(texts) > 0) {
      return(texts)
    }
    NA_character_
  }
  # Turn one redner node into a named character vector.
  #
  # The id is read from the node's `id` attribute; every other field is looked
  # up with xml_get() below the node's first child. The two role fields are
  # only looked up when a `rolle` element exists and are NA otherwise.
  parse_redner <- function(redner_xml) {
    name_node <- xml_child(redner_xml)
    role_nodes <- xml_find_all(name_node, "rolle")
    has_role <- length(role_nodes) > 0

    c(
      id = xml_attr(redner_xml, "id"),
      vorname = xml_get(name_node, "vorname"),
      nachname = xml_get(name_node, "nachname"),
      fraktion = xml_get(name_node, "fraktion"),
      titel = xml_get(name_node, "titel"),
      rolle_kurz = if (has_role) xml_get(role_nodes, "rolle_kurz") else NA_character_,
      rolle_lang = if (has_role) xml_get(role_nodes, "rolle_lang") else NA_character_
    )
  }
  # Split one rede node into its header and its parts.
  #
  # Only `p` children are inspected. A `p` whose `klasse` attribute is
  # "redner" switches the current speaker to the `id` attribute of that
  # paragraph's first child; the text of every other `p` is appended, followed
  # by a newline, to the content of the part being collected.
  #
  # Returns a list with
  #   rede   c(id, redner): the rede id and the speaker set while no speaker
  #          was active yet
  #   parts  a list of c(rede_id, redner, content) vectors, one per speaker
  #          switch plus the trailing one
  parse_rede <- function(rede_xml) {
    rede_id <- xml_attr(rede_xml, "id")
    speaker_now <- NA_character_
    speaker_main <- NA_character_
    buffer <- ""
    parts <- list()

    for (child in xml_children(rede_xml)) {
      if (xml_name(child) != "p") next

      if (!isTRUE(xml_attr(child, "klasse") == "redner")) {
        # Ordinary paragraph: keep collecting text for the current part.
        buffer <- paste0(buffer, xml_text(child), "\n")
        next
      }

      speaker_next <- xml_attr(xml_child(child), "id")
      if (is.na(speaker_now)) {
        speaker_main <- speaker_next
      } else {
        # A speaker was already active: close that part before switching.
        parts[[length(parts) + 1L]] <- c(rede_id = rede_id,
                                         redner = speaker_now,
                                         content = buffer)
        buffer <- ""
      }
      speaker_now <- speaker_next
    }

    # Whatever is still buffered belongs to the last active speaker.
    parts[[length(parts) + 1L]] <- c(rede_id = rede_id,
                                     redner = speaker_now,
                                     content = buffer)

    list(rede = c(id = rede_id, redner = speaker_main),
         parts = parts)
  }
  # Build a tibble of reden and a tibble of talks from a list of xml nodes
  # representing reden.
  #
  # Every node is parsed with parse_rede(). The `rede` headers become the rows
  # of `reden` (columns id, redner) and all `parts` of all reden become the
  # rows of `talks` (columns rede_id, redner, content).
  #
  # lapply()/vapply() are used instead of sapply() plus matrix indexing so an
  # empty node list yields zero-row tibbles instead of failing with
  # "incorrect number of dimensions".
  parse_redenliste <- function(redenliste_xml) {
    parsed <- lapply(redenliste_xml, parse_rede)
    headers <- lapply(parsed, `[[`, "rede")
    # Flatten one level: a list of per-rede part lists -> one list of parts.
    parts <- unlist(lapply(parsed, `[[`, "parts"), recursive = FALSE)

    # One character column from a list of named character vectors.
    column <- function(items, field) {
      vapply(items, function(item) item[[field]], character(1),
             USE.NAMES = FALSE)
    }

    list(
      reden = tibble(id = column(headers, "id"),
                     redner = column(headers, "redner")),
      talks = tibble(rede_id = column(parts, "rede_id"),
                     redner = column(parts, "redner"),
                     content = column(parts, "content"))
    )
  }
  # Build a tibble of redner from a list of xml nodes representing redner.
  #
  # Every node is parsed with parse_redner(); each field of the resulting
  # named vectors becomes one character column.
  #
  # lapply()/vapply() are used instead of sapply() plus matrix indexing so an
  # empty node list yields a zero-row tibble instead of failing with
  # "incorrect number of dimensions".
  parse_rednerliste <- function(rednerliste_xml) {
    fields <- c("id", "vorname", "nachname", "fraktion", "titel",
                "rolle_kurz", "rolle_lang")
    rows <- lapply(rednerliste_xml, parse_redner)

    columns <- lapply(fields, function(field) {
      vapply(rows, function(row) row[[field]], character(1),
             USE.NAMES = FALSE)
    })
    names(columns) <- fields

    do.call(tibble, columns)
  }
  123. # -------------------------------
  124. # EXAMPLE USE
  125. # make sure the data is downloaded via fetch.R
  126. # res <- read_one("19126-data.xml", path = "records/")
  127. #
  128. # res$redner
  129. # res$reden
  130. # res$talks
  131. # -------------------------------