An R package to analyze the parliamentary records of the 19th legislative period of the Bundestag, the German parliament.
Вы не можете выбрать более 25 тем Темы должны начинаться с буквы или цифры, могут содержать дефисы(-) и должны содержать не более 35 символов.

152 строки
4.7KB

  1. # for usage see the example at the end
  2. #' Parse xml records
  3. #'
  4. #' Creates a list of tibbles containing relevant information from all records
  5. #' stored in the input directory.
  6. #'
  7. #' @param path character
  8. #'
  9. #' @export
  10. read_all <- function(path="records/") {
  11. cat("Reading all records from", path, "\n")
  12. available_protocols <- list.files(path)
  13. res <- pblapply(available_protocols, read_one, path=path)
  14. lapply(res, `[[`, "redner") %>%
  15. bind_rows() %>%
  16. distinct() ->
  17. redner
  18. lapply(res, `[[`, "reden") %>%
  19. bind_rows() %>%
  20. distinct() ->
  21. reden
  22. lapply(res, `[[`, "talks") %>%
  23. bind_rows() %>%
  24. distinct() ->
  25. talks
  26. if (length(available_protocols) == 0)
  27. warning("The given directory is empty or does not exist.")
  28. list(redner = redner, reden = reden, talks = talks)
  29. }
  30. # this reads all currently parseable data from one xml
  31. read_one <- function(name, path) {
  32. x <- tryCatch(read_xml(paste0(path, name)),
  33. error = function(c) NULL)
  34. if (is.null(x)) return(NULL)
  35. cs <- xml_children(x)
  36. verlauf <- xml_find_first(x, "sitzungsverlauf")
  37. rednerl <- xml_find_first(x, "rednerliste")
  38. xml_children(rednerl) %>%
  39. parse_rednerliste() ->
  40. redner
  41. xml_children(verlauf) %>%
  42. xml_find_all("rede") %>%
  43. parse_redenliste() ->
  44. res
  45. list(redner = redner, reden = res$reden, talks = res$talks)
  46. }
  47. xml_get <- function(node, name) {
  48. res <- xml_text %$% xml_find_all(node, name)
  49. if (length(res) == 0) NA_character_
  50. else res
  51. }
  52. # parse one redner
  53. parse_redner <- function(redner_xml) {
  54. redner_id <- xml_attr(redner_xml, "id")
  55. nm <- xml_child(redner_xml)
  56. vorname <- xml_get(nm, "vorname")
  57. nachname <- xml_get(nm, "nachname")
  58. fraktion <- xml_get(nm, "fraktion")
  59. titel <- xml_get(nm, "titel")
  60. rolle <- xml_find_all(nm, "rolle")
  61. if (length(rolle) > 0) {
  62. rolle_lang <- xml_get(rolle, "rolle_lang")
  63. rolle_kurz <- xml_get(rolle, "rolle_kurz")
  64. } else rolle_kurz <- rolle_lang <- NA_character_
  65. c(id = redner_id, vorname = vorname, nachname = nachname, fraktion = fraktion, titel = titel,
  66. rolle_kurz = rolle_kurz, rolle_lang = rolle_lang)
  67. }
  68. # parse one rede
  69. # returns: - a rede (with rede id and redner id)
  70. # - all talks appearing in the rede (with corresponding content)
  71. parse_rede <- function(rede_xml) {
  72. rede_id <- xml_attr(rede_xml, "id")
  73. cs <- xml_children(rede_xml)
  74. cur_redner <- NA_character_
  75. principal_redner <- NA_character_
  76. cur_content <- ""
  77. reden <- list()
  78. for (node in cs) {
  79. if (xml_name(node) == "p") {
  80. klasse <- xml_attr(node, "klasse")
  81. if (!is.na(klasse) && klasse == "redner") {
  82. if (!is.na(cur_redner)) {
  83. rede <- c(rede_id = rede_id,
  84. redner = cur_redner,
  85. content = cur_content)
  86. reden <- c(reden, list(rede))
  87. cur_content <- ""
  88. } else {
  89. principal_redner <- xml_child(node) %>% xml_attr("id")
  90. }
  91. cur_redner <- xml_child(node) %>% xml_attr("id")
  92. } else {
  93. cur_content <- paste0(cur_content, xml_text(node), sep="\n")
  94. }
  95. }
  96. }
  97. rede <- c(rede_id = rede_id,
  98. redner = cur_redner,
  99. content = cur_content)
  100. reden <- c(reden, list(rede))
  101. list(rede = c(id = rede_id, redner = principal_redner),
  102. parts = reden)
  103. }
  104. # creates a tibble of reden and a tibble of talks from a list of xml nodes representing reden
  105. parse_redenliste <- function(redenliste_xml) {
  106. d <- sapply(redenliste_xml, parse_rede)
  107. reden <- simplify2array(d["rede", ])
  108. parts <- simplify2array %$% unlist(d["parts", ], recursive=FALSE)
  109. list(reden = tibble(id = reden["id",], redner = reden["redner",]),
  110. talks = tibble(rede_id = parts["rede_id", ],
  111. redner = parts["redner", ],
  112. content = parts["content", ]))
  113. }
  114. # create a tibble of redner from a list of xml nodes representing redner
  115. parse_rednerliste <- function(rednerliste_xml) {
  116. d <- sapply(rednerliste_xml, parse_redner)
  117. tibble(id = d["id",],
  118. vorname = d["vorname",],
  119. nachname = d["nachname",],
  120. fraktion = d["fraktion",],
  121. titel = d["titel",],
  122. rolle_kurz = d["rolle_kurz",],
  123. rolle_lang = d["rolle_lang",])
  124. }
  125. # -------------------------------
  126. # EXAMPLE USE
  127. # make sure data ist downloaded via fetch.R
  128. # res <- read_one("19126-data.xml")
  129. #
  130. # res$redner
  131. # res$reden
  132. # res$talks
  133. # -------------------------------