An R package to analyze the parliamentary records of the 19th legislative period of the Bundestag, the German parliament.
Você não pode selecionar mais de 25 tópicos Os tópicos devem começar com uma letra ou um número, podem incluir traços ('-') e podem ter até 35 caracteres.

121 linhas
3.8KB

  1. source("config.R")
  2. source("../utils/helpers.R")
  3. library("xml2")
  4. library(tibble)
  5. library(magrittr)
  6. # for usage see the example at the end
  # Read one protocol XML file and parse everything that is currently
  # parseable from it.
  #
  # Args:
  #   name: file name of the XML document. It is appended directly to
  #         DOWNLOAD_DIR with paste0(), so DOWNLOAD_DIR must already end with
  #         a path separator (DOWNLOAD_DIR is not defined in this file;
  #         presumably it comes from config.R - confirm).
  #
  # Returns:
  #   A list with three elements:
  #     redner - result of parse_rednerliste() on the children of the
  #              "rednerliste" element
  #     reden  - the `reden` element returned by parse_redenliste()
  #     talks  - the `talks` element returned by parse_redenliste()
  read_one <- function(name) {
    doc <- read_xml(paste0(DOWNLOAD_DIR, name))

    # Speakers: every child of the first "rednerliste" element.
    speaker_nodes <- xml_children(xml_find_first(doc, "rednerliste"))

    # Speeches: "rede" elements located directly below the children of the
    # first "sitzungsverlauf" element.
    speech_nodes <- xml_find_all(
      xml_children(xml_find_first(doc, "sitzungsverlauf")),
      "rede"
    )

    redner <- parse_rednerliste(speaker_nodes)
    parsed <- parse_redenliste(speech_nodes)

    list(redner = redner, reden = parsed$reden, talks = parsed$talks)
  }
  # Return the text of all nodes matching `name` below `node`.
  #
  # Args:
  #   node: an xml2 node or nodeset to search in.
  #   name: XPath expression (here: a plain element name) to look up.
  #
  # Returns:
  #   A character vector with one entry per match, or a single NA_character_
  #   when nothing matches.
  #
  # xml_text() is called directly instead of through a `%$%` operator: that
  # operator is not defined in this file, and magrittr (attached by this
  # script) exports a `%$%` with different semantics (it evaluates the right
  # side using the left side as data), which would not apply a function.
  xml_get <- function(node, name) {
    values <- xml_text(xml_find_all(node, name))
    if (length(values) == 0) {
      return(NA_character_)
    }
    values
  }
  # Parse one redner (speaker) node.
  #
  # Args:
  #   redner_xml: xml2 node of a single speaker; its "id" attribute is the
  #               speaker id and its first child holds the name fields.
  #
  # Returns:
  #   A named character vector with exactly seven entries, in this order:
  #   id, vorname, nachname, fraktion, titel, rolle_kurz, rolle_lang.
  #   Fields that are absent are NA_character_.
  parse_redner <- function(redner_xml) {
    # xml_get() returns every match. Keep only the first one so the record
    # always has a fixed length; a longer vector would get suffixed names
    # (e.g. "titel1", "titel2") and break the by-name row lookup done in
    # parse_rednerliste().
    first_value <- function(node, field) xml_get(node, field)[1]

    name_node <- xml_child(redner_xml)
    role_nodes <- xml_find_all(name_node, "rolle")
    has_role <- length(role_nodes) > 0

    c(
      id = xml_attr(redner_xml, "id"),
      vorname = first_value(name_node, "vorname"),
      nachname = first_value(name_node, "nachname"),
      fraktion = first_value(name_node, "fraktion"),
      titel = first_value(name_node, "titel"),
      # The role fields are only looked up when a "rolle" element exists.
      rolle_kurz = if (has_role) {
        first_value(role_nodes, "rolle_kurz")
      } else {
        NA_character_
      },
      rolle_lang = if (has_role) {
        first_value(role_nodes, "rolle_lang")
      } else {
        NA_character_
      }
    )
  }
  # Parse one rede (speech) node.
  #
  # Only "p" children are considered. A "p" whose "klasse" attribute is
  # "redner" marks a change of speaker (the speaker id is the "id" attribute
  # of that paragraph's first child); the text of every other "p" is appended,
  # followed by a newline, to the content of the current part.
  #
  # Args:
  #   rede_xml: xml2 node of a single speech.
  #
  # Returns:
  #   A list with two elements:
  #     rede  - named character vector c(id, redner): the speech id and the
  #             id of the first speaker seen (NA if there is none)
  #     parts - list of named character vectors c(rede_id, redner, content),
  #             one per consecutive stretch spoken by the same speaker; the
  #             final stretch is always emitted, so the list is never empty
  parse_rede <- function(rede_xml) {
    rede_id <- xml_attr(rede_xml, "id")
    current_speaker <- NA_character_
    main_speaker <- NA_character_
    buffer <- ""
    parts <- list()

    make_part <- function(speaker, text) {
      c(rede_id = rede_id, redner = speaker, content = text)
    }

    for (node in xml_children(rede_xml)) {
      if (xml_name(node) != "p") {
        next
      }
      # xml_attr() yields NA for a paragraph without a "klasse" attribute;
      # isTRUE() turns that into FALSE instead of failing inside `if`.
      if (isTRUE(xml_attr(node, "klasse") == "redner")) {
        speaker_id <- xml_attr(xml_child(node), "id")
        if (is.na(current_speaker)) {
          # First speaker of the speech: nothing to flush yet, and any text
          # collected so far stays in the buffer for this speaker's part.
          main_speaker <- speaker_id
        } else {
          parts[[length(parts) + 1L]] <- make_part(current_speaker, buffer)
          buffer <- ""
        }
        current_speaker <- speaker_id
      } else {
        # paste0() has no `sep` argument; the newline is simply appended.
        buffer <- paste0(buffer, xml_text(node), "\n")
      }
    }

    # Flush whatever is left as the last part.
    parts[[length(parts) + 1L]] <- make_part(current_speaker, buffer)

    list(
      rede = c(id = rede_id, redner = main_speaker),
      parts = parts
    )
  }
  # Create a tibble of reden (speeches) and a tibble of talks (speech parts)
  # from a list of xml nodes representing reden.
  #
  # Args:
  #   redenliste_xml: list / nodeset of speech nodes; each one is passed to
  #                   parse_rede().
  #
  # Returns:
  #   A list with two tibbles:
  #     reden - columns id, redner (one row per speech)
  #     talks - columns rede_id, redner, content (one row per speech part)
  #   Both tibbles have zero rows when the input is empty.
  parse_redenliste <- function(redenliste_xml) {
    parsed <- lapply(redenliste_xml, parse_rede)

    # Pull one named field out of every record as a plain character vector.
    # vapply() keeps the result type stable, including for empty input, where
    # sapply() would return an empty list that cannot be indexed by row name.
    pick <- function(records, field) {
      vapply(
        records,
        function(rec) rec[[field]],
        character(1),
        USE.NAMES = FALSE
      )
    }

    heads <- lapply(parsed, function(p) p[["rede"]])

    # Flatten exactly one level: list of per-speech part lists -> list of parts.
    parts <- unlist(lapply(parsed, function(p) p[["parts"]]), recursive = FALSE)
    if (is.null(parts)) {
      parts <- list()
    }

    list(
      reden = tibble(
        id = pick(heads, "id"),
        redner = pick(heads, "redner")
      ),
      talks = tibble(
        rede_id = pick(parts, "rede_id"),
        redner = pick(parts, "redner"),
        content = pick(parts, "content")
      )
    )
  }
  # Create a tibble of redner (speakers) from a list of xml nodes
  # representing redner.
  #
  # Args:
  #   rednerliste_xml: list / nodeset of speaker nodes; each one is passed to
  #                    parse_redner().
  #
  # Returns:
  #   A tibble with the character columns id, vorname, nachname, fraktion,
  #   titel, rolle_kurz and rolle_lang, one row per speaker (zero rows for
  #   empty input).
  parse_rednerliste <- function(rednerliste_xml) {
    records <- lapply(rednerliste_xml, parse_redner)

    # Extract one field from every speaker record. vapply() guarantees a
    # character vector even for an empty list, where sapply() would return
    # list() and the row-name lookup would fail.
    column <- function(field) {
      vapply(
        records,
        function(rec) rec[[field]],
        character(1),
        USE.NAMES = FALSE
      )
    }

    tibble(
      id = column("id"),
      vorname = column("vorname"),
      nachname = column("nachname"),
      fraktion = column("fraktion"),
      titel = column("titel"),
      rolle_kurz = column("rolle_kurz"),
      rolle_lang = column("rolle_lang")
    )
  }
  # -------------------------------
  # EXAMPLE USE
  # Make sure the data has been downloaded via fetch.R before running this.
  # Parses one protocol file and shows the three resulting tables.
  res <- read_one("19038-data.xml")
  res[["redner"]]
  res[["reden"]]
  res[["talks"]]
  # -------------------------------