|
- # for usage see the example at the end
-
- #' Parse xml records
- #'
- #' Creates a list of tibbles containing relevant information from all records
- #' stored in the input directory.
- #'
- #' @param path character
- #'
- #' @export
- read_all <- function(path="records/") {
- cat("Reading all records from", path, "\n")
- available_protocols <- list.files(path)
- res <- pblapply(available_protocols, read_one, path=path)
-
- lapply(res, `[[`, "redner") %>%
- bind_rows() %>%
- distinct() ->
- redner
-
- lapply(res, `[[`, "reden") %>%
- bind_rows() %>%
- distinct() ->
- reden
-
- lapply(res, `[[`, "talks") %>%
- bind_rows() %>%
- distinct() ->
- talks
-
- if (length(available_protocols) == 0)
- warning("The given directory is empty or does not exist.")
- list(redner = redner, reden = reden, talks = talks)
- }
-
- # this reads all currently parseable data from one xml
- read_one <- function(name, path) {
- x <- tryCatch(read_xml(paste0(path, name)),
- error = function(c) NULL)
- if (is.null(x)) return(NULL)
- cs <- xml_children(x)
-
- verlauf <- xml_find_first(x, "sitzungsverlauf")
- rednerl <- xml_find_first(x, "rednerliste")
-
- xml_children(rednerl) %>%
- parse_rednerliste() ->
- redner
-
- xml_children(verlauf) %>%
- xml_find_all("rede") %>%
- parse_redenliste() ->
- res
-
- list(redner = redner, reden = res$reden, talks = res$talks)
- }
-
- xml_get <- function(node, name) {
- res <- xml_text %$% xml_find_all(node, name)
- if (length(res) == 0) NA_character_
- else res
- }
-
- # parse one redner
- parse_redner <- function(redner_xml) {
- redner_id <- xml_attr(redner_xml, "id")
- nm <- xml_child(redner_xml)
- vorname <- xml_get(nm, "vorname")
- nachname <- xml_get(nm, "nachname")
- fraktion <- xml_get(nm, "fraktion")
- titel <- xml_get(nm, "titel")
- rolle <- xml_find_all(nm, "rolle")
- if (length(rolle) > 0) {
- rolle_lang <- xml_get(rolle, "rolle_lang")
- rolle_kurz <- xml_get(rolle, "rolle_kurz")
- } else rolle_kurz <- rolle_lang <- NA_character_
- c(id = redner_id, vorname = vorname, nachname = nachname, fraktion = fraktion, titel = titel,
- rolle_kurz = rolle_kurz, rolle_lang = rolle_lang)
- }
-
- # parse one rede
- # returns: - a rede (with rede id and redner id)
- # - all talks appearing in the rede (with corresponding content)
- parse_rede <- function(rede_xml) {
- rede_id <- xml_attr(rede_xml, "id")
- cs <- xml_children(rede_xml)
- cur_redner <- NA_character_
- principal_redner <- NA_character_
- cur_content <- ""
- reden <- list()
- for (node in cs) {
- if (xml_name(node) == "p") {
- klasse <- xml_attr(node, "klasse")
- if (!is.na(klasse) && klasse == "redner") {
- if (!is.na(cur_redner)) {
- rede <- c(rede_id = rede_id,
- redner = cur_redner,
- content = cur_content)
- reden <- c(reden, list(rede))
- cur_content <- ""
- } else {
- principal_redner <- xml_child(node) %>% xml_attr("id")
- }
- cur_redner <- xml_child(node) %>% xml_attr("id")
- } else {
- cur_content <- paste0(cur_content, xml_text(node), sep="\n")
- }
- }
- }
- rede <- c(rede_id = rede_id,
- redner = cur_redner,
- content = cur_content)
- reden <- c(reden, list(rede))
- list(rede = c(id = rede_id, redner = principal_redner),
- parts = reden)
- }
-
- # creates a tibble of reden and a tibble of talks from a list of xml nodes representing reden
- parse_redenliste <- function(redenliste_xml) {
- d <- sapply(redenliste_xml, parse_rede)
- reden <- simplify2array(d["rede", ])
- parts <- simplify2array %$% unlist(d["parts", ], recursive=FALSE)
- list(reden = tibble(id = reden["id",], redner = reden["redner",]),
- talks = tibble(rede_id = parts["rede_id", ],
- redner = parts["redner", ],
- content = parts["content", ]))
- }
-
- # create a tibble of redner from a list of xml nodes representing redner
- parse_rednerliste <- function(rednerliste_xml) {
- d <- sapply(rednerliste_xml, parse_redner)
- tibble(id = d["id",],
- vorname = d["vorname",],
- nachname = d["nachname",],
- fraktion = d["fraktion",],
- titel = d["titel",],
- rolle_kurz = d["rolle_kurz",],
- rolle_lang = d["rolle_lang",])
- }
-
- # -------------------------------
- # EXAMPLE USE
-
- # make sure data ist downloaded via fetch.R
- # res <- read_one("19126-data.xml")
- #
- # res$redner
- # res$reden
- # res$talks
-
- # -------------------------------
|