# for usage see the example at the end #' Parse xml records #' #' Creates a list of tibbles containing relevant information from all records #' stored in the input directory. #' #' @param path character #' #' @export read_all <- function(path="records/") { cat("Reading all records from", path, "\n") available_protocols <- list.files(path) res <- pblapply(available_protocols, read_one, path=path) lapply(res, `[[`, "redner") %>% bind_rows() %>% distinct() -> redner lapply(res, `[[`, "reden") %>% bind_rows() %>% distinct() -> reden lapply(res, `[[`, "talks") %>% bind_rows() %>% distinct() -> talks if (length(available_protocols) == 0) warning("The given directory is empty or does not exist.") list(redner = redner, reden = reden, talks = talks) } # this reads all currently parseable data from one xml read_one <- function(name, path) { x <- tryCatch(read_xml(paste0(path, name)), error = function(c) NULL) if (is.null(x)) return(NULL) cs <- xml_children(x) verlauf <- xml_find_first(x, "sitzungsverlauf") rednerl <- xml_find_first(x, "rednerliste") xml_children(rednerl) %>% parse_rednerliste() -> redner xml_children(verlauf) %>% xml_find_all("rede") %>% parse_redenliste() -> res list(redner = redner, reden = res$reden, talks = res$talks) } xml_get <- function(node, name) { res <- xml_text %$% xml_find_all(node, name) if (length(res) == 0) NA_character_ else res } # parse one redner parse_redner <- function(redner_xml) { redner_id <- xml_attr(redner_xml, "id") nm <- xml_child(redner_xml) vorname <- xml_get(nm, "vorname") nachname <- xml_get(nm, "nachname") fraktion <- xml_get(nm, "fraktion") titel <- xml_get(nm, "titel") rolle <- xml_find_all(nm, "rolle") if (length(rolle) > 0) { rolle_lang <- xml_get(rolle, "rolle_lang") rolle_kurz <- xml_get(rolle, "rolle_kurz") } else rolle_kurz <- rolle_lang <- NA_character_ c(id = redner_id, vorname = vorname, nachname = nachname, fraktion = fraktion, titel = titel, rolle_kurz = rolle_kurz, rolle_lang = rolle_lang) } # parse one rede # returns: - a rede (with rede id and redner id) # - all talks appearing in the rede (with corresponding content) parse_rede <- function(rede_xml) { rede_id <- xml_attr(rede_xml, "id") cs <- xml_children(rede_xml) cur_redner <- NA_character_ principal_redner <- NA_character_ cur_content <- "" reden <- list() for (node in cs) { if (xml_name(node) == "p") { klasse <- xml_attr(node, "klasse") if (!is.na(klasse) && klasse == "redner") { if (!is.na(cur_redner)) { rede <- c(rede_id = rede_id, redner = cur_redner, content = cur_content) reden <- c(reden, list(rede)) cur_content <- "" } else { principal_redner <- xml_child(node) %>% xml_attr("id") } cur_redner <- xml_child(node) %>% xml_attr("id") } else { cur_content <- paste0(cur_content, xml_text(node), sep="\n") } } } rede <- c(rede_id = rede_id, redner = cur_redner, content = cur_content) reden <- c(reden, list(rede)) list(rede = c(id = rede_id, redner = principal_redner), parts = reden) } # creates a tibble of reden and a tibble of talks from a list of xml nodes representing reden parse_redenliste <- function(redenliste_xml) { d <- sapply(redenliste_xml, parse_rede) reden <- simplify2array(d["rede", ]) parts <- simplify2array %$% unlist(d["parts", ], recursive=FALSE) list(reden = tibble(id = reden["id",], redner = reden["redner",]), talks = tibble(rede_id = parts["rede_id", ], redner = parts["redner", ], content = parts["content", ])) } # create a tibble of redner from a list of xml nodes representing redner parse_rednerliste <- function(rednerliste_xml) { d <- sapply(rednerliste_xml, parse_redner) tibble(id = d["id",], vorname = d["vorname",], nachname = d["nachname",], fraktion = d["fraktion",], titel = d["titel",], rolle_kurz = d["rolle_kurz",], rolle_lang = d["rolle_lang",]) } # ------------------------------- # EXAMPLE USE # make sure data ist downloaded via fetch.R # res <- read_one("19126-data.xml") # # res$redner # res$reden # res$talks # -------------------------------