| @@ -0,0 +1,120 @@ | |||||
| source("config.R") | |||||
| source("../utils/helpers.R") | |||||
| library("xml2") | |||||
| library(tibble) | |||||
| library(magrittr) | |||||
| # for usage see the example at the end | |||||
| # this reads all currently parseable data from one xml | |||||
| read_one <- function(name) { | |||||
| x <- read_xml(paste0(DOWNLOAD_DIR, name)) | |||||
| cs <- xml_children(x) | |||||
| verlauf <- xml_find_first(x, "sitzungsverlauf") | |||||
| rednerl <- xml_find_first(x, "rednerliste") | |||||
| xml_children(rednerl) %>% | |||||
| parse_rednerliste() -> | |||||
| redner | |||||
| xml_children(verlauf) %>% | |||||
| xml_find_all("rede") %>% | |||||
| parse_redenliste() -> | |||||
| res | |||||
| list(redner = redner, reden = res$reden, talks = res$talks) | |||||
| } | |||||
| xml_get <- function(node, name) { | |||||
| res <- xml_text %$% xml_find_all(node, name) | |||||
| if (length(res) == 0) NA_character_ | |||||
| else res | |||||
| } | |||||
| # parse one redner | |||||
| parse_redner <- function(redner_xml) { | |||||
| redner_id <- xml_attr(redner_xml, "id") | |||||
| nm <- xml_child(redner_xml) | |||||
| vorname <- xml_get(nm, "vorname") | |||||
| nachname <- xml_get(nm, "nachname") | |||||
| fraktion <- xml_get(nm, "fraktion") | |||||
| titel <- xml_get(nm, "titel") | |||||
| rolle <- xml_find_all(nm, "rolle") | |||||
| if (length(rolle) > 0) { | |||||
| rolle_lang <- xml_get(rolle, "rolle_lang") | |||||
| rolle_kurz <- xml_get(rolle, "rolle_kurz") | |||||
| } else rolle_kurz <- rolle_lang <- NA_character_ | |||||
| c(id = redner_id, vorname = vorname, nachname = nachname, fraktion = fraktion, titel = titel, | |||||
| rolle_kurz = rolle_kurz, rolle_lang = rolle_lang) | |||||
| } | |||||
| # parse one rede | |||||
| # returns: - a rede (with rede id and redner id) | |||||
| # - all talks appearing in the rede (with corresponding content) | |||||
| parse_rede <- function(rede_xml) { | |||||
| rede_id <- xml_attr(rede_xml, "id") | |||||
| cs <- xml_children(rede_xml) | |||||
| cur_redner <- NA_character_ | |||||
| principal_redner <- NA_character_ | |||||
| cur_content <- "" | |||||
| reden <- list() | |||||
| for (node in cs) { | |||||
| if (xml_name(node) == "p") { | |||||
| if (xml_attr(node, "klasse") == "redner") { | |||||
| if (!is.na(cur_redner)) { | |||||
| rede <- c(rede_id = rede_id, | |||||
| redner = cur_redner, | |||||
| content = cur_content) | |||||
| reden <- c(reden, list(rede)) | |||||
| cur_content <- "" | |||||
| } else { | |||||
| principal_redner <- xml_child(node) %>% xml_attr("id") | |||||
| } | |||||
| cur_redner <- xml_child(node) %>% xml_attr("id") | |||||
| } else { | |||||
| cur_content <- paste0(cur_content, xml_text(node), sep="\n") | |||||
| } | |||||
| } | |||||
| } | |||||
| rede <- c(rede_id = rede_id, | |||||
| redner = cur_redner, | |||||
| content = cur_content) | |||||
| reden <- c(reden, list(rede)) | |||||
| list(rede = c(id = rede_id, redner = principal_redner), | |||||
| parts = reden) | |||||
| } | |||||
| # creates a tibble of reden and a tibble of talks from a list of xml nodes representing reden | |||||
| parse_redenliste <- function(redenliste_xml) { | |||||
| d <- sapply(redenliste_xml, parse_rede) | |||||
| reden <- simplify2array(d["rede", ]) | |||||
| parts <- simplify2array %$% unlist(d["parts", ], recursive=FALSE) | |||||
| list(reden = tibble(id = reden["id",], redner = reden["redner",]), | |||||
| talks = tibble(rede_id = parts["rede_id", ], | |||||
| redner = parts["redner", ], | |||||
| content = parts["content", ])) | |||||
| } | |||||
| # create a tibble of redner from a list of xml nodes representing redner | |||||
| parse_rednerliste <- function(rednerliste_xml) { | |||||
| d <- sapply(rednerliste_xml, parse_redner) | |||||
| tibble(id = d["id",], | |||||
| vorname = d["vorname",], | |||||
| nachname = d["nachname",], | |||||
| fraktion = d["fraktion",], | |||||
| titel = d["titel",], | |||||
| rolle_kurz = d["rolle_kurz",], | |||||
| rolle_lang = d["rolle_lang",]) | |||||
| } | |||||
| # ------------------------------- | |||||
| # EXAMPLE USE | |||||
| # make sure data ist downloaded via fetch.R | |||||
| res <- read_one("19038-data.xml") | |||||
| res$redner | |||||
| res$reden | |||||
| res$talks | |||||
| # ------------------------------- | |||||