| @@ -0,0 +1,120 @@ | |||
| source("config.R") | |||
| source("../utils/helpers.R") | |||
| library("xml2") | |||
| library(tibble) | |||
| library(magrittr) | |||
| # for usage see the example at the end | |||
# Read all currently parseable data from one XML file.
#
# name: file name of the XML document. It is appended to DOWNLOAD_DIR with
#       paste0(), i.e. no path separator is inserted between the two.
#       DOWNLOAD_DIR is not defined in this file (presumably it comes from
#       config.R -- confirm).
# Returns a list with three elements:
#   redner - result of parse_rednerliste() on the children of <rednerliste>
#   reden  - the `reden` element returned by parse_redenliste()
#   talks  - the `talks` element returned by parse_redenliste()
read_one <- function(name) {
  doc <- read_xml(paste0(DOWNLOAD_DIR, name))

  # Speakers: all children of the first <rednerliste> element.
  rednerliste <- xml_find_first(doc, "rednerliste")
  redner <- parse_rednerliste(xml_children(rednerliste))

  # Speeches: <rede> elements located directly below the children of the
  # first <sitzungsverlauf> element.
  verlauf <- xml_find_first(doc, "sitzungsverlauf")
  reden_xml <- xml_find_all(xml_children(verlauf), "rede")
  res <- parse_redenliste(reden_xml)

  list(redner = redner, reden = res$reden, talks = res$talks)
}
# Text content of the nodes matched by `name` relative to `node`.
#
# node: an xml node or nodeset to search in.
# name: XPath expression handed to xml_find_all().
# Returns a character vector with one element per match (so it can be longer
# than 1), or NA_character_ when nothing matches.
xml_get <- function(node, name) {
  # Plain nested call: magrittr's `%$%` is the exposition pipe, not function
  # application, so applying xml_text() directly avoids depending on an
  # override of that operator.
  res <- xml_text(xml_find_all(node, name))
  if (length(res) == 0) {
    NA_character_
  } else {
    res
  }
}
# Parse one redner node into a named character vector.
#
# redner_xml: a single xml node; its "id" attribute and its first child
#             element are read.
# Returns c(id, vorname, nachname, fraktion, titel, rolle_kurz, rolle_lang).
# Fields for which xml_get() finds nothing are NA_character_.
parse_redner <- function(redner_xml) {
  # All name details are looked up inside the first child element.
  name_node <- xml_child(redner_xml)
  rolle_nodes <- xml_find_all(name_node, "rolle")

  # Role fields default to NA and are only queried when a <rolle> exists.
  rolle_lang <- NA_character_
  rolle_kurz <- NA_character_
  if (length(rolle_nodes) > 0) {
    rolle_lang <- xml_get(rolle_nodes, "rolle_lang")
    rolle_kurz <- xml_get(rolle_nodes, "rolle_kurz")
  }

  c(
    id = xml_attr(redner_xml, "id"),
    vorname = xml_get(name_node, "vorname"),
    nachname = xml_get(name_node, "nachname"),
    fraktion = xml_get(name_node, "fraktion"),
    titel = xml_get(name_node, "titel"),
    rolle_kurz = rolle_kurz,
    rolle_lang = rolle_lang
  )
}
# Parse one rede node.
#
# rede_xml: xml node of a single rede; its "id" attribute and its direct
#           children are read. Only <p> children are processed, every other
#           child element is skipped.
# Returns a list with
#   rede  - c(id, redner): the rede id and the speaker id taken from the
#           first <p klasse="redner"> paragraph
#   parts - list of c(rede_id, redner, content) vectors; a new part starts at
#           every further <p klasse="redner"> paragraph, and the text of the
#           other <p> paragraphs is appended to the current part, one line
#           per paragraph
parse_rede <- function(rede_xml) {
  rede_id <- xml_attr(rede_xml, "id")
  cur_redner <- NA_character_
  principal_redner <- NA_character_
  cur_content <- ""
  reden <- list()

  # Snapshot of the part that is currently being collected.
  current_part <- function() {
    c(rede_id = rede_id, redner = cur_redner, content = cur_content)
  }

  for (node in xml_children(rede_xml)) {
    if (xml_name(node) != "p") {
      next
    }
    # xml_attr() returns NA when "klasse" is absent; isTRUE() turns that into
    # FALSE so such paragraphs count as content instead of aborting the loop
    # with "missing value where TRUE/FALSE needed".
    if (isTRUE(xml_attr(node, "klasse") == "redner")) {
      redner_id <- xml_attr(xml_child(node), "id")
      if (is.na(cur_redner)) {
        # No speaker seen so far: this one is recorded as the principal one.
        principal_redner <- redner_id
      } else {
        # Speaker change: store what the previous speaker said.
        reden <- c(reden, list(current_part()))
        cur_content <- ""
      }
      cur_redner <- redner_id
    } else {
      # paste0() has no `sep` argument, so the newline is passed as a regular
      # string to be appended after the paragraph text.
      cur_content <- paste0(cur_content, xml_text(node), "\n")
    }
  }

  # Store the part that was still open when the children ran out.
  reden <- c(reden, list(current_part()))

  list(
    rede = c(id = rede_id, redner = principal_redner),
    parts = reden
  )
}
# Create a tibble of reden and a tibble of talks from a list of xml nodes
# representing reden.
#
# redenliste_xml: list / nodeset of xml nodes, each passed to parse_rede().
# Returns a list with
#   reden - tibble(id, redner), one row per node
#   talks - tibble(rede_id, redner, content), one row per part of any rede
# An empty input yields two tibbles with zero rows.
parse_redenliste <- function(redenliste_xml) {
  # lapply() always returns a list, so empty and single-element inputs need
  # no special handling (sapply() changes its return shape for those).
  parsed <- lapply(redenliste_xml, parse_rede)
  reden <- lapply(parsed, function(p) p$rede)
  # Flatten the per-rede lists of parts into one list of parts.
  parts <- unlist(lapply(parsed, function(p) p$parts), recursive = FALSE)

  # Pull one named field out of every record as an unnamed character vector.
  field <- function(records, key) {
    vapply(records, function(r) r[[key]], character(1), USE.NAMES = FALSE)
  }

  list(
    reden = tibble(
      id = field(reden, "id"),
      redner = field(reden, "redner")
    ),
    talks = tibble(
      rede_id = field(parts, "rede_id"),
      redner = field(parts, "redner"),
      content = field(parts, "content")
    )
  )
}
# Create a tibble of redner from a list of xml nodes representing redner.
#
# rednerliste_xml: list / nodeset of xml nodes, each passed to parse_redner().
# Returns tibble(id, vorname, nachname, fraktion, titel, rolle_kurz,
# rolle_lang) with one row per node; an empty input yields zero rows.
parse_rednerliste <- function(rednerliste_xml) {
  # lapply() always returns a list, so empty and single-element inputs need
  # no special handling (sapply() changes its return shape for those).
  records <- lapply(rednerliste_xml, parse_redner)

  # Pull one named field out of every record as an unnamed character vector.
  field <- function(key) {
    vapply(records, function(r) r[[key]], character(1), USE.NAMES = FALSE)
  }

  tibble(
    id = field("id"),
    vorname = field("vorname"),
    nachname = field("nachname"),
    fraktion = field("fraktion"),
    titel = field("titel"),
    rolle_kurz = field("rolle_kurz"),
    rolle_lang = field("rolle_lang")
  )
}
| # ------------------------------- | |||
| # EXAMPLE USE | |||
| # Make sure the data has been downloaded via fetch.R first. | |||
| # NOTE: these are top-level statements, so they run every time this file is evaluated. | |||
| # The file name is resolved by read_one() relative to DOWNLOAD_DIR. | |||
| res <- read_one("19038-data.xml") | |||
| # The three elements of the list returned by read_one(). | |||
| res$redner | |||
| res$reden | |||
| res$talks | |||
| # ------------------------------- | |||