# for usage see the example at the end #' Parse xml records #' #' Creates a list of tibbles containing relevant information from all records #' stored in the input directory. #' #' @param path character #' #' @export read_all <- function(path="records/") { cat("Reading all records from", path, "\n") available_protocols <- list.files(path) res <- pblapply(available_protocols, read_one, path=path) lapply(res, `[[`, "redner") %>% bind_rows() %>% distinct() -> redner lapply(res, `[[`, "reden") %>% bind_rows() %>% distinct() -> reden lapply(res, `[[`, "talks") %>% bind_rows() %>% distinct() -> talks lapply(res, `[[`, "comments") %>% bind_rows() %>% distinct() -> comments if (length(available_protocols) == 0) warning("The given directory is empty or does not exist.") list(redner = redner, reden = reden, talks = talks, comments = comments) } # this reads all currently parseable data from one xml read_one <- function(name, path) { x <- tryCatch(read_xml(paste0(path, name)), error = function(c) NULL) if (is.null(x)) return(NULL) cs <- xml_children(x) verlauf <- xml_find_first(x, "sitzungsverlauf") rednerl <- xml_find_first(x, "rednerliste") xml_children(rednerl) %>% parse_rednerliste() -> redner xml_children(verlauf) %>% xml_find_all("rede") %>% parse_redenliste() -> res list(redner = redner, reden = res$reden, talks = res$talks, comments = res$comments) } xml_get <- function(node, name) { res <- xml_text %$% xml_find_all(node, name) if (length(res) == 0) NA_character_ else res } # parse one redner parse_redner <- function(redner_xml) { redner_id <- xml_attr(redner_xml, "id") nm <- xml_child(redner_xml) vorname <- xml_get(nm, "vorname") nachname <- xml_get(nm, "nachname") fraktion <- xml_get(nm, "fraktion") titel <- xml_get(nm, "titel") rolle <- xml_find_all(nm, "rolle") if (length(rolle) > 0) { rolle_lang <- xml_get(rolle, "rolle_lang") rolle_kurz <- xml_get(rolle, "rolle_kurz") } else rolle_kurz <- rolle_lang <- NA_character_ c(id = redner_id, vorname = vorname, nachname = nachname, fraktion = fraktion, titel = titel, rolle_kurz = rolle_kurz, rolle_lang = rolle_lang) } # parse one rede # returns: - a rede (with rede id and redner id) # - all talks appearing in the rede (with corresponding content) parse_rede <- function(rede_xml) { rede_id <- xml_attr(rede_xml, "id") cs <- xml_children(rede_xml) cur_redner <- NA_character_ principal_redner <- NA_character_ cur_content <- "" reden <- list() comments <- list() for (node in cs) { if (xml_name(node) == "p" || xml_name(node) == "name") { klasse <- xml_attr(node, "klasse") if ((!is.na(klasse) && klasse == "redner") || xml_name(node) == "name") { if (!is.na(cur_redner)) { rede <- c(rede_id = rede_id, redner = cur_redner, content = cur_content) reden <- c(reden, list(rede)) cur_content <- "" } if (is.na(principal_redner) && xml_name(node) != "name") { principal_redner <- xml_child(node) %>% xml_attr("id") } if (xml_name(node) == "name") { cur_redner <- "BTP" } else { cur_redner <- xml_child(node) %>% xml_attr("id") } } else { cur_content <- paste0(cur_content, xml_text(node), sep="\n") } } else if (xml_name(node) == "kommentar") { # comments are of the form # (blabla [Fraktion] – blabla liasdf – bla) xml_text(node) %>% str_sub(2, -2) %>% str_split("–") %>% `[[`(1) %>% lapply(parse_comment, rede_id = rede_id, on_redner = cur_redner) -> cs comments <- c(comments, cs) } } rede <- c(rede_id = rede_id, redner = cur_redner, content = cur_content) reden <- c(reden, list(rede)) list(rede = c(id = rede_id, redner = principal_redner), parts = reden, comments = comments) } fraktionspattern <- "BÜNDNIS(SES)?\\W*90/DIE\\W*GRÜNEN|CDU/CSU|AfD|SPD|DIE LINKE|FDP|LINKEN" fraktionsnames <- c("BÜNDNIS 90/DIE GRÜNEN", "CDU/CSU", "AfD", "SPD", "DIE LINKE", "FDP") parse_comment <- function(comment, rede_id, on_redner) { base <- c(rede_id = rede_id, on_redner = on_redner) str_extract_all(comment, fraktionspattern) %>% `[[`(1) %>% sapply(partial(flip(head), 1) %.% agrep, x=fraktionsnames, max=0.2, value=T) %>% str_c(collapse=",") -> by # classify comment # TODO: # - actually separate content properly # - differentiate between [AfD] and AfD in by if(str_detect(comment, "Beifall")) { c(base, type = "applause", fraktion = by, kommentator = NA_character_, content = comment) } else { ps <- str_match(comment, "(.*) \\[(.*?)\\]: (.*)")[1,] c(base, type = "comment", fraktion = ps[3], kommentator = ps[2], content = ps[4]) } } # creates a tibble of reden and a tibble of talks from a list of xml nodes representing reden parse_redenliste <- function(redenliste_xml) { d <- sapply(redenliste_xml, parse_rede) reden <- simplify2array(d["rede", ]) parts <- simplify2array %$% unlist(d["parts", ], recursive=FALSE) comments <- simplify2array %$% unlist(d["comments", ], recursive=FALSE) list(reden = tibble(id = reden["id",], redner = reden["redner",]), talks = tibble(rede_id = parts["rede_id", ], redner = parts["redner", ], content = parts["content", ]), comments = tibble(rede_id = comments["rede_id",], on_redner = comments["on_redner",], type = comments["type",], fraktion = comments["fraktion",], kommentator = comments["kommentator",], content = comments["content", ])) } # create a tibble of redner from a list of xml nodes representing redner parse_rednerliste <- function(rednerliste_xml) { d <- sapply(rednerliste_xml, parse_redner) tibble(id = d["id",], vorname = d["vorname",], nachname = d["nachname",], fraktion = d["fraktion",], titel = d["titel",], rolle_kurz = d["rolle_kurz",], rolle_lang = d["rolle_lang",]) } #' @export write_to_csv <- function(tables, path="csv/", create=F) { check_directory(path, create) write.table(tables$redner, str_c(path, "redner.csv")) write.table(tables$reden, str_c(path, "reden.csv")) write.table(tables$talks, str_c(path, "talks.csv")) write.table(tables$comments, str_c(path, "comments.csv")) } #' @export read_from_csv <- function(path="csv/") { list(redner = read.table(str_c(path, "redner.csv")) %>% tibble() %>% mutate(id = as.character(id)), reden = read.table(str_c(path, "reden.csv")) %>% tibble() %>% mutate(redner = as.character(redner)), talks = tibble %$% read.table(str_c(path, "talks.csv")), comments = tibble %$% read.table(str_c(path, "comments.csv"))) } # ------------------------------- # EXAMPLE USE # make sure data ist downloaded via fetch.R # res <- read_one("records/19126-data.xml") # # res$redner # res$reden # res$talks # -------------------------------