From f0f6a9ca3f2c624c84313b45b0c80bf0ac5ac5e6 Mon Sep 17 00:00:00 2001 From: flavis Date: Thu, 17 Jun 2021 15:24:51 +0200 Subject: [PATCH] add first parsing utilities --- scraping/parse.R | 120 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 120 insertions(+) create mode 100644 scraping/parse.R diff --git a/scraping/parse.R b/scraping/parse.R new file mode 100644 index 0000000..7415431 --- /dev/null +++ b/scraping/parse.R @@ -0,0 +1,120 @@ +source("config.R") +source("../utils/helpers.R") +library("xml2") +library(tibble) +library(magrittr) + +# for usage see the example at the end + +# this reads all currently parseable data from one xml +read_one <- function(name) { + x <- read_xml(paste0(DOWNLOAD_DIR, name)) + cs <- xml_children(x) + + verlauf <- xml_find_first(x, "sitzungsverlauf") + rednerl <- xml_find_first(x, "rednerliste") + + xml_children(rednerl) %>% + parse_rednerliste() -> + redner + + xml_children(verlauf) %>% + xml_find_all("rede") %>% + parse_redenliste() -> + res + list(redner = redner, reden = res$reden, talks = res$talks) +} + +xml_get <- function(node, name) { + res <- xml_text %$% xml_find_all(node, name) + if (length(res) == 0) NA_character_ + else res +} + +# parse one redner +parse_redner <- function(redner_xml) { + redner_id <- xml_attr(redner_xml, "id") + nm <- xml_child(redner_xml) + vorname <- xml_get(nm, "vorname") + nachname <- xml_get(nm, "nachname") + fraktion <- xml_get(nm, "fraktion") + titel <- xml_get(nm, "titel") + rolle <- xml_find_all(nm, "rolle") + if (length(rolle) > 0) { + rolle_lang <- xml_get(rolle, "rolle_lang") + rolle_kurz <- xml_get(rolle, "rolle_kurz") + } else rolle_kurz <- rolle_lang <- NA_character_ + c(id = redner_id, vorname = vorname, nachname = nachname, fraktion = fraktion, titel = titel, + rolle_kurz = rolle_kurz, rolle_lang = rolle_lang) +} + +# parse one rede +# returns: - a rede (with rede id and redner id) +# - all talks appearing in the rede (with corresponding content) +parse_rede <- function(rede_xml) { + rede_id <- xml_attr(rede_xml, "id") + cs <- xml_children(rede_xml) + cur_redner <- NA_character_ + principal_redner <- NA_character_ + cur_content <- "" + reden <- list() + for (node in cs) { + if (xml_name(node) == "p") { + if (xml_attr(node, "klasse") == "redner") { + if (!is.na(cur_redner)) { + rede <- c(rede_id = rede_id, + redner = cur_redner, + content = cur_content) + reden <- c(reden, list(rede)) + cur_content <- "" + } else { + principal_redner <- xml_child(node) %>% xml_attr("id") + } + cur_redner <- xml_child(node) %>% xml_attr("id") + } else { + cur_content <- paste0(cur_content, xml_text(node), sep="\n") + } + } + } + rede <- c(rede_id = rede_id, + redner = cur_redner, + content = cur_content) + reden <- c(reden, list(rede)) + list(rede = c(id = rede_id, redner = principal_redner), + parts = reden) +} + +# creates a tibble of reden and a tibble of talks from a list of xml nodes representing reden +parse_redenliste <- function(redenliste_xml) { + d <- sapply(redenliste_xml, parse_rede) + reden <- simplify2array(d["rede", ]) + parts <- simplify2array %$% unlist(d["parts", ], recursive=FALSE) + list(reden = tibble(id = reden["id",], redner = reden["redner",]), + talks = tibble(rede_id = parts["rede_id", ], + redner = parts["redner", ], + content = parts["content", ])) +} + +# create a tibble of redner from a list of xml nodes representing redner +parse_rednerliste <- function(rednerliste_xml) { + d <- sapply(rednerliste_xml, parse_redner) + tibble(id = d["id",], + vorname = d["vorname",], + nachname = d["nachname",], + fraktion = d["fraktion",], + titel = d["titel",], + rolle_kurz = d["rolle_kurz",], + rolle_lang = d["rolle_lang",]) +} + +# ------------------------------- +# EXAMPLE USE + +# make sure data ist downloaded via fetch.R +res <- read_one("19038-data.xml") + +res$redner +res$reden +res$talks + +# -------------------------------