diff --git a/scraping/parse.R b/scraping/parse.R
index 7415431..b27f5ad 100644
--- a/scraping/parse.R
+++ b/scraping/parse.R
@@ -2,13 +2,50 @@ source("config.R")
 source("../utils/helpers.R")
 library("xml2")
 library(tibble)
+library(dplyr)
 library(magrittr)
 
 # for usage see the example at the end
 
+# Reads every protocol file found in DOWNLOAD_DIR via read_one() and merges
+# the per-file results into one de-duplicated table per component.
+# Files for which read_one() returned NULL contribute nothing.
+# Returns a named list with the elements "redner", "reden" and "talks".
+# NOTE(review): assumes read_one() returns a list of data frames carrying
+# exactly these three names - confirm against its return value.
+read_all <- function() {
+  available_protocols <- list.files(DOWNLOAD_DIR)
+  res <- lapply(available_protocols, read_one)
+
+  # lapply() keeps the extracted tables in a plain list; sapply() would
+  # simplify equally shaped tables into a list-matrix before bind_rows().
+  collect <- function(component) {
+    lapply(res, `[[`, component) %>%
+      bind_rows() %>%
+      distinct()
+  }
+
+  list(
+    redner = collect("redner"),
+    reden = collect("reden"),
+    talks = collect("talks")
+  )
+}
+
 # this reads all currently parseable data from one xml
 read_one <- function(name) {
-  x <- read_xml(paste0(DOWNLOAD_DIR, name))
+  message("reading ", name)
+  # A file that cannot be parsed is reported and skipped (NULL result)
+  # instead of aborting a whole read_all() run.
+  x <- tryCatch(
+    read_xml(paste0(DOWNLOAD_DIR, name)),
+    error = function(e) {
+      warning("could not read ", name, ": ", conditionMessage(e),
+              call. = FALSE)
+      NULL
+    }
+  )
+  if (is.null(x)) return(NULL)
   cs <- xml_children(x)
 
   verlauf <- xml_find_first(x, "sitzungsverlauf")
@@ -60,7 +97,9 @@ parse_rede <- function(rede_xml) {
   reden <- list()
   for (node in cs) {
     if (xml_name(node) == "p") {
-      if (xml_attr(node, "klasse") == "redner") {
+      # a <p> without a "klasse" attribute yields NA, so guard the comparison
+      klasse <- xml_attr(node, "klasse")
+      if (!is.na(klasse) && klasse == "redner") {
         if (!is.na(cur_redner)) {
           rede <- c(rede_id = rede_id,
                     redner = cur_redner,
@@ -111,7 +150,7 @@ parse_rednerliste <- function(rednerliste_xml) {
 
 # EXAMPLE USE
 # make sure data ist downloaded via fetch.R
-res <- read_one("19038-data.xml")
+res <- read_one("19126-data.xml")
 res$redner
 res$reden
 