From 92358482c65f94bf01c2bd8a9d288093eef4165b Mon Sep 17 00:00:00 2001 From: flavis Date: Thu, 24 Jun 2021 16:09:14 +0200 Subject: [PATCH] fix parsing, add progressbar --- scraping/parse.R | 23 +++++++++++++---------- 1 file changed, 13 insertions(+), 10 deletions(-) diff --git a/scraping/parse.R b/scraping/parse.R index b27f5ad..cf5dd3c 100644 --- a/scraping/parse.R +++ b/scraping/parse.R @@ -4,33 +4,35 @@ library("xml2") library(tibble) library(dplyr) library(magrittr) +library(pbapply) # for usage see the example at the end read_all <- function() { + cat("Reading all protocols from", DOWNLOAD_DIR, "\n") available_protocols <- list.files(DOWNLOAD_DIR) - res <- lapply(available_protocols, read_one) + res <- pblapply(available_protocols, read_one) - sapply(res, `[[`, "redner") %>% + lapply(res, `[[`, "redner") %>% bind_rows() %>% distinct() -> redner - sapply(res, `[[`, "reden") %>% + lapply(res, `[[`, "reden") %>% bind_rows() %>% distinct() -> reden - sapply(res, `[[`, "talks") %>% + lapply(res, `[[`, "talks") %>% bind_rows() %>% distinct() -> talks + list(redner = redner, reden = reden, talks = talks) } # this reads all currently parseable data from one xml read_one <- function(name) { - print(paste("reading", name)) x <- tryCatch(read_xml(paste0(DOWNLOAD_DIR, name)), error = function(c) NULL) if (is.null(x)) return(NULL) @@ -47,6 +49,7 @@ read_one <- function(name) { xml_find_all("rede") %>% parse_redenliste() -> res + list(redner = redner, reden = res$reden, talks = res$talks) } @@ -137,10 +140,10 @@ parse_rednerliste <- function(rednerliste_xml) { # EXAMPLE USE # make sure data ist downloaded via fetch.R -res <- read_one("19126-data.xml") - -res$redner -res$reden -res$talks +# res <- read_one("19126-data.xml") +# +# res$redner +# res$reden +# res$talks # -------------------------------