Parcourir la source

fix parsing, add progressbar

package
flavis il y a 4 ans
Parent
révision
92358482c6
1 fichier modifié avec 13 ajouts et 10 suppressions
  1. +13
    -10
      scraping/parse.R

+ 13
- 10
scraping/parse.R Voir le fichier

@@ -4,33 +4,35 @@ library("xml2")
library(tibble) library(tibble)
library(dplyr) library(dplyr)
library(magrittr) library(magrittr)
library(pbapply)


# for usage see the example at the end # for usage see the example at the end


read_all <- function() { read_all <- function() {
cat("Reading all protocols from", DOWNLOAD_DIR, "\n")
available_protocols <- list.files(DOWNLOAD_DIR) available_protocols <- list.files(DOWNLOAD_DIR)
res <- lapply(available_protocols, read_one)
res <- pblapply(available_protocols, read_one)


sapply(res, `[[`, "redner") %>%
lapply(res, `[[`, "redner") %>%
bind_rows() %>% bind_rows() %>%
distinct() -> distinct() ->
redner redner
sapply(res, `[[`, "reden") %>%
lapply(res, `[[`, "reden") %>%
bind_rows() %>% bind_rows() %>%
distinct() -> distinct() ->
reden reden


sapply(res, `[[`, "talks") %>%
lapply(res, `[[`, "talks") %>%
bind_rows() %>% bind_rows() %>%
distinct() -> distinct() ->
talks talks

list(redner = redner, reden = reden, talks = talks) list(redner = redner, reden = reden, talks = talks)
} }


# this reads all currently parseable data from one xml # this reads all currently parseable data from one xml
read_one <- function(name) { read_one <- function(name) {
print(paste("reading", name))
x <- tryCatch(read_xml(paste0(DOWNLOAD_DIR, name)), x <- tryCatch(read_xml(paste0(DOWNLOAD_DIR, name)),
error = function(c) NULL) error = function(c) NULL)
if (is.null(x)) return(NULL) if (is.null(x)) return(NULL)
@@ -47,6 +49,7 @@ read_one <- function(name) {
xml_find_all("rede") %>% xml_find_all("rede") %>%
parse_redenliste() -> parse_redenliste() ->
res res

list(redner = redner, reden = res$reden, talks = res$talks) list(redner = redner, reden = res$reden, talks = res$talks)
} }


@@ -137,10 +140,10 @@ parse_rednerliste <- function(rednerliste_xml) {
# EXAMPLE USE # EXAMPLE USE


# make sure data is downloaded via fetch.R # make sure data is downloaded via fetch.R
res <- read_one("19126-data.xml")
res$redner
res$reden
res$talks
# res <- read_one("19126-data.xml")
#
# res$redner
# res$reden
# res$talks


# ------------------------------- # -------------------------------

Chargement…
Annuler
Enregistrer