Просмотр исходного кода

fix parsing, add progressbar

package
flavis 4 лет назад
Родитель
Сommit
92358482c6
1 измененных файлов: 13 добавлений и 10 удалений
  1. +13
    -10
      scraping/parse.R

+ 13
- 10
scraping/parse.R Просмотреть файл

@@ -4,33 +4,35 @@ library("xml2")
library(tibble)
library(dplyr)
library(magrittr)
library(pbapply)

# for usage see the example at the end

# Read every protocol file found in DOWNLOAD_DIR and merge the results.
#
# Each file name is handed to read_one() (with a progress bar via
# pbapply::pblapply). The per-file "redner", "reden" and "talks" components
# are then row-bound and de-duplicated.
#
# Returns a named list with the elements `redner`, `reden` and `talks`.
read_all <- function() {
  cat("Reading all protocols from", DOWNLOAD_DIR, "\n")
  available_protocols <- list.files(DOWNLOAD_DIR)
  res <- pblapply(available_protocols, read_one)

  # Pull one named component out of every per-file result and stack them.
  # lapply (not sapply) is used so the result is always a plain list,
  # whatever the individual components look like. read_one() returns NULL
  # for files it cannot read; `[[` on NULL yields NULL, which bind_rows()
  # skips.
  collect <- function(component) {
    lapply(res, `[[`, component) %>%
      bind_rows() %>%
      distinct()
  }

  redner <- collect("redner")
  reden <- collect("reden")
  talks <- collect("talks")

  list(redner = redner, reden = reden, talks = talks)
}

# this reads all currently parseable data from one xml
read_one <- function(name) {
print(paste("reading", name))
x <- tryCatch(read_xml(paste0(DOWNLOAD_DIR, name)),
error = function(c) NULL)
if (is.null(x)) return(NULL)
@@ -47,6 +49,7 @@ read_one <- function(name) {
xml_find_all("rede") %>%
parse_redenliste() ->
res

list(redner = redner, reden = res$reden, talks = res$talks)
}

@@ -137,10 +140,10 @@ parse_rednerliste <- function(rednerliste_xml) {
# EXAMPLE USE

# Make sure the data is downloaded via fetch.R first.
# The example is kept commented out so that sourcing this file does not
# trigger a parse run as a side effect.
# res <- read_one("19126-data.xml")
#
# res$redner
# res$reden
# res$talks

# -------------------------------

Загрузка…
Отмена
Сохранить