소스 검색

fix parsing, add progressbar

package
flavis 4 년 전
부모
커밋
92358482c6
1개의 변경된 파일, 13개의 추가 그리고 10개의 삭제
  1. +13
    -10
      scraping/parse.R

+ 13
- 10
scraping/parse.R 파일 보기

@@ -4,33 +4,35 @@ library("xml2")
library(tibble)
library(dplyr)
library(magrittr)
library(pbapply)

# for usage see the example at the end

read_all <- function() {
cat("Reading all protocols from", DOWNLOAD_DIR, "\n")
available_protocols <- list.files(DOWNLOAD_DIR)
res <- lapply(available_protocols, read_one)
res <- pblapply(available_protocols, read_one)

sapply(res, `[[`, "redner") %>%
lapply(res, `[[`, "redner") %>%
bind_rows() %>%
distinct() ->
redner
sapply(res, `[[`, "reden") %>%
lapply(res, `[[`, "reden") %>%
bind_rows() %>%
distinct() ->
reden

sapply(res, `[[`, "talks") %>%
lapply(res, `[[`, "talks") %>%
bind_rows() %>%
distinct() ->
talks

list(redner = redner, reden = reden, talks = talks)
}

# this reads all currently parseable data from one xml
read_one <- function(name) {
print(paste("reading", name))
x <- tryCatch(read_xml(paste0(DOWNLOAD_DIR, name)),
error = function(c) NULL)
if (is.null(x)) return(NULL)
@@ -47,6 +49,7 @@ read_one <- function(name) {
xml_find_all("rede") %>%
parse_redenliste() ->
res

list(redner = redner, reden = res$reden, talks = res$talks)
}

@@ -137,10 +140,10 @@ parse_rednerliste <- function(rednerliste_xml) {
# EXAMPLE USE

# make sure data is downloaded via fetch.R
res <- read_one("19126-data.xml")
res$redner
res$reden
res$talks
# res <- read_one("19126-data.xml")
#
# res$redner
# res$reden
# res$talks

# -------------------------------

불러오는 중...
취소
저장