소스 검색

add a read_all method, that parses all available protocols

package
flavis 4 년 전
부모
커밋
2c7b99c942
1개의 변경된 파일29개의 추가작업 그리고 3개의 파일을 삭제
  1. +29
    -3
      scraping/parse.R

+ 29
- 3
scraping/parse.R 파일 보기

@@ -2,13 +2,38 @@ source("config.R")
source("../utils/helpers.R")
library("xml2")
library(tibble)
library(dplyr)
library(magrittr)

# for usage see the example at the end

# Parse every protocol file found in DOWNLOAD_DIR and merge the results.
#
# Calls read_one() on each file name returned by list.files(DOWNLOAD_DIR).
# For each of the components "redner", "reden" and "talks" the per-file
# pieces are stacked with bind_rows() and duplicate rows are removed with
# distinct().
#
# Returns a named list with the elements `redner`, `reden` and `talks`.
read_all <- function() {
  available_protocols <- list.files(DOWNLOAD_DIR)
  res <- lapply(available_protocols, read_one)

  # read_one() returns NULL for a file it could not read; drop those entries
  # so only successfully parsed protocols are combined.
  res <- Filter(Negate(is.null), res)

  # Use lapply() rather than sapply(): sapply() may simplify equally sized
  # results into a list-matrix, which bind_rows() cannot stack. lapply()
  # always yields a plain list of per-file pieces.
  combine <- function(component) {
    pieces <- lapply(res, `[[`, component)
    distinct(bind_rows(pieces))
  }

  list(
    redner = combine("redner"),
    reden = combine("reden"),
    talks = combine("talks")
  )
}

# this reads all currently parseable data from one xml
read_one <- function(name) {
x <- read_xml(paste0(DOWNLOAD_DIR, name))
print(paste("reading", name))
x <- tryCatch(read_xml(paste0(DOWNLOAD_DIR, name)),
error = function(c) NULL)
if (is.null(x)) return(NULL)
cs <- xml_children(x)

verlauf <- xml_find_first(x, "sitzungsverlauf")
@@ -60,7 +85,8 @@ parse_rede <- function(rede_xml) {
reden <- list()
for (node in cs) {
if (xml_name(node) == "p") {
if (xml_attr(node, "klasse") == "redner") {
klasse <- xml_attr(node, "klasse")
if (!is.na(klasse) && klasse == "redner") {
if (!is.na(cur_redner)) {
rede <- c(rede_id = rede_id,
redner = cur_redner,
@@ -111,7 +137,7 @@ parse_rednerliste <- function(rednerliste_xml) {
# EXAMPLE USE

# make sure the data is downloaded via fetch.R
res <- read_one("19038-data.xml")
res <- read_one("19126-data.xml")

res$redner
res$reden


불러오는 중...
취소
저장