|
|
|
@@ -2,13 +2,38 @@ source("config.R") |
|
|
|
source("../utils/helpers.R") |
|
|
|
library("xml2") |
|
|
|
library(tibble) |
|
|
|
library(dplyr) |
|
|
|
library(magrittr) |
|
|
|
|
|
|
|
# for usage see the example at the end |
|
|
|
|
|
|
|
# Read every protocol file found in DOWNLOAD_DIR and merge the results.
#
# Each file name returned by list.files(DOWNLOAD_DIR) is parsed with
# read_one(). For each of the components "redner", "reden" and "talks" the
# per-file pieces are stacked with bind_rows() and de-duplicated with
# distinct().
#
# Returns a named list with the elements `redner`, `reden` and `talks`.
read_all <- function() {
  available_protocols <- list.files(DOWNLOAD_DIR)
  res <- lapply(available_protocols, read_one)

  # read_one() may return NULL (it does so when read_xml() fails); drop those
  # entries so only real results are combined.
  res <- Filter(Negate(is.null), res)

  # Use lapply() rather than sapply(): sapply() simplifies equally sized
  # elements into a matrix, which would no longer be a plain list of tables
  # for bind_rows().
  combine <- function(component) {
    lapply(res, `[[`, component) %>%
      bind_rows() %>%
      distinct()
  }

  list(
    redner = combine("redner"),
    reden = combine("reden"),
    talks = combine("talks")
  )
}
|
|
|
|
|
|
|
# this reads all currently parseable data from one xml |
|
|
|
read_one <- function(name) { |
|
|
|
x <- read_xml(paste0(DOWNLOAD_DIR, name)) |
|
|
|
print(paste("reading", name)) |
|
|
|
x <- tryCatch(read_xml(paste0(DOWNLOAD_DIR, name)), |
|
|
|
error = function(c) NULL) |
|
|
|
if (is.null(x)) return(NULL) |
|
|
|
cs <- xml_children(x) |
|
|
|
|
|
|
|
verlauf <- xml_find_first(x, "sitzungsverlauf") |
|
|
|
@@ -60,7 +85,8 @@ parse_rede <- function(rede_xml) { |
|
|
|
reden <- list() |
|
|
|
for (node in cs) { |
|
|
|
if (xml_name(node) == "p") { |
|
|
|
if (xml_attr(node, "klasse") == "redner") { |
|
|
|
klasse <- xml_attr(node, "klasse") |
|
|
|
if (!is.na(klasse) && klasse == "redner") { |
|
|
|
if (!is.na(cur_redner)) { |
|
|
|
rede <- c(rede_id = rede_id, |
|
|
|
redner = cur_redner, |
|
|
|
@@ -111,7 +137,7 @@ parse_rednerliste <- function(rednerliste_xml) { |
|
|
|
# EXAMPLE USE |
|
|
|
|
|
|
|
# make sure data is downloaded via fetch.R |
|
|
|
# Parse a single protocol file and inspect two of the returned components.
# Only one file is read: an earlier read_one() call whose result was
# immediately overwritten has been dropped.
res <- read_one("19126-data.xml")

res$redner
res$reden
|
|
|
|