|
|
|
@@ -20,7 +20,8 @@ read_all <- function(path="records/") { |
|
|
|
|
|
|
|
lapply(res, `[[`, "reden") %>% |
|
|
|
bind_rows() %>% |
|
|
|
distinct() -> |
|
|
|
distinct() %>% |
|
|
|
mutate(date = as.Date(date, format="%d.%m.%Y")) -> |
|
|
|
reden |
|
|
|
|
|
|
|
lapply(res, `[[`, "talks") %>% |
|
|
|
@@ -31,11 +32,26 @@ read_all <- function(path="records/") { |
|
|
|
lapply(res, `[[`, "comments") %>% |
|
|
|
bind_rows() %>% |
|
|
|
distinct() -> |
|
|
|
comments |
|
|
|
commentsandapplause |
|
|
|
|
|
|
|
if (length(available_protocols) == 0) |
|
|
|
warning("The given directory is empty or does not exist.") |
|
|
|
list(redner = redner, reden = reden, talks = talks, comments = comments) |
|
|
|
|
|
|
|
filter(commentsandapplause, type == "comment") %>% |
|
|
|
select(-type) -> |
|
|
|
comments |
|
|
|
filter(commentsandapplause, type == "applause") %>% |
|
|
|
select(-type, -kommentator, -content) %>% |
|
|
|
mutate("CDU_CSU" = str_detect(fraktion, "CDU/CSU"), |
|
|
|
"SPD" = str_detect(fraktion, "SPD"), |
|
|
|
"FDP" = str_detect(fraktion, "FDP"), |
|
|
|
"DIE_LINKE" = str_detect(fraktion, "DIE LINKE"), |
|
|
|
"BÜNDNIS_90_DIE_GRÜNEN" = str_detect(fraktion, "BÜNDNIS 90/DIE GRÜNEN"), |
|
|
|
"AfD" = str_detect(fraktion, "AfD")) %>% |
|
|
|
select(-fraktion) -> |
|
|
|
applause |
|
|
|
|
|
|
|
list(redner = redner, reden = reden, talks = talks, comments = comments, applause = applause) |
|
|
|
} |
|
|
|
|
|
|
|
# Reads all currently parseable data from a single XML file. |
|
|
|
@@ -43,6 +59,8 @@ read_one <- function(name, path) { |
|
|
|
x <- tryCatch(read_xml(paste0(path, name)), |
|
|
|
error = function(c) NULL) |
|
|
|
if (is.null(x)) return(NULL) |
|
|
|
# extract date of session |
|
|
|
date <- xml_attr(x, "sitzung-datum") |
|
|
|
cs <- xml_children(x) |
|
|
|
|
|
|
|
verlauf <- xml_find_first(x, "sitzungsverlauf") |
|
|
|
@@ -54,7 +72,7 @@ read_one <- function(name, path) { |
|
|
|
|
|
|
|
xml_children(verlauf) %>% |
|
|
|
xml_find_all("rede") %>% |
|
|
|
parse_redenliste() -> |
|
|
|
parse_redenliste(date) -> |
|
|
|
res |
|
|
|
|
|
|
|
list(redner = redner, reden = res$reden, talks = res$talks, comments = res$comments) |
|
|
|
@@ -86,7 +104,7 @@ parse_redner <- function(redner_xml) { |
|
|
|
# Parse a single rede. |
|
|
|
# Returns: - the rede itself (with its rede id and the id of its principal redner) |
|
|
|
# - all talks appearing in the rede (with corresponding content), as well as its comments |
|
|
|
parse_rede <- function(rede_xml) { |
|
|
|
parse_rede <- function(rede_xml, date) { |
|
|
|
rede_id <- xml_attr(rede_xml, "id") |
|
|
|
cs <- xml_children(rede_xml) |
|
|
|
cur_redner <- NA_character_ |
|
|
|
@@ -132,7 +150,7 @@ parse_rede <- function(rede_xml) { |
|
|
|
redner = cur_redner, |
|
|
|
content = cur_content) |
|
|
|
reden <- c(reden, list(rede)) |
|
|
|
list(rede = c(id = rede_id, redner = principal_redner), |
|
|
|
list(rede = c(id = rede_id, redner = principal_redner, date = date), |
|
|
|
parts = reden, |
|
|
|
comments = comments) |
|
|
|
} |
|
|
|
@@ -142,16 +160,13 @@ fraktionsnames <- c("BÜNDNIS 90/DIE GRÜNEN", "CDU/CSU", "AfD", "SPD", "DIE LIN |
|
|
|
|
|
|
|
parse_comment <- function(comment, rede_id, on_redner) { |
|
|
|
base <- c(rede_id = rede_id, on_redner = on_redner) |
|
|
|
str_extract_all(comment, fraktionspattern) %>% |
|
|
|
`[[`(1) %>% |
|
|
|
sapply(partial(flip(head), 1) %.% agrep, x=fraktionsnames, max=0.2, value=T) %>% |
|
|
|
str_c(collapse=",") -> |
|
|
|
by |
|
|
|
# classify comment |
|
|
|
# TODO: |
|
|
|
# - actually separate content properly |
|
|
|
# - differentiate between [AfD] and AfD in by |
|
|
|
if(str_detect(comment, "Beifall")) { |
|
|
|
str_extract_all(comment, fraktionspattern) %>% |
|
|
|
`[[`(1) %>% |
|
|
|
sapply(partial(flip(head), 1) %.% agrep, x=fraktionsnames, max=0.2, value=T) %>% |
|
|
|
str_c(collapse=",") -> |
|
|
|
by |
|
|
|
c(base, type = "applause", fraktion = by, kommentator = NA_character_, content = comment) |
|
|
|
} else { |
|
|
|
ps <- str_match(comment, "(.*) \\[(.*?)\\]: (.*)")[1,] |
|
|
|
@@ -160,12 +175,13 @@ parse_comment <- function(comment, rede_id, on_redner) { |
|
|
|
} |
|
|
|
|
|
|
|
# Creates a tibble of reden and a tibble of talks from a list of XML nodes, each representing one rede. |
|
|
|
parse_redenliste <- function(redenliste_xml) { |
|
|
|
d <- sapply(redenliste_xml, parse_rede) |
|
|
|
parse_redenliste <- function(redenliste_xml, date) { |
|
|
|
d <- sapply(redenliste_xml, parse_rede, date = date) |
|
|
|
reden <- simplify2array(d["rede", ]) |
|
|
|
parts <- simplify2array %$% unlist(d["parts", ], recursive=FALSE) |
|
|
|
comments <- simplify2array %$% unlist(d["comments", ], recursive=FALSE) |
|
|
|
list(reden = tibble(id = reden["id",], redner = reden["redner",]), |
|
|
|
list(reden = tibble(id = reden["id",], redner = reden["redner",], |
|
|
|
date = reden["date",]), |
|
|
|
talks = tibble(rede_id = parts["rede_id", ], |
|
|
|
redner = parts["redner", ], |
|
|
|
content = parts["content", ]), |
|
|
|
@@ -196,6 +212,7 @@ write_to_csv <- function(tables, path="csv/", create=F) { |
|
|
|
write.table(tables$reden, str_c(path, "reden.csv")) |
|
|
|
write.table(tables$talks, str_c(path, "talks.csv")) |
|
|
|
write.table(tables$comments, str_c(path, "comments.csv")) |
|
|
|
write.table(tables$applause, str_c(path, "applause.csv")) |
|
|
|
} |
|
|
|
|
|
|
|
#' @export |
|
|
|
@@ -207,7 +224,8 @@ read_from_csv <- function(path="csv/") { |
|
|
|
tibble() %>% |
|
|
|
mutate(redner = as.character(redner)), |
|
|
|
talks = tibble %$% read.table(str_c(path, "talks.csv")), |
|
|
|
comments = tibble %$% read.table(str_c(path, "comments.csv"))) |
|
|
|
comments = tibble %$% read.table(str_c(path, "comments.csv")), |
|
|
|
applause = tibble %$% read.table(str_c(path, "applause.csv"))) |
|
|
|
} |
|
|
|
|
|
|
|
# ------------------------------- |
|
|
|
|