diff --git a/R/parse.R b/R/parse.R index 4954302..de4de17 100644 --- a/R/parse.R +++ b/R/parse.R @@ -52,7 +52,7 @@ read_one <- function(name, path) { parse_redenliste() -> res - list(redner = redner, reden = res$reden, talks = res$talks) + list(redner = redner, reden = res$reden, talks = res$talks, comments = res$comments) } xml_get <- function(node, name) { @@ -88,10 +88,11 @@ parse_rede <- function(rede_xml) { principal_redner <- NA_character_ cur_content <- "" reden <- list() + comments <- list() for (node in cs) { - if (xml_name(node) == "p") { + if (xml_name(node) == "p" || xml_name(node) == "name") { klasse <- xml_attr(node, "klasse") - if (!is.na(klasse) && klasse == "redner") { + if ((!is.na(klasse) && klasse == "redner") || xml_name(node) == "name") { if (!is.na(cur_redner)) { rede <- c(rede_id = rede_id, redner = cur_redner, @@ -101,10 +102,24 @@ parse_rede <- function(rede_xml) { } else { principal_redner <- xml_child(node) %>% xml_attr("id") } - cur_redner <- xml_child(node) %>% xml_attr("id") + if (xml_name(node) == "name") { + cur_redner <- "BTP" + } else { + cur_redner <- xml_child(node) %>% xml_attr("id") + } } else { cur_content <- paste0(cur_content, xml_text(node), sep="\n") } + } else if (xml_name(node) == "kommentar") { + # comments are of the form + # (blabla [Fraktion] – blabla liasdf – bla) + xml_text(node) %>% + str_sub(2, -2) %>% + str_split("–") %>% + `[[`(1) %>% + lapply(parse_comment, rede_id = rede_id, on_redner = cur_redner) -> + cs + comments <- c(comments, cs) } } rede <- c(rede_id = rede_id, @@ -112,7 +127,27 @@ parse_rede <- function(rede_xml) { content = cur_content) reden <- c(reden, list(rede)) list(rede = c(id = rede_id, redner = principal_redner), - parts = reden) + parts = reden, + comments = comments) +} + +fraktionspattern <- "BÜNDNIS 90/DIE GRÜNEN|CDU/CSU|AfD|SPD|DIE LINKE|FDP" + +parse_comment <- function(comment, rede_id, on_redner) { + base <- c(rede_id = rede_id, on_redner = on_redner) + str_extract_all(comment, fraktionspattern) %>% + `[[`(1) %>% + str_c(collapse=",") -> + by + # classify comment + # TODO: + # - actually separate content properly + # - differentiate between [AfD] and AfD in by + if(str_detect(comment, "Beifall")) { + c(base, type = "applause", by = by, content = comment) + } else { + c(base, type = "comment", by = by, content = comment) + } } # creates a tibble of reden and a tibble of talks from a list of xml nodes representing reden @@ -120,10 +155,16 @@ parse_redenliste <- function(redenliste_xml) { d <- sapply(redenliste_xml, parse_rede) reden <- simplify2array(d["rede", ]) parts <- simplify2array %$% unlist(d["parts", ], recursive=FALSE) + comments <- simplify2array %$% unlist(d["comments", ], recursive=FALSE) list(reden = tibble(id = reden["id",], redner = reden["redner",]), talks = tibble(rede_id = parts["rede_id", ], redner = parts["redner", ], - content = parts["content", ])) + content = parts["content", ]), + comments = tibble(rede_id = comments["rede_id",], + on_redner = comments["on_redner",], + type = comments["type",], + by = comments["by",], + content = comments["content", ])) } # create a tibble of redner from a list of xml nodes representing redner @@ -142,7 +183,7 @@ parse_rednerliste <- function(rednerliste_xml) { # EXAMPLE USE # make sure data ist downloaded via fetch.R -# res <- read_one("19126-data.xml") +res <- read_one("records/19126-data.xml") # # res$redner # res$reden