| @@ -52,7 +52,7 @@ read_one <- function(name, path) { | |||
| parse_redenliste() -> | |||
| res | |||
| list(redner = redner, reden = res$reden, talks = res$talks) | |||
| list(redner = redner, reden = res$reden, talks = res$talks, comments = res$comments) | |||
| } | |||
| xml_get <- function(node, name) { | |||
| @@ -88,10 +88,11 @@ parse_rede <- function(rede_xml) { | |||
| principal_redner <- NA_character_ | |||
| cur_content <- "" | |||
| reden <- list() | |||
| comments <- list() | |||
| for (node in cs) { | |||
| if (xml_name(node) == "p") { | |||
| if (xml_name(node) == "p" || xml_name(node) == "name") { | |||
| klasse <- xml_attr(node, "klasse") | |||
| if (!is.na(klasse) && klasse == "redner") { | |||
| if ((!is.na(klasse) && klasse == "redner") || xml_name(node) == "name") { | |||
| if (!is.na(cur_redner)) { | |||
| rede <- c(rede_id = rede_id, | |||
| redner = cur_redner, | |||
| @@ -101,10 +102,24 @@ parse_rede <- function(rede_xml) { | |||
| } else { | |||
| principal_redner <- xml_child(node) %>% xml_attr("id") | |||
| } | |||
| cur_redner <- xml_child(node) %>% xml_attr("id") | |||
| if (xml_name(node) == "name") { | |||
| cur_redner <- "BTP" | |||
| } else { | |||
| cur_redner <- xml_child(node) %>% xml_attr("id") | |||
| } | |||
| } else { | |||
| cur_content <- paste0(cur_content, xml_text(node), sep="\n") | |||
| } | |||
| } else if (xml_name(node) == "kommentar") { | |||
| # comments are of the form | |||
| # <kommentar>(blabla [Fraktion] – blabla liasdf – bla)</kommentar> | |||
| xml_text(node) %>% | |||
| str_sub(2, -2) %>% | |||
| str_split("–") %>% | |||
| `[[`(1) %>% | |||
| lapply(parse_comment, rede_id = rede_id, on_redner = cur_redner) -> | |||
| cs | |||
| comments <- c(comments, cs) | |||
| } | |||
| } | |||
| rede <- c(rede_id = rede_id, | |||
| @@ -112,7 +127,27 @@ parse_rede <- function(rede_xml) { | |||
| content = cur_content) | |||
| reden <- c(reden, list(rede)) | |||
| list(rede = c(id = rede_id, redner = principal_redner), | |||
| parts = reden) | |||
| parts = reden, | |||
| comments = comments) | |||
| } | |||
# Regular expression (alternation) of the group labels searched for inside a
# comment fragment; every match ends up in the `by` entry built by
# parse_comment().
fraktionspattern <- "BÜNDNIS 90/DIE GRÜNEN|CDU/CSU|AfD|SPD|DIE LINKE|FDP"

# Build one record for a single comment fragment.
#
# comment:   text of the fragment; stored unchanged in the `content` entry
# rede_id:   value stored in the `rede_id` entry
# on_redner: value stored in the `on_redner` entry
#
# Returns a named vector with the entries rede_id, on_redner, type, by and
# content. `type` is "applause" when the text contains "Beifall" and
# "comment" otherwise; `by` holds all matches of fraktionspattern joined by
# commas.
parse_comment <- function(comment, rede_id, on_redner) {
  # Combined on its own first so that the later c() only appends to it.
  record_head <- c(rede_id = rede_id, on_redner = on_redner)

  # str_extract_all() returns a list with one element per input string;
  # only the first element is used.
  matched_groups <- str_extract_all(comment, fraktionspattern)[[1]]
  groups <- str_c(matched_groups, collapse = ",")

  # classify comment
  # TODO:
  # - actually separate content properly
  # - differentiate between [AfD] and AfD in by
  is_applause <- str_detect(comment, "Beifall")
  kind <- if (is_applause) "applause" else "comment"

  c(record_head, type = kind, by = groups, content = comment)
}
| # creates tibbles of reden, talks and comments from a list of xml nodes representing reden | |||
| @@ -120,10 +155,16 @@ parse_redenliste <- function(redenliste_xml) { | |||
| d <- sapply(redenliste_xml, parse_rede) | |||
| reden <- simplify2array(d["rede", ]) | |||
| parts <- simplify2array %$% unlist(d["parts", ], recursive=FALSE) | |||
| comments <- simplify2array %$% unlist(d["comments", ], recursive=FALSE) | |||
| list(reden = tibble(id = reden["id",], redner = reden["redner",]), | |||
| talks = tibble(rede_id = parts["rede_id", ], | |||
| redner = parts["redner", ], | |||
| content = parts["content", ])) | |||
| content = parts["content", ]), | |||
| comments = tibble(rede_id = comments["rede_id",], | |||
| on_redner = comments["on_redner",], | |||
| type = comments["type",], | |||
| by = comments["by",], | |||
| content = comments["content", ])) | |||
| } | |||
| # create a tibble of redner from a list of xml nodes representing redner | |||
| @@ -142,7 +183,7 @@ parse_rednerliste <- function(rednerliste_xml) { | |||
| # EXAMPLE USE | |||
| # make sure data is downloaded via fetch.R | |||
| # res <- read_one("19126-data.xml") | |||
| res <- read_one("records/19126-data.xml") | |||
| # | |||
| # res$redner | |||
| # res$reden | |||