| @@ -52,7 +52,7 @@ read_one <- function(name, path) { | |||||
| parse_redenliste() -> | parse_redenliste() -> | ||||
| res | res | ||||
| list(redner = redner, reden = res$reden, talks = res$talks) | |||||
| list(redner = redner, reden = res$reden, talks = res$talks, comments = res$comments) | |||||
| } | } | ||||
| xml_get <- function(node, name) { | xml_get <- function(node, name) { | ||||
| @@ -88,10 +88,11 @@ parse_rede <- function(rede_xml) { | |||||
| principal_redner <- NA_character_ | principal_redner <- NA_character_ | ||||
| cur_content <- "" | cur_content <- "" | ||||
| reden <- list() | reden <- list() | ||||
| comments <- list() | |||||
| for (node in cs) { | for (node in cs) { | ||||
| if (xml_name(node) == "p") { | |||||
| if (xml_name(node) == "p" || xml_name(node) == "name") { | |||||
| klasse <- xml_attr(node, "klasse") | klasse <- xml_attr(node, "klasse") | ||||
| if (!is.na(klasse) && klasse == "redner") { | |||||
| if ((!is.na(klasse) && klasse == "redner") || xml_name(node) == "name") { | |||||
| if (!is.na(cur_redner)) { | if (!is.na(cur_redner)) { | ||||
| rede <- c(rede_id = rede_id, | rede <- c(rede_id = rede_id, | ||||
| redner = cur_redner, | redner = cur_redner, | ||||
| @@ -101,10 +102,24 @@ parse_rede <- function(rede_xml) { | |||||
| } else { | } else { | ||||
| principal_redner <- xml_child(node) %>% xml_attr("id") | principal_redner <- xml_child(node) %>% xml_attr("id") | ||||
| } | } | ||||
| cur_redner <- xml_child(node) %>% xml_attr("id") | |||||
| if (xml_name(node) == "name") { | |||||
| cur_redner <- "BTP" | |||||
| } else { | |||||
| cur_redner <- xml_child(node) %>% xml_attr("id") | |||||
| } | |||||
| } else { | } else { | ||||
| cur_content <- paste0(cur_content, xml_text(node), sep="\n") | cur_content <- paste0(cur_content, xml_text(node), sep="\n") | ||||
| } | } | ||||
| } else if (xml_name(node) == "kommentar") { | |||||
| # comments are of the form | |||||
| # <kommentar>(blabla [Fraktion] – blabla liasdf – bla)</kommentar> | |||||
| xml_text(node) %>% | |||||
| str_sub(2, -2) %>% | |||||
| str_split("–") %>% | |||||
| `[[`(1) %>% | |||||
| lapply(parse_comment, rede_id = rede_id, on_redner = cur_redner) -> | |||||
| cs | |||||
| comments <- c(comments, cs) | |||||
| } | } | ||||
| } | } | ||||
| rede <- c(rede_id = rede_id, | rede <- c(rede_id = rede_id, | ||||
| @@ -112,7 +127,27 @@ parse_rede <- function(rede_xml) { | |||||
| content = cur_content) | content = cur_content) | ||||
| reden <- c(reden, list(rede)) | reden <- c(reden, list(rede)) | ||||
| list(rede = c(id = rede_id, redner = principal_redner), | list(rede = c(id = rede_id, redner = principal_redner), | ||||
| parts = reden) | |||||
| parts = reden, | |||||
| comments = comments) | |||||
| } | |||||
# Alternation of the parliamentary group names that are searched for in
# comment text.
fraktionspattern <- "BÜNDNIS 90/DIE GRÜNEN|CDU/CSU|AfD|SPD|DIE LINKE|FDP"

# Turns a single comment fragment into a named character vector.
#
# Arguments:
#   comment   - text of one comment fragment
#   rede_id   - id of the rede the fragment belongs to
#   on_redner - redner that was current when the fragment occurred
#
# Returns c(rede_id, on_redner, type, by, content), where
#   type    is "applause" if the text contains "Beifall", else "comment",
#   by      is every match of `fraktionspattern` in the text, joined by ",",
#   content is the fragment text, unchanged.
parse_comment <- function(comment, rede_id, on_redner) {
  ids <- c(rede_id = rede_id, on_redner = on_redner)

  # collect every group name mentioned in the fragment
  mentioned <- str_extract_all(comment, fraktionspattern)[[1]]
  by <- str_c(mentioned, collapse = ",")

  # classify comment
  # TODO:
  # - actually separate content properly
  # - differentiate between [AfD] and AfD in by
  type <- if (str_detect(comment, "Beifall")) "applause" else "comment"

  c(ids, type = type, by = by, content = comment)
}
| # creates tibbles of reden, talks and comments from a list of xml nodes representing reden | # creates tibbles of reden, talks and comments from a list of xml nodes representing reden | ||||
| @@ -120,10 +155,16 @@ parse_redenliste <- function(redenliste_xml) { | |||||
| d <- sapply(redenliste_xml, parse_rede) | d <- sapply(redenliste_xml, parse_rede) | ||||
| reden <- simplify2array(d["rede", ]) | reden <- simplify2array(d["rede", ]) | ||||
| parts <- simplify2array %$% unlist(d["parts", ], recursive=FALSE) | parts <- simplify2array %$% unlist(d["parts", ], recursive=FALSE) | ||||
| comments <- simplify2array %$% unlist(d["comments", ], recursive=FALSE) | |||||
| list(reden = tibble(id = reden["id",], redner = reden["redner",]), | list(reden = tibble(id = reden["id",], redner = reden["redner",]), | ||||
| talks = tibble(rede_id = parts["rede_id", ], | talks = tibble(rede_id = parts["rede_id", ], | ||||
| redner = parts["redner", ], | redner = parts["redner", ], | ||||
| content = parts["content", ])) | |||||
| content = parts["content", ]), | |||||
| comments = tibble(rede_id = comments["rede_id",], | |||||
| on_redner = comments["on_redner",], | |||||
| type = comments["type",], | |||||
| by = comments["by",], | |||||
| content = comments["content", ])) | |||||
| } | } | ||||
| # create a tibble of redner from a list of xml nodes representing redner | # create a tibble of redner from a list of xml nodes representing redner | ||||
| @@ -142,7 +183,7 @@ parse_rednerliste <- function(rednerliste_xml) { | |||||
| # EXAMPLE USE | # EXAMPLE USE | ||||
| # make sure data is downloaded via fetch.R | # make sure data is downloaded via fetch.R | ||||
| # res <- read_one("19126-data.xml") | |||||
| res <- read_one("records/19126-data.xml") | |||||
| # | # | ||||
| # res$redner | # res$redner | ||||
| # res$reden | # res$reden | ||||