ソースを参照

properly extract BTP talks and add bare comment parsing

genderequality-alternative
flavis 4年前
コミット
ab55363c4e
1個のファイルの変更48行の追加7行の削除
  1. +48
    -7
      R/parse.R

+ 48
- 7
R/parse.R ファイルの表示

@@ -52,7 +52,7 @@ read_one <- function(name, path) {
parse_redenliste() ->
res

list(redner = redner, reden = res$reden, talks = res$talks)
list(redner = redner, reden = res$reden, talks = res$talks, comments = res$comments)
}

xml_get <- function(node, name) {
@@ -88,10 +88,11 @@ parse_rede <- function(rede_xml) {
principal_redner <- NA_character_
cur_content <- ""
reden <- list()
comments <- list()
for (node in cs) {
if (xml_name(node) == "p") {
if (xml_name(node) == "p" || xml_name(node) == "name") {
klasse <- xml_attr(node, "klasse")
if (!is.na(klasse) && klasse == "redner") {
if ((!is.na(klasse) && klasse == "redner") || xml_name(node) == "name") {
if (!is.na(cur_redner)) {
rede <- c(rede_id = rede_id,
redner = cur_redner,
@@ -101,10 +102,24 @@ parse_rede <- function(rede_xml) {
} else {
principal_redner <- xml_child(node) %>% xml_attr("id")
}
cur_redner <- xml_child(node) %>% xml_attr("id")
if (xml_name(node) == "name") {
cur_redner <- "BTP"
} else {
cur_redner <- xml_child(node) %>% xml_attr("id")
}
} else {
cur_content <- paste0(cur_content, xml_text(node), sep="\n")
}
} else if (xml_name(node) == "kommentar") {
# comments are of the form
# <kommentar>(blabla [Fraktion] – blabla liasdf – bla)</kommentar>
xml_text(node) %>%
str_sub(2, -2) %>%
str_split("–") %>%
`[[`(1) %>%
lapply(parse_comment, rede_id = rede_id, on_redner = cur_redner) ->
cs
comments <- c(comments, cs)
}
}
rede <- c(rede_id = rede_id,
@@ -112,7 +127,27 @@ parse_rede <- function(rede_xml) {
content = cur_content)
reden <- c(reden, list(rede))
list(rede = c(id = rede_id, redner = principal_redner),
parts = reden)
parts = reden,
comments = comments)
}

# Regex alternation of the parliamentary group names that parse_comment
# searches for in the text of a comment.
fraktionspattern <- "BÜNDNIS 90/DIE GRÜNEN|CDU/CSU|AfD|SPD|DIE LINKE|FDP"

# Turns one comment fragment into a flat named vector describing it.
#
# comment:   text of a single comment fragment
# rede_id:   id of the rede the fragment belongs to
# on_redner: stored as-is under on_redner (parse_rede passes the current
#            speaker here)
#
# The result carries rede_id, on_redner, type, by and content:
# - type is "applause" when the text contains "Beifall", else "comment"
# - by is every match of fraktionspattern in the text, joined with ","
# - content is the unmodified comment text
parse_comment <- function(comment, rede_id, on_redner) {
  head <- c(rede_id = rede_id, on_redner = on_redner)

  fraktionen <- str_extract_all(comment, fraktionspattern)[[1]]
  by <- str_c(fraktionen, collapse = ",")

  # classify comment
  # TODO:
  # - actually separate content properly
  # - differentiate between [AfD] and AfD in by
  type <- if (str_detect(comment, "Beifall")) "applause" else "comment"

  c(head, type = type, by = by, content = comment)
}

# creates tibbles of reden, talks and comments from a list of xml nodes representing reden
@@ -120,10 +155,16 @@ parse_redenliste <- function(redenliste_xml) {
d <- sapply(redenliste_xml, parse_rede)
reden <- simplify2array(d["rede", ])
parts <- simplify2array %$% unlist(d["parts", ], recursive=FALSE)
comments <- simplify2array %$% unlist(d["comments", ], recursive=FALSE)
list(reden = tibble(id = reden["id",], redner = reden["redner",]),
talks = tibble(rede_id = parts["rede_id", ],
redner = parts["redner", ],
content = parts["content", ]))
content = parts["content", ]),
comments = tibble(rede_id = comments["rede_id",],
on_redner = comments["on_redner",],
type = comments["type",],
by = comments["by",],
content = comments["content", ]))
}

# create a tibble of redner from a list of xml nodes representing redner
@@ -142,7 +183,7 @@ parse_rednerliste <- function(rednerliste_xml) {
# EXAMPLE USE

# make sure data is downloaded via fetch.R
# res <- read_one("19126-data.xml")
res <- read_one("records/19126-data.xml")
#
# res$redner
# res$reden


読み込み中…
キャンセル
保存