Просмотр исходного кода

improve comment parsing and add comment commentator redner matching

genderequality-alternative
flavis 4 лет назад
Родитель
Сommit
f09cbac172
2 измененных файлов: 56 добавлений и 10 удалений
  1. +14
    -6
      R/parse.R
  2. +42
    -4
      R/repair.R

+ 14
- 6
R/parse.R Просмотреть файл

@@ -28,9 +28,14 @@ read_all <- function(path="records/") {
distinct() ->
talks

lapply(res, `[[`, "comments") %>%
bind_rows() %>%
distinct() ->
comments

if (length(available_protocols) == 0)
warning("The given directory is empty or does not exist.")
list(redner = redner, reden = reden, talks = talks)
list(redner = redner, reden = reden, talks = talks, comments = comments)
}

# this reads all currently parseable data from one xml
@@ -99,7 +104,8 @@ parse_rede <- function(rede_xml) {
content = cur_content)
reden <- c(reden, list(rede))
cur_content <- ""
} else {
}
if (is.na(principal_redner) && xml_name(node) != "name") {
principal_redner <- xml_child(node) %>% xml_attr("id")
}
if (xml_name(node) == "name") {
@@ -144,9 +150,10 @@ parse_comment <- function(comment, rede_id, on_redner) {
# - actually separate content properly
# - differentiate between [AfD] and AfD in by
if(str_detect(comment, "Beifall")) {
c(base, type = "applause", by = by, content = comment)
c(base, type = "applause", fraktion = by, kommentator = NA_character_, content = comment)
} else {
c(base, type = "comment", by = by, content = comment)
ps <- str_match(comment, "(.*) \\[(.*?)\\]: (.*)")[1,]
c(base, type = "comment", fraktion = ps[3], kommentator = ps[2], content = ps[4])
}
}

@@ -163,7 +170,8 @@ parse_redenliste <- function(redenliste_xml) {
comments = tibble(rede_id = comments["rede_id",],
on_redner = comments["on_redner",],
type = comments["type",],
by = comments["by",],
fraktion = comments["fraktion",],
kommentator = comments["kommentator",],
content = comments["content", ]))
}

@@ -183,7 +191,7 @@ parse_rednerliste <- function(rednerliste_xml) {
# EXAMPLE USE

# make sure data ist downloaded via fetch.R
res <- read_one("records/19126-data.xml")
# res <- read_one("records/19126-data.xml")
#
# res$redner
# res$reden


+ 42
- 4
R/repair.R Просмотреть файл

@@ -24,12 +24,19 @@ collect_unique <- function(xs) xs %>% clear_na() %>% unique() %>% str_c(collapse
# expects a tibble of redner and repairs
repair_redner <- function(redner) {
if (nrow(redner) == 0) return(redner)
redner %>% mutate(fraktion = Vectorize(repair_fraktion)(fraktion)) %>% # fix fraktion
group_by(id, vorname, nachname) %>%
summarize(fraktion = collect_unique(fraktion),
redner %>%
filter(id != "10000") %>% # invalid id's
mutate(fraktion = Vectorize(repair_fraktion)(fraktion)) %>% # fix fraktion
group_by(id) %>%
summarize(vorname = head(vorname, 1),
nachname = head(nachname, 1),
fraktion = collect_unique(fraktion),
titel = longest_titel(titel),
rolle_kurz = collect_unique(str_squish(rolle_kurz)),
rolle_lang = collect_unique(str_squish(rolle_lang)))
rolle_lang = collect_unique(str_squish(rolle_lang))) %>%
ungroup() #%>%
# arrange(id) %>%
# distinct(vorname, nachname, fraktion, titel)
}

repair_reden <- function(reden) {
@@ -44,6 +51,37 @@ repair_talks <- function(talks) {
talks
}

# tries to find the correct redner id given a name
# this is sufficient since every prename lastname combination in the bundestag is
# unique (luckily :D)
# returns a lookup table
lookup_redner <- function(comments, redner) {
tobereplaced <- "[-–—‑­­-­­­ ]"
redner %>%
unite(name, vorname, nachname, sep=".*") %>%
mutate(name = str_replace_all(name, tobereplaced, ".*")) ->
rs
find_match <- function(komm) {
if (komm == "") return (NA_character_)
# I tried with agrep (levensthein distance) but results are better that way
matches <- str_which(komm, rs$name)
if (length(matches) == 0) return(NA_character_)
rs[head(matches, 1), ]$id
}
comments %>%
distinct(kommentator) %>%
mutate(redner = Vectorize(find_match)(str_replace_all(kommentator, tobereplaced, "")))
}

repair_comments <- function(comments, redner) {
# try to find a redner id for each actual comment
comments %>%
filter(!is.na(kommentator)) %>%
lookup_redner(redner) %>%
left_join(comments, ., by="kommentator") %>%
select(-kommentator)
}

#' Repair parsed tables
#'
#' @export


Загрузка…
Отмена
Сохранить