From 4fe45feec9233cfa39b3d801ba1e3b106f6828b0 Mon Sep 17 00:00:00 2001 From: flavis Date: Mon, 2 Aug 2021 16:12:18 +0200 Subject: [PATCH] solve most of predefined challenges --- DESCRIPTION | 1 + NAMESPACE | 3 ++ R/analyze.R | 19 +++++++++ R/hateimparlament-package.R | 1 + R/parse.R | 54 +++++++++++++++++--------- R/repair.R | 7 ++-- vignettes/funwithdata.Rmd | 77 +++++++++++++++++++++++++++++++++++-- 7 files changed, 138 insertions(+), 24 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index 71133cb..a0170ad 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -21,6 +21,7 @@ Imports: rvest, stringr, tibble, + tidyr, xml2 Suggests: rmarkdown, diff --git a/NAMESPACE b/NAMESPACE index 9b21836..d99fd6f 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -1,16 +1,19 @@ # Generated by roxygen2: do not edit by hand +export(bar_plot_fraktionen) export(fetch_all) export(find_word) export(join_redner) export(read_all) export(read_from_csv) export(repair) +export(word_usage_by_date) export(write_to_csv) import(dplyr) import(pbapply) import(purrr) import(stringr) import(tibble) +import(tidyr) import(utils) import(xml2) diff --git a/R/analyze.R b/R/analyze.R index dfb12b7..18dd453 100644 --- a/R/analyze.R +++ b/R/analyze.R @@ -29,3 +29,22 @@ bar_plot_fraktionen <- function(tb) { scale_fill_manual(values = party_colors) + geom_bar(stat = "identity") } + +# Counts how many talks match a given pattern and summarises by date + # +#' @export +word_usage_by_date <- function(res, patterns, name, tidy=F) { + tb <- res$talks + nms <- names(patterns) + for (i in seq_along(patterns)) { + if (!is.null(nms)) name <- nms[[i]] + else name <- patterns[[i]] + tb <- mutate(tb, {{name}} := str_count(content, patterns[[i]])) + } + left_join(tb, res$reden, by=c("rede_id" = "id")) %>% + group_by(date) %>% + summarize(across(where(is.numeric), sum)) %>% + arrange(date) -> tb + if (!tidy) pivot_longer(tb, where(is.numeric) , names_to = "pattern", values_to="count") + else tb +} diff --git 
a/R/hateimparlament-package.R b/R/hateimparlament-package.R index 542609b..eab8891 100644 --- a/R/hateimparlament-package.R +++ b/R/hateimparlament-package.R @@ -6,6 +6,7 @@ #' @import stringr #' @import xml2 #' @import utils +#' @import tidyr #' @import purrr #' @keywords internal "_PACKAGE" diff --git a/R/parse.R b/R/parse.R index d3d2476..40b6045 100644 --- a/R/parse.R +++ b/R/parse.R @@ -20,7 +20,8 @@ read_all <- function(path="records/") { lapply(res, `[[`, "reden") %>% bind_rows() %>% - distinct() -> + distinct() %>% + mutate(date = as.Date(date, format="%d.%m.%Y")) -> reden lapply(res, `[[`, "talks") %>% @@ -31,11 +32,26 @@ read_all <- function(path="records/") { lapply(res, `[[`, "comments") %>% bind_rows() %>% distinct() -> - comments + commentsandapplause if (length(available_protocols) == 0) warning("The given directory is empty or does not exist.") - list(redner = redner, reden = reden, talks = talks, comments = comments) + + filter(commentsandapplause, type == "comment") %>% + select(-type) -> + comments + filter(commentsandapplause, type == "applause") %>% + select(-type, -kommentator, -content) %>% + mutate("CDU_CSU" = str_detect(fraktion, "CDU/CSU"), + "SPD" = str_detect(fraktion, "SPD"), + "FDP" = str_detect(fraktion, "FDP"), + "DIE_LINKE" = str_detect(fraktion, "DIE LINKE"), + "BÜNDNIS_90_DIE_GRÜNEN" = str_detect(fraktion, "BÜNDNIS 90/DIE GRÜNEN"), + "AfD" = str_detect(fraktion, "AfD")) %>% + select(-fraktion) -> + applause + + list(redner = redner, reden = reden, talks = talks, comments = comments, applause = applause) } # this reads all currently parseable data from one xml @@ -43,6 +59,8 @@ read_one <- function(name, path) { x <- tryCatch(read_xml(paste0(path, name)), error = function(c) NULL) if (is.null(x)) return(NULL) + # extract date of session + date <- xml_attr(x, "sitzung-datum") cs <- xml_children(x) verlauf <- xml_find_first(x, "sitzungsverlauf") @@ -54,7 +72,7 @@ read_one <- function(name, path) { xml_children(verlauf) %>% 
xml_find_all("rede") %>% - parse_redenliste() -> + parse_redenliste(date) -> res list(redner = redner, reden = res$reden, talks = res$talks, comments = res$comments) @@ -86,7 +104,7 @@ parse_redner <- function(redner_xml) { # parse one rede # returns: - a rede (with rede id and redner id) # - all talks appearing in the rede (with corresponding content) -parse_rede <- function(rede_xml) { +parse_rede <- function(rede_xml, date) { rede_id <- xml_attr(rede_xml, "id") cs <- xml_children(rede_xml) cur_redner <- NA_character_ @@ -132,7 +150,7 @@ parse_rede <- function(rede_xml) { redner = cur_redner, content = cur_content) reden <- c(reden, list(rede)) - list(rede = c(id = rede_id, redner = principal_redner), + list(rede = c(id = rede_id, redner = principal_redner, date = date), parts = reden, comments = comments) } @@ -142,16 +160,13 @@ fraktionsnames <- c("BÜNDNIS 90/DIE GRÜNEN", "CDU/CSU", "AfD", "SPD", "DIE LIN parse_comment <- function(comment, rede_id, on_redner) { base <- c(rede_id = rede_id, on_redner = on_redner) - str_extract_all(comment, fraktionspattern) %>% - `[[`(1) %>% - sapply(partial(flip(head), 1) %.% agrep, x=fraktionsnames, max=0.2, value=T) %>% - str_c(collapse=",") -> - by # classify comment - # TODO: - # - actually separate content properly - # - differentiate between [AfD] and AfD in by if(str_detect(comment, "Beifall")) { + str_extract_all(comment, fraktionspattern) %>% + `[[`(1) %>% + sapply(partial(flip(head), 1) %.% agrep, x=fraktionsnames, max=0.2, value=T) %>% + str_c(collapse=",") -> + by c(base, type = "applause", fraktion = by, kommentator = NA_character_, content = comment) } else { ps <- str_match(comment, "(.*) \\[(.*?)\\]: (.*)")[1,] @@ -160,12 +175,13 @@ parse_comment <- function(comment, rede_id, on_redner) { } # creates a tibble of reden and a tibble of talks from a list of xml nodes representing reden -parse_redenliste <- function(redenliste_xml) { - d <- sapply(redenliste_xml, parse_rede) +parse_redenliste <- 
function(redenliste_xml, date) { + d <- sapply(redenliste_xml, parse_rede, date = date) reden <- simplify2array(d["rede", ]) parts <- simplify2array %$% unlist(d["parts", ], recursive=FALSE) comments <- simplify2array %$% unlist(d["comments", ], recursive=FALSE) - list(reden = tibble(id = reden["id",], redner = reden["redner",]), + list(reden = tibble(id = reden["id",], redner = reden["redner",], + date = reden["date",]), talks = tibble(rede_id = parts["rede_id", ], redner = parts["redner", ], content = parts["content", ]), @@ -196,6 +212,7 @@ write_to_csv <- function(tables, path="csv/", create=F) { write.table(tables$reden, str_c(path, "reden.csv")) write.table(tables$talks, str_c(path, "talks.csv")) write.table(tables$comments, str_c(path, "comments.csv")) + write.table(tables$applause, str_c(path, "applause.csv")) } #' @export @@ -207,7 +224,8 @@ read_from_csv <- function(path="csv/") { tibble() %>% mutate(redner = as.character(redner)), talks = tibble %$% read.table(str_c(path, "talks.csv")), - comments = tibble %$% read.table(str_c(path, "comments.csv"))) + comments = tibble %$% read.table(str_c(path, "comments.csv")), + applause = tibble %$% read.table(str_c(path, "applause.csv"))) } # ------------------------------- diff --git a/R/repair.R b/R/repair.R index 53ff264..21d5738 100644 --- a/R/repair.R +++ b/R/repair.R @@ -47,8 +47,8 @@ repair_reden <- function(reden) { repair_talks <- function(talks) { if (nrow(talks) == 0) return(talks) - # TODO: fill with content - talks + # ignore all talks which have empty content + filter(talks, str_length(content) > 0) } # tries to find the correct redner id given a name @@ -90,6 +90,7 @@ repair <- function(parse_output) { reden = repair_reden(parse_output$reden), talks = repair_talks(parse_output$talks), #comments = repair_comments(parse_output$comments) - comments = parse_output$comments + comments = parse_output$comments, + applause = parse_output$applause ) } diff --git a/vignettes/funwithdata.Rmd 
b/vignettes/funwithdata.Rmd index 2e6bc28..6fc9370 100644 --- a/vignettes/funwithdata.Rmd +++ b/vignettes/funwithdata.Rmd @@ -48,8 +48,8 @@ talks <- res$talks ## Analysis Now we can start analysing our parsed dataset, e.g. find out which party gives the most talks: -```{r, fig.width=10} -join_redner(reden, res) %>% +```{r, fig.width=8} +join_redner(res$reden, res) %>% group_by(fraktion) %>% summarize(n = n()) %>% arrange(n) %>% @@ -58,7 +58,7 @@ join_redner(reden, res) %>% ### Count a word occurence -```{r, fig.width=10} +```{r, fig.width=8} find_word(res, "hitler") %>% filter(occurences > 0) %>% join_redner(res) %>% @@ -68,3 +68,74 @@ find_word(res, "hitler") %>% arrange(desc(n)) %>% bar_plot_fraktionen() ``` + +### Who gives the most speeches? + +```{r} +res$reden %>% + group_by(redner) %>% + summarize(n = n()) %>% + arrange(-n) %>% + left_join(res$redner, by=c("redner" = "id")) %>% + head(10) +``` + +### Who talks the longest? + +```{r} +res$talks %>% + mutate(content_len = str_length(content)) %>% + group_by(redner) %>% + summarize(avg_content_len = mean(content_len)) %>% + arrange(-avg_content_len) %>% + left_join(res$redner, by=c("redner" = "id")) %>% + head(10) +``` + +### Which party gives the most applause to which parties? + +```{r} +res$applause %>% + left_join(res$redner, by=c("on_redner" = "id")) %>% + select(on_fraktion = fraktion, where(is.logical)) %>% + group_by(on_fraktion) %>% + arrange(on_fraktion) %>% + summarize("AfD" = sum(`AfD`), + "BÜNDNIS 90 / DIE GRÜNEN" = sum(`BÜNDNIS_90_DIE_GRÜNEN`), + "CDU/CSU" = sum(`CDU_CSU`), + "DIE LINKE" = sum(`DIE_LINKE`), + "FDP" = sum(`FDP`), + "SPD" = sum(`SPD`)) +``` + +### Which party comments the most on which parties? 
+ +```{r} +res$comments %>% + left_join(res$redner, by=c("on_redner" = "id")) %>% + select(by_fraktion = fraktion.x, on_fraktion = fraktion.y) %>% + group_by(on_fraktion) %>% + summarize(`AfD` = sum(str_detect(by_fraktion, "AfD"), na.rm=T), + `BÜNDNIS 90 / DIE GRÜNEN` = sum(str_detect(by_fraktion, "BÜNDNIS 90/DIE GRÜNEN"), na.rm=T), + `CDU/CSU` = sum(str_detect(by_fraktion, "CDU/CSU"), na.rm = T), + `DIE LINKE` = sum(str_detect(by_fraktion, "DIE LINKE"), na.rm=T), + `FDP` = sum(str_detect(by_fraktion, "FDP"), na.rm=T), + `SPD` = sum(str_detect(by_fraktion, "SPD"), na.rm=T)) +``` + +### When are which topics discussed the most? + +```{r, fig.width=8} +pandemic_pattern <- "(?i)virus|corona|covid|lockdown" +climate_pattern <- "(?i)klimawandel|erderwärmung|co2|treibhaus|methan|kyoto-protokoll|klimaabkommen" +pension_pattern <- "(?i)rente|pension|altersarmut" + +word_usage_by_date(res, c(pandemic = pandemic_pattern, + climate = climate_pattern, + pension = pension_pattern)) %>% + ggplot(aes(x = date, y = count, color = pattern)) + + xlab("date of session") + + ylab("occurence of word per session") + + labs(color = "Topic") + + geom_point() +```