| @@ -21,6 +21,7 @@ Imports: | |||
| rvest, | |||
| stringr, | |||
| tibble, | |||
| tidyr, | |||
| xml2 | |||
| Suggests: | |||
| rmarkdown, | |||
| @@ -1,16 +1,19 @@ | |||
| # Generated by roxygen2: do not edit by hand | |||
| export(bar_plot_fraktionen) | |||
| export(fetch_all) | |||
| export(find_word) | |||
| export(join_redner) | |||
| export(read_all) | |||
| export(read_from_csv) | |||
| export(repair) | |||
| export(word_usage_by_date) | |||
| export(write_to_csv) | |||
| import(dplyr) | |||
| import(pbapply) | |||
| import(purrr) | |||
| import(stringr) | |||
| import(tibble) | |||
| import(tidyr) | |||
| import(utils) | |||
| import(xml2) | |||
| @@ -12,20 +12,45 @@ join_redner <- function(tb, res, fraktion_only = F) { | |||
| else joined | |||
| } | |||
| #' @export | |||
| party_colors <- c( | |||
| SPD="#DF0B25", | |||
| "CDU/CSU"="#000000", | |||
| AfD="#1A9FDD", | |||
| "AfD&Fraktionslos"="#1A9FDD", | |||
| "DIE LINKE"="#BC3475", | |||
| "BÜNDNIS 90 / DIE GRÜNEN"="#4A932B", | |||
| FDP="#FEEB34", | |||
| "CDU/CSU"="#000000", | |||
| SPD="#DF0B25", | |||
| "BÜNDNIS 90 / DIE GRÜNEN"="#4A932B", | |||
| "DIE LINKE"="#BC3475", | |||
| "AfD&Fraktionslos"="#1A9FDD", | |||
| Fraktionslos="#FEEB34" | |||
| ) | |||
| #' @export | |||
| bar_plot_fraktionen <- function(tb) { | |||
| ggplot(tb, aes(x = reorder(fraktion, -n), y = n, fill = fraktion)) + | |||
| scale_fill_manual(values = party_colors) + | |||
| geom_bar(stat = "identity") | |||
| bar_plot_fraktionen <- function(tb, variable, fill, title=NULL, xlab = "Fraction", | |||
| ylab="n", filllab="Fraction") { | |||
| ggplot(tb, aes(x = reorder(fraktion, -{{variable}}), y = {{variable}}, fill = {{fill}})) + | |||
| scale_fill_manual(values = party_colors) + | |||
| xlab(xlab) + | |||
| ylab(ylab) + | |||
| labs(fill = filllab) + | |||
| ggtitle(title) + | |||
| geom_bar(stat = "identity") | |||
| } | |||
| # Counts how many talks match a given pattern and summarises by date | |||
| # | |||
| #' @export | |||
| word_usage_by_date <- function(res, patterns, name, tidy=F) { | |||
| tb <- res$talks | |||
| nms <- names(patterns) | |||
| for (i in seq_along(patterns)) { | |||
| if (!is.null(nms)) name <- nms[[i]] | |||
| else name <- patterns[[i]] | |||
| tb <- mutate(tb, {{name}} := str_count(content, patterns[[i]])) | |||
| } | |||
| left_join(tb, res$reden, by=c("rede_id" = "id")) %>% | |||
| group_by(date) %>% | |||
| summarize(across(where(is.numeric), sum)) %>% | |||
| arrange(date) -> tb | |||
| if (!tidy) pivot_longer(tb, where(is.numeric) , names_to = "pattern", values_to="count") | |||
| else tb | |||
| } | |||
| @@ -6,6 +6,7 @@ | |||
| #' @import stringr | |||
| #' @import xml2 | |||
| #' @import utils | |||
| #' @import tidyr | |||
| #' @import purrr | |||
| #' @keywords internal | |||
| "_PACKAGE" | |||
| @@ -20,7 +20,8 @@ read_all <- function(path="records/") { | |||
| lapply(res, `[[`, "reden") %>% | |||
| bind_rows() %>% | |||
| distinct() -> | |||
| distinct() %>% | |||
| mutate(date = as.Date(date, format="%d.%m.%Y")) -> | |||
| reden | |||
| lapply(res, `[[`, "talks") %>% | |||
| @@ -31,11 +32,26 @@ read_all <- function(path="records/") { | |||
| lapply(res, `[[`, "comments") %>% | |||
| bind_rows() %>% | |||
| distinct() -> | |||
| comments | |||
| commentsandapplause | |||
| if (length(available_protocols) == 0) | |||
| warning("The given directory is empty or does not exist.") | |||
| list(redner = redner, reden = reden, talks = talks, comments = comments) | |||
| filter(commentsandapplause, type == "comment") %>% | |||
| select(-type) -> | |||
| comments | |||
| filter(commentsandapplause, type == "applause") %>% | |||
| select(-type, -kommentator, -content) %>% | |||
| mutate("CDU_CSU" = str_detect(fraktion, "CDU/CSU"), | |||
| "SPD" = str_detect(fraktion, "SPD"), | |||
| "FDP" = str_detect(fraktion, "FDP"), | |||
| "DIE_LINKE" = str_detect(fraktion, "DIE LINKE"), | |||
| "BÜNDNIS_90_DIE_GRÜNEN" = str_detect(fraktion, "BÜNDNIS 90/DIE GRÜNEN"), | |||
| "AfD" = str_detect(fraktion, "AfD")) %>% | |||
| select(-fraktion) -> | |||
| applause | |||
| list(redner = redner, reden = reden, talks = talks, comments = comments, applause = applause) | |||
| } | |||
| # this reads all currently parseable data from one xml | |||
| @@ -43,6 +59,8 @@ read_one <- function(name, path) { | |||
| x <- tryCatch(read_xml(paste0(path, name)), | |||
| error = function(c) NULL) | |||
| if (is.null(x)) return(NULL) | |||
| # extract date of session | |||
| date <- xml_attr(x, "sitzung-datum") | |||
| cs <- xml_children(x) | |||
| verlauf <- xml_find_first(x, "sitzungsverlauf") | |||
| @@ -54,7 +72,7 @@ read_one <- function(name, path) { | |||
| xml_children(verlauf) %>% | |||
| xml_find_all("rede") %>% | |||
| parse_redenliste() -> | |||
| parse_redenliste(date) -> | |||
| res | |||
| list(redner = redner, reden = res$reden, talks = res$talks, comments = res$comments) | |||
| @@ -86,7 +104,7 @@ parse_redner <- function(redner_xml) { | |||
| # parse one rede | |||
| # returns: - a rede (with rede id and redner id) | |||
| # - all talks appearing in the rede (with corresponding content) | |||
| parse_rede <- function(rede_xml) { | |||
| parse_rede <- function(rede_xml, date) { | |||
| rede_id <- xml_attr(rede_xml, "id") | |||
| cs <- xml_children(rede_xml) | |||
| cur_redner <- NA_character_ | |||
| @@ -132,7 +150,7 @@ parse_rede <- function(rede_xml) { | |||
| redner = cur_redner, | |||
| content = cur_content) | |||
| reden <- c(reden, list(rede)) | |||
| list(rede = c(id = rede_id, redner = principal_redner), | |||
| list(rede = c(id = rede_id, redner = principal_redner, date = date), | |||
| parts = reden, | |||
| comments = comments) | |||
| } | |||
| @@ -142,16 +160,13 @@ fraktionsnames <- c("BÜNDNIS 90/DIE GRÜNEN", "CDU/CSU", "AfD", "SPD", "DIE LIN | |||
| parse_comment <- function(comment, rede_id, on_redner) { | |||
| base <- c(rede_id = rede_id, on_redner = on_redner) | |||
| str_extract_all(comment, fraktionspattern) %>% | |||
| `[[`(1) %>% | |||
| sapply(partial(flip(head), 1) %.% agrep, x=fraktionsnames, max=0.2, value=T) %>% | |||
| str_c(collapse=",") -> | |||
| by | |||
| # classify comment | |||
| # TODO: | |||
| # - actually separate content properly | |||
| # - differentiate between [AfD] and AfD in by | |||
| if(str_detect(comment, "Beifall")) { | |||
| str_extract_all(comment, fraktionspattern) %>% | |||
| `[[`(1) %>% | |||
| sapply(partial(flip(head), 1) %.% agrep, x=fraktionsnames, max=0.2, value=T) %>% | |||
| str_c(collapse=",") -> | |||
| by | |||
| c(base, type = "applause", fraktion = by, kommentator = NA_character_, content = comment) | |||
| } else { | |||
| ps <- str_match(comment, "(.*) \\[(.*?)\\]: (.*)")[1,] | |||
| @@ -160,12 +175,13 @@ parse_comment <- function(comment, rede_id, on_redner) { | |||
| } | |||
| # creates a tibble of reden and a tibble of talks from a list of xml nodes representing reden | |||
| parse_redenliste <- function(redenliste_xml) { | |||
| d <- sapply(redenliste_xml, parse_rede) | |||
| parse_redenliste <- function(redenliste_xml, date) { | |||
| d <- sapply(redenliste_xml, parse_rede, date = date) | |||
| reden <- simplify2array(d["rede", ]) | |||
| parts <- simplify2array %$% unlist(d["parts", ], recursive=FALSE) | |||
| comments <- simplify2array %$% unlist(d["comments", ], recursive=FALSE) | |||
| list(reden = tibble(id = reden["id",], redner = reden["redner",]), | |||
| list(reden = tibble(id = reden["id",], redner = reden["redner",], | |||
| date = reden["date",]), | |||
| talks = tibble(rede_id = parts["rede_id", ], | |||
| redner = parts["redner", ], | |||
| content = parts["content", ]), | |||
| @@ -196,6 +212,7 @@ write_to_csv <- function(tables, path="csv/", create=F) { | |||
| write.table(tables$reden, str_c(path, "reden.csv")) | |||
| write.table(tables$talks, str_c(path, "talks.csv")) | |||
| write.table(tables$comments, str_c(path, "comments.csv")) | |||
| write.table(tables$applause, str_c(path, "applause.csv")) | |||
| } | |||
| #' @export | |||
| @@ -207,7 +224,8 @@ read_from_csv <- function(path="csv/") { | |||
| tibble() %>% | |||
| mutate(redner = as.character(redner)), | |||
| talks = tibble %$% read.table(str_c(path, "talks.csv")), | |||
| comments = tibble %$% read.table(str_c(path, "comments.csv"))) | |||
| comments = tibble %$% read.table(str_c(path, "comments.csv")), | |||
| applause = tibble %$% read.table(str_c(path, "applause.csv"))) | |||
| } | |||
| # ------------------------------- | |||
| @@ -47,8 +47,8 @@ repair_reden <- function(reden) { | |||
| repair_talks <- function(talks) { | |||
| if (nrow(talks) == 0) return(talks) | |||
| # TODO: fill with content | |||
| talks | |||
| # ignore all talks which have empty content | |||
| filter(talks, str_length(content) > 0) | |||
| } | |||
| # tries to find the correct redner id given a name | |||
| @@ -90,6 +90,7 @@ repair <- function(parse_output) { | |||
| reden = repair_reden(parse_output$reden), | |||
| talks = repair_talks(parse_output$talks), | |||
| #comments = repair_comments(parse_output$comments) | |||
| comments = parse_output$comments | |||
| comments = parse_output$comments, | |||
| applause = parse_output$applause | |||
| ) | |||
| } | |||
| @@ -18,6 +18,8 @@ knitr::opts_chunk$set( | |||
| library(hateimparlament) | |||
| library(dplyr) | |||
| library(ggplot2) | |||
| library(stringr) | |||
| library(tidyr) | |||
| ``` | |||
| ## Preparation of data | |||
| @@ -48,23 +50,125 @@ talks <- res$talks | |||
| ## Analysis | |||
| Now we can start analysing our parsed dataset, e.g. find out which party gives the most talks: | |||
| ```{r, fig.width=10} | |||
| join_redner(reden, res) %>% | |||
| ```{r, fig.width=7} | |||
| join_redner(res$reden, res) %>% | |||
| group_by(fraktion) %>% | |||
| summarize(n = n()) %>% | |||
| arrange(n) %>% | |||
| bar_plot_fraktionen() | |||
| bar_plot_fraktionen(n, fill = fraktion, title="Number of speeches given by fraction", ylab="Number of speeches") | |||
| ``` | |||
| ### Count a word occurrence | |||
| or counting the occurrences of a given word: | |||
| ```{r, fig.width=10} | |||
| find_word(res, "hitler") %>% | |||
| ```{r, fig.width=7} | |||
| find_word(res, "Kohleausstieg") %>% | |||
| filter(occurences > 0) %>% | |||
| join_redner(res) %>% | |||
| select(content, fraktion) %>% | |||
| filter(!is.na(fraktion)) %>% | |||
| group_by(fraktion) %>% | |||
| summarize(n = n()) %>% | |||
| arrange(desc(n)) %>% | |||
| bar_plot_fraktionen() | |||
| bar_plot_fraktionen(n, fill = fraktion, | |||
| title = "Parties using the word 'Kohleausstieg' the most (absolutely)", | |||
| ylab = "Number of uses of 'Kohleausstieg'") | |||
| ``` | |||
| ### Who gives the most speeches? | |||
| ```{r} | |||
| res$reden %>% | |||
| group_by(redner) %>% | |||
| summarize(n = n()) %>% | |||
| arrange(-n) %>% | |||
| left_join(res$redner, by=c("redner" = "id")) %>% | |||
| head(10) | |||
| ``` | |||
| ### Who talks the longest? | |||
| ```{r} | |||
| res$talks %>% | |||
| mutate(content_len = str_length(content)) %>% | |||
| group_by(redner) %>% | |||
| summarize(avg_content_len = mean(content_len)) %>% | |||
| arrange(-avg_content_len) %>% | |||
| left_join(res$redner, by=c("redner" = "id")) %>% | |||
| head(10) | |||
| ``` | |||
| ### Which party gives the most applause to which parties? | |||
| ```{r} | |||
| res$applause %>% | |||
| left_join(res$redner, by=c("on_redner" = "id")) %>% | |||
| select(on_fraktion = fraktion, where(is.logical)) %>% | |||
| group_by(on_fraktion) %>% | |||
| arrange(on_fraktion) %>% | |||
| summarize("AfD" = sum(`AfD`), | |||
| "BÜNDNIS 90 / DIE GRÜNEN" = sum(`BÜNDNIS_90_DIE_GRÜNEN`), | |||
| "CDU/CSU" = sum(`CDU_CSU`), | |||
| "DIE LINKE" = sum(`DIE_LINKE`), | |||
| "FDP" = sum(`FDP`), | |||
| "SPD" = sum(`SPD`)) -> tb | |||
| ``` | |||
| For plotting our results we reorganize them a bit and produce a bar plot: | |||
| ```{r, fig.width=7} | |||
| pivot_longer(tb, where(is.numeric), "by_fraktion", "count") %>% | |||
| filter(!is.na(on_fraktion)) %>% | |||
| rename(fraktion = on_fraktion) %>% | |||
| bar_plot_fraktionen(value, | |||
| fill = by_fraktion, | |||
| title = "Number of rounds of applauses from fractions to fractions", | |||
| xlab = "Applauded fraction", | |||
| ylab = "Rounds of applauses", | |||
| filllab = "Applauding fraction") | |||
| ``` | |||
| ### Which party comments the most on which parties? | |||
| ```{r} | |||
| res$comments %>% | |||
| left_join(res$redner, by=c("on_redner" = "id")) %>% | |||
| select(by_fraktion = fraktion.x, on_fraktion = fraktion.y) %>% | |||
| group_by(on_fraktion) %>% | |||
| summarize(`AfD` = sum(str_detect(by_fraktion, "AfD"), na.rm=T), | |||
| `BÜNDNIS 90 / DIE GRÜNEN` = sum(str_detect(by_fraktion, "BÜNDNIS 90/DIE GRÜNEN"), na.rm=T), | |||
| `CDU/CSU` = sum(str_detect(by_fraktion, "CDU/CSU"), na.rm = T), | |||
| `DIE LINKE` = sum(str_detect(by_fraktion, "DIE LINKE"), na.rm=T), | |||
| `FDP` = sum(str_detect(by_fraktion, "FDP"), na.rm=T), | |||
| `SPD` = sum(str_detect(by_fraktion, "SPD"), na.rm=T)) -> tb | |||
| ``` | |||
| Analogously we plot the results: | |||
| ```{r, fig.width=7} | |||
| pivot_longer(tb, where(is.numeric), "by_fraktion", "count") %>% | |||
| filter(!is.na(on_fraktion)) %>% | |||
| rename(fraktion = on_fraktion) %>% | |||
| bar_plot_fraktionen(value, | |||
| fill = by_fraktion, | |||
| title = "Number of comments from fractions to fractions", | |||
| xlab = "Commented fraction", | |||
| ylab = "Number of comments", | |||
| filllab = "Commenting fraction") | |||
| ``` | |||
| ### When are which topics discussed the most? | |||
| ```{r, fig.width=7} | |||
| pandemic_pattern <- "(?i)virus|corona|covid|lockdown" | |||
| climate_pattern <- "(?i)klimawandel|erderwärmung|co2|treibhaus|methan|kyoto-protokoll|klimaabkommen" | |||
| pension_pattern <- "(?i)rente|pension|altersarmut" | |||
| word_usage_by_date(res, c(pandemic = pandemic_pattern, | |||
| climate = climate_pattern, | |||
| pension = pension_pattern)) %>% | |||
| ggplot(aes(x = date, y = count, color = pattern)) + | |||
| xlab("date of session") + | |||
| ylab("occurence of word per session") + | |||
| labs(color = "Topic") + | |||
| geom_point() | |||
| ``` | |||