| @@ -21,6 +21,7 @@ Imports: | |||||
| rvest, | rvest, | ||||
| stringr, | stringr, | ||||
| tibble, | tibble, | ||||
| tidyr, | |||||
| xml2 | xml2 | ||||
| Suggests: | Suggests: | ||||
| rmarkdown, | rmarkdown, | ||||
| @@ -1,16 +1,19 @@ | |||||
| # Generated by roxygen2: do not edit by hand | # Generated by roxygen2: do not edit by hand | ||||
| export(bar_plot_fraktionen) | |||||
| export(fetch_all) | export(fetch_all) | ||||
| export(find_word) | export(find_word) | ||||
| export(join_redner) | export(join_redner) | ||||
| export(read_all) | export(read_all) | ||||
| export(read_from_csv) | export(read_from_csv) | ||||
| export(repair) | export(repair) | ||||
| export(word_usage_by_date) | |||||
| export(write_to_csv) | export(write_to_csv) | ||||
| import(dplyr) | import(dplyr) | ||||
| import(pbapply) | import(pbapply) | ||||
| import(purrr) | import(purrr) | ||||
| import(stringr) | import(stringr) | ||||
| import(tibble) | import(tibble) | ||||
| import(tidyr) | |||||
| import(utils) | import(utils) | ||||
| import(xml2) | import(xml2) | ||||
| @@ -12,20 +12,45 @@ join_redner <- function(tb, res, fraktion_only = F) { | |||||
| else joined | else joined | ||||
| } | } | ||||
| #' @export | |||||
| party_colors <- c( | party_colors <- c( | ||||
| SPD="#DF0B25", | |||||
| "CDU/CSU"="#000000", | |||||
| AfD="#1A9FDD", | AfD="#1A9FDD", | ||||
| "AfD&Fraktionslos"="#1A9FDD", | |||||
| "DIE LINKE"="#BC3475", | |||||
| "BÜNDNIS 90 / DIE GRÜNEN"="#4A932B", | |||||
| FDP="#FEEB34", | FDP="#FEEB34", | ||||
| "CDU/CSU"="#000000", | |||||
| SPD="#DF0B25", | |||||
| "BÜNDNIS 90 / DIE GRÜNEN"="#4A932B", | |||||
| "DIE LINKE"="#BC3475", | |||||
| "AfD&Fraktionslos"="#1A9FDD", | |||||
| Fraktionslos="#FEEB34" | Fraktionslos="#FEEB34" | ||||
| ) | ) | ||||
| #' @export | #' @export | ||||
| bar_plot_fraktionen <- function(tb) { | |||||
| ggplot(tb, aes(x = reorder(fraktion, -n), y = n, fill = fraktion)) + | |||||
| scale_fill_manual(values = party_colors) + | |||||
| geom_bar(stat = "identity") | |||||
| bar_plot_fraktionen <- function(tb, variable, fill, title=NULL, xlab = "Fraction", | |||||
| ylab="n", filllab="Fraction") { | |||||
| ggplot(tb, aes(x = reorder(fraktion, -{{variable}}), y = {{variable}}, fill = {{fill}})) + | |||||
| scale_fill_manual(values = party_colors) + | |||||
| xlab(xlab) + | |||||
| ylab(ylab) + | |||||
| labs(fill = filllab) + | |||||
| ggtitle(title) + | |||||
| geom_bar(stat = "identity") | |||||
| } | |||||
| # Counts how many talks do match a given pattern and summarises by date | |||||
| # | |||||
| #' @export | |||||
| word_usage_by_date <- function(res, patterns, name, tidy=F) { | |||||
| tb <- res$talks | |||||
| nms <- names(patterns) | |||||
| for (i in seq_along(patterns)) { | |||||
| if (!is.null(nms)) name <- nms[[i]] | |||||
| else name <- patterns[[i]] | |||||
| tb <- mutate(tb, {{name}} := str_count(content, patterns[[i]])) | |||||
| } | |||||
| left_join(tb, res$reden, by=c("rede_id" = "id")) %>% | |||||
| group_by(date) %>% | |||||
| summarize(across(where(is.numeric), sum)) %>% | |||||
| arrange(date) -> tb | |||||
| if (!tidy) pivot_longer(tb, where(is.numeric) , names_to = "pattern", values_to="count") | |||||
| else tb | |||||
| } | } | ||||
| @@ -6,6 +6,7 @@ | |||||
| #' @import stringr | #' @import stringr | ||||
| #' @import xml2 | #' @import xml2 | ||||
| #' @import utils | #' @import utils | ||||
| #' @import tidyr | |||||
| #' @import purrr | #' @import purrr | ||||
| #' @keywords internal | #' @keywords internal | ||||
| "_PACKAGE" | "_PACKAGE" | ||||
| @@ -20,7 +20,8 @@ read_all <- function(path="records/") { | |||||
| lapply(res, `[[`, "reden") %>% | lapply(res, `[[`, "reden") %>% | ||||
| bind_rows() %>% | bind_rows() %>% | ||||
| distinct() -> | |||||
| distinct() %>% | |||||
| mutate(date = as.Date(date, format="%d.%m.%Y")) -> | |||||
| reden | reden | ||||
| lapply(res, `[[`, "talks") %>% | lapply(res, `[[`, "talks") %>% | ||||
| @@ -31,11 +32,26 @@ read_all <- function(path="records/") { | |||||
| lapply(res, `[[`, "comments") %>% | lapply(res, `[[`, "comments") %>% | ||||
| bind_rows() %>% | bind_rows() %>% | ||||
| distinct() -> | distinct() -> | ||||
| comments | |||||
| commentsandapplause | |||||
| if (length(available_protocols) == 0) | if (length(available_protocols) == 0) | ||||
| warning("The given directory is empty or does not exist.") | warning("The given directory is empty or does not exist.") | ||||
| list(redner = redner, reden = reden, talks = talks, comments = comments) | |||||
| filter(commentsandapplause, type == "comment") %>% | |||||
| select(-type) -> | |||||
| comments | |||||
| filter(commentsandapplause, type == "applause") %>% | |||||
| select(-type, -kommentator, -content) %>% | |||||
| mutate("CDU_CSU" = str_detect(fraktion, "CDU/CSU"), | |||||
| "SPD" = str_detect(fraktion, "SPD"), | |||||
| "FDP" = str_detect(fraktion, "FDP"), | |||||
| "DIE_LINKE" = str_detect(fraktion, "DIE LINKE"), | |||||
| "BÜNDNIS_90_DIE_GRÜNEN" = str_detect(fraktion, "BÜNDNIS 90/DIE GRÜNEN"), | |||||
| "AfD" = str_detect(fraktion, "AfD")) %>% | |||||
| select(-fraktion) -> | |||||
| applause | |||||
| list(redner = redner, reden = reden, talks = talks, comments = comments, applause = applause) | |||||
| } | } | ||||
| # this reads all currently parseable data from one xml | # this reads all currently parseable data from one xml | ||||
| @@ -43,6 +59,8 @@ read_one <- function(name, path) { | |||||
| x <- tryCatch(read_xml(paste0(path, name)), | x <- tryCatch(read_xml(paste0(path, name)), | ||||
| error = function(c) NULL) | error = function(c) NULL) | ||||
| if (is.null(x)) return(NULL) | if (is.null(x)) return(NULL) | ||||
| # extract date of session | |||||
| date <- xml_attr(x, "sitzung-datum") | |||||
| cs <- xml_children(x) | cs <- xml_children(x) | ||||
| verlauf <- xml_find_first(x, "sitzungsverlauf") | verlauf <- xml_find_first(x, "sitzungsverlauf") | ||||
| @@ -54,7 +72,7 @@ read_one <- function(name, path) { | |||||
| xml_children(verlauf) %>% | xml_children(verlauf) %>% | ||||
| xml_find_all("rede") %>% | xml_find_all("rede") %>% | ||||
| parse_redenliste() -> | |||||
| parse_redenliste(date) -> | |||||
| res | res | ||||
| list(redner = redner, reden = res$reden, talks = res$talks, comments = res$comments) | list(redner = redner, reden = res$reden, talks = res$talks, comments = res$comments) | ||||
| @@ -86,7 +104,7 @@ parse_redner <- function(redner_xml) { | |||||
| # parse one rede | # parse one rede | ||||
| # returns: - a rede (with rede id and redner id) | # returns: - a rede (with rede id and redner id) | ||||
| # - all talks appearing in the rede (with corresponding content) | # - all talks appearing in the rede (with corresponding content) | ||||
| parse_rede <- function(rede_xml) { | |||||
| parse_rede <- function(rede_xml, date) { | |||||
| rede_id <- xml_attr(rede_xml, "id") | rede_id <- xml_attr(rede_xml, "id") | ||||
| cs <- xml_children(rede_xml) | cs <- xml_children(rede_xml) | ||||
| cur_redner <- NA_character_ | cur_redner <- NA_character_ | ||||
| @@ -132,7 +150,7 @@ parse_rede <- function(rede_xml) { | |||||
| redner = cur_redner, | redner = cur_redner, | ||||
| content = cur_content) | content = cur_content) | ||||
| reden <- c(reden, list(rede)) | reden <- c(reden, list(rede)) | ||||
| list(rede = c(id = rede_id, redner = principal_redner), | |||||
| list(rede = c(id = rede_id, redner = principal_redner, date = date), | |||||
| parts = reden, | parts = reden, | ||||
| comments = comments) | comments = comments) | ||||
| } | } | ||||
| @@ -142,16 +160,13 @@ fraktionsnames <- c("BÜNDNIS 90/DIE GRÜNEN", "CDU/CSU", "AfD", "SPD", "DIE LIN | |||||
| parse_comment <- function(comment, rede_id, on_redner) { | parse_comment <- function(comment, rede_id, on_redner) { | ||||
| base <- c(rede_id = rede_id, on_redner = on_redner) | base <- c(rede_id = rede_id, on_redner = on_redner) | ||||
| str_extract_all(comment, fraktionspattern) %>% | |||||
| `[[`(1) %>% | |||||
| sapply(partial(flip(head), 1) %.% agrep, x=fraktionsnames, max=0.2, value=T) %>% | |||||
| str_c(collapse=",") -> | |||||
| by | |||||
| # classify comment | # classify comment | ||||
| # TODO: | |||||
| # - actually separate content properly | |||||
| # - differentiate between [AfD] and AfD in by | |||||
| if(str_detect(comment, "Beifall")) { | if(str_detect(comment, "Beifall")) { | ||||
| str_extract_all(comment, fraktionspattern) %>% | |||||
| `[[`(1) %>% | |||||
| sapply(partial(flip(head), 1) %.% agrep, x=fraktionsnames, max=0.2, value=T) %>% | |||||
| str_c(collapse=",") -> | |||||
| by | |||||
| c(base, type = "applause", fraktion = by, kommentator = NA_character_, content = comment) | c(base, type = "applause", fraktion = by, kommentator = NA_character_, content = comment) | ||||
| } else { | } else { | ||||
| ps <- str_match(comment, "(.*) \\[(.*?)\\]: (.*)")[1,] | ps <- str_match(comment, "(.*) \\[(.*?)\\]: (.*)")[1,] | ||||
| @@ -160,12 +175,13 @@ parse_comment <- function(comment, rede_id, on_redner) { | |||||
| } | } | ||||
| # creates a tibble of reden and a tibble of talks from a list of xml nodes representing reden | # creates a tibble of reden and a tibble of talks from a list of xml nodes representing reden | ||||
| parse_redenliste <- function(redenliste_xml) { | |||||
| d <- sapply(redenliste_xml, parse_rede) | |||||
| parse_redenliste <- function(redenliste_xml, date) { | |||||
| d <- sapply(redenliste_xml, parse_rede, date = date) | |||||
| reden <- simplify2array(d["rede", ]) | reden <- simplify2array(d["rede", ]) | ||||
| parts <- simplify2array %$% unlist(d["parts", ], recursive=FALSE) | parts <- simplify2array %$% unlist(d["parts", ], recursive=FALSE) | ||||
| comments <- simplify2array %$% unlist(d["comments", ], recursive=FALSE) | comments <- simplify2array %$% unlist(d["comments", ], recursive=FALSE) | ||||
| list(reden = tibble(id = reden["id",], redner = reden["redner",]), | |||||
| list(reden = tibble(id = reden["id",], redner = reden["redner",], | |||||
| date = reden["date",]), | |||||
| talks = tibble(rede_id = parts["rede_id", ], | talks = tibble(rede_id = parts["rede_id", ], | ||||
| redner = parts["redner", ], | redner = parts["redner", ], | ||||
| content = parts["content", ]), | content = parts["content", ]), | ||||
| @@ -196,6 +212,7 @@ write_to_csv <- function(tables, path="csv/", create=F) { | |||||
| write.table(tables$reden, str_c(path, "reden.csv")) | write.table(tables$reden, str_c(path, "reden.csv")) | ||||
| write.table(tables$talks, str_c(path, "talks.csv")) | write.table(tables$talks, str_c(path, "talks.csv")) | ||||
| write.table(tables$comments, str_c(path, "comments.csv")) | write.table(tables$comments, str_c(path, "comments.csv")) | ||||
| write.table(tables$applause, str_c(path, "applause.csv")) | |||||
| } | } | ||||
| #' @export | #' @export | ||||
| @@ -207,7 +224,8 @@ read_from_csv <- function(path="csv/") { | |||||
| tibble() %>% | tibble() %>% | ||||
| mutate(redner = as.character(redner)), | mutate(redner = as.character(redner)), | ||||
| talks = tibble %$% read.table(str_c(path, "talks.csv")), | talks = tibble %$% read.table(str_c(path, "talks.csv")), | ||||
| comments = tibble %$% read.table(str_c(path, "comments.csv"))) | |||||
| comments = tibble %$% read.table(str_c(path, "comments.csv")), | |||||
| applause = tibble %$% read.table(str_c(path, "applause.csv"))) | |||||
| } | } | ||||
| # ------------------------------- | # ------------------------------- | ||||
| @@ -47,8 +47,8 @@ repair_reden <- function(reden) { | |||||
| repair_talks <- function(talks) { | repair_talks <- function(talks) { | ||||
| if (nrow(talks) == 0) return(talks) | if (nrow(talks) == 0) return(talks) | ||||
| # TODO: fill with content | |||||
| talks | |||||
| # ignore all talks which have empty content | |||||
| filter(talks, str_length(content) > 0) | |||||
| } | } | ||||
| # tries to find the correct redner id given a name | # tries to find the correct redner id given a name | ||||
| @@ -90,6 +90,7 @@ repair <- function(parse_output) { | |||||
| reden = repair_reden(parse_output$reden), | reden = repair_reden(parse_output$reden), | ||||
| talks = repair_talks(parse_output$talks), | talks = repair_talks(parse_output$talks), | ||||
| #comments = repair_comments(parse_output$comments) | #comments = repair_comments(parse_output$comments) | ||||
| comments = parse_output$comments | |||||
| comments = parse_output$comments, | |||||
| applause = parse_output$applause | |||||
| ) | ) | ||||
| } | } | ||||
| @@ -18,6 +18,8 @@ knitr::opts_chunk$set( | |||||
| library(hateimparlament) | library(hateimparlament) | ||||
| library(dplyr) | library(dplyr) | ||||
| library(ggplot2) | library(ggplot2) | ||||
| library(stringr) | |||||
| library(tidyr) | |||||
| ``` | ``` | ||||
| ## Preparation of data | ## Preparation of data | ||||
| @@ -48,23 +50,125 @@ talks <- res$talks | |||||
| ## Analysis | ## Analysis | ||||
| Now we can start analysing our parsed dataset, e.g. find out which party gives the most talks: | Now we can start analysing our parsed dataset, e.g. find out which party gives the most talks: | ||||
| ```{r, fig.width=10} | |||||
| join_redner(reden, res) %>% | |||||
| ```{r, fig.width=7} | |||||
| join_redner(res$reden, res) %>% | |||||
| group_by(fraktion) %>% | group_by(fraktion) %>% | ||||
| summarize(n = n()) %>% | summarize(n = n()) %>% | ||||
| arrange(n) %>% | arrange(n) %>% | ||||
| bar_plot_fraktionen() | |||||
| bar_plot_fraktionen(n, fill = fraktion, title="Number of speeches given by fraction", ylab="Number of speeches") | |||||
| ``` | ``` | ||||
| ### Count a word occurence | |||||
| or counting the occurences of a given word: | |||||
| ```{r, fig.width=10} | |||||
| find_word(res, "hitler") %>% | |||||
| ```{r, fig.width=7} | |||||
| find_word(res, "Kohleausstieg") %>% | |||||
| filter(occurences > 0) %>% | filter(occurences > 0) %>% | ||||
| join_redner(res) %>% | join_redner(res) %>% | ||||
| select(content, fraktion) %>% | select(content, fraktion) %>% | ||||
| filter(!is.na(fraktion)) %>% | |||||
| group_by(fraktion) %>% | group_by(fraktion) %>% | ||||
| summarize(n = n()) %>% | summarize(n = n()) %>% | ||||
| arrange(desc(n)) %>% | arrange(desc(n)) %>% | ||||
| bar_plot_fraktionen() | |||||
| bar_plot_fraktionen(n, fill = fraktion, | |||||
| title = "Parties using the word 'Kohleausstieg' the most (absolutely)", | |||||
| ylab = "Number of uses of 'Kohleausstieg'") | |||||
| ``` | |||||
| ### Who gives the most speeches? | |||||
| ```{r} | |||||
| res$reden %>% | |||||
| group_by(redner) %>% | |||||
| summarize(n = n()) %>% | |||||
| arrange(-n) %>% | |||||
| left_join(res$redner, by=c("redner" = "id")) %>% | |||||
| head(10) | |||||
| ``` | |||||
| ### Who talks the longest? | |||||
| ```{r} | |||||
| res$talks %>% | |||||
| mutate(content_len = str_length(content)) %>% | |||||
| group_by(redner) %>% | |||||
| summarize(avg_content_len = mean(content_len)) %>% | |||||
| arrange(-avg_content_len) %>% | |||||
| left_join(res$redner, by=c("redner" = "id")) %>% | |||||
| head(10) | |||||
| ``` | |||||
| ### Which party gives the most applause to which parties? | |||||
| ```{r} | |||||
| res$applause %>% | |||||
| left_join(res$redner, by=c("on_redner" = "id")) %>% | |||||
| select(on_fraktion = fraktion, where(is.logical)) %>% | |||||
| group_by(on_fraktion) %>% | |||||
| arrange(on_fraktion) %>% | |||||
| summarize("AfD" = sum(`AfD`), | |||||
| "BÜNDNIS 90 / DIE GRÜNEN" = sum(`BÜNDNIS_90_DIE_GRÜNEN`), | |||||
| "CDU/CSU" = sum(`CDU_CSU`), | |||||
| "DIE LINKE" = sum(`DIE_LINKE`), | |||||
| "FDP" = sum(`FDP`), | |||||
| "SPD" = sum(`SPD`)) -> tb | |||||
| ``` | |||||
| For plotting our results we reorganize them a bit and produce a bar plot: | |||||
| ```{r, fig.width=7} | |||||
| pivot_longer(tb, where(is.numeric), "by_fraktion", "count") %>% | |||||
| filter(!is.na(on_fraktion)) %>% | |||||
| rename(fraktion = on_fraktion) %>% | |||||
| bar_plot_fraktionen(value, | |||||
| fill = by_fraktion, | |||||
| title = "Number of rounds of applauses from fractions to fractions", | |||||
| xlab = "Applauded fraction", | |||||
| ylab = "Rounds of applauses", | |||||
| filllab = "Applauding fraction") | |||||
| ``` | |||||
| ### Which party comments the most on which parties? | |||||
| ```{r} | |||||
| res$comments %>% | |||||
| left_join(res$redner, by=c("on_redner" = "id")) %>% | |||||
| select(by_fraktion = fraktion.x, on_fraktion = fraktion.y) %>% | |||||
| group_by(on_fraktion) %>% | |||||
| summarize(`AfD` = sum(str_detect(by_fraktion, "AfD"), na.rm=T), | |||||
| `BÜNDNIS 90 / DIE GRÜNEN` = sum(str_detect(by_fraktion, "BÜNDNIS 90/DIE GRÜNEN"), na.rm=T), | |||||
| `CDU/CSU` = sum(str_detect(by_fraktion, "CDU/CSU"), na.rm = T), | |||||
| `DIE LINKE` = sum(str_detect(by_fraktion, "DIE LINKE"), na.rm=T), | |||||
| `FDP` = sum(str_detect(by_fraktion, "FDP"), na.rm=T), | |||||
| `SPD` = sum(str_detect(by_fraktion, "SPD"), na.rm=T)) -> tb | |||||
| ``` | |||||
| Analogously we plot the results: | |||||
| ```{r, fig.width=7} | |||||
| pivot_longer(tb, where(is.numeric), "by_fraktion", "count") %>% | |||||
| filter(!is.na(on_fraktion)) %>% | |||||
| rename(fraktion = on_fraktion) %>% | |||||
| bar_plot_fraktionen(value, | |||||
| fill = by_fraktion, | |||||
| title = "Number of comments from fractions to fractions", | |||||
| xlab = "Commented fraction", | |||||
| ylab = "Number of comments", | |||||
| filllab = "Commenting fraction") | |||||
| ``` | |||||
| ### When are which topics discussed the most? | |||||
| ```{r, fig.width=7} | |||||
| pandemic_pattern <- "(?i)virus|corona|covid|lockdown" | |||||
| climate_pattern <- "(?i)klimawandel|erderwärmung|co2|treibhaus|methan|kyoto-protokoll|klimaabkommen" | |||||
| pension_pattern <- "(?i)rente|pension|altersarmut" | |||||
| word_usage_by_date(res, c(pandemic = pandemic_pattern, | |||||
| climate = climate_pattern, | |||||
| pension = pension_pattern)) %>% | |||||
| ggplot(aes(x = date, y = count, color = pattern)) + | |||||
| xlab("date of session") + | |||||
| ylab("occurence of word per session") + | |||||
| labs(color = "Topic") + | |||||
| geom_point() | |||||
| ``` | ``` | ||||