Quellcode durchsuchen

solve most of predefined challenges

genderequality-alternative
flavis vor 4 Jahren
Ursprung
Commit
4fe45feec9
7 geänderte Dateien mit 138 neuen und 24 gelöschten Zeilen
  1. +1
    -0
      DESCRIPTION
  2. +3
    -0
      NAMESPACE
  3. +19
    -0
      R/analyze.R
  4. +1
    -0
      R/hateimparlament-package.R
  5. +36
    -18
      R/parse.R
  6. +4
    -3
      R/repair.R
  7. +74
    -3
      vignettes/funwithdata.Rmd

+ 1
- 0
DESCRIPTION Datei anzeigen

@@ -21,6 +21,7 @@ Imports:
rvest,
stringr,
tibble,
tidyr,
xml2
Suggests:
rmarkdown,


+ 3
- 0
NAMESPACE Datei anzeigen

@@ -1,16 +1,19 @@
# Generated by roxygen2: do not edit by hand

export(bar_plot_fraktionen)
export(fetch_all)
export(find_word)
export(join_redner)
export(read_all)
export(read_from_csv)
export(repair)
export(word_usage_by_date)
export(write_to_csv)
import(dplyr)
import(pbapply)
import(purrr)
import(stringr)
import(tibble)
import(tidyr)
import(utils)
import(xml2)

+ 19
- 0
R/analyze.R Datei anzeigen

@@ -29,3 +29,22 @@ bar_plot_fraktionen <- function(tb) {
scale_fill_manual(values = party_colors) +
geom_bar(stat = "identity")
}

#' Count pattern matches in talks, summarised by session date
#'
#' For every pattern, counts regex matches in the talk contents, joins the
#' per-talk counts to the reden metadata and sums them per session date.
#'
#' @param res parse result list containing the `talks` and `reden` tibbles.
#' @param patterns character vector of regex patterns; element names (if any)
#'   become the output column names, otherwise the patterns themselves are used.
#' @param name kept for backward compatibility; it is overwritten inside the
#'   loop on every iteration, so its value is never used.
#' @param tidy if `FALSE` (default) the result is pivoted to long format with
#'   `pattern`/`count` columns; if `TRUE` one numeric column per pattern is kept.
#' @return a tibble of per-date match counts, ordered by date.
#' @export
word_usage_by_date <- function(res, patterns, name, tidy = FALSE) {
  tb <- res$talks
  nms <- names(patterns)
  for (i in seq_along(patterns)) {
    # prefer the user-supplied name, fall back to the pattern itself
    if (!is.null(nms)) {
      name <- nms[[i]]
    } else {
      name <- patterns[[i]]
    }
    tb <- mutate(tb, {{name}} := str_count(content, patterns[[i]]))
  }
  tb <- left_join(tb, res$reden, by = c("rede_id" = "id")) %>%
    group_by(date) %>%
    summarize(across(where(is.numeric), sum)) %>%
    arrange(date)
  if (!tidy) {
    pivot_longer(tb, where(is.numeric), names_to = "pattern", values_to = "count")
  } else {
    tb
  }
}

+ 1
- 0
R/hateimparlament-package.R Datei anzeigen

@@ -6,6 +6,7 @@
#' @import stringr
#' @import xml2
#' @import utils
#' @import tidyr
#' @import purrr
#' @keywords internal
"_PACKAGE"


+ 36
- 18
R/parse.R Datei anzeigen

@@ -20,7 +20,8 @@ read_all <- function(path="records/") {
lapply(res, `[[`, "reden") %>%
bind_rows() %>%
distinct() ->
distinct() %>%
mutate(date = as.Date(date, format="%d.%m.%Y")) ->
reden

lapply(res, `[[`, "talks") %>%
@@ -31,11 +32,26 @@ read_all <- function(path="records/") {
lapply(res, `[[`, "comments") %>%
bind_rows() %>%
distinct() ->
comments
commentsandapplause

if (length(available_protocols) == 0)
warning("The given directory is empty or does not exist.")
list(redner = redner, reden = reden, talks = talks, comments = comments)

filter(commentsandapplause, type == "comment") %>%
select(-type) ->
comments
filter(commentsandapplause, type == "applause") %>%
select(-type, -kommentator, -content) %>%
mutate("CDU_CSU" = str_detect(fraktion, "CDU/CSU"),
"SPD" = str_detect(fraktion, "SPD"),
"FDP" = str_detect(fraktion, "FDP"),
"DIE_LINKE" = str_detect(fraktion, "DIE LINKE"),
"BÜNDNIS_90_DIE_GRÜNEN" = str_detect(fraktion, "BÜNDNIS 90/DIE GRÜNEN"),
"AfD" = str_detect(fraktion, "AfD")) %>%
select(-fraktion) ->
applause

list(redner = redner, reden = reden, talks = talks, comments = comments, applause = applause)
}

# this reads all currently parseable data from one xml
@@ -43,6 +59,8 @@ read_one <- function(name, path) {
x <- tryCatch(read_xml(paste0(path, name)),
error = function(c) NULL)
if (is.null(x)) return(NULL)
# extract date of session
date <- xml_attr(x, "sitzung-datum")
cs <- xml_children(x)

verlauf <- xml_find_first(x, "sitzungsverlauf")
@@ -54,7 +72,7 @@ read_one <- function(name, path) {

xml_children(verlauf) %>%
xml_find_all("rede") %>%
parse_redenliste() ->
parse_redenliste(date) ->
res

list(redner = redner, reden = res$reden, talks = res$talks, comments = res$comments)
@@ -86,7 +104,7 @@ parse_redner <- function(redner_xml) {
# parse one rede
# returns: - a rede (with rede id and redner id)
# - all talks appearing in the rede (with corresponding content)
parse_rede <- function(rede_xml) {
parse_rede <- function(rede_xml, date) {
rede_id <- xml_attr(rede_xml, "id")
cs <- xml_children(rede_xml)
cur_redner <- NA_character_
@@ -132,7 +150,7 @@ parse_rede <- function(rede_xml) {
redner = cur_redner,
content = cur_content)
reden <- c(reden, list(rede))
list(rede = c(id = rede_id, redner = principal_redner),
list(rede = c(id = rede_id, redner = principal_redner, date = date),
parts = reden,
comments = comments)
}
@@ -142,16 +160,13 @@ fraktionsnames <- c("BÜNDNIS 90/DIE GRÜNEN", "CDU/CSU", "AfD", "SPD", "DIE LIN

parse_comment <- function(comment, rede_id, on_redner) {
base <- c(rede_id = rede_id, on_redner = on_redner)
str_extract_all(comment, fraktionspattern) %>%
`[[`(1) %>%
sapply(partial(flip(head), 1) %.% agrep, x=fraktionsnames, max=0.2, value=T) %>%
str_c(collapse=",") ->
by
# classify comment
# TODO:
# - actually separate content properly
# - differentiate between [AfD] and AfD in by
if(str_detect(comment, "Beifall")) {
str_extract_all(comment, fraktionspattern) %>%
`[[`(1) %>%
sapply(partial(flip(head), 1) %.% agrep, x=fraktionsnames, max=0.2, value=T) %>%
str_c(collapse=",") ->
by
c(base, type = "applause", fraktion = by, kommentator = NA_character_, content = comment)
} else {
ps <- str_match(comment, "(.*) \\[(.*?)\\]: (.*)")[1,]
@@ -160,12 +175,13 @@ parse_comment <- function(comment, rede_id, on_redner) {
}

# creates a tibble of reden and a tibble of talks from a list of xml nodes representing reden
parse_redenliste <- function(redenliste_xml) {
d <- sapply(redenliste_xml, parse_rede)
parse_redenliste <- function(redenliste_xml, date) {
d <- sapply(redenliste_xml, parse_rede, date = date)
reden <- simplify2array(d["rede", ])
parts <- simplify2array %$% unlist(d["parts", ], recursive=FALSE)
comments <- simplify2array %$% unlist(d["comments", ], recursive=FALSE)
list(reden = tibble(id = reden["id",], redner = reden["redner",]),
list(reden = tibble(id = reden["id",], redner = reden["redner",],
date = reden["date",]),
talks = tibble(rede_id = parts["rede_id", ],
redner = parts["redner", ],
content = parts["content", ]),
@@ -196,6 +212,7 @@ write_to_csv <- function(tables, path="csv/", create=F) {
write.table(tables$reden, str_c(path, "reden.csv"))
write.table(tables$talks, str_c(path, "talks.csv"))
write.table(tables$comments, str_c(path, "comments.csv"))
write.table(tables$applause, str_c(path, "applause.csv"))
}

#' @export
@@ -207,7 +224,8 @@ read_from_csv <- function(path="csv/") {
tibble() %>%
mutate(redner = as.character(redner)),
talks = tibble %$% read.table(str_c(path, "talks.csv")),
comments = tibble %$% read.table(str_c(path, "comments.csv")))
comments = tibble %$% read.table(str_c(path, "comments.csv")),
applause = tibble %$% read.table(str_c(path, "applause.csv")))
}

# -------------------------------


+ 4
- 3
R/repair.R Datei anzeigen

@@ -47,8 +47,8 @@ repair_reden <- function(reden) {

# Drop talks that carry no content.
#
# @param talks tibble of talks with a `content` column.
# @return `talks` without empty-content rows; an empty tibble is returned
#   unchanged (filter on zero rows would be a no-op anyway, but the guard
#   keeps the early-exit explicit).
repair_talks <- function(talks) {
  if (nrow(talks) == 0) return(talks)
  # ignore all talks which have empty content
  filter(talks, str_length(content) > 0)
}

# tries to find the correct redner id given a name
@@ -90,6 +90,7 @@ repair <- function(parse_output) {
reden = repair_reden(parse_output$reden),
talks = repair_talks(parse_output$talks),
#comments = repair_comments(parse_output$comments)
comments = parse_output$comments
comments = parse_output$comments,
applause = parse_output$applause
)
}

+ 74
- 3
vignettes/funwithdata.Rmd Datei anzeigen

@@ -48,8 +48,8 @@ talks <- res$talks
## Analysis

Now we can start analysing our parsed dataset, e.g. find out which party gives the most talks:
```{r, fig.width=10}
join_redner(reden, res) %>%
```{r, fig.width=8}
join_redner(res$reden, res) %>%
group_by(fraktion) %>%
summarize(n = n()) %>%
arrange(n) %>%
@@ -58,7 +58,7 @@ join_redner(reden, res) %>%

### Count a word occurrence

```{r, fig.width=10}
```{r, fig.width=8}
find_word(res, "hitler") %>%
filter(occurences > 0) %>%
join_redner(res) %>%
@@ -68,3 +68,74 @@ find_word(res, "hitler") %>%
arrange(desc(n)) %>%
bar_plot_fraktionen()
```

### Who gives the most speeches?

```{r}
# Top 10 speakers by number of speeches, with speaker metadata joined in.
# count() is shorthand for group_by() + summarize(n = n()) + arrange(desc(n)).
res$reden %>%
  count(redner, sort = TRUE) %>%
  left_join(res$redner, by = c("redner" = "id")) %>%
  head(10)
```

### Who talks the longest?

```{r}
# Top 10 speakers by average talk length (in characters).
res$talks %>%
  group_by(redner) %>%
  summarize(avg_content_len = mean(str_length(content))) %>%
  arrange(desc(avg_content_len)) %>%
  left_join(res$redner, by = c("redner" = "id")) %>%
  head(10)
```

### Which party gives the most applause to which parties?

```{r}
# Join each applause event to the speaker being applauded, then sum the
# per-fraktion logical flags to get applause counts per
# (applauding fraktion column, applauded-on fraktion row) pair.
res$applause %>%
left_join(res$redner, by=c("on_redner" = "id")) %>%
select(on_fraktion = fraktion, where(is.logical)) %>%
group_by(on_fraktion) %>%
arrange(on_fraktion) %>%
summarize("AfD" = sum(`AfD`),
"BÜNDNIS 90 / DIE GRÜNEN" = sum(`BÜNDNIS_90_DIE_GRÜNEN`),
"CDU/CSU" = sum(`CDU_CSU`),
"DIE LINKE" = sum(`DIE_LINKE`),
"FDP" = sum(`FDP`),
"SPD" = sum(`SPD`))
```

### Which party comments the most on which parties?

```{r}
# Join each comment to the speaker being commented on, then count comments
# per (commenting fraktion column, commented-on fraktion row) pair.
# Use TRUE instead of the reassignable shorthand T for na.rm.
res$comments %>%
  left_join(res$redner, by = c("on_redner" = "id")) %>%
  select(by_fraktion = fraktion.x, on_fraktion = fraktion.y) %>%
  group_by(on_fraktion) %>%
  summarize(`AfD` = sum(str_detect(by_fraktion, "AfD"), na.rm = TRUE),
            `BÜNDNIS 90 / DIE GRÜNEN` = sum(str_detect(by_fraktion, "BÜNDNIS 90/DIE GRÜNEN"), na.rm = TRUE),
            `CDU/CSU` = sum(str_detect(by_fraktion, "CDU/CSU"), na.rm = TRUE),
            `DIE LINKE` = sum(str_detect(by_fraktion, "DIE LINKE"), na.rm = TRUE),
            `FDP` = sum(str_detect(by_fraktion, "FDP"), na.rm = TRUE),
            `SPD` = sum(str_detect(by_fraktion, "SPD"), na.rm = TRUE))
```

### When are which topics discussed the most?

```{r, fig.width=8}
# Case-insensitive regex patterns marking three discussion topics.
pandemic_pattern <- "(?i)virus|corona|covid|lockdown"
climate_pattern <- "(?i)klimawandel|erderwärmung|co2|treibhaus|methan|kyoto-protokoll|klimaabkommen"
pension_pattern <- "(?i)rente|pension|altersarmut"

# Count pattern matches per session date and plot one point per topic/date.
# (Axis-label typo fixed: "occurence" -> "occurrence".)
word_usage_by_date(res, c(pandemic = pandemic_pattern,
                          climate = climate_pattern,
                          pension = pension_pattern)) %>%
  ggplot(aes(x = date, y = count, color = pattern)) +
  xlab("date of session") +
  ylab("occurrence of word per session") +
  labs(color = "Topic") +
  geom_point()
```

Laden…
Abbrechen
Speichern