---
title: "Analysis of vocabulary"
output: rmarkdown::html_vignette
vignette: >
  %\VignetteIndexEntry{Analysis of vocabulary}
  %\VignetteEngine{knitr::rmarkdown}
  %\VignetteEncoding{UTF-8}
---

```{r, include = FALSE}
knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>"
)
```

```{r setup}
library(hateimparlament)
library(dplyr)
library(stringr)
library(ggplot2)
```

## Preparation of data

First, you need to download all records of the current legislative period.
```r
fetch_all("../inst/records/") # path to directory where records should be stored
```
Second, those `.xml` files, need to be parsed into `R` `tibbles`. This is accomplished by:
```r
read_all("../inst/records/") %>% repair() -> res

speeches <- res$speeches
speaker <- res$speaker
talks <- res$talks
```
We also used `repair` to fix a bunch of formatting issues in the records and unpacked
the result into more descriptive variables.

For development purposes, we only fetch records if they are not already
stored as csv files:
```{r}
tables <- read_from_csv_or_fetch('../inst/')

comments <- tables$comments
speeches <- tables$speeches
speaker <- tables$speaker
talks <- tables$talks
```

Further, we need to load a list of words that were used by Hitler but not by standard German texts.
```{r}
fil <- file('../inst/hitler_texts/hitler_words')
Worte <- readLines(fil)
hitlerwords <- tibble(Worte)
```

## Analysis

Now we extract the words that were used with higher frequency by one party and compare them with `hitlerwords`.
```{r}
talks %>% 
  left_join(speaker, by=c(speaker='id')) %>% 
  group_by(fraction) %>%
  summarize(full_text=str_c(content, collapse="\n")) -> talks_by_fraction
```
For each party, we want to get a tibble of words with frequency.
```{r}
#AfD
Worte <- str_extract_all(talks_by_fraction$full_text[[1]], "\\b[a-zA-ZäöüÄÖÜß]+\\b")[[1]]
afdtotal = length(Worte)
tibble(Worte) %>% group_by(Worte) %>% count() %>% mutate(freq =n/afdtotal) -> afd_words

#AfD&Fraktionslos
Worte <- str_extract_all(talks_by_fraction$full_text[[2]], "\\b[a-zA-ZäöüÄÖÜß]+\\b")[[1]]
afdundfraktionslostotal = length(Worte)
tibble(Worte) %>% group_by(Worte) %>% count() %>% mutate(freq =n/afdundfraktionslostotal) -> afdundfraktionslos_words

#BÜNDNIS 90 / DIE GRÜNEN
Worte <- str_extract_all(talks_by_fraction$full_text[[3]], "\\b[a-zA-ZäöüÄÖÜß]+\\b")[[1]]
grünetotal = length(Worte)
tibble(Worte) %>% group_by(Worte) %>% count() %>% mutate(freq =n/grünetotal) -> grüne_words

#CDU/CSU
Worte <- str_extract_all(talks_by_fraction$full_text[[4]], "\\b[a-zA-ZäöüÄÖÜß]+\\b")[[1]]
cdutotal = length(Worte)
tibble(Worte) %>% group_by(Worte) %>% count() %>% mutate(freq =n/cdutotal) -> cdu_words

#DIE LINKE
Worte <- str_extract_all(talks_by_fraction$full_text[[5]], "\\b[a-zA-ZäöüÄÖÜß]+\\b")[[1]]
linketotal = length(Worte)
tibble(Worte) %>% group_by(Worte) %>% count() %>% mutate(freq =n/linketotal) -> linke_words

#FDP
Worte <- str_extract_all(talks_by_fraction$full_text[[6]], "\\b[a-zA-ZäöüÄÖÜß]+\\b")[[1]]
fdptotal = length(Worte)
tibble(Worte) %>% group_by(Worte) %>% count() %>% mutate(freq =n/fdptotal) -> fdp_words

#Fraktionslos
Worte <- str_extract_all(talks_by_fraction$full_text[[7]], "\\b[a-zA-ZäöüÄÖÜß]+\\b")[[1]]
fraktionslostotal = length(Worte)
tibble(Worte) %>% group_by(Worte) %>% count() %>% mutate(freq =n/fraktionslostotal) -> fraktionslos_words

#SPD
Worte <- str_extract_all(talks_by_fraction$full_text[[8]], "\\b[a-zA-ZäöüÄÖÜß]+\\b")[[1]]
spdtotal = length(Worte)
tibble(Worte) %>% group_by(Worte) %>% count() %>% mutate(freq =n/spdtotal) -> spd_words

#NA
Worte <- str_extract_all(talks_by_fraction$full_text[[9]], "\\b[a-zA-ZäöüÄÖÜß]+\\b")[[1]]
natotal = length(Worte)
tibble(Worte) %>% group_by(Worte) %>% count() %>% mutate(freq =n/natotal) -> na_words

#alle
all_words <- bind_rows(afd_words, afdundfraktionslos_words, grüne_words, cdu_words, linke_words, fdp_words, fraktionslos_words, spd_words, na_words)
total <- sum(all_words$n)
all_words %>% group_by(Worte) %>% summarize(n = sum(n), part= sum(n)/total) -> all_words
```

Now we want to extract the words that are more frequently used by a specific fraction.
```{r}
afd_words %>% 
  transmute(freq, fraction_n = n) %>%
  left_join(all_words) %>%
  transmute(
    fraction_freq = freq, 
    total_freq = part, 
    fraction_n, 
    total_n = n, 
    rel_quotient = fraction_freq/total_freq, 
    abs_quotient = fraction_n/total_n) %>% 
  arrange(-abs_quotient, -fraction_n) %>% 
  filter(rel_quotient > 1) -> 
  afd_high_frequent

select(afd_high_frequent, fraction_n, total_n, abs_quotient, rel_quotient) %>% 
  filter(total_n > 80)

afdundfraktionslos_words %>% transmute(freq, fraction_n = n) %>% left_join(all_words) %>% transmute(fraction_freq = freq, total_freq = part, fraction_n, total_n = n, rel_quotient = fraction_freq/total_freq, abs_quotient = fraction_n/total_n) %>% arrange(-abs_quotient, -fraction_n) %>% filter(rel_quotient > 1) -> afdundfraktionslos_high_frequent
select(afdundfraktionslos_high_frequent, fraction_n, total_n, abs_quotient, rel_quotient) %>% filter(total_n > 80)

grüne_words %>% transmute(freq, fraction_n = n) %>% left_join(all_words) %>% transmute(fraction_freq = freq, total_freq = part, fraction_n, total_n = n, rel_quotient = fraction_freq/total_freq, abs_quotient = fraction_n/total_n) %>% arrange(-abs_quotient, -fraction_n) %>% filter(rel_quotient > 1) -> grüne_high_frequent
select(grüne_high_frequent, fraction_n, total_n, abs_quotient, rel_quotient) %>% filter(total_n > 80)

cdu_words %>% transmute(freq, fraction_n = n) %>% left_join(all_words) %>% transmute(fraction_freq = freq, total_freq = part, fraction_n, total_n = n, rel_quotient = fraction_freq/total_freq, abs_quotient = fraction_n/total_n) %>% arrange(-abs_quotient, -fraction_n) %>% filter(rel_quotient > 1) -> cdu_high_frequent
select(cdu_high_frequent, fraction_n, total_n, abs_quotient, rel_quotient) %>% filter(total_n > 80)

linke_words %>% transmute(freq, fraction_n = n) %>% left_join(all_words) %>% transmute(fraction_freq = freq, total_freq = part, fraction_n, total_n = n, rel_quotient = fraction_freq/total_freq, abs_quotient = fraction_n/total_n) %>% arrange(-abs_quotient, -fraction_n) %>% filter(rel_quotient > 1) -> linke_high_frequent
select(linke_high_frequent, fraction_n, total_n, abs_quotient, rel_quotient) %>% filter(total_n > 80)

fdp_words %>% transmute(freq, fraction_n = n) %>% left_join(all_words) %>% transmute(fraction_freq = freq, total_freq = part, fraction_n, total_n = n, rel_quotient = fraction_freq/total_freq, abs_quotient = fraction_n/total_n) %>% arrange(-abs_quotient, -fraction_n) %>% filter(rel_quotient > 1) -> fdp_high_frequent
select(fdp_high_frequent, fraction_n, total_n, abs_quotient, rel_quotient) %>% filter(total_n > 80)

fraktionslos_words %>% transmute(freq, fraction_n = n) %>% left_join(all_words) %>% transmute(fraction_freq = freq, total_freq = part, fraction_n, total_n = n, rel_quotient = fraction_freq/total_freq, abs_quotient = fraction_n/total_n) %>% arrange(-abs_quotient, -fraction_n) %>% filter(rel_quotient > 1) -> fraktionslos_high_frequent
select(fraktionslos_high_frequent, fraction_n, total_n, abs_quotient, rel_quotient) %>% filter(total_n > 80)

spd_words %>% transmute(freq, fraction_n = n) %>% left_join(all_words) %>% transmute(fraction_freq = freq, total_freq = part, fraction_n, total_n = n, rel_quotient = fraction_freq/total_freq, abs_quotient = fraction_n/total_n) %>% arrange(-abs_quotient, -fraction_n) %>% filter(rel_quotient > 1) -> spd_high_frequent
select(spd_high_frequent, fraction_n, total_n, abs_quotient, rel_quotient) %>% filter(total_n > 80)

na_words %>% transmute(freq, fraction_n = n) %>% left_join(all_words) %>% transmute(fraction_freq = freq, total_freq = part, fraction_n, total_n = n, rel_quotient = fraction_freq/total_freq, abs_quotient = fraction_n/total_n) %>% arrange(-abs_quotient, -fraction_n) %>% filter(rel_quotient > 1) -> na_high_frequent
select(na_high_frequent, fraction_n, total_n, abs_quotient, rel_quotient) %>% filter(total_n > 80)
```

We compare these words with `hitlerwords`.

```{r}
afd_high_frequent %>% mutate(Worte = str_to_lower(Worte)) %>% inner_join(hitlerwords) -> afd_hitler_comparison
afdundfraktionslos_high_frequent %>% mutate(Worte = str_to_lower(Worte)) %>% inner_join(hitlerwords) -> afdundfraktionslos_hitler_comparison
grüne_high_frequent %>% mutate(Worte = str_to_lower(Worte)) %>% inner_join(hitlerwords) -> grüne_hitler_comparison
cdu_high_frequent %>% mutate(Worte = str_to_lower(Worte)) %>% inner_join(hitlerwords) -> cdu_hitler_comparison
linke_high_frequent %>% mutate(Worte = str_to_lower(Worte)) %>% inner_join(hitlerwords) -> linke_hitler_comparison
fdp_high_frequent %>% mutate(Worte = str_to_lower(Worte)) %>% inner_join(hitlerwords) -> fdp_hitler_comparison
fraktionslos_high_frequent %>% mutate(Worte = str_to_lower(Worte)) %>% inner_join(hitlerwords) -> fraktionslos_hitler_comparison
spd_high_frequent %>% mutate(Worte = str_to_lower(Worte)) %>% inner_join(hitlerwords) -> spd_hitler_comparison
na_high_frequent %>% mutate(Worte = str_to_lower(Worte)) %>% inner_join(hitlerwords) -> na_hitler_comparison

#not unique
tibble(fraction = c("AfD", "AfD&Fraktionslos", "BÜNDNIS 90 / DIE GRÜNEN", "CDU/CSU", "DIE LINKE", "FDP", "Fraktionslos", "SPD"),
       absolute = c(nrow(afd_hitler_comparison), nrow(afdundfraktionslos_hitler_comparison), nrow(grüne_hitler_comparison), nrow(cdu_hitler_comparison), nrow(linke_hitler_comparison), nrow(fdp_hitler_comparison), nrow(fraktionslos_hitler_comparison), nrow(spd_hitler_comparison)),
       total = c(nrow(afd_words), nrow(afdundfraktionslos_words), nrow(grüne_words), nrow(cdu_words), nrow(linke_words), nrow(fdp_words), nrow(fraktionslos_words), nrow(spd_words))
      ) %>% mutate(percent = 100*absolute/total) -> hitler_comparison
hitler_comparison
```
Finally, we want to plot our results:
```{r, fig.width=7}
bar_plot_fractions(hitler_comparison, y_variable = percent, title="Coincidence of party vocabulary with nazi vocabulary", ylab="unique 'nazi' words per total (unique) fraction words [%]")
```