--- title: "Analysis of vocabulary" output: rmarkdown::html_vignette vignette: > %\VignetteIndexEntry{Analysis of vocabulary} %\VignetteEngine{knitr::rmarkdown} %\VignetteEncoding{UTF-8} --- ```{r, include = FALSE} knitr::opts_chunk$set( collapse = TRUE, comment = "#>" ) ``` ```{r setup} library(hateimparlament) library(dplyr) library(stringr) library(ggplot2) ``` ## Preparation of data First, you need to download all records of the current legislative period. ```r fetch_all("../inst/records/") # path to directory where records should be stored ``` Second, those `.xml` files, need to be parsed into `R` `tibbles`. This is accomplished by: ```r read_all("../inst/records/") %>% repair() -> res speeches <- res$speeches speaker <- res$speaker talks <- res$talks ``` We also used `repair` to fix a bunch of formatting issues in the records and unpacked the result into more descriptive variables. For development purposes, we only fetch records if they are not already stored as csv files: ```{r} tables <- read_from_csv_or_fetch('../inst/') comments <- tables$comments speeches <- tables$speeches speaker <- tables$speaker talks <- tables$talks ``` Further, we need to load a list of words that were used by Hitler but not by standard German texts. ```{r} fil <- file('../inst/hitler_texts/hitler_words') Worte <- readLines(fil) hitlerwords <- tibble(Worte) ``` ## Analysis Now we extract the words that were used with higher frequency by one party and compare them with `hitlerwords`. ```{r} talks %>% left_join(speaker, by=c(speaker='id')) %>% group_by(fraction) %>% summarize(full_text=str_c(content, collapse="\n")) -> talks_by_fraction ``` For each party, we want to get a tibble of words with frequency. ```{r} #AfD Worte <- str_extract_all(talks_by_fraction$full_text[[1]], "\\b[a-zA-ZäöüÄÖÜß]+\\b")[[1]] afdtotal = length(Worte) tibble(Worte) %>% group_by(Worte) %>% count() %>% mutate(freq =n/afdtotal) -> afd_words #AfD&Fraktionslos Worte <- str_extract_all(talks_by_fraction$full_text[[2]], "\\b[a-zA-ZäöüÄÖÜß]+\\b")[[1]] afdundfraktionslostotal = length(Worte) tibble(Worte) %>% group_by(Worte) %>% count() %>% mutate(freq =n/afdundfraktionslostotal) -> afdundfraktionslos_words #BÜNDNIS 90 / DIE GRÜNEN Worte <- str_extract_all(talks_by_fraction$full_text[[3]], "\\b[a-zA-ZäöüÄÖÜß]+\\b")[[1]] grünetotal = length(Worte) tibble(Worte) %>% group_by(Worte) %>% count() %>% mutate(freq =n/grünetotal) -> grüne_words #CDU/CSU Worte <- str_extract_all(talks_by_fraction$full_text[[4]], "\\b[a-zA-ZäöüÄÖÜß]+\\b")[[1]] cdutotal = length(Worte) tibble(Worte) %>% group_by(Worte) %>% count() %>% mutate(freq =n/cdutotal) -> cdu_words #DIE LINKE Worte <- str_extract_all(talks_by_fraction$full_text[[5]], "\\b[a-zA-ZäöüÄÖÜß]+\\b")[[1]] linketotal = length(Worte) tibble(Worte) %>% group_by(Worte) %>% count() %>% mutate(freq =n/linketotal) -> linke_words #FDP Worte <- str_extract_all(talks_by_fraction$full_text[[6]], "\\b[a-zA-ZäöüÄÖÜß]+\\b")[[1]] fdptotal = length(Worte) tibble(Worte) %>% group_by(Worte) %>% count() %>% mutate(freq =n/fdptotal) -> fdp_words #Fraktionslos Worte <- str_extract_all(talks_by_fraction$full_text[[7]], "\\b[a-zA-ZäöüÄÖÜß]+\\b")[[1]] fraktionslostotal = length(Worte) tibble(Worte) %>% group_by(Worte) %>% count() %>% mutate(freq =n/fraktionslostotal) -> fraktionslos_words #SPD Worte <- str_extract_all(talks_by_fraction$full_text[[8]], "\\b[a-zA-ZäöüÄÖÜß]+\\b")[[1]] spdtotal = length(Worte) tibble(Worte) %>% group_by(Worte) %>% count() %>% mutate(freq =n/spdtotal) -> spd_words #NA Worte <- str_extract_all(talks_by_fraction$full_text[[9]], "\\b[a-zA-ZäöüÄÖÜß]+\\b")[[1]] natotal = length(Worte) tibble(Worte) %>% group_by(Worte) %>% count() %>% mutate(freq =n/natotal) -> na_words #alle all_words <- bind_rows(afd_words, afdundfraktionslos_words, grüne_words, cdu_words, linke_words, fdp_words, fraktionslos_words, spd_words, na_words) total <- sum(all_words$n) all_words %>% group_by(Worte) %>% summarize(n = sum(n), part= sum(n)/total) -> all_words ``` Now we want to extract the words that are more frequently used by a specific fraction. ```{r} afd_words %>% transmute(freq, fraction_n = n) %>% left_join(all_words) %>% transmute( fraction_freq = freq, total_freq = part, fraction_n, total_n = n, rel_quotient = fraction_freq/total_freq, abs_quotient = fraction_n/total_n) %>% arrange(-abs_quotient, -fraction_n) %>% filter(rel_quotient > 1) -> afd_high_frequent select(afd_high_frequent, fraction_n, total_n, abs_quotient, rel_quotient) %>% filter(total_n > 80) afdundfraktionslos_words %>% transmute(freq, fraction_n = n) %>% left_join(all_words) %>% transmute(fraction_freq = freq, total_freq = part, fraction_n, total_n = n, rel_quotient = fraction_freq/total_freq, abs_quotient = fraction_n/total_n) %>% arrange(-abs_quotient, -fraction_n) %>% filter(rel_quotient > 1) -> afdundfraktionslos_high_frequent select(afdundfraktionslos_high_frequent, fraction_n, total_n, abs_quotient, rel_quotient) %>% filter(total_n > 80) grüne_words %>% transmute(freq, fraction_n = n) %>% left_join(all_words) %>% transmute(fraction_freq = freq, total_freq = part, fraction_n, total_n = n, rel_quotient = fraction_freq/total_freq, abs_quotient = fraction_n/total_n) %>% arrange(-abs_quotient, -fraction_n) %>% filter(rel_quotient > 1) -> grüne_high_frequent select(grüne_high_frequent, fraction_n, total_n, abs_quotient, rel_quotient) %>% filter(total_n > 80) cdu_words %>% transmute(freq, fraction_n = n) %>% left_join(all_words) %>% transmute(fraction_freq = freq, total_freq = part, fraction_n, total_n = n, rel_quotient = fraction_freq/total_freq, abs_quotient = fraction_n/total_n) %>% arrange(-abs_quotient, -fraction_n) %>% filter(rel_quotient > 1) -> cdu_high_frequent select(cdu_high_frequent, fraction_n, total_n, abs_quotient, rel_quotient) %>% filter(total_n > 80) linke_words %>% transmute(freq, fraction_n = n) %>% left_join(all_words) %>% transmute(fraction_freq = freq, total_freq = part, fraction_n, total_n = n, rel_quotient = fraction_freq/total_freq, abs_quotient = fraction_n/total_n) %>% arrange(-abs_quotient, -fraction_n) %>% filter(rel_quotient > 1) -> linke_high_frequent select(linke_high_frequent, fraction_n, total_n, abs_quotient, rel_quotient) %>% filter(total_n > 80) fdp_words %>% transmute(freq, fraction_n = n) %>% left_join(all_words) %>% transmute(fraction_freq = freq, total_freq = part, fraction_n, total_n = n, rel_quotient = fraction_freq/total_freq, abs_quotient = fraction_n/total_n) %>% arrange(-abs_quotient, -fraction_n) %>% filter(rel_quotient > 1) -> fdp_high_frequent select(fdp_high_frequent, fraction_n, total_n, abs_quotient, rel_quotient) %>% filter(total_n > 80) fraktionslos_words %>% transmute(freq, fraction_n = n) %>% left_join(all_words) %>% transmute(fraction_freq = freq, total_freq = part, fraction_n, total_n = n, rel_quotient = fraction_freq/total_freq, abs_quotient = fraction_n/total_n) %>% arrange(-abs_quotient, -fraction_n) %>% filter(rel_quotient > 1) -> fraktionslos_high_frequent select(fraktionslos_high_frequent, fraction_n, total_n, abs_quotient, rel_quotient) %>% filter(total_n > 80) spd_words %>% transmute(freq, fraction_n = n) %>% left_join(all_words) %>% transmute(fraction_freq = freq, total_freq = part, fraction_n, total_n = n, rel_quotient = fraction_freq/total_freq, abs_quotient = fraction_n/total_n) %>% arrange(-abs_quotient, -fraction_n) %>% filter(rel_quotient > 1) -> spd_high_frequent select(spd_high_frequent, fraction_n, total_n, abs_quotient, rel_quotient) %>% filter(total_n > 80) na_words %>% transmute(freq, fraction_n = n) %>% left_join(all_words) %>% transmute(fraction_freq = freq, total_freq = part, fraction_n, total_n = n, rel_quotient = fraction_freq/total_freq, abs_quotient = fraction_n/total_n) %>% arrange(-abs_quotient, -fraction_n) %>% filter(rel_quotient > 1) -> na_high_frequent select(na_high_frequent, fraction_n, total_n, abs_quotient, rel_quotient) %>% filter(total_n > 80) ``` We compare these words with `hitlerwords`. ```{r} afd_high_frequent %>% mutate(Worte = str_to_lower(Worte)) %>% inner_join(hitlerwords) -> afd_hitler_comparison afdundfraktionslos_high_frequent %>% mutate(Worte = str_to_lower(Worte)) %>% inner_join(hitlerwords) -> afdundfraktionslos_hitler_comparison grüne_high_frequent %>% mutate(Worte = str_to_lower(Worte)) %>% inner_join(hitlerwords) -> grüne_hitler_comparison cdu_high_frequent %>% mutate(Worte = str_to_lower(Worte)) %>% inner_join(hitlerwords) -> cdu_hitler_comparison linke_high_frequent %>% mutate(Worte = str_to_lower(Worte)) %>% inner_join(hitlerwords) -> linke_hitler_comparison fdp_high_frequent %>% mutate(Worte = str_to_lower(Worte)) %>% inner_join(hitlerwords) -> fdp_hitler_comparison fraktionslos_high_frequent %>% mutate(Worte = str_to_lower(Worte)) %>% inner_join(hitlerwords) -> fraktionslos_hitler_comparison spd_high_frequent %>% mutate(Worte = str_to_lower(Worte)) %>% inner_join(hitlerwords) -> spd_hitler_comparison na_high_frequent %>% mutate(Worte = str_to_lower(Worte)) %>% inner_join(hitlerwords) -> na_hitler_comparison #not unique tibble(fraction = c("AfD", "AfD&Fraktionslos", "BÜNDNIS 90 / DIE GRÜNEN", "CDU/CSU", "DIE LINKE", "FDP", "Fraktionslos", "SPD"), absolute = c(nrow(afd_hitler_comparison), nrow(afdundfraktionslos_hitler_comparison), nrow(grüne_hitler_comparison), nrow(cdu_hitler_comparison), nrow(linke_hitler_comparison), nrow(fdp_hitler_comparison), nrow(fraktionslos_hitler_comparison), nrow(spd_hitler_comparison)), total = c(nrow(afd_words), nrow(afdundfraktionslos_words), nrow(grüne_words), nrow(cdu_words), nrow(linke_words), nrow(fdp_words), nrow(fraktionslos_words), nrow(spd_words)) ) %>% mutate(percent = 100*absolute/total) -> hitler_comparison hitler_comparison ``` Finally, we want to plot our results: ```{r, fig.width=7} bar_plot_fractions(hitler_comparison, y_variable = percent, title="Coincidence of party vocabulary with nazi vocabulary", ylab="unique 'nazi' words per total (unique) fraction words [%]") ```