From a0df02dbed7bd4a698c43475117161438af136e6 Mon Sep 17 00:00:00 2001 From: JosuaKugler Date: Tue, 3 Aug 2021 17:11:11 +0200 Subject: [PATCH] refactor fraktion -> fraction --- R/analyze.R | 12 +++---- R/parse.R | 34 +++++++++---------- R/repair.R | 14 ++++---- README.md | 2 +- vignettes/funwithdata.Rmd | 46 ++++++++++++------------- vignettes/hitlercomparison.Rmd | 62 +++++++++++++++++----------------- 6 files changed, 85 insertions(+), 85 deletions(-) diff --git a/R/analyze.R b/R/analyze.R index dce1547..fd728cd 100644 --- a/R/analyze.R +++ b/R/analyze.R @@ -6,9 +6,9 @@ find_word <- function(res, word) { } #' @export -join_speaker <- function(tb, res, fraktion_only = F) { +join_speaker <- function(tb, res, fraction_only = F) { joined <- left_join(tb, res$speaker, by=c("speaker" = "id")) - if (fraktion_only) select(joined, "fraktion") + if (fraction_only) select(joined, "fraction") else joined } @@ -30,9 +30,9 @@ party_order <- factor(c("Fraktionslos", "AfD&Fraktionslos", #' @export bar_plot_fractions <- function(tb, - x_variable = NULL, # default is fraktion + x_variable = NULL, # default is fraction y_variable = NULL, # default is n - fill = NULL, # default is fraktion + fill = NULL, # default is fraction title = NULL, xlab = "Fraction", ylab = "n", @@ -46,9 +46,9 @@ bar_plot_fractions <- function(tb, x_variable <- enexpr(x_variable) # set default values - if (is.null(fill)) fill <- expr(fraktion) + if (is.null(fill)) fill <- expr(fraction) if (is.null(y_variable)) y_variable <- expr(n) - if (is.null(x_variable)) x_variable <- expr(fraktion) + if (is.null(x_variable)) x_variable <- expr(fraction) # either reorder fraction factor by variable value if (reorder) maps <- aes(x = reorder(!!x_variable, -!!y_variable), diff --git a/R/parse.R b/R/parse.R index 7184779..53f73c6 100644 --- a/R/parse.R +++ b/R/parse.R @@ -42,13 +42,13 @@ read_all <- function(path="records/") { comments filter(commentsandapplause, type == "applause") %>% select(-type, -kommentator, 
-content) %>% - mutate("CDU_CSU" = str_detect(fraktion, "CDU/CSU"), - "SPD" = str_detect(fraktion, "SPD"), - "FDP" = str_detect(fraktion, "FDP"), - "DIE_LINKE" = str_detect(fraktion, "DIE LINKE"), - "BÜNDNIS_90_DIE_GRÜNEN" = str_detect(fraktion, "BÜNDNIS 90/DIE GRÜNEN"), - "AfD" = str_detect(fraktion, "AfD")) %>% - select(-fraktion) -> + mutate("CDU_CSU" = str_detect(fraction, "CDU/CSU"), + "SPD" = str_detect(fraction, "SPD"), + "FDP" = str_detect(fraction, "FDP"), + "DIE_LINKE" = str_detect(fraction, "DIE LINKE"), + "BÜNDNIS_90_DIE_GRÜNEN" = str_detect(fraction, "BÜNDNIS 90/DIE GRÜNEN"), + "AfD" = str_detect(fraction, "AfD")) %>% + select(-fraction) -> applause list(speaker = speaker, speeches = speeches, talks = talks, comments = comments, applause = applause) @@ -90,14 +90,14 @@ parse_speaker <- function(speaker_xml) { nm <- xml_child(speaker_xml) vorname <- xml_get(nm, "vorname") nachname <- xml_get(nm, "nachname") - fraktion <- xml_get(nm, "fraktion") + fraction <- xml_get(nm, "fraktion") titel <- xml_get(nm, "titel") rolle <- xml_find_all(nm, "rolle") if (length(rolle) > 0) { rolle_lang <- xml_get(rolle, "rolle_lang") rolle_kurz <- xml_get(rolle, "rolle_kurz") } else rolle_kurz <- rolle_lang <- NA_character_ - c(id = speaker_id, vorname = vorname, nachname = nachname, fraktion = fraktion, titel = titel, + c(id = speaker_id, vorname = vorname, nachname = nachname, fraction = fraction, titel = titel, rolle_kurz = rolle_kurz, rolle_lang = rolle_lang) } @@ -155,22 +155,22 @@ parse_speech <- function(speech_xml, date) { comments = comments) } -fraktionspattern <- "BÜNDNIS(SES)?\\W*90/DIE\\W*GRÜNEN|CDU/CSU|AfD|SPD|DIE LINKE|FDP|LINKEN" -fraktionsnames <- c("BÜNDNIS 90/DIE GRÜNEN", "CDU/CSU", "AfD", "SPD", "DIE LINKE", "FDP") +fractionpattern <- "BÜNDNIS(SES)?\\W*90/DIE\\W*GRÜNEN|CDU/CSU|AfD|SPD|DIE LINKE|FDP|LINKEN" +fractionnames <- c("BÜNDNIS 90/DIE GRÜNEN", "CDU/CSU", "AfD", "SPD", "DIE LINKE", "FDP") parse_comment <- function(comment, speech_id, on_speaker) { 
base <- c(speech_id = speech_id, on_speaker = on_speaker) # classify comment if(str_detect(comment, "Beifall")) { - str_extract_all(comment, fraktionspattern) %>% + str_extract_all(comment, fractionpattern) %>% `[[`(1) %>% - sapply(partial(flip(head), 1) %.% agrep, x=fraktionsnames, max=0.2, value=T) %>% + sapply(partial(flip(head), 1) %.% agrep, x=fractionnames, max=0.2, value=T) %>% str_c(collapse=",") -> by - c(base, type = "applause", fraktion = by, kommentator = NA_character_, content = comment) + c(base, type = "applause", fraction = by, kommentator = NA_character_, content = comment) } else { ps <- str_match(comment, "(.*) \\[(.*?)\\]: (.*)")[1,] - c(base, type = "comment", fraktion = ps[3], kommentator = ps[2], content = ps[4]) + c(base, type = "comment", fraction = ps[3], kommentator = ps[2], content = ps[4]) } } @@ -188,7 +188,7 @@ parse_speechlist <- function(speechlist_xml, date) { comments = tibble(speech_id = comments["speech_id",], on_speaker = comments["on_speaker",], type = comments["type",], - fraktion = comments["fraktion",], + fraction = comments["fraction",], kommentator = comments["kommentator",], content = comments["content", ])) } @@ -199,7 +199,7 @@ parse_speakerliste <- function(speakerliste_xml) { tibble(id = d["id",], vorname = d["vorname",], nachname = d["nachname",], - fraktion = d["fraktion",], + fraction = d["fraction",], titel = d["titel",], rolle_kurz = d["rolle_kurz",], rolle_lang = d["rolle_lang",]) diff --git a/R/repair.R b/R/repair.R index 522a903..cd7bb37 100644 --- a/R/repair.R +++ b/R/repair.R @@ -1,4 +1,4 @@ -fraktionen <- c("AFD" = "AfD", +fractions <- c("AFD" = "AfD", "BÜNDNIS90/" = "BÜNDNIS 90 / DIE GRÜNEN", "BÜNDNIS90/DIEGRÜNEN" = "BÜNDNIS 90 / DIE GRÜNEN", "FRAKTIONSLOS" = "Fraktionslos", @@ -7,9 +7,9 @@ fraktionen <- c("AFD" = "AfD", "CDU/CSU" = "CDU/CSU", "FDP" = "FDP") -repair_fraktion <- function(fraktion) { - cleaned <- str_to_upper %$% str_replace_all(fraktion, "\\s", "") - fraktionen[cleaned] +repair_fraction <- 
function(fraction) { + cleaned <- str_to_upper %$% str_replace_all(fraction, "\\s", "") + fractions[cleaned] } # takes vector of titel and keeps longest @@ -26,17 +26,17 @@ repair_speaker <- function(speaker) { if (nrow(speaker) == 0) return(speaker) speaker %>% filter(id != "10000") %>% # invalid id's - mutate(fraktion = Vectorize(repair_fraktion)(fraktion)) %>% # fix fraktion + mutate(fraction = Vectorize(repair_fraction)(fraction)) %>% # fix fraction group_by(id) %>% summarize(vorname = head(vorname, 1), nachname = head(nachname, 1), - fraktion = collect_unique(fraktion), + fraction = collect_unique(fraction), titel = longest_titel(titel), rolle_kurz = collect_unique(str_squish(rolle_kurz)), rolle_lang = collect_unique(str_squish(rolle_lang))) %>% ungroup() #%>% # arrange(id) %>% - # distinct(vorname, nachname, fraktion, titel) + # distinct(vorname, nachname, fraction, titel) } repair_speeches <- function(speeches) { diff --git a/README.md b/README.md index 4c30ea2..afb6964 100644 --- a/README.md +++ b/README.md @@ -44,7 +44,7 @@ parse.R parsed einzelne Protokolle und erstellt 3 Tibbles ### Redner -Struktur: `id` , `vorname` , `nachname` , `fraktion` , `titel` , `rolle_kurz`, `rolle_lang` +Struktur: `id` , `vorname` , `nachname` , `fraction` , `titel` , `rolle_kurz`, `rolle_lang` Die Rollen sind beispielsweise "Bundeskanzlerin". Leider gegendert und deshalb wahrscheinlich nervig zu analysieren. diff --git a/vignettes/funwithdata.Rmd b/vignettes/funwithdata.Rmd index 1e359fe..6b6c3e1 100644 --- a/vignettes/funwithdata.Rmd +++ b/vignettes/funwithdata.Rmd @@ -52,7 +52,7 @@ talks <- res$talks Now we can start analysing our parsed dataset, e.g. 
find out which party gives the most talks: ```{r, fig.width=7} join_speaker(res$speeches, res) %>% - group_by(fraktion) %>% + group_by(fraction) %>% summarize(n = n()) %>% arrange(n) %>% bar_plot_fractions(title="Number of speeches given by fraction", @@ -65,9 +65,9 @@ or counting the occurences of a given word: find_word(res, "Kohleausstieg") %>% filter(occurences > 0) %>% join_speaker(res) %>% - select(content, fraktion) %>% - filter(!is.na(fraktion)) %>% - group_by(fraktion) %>% + select(content, fraction) %>% + filter(!is.na(fraction)) %>% + group_by(fraction) %>% summarize(n = n()) %>% arrange(desc(n)) %>% bar_plot_fractions(title = "Parties using the word 'Kohleausstieg' the most (absolutely)", @@ -103,9 +103,9 @@ res$talks %>% ```{r} res$applause %>% left_join(res$speaker, by=c("on_speaker" = "id")) %>% - select(on_fraktion = fraktion, where(is.logical)) %>% - group_by(on_fraktion) %>% - arrange(on_fraktion) %>% + select(on_fraction = fraction, where(is.logical)) %>% + group_by(on_fraction) %>% + arrange(on_fraction) %>% summarize("AfD" = sum(`AfD`), "BÜNDNIS 90 / DIE GRÜNEN" = sum(`BÜNDNIS_90_DIE_GRÜNEN`), "CDU/CSU" = sum(`CDU_CSU`), @@ -117,11 +117,11 @@ res$applause %>% For plotting our results we reorganize them a bit and produce a bar plot: ```{r, fig.width=7} -pivot_longer(tb, where(is.numeric), "by_fraktion", "count") %>% - filter(!is.na(on_fraktion)) %>% - bar_plot_fractions(x_variable = on_fraktion, +pivot_longer(tb, where(is.numeric), "by_fraction", "count") %>% + filter(!is.na(on_fraction)) %>% + bar_plot_fractions(x_variable = on_fraction, y_variable = value, - fill = by_fraktion, + fill = by_fraction, title = "Number of rounds of applauses from fractions to fractions", xlab = "Applauded fraction", ylab = "Rounds of applauses", @@ -135,23 +135,23 @@ pivot_longer(tb, where(is.numeric), "by_fraktion", "count") %>% ```{r} res$comments %>% left_join(res$speaker, by=c("on_speaker" = "id")) %>% - select(by_fraktion = fraktion.x, on_fraktion = 
fraktion.y) %>% - group_by(on_fraktion) %>% - summarize(`AfD` = sum(str_detect(by_fraktion, "AfD"), na.rm=T), - `BÜNDNIS 90 / DIE GRÜNEN` = sum(str_detect(by_fraktion, "BÜNDNIS 90/DIE GRÜNEN"), na.rm=T), - `CDU/CSU` = sum(str_detect(by_fraktion, "CDU/CSU"), na.rm = T), - `DIE LINKE` = sum(str_detect(by_fraktion, "DIE LINKE"), na.rm=T), - `FDP` = sum(str_detect(by_fraktion, "FDP"), na.rm=T), - `SPD` = sum(str_detect(by_fraktion, "SPD"), na.rm=T)) -> tb + select(by_fraction = fraction.x, on_fraction = fraction.y) %>% + group_by(on_fraction) %>% + summarize(`AfD` = sum(str_detect(by_fraction, "AfD"), na.rm=T), + `BÜNDNIS 90 / DIE GRÜNEN` = sum(str_detect(by_fraction, "BÜNDNIS 90/DIE GRÜNEN"), na.rm=T), + `CDU/CSU` = sum(str_detect(by_fraction, "CDU/CSU"), na.rm = T), + `DIE LINKE` = sum(str_detect(by_fraction, "DIE LINKE"), na.rm=T), + `FDP` = sum(str_detect(by_fraction, "FDP"), na.rm=T), + `SPD` = sum(str_detect(by_fraction, "SPD"), na.rm=T)) -> tb ``` Analogously we plot the results: ```{r, fig.width=7} -pivot_longer(tb, where(is.numeric), "by_fraktion", "count") %>% - filter(!is.na(on_fraktion)) %>% - bar_plot_fractions(x_variable = on_fraktion, +pivot_longer(tb, where(is.numeric), "by_fraction", "count") %>% + filter(!is.na(on_fraction)) %>% + bar_plot_fractions(x_variable = on_fraction, y_variable = value, - fill = by_fraktion, + fill = by_fraction, title = "Number of comments from fractions to fractions", xlab = "Commented fraction", ylab = "Number of comments", diff --git a/vignettes/hitlercomparison.Rmd b/vignettes/hitlercomparison.Rmd index 4dee93c..13d5b01 100644 --- a/vignettes/hitlercomparison.Rmd +++ b/vignettes/hitlercomparison.Rmd @@ -61,53 +61,53 @@ Now we extract the words that were used with higher frequency by one party and c ```{r} talks %>% left_join(speaker, by=c(speaker='id')) %>% - group_by(fraktion) %>% - summarize(full_text=str_c(content, collapse="\n")) -> talks_by_fraktion + group_by(fraction) %>% + summarize(full_text=str_c(content, 
collapse="\n")) -> talks_by_fraction ``` For each party, we want to get a tibble of words with frequency. ```{r} #AfD -Worte <- str_extract_all(talks_by_fraktion$full_text[[1]], "\\b[a-zA-ZäöüÄÖÜß]+\\b")[[1]] +Worte <- str_extract_all(talks_by_fraction$full_text[[1]], "\\b[a-zA-ZäöüÄÖÜß]+\\b")[[1]] afdtotal = length(Worte) tibble(Worte) %>% group_by(Worte) %>% count() %>% mutate(freq =n/afdtotal) -> afd_words #AfD&Fraktionslos -Worte <- str_extract_all(talks_by_fraktion$full_text[[2]], "\\b[a-zA-ZäöüÄÖÜß]+\\b")[[1]] +Worte <- str_extract_all(talks_by_fraction$full_text[[2]], "\\b[a-zA-ZäöüÄÖÜß]+\\b")[[1]] afdundfraktionslostotal = length(Worte) tibble(Worte) %>% group_by(Worte) %>% count() %>% mutate(freq =n/afdundfraktionslostotal) -> afdundfraktionslos_words #BÜNDNIS 90 / DIE GRÜNEN -Worte <- str_extract_all(talks_by_fraktion$full_text[[3]], "\\b[a-zA-ZäöüÄÖÜß]+\\b")[[1]] +Worte <- str_extract_all(talks_by_fraction$full_text[[3]], "\\b[a-zA-ZäöüÄÖÜß]+\\b")[[1]] grünetotal = length(Worte) tibble(Worte) %>% group_by(Worte) %>% count() %>% mutate(freq =n/grünetotal) -> grüne_words #CDU/CSU -Worte <- str_extract_all(talks_by_fraktion$full_text[[4]], "\\b[a-zA-ZäöüÄÖÜß]+\\b")[[1]] +Worte <- str_extract_all(talks_by_fraction$full_text[[4]], "\\b[a-zA-ZäöüÄÖÜß]+\\b")[[1]] cdutotal = length(Worte) tibble(Worte) %>% group_by(Worte) %>% count() %>% mutate(freq =n/cdutotal) -> cdu_words #DIE LINKE -Worte <- str_extract_all(talks_by_fraktion$full_text[[5]], "\\b[a-zA-ZäöüÄÖÜß]+\\b")[[1]] +Worte <- str_extract_all(talks_by_fraction$full_text[[5]], "\\b[a-zA-ZäöüÄÖÜß]+\\b")[[1]] linketotal = length(Worte) tibble(Worte) %>% group_by(Worte) %>% count() %>% mutate(freq =n/linketotal) -> linke_words #FDP -Worte <- str_extract_all(talks_by_fraktion$full_text[[6]], "\\b[a-zA-ZäöüÄÖÜß]+\\b")[[1]] +Worte <- str_extract_all(talks_by_fraction$full_text[[6]], "\\b[a-zA-ZäöüÄÖÜß]+\\b")[[1]] fdptotal = length(Worte) tibble(Worte) %>% group_by(Worte) %>% count() %>% mutate(freq =n/fdptotal) 
-> fdp_words #Fraktionslos -Worte <- str_extract_all(talks_by_fraktion$full_text[[7]], "\\b[a-zA-ZäöüÄÖÜß]+\\b")[[1]] +Worte <- str_extract_all(talks_by_fraction$full_text[[7]], "\\b[a-zA-ZäöüÄÖÜß]+\\b")[[1]] fraktionslostotal = length(Worte) tibble(Worte) %>% group_by(Worte) %>% count() %>% mutate(freq =n/fraktionslostotal) -> fraktionslos_words #SPD -Worte <- str_extract_all(talks_by_fraktion$full_text[[8]], "\\b[a-zA-ZäöüÄÖÜß]+\\b")[[1]] +Worte <- str_extract_all(talks_by_fraction$full_text[[8]], "\\b[a-zA-ZäöüÄÖÜß]+\\b")[[1]] spdtotal = length(Worte) tibble(Worte) %>% group_by(Worte) %>% count() %>% mutate(freq =n/spdtotal) -> spd_words #NA -Worte <- str_extract_all(talks_by_fraktion$full_text[[9]], "\\b[a-zA-ZäöüÄÖÜß]+\\b")[[1]] +Worte <- str_extract_all(talks_by_fraction$full_text[[9]], "\\b[a-zA-ZäöüÄÖÜß]+\\b")[[1]] natotal = length(Worte) tibble(Worte) %>% group_by(Worte) %>% count() %>% mutate(freq =n/natotal) -> na_words @@ -117,34 +117,34 @@ total <- sum(all_words$n) all_words %>% group_by(Worte) %>% summarize(n = sum(n), part= sum(n)/total) -> all_words ``` -Now we want to extract the words that are more frequently used by a specific `fraktion`. +Now we want to extract the words that are more frequently used by a specific fraction. 
```{r} -afd_words %>% transmute(freq, fraktion_n = n) %>% left_join(all_words) %>% transmute(fraktion_freq = freq, total_freq = part, fraktion_n, total_n = n, rel_quotient = fraktion_freq/total_freq, abs_quotient = fraktion_n/total_n) %>% arrange(-abs_quotient, -fraktion_n) %>% filter(rel_quotient > 1) -> afd_high_frequent -select(afd_high_frequent, fraktion_n, total_n, abs_quotient, rel_quotient) %>% filter(total_n > 80) +afd_words %>% transmute(freq, fraction_n = n) %>% left_join(all_words) %>% transmute(fraction_freq = freq, total_freq = part, fraction_n, total_n = n, rel_quotient = fraction_freq/total_freq, abs_quotient = fraction_n/total_n) %>% arrange(-abs_quotient, -fraction_n) %>% filter(rel_quotient > 1) -> afd_high_frequent +select(afd_high_frequent, fraction_n, total_n, abs_quotient, rel_quotient) %>% filter(total_n > 80) -afdundfraktionslos_words %>% transmute(freq, fraktion_n = n) %>% left_join(all_words) %>% transmute(fraktion_freq = freq, total_freq = part, fraktion_n, total_n = n, rel_quotient = fraktion_freq/total_freq, abs_quotient = fraktion_n/total_n) %>% arrange(-abs_quotient, -fraktion_n) %>% filter(rel_quotient > 1) -> afdundfraktionslos_high_frequent -select(afdundfraktionslos_high_frequent, fraktion_n, total_n, abs_quotient, rel_quotient) %>% filter(total_n > 80) +afdundfraktionslos_words %>% transmute(freq, fraction_n = n) %>% left_join(all_words) %>% transmute(fraction_freq = freq, total_freq = part, fraction_n, total_n = n, rel_quotient = fraction_freq/total_freq, abs_quotient = fraction_n/total_n) %>% arrange(-abs_quotient, -fraction_n) %>% filter(rel_quotient > 1) -> afdundfraktionslos_high_frequent +select(afdundfraktionslos_high_frequent, fraction_n, total_n, abs_quotient, rel_quotient) %>% filter(total_n > 80) -grüne_words %>% transmute(freq, fraktion_n = n) %>% left_join(all_words) %>% transmute(fraktion_freq = freq, total_freq = part, fraktion_n, total_n = n, rel_quotient = fraktion_freq/total_freq, abs_quotient = 
fraktion_n/total_n) %>% arrange(-abs_quotient, -fraktion_n) %>% filter(rel_quotient > 1) -> grüne_high_frequent -select(grüne_high_frequent, fraktion_n, total_n, abs_quotient, rel_quotient) %>% filter(total_n > 80) +grüne_words %>% transmute(freq, fraction_n = n) %>% left_join(all_words) %>% transmute(fraction_freq = freq, total_freq = part, fraction_n, total_n = n, rel_quotient = fraction_freq/total_freq, abs_quotient = fraction_n/total_n) %>% arrange(-abs_quotient, -fraction_n) %>% filter(rel_quotient > 1) -> grüne_high_frequent +select(grüne_high_frequent, fraction_n, total_n, abs_quotient, rel_quotient) %>% filter(total_n > 80) -cdu_words %>% transmute(freq, fraktion_n = n) %>% left_join(all_words) %>% transmute(fraktion_freq = freq, total_freq = part, fraktion_n, total_n = n, rel_quotient = fraktion_freq/total_freq, abs_quotient = fraktion_n/total_n) %>% arrange(-abs_quotient, -fraktion_n) %>% filter(rel_quotient > 1) -> cdu_high_frequent -select(cdu_high_frequent, fraktion_n, total_n, abs_quotient, rel_quotient) %>% filter(total_n > 80) +cdu_words %>% transmute(freq, fraction_n = n) %>% left_join(all_words) %>% transmute(fraction_freq = freq, total_freq = part, fraction_n, total_n = n, rel_quotient = fraction_freq/total_freq, abs_quotient = fraction_n/total_n) %>% arrange(-abs_quotient, -fraction_n) %>% filter(rel_quotient > 1) -> cdu_high_frequent +select(cdu_high_frequent, fraction_n, total_n, abs_quotient, rel_quotient) %>% filter(total_n > 80) -linke_words %>% transmute(freq, fraktion_n = n) %>% left_join(all_words) %>% transmute(fraktion_freq = freq, total_freq = part, fraktion_n, total_n = n, rel_quotient = fraktion_freq/total_freq, abs_quotient = fraktion_n/total_n) %>% arrange(-abs_quotient, -fraktion_n) %>% filter(rel_quotient > 1) -> linke_high_frequent -select(linke_high_frequent, fraktion_n, total_n, abs_quotient, rel_quotient) %>% filter(total_n > 80) +linke_words %>% transmute(freq, fraction_n = n) %>% left_join(all_words) %>% 
transmute(fraction_freq = freq, total_freq = part, fraction_n, total_n = n, rel_quotient = fraction_freq/total_freq, abs_quotient = fraction_n/total_n) %>% arrange(-abs_quotient, -fraction_n) %>% filter(rel_quotient > 1) -> linke_high_frequent +select(linke_high_frequent, fraction_n, total_n, abs_quotient, rel_quotient) %>% filter(total_n > 80) -fdp_words %>% transmute(freq, fraktion_n = n) %>% left_join(all_words) %>% transmute(fraktion_freq = freq, total_freq = part, fraktion_n, total_n = n, rel_quotient = fraktion_freq/total_freq, abs_quotient = fraktion_n/total_n) %>% arrange(-abs_quotient, -fraktion_n) %>% filter(rel_quotient > 1) -> fdp_high_frequent -select(fdp_high_frequent, fraktion_n, total_n, abs_quotient, rel_quotient) %>% filter(total_n > 80) +fdp_words %>% transmute(freq, fraction_n = n) %>% left_join(all_words) %>% transmute(fraction_freq = freq, total_freq = part, fraction_n, total_n = n, rel_quotient = fraction_freq/total_freq, abs_quotient = fraction_n/total_n) %>% arrange(-abs_quotient, -fraction_n) %>% filter(rel_quotient > 1) -> fdp_high_frequent +select(fdp_high_frequent, fraction_n, total_n, abs_quotient, rel_quotient) %>% filter(total_n > 80) -fraktionslos_words %>% transmute(freq, fraktion_n = n) %>% left_join(all_words) %>% transmute(fraktion_freq = freq, total_freq = part, fraktion_n, total_n = n, rel_quotient = fraktion_freq/total_freq, abs_quotient = fraktion_n/total_n) %>% arrange(-abs_quotient, -fraktion_n) %>% filter(rel_quotient > 1) -> fraktionslos_high_frequent -select(fraktionslos_high_frequent, fraktion_n, total_n, abs_quotient, rel_quotient) %>% filter(total_n > 80) +fraktionslos_words %>% transmute(freq, fraction_n = n) %>% left_join(all_words) %>% transmute(fraction_freq = freq, total_freq = part, fraction_n, total_n = n, rel_quotient = fraction_freq/total_freq, abs_quotient = fraction_n/total_n) %>% arrange(-abs_quotient, -fraction_n) %>% filter(rel_quotient > 1) -> fraktionslos_high_frequent 
+select(fraktionslos_high_frequent, fraction_n, total_n, abs_quotient, rel_quotient) %>% filter(total_n > 80) -spd_words %>% transmute(freq, fraktion_n = n) %>% left_join(all_words) %>% transmute(fraktion_freq = freq, total_freq = part, fraktion_n, total_n = n, rel_quotient = fraktion_freq/total_freq, abs_quotient = fraktion_n/total_n) %>% arrange(-abs_quotient, -fraktion_n) %>% filter(rel_quotient > 1) -> spd_high_frequent -select(spd_high_frequent, fraktion_n, total_n, abs_quotient, rel_quotient) %>% filter(total_n > 80) +spd_words %>% transmute(freq, fraction_n = n) %>% left_join(all_words) %>% transmute(fraction_freq = freq, total_freq = part, fraction_n, total_n = n, rel_quotient = fraction_freq/total_freq, abs_quotient = fraction_n/total_n) %>% arrange(-abs_quotient, -fraction_n) %>% filter(rel_quotient > 1) -> spd_high_frequent +select(spd_high_frequent, fraction_n, total_n, abs_quotient, rel_quotient) %>% filter(total_n > 80) -na_words %>% transmute(freq, fraktion_n = n) %>% left_join(all_words) %>% transmute(fraktion_freq = freq, total_freq = part, fraktion_n, total_n = n, rel_quotient = fraktion_freq/total_freq, abs_quotient = fraktion_n/total_n) %>% arrange(-abs_quotient, -fraktion_n) %>% filter(rel_quotient > 1) -> na_high_frequent -select(na_high_frequent, fraktion_n, total_n, abs_quotient, rel_quotient) %>% filter(total_n > 80) +na_words %>% transmute(freq, fraction_n = n) %>% left_join(all_words) %>% transmute(fraction_freq = freq, total_freq = part, fraction_n, total_n = n, rel_quotient = fraction_freq/total_freq, abs_quotient = fraction_n/total_n) %>% arrange(-abs_quotient, -fraction_n) %>% filter(rel_quotient > 1) -> na_high_frequent +select(na_high_frequent, fraction_n, total_n, abs_quotient, rel_quotient) %>% filter(total_n > 80) ``` We compare these words with `hitlerwords`. 
@@ -161,7 +161,7 @@ spd_high_frequent %>% mutate(Worte = str_to_lower(Worte)) %>% inner_join(hitlerw na_high_frequent %>% mutate(Worte = str_to_lower(Worte)) %>% inner_join(hitlerwords) -> na_hitler_comparison #not unique -tibble(fraktion = c("AfD", "AfD&Fraktionslos", "BÜNDNIS 90 / DIE GRÜNEN", "CDU/CSU", "DIE LINKE", "FDP", "Fraktionslos", "SPD"), +tibble(fraction = c("AfD", "AfD&Fraktionslos", "BÜNDNIS 90 / DIE GRÜNEN", "CDU/CSU", "DIE LINKE", "FDP", "Fraktionslos", "SPD"), absolute = c(nrow(afd_hitler_comparison), nrow(afdundfraktionslos_hitler_comparison), nrow(grüne_hitler_comparison), nrow(cdu_hitler_comparison), nrow(linke_hitler_comparison), nrow(fdp_hitler_comparison), nrow(fraktionslos_hitler_comparison), nrow(spd_hitler_comparison)), total = c(nrow(afd_words), nrow(afdundfraktionslos_words), nrow(grüne_words), nrow(cdu_words), nrow(linke_words), nrow(fdp_words), nrow(fraktionslos_words), nrow(spd_words)) ) %>% mutate(percent = 100*absolute/total) -> hitler_comparison