Просмотр исходного кода

refactor fraktion -> fraction

genderequality-alternative
JosuaKugler 4 лет назад
Родитель
Коммит
a0df02dbed
6 измененных файлов: 85 добавлений и 85 удалений
  1. +6
    -6
      R/analyze.R
  2. +17
    -17
      R/parse.R
  3. +7
    -7
      R/repair.R
  4. +1
    -1
      README.md
  5. +23
    -23
      vignettes/funwithdata.Rmd
  6. +31
    -31
      vignettes/hitlercomparison.Rmd

+ 6
- 6
R/analyze.R Просмотреть файл

@@ -6,9 +6,9 @@ find_word <- function(res, word) {
} }


#' @export #' @export
join_speaker <- function(tb, res, fraktion_only = F) {
join_speaker <- function(tb, res, fraction_only = F) {
joined <- left_join(tb, res$speaker, by=c("speaker" = "id")) joined <- left_join(tb, res$speaker, by=c("speaker" = "id"))
if (fraktion_only) select(joined, "fraktion")
if (fraction_only) select(joined, "fraction")
else joined else joined
} }


@@ -30,9 +30,9 @@ party_order <- factor(c("Fraktionslos", "AfD&Fraktionslos",


#' @export #' @export
bar_plot_fractions <- function(tb, bar_plot_fractions <- function(tb,
x_variable = NULL, # default is fraktion
x_variable = NULL, # default is fraction
y_variable = NULL, # default is n y_variable = NULL, # default is n
fill = NULL, # default is fraktion
fill = NULL, # default is fraction
title = NULL, title = NULL,
xlab = "Fraction", xlab = "Fraction",
ylab = "n", ylab = "n",
@@ -46,9 +46,9 @@ bar_plot_fractions <- function(tb,
x_variable <- enexpr(x_variable) x_variable <- enexpr(x_variable)


# set default values # set default values
if (is.null(fill)) fill <- expr(fraktion)
if (is.null(fill)) fill <- expr(fraction)
if (is.null(y_variable)) y_variable <- expr(n) if (is.null(y_variable)) y_variable <- expr(n)
if (is.null(x_variable)) x_variable <- expr(fraktion)
if (is.null(x_variable)) x_variable <- expr(fraction)


# either reorder fraction factor by variable value # either reorder fraction factor by variable value
if (reorder) maps <- aes(x = reorder(!!x_variable, -!!y_variable), if (reorder) maps <- aes(x = reorder(!!x_variable, -!!y_variable),


+ 17
- 17
R/parse.R Просмотреть файл

@@ -42,13 +42,13 @@ read_all <- function(path="records/") {
comments comments
filter(commentsandapplause, type == "applause") %>% filter(commentsandapplause, type == "applause") %>%
select(-type, -kommentator, -content) %>% select(-type, -kommentator, -content) %>%
mutate("CDU_CSU" = str_detect(fraktion, "CDU/CSU"),
"SPD" = str_detect(fraktion, "SPD"),
"FDP" = str_detect(fraktion, "FDP"),
"DIE_LINKE" = str_detect(fraktion, "DIE LINKE"),
"BÜNDNIS_90_DIE_GRÜNEN" = str_detect(fraktion, "BÜNDNIS 90/DIE GRÜNEN"),
"AfD" = str_detect(fraktion, "AfD")) %>%
select(-fraktion) ->
mutate("CDU_CSU" = str_detect(fraction, "CDU/CSU"),
"SPD" = str_detect(fraction, "SPD"),
"FDP" = str_detect(fraction, "FDP"),
"DIE_LINKE" = str_detect(fraction, "DIE LINKE"),
"BÜNDNIS_90_DIE_GRÜNEN" = str_detect(fraction, "BÜNDNIS 90/DIE GRÜNEN"),
"AfD" = str_detect(fraction, "AfD")) %>%
select(-fraction) ->
applause applause


list(speaker = speaker, speeches = speeches, talks = talks, comments = comments, applause = applause) list(speaker = speaker, speeches = speeches, talks = talks, comments = comments, applause = applause)
@@ -90,14 +90,14 @@ parse_speaker <- function(speaker_xml) {
nm <- xml_child(speaker_xml) nm <- xml_child(speaker_xml)
vorname <- xml_get(nm, "vorname") vorname <- xml_get(nm, "vorname")
nachname <- xml_get(nm, "nachname") nachname <- xml_get(nm, "nachname")
fraktion <- xml_get(nm, "fraktion")
fraction <- xml_get(nm, "fraction")
titel <- xml_get(nm, "titel") titel <- xml_get(nm, "titel")
rolle <- xml_find_all(nm, "rolle") rolle <- xml_find_all(nm, "rolle")
if (length(rolle) > 0) { if (length(rolle) > 0) {
rolle_lang <- xml_get(rolle, "rolle_lang") rolle_lang <- xml_get(rolle, "rolle_lang")
rolle_kurz <- xml_get(rolle, "rolle_kurz") rolle_kurz <- xml_get(rolle, "rolle_kurz")
} else rolle_kurz <- rolle_lang <- NA_character_ } else rolle_kurz <- rolle_lang <- NA_character_
c(id = speaker_id, vorname = vorname, nachname = nachname, fraktion = fraktion, titel = titel,
c(id = speaker_id, vorname = vorname, nachname = nachname, fraction = fraction, titel = titel,
rolle_kurz = rolle_kurz, rolle_lang = rolle_lang) rolle_kurz = rolle_kurz, rolle_lang = rolle_lang)
} }


@@ -155,22 +155,22 @@ parse_speech <- function(speech_xml, date) {
comments = comments) comments = comments)
} }


fraktionspattern <- "BÜNDNIS(SES)?\\W*90/DIE\\W*GRÜNEN|CDU/CSU|AfD|SPD|DIE LINKE|FDP|LINKEN"
fraktionsnames <- c("BÜNDNIS 90/DIE GRÜNEN", "CDU/CSU", "AfD", "SPD", "DIE LINKE", "FDP")
fractionpattern <- "BÜNDNIS(SES)?\\W*90/DIE\\W*GRÜNEN|CDU/CSU|AfD|SPD|DIE LINKE|FDP|LINKEN"
fractionnames <- c("BÜNDNIS 90/DIE GRÜNEN", "CDU/CSU", "AfD", "SPD", "DIE LINKE", "FDP")


parse_comment <- function(comment, speech_id, on_speaker) { parse_comment <- function(comment, speech_id, on_speaker) {
base <- c(speech_id = speech_id, on_speaker = on_speaker) base <- c(speech_id = speech_id, on_speaker = on_speaker)
# classify comment # classify comment
if(str_detect(comment, "Beifall")) { if(str_detect(comment, "Beifall")) {
str_extract_all(comment, fraktionspattern) %>%
str_extract_all(comment, fractionpattern) %>%
`[[`(1) %>% `[[`(1) %>%
sapply(partial(flip(head), 1) %.% agrep, x=fraktionsnames, max=0.2, value=T) %>%
sapply(partial(flip(head), 1) %.% agrep, x=fractionnames, max=0.2, value=T) %>%
str_c(collapse=",") -> str_c(collapse=",") ->
by by
c(base, type = "applause", fraktion = by, kommentator = NA_character_, content = comment)
c(base, type = "applause", fraction = by, kommentator = NA_character_, content = comment)
} else { } else {
ps <- str_match(comment, "(.*) \\[(.*?)\\]: (.*)")[1,] ps <- str_match(comment, "(.*) \\[(.*?)\\]: (.*)")[1,]
c(base, type = "comment", fraktion = ps[3], kommentator = ps[2], content = ps[4])
c(base, type = "comment", fraction = ps[3], kommentator = ps[2], content = ps[4])
} }
} }


@@ -188,7 +188,7 @@ parse_speechlist <- function(speechlist_xml, date) {
comments = tibble(speech_id = comments["speech_id",], comments = tibble(speech_id = comments["speech_id",],
on_speaker = comments["on_speaker",], on_speaker = comments["on_speaker",],
type = comments["type",], type = comments["type",],
fraktion = comments["fraktion",],
fraction = comments["fraction",],
kommentator = comments["kommentator",], kommentator = comments["kommentator",],
content = comments["content", ])) content = comments["content", ]))
} }
@@ -199,7 +199,7 @@ parse_speakerliste <- function(speakerliste_xml) {
tibble(id = d["id",], tibble(id = d["id",],
vorname = d["vorname",], vorname = d["vorname",],
nachname = d["nachname",], nachname = d["nachname",],
fraktion = d["fraktion",],
fraction = d["fraction",],
titel = d["titel",], titel = d["titel",],
rolle_kurz = d["rolle_kurz",], rolle_kurz = d["rolle_kurz",],
rolle_lang = d["rolle_lang",]) rolle_lang = d["rolle_lang",])


+ 7
- 7
R/repair.R Просмотреть файл

@@ -1,4 +1,4 @@
fraktionen <- c("AFD" = "AfD",
fractions <- c("AFD" = "AfD",
"BÜNDNIS90/" = "BÜNDNIS 90 / DIE GRÜNEN", "BÜNDNIS90/" = "BÜNDNIS 90 / DIE GRÜNEN",
"BÜNDNIS90/DIEGRÜNEN" = "BÜNDNIS 90 / DIE GRÜNEN", "BÜNDNIS90/DIEGRÜNEN" = "BÜNDNIS 90 / DIE GRÜNEN",
"FRAKTIONSLOS" = "Fraktionslos", "FRAKTIONSLOS" = "Fraktionslos",
@@ -7,9 +7,9 @@ fraktionen <- c("AFD" = "AfD",
"CDU/CSU" = "CDU/CSU", "CDU/CSU" = "CDU/CSU",
"FDP" = "FDP") "FDP" = "FDP")


repair_fraktion <- function(fraktion) {
cleaned <- str_to_upper %$% str_replace_all(fraktion, "\\s", "")
fraktionen[cleaned]
repair_fraction <- function(fraction) {
cleaned <- str_to_upper %$% str_replace_all(fraction, "\\s", "")
fractions[cleaned]
} }


# takes vector of titel and keeps longest # takes vector of titel and keeps longest
@@ -26,17 +26,17 @@ repair_speaker <- function(speaker) {
if (nrow(speaker) == 0) return(speaker) if (nrow(speaker) == 0) return(speaker)
speaker %>% speaker %>%
filter(id != "10000") %>% # invalid id's filter(id != "10000") %>% # invalid id's
mutate(fraktion = Vectorize(repair_fraktion)(fraktion)) %>% # fix fraktion
mutate(fraction = Vectorize(repair_fraction)(fraction)) %>% # fix fraction
group_by(id) %>% group_by(id) %>%
summarize(vorname = head(vorname, 1), summarize(vorname = head(vorname, 1),
nachname = head(nachname, 1), nachname = head(nachname, 1),
fraktion = collect_unique(fraktion),
fraction = collect_unique(fraction),
titel = longest_titel(titel), titel = longest_titel(titel),
rolle_kurz = collect_unique(str_squish(rolle_kurz)), rolle_kurz = collect_unique(str_squish(rolle_kurz)),
rolle_lang = collect_unique(str_squish(rolle_lang))) %>% rolle_lang = collect_unique(str_squish(rolle_lang))) %>%
ungroup() #%>% ungroup() #%>%
# arrange(id) %>% # arrange(id) %>%
# distinct(vorname, nachname, fraktion, titel)
# distinct(vorname, nachname, fraction, titel)
} }


repair_speeches <- function(speeches) { repair_speeches <- function(speeches) {


+ 1
- 1
README.md Просмотреть файл

@@ -44,7 +44,7 @@ parse.R parsed einzelne Protokolle und erstellt 3 Tibbles


### Redner ### Redner


Struktur: `id` , `vorname` , `nachname` , `fraktion` , `titel` , `rolle_kurz`, `rolle_lang`
Struktur: `id` , `vorname` , `nachname` , `fraction` , `titel` , `rolle_kurz`, `rolle_lang`


Die Rollen sind beispielsweise "Bundeskanzlerin". Leider gegendert und deshalb wahrscheinlich Die Rollen sind beispielsweise "Bundeskanzlerin". Leider gegendert und deshalb wahrscheinlich
nervig zu analysieren. nervig zu analysieren.


+ 23
- 23
vignettes/funwithdata.Rmd Просмотреть файл

@@ -52,7 +52,7 @@ talks <- res$talks
Now we can start analysing our parsed dataset, e.g. find out which party gives the most talks: Now we can start analysing our parsed dataset, e.g. find out which party gives the most talks:
```{r, fig.width=7} ```{r, fig.width=7}
join_speaker(res$speeches, res) %>% join_speaker(res$speeches, res) %>%
group_by(fraktion) %>%
group_by(fraction) %>%
summarize(n = n()) %>% summarize(n = n()) %>%
arrange(n) %>% arrange(n) %>%
bar_plot_fractions(title="Number of speeches given by fraction", bar_plot_fractions(title="Number of speeches given by fraction",
@@ -65,9 +65,9 @@ or counting the occurences of a given word:
find_word(res, "Kohleausstieg") %>% find_word(res, "Kohleausstieg") %>%
filter(occurences > 0) %>% filter(occurences > 0) %>%
join_speaker(res) %>% join_speaker(res) %>%
select(content, fraktion) %>%
filter(!is.na(fraktion)) %>%
group_by(fraktion) %>%
select(content, fraction) %>%
filter(!is.na(fraction)) %>%
group_by(fraction) %>%
summarize(n = n()) %>% summarize(n = n()) %>%
arrange(desc(n)) %>% arrange(desc(n)) %>%
bar_plot_fractions(title = "Parties using the word 'Kohleausstieg' the most (absolutely)", bar_plot_fractions(title = "Parties using the word 'Kohleausstieg' the most (absolutely)",
@@ -103,9 +103,9 @@ res$talks %>%
```{r} ```{r}
res$applause %>% res$applause %>%
left_join(res$speaker, by=c("on_speaker" = "id")) %>% left_join(res$speaker, by=c("on_speaker" = "id")) %>%
select(on_fraktion = fraktion, where(is.logical)) %>%
group_by(on_fraktion) %>%
arrange(on_fraktion) %>%
select(on_fraction = fraction, where(is.logical)) %>%
group_by(on_fraction) %>%
arrange(on_fraction) %>%
summarize("AfD" = sum(`AfD`), summarize("AfD" = sum(`AfD`),
"BÜNDNIS 90 / DIE GRÜNEN" = sum(`BÜNDNIS_90_DIE_GRÜNEN`), "BÜNDNIS 90 / DIE GRÜNEN" = sum(`BÜNDNIS_90_DIE_GRÜNEN`),
"CDU/CSU" = sum(`CDU_CSU`), "CDU/CSU" = sum(`CDU_CSU`),
@@ -117,11 +117,11 @@ res$applause %>%
For plotting our results we reorganize them a bit and produce a bar plot: For plotting our results we reorganize them a bit and produce a bar plot:


```{r, fig.width=7} ```{r, fig.width=7}
pivot_longer(tb, where(is.numeric), "by_fraktion", "count") %>%
filter(!is.na(on_fraktion)) %>%
bar_plot_fractions(x_variable = on_fraktion,
pivot_longer(tb, where(is.numeric), "by_fraction", "count") %>%
filter(!is.na(on_fraction)) %>%
bar_plot_fractions(x_variable = on_fraction,
y_variable = value, y_variable = value,
fill = by_fraktion,
fill = by_fraction,
title = "Number of rounds of applauses from fractions to fractions", title = "Number of rounds of applauses from fractions to fractions",
xlab = "Applauded fraction", xlab = "Applauded fraction",
ylab = "Rounds of applauses", ylab = "Rounds of applauses",
@@ -135,23 +135,23 @@ pivot_longer(tb, where(is.numeric), "by_fraktion", "count") %>%
```{r} ```{r}
res$comments %>% res$comments %>%
left_join(res$speaker, by=c("on_speaker" = "id")) %>% left_join(res$speaker, by=c("on_speaker" = "id")) %>%
select(by_fraktion = fraktion.x, on_fraktion = fraktion.y) %>%
group_by(on_fraktion) %>%
summarize(`AfD` = sum(str_detect(by_fraktion, "AfD"), na.rm=T),
`BÜNDNIS 90 / DIE GRÜNEN` = sum(str_detect(by_fraktion, "BÜNDNIS 90/DIE GRÜNEN"), na.rm=T),
`CDU/CSU` = sum(str_detect(by_fraktion, "CDU/CSU"), na.rm = T),
`DIE LINKE` = sum(str_detect(by_fraktion, "DIE LINKE"), na.rm=T),
`FDP` = sum(str_detect(by_fraktion, "FDP"), na.rm=T),
`SPD` = sum(str_detect(by_fraktion, "SPD"), na.rm=T)) -> tb
select(by_fraction = fraction.x, on_fraction = fraction.y) %>%
group_by(on_fraction) %>%
summarize(`AfD` = sum(str_detect(by_fraction, "AfD"), na.rm=T),
`BÜNDNIS 90 / DIE GRÜNEN` = sum(str_detect(by_fraction, "BÜNDNIS 90/DIE GRÜNEN"), na.rm=T),
`CDU/CSU` = sum(str_detect(by_fraction, "CDU/CSU"), na.rm = T),
`DIE LINKE` = sum(str_detect(by_fraction, "DIE LINKE"), na.rm=T),
`FDP` = sum(str_detect(by_fraction, "FDP"), na.rm=T),
`SPD` = sum(str_detect(by_fraction, "SPD"), na.rm=T)) -> tb
``` ```
Analogously we plot the results: Analogously we plot the results:


```{r, fig.width=7} ```{r, fig.width=7}
pivot_longer(tb, where(is.numeric), "by_fraktion", "count") %>%
filter(!is.na(on_fraktion)) %>%
bar_plot_fractions(x_variable = on_fraktion,
pivot_longer(tb, where(is.numeric), "by_fraction", "count") %>%
filter(!is.na(on_fraction)) %>%
bar_plot_fractions(x_variable = on_fraction,
y_variable = value, y_variable = value,
fill = by_fraktion,
fill = by_fraction,
title = "Number of comments from fractions to fractions", title = "Number of comments from fractions to fractions",
xlab = "Commented fraction", xlab = "Commented fraction",
ylab = "Number of comments", ylab = "Number of comments",


+ 31
- 31
vignettes/hitlercomparison.Rmd Просмотреть файл

@@ -61,53 +61,53 @@ Now we extract the words that were used with higher frequency by one party and c
```{r} ```{r}
talks %>% talks %>%
left_join(speaker, by=c(speaker='id')) %>% left_join(speaker, by=c(speaker='id')) %>%
group_by(fraktion) %>%
summarize(full_text=str_c(content, collapse="\n")) -> talks_by_fraktion
group_by(fraction) %>%
summarize(full_text=str_c(content, collapse="\n")) -> talks_by_fraction
``` ```
For each party, we want to get a tibble of words with frequency. For each party, we want to get a tibble of words with frequency.
```{r} ```{r}
#AfD #AfD
Worte <- str_extract_all(talks_by_fraktion$full_text[[1]], "\\b[a-zA-ZäöüÄÖÜß]+\\b")[[1]]
Worte <- str_extract_all(talks_by_fraction$full_text[[1]], "\\b[a-zA-ZäöüÄÖÜß]+\\b")[[1]]
afdtotal = length(Worte) afdtotal = length(Worte)
tibble(Worte) %>% group_by(Worte) %>% count() %>% mutate(freq =n/afdtotal) -> afd_words tibble(Worte) %>% group_by(Worte) %>% count() %>% mutate(freq =n/afdtotal) -> afd_words


#AfD&Fraktionslos #AfD&Fraktionslos
Worte <- str_extract_all(talks_by_fraktion$full_text[[2]], "\\b[a-zA-ZäöüÄÖÜß]+\\b")[[1]]
Worte <- str_extract_all(talks_by_fraction$full_text[[2]], "\\b[a-zA-ZäöüÄÖÜß]+\\b")[[1]]
afdundfraktionslostotal = length(Worte) afdundfraktionslostotal = length(Worte)
tibble(Worte) %>% group_by(Worte) %>% count() %>% mutate(freq =n/afdundfraktionslostotal) -> afdundfraktionslos_words tibble(Worte) %>% group_by(Worte) %>% count() %>% mutate(freq =n/afdundfraktionslostotal) -> afdundfraktionslos_words


#BÜNDNIS 90 / DIE GRÜNEN #BÜNDNIS 90 / DIE GRÜNEN
Worte <- str_extract_all(talks_by_fraktion$full_text[[3]], "\\b[a-zA-ZäöüÄÖÜß]+\\b")[[1]]
Worte <- str_extract_all(talks_by_fraction$full_text[[3]], "\\b[a-zA-ZäöüÄÖÜß]+\\b")[[1]]
grünetotal = length(Worte) grünetotal = length(Worte)
tibble(Worte) %>% group_by(Worte) %>% count() %>% mutate(freq =n/grünetotal) -> grüne_words tibble(Worte) %>% group_by(Worte) %>% count() %>% mutate(freq =n/grünetotal) -> grüne_words


#CDU/CSU #CDU/CSU
Worte <- str_extract_all(talks_by_fraktion$full_text[[4]], "\\b[a-zA-ZäöüÄÖÜß]+\\b")[[1]]
Worte <- str_extract_all(talks_by_fraction$full_text[[4]], "\\b[a-zA-ZäöüÄÖÜß]+\\b")[[1]]
cdutotal = length(Worte) cdutotal = length(Worte)
tibble(Worte) %>% group_by(Worte) %>% count() %>% mutate(freq =n/cdutotal) -> cdu_words tibble(Worte) %>% group_by(Worte) %>% count() %>% mutate(freq =n/cdutotal) -> cdu_words


#DIE LINKE #DIE LINKE
Worte <- str_extract_all(talks_by_fraktion$full_text[[5]], "\\b[a-zA-ZäöüÄÖÜß]+\\b")[[1]]
Worte <- str_extract_all(talks_by_fraction$full_text[[5]], "\\b[a-zA-ZäöüÄÖÜß]+\\b")[[1]]
linketotal = length(Worte) linketotal = length(Worte)
tibble(Worte) %>% group_by(Worte) %>% count() %>% mutate(freq =n/linketotal) -> linke_words tibble(Worte) %>% group_by(Worte) %>% count() %>% mutate(freq =n/linketotal) -> linke_words


#FDP #FDP
Worte <- str_extract_all(talks_by_fraktion$full_text[[6]], "\\b[a-zA-ZäöüÄÖÜß]+\\b")[[1]]
Worte <- str_extract_all(talks_by_fraction$full_text[[6]], "\\b[a-zA-ZäöüÄÖÜß]+\\b")[[1]]
fdptotal = length(Worte) fdptotal = length(Worte)
tibble(Worte) %>% group_by(Worte) %>% count() %>% mutate(freq =n/fdptotal) -> fdp_words tibble(Worte) %>% group_by(Worte) %>% count() %>% mutate(freq =n/fdptotal) -> fdp_words


#Fraktionslos #Fraktionslos
Worte <- str_extract_all(talks_by_fraktion$full_text[[7]], "\\b[a-zA-ZäöüÄÖÜß]+\\b")[[1]]
Worte <- str_extract_all(talks_by_fraction$full_text[[7]], "\\b[a-zA-ZäöüÄÖÜß]+\\b")[[1]]
fraktionslostotal = length(Worte) fraktionslostotal = length(Worte)
tibble(Worte) %>% group_by(Worte) %>% count() %>% mutate(freq =n/fraktionslostotal) -> fraktionslos_words tibble(Worte) %>% group_by(Worte) %>% count() %>% mutate(freq =n/fraktionslostotal) -> fraktionslos_words


#SPD #SPD
Worte <- str_extract_all(talks_by_fraktion$full_text[[8]], "\\b[a-zA-ZäöüÄÖÜß]+\\b")[[1]]
Worte <- str_extract_all(talks_by_fraction$full_text[[8]], "\\b[a-zA-ZäöüÄÖÜß]+\\b")[[1]]
spdtotal = length(Worte) spdtotal = length(Worte)
tibble(Worte) %>% group_by(Worte) %>% count() %>% mutate(freq =n/spdtotal) -> spd_words tibble(Worte) %>% group_by(Worte) %>% count() %>% mutate(freq =n/spdtotal) -> spd_words


#NA #NA
Worte <- str_extract_all(talks_by_fraktion$full_text[[9]], "\\b[a-zA-ZäöüÄÖÜß]+\\b")[[1]]
Worte <- str_extract_all(talks_by_fraction$full_text[[9]], "\\b[a-zA-ZäöüÄÖÜß]+\\b")[[1]]
natotal = length(Worte) natotal = length(Worte)
tibble(Worte) %>% group_by(Worte) %>% count() %>% mutate(freq =n/natotal) -> na_words tibble(Worte) %>% group_by(Worte) %>% count() %>% mutate(freq =n/natotal) -> na_words


@@ -117,34 +117,34 @@ total <- sum(all_words$n)
all_words %>% group_by(Worte) %>% summarize(n = sum(n), part= sum(n)/total) -> all_words all_words %>% group_by(Worte) %>% summarize(n = sum(n), part= sum(n)/total) -> all_words
``` ```


Now we want to extract the words that are more frequently used by a specific `fraktion`.
Now we want to extract the words that are more frequently used by a specific fraction.
```{r} ```{r}
afd_words %>% transmute(freq, fraktion_n = n) %>% left_join(all_words) %>% transmute(fraktion_freq = freq, total_freq = part, fraktion_n, total_n = n, rel_quotient = fraktion_freq/total_freq, abs_quotient = fraktion_n/total_n) %>% arrange(-abs_quotient, -fraktion_n) %>% filter(rel_quotient > 1) -> afd_high_frequent
select(afd_high_frequent, fraktion_n, total_n, abs_quotient, rel_quotient) %>% filter(total_n > 80)
afd_words %>% transmute(freq, fraction_n = n) %>% left_join(all_words) %>% transmute(fraction_freq = freq, total_freq = part, fraction_n, total_n = n, rel_quotient = fraction_freq/total_freq, abs_quotient = fraction_n/total_n) %>% arrange(-abs_quotient, -fraction_n) %>% filter(rel_quotient > 1) -> afd_high_frequent
select(afd_high_frequent, fraction_n, total_n, abs_quotient, rel_quotient) %>% filter(total_n > 80)


afdundfraktionslos_words %>% transmute(freq, fraktion_n = n) %>% left_join(all_words) %>% transmute(fraktion_freq = freq, total_freq = part, fraktion_n, total_n = n, rel_quotient = fraktion_freq/total_freq, abs_quotient = fraktion_n/total_n) %>% arrange(-abs_quotient, -fraktion_n) %>% filter(rel_quotient > 1) -> afdundfraktionslos_high_frequent
select(afdundfraktionslos_high_frequent, fraktion_n, total_n, abs_quotient, rel_quotient) %>% filter(total_n > 80)
afdundfraktionslos_words %>% transmute(freq, fraction_n = n) %>% left_join(all_words) %>% transmute(fraction_freq = freq, total_freq = part, fraction_n, total_n = n, rel_quotient = fraction_freq/total_freq, abs_quotient = fraction_n/total_n) %>% arrange(-abs_quotient, -fraction_n) %>% filter(rel_quotient > 1) -> afdundfraktionslos_high_frequent
select(afdundfraktionslos_high_frequent, fraction_n, total_n, abs_quotient, rel_quotient) %>% filter(total_n > 80)


grüne_words %>% transmute(freq, fraktion_n = n) %>% left_join(all_words) %>% transmute(fraktion_freq = freq, total_freq = part, fraktion_n, total_n = n, rel_quotient = fraktion_freq/total_freq, abs_quotient = fraktion_n/total_n) %>% arrange(-abs_quotient, -fraktion_n) %>% filter(rel_quotient > 1) -> grüne_high_frequent
select(grüne_high_frequent, fraktion_n, total_n, abs_quotient, rel_quotient) %>% filter(total_n > 80)
grüne_words %>% transmute(freq, fraction_n = n) %>% left_join(all_words) %>% transmute(fraction_freq = freq, total_freq = part, fraction_n, total_n = n, rel_quotient = fraction_freq/total_freq, abs_quotient = fraction_n/total_n) %>% arrange(-abs_quotient, -fraction_n) %>% filter(rel_quotient > 1) -> grüne_high_frequent
select(grüne_high_frequent, fraction_n, total_n, abs_quotient, rel_quotient) %>% filter(total_n > 80)


cdu_words %>% transmute(freq, fraktion_n = n) %>% left_join(all_words) %>% transmute(fraktion_freq = freq, total_freq = part, fraktion_n, total_n = n, rel_quotient = fraktion_freq/total_freq, abs_quotient = fraktion_n/total_n) %>% arrange(-abs_quotient, -fraktion_n) %>% filter(rel_quotient > 1) -> cdu_high_frequent
select(cdu_high_frequent, fraktion_n, total_n, abs_quotient, rel_quotient) %>% filter(total_n > 80)
cdu_words %>% transmute(freq, fraction_n = n) %>% left_join(all_words) %>% transmute(fraction_freq = freq, total_freq = part, fraction_n, total_n = n, rel_quotient = fraction_freq/total_freq, abs_quotient = fraction_n/total_n) %>% arrange(-abs_quotient, -fraction_n) %>% filter(rel_quotient > 1) -> cdu_high_frequent
select(cdu_high_frequent, fraction_n, total_n, abs_quotient, rel_quotient) %>% filter(total_n > 80)


linke_words %>% transmute(freq, fraktion_n = n) %>% left_join(all_words) %>% transmute(fraktion_freq = freq, total_freq = part, fraktion_n, total_n = n, rel_quotient = fraktion_freq/total_freq, abs_quotient = fraktion_n/total_n) %>% arrange(-abs_quotient, -fraktion_n) %>% filter(rel_quotient > 1) -> linke_high_frequent
select(linke_high_frequent, fraktion_n, total_n, abs_quotient, rel_quotient) %>% filter(total_n > 80)
linke_words %>% transmute(freq, fraction_n = n) %>% left_join(all_words) %>% transmute(fraction_freq = freq, total_freq = part, fraction_n, total_n = n, rel_quotient = fraction_freq/total_freq, abs_quotient = fraction_n/total_n) %>% arrange(-abs_quotient, -fraction_n) %>% filter(rel_quotient > 1) -> linke_high_frequent
select(linke_high_frequent, fraction_n, total_n, abs_quotient, rel_quotient) %>% filter(total_n > 80)


fdp_words %>% transmute(freq, fraktion_n = n) %>% left_join(all_words) %>% transmute(fraktion_freq = freq, total_freq = part, fraktion_n, total_n = n, rel_quotient = fraktion_freq/total_freq, abs_quotient = fraktion_n/total_n) %>% arrange(-abs_quotient, -fraktion_n) %>% filter(rel_quotient > 1) -> fdp_high_frequent
select(fdp_high_frequent, fraktion_n, total_n, abs_quotient, rel_quotient) %>% filter(total_n > 80)
fdp_words %>% transmute(freq, fraction_n = n) %>% left_join(all_words) %>% transmute(fraction_freq = freq, total_freq = part, fraction_n, total_n = n, rel_quotient = fraction_freq/total_freq, abs_quotient = fraction_n/total_n) %>% arrange(-abs_quotient, -fraction_n) %>% filter(rel_quotient > 1) -> fdp_high_frequent
select(fdp_high_frequent, fraction_n, total_n, abs_quotient, rel_quotient) %>% filter(total_n > 80)


fraktionslos_words %>% transmute(freq, fraktion_n = n) %>% left_join(all_words) %>% transmute(fraktion_freq = freq, total_freq = part, fraktion_n, total_n = n, rel_quotient = fraktion_freq/total_freq, abs_quotient = fraktion_n/total_n) %>% arrange(-abs_quotient, -fraktion_n) %>% filter(rel_quotient > 1) -> fraktionslos_high_frequent
select(fraktionslos_high_frequent, fraktion_n, total_n, abs_quotient, rel_quotient) %>% filter(total_n > 80)
fraktionslos_words %>% transmute(freq, fraction_n = n) %>% left_join(all_words) %>% transmute(fraction_freq = freq, total_freq = part, fraction_n, total_n = n, rel_quotient = fraction_freq/total_freq, abs_quotient = fraction_n/total_n) %>% arrange(-abs_quotient, -fraction_n) %>% filter(rel_quotient > 1) -> fraktionslos_high_frequent
select(fraktionslos_high_frequent, fraction_n, total_n, abs_quotient, rel_quotient) %>% filter(total_n > 80)


spd_words %>% transmute(freq, fraktion_n = n) %>% left_join(all_words) %>% transmute(fraktion_freq = freq, total_freq = part, fraktion_n, total_n = n, rel_quotient = fraktion_freq/total_freq, abs_quotient = fraktion_n/total_n) %>% arrange(-abs_quotient, -fraktion_n) %>% filter(rel_quotient > 1) -> spd_high_frequent
select(spd_high_frequent, fraktion_n, total_n, abs_quotient, rel_quotient) %>% filter(total_n > 80)
spd_words %>% transmute(freq, fraction_n = n) %>% left_join(all_words) %>% transmute(fraction_freq = freq, total_freq = part, fraction_n, total_n = n, rel_quotient = fraction_freq/total_freq, abs_quotient = fraction_n/total_n) %>% arrange(-abs_quotient, -fraction_n) %>% filter(rel_quotient > 1) -> spd_high_frequent
select(spd_high_frequent, fraction_n, total_n, abs_quotient, rel_quotient) %>% filter(total_n > 80)


na_words %>% transmute(freq, fraktion_n = n) %>% left_join(all_words) %>% transmute(fraktion_freq = freq, total_freq = part, fraktion_n, total_n = n, rel_quotient = fraktion_freq/total_freq, abs_quotient = fraktion_n/total_n) %>% arrange(-abs_quotient, -fraktion_n) %>% filter(rel_quotient > 1) -> na_high_frequent
select(na_high_frequent, fraktion_n, total_n, abs_quotient, rel_quotient) %>% filter(total_n > 80)
na_words %>% transmute(freq, fraction_n = n) %>% left_join(all_words) %>% transmute(fraction_freq = freq, total_freq = part, fraction_n, total_n = n, rel_quotient = fraction_freq/total_freq, abs_quotient = fraction_n/total_n) %>% arrange(-abs_quotient, -fraction_n) %>% filter(rel_quotient > 1) -> na_high_frequent
select(na_high_frequent, fraction_n, total_n, abs_quotient, rel_quotient) %>% filter(total_n > 80)
``` ```


We compare these words with `hitlerwords`. We compare these words with `hitlerwords`.
@@ -161,7 +161,7 @@ spd_high_frequent %>% mutate(Worte = str_to_lower(Worte)) %>% inner_join(hitlerw
na_high_frequent %>% mutate(Worte = str_to_lower(Worte)) %>% inner_join(hitlerwords) -> na_hitler_comparison na_high_frequent %>% mutate(Worte = str_to_lower(Worte)) %>% inner_join(hitlerwords) -> na_hitler_comparison


#not unique #not unique
tibble(fraktion = c("AfD", "AfD&Fraktionslos", "BÜNDNIS 90 / DIE GRÜNEN", "CDU/CSU", "DIE LINKE", "FDP", "Fraktionslos", "SPD"),
tibble(fraction = c("AfD", "AfD&Fraktionslos", "BÜNDNIS 90 / DIE GRÜNEN", "CDU/CSU", "DIE LINKE", "FDP", "Fraktionslos", "SPD"),
absolute = c(nrow(afd_hitler_comparison), nrow(afdundfraktionslos_hitler_comparison), nrow(grüne_hitler_comparison), nrow(cdu_hitler_comparison), nrow(linke_hitler_comparison), nrow(fdp_hitler_comparison), nrow(fraktionslos_hitler_comparison), nrow(spd_hitler_comparison)), absolute = c(nrow(afd_hitler_comparison), nrow(afdundfraktionslos_hitler_comparison), nrow(grüne_hitler_comparison), nrow(cdu_hitler_comparison), nrow(linke_hitler_comparison), nrow(fdp_hitler_comparison), nrow(fraktionslos_hitler_comparison), nrow(spd_hitler_comparison)),
total = c(nrow(afd_words), nrow(afdundfraktionslos_words), nrow(grüne_words), nrow(cdu_words), nrow(linke_words), nrow(fdp_words), nrow(fraktionslos_words), nrow(spd_words)) total = c(nrow(afd_words), nrow(afdundfraktionslos_words), nrow(grüne_words), nrow(cdu_words), nrow(linke_words), nrow(fdp_words), nrow(fraktionslos_words), nrow(spd_words))
) %>% mutate(percent = 100*absolute/total) -> hitler_comparison ) %>% mutate(percent = 100*absolute/total) -> hitler_comparison


Загрузка…
Отмена
Сохранить