| @@ -1,9 +1,10 @@ | |||||
| # Generated by roxygen2: do not edit by hand | # Generated by roxygen2: do not edit by hand | ||||
| export(bar_plot_fraktionen) | |||||
| export(bar_plot_fractions) | |||||
| export(fetch_all) | export(fetch_all) | ||||
| export(find_word) | export(find_word) | ||||
| export(join_redner) | |||||
| export(join_speaker) | |||||
| export(party_colors) | |||||
| export(read_all) | export(read_all) | ||||
| export(read_from_csv) | export(read_from_csv) | ||||
| export(repair) | export(repair) | ||||
| @@ -6,8 +6,8 @@ find_word <- function(res, word) { | |||||
| } | } | ||||
| #' @export | #' @export | ||||
| join_redner <- function(tb, res, fraktion_only = F) { | |||||
| joined <- left_join(tb, res$redner, by=c("redner" = "id")) | |||||
| join_speaker <- function(tb, res, fraktion_only = F) { | |||||
| joined <- left_join(tb, res$speaker, by=c("speaker" = "id")) | |||||
| if (fraktion_only) select(joined, "fraktion") | if (fraktion_only) select(joined, "fraktion") | ||||
| else joined | else joined | ||||
| } | } | ||||
| @@ -29,7 +29,7 @@ party_order <- factor(c("Fraktionslos", "AfD&Fraktionslos", | |||||
| "FDP", "AfD", NA_character_)) | "FDP", "AfD", NA_character_)) | ||||
| #' @export | #' @export | ||||
| bar_plot_fraktionen <- function(tb, | |||||
| bar_plot_fractions <- function(tb, | |||||
| x_variable = NULL, # default is fraktion | x_variable = NULL, # default is fraktion | ||||
| y_variable = NULL, # default is n | y_variable = NULL, # default is n | ||||
| fill = NULL, # default is fraktion | fill = NULL, # default is fraktion | ||||
| @@ -82,7 +82,7 @@ word_usage_by_date <- function(res, patterns, name, tidy=F) { | |||||
| else name <- patterns[[i]] | else name <- patterns[[i]] | ||||
| tb <- mutate(tb, {{name}} := str_count(content, patterns[[i]])) | tb <- mutate(tb, {{name}} := str_count(content, patterns[[i]])) | ||||
| } | } | ||||
| left_join(tb, res$reden, by=c("rede_id" = "id")) %>% | |||||
| left_join(tb, res$speeches, by=c("speech_id" = "id")) %>% | |||||
| group_by(date) %>% | group_by(date) %>% | ||||
| summarize(across(where(is.numeric), sum)) %>% | summarize(across(where(is.numeric), sum)) %>% | ||||
| arrange(date) -> tb | arrange(date) -> tb | ||||
| @@ -13,16 +13,16 @@ read_all <- function(path="records/") { | |||||
| available_protocols <- list.files(path) | available_protocols <- list.files(path) | ||||
| res <- pblapply(available_protocols, read_one, path=path) | res <- pblapply(available_protocols, read_one, path=path) | ||||
| lapply(res, `[[`, "redner") %>% | |||||
| lapply(res, `[[`, "speaker") %>% | |||||
| bind_rows() %>% | bind_rows() %>% | ||||
| distinct() -> | distinct() -> | ||||
| redner | |||||
| speaker | |||||
| lapply(res, `[[`, "reden") %>% | |||||
| lapply(res, `[[`, "speeches") %>% | |||||
| bind_rows() %>% | bind_rows() %>% | ||||
| distinct() %>% | distinct() %>% | ||||
| mutate(date = as.Date(date, format="%d.%m.%Y")) -> | mutate(date = as.Date(date, format="%d.%m.%Y")) -> | ||||
| reden | |||||
| speeches | |||||
| lapply(res, `[[`, "talks") %>% | lapply(res, `[[`, "talks") %>% | ||||
| bind_rows() %>% | bind_rows() %>% | ||||
| @@ -51,7 +51,7 @@ read_all <- function(path="records/") { | |||||
| select(-fraktion) -> | select(-fraktion) -> | ||||
| applause | applause | ||||
| list(redner = redner, reden = reden, talks = talks, comments = comments, applause = applause) | |||||
| list(speaker = speaker, speeches = speeches, talks = talks, comments = comments, applause = applause) | |||||
| } | } | ||||
| # this reads all currently parseable data from one xml | # this reads all currently parseable data from one xml | ||||
| @@ -64,18 +64,18 @@ read_one <- function(name, path) { | |||||
| cs <- xml_children(x) | cs <- xml_children(x) | ||||
| verlauf <- xml_find_first(x, "sitzungsverlauf") | verlauf <- xml_find_first(x, "sitzungsverlauf") | ||||
| rednerl <- xml_find_first(x, "rednerliste") | |||||
| speakerl <- xml_find_first(x, "rednerliste") | |||||
| xml_children(rednerl) %>% | |||||
| parse_rednerliste() -> | |||||
| redner | |||||
| xml_children(speakerl) %>% | |||||
| parse_speakerlist() -> | |||||
| speaker | |||||
| xml_children(verlauf) %>% | xml_children(verlauf) %>% | ||||
| xml_find_all("rede") %>% | xml_find_all("rede") %>% | ||||
| parse_redenliste(date) -> | |||||
| parse_speechlist(date) -> | |||||
| res | res | ||||
| list(redner = redner, reden = res$reden, talks = res$talks, comments = res$comments) | |||||
| list(speaker = speaker, speeches = res$speeches, talks = res$talks, comments = res$comments) | |||||
| } | } | ||||
| xml_get <- function(node, name) { | xml_get <- function(node, name) { | ||||
| @@ -84,10 +84,10 @@ xml_get <- function(node, name) { | |||||
| else res | else res | ||||
| } | } | ||||
| # parse one redner | |||||
| parse_redner <- function(redner_xml) { | |||||
| redner_id <- xml_attr(redner_xml, "id") | |||||
| nm <- xml_child(redner_xml) | |||||
| # parse one speaker | |||||
| parse_speaker <- function(speaker_xml) { | |||||
| speaker_id <- xml_attr(speaker_xml, "id") | |||||
| nm <- xml_child(speaker_xml) | |||||
| vorname <- xml_get(nm, "vorname") | vorname <- xml_get(nm, "vorname") | ||||
| nachname <- xml_get(nm, "nachname") | nachname <- xml_get(nm, "nachname") | ||||
| fraktion <- xml_get(nm, "fraktion") | fraktion <- xml_get(nm, "fraktion") | ||||
| @@ -97,39 +97,39 @@ parse_redner <- function(redner_xml) { | |||||
| rolle_lang <- xml_get(rolle, "rolle_lang") | rolle_lang <- xml_get(rolle, "rolle_lang") | ||||
| rolle_kurz <- xml_get(rolle, "rolle_kurz") | rolle_kurz <- xml_get(rolle, "rolle_kurz") | ||||
| } else rolle_kurz <- rolle_lang <- NA_character_ | } else rolle_kurz <- rolle_lang <- NA_character_ | ||||
| c(id = redner_id, vorname = vorname, nachname = nachname, fraktion = fraktion, titel = titel, | |||||
| c(id = speaker_id, vorname = vorname, nachname = nachname, fraktion = fraktion, titel = titel, | |||||
| rolle_kurz = rolle_kurz, rolle_lang = rolle_lang) | rolle_kurz = rolle_kurz, rolle_lang = rolle_lang) | ||||
| } | } | ||||
| # parse one rede | |||||
| # returns: - a rede (with rede id and redner id) | |||||
| # - all talks appearing in the rede (with corresponding content) | |||||
| parse_rede <- function(rede_xml, date) { | |||||
| rede_id <- xml_attr(rede_xml, "id") | |||||
| cs <- xml_children(rede_xml) | |||||
| cur_redner <- NA_character_ | |||||
| principal_redner <- NA_character_ | |||||
| # parse one speech | |||||
| # returns: - a speech (with speech id and speaker id) | |||||
| # - all talks appearing in the speech (with corresponding content) | |||||
| parse_speech <- function(speech_xml, date) { | |||||
| speech_id <- xml_attr(speech_xml, "id") | |||||
| cs <- xml_children(speech_xml) | |||||
| cur_speaker <- NA_character_ | |||||
| principal_speaker <- NA_character_ | |||||
| cur_content <- "" | cur_content <- "" | ||||
| reden <- list() | |||||
| speeches <- list() | |||||
| comments <- list() | comments <- list() | ||||
| for (node in cs) { | for (node in cs) { | ||||
| if (xml_name(node) == "p" || xml_name(node) == "name") { | if (xml_name(node) == "p" || xml_name(node) == "name") { | ||||
| klasse <- xml_attr(node, "klasse") | klasse <- xml_attr(node, "klasse") | ||||
| if ((!is.na(klasse) && klasse == "redner") || xml_name(node) == "name") { | |||||
| if (!is.na(cur_redner)) { | |||||
| rede <- c(rede_id = rede_id, | |||||
| redner = cur_redner, | |||||
| if ((!is.na(klasse) && klasse == "speaker") || xml_name(node) == "name") { | |||||
| if (!is.na(cur_speaker)) { | |||||
| speech <- c(speech_id = speech_id, | |||||
| speaker = cur_speaker, | |||||
| content = cur_content) | content = cur_content) | ||||
| reden <- c(reden, list(rede)) | |||||
| speeches <- c(speeches, list(speech)) | |||||
| cur_content <- "" | cur_content <- "" | ||||
| } | } | ||||
| if (is.na(principal_redner) && xml_name(node) != "name") { | |||||
| principal_redner <- xml_child(node) %>% xml_attr("id") | |||||
| if (is.na(principal_speaker) && xml_name(node) != "name") { | |||||
| principal_speaker <- xml_child(node) %>% xml_attr("id") | |||||
| } | } | ||||
| if (xml_name(node) == "name") { | if (xml_name(node) == "name") { | ||||
| cur_redner <- "BTP" | |||||
| cur_speaker <- "BTP" | |||||
| } else { | } else { | ||||
| cur_redner <- xml_child(node) %>% xml_attr("id") | |||||
| cur_speaker <- xml_child(node) %>% xml_attr("id") | |||||
| } | } | ||||
| } else { | } else { | ||||
| cur_content <- paste0(cur_content, xml_text(node), sep="\n") | cur_content <- paste0(cur_content, xml_text(node), sep="\n") | ||||
| @@ -141,25 +141,25 @@ parse_rede <- function(rede_xml, date) { | |||||
| str_sub(2, -2) %>% | str_sub(2, -2) %>% | ||||
| str_split("–") %>% | str_split("–") %>% | ||||
| `[[`(1) %>% | `[[`(1) %>% | ||||
| lapply(parse_comment, rede_id = rede_id, on_redner = cur_redner) -> | |||||
| lapply(parse_comment, speech_id = speech_id, on_speaker = cur_speaker) -> | |||||
| cs | cs | ||||
| comments <- c(comments, cs) | comments <- c(comments, cs) | ||||
| } | } | ||||
| } | } | ||||
| rede <- c(rede_id = rede_id, | |||||
| redner = cur_redner, | |||||
| speech <- c(speech_id = speech_id, | |||||
| speaker = cur_speaker, | |||||
| content = cur_content) | content = cur_content) | ||||
| reden <- c(reden, list(rede)) | |||||
| list(rede = c(id = rede_id, redner = principal_redner, date = date), | |||||
| parts = reden, | |||||
| speeches <- c(speeches, list(speech)) | |||||
| list(speech = c(id = speech_id, speaker = principal_speaker, date = date), | |||||
| parts = speeches, | |||||
| comments = comments) | comments = comments) | ||||
| } | } | ||||
| fraktionspattern <- "BÜNDNIS(SES)?\\W*90/DIE\\W*GRÜNEN|CDU/CSU|AfD|SPD|DIE LINKE|FDP|LINKEN" | fraktionspattern <- "BÜNDNIS(SES)?\\W*90/DIE\\W*GRÜNEN|CDU/CSU|AfD|SPD|DIE LINKE|FDP|LINKEN" | ||||
| fraktionsnames <- c("BÜNDNIS 90/DIE GRÜNEN", "CDU/CSU", "AfD", "SPD", "DIE LINKE", "FDP") | fraktionsnames <- c("BÜNDNIS 90/DIE GRÜNEN", "CDU/CSU", "AfD", "SPD", "DIE LINKE", "FDP") | ||||
| parse_comment <- function(comment, rede_id, on_redner) { | |||||
| base <- c(rede_id = rede_id, on_redner = on_redner) | |||||
| parse_comment <- function(comment, speech_id, on_speaker) { | |||||
| base <- c(speech_id = speech_id, on_speaker = on_speaker) | |||||
| # classify comment | # classify comment | ||||
| if(str_detect(comment, "Beifall")) { | if(str_detect(comment, "Beifall")) { | ||||
| str_extract_all(comment, fraktionspattern) %>% | str_extract_all(comment, fraktionspattern) %>% | ||||
| @@ -174,28 +174,28 @@ parse_comment <- function(comment, rede_id, on_redner) { | |||||
| } | } | ||||
| } | } | ||||
| # creates a tibble of reden and a tibble of talks from a list of xml nodes representing reden | |||||
| parse_redenliste <- function(redenliste_xml, date) { | |||||
| d <- sapply(redenliste_xml, parse_rede, date = date) | |||||
| reden <- simplify2array(d["rede", ]) | |||||
| # creates a tibble of speeches and a tibble of talks from a list of xml nodes representing speeches | |||||
| parse_speechlist <- function(speechlist_xml, date) { | |||||
| d <- sapply(speechlist_xml, parse_speech, date = date) | |||||
| speeches <- simplify2array(d["speech", ]) | |||||
| parts <- simplify2array %$% unlist(d["parts", ], recursive=FALSE) | parts <- simplify2array %$% unlist(d["parts", ], recursive=FALSE) | ||||
| comments <- simplify2array %$% unlist(d["comments", ], recursive=FALSE) | comments <- simplify2array %$% unlist(d["comments", ], recursive=FALSE) | ||||
| list(reden = tibble(id = reden["id",], redner = reden["redner",], | |||||
| date = reden["date",]), | |||||
| talks = tibble(rede_id = parts["rede_id", ], | |||||
| redner = parts["redner", ], | |||||
| list(speeches = tibble(id = speeches["id",], speaker = speeches["speaker",], | |||||
| date = speeches["date",]), | |||||
| talks = tibble(speech_id = parts["speech_id", ], | |||||
| speaker = parts["speaker", ], | |||||
| content = parts["content", ]), | content = parts["content", ]), | ||||
| comments = tibble(rede_id = comments["rede_id",], | |||||
| on_redner = comments["on_redner",], | |||||
| comments = tibble(speech_id = comments["speech_id",], | |||||
| on_speaker = comments["on_speaker",], | |||||
| type = comments["type",], | type = comments["type",], | ||||
| fraktion = comments["fraktion",], | fraktion = comments["fraktion",], | ||||
| kommentator = comments["kommentator",], | kommentator = comments["kommentator",], | ||||
| content = comments["content", ])) | content = comments["content", ])) | ||||
| } | } | ||||
| # create a tibble of redner from a list of xml nodes representing redner | |||||
| parse_rednerliste <- function(rednerliste_xml) { | |||||
| d <- sapply(rednerliste_xml, parse_redner) | |||||
| # create a tibble of speaker from a list of xml nodes representing speaker | |||||
| parse_speakerliste <- function(speakerliste_xml) { | |||||
| d <- sapply(speakerliste_xml, parse_speaker) | |||||
| tibble(id = d["id",], | tibble(id = d["id",], | ||||
| vorname = d["vorname",], | vorname = d["vorname",], | ||||
| nachname = d["nachname",], | nachname = d["nachname",], | ||||
| @@ -208,8 +208,8 @@ parse_rednerliste <- function(rednerliste_xml) { | |||||
| #' @export | #' @export | ||||
| write_to_csv <- function(tables, path="csv/", create=F) { | write_to_csv <- function(tables, path="csv/", create=F) { | ||||
| check_directory(path, create) | check_directory(path, create) | ||||
| write.table(tables$redner, str_c(path, "redner.csv")) | |||||
| write.table(tables$reden, str_c(path, "reden.csv")) | |||||
| write.table(tables$speaker, str_c(path, "speaker.csv")) | |||||
| write.table(tables$speeches, str_c(path, "speeches.csv")) | |||||
| write.table(tables$talks, str_c(path, "talks.csv")) | write.table(tables$talks, str_c(path, "talks.csv")) | ||||
| write.table(tables$comments, str_c(path, "comments.csv")) | write.table(tables$comments, str_c(path, "comments.csv")) | ||||
| write.table(tables$applause, str_c(path, "applause.csv")) | write.table(tables$applause, str_c(path, "applause.csv")) | ||||
| @@ -217,12 +217,12 @@ write_to_csv <- function(tables, path="csv/", create=F) { | |||||
| #' @export | #' @export | ||||
| read_from_csv <- function(path="csv/") { | read_from_csv <- function(path="csv/") { | ||||
| list(redner = read.table(str_c(path, "redner.csv")) %>% | |||||
| list(speaker = read.table(str_c(path, "speaker.csv")) %>% | |||||
| tibble() %>% | tibble() %>% | ||||
| mutate(id = as.character(id)), | mutate(id = as.character(id)), | ||||
| reden = read.table(str_c(path, "reden.csv")) %>% | |||||
| speeches = read.table(str_c(path, "speeches.csv")) %>% | |||||
| tibble() %>% | tibble() %>% | ||||
| mutate(redner = as.character(redner)), | |||||
| mutate(speaker = as.character(speaker)), | |||||
| talks = tibble %$% read.table(str_c(path, "talks.csv")), | talks = tibble %$% read.table(str_c(path, "talks.csv")), | ||||
| comments = tibble %$% read.table(str_c(path, "comments.csv")), | comments = tibble %$% read.table(str_c(path, "comments.csv")), | ||||
| applause = tibble %$% read.table(str_c(path, "applause.csv"))) | applause = tibble %$% read.table(str_c(path, "applause.csv"))) | ||||
| @@ -234,8 +234,8 @@ read_from_csv <- function(path="csv/") { | |||||
| # make sure data ist downloaded via fetch.R | # make sure data ist downloaded via fetch.R | ||||
| # res <- read_one("records/19126-data.xml") | # res <- read_one("records/19126-data.xml") | ||||
| # | # | ||||
| # res$redner | |||||
| # res$reden | |||||
| # res$speaker | |||||
| # res$speeches | |||||
| # res$talks | # res$talks | ||||
| # ------------------------------- | # ------------------------------- | ||||
| @@ -21,10 +21,10 @@ longest_titel <- function(titel) { | |||||
| # takes character vector, removes duplicates and collapses | # takes character vector, removes duplicates and collapses | ||||
| collect_unique <- function(xs) xs %>% clear_na() %>% unique() %>% str_c(collapse="&") %>% na_if("") | collect_unique <- function(xs) xs %>% clear_na() %>% unique() %>% str_c(collapse="&") %>% na_if("") | ||||
| # expects a tibble of redner and repairs | |||||
| repair_redner <- function(redner) { | |||||
| if (nrow(redner) == 0) return(redner) | |||||
| redner %>% | |||||
| # expects a tibble of speaker and repairs | |||||
| repair_speaker <- function(speaker) { | |||||
| if (nrow(speaker) == 0) return(speaker) | |||||
| speaker %>% | |||||
| filter(id != "10000") %>% # invalid id's | filter(id != "10000") %>% # invalid id's | ||||
| mutate(fraktion = Vectorize(repair_fraktion)(fraktion)) %>% # fix fraktion | mutate(fraktion = Vectorize(repair_fraktion)(fraktion)) %>% # fix fraktion | ||||
| group_by(id) %>% | group_by(id) %>% | ||||
| @@ -39,10 +39,10 @@ repair_redner <- function(redner) { | |||||
| # distinct(vorname, nachname, fraktion, titel) | # distinct(vorname, nachname, fraktion, titel) | ||||
| } | } | ||||
| repair_reden <- function(reden) { | |||||
| if (nrow(reden) == 0) return(reden) | |||||
| repair_speeches <- function(speeches) { | |||||
| if (nrow(speeches) == 0) return(speeches) | |||||
| # TODO: fill with content | # TODO: fill with content | ||||
| reden | |||||
| speeches | |||||
| } | } | ||||
| repair_talks <- function(talks) { | repair_talks <- function(talks) { | ||||
| @@ -51,13 +51,13 @@ repair_talks <- function(talks) { | |||||
| filter(talks, str_length(content) > 0) | filter(talks, str_length(content) > 0) | ||||
| } | } | ||||
| # tries to find the correct redner id given a name | |||||
| # tries to find the correct speaker id given a name | |||||
| # this is sufficient since every prename lastname combination in the bundestag is | # this is sufficient since every prename lastname combination in the bundestag is | ||||
| # unique (luckily :D) | # unique (luckily :D) | ||||
| # returns a lookup table | # returns a lookup table | ||||
| lookup_redner <- function(comments, redner) { | |||||
| lookup_speaker <- function(comments, speaker) { | |||||
| tobereplaced <- "[-–—‑- ]" | tobereplaced <- "[-–—‑- ]" | ||||
| redner %>% | |||||
| speaker %>% | |||||
| unite(name, vorname, nachname, sep=".*") %>% | unite(name, vorname, nachname, sep=".*") %>% | ||||
| mutate(name = str_replace_all(name, tobereplaced, ".*")) -> | mutate(name = str_replace_all(name, tobereplaced, ".*")) -> | ||||
| rs | rs | ||||
| @@ -70,14 +70,14 @@ lookup_redner <- function(comments, redner) { | |||||
| } | } | ||||
| comments %>% | comments %>% | ||||
| distinct(kommentator) %>% | distinct(kommentator) %>% | ||||
| mutate(redner = Vectorize(find_match)(str_replace_all(kommentator, tobereplaced, ""))) | |||||
| mutate(speaker = Vectorize(find_match)(str_replace_all(kommentator, tobereplaced, ""))) | |||||
| } | } | ||||
| repair_comments <- function(comments, redner) { | |||||
| # try to find a redner id for each actual comment | |||||
| repair_comments <- function(comments, speaker) { | |||||
| # try to find a speaker id for each actual comment | |||||
| comments %>% | comments %>% | ||||
| filter(!is.na(kommentator)) %>% | filter(!is.na(kommentator)) %>% | ||||
| lookup_redner(redner) %>% | |||||
| lookup_speaker(speaker) %>% | |||||
| left_join(comments, ., by="kommentator") %>% | left_join(comments, ., by="kommentator") %>% | ||||
| select(-kommentator) | select(-kommentator) | ||||
| } | } | ||||
| @@ -86,8 +86,8 @@ repair_comments <- function(comments, redner) { | |||||
| #' | #' | ||||
| #' @export | #' @export | ||||
| repair <- function(parse_output) { | repair <- function(parse_output) { | ||||
| list(redner = repair_redner(parse_output$redner), | |||||
| reden = repair_reden(parse_output$reden), | |||||
| list(speaker = repair_speaker(parse_output$speaker), | |||||
| speeches = repair_speeches(parse_output$speeches), | |||||
| talks = repair_talks(parse_output$talks), | talks = repair_talks(parse_output$talks), | ||||
| #comments = repair_comments(parse_output$comments) | #comments = repair_comments(parse_output$comments) | ||||
| comments = parse_output$comments, | comments = parse_output$comments, | ||||
| @@ -42,8 +42,8 @@ res <- read_from_csv('../csv/') | |||||
| and unpack our tibbles | and unpack our tibbles | ||||
| ```{r} | ```{r} | ||||
| comments <- res$comments | comments <- res$comments | ||||
| reden <- res$reden | |||||
| redner <- res$redner | |||||
| speeches <- res$speeches | |||||
| speaker <- res$speaker | |||||
| talks <- res$talks | talks <- res$talks | ||||
| ``` | ``` | ||||
| @@ -51,11 +51,11 @@ talks <- res$talks | |||||
| Now we can start analysing our parsed dataset, e.g. find out which party gives the most talks: | Now we can start analysing our parsed dataset, e.g. find out which party gives the most talks: | ||||
| ```{r, fig.width=7} | ```{r, fig.width=7} | ||||
| join_redner(res$reden, res) %>% | |||||
| join_speaker(res$speeches, res) %>% | |||||
| group_by(fraktion) %>% | group_by(fraktion) %>% | ||||
| summarize(n = n()) %>% | summarize(n = n()) %>% | ||||
| arrange(n) %>% | arrange(n) %>% | ||||
| bar_plot_fraktionen(title="Number of speeches given by fraction", | |||||
| bar_plot_fractions(title="Number of speeches given by fraction", | |||||
| ylab="Number of speeches") | ylab="Number of speeches") | ||||
| ``` | ``` | ||||
| @@ -64,13 +64,13 @@ or counting the occurences of a given word: | |||||
| ```{r, fig.width=7} | ```{r, fig.width=7} | ||||
| find_word(res, "Kohleausstieg") %>% | find_word(res, "Kohleausstieg") %>% | ||||
| filter(occurences > 0) %>% | filter(occurences > 0) %>% | ||||
| join_redner(res) %>% | |||||
| join_speaker(res) %>% | |||||
| select(content, fraktion) %>% | select(content, fraktion) %>% | ||||
| filter(!is.na(fraktion)) %>% | filter(!is.na(fraktion)) %>% | ||||
| group_by(fraktion) %>% | group_by(fraktion) %>% | ||||
| summarize(n = n()) %>% | summarize(n = n()) %>% | ||||
| arrange(desc(n)) %>% | arrange(desc(n)) %>% | ||||
| bar_plot_fraktionen(title = "Parties using the word 'Kohleausstieg' the most (absolutely)", | |||||
| bar_plot_fractions(title = "Parties using the word 'Kohleausstieg' the most (absolutely)", | |||||
| ylab = "Number of uses of 'Kohleausstieg'", | ylab = "Number of uses of 'Kohleausstieg'", | ||||
| flipped = F) | flipped = F) | ||||
| ``` | ``` | ||||
| @@ -78,11 +78,11 @@ find_word(res, "Kohleausstieg") %>% | |||||
| ### Who gives the most speeches? | ### Who gives the most speeches? | ||||
| ```{r} | ```{r} | ||||
| res$reden %>% | |||||
| group_by(redner) %>% | |||||
| res$speeches %>% | |||||
| group_by(speaker) %>% | |||||
| summarize(n = n()) %>% | summarize(n = n()) %>% | ||||
| arrange(-n) %>% | arrange(-n) %>% | ||||
| left_join(res$redner, by=c("redner" = "id")) %>% | |||||
| left_join(res$speaker, by=c("speaker" = "id")) %>% | |||||
| head(10) | head(10) | ||||
| ``` | ``` | ||||
| @@ -91,10 +91,10 @@ res$reden %>% | |||||
| ```{r} | ```{r} | ||||
| res$talks %>% | res$talks %>% | ||||
| mutate(content_len = str_length(content)) %>% | mutate(content_len = str_length(content)) %>% | ||||
| group_by(redner) %>% | |||||
| group_by(speaker) %>% | |||||
| summarize(avg_content_len = mean(content_len)) %>% | summarize(avg_content_len = mean(content_len)) %>% | ||||
| arrange(-avg_content_len) %>% | arrange(-avg_content_len) %>% | ||||
| left_join(res$redner, by=c("redner" = "id")) %>% | |||||
| left_join(res$speaker, by=c("speaker" = "id")) %>% | |||||
| head(10) | head(10) | ||||
| ``` | ``` | ||||
| @@ -102,7 +102,7 @@ res$talks %>% | |||||
| ```{r} | ```{r} | ||||
| res$applause %>% | res$applause %>% | ||||
| left_join(res$redner, by=c("on_redner" = "id")) %>% | |||||
| left_join(res$speaker, by=c("on_speaker" = "id")) %>% | |||||
| select(on_fraktion = fraktion, where(is.logical)) %>% | select(on_fraktion = fraktion, where(is.logical)) %>% | ||||
| group_by(on_fraktion) %>% | group_by(on_fraktion) %>% | ||||
| arrange(on_fraktion) %>% | arrange(on_fraktion) %>% | ||||
| @@ -119,7 +119,7 @@ For plotting our results we reorganize them a bit and produce a bar plot: | |||||
| ```{r, fig.width=7} | ```{r, fig.width=7} | ||||
| pivot_longer(tb, where(is.numeric), "by_fraktion", "count") %>% | pivot_longer(tb, where(is.numeric), "by_fraktion", "count") %>% | ||||
| filter(!is.na(on_fraktion)) %>% | filter(!is.na(on_fraktion)) %>% | ||||
| bar_plot_fraktionen(x_variable = on_fraktion, | |||||
| bar_plot_fractions(x_variable = on_fraktion, | |||||
| y_variable = value, | y_variable = value, | ||||
| fill = by_fraktion, | fill = by_fraktion, | ||||
| title = "Number of rounds of applauses from fractions to fractions", | title = "Number of rounds of applauses from fractions to fractions", | ||||
| @@ -134,7 +134,7 @@ pivot_longer(tb, where(is.numeric), "by_fraktion", "count") %>% | |||||
| ```{r} | ```{r} | ||||
| res$comments %>% | res$comments %>% | ||||
| left_join(res$redner, by=c("on_redner" = "id")) %>% | |||||
| left_join(res$speaker, by=c("on_speaker" = "id")) %>% | |||||
| select(by_fraktion = fraktion.x, on_fraktion = fraktion.y) %>% | select(by_fraktion = fraktion.x, on_fraktion = fraktion.y) %>% | ||||
| group_by(on_fraktion) %>% | group_by(on_fraktion) %>% | ||||
| summarize(`AfD` = sum(str_detect(by_fraktion, "AfD"), na.rm=T), | summarize(`AfD` = sum(str_detect(by_fraktion, "AfD"), na.rm=T), | ||||
| @@ -149,7 +149,7 @@ Analogously we plot the results: | |||||
| ```{r, fig.width=7} | ```{r, fig.width=7} | ||||
| pivot_longer(tb, where(is.numeric), "by_fraktion", "count") %>% | pivot_longer(tb, where(is.numeric), "by_fraktion", "count") %>% | ||||
| filter(!is.na(on_fraktion)) %>% | filter(!is.na(on_fraktion)) %>% | ||||
| bar_plot_fraktionen(x_variable = on_fraktion, | |||||
| bar_plot_fractions(x_variable = on_fraktion, | |||||
| y_variable = value, | y_variable = value, | ||||
| fill = by_fraktion, | fill = by_fraktion, | ||||
| title = "Number of comments from fractions to fractions", | title = "Number of comments from fractions to fractions", | ||||
| @@ -31,8 +31,8 @@ Second, those `.xml` files, need to be parsed into `R` `tibbles`. This is accomp | |||||
| ```r | ```r | ||||
| read_all("../records/") %>% repair() -> res | read_all("../records/") %>% repair() -> res | ||||
| reden <- res$reden | |||||
| redner <- res$redner | |||||
| speeches <- res$speeches | |||||
| speaker <- res$speaker | |||||
| talks <- res$talks | talks <- res$talks | ||||
| ``` | ``` | ||||
| We also used `repair` to fix a bunch of formatting issues in the records and unpacked | We also used `repair` to fix a bunch of formatting issues in the records and unpacked | ||||
| @@ -43,8 +43,8 @@ For development purposes, we load the tables from csv files. | |||||
| tables <- read_from_csv('../csv/') | tables <- read_from_csv('../csv/') | ||||
| comments <- tables$comments | comments <- tables$comments | ||||
| reden <- tables$reden | |||||
| redner <- tables$redner | |||||
| speeches <- tables$speeches | |||||
| speaker <- tables$speaker | |||||
| talks <- tables$talks | talks <- tables$talks | ||||
| ``` | ``` | ||||
| @@ -60,7 +60,7 @@ hitlerwords <- tibble(Worte) | |||||
| Now we extract the words that were used with higher frequency by one party and compare them with `hitlerwords`. | Now we extract the words that were used with higher frequency by one party and compare them with `hitlerwords`. | ||||
| ```{r} | ```{r} | ||||
| talks %>% | talks %>% | ||||
| left_join(redner, by=c(redner='id')) %>% | |||||
| left_join(speaker, by=c(speaker='id')) %>% | |||||
| group_by(fraktion) %>% | group_by(fraktion) %>% | ||||
| summarize(full_text=str_c(content, collapse="\n")) -> talks_by_fraktion | summarize(full_text=str_c(content, collapse="\n")) -> talks_by_fraktion | ||||
| ``` | ``` | ||||
| @@ -169,5 +169,5 @@ hitler_comparison | |||||
| ``` | ``` | ||||
| Finally, we want to plot our results: | Finally, we want to plot our results: | ||||
| ```{r, fig.width=7} | ```{r, fig.width=7} | ||||
| bar_plot_fraktionen(hitler_comparison, y_variable = percent, title="Coincidence of party vocabulary with nazi vocabulary", ylab="unique 'nazi' words per total (unique) fraction words [%]") | |||||
| bar_plot_fractions(hitler_comparison, y_variable = percent, title="Coincidence of party vocabulary with nazi vocabulary", ylab="unique 'nazi' words per total (unique) fraction words [%]") | |||||
| ``` | ``` | ||||