From 7315dd879393655a848fea03e615c06dcf741afc Mon Sep 17 00:00:00 2001 From: JosuaKugler Date: Tue, 3 Aug 2021 17:05:07 +0200 Subject: [PATCH] refactor rede -> speech, redner -> speaker --- NAMESPACE | 5 +- R/analyze.R | 8 +-- R/parse.R | 124 ++++++++++++++++----------------- R/repair.R | 32 ++++----- vignettes/funwithdata.Rmd | 30 ++++---- vignettes/hitlercomparison.Rmd | 12 ++-- 6 files changed, 106 insertions(+), 105 deletions(-) diff --git a/NAMESPACE b/NAMESPACE index d99fd6f..3d291b9 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -1,9 +1,10 @@ # Generated by roxygen2: do not edit by hand -export(bar_plot_fraktionen) +export(bar_plot_fractions) export(fetch_all) export(find_word) -export(join_redner) +export(join_speaker) +export(party_colors) export(read_all) export(read_from_csv) export(repair) diff --git a/R/analyze.R b/R/analyze.R index 088d0d8..dce1547 100644 --- a/R/analyze.R +++ b/R/analyze.R @@ -6,8 +6,8 @@ find_word <- function(res, word) { } #' @export -join_redner <- function(tb, res, fraktion_only = F) { - joined <- left_join(tb, res$redner, by=c("redner" = "id")) +join_speaker <- function(tb, res, fraktion_only = F) { + joined <- left_join(tb, res$speaker, by=c("speaker" = "id")) if (fraktion_only) select(joined, "fraktion") else joined } @@ -29,7 +29,7 @@ party_order <- factor(c("Fraktionslos", "AfD&Fraktionslos", "FDP", "AfD", NA_character_)) #' @export -bar_plot_fraktionen <- function(tb, +bar_plot_fractions <- function(tb, x_variable = NULL, # default is fraktion y_variable = NULL, # default is n fill = NULL, # default is fraktion @@ -82,7 +82,7 @@ word_usage_by_date <- function(res, patterns, name, tidy=F) { else name <- patterns[[i]] tb <- mutate(tb, {{name}} := str_count(content, patterns[[i]])) } - left_join(tb, res$reden, by=c("rede_id" = "id")) %>% + left_join(tb, res$speeches, by=c("speech_id" = "id")) %>% group_by(date) %>% summarize(across(where(is.numeric), sum)) %>% arrange(date) -> tb diff --git a/R/parse.R b/R/parse.R index 40b6045..7184779 100644 --- a/R/parse.R +++ b/R/parse.R @@ -13,16 +13,16 @@ read_all <- function(path="records/") { available_protocols <- list.files(path) res <- pblapply(available_protocols, read_one, path=path) - lapply(res, `[[`, "redner") %>% + lapply(res, `[[`, "speaker") %>% bind_rows() %>% distinct() -> - redner + speaker - lapply(res, `[[`, "reden") %>% + lapply(res, `[[`, "speeches") %>% bind_rows() %>% distinct() %>% mutate(date = as.Date(date, format="%d.%m.%Y")) -> - reden + speeches lapply(res, `[[`, "talks") %>% bind_rows() %>% @@ -51,7 +51,7 @@ read_all <- function(path="records/") { select(-fraktion) -> applause - list(redner = redner, reden = reden, talks = talks, comments = comments, applause = applause) + list(speaker = speaker, speeches = speeches, talks = talks, comments = comments, applause = applause) } # this reads all currently parseable data from one xml @@ -64,18 +64,18 @@ read_one <- function(name, path) { cs <- xml_children(x) verlauf <- xml_find_first(x, "sitzungsverlauf") - rednerl <- xml_find_first(x, "rednerliste") + speakerl <- xml_find_first(x, "rednerliste") - xml_children(rednerl) %>% - parse_rednerliste() -> - redner + xml_children(speakerl) %>% + parse_speakerlist() -> + speaker xml_children(verlauf) %>% xml_find_all("rede") %>% - parse_redenliste(date) -> + parse_speechlist(date) -> res - list(redner = redner, reden = res$reden, talks = res$talks, comments = res$comments) + list(speaker = speaker, speeches = res$speeches, talks = res$talks, comments = res$comments) } xml_get <- function(node, name) { @@ -84,10 +84,10 @@ xml_get <- function(node, name) { else res } -# parse one redner -parse_redner <- function(redner_xml) { - redner_id <- xml_attr(redner_xml, "id") - nm <- xml_child(redner_xml) +# parse one speaker +parse_speaker <- function(speaker_xml) { + speaker_id <- xml_attr(speaker_xml, "id") + nm <- xml_child(speaker_xml) vorname <- xml_get(nm, "vorname") nachname <- xml_get(nm, "nachname") fraktion <- xml_get(nm, "fraktion") @@ -97,39 +97,39 @@ parse_redner <- function(redner_xml) { rolle_lang <- xml_get(rolle, "rolle_lang") rolle_kurz <- xml_get(rolle, "rolle_kurz") } else rolle_kurz <- rolle_lang <- NA_character_ - c(id = redner_id, vorname = vorname, nachname = nachname, fraktion = fraktion, titel = titel, + c(id = speaker_id, vorname = vorname, nachname = nachname, fraktion = fraktion, titel = titel, rolle_kurz = rolle_kurz, rolle_lang = rolle_lang) } -# parse one rede -# returns: - a rede (with rede id and redner id) -# - all talks appearing in the rede (with corresponding content) -parse_rede <- function(rede_xml, date) { - rede_id <- xml_attr(rede_xml, "id") - cs <- xml_children(rede_xml) - cur_redner <- NA_character_ - principal_redner <- NA_character_ +# parse one speech +# returns: - a speech (with speech id and speaker id) +# - all talks appearing in the speech (with corresponding content) +parse_speech <- function(speech_xml, date) { + speech_id <- xml_attr(speech_xml, "id") + cs <- xml_children(speech_xml) + cur_speaker <- NA_character_ + principal_speaker <- NA_character_ cur_content <- "" - reden <- list() + speeches <- list() comments <- list() for (node in cs) { if (xml_name(node) == "p" || xml_name(node) == "name") { klasse <- xml_attr(node, "klasse") - if ((!is.na(klasse) && klasse == "redner") || xml_name(node) == "name") { - if (!is.na(cur_redner)) { - rede <- c(rede_id = rede_id, - redner = cur_redner, + if ((!is.na(klasse) && klasse == "speaker") || xml_name(node) == "name") { + if (!is.na(cur_speaker)) { + speech <- c(speech_id = speech_id, + speaker = cur_speaker, content = cur_content) - reden <- c(reden, list(rede)) + speeches <- c(speeches, list(speech)) cur_content <- "" } - if (is.na(principal_redner) && xml_name(node) != "name") { - principal_redner <- xml_child(node) %>% xml_attr("id") + if (is.na(principal_speaker) && xml_name(node) != "name") { + principal_speaker <- xml_child(node) %>% xml_attr("id") } if (xml_name(node) == "name") { - cur_redner <- "BTP" + cur_speaker <- "BTP" } else { - cur_redner <- xml_child(node) %>% xml_attr("id") + cur_speaker <- xml_child(node) %>% xml_attr("id") } } else { cur_content <- paste0(cur_content, xml_text(node), sep="\n") @@ -141,25 +141,25 @@ parse_rede <- function(rede_xml, date) { str_sub(2, -2) %>% str_split("–") %>% `[[`(1) %>% - lapply(parse_comment, rede_id = rede_id, on_redner = cur_redner) -> + lapply(parse_comment, speech_id = speech_id, on_speaker = cur_speaker) -> cs comments <- c(comments, cs) } } - rede <- c(rede_id = rede_id, - redner = cur_redner, + speech <- c(speech_id = speech_id, + speaker = cur_speaker, content = cur_content) - reden <- c(reden, list(rede)) - list(rede = c(id = rede_id, redner = principal_redner, date = date), - parts = reden, + speeches <- c(speeches, list(speech)) + list(speech = c(id = speech_id, speaker = principal_speaker, date = date), + parts = speeches, comments = comments) } fraktionspattern <- "BÜNDNIS(SES)?\\W*90/DIE\\W*GRÜNEN|CDU/CSU|AfD|SPD|DIE LINKE|FDP|LINKEN" fraktionsnames <- c("BÜNDNIS 90/DIE GRÜNEN", "CDU/CSU", "AfD", "SPD", "DIE LINKE", "FDP") -parse_comment <- function(comment, rede_id, on_redner) { - base <- c(rede_id = rede_id, on_redner = on_redner) +parse_comment <- function(comment, speech_id, on_speaker) { + base <- c(speech_id = speech_id, on_speaker = on_speaker) # classify comment if(str_detect(comment, "Beifall")) { str_extract_all(comment, fraktionspattern) %>% @@ -174,28 +174,28 @@ parse_comment <- function(comment, rede_id, on_redner) { } } -# creates a tibble of reden and a tibble of talks from a list of xml nodes representing reden -parse_redenliste <- function(redenliste_xml, date) { - d <- sapply(redenliste_xml, parse_rede, date = date) - reden <- simplify2array(d["rede", ]) +# creates a tibble of speeches and a tibble of talks from a list of xml nodes representing speeches +parse_speechlist <- function(speechlist_xml, date) { + d <- sapply(speechlist_xml, parse_speech, date = date) + speeches <- simplify2array(d["speech", ]) parts <- simplify2array %$% unlist(d["parts", ], recursive=FALSE) comments <- simplify2array %$% unlist(d["comments", ], recursive=FALSE) - list(reden = tibble(id = reden["id",], redner = reden["redner",], - date = reden["date",]), - talks = tibble(rede_id = parts["rede_id", ], - redner = parts["redner", ], + list(speeches = tibble(id = speeches["id",], speaker = speeches["speaker",], + date = speeches["date",]), + talks = tibble(speech_id = parts["speech_id", ], + speaker = parts["speaker", ], content = parts["content", ]), - comments = tibble(rede_id = comments["rede_id",], - on_redner = comments["on_redner",], + comments = tibble(speech_id = comments["speech_id",], + on_speaker = comments["on_speaker",], type = comments["type",], fraktion = comments["fraktion",], kommentator = comments["kommentator",], content = comments["content", ])) } -# create a tibble of redner from a list of xml nodes representing redner -parse_rednerliste <- function(rednerliste_xml) { - d <- sapply(rednerliste_xml, parse_redner) +# create a tibble of speaker from a list of xml nodes representing speaker +parse_speakerliste <- function(speakerliste_xml) { + d <- sapply(speakerliste_xml, parse_speaker) tibble(id = d["id",], vorname = d["vorname",], nachname = d["nachname",], @@ -208,8 +208,8 @@ parse_rednerliste <- function(rednerliste_xml) { #' @export write_to_csv <- function(tables, path="csv/", create=F) { check_directory(path, create) - write.table(tables$redner, str_c(path, "redner.csv")) - write.table(tables$reden, str_c(path, "reden.csv")) + write.table(tables$speaker, str_c(path, "speaker.csv")) + write.table(tables$speeches, str_c(path, "speeches.csv")) write.table(tables$talks, str_c(path, "talks.csv")) write.table(tables$comments, str_c(path, "comments.csv")) write.table(tables$applause, str_c(path, "applause.csv")) @@ -217,12 +217,12 @@ write_to_csv <- function(tables, path="csv/", create=F) { #' @export read_from_csv <- function(path="csv/") { - list(redner = read.table(str_c(path, "redner.csv")) %>% + list(speaker = read.table(str_c(path, "speaker.csv")) %>% tibble() %>% mutate(id = as.character(id)), - reden = read.table(str_c(path, "reden.csv")) %>% + speeches = read.table(str_c(path, "speeches.csv")) %>% tibble() %>% - mutate(redner = as.character(redner)), + mutate(speaker = as.character(speaker)), talks = tibble %$% read.table(str_c(path, "talks.csv")), comments = tibble %$% read.table(str_c(path, "comments.csv")), applause = tibble %$% read.table(str_c(path, "applause.csv"))) @@ -234,8 +234,8 @@ read_from_csv <- function(path="csv/") { # make sure data ist downloaded via fetch.R # res <- read_one("records/19126-data.xml") # -# res$redner -# res$reden +# res$speaker +# res$speeches # res$talks # ------------------------------- diff --git a/R/repair.R b/R/repair.R index 21d5738..522a903 100644 --- a/R/repair.R +++ b/R/repair.R @@ -21,10 +21,10 @@ longest_titel <- function(titel) { # takes character vector, removes duplicates and collapses collect_unique <- function(xs) xs %>% clear_na() %>% unique() %>% str_c(collapse="&") %>% na_if("") -# expects a tibble of redner and repairs -repair_redner <- function(redner) { - if (nrow(redner) == 0) return(redner) - redner %>% +# expects a tibble of speaker and repairs +repair_speaker <- function(speaker) { + if (nrow(speaker) == 0) return(speaker) + speaker %>% filter(id != "10000") %>% # invalid id's mutate(fraktion = Vectorize(repair_fraktion)(fraktion)) %>% # fix fraktion group_by(id) %>% @@ -39,10 +39,10 @@ repair_redner <- function(redner) { # distinct(vorname, nachname, fraktion, titel) } -repair_reden <- function(reden) { - if (nrow(reden) == 0) return(reden) +repair_speeches <- function(speeches) { + if (nrow(speeches) == 0) return(speeches) # TODO: fill with content - reden + speeches } repair_talks <- function(talks) { @@ -51,13 +51,13 @@ repair_talks <- function(talks) { filter(talks, str_length(content) > 0) } -# tries to find the correct redner id given a name +# tries to find the correct speaker id given a name # this is sufficient since every prename lastname combination in the bundestag is # unique (luckily :D) # returns a lookup table -lookup_redner <- function(comments, redner) { +lookup_speaker <- function(comments, speaker) { tobereplaced <- "[-–—‑­­-­­­ ]" - redner %>% + speaker %>% unite(name, vorname, nachname, sep=".*") %>% mutate(name = str_replace_all(name, tobereplaced, ".*")) -> rs @@ -70,14 +70,14 @@ lookup_redner <- function(comments, redner) { } comments %>% distinct(kommentator) %>% - mutate(redner = Vectorize(find_match)(str_replace_all(kommentator, tobereplaced, ""))) + mutate(speaker = Vectorize(find_match)(str_replace_all(kommentator, tobereplaced, ""))) } -repair_comments <- function(comments, redner) { - # try to find a redner id for each actual comment +repair_comments <- function(comments, speaker) { + # try to find a speaker id for each actual comment comments %>% filter(!is.na(kommentator)) %>% - lookup_redner(redner) %>% + lookup_speaker(speaker) %>% left_join(comments, ., by="kommentator") %>% select(-kommentator) } @@ -86,8 +86,8 @@ repair_comments <- function(comments, redner) { #' #' @export repair <- function(parse_output) { - list(redner = repair_redner(parse_output$redner), - reden = repair_reden(parse_output$reden), + list(speaker = repair_speaker(parse_output$speaker), + speeches = repair_speeches(parse_output$speeches), talks = repair_talks(parse_output$talks), #comments = repair_comments(parse_output$comments) comments = parse_output$comments, diff --git a/vignettes/funwithdata.Rmd b/vignettes/funwithdata.Rmd index d5dded1..1e359fe 100644 --- a/vignettes/funwithdata.Rmd +++ b/vignettes/funwithdata.Rmd @@ -42,8 +42,8 @@ res <- read_from_csv('../csv/') and unpack our tibbles ```{r} comments <- res$comments -reden <- res$reden -redner <- res$redner +speeches <- res$speeches +speaker <- res$speaker talks <- res$talks ``` @@ -51,11 +51,11 @@ talks <- res$talks Now we can start analysing our parsed dataset, e.g. find out which party gives the most talks: ```{r, fig.width=7} -join_redner(res$reden, res) %>% +join_speaker(res$speeches, res) %>% group_by(fraktion) %>% summarize(n = n()) %>% arrange(n) %>% - bar_plot_fraktionen(title="Number of speeches given by fraction", + bar_plot_fractions(title="Number of speeches given by fraction", ylab="Number of speeches") ``` @@ -64,13 +64,13 @@ or counting the occurences of a given word: ```{r, fig.width=7} find_word(res, "Kohleausstieg") %>% filter(occurences > 0) %>% - join_redner(res) %>% + join_speaker(res) %>% select(content, fraktion) %>% filter(!is.na(fraktion)) %>% group_by(fraktion) %>% summarize(n = n()) %>% arrange(desc(n)) %>% - bar_plot_fraktionen(title = "Parties using the word 'Kohleausstieg' the most (absolutely)", + bar_plot_fractions(title = "Parties using the word 'Kohleausstieg' the most (absolutely)", ylab = "Number of uses of 'Kohleausstieg'", flipped = F) ``` @@ -78,11 +78,11 @@ find_word(res, "Kohleausstieg") %>% ### Who gives the most speeches? ```{r} -res$reden %>% - group_by(redner) %>% +res$speeches %>% + group_by(speaker) %>% summarize(n = n()) %>% arrange(-n) %>% - left_join(res$redner, by=c("redner" = "id")) %>% + left_join(res$speaker, by=c("speaker" = "id")) %>% head(10) ``` @@ -91,10 +91,10 @@ res$reden %>% ```{r} res$talks %>% mutate(content_len = str_length(content)) %>% - group_by(redner) %>% + group_by(speaker) %>% summarize(avg_content_len = mean(content_len)) %>% arrange(-avg_content_len) %>% - left_join(res$redner, by=c("redner" = "id")) %>% + left_join(res$speaker, by=c("speaker" = "id")) %>% head(10) ``` @@ -102,7 +102,7 @@ res$talks %>% ```{r} res$applause %>% - left_join(res$redner, by=c("on_redner" = "id")) %>% + left_join(res$speaker, by=c("on_speaker" = "id")) %>% select(on_fraktion = fraktion, where(is.logical)) %>% group_by(on_fraktion) %>% arrange(on_fraktion) %>% @@ -119,7 +119,7 @@ For plotting our results we reorganize them a bit and produce a bar plot: ```{r, fig.width=7} pivot_longer(tb, where(is.numeric), "by_fraktion", "count") %>% filter(!is.na(on_fraktion)) %>% - bar_plot_fraktionen(x_variable = on_fraktion, + bar_plot_fractions(x_variable = on_fraktion, y_variable = value, fill = by_fraktion, title = "Number of rounds of applauses from fractions to fractions", @@ -134,7 +134,7 @@ pivot_longer(tb, where(is.numeric), "by_fraktion", "count") %>% ```{r} res$comments %>% - left_join(res$redner, by=c("on_redner" = "id")) %>% + left_join(res$speaker, by=c("on_speaker" = "id")) %>% select(by_fraktion = fraktion.x, on_fraktion = fraktion.y) %>% group_by(on_fraktion) %>% summarize(`AfD` = sum(str_detect(by_fraktion, "AfD"), na.rm=T), @@ -149,7 +149,7 @@ Analogously we plot the results: ```{r, fig.width=7} pivot_longer(tb, where(is.numeric), "by_fraktion", "count") %>% filter(!is.na(on_fraktion)) %>% - bar_plot_fraktionen(x_variable = on_fraktion, + bar_plot_fractions(x_variable = on_fraktion, y_variable = value, fill = by_fraktion, title = "Number of comments from fractions to fractions", diff --git a/vignettes/hitlercomparison.Rmd b/vignettes/hitlercomparison.Rmd index 8b2449b..4dee93c 100644 --- a/vignettes/hitlercomparison.Rmd +++ b/vignettes/hitlercomparison.Rmd @@ -31,8 +31,8 @@ Second, those `.xml` files, need to be parsed into `R` `tibbles`. This is accomp ```r read_all("../records/") %>% repair() -> res -reden <- res$reden -redner <- res$redner +speeches <- res$speeches +speaker <- res$speaker talks <- res$talks ``` We also used `repair` to fix a bunch of formatting issues in the records and unpacked @@ -43,8 +43,8 @@ For development purposes, we load the tables from csv files. tables <- read_from_csv('../csv/') comments <- tables$comments -reden <- tables$reden -redner <- tables$redner +speeches <- tables$speeches +speaker <- tables$speaker talks <- tables$talks ``` @@ -60,7 +60,7 @@ hitlerwords <- tibble(Worte) Now we extract the words that were used with higher frequency by one party and compare them with `hitlerwords`. ```{r} talks %>% - left_join(redner, by=c(redner='id')) %>% + left_join(speaker, by=c(speaker='id')) %>% group_by(fraktion) %>% summarize(full_text=str_c(content, collapse="\n")) -> talks_by_fraktion ``` @@ -169,5 +169,5 @@ hitler_comparison ``` Finally, we want to plot our results: ```{r, fig.width=7} -bar_plot_fraktionen(hitler_comparison, y_variable = percent, title="Coincidence of party vocabulary with nazi vocabulary", ylab="unique 'nazi' words per total (unique) fraction words [%]") +bar_plot_fractions(hitler_comparison, y_variable = percent, title="Coincidence of party vocabulary with nazi vocabulary", ylab="unique 'nazi' words per total (unique) fraction words [%]") ```