| @@ -1,9 +1,10 @@ | |||
| # Generated by roxygen2: do not edit by hand | |||
| export(bar_plot_fraktionen) | |||
| export(bar_plot_fractions) | |||
| export(fetch_all) | |||
| export(find_word) | |||
| export(join_redner) | |||
| export(join_speaker) | |||
| export(party_colors) | |||
| export(read_all) | |||
| export(read_from_csv) | |||
| export(repair) | |||
| @@ -6,8 +6,8 @@ find_word <- function(res, word) { | |||
| } | |||
| #' @export | |||
| join_redner <- function(tb, res, fraktion_only = F) { | |||
| joined <- left_join(tb, res$redner, by=c("redner" = "id")) | |||
| join_speaker <- function(tb, res, fraktion_only = F) { | |||
| joined <- left_join(tb, res$speaker, by=c("speaker" = "id")) | |||
| if (fraktion_only) select(joined, "fraktion") | |||
| else joined | |||
| } | |||
| @@ -29,7 +29,7 @@ party_order <- factor(c("Fraktionslos", "AfD&Fraktionslos", | |||
| "FDP", "AfD", NA_character_)) | |||
| #' @export | |||
| bar_plot_fraktionen <- function(tb, | |||
| bar_plot_fractions <- function(tb, | |||
| x_variable = NULL, # default is fraktion | |||
| y_variable = NULL, # default is n | |||
| fill = NULL, # default is fraktion | |||
| @@ -82,7 +82,7 @@ word_usage_by_date <- function(res, patterns, name, tidy=F) { | |||
| else name <- patterns[[i]] | |||
| tb <- mutate(tb, {{name}} := str_count(content, patterns[[i]])) | |||
| } | |||
| left_join(tb, res$reden, by=c("rede_id" = "id")) %>% | |||
| left_join(tb, res$speeches, by=c("speech_id" = "id")) %>% | |||
| group_by(date) %>% | |||
| summarize(across(where(is.numeric), sum)) %>% | |||
| arrange(date) -> tb | |||
| @@ -13,16 +13,16 @@ read_all <- function(path="records/") { | |||
| available_protocols <- list.files(path) | |||
| res <- pblapply(available_protocols, read_one, path=path) | |||
| lapply(res, `[[`, "redner") %>% | |||
| lapply(res, `[[`, "speaker") %>% | |||
| bind_rows() %>% | |||
| distinct() -> | |||
| redner | |||
| speaker | |||
| lapply(res, `[[`, "reden") %>% | |||
| lapply(res, `[[`, "speeches") %>% | |||
| bind_rows() %>% | |||
| distinct() %>% | |||
| mutate(date = as.Date(date, format="%d.%m.%Y")) -> | |||
| reden | |||
| speeches | |||
| lapply(res, `[[`, "talks") %>% | |||
| bind_rows() %>% | |||
| @@ -51,7 +51,7 @@ read_all <- function(path="records/") { | |||
| select(-fraktion) -> | |||
| applause | |||
| list(redner = redner, reden = reden, talks = talks, comments = comments, applause = applause) | |||
| list(speaker = speaker, speeches = speeches, talks = talks, comments = comments, applause = applause) | |||
| } | |||
| # this reads all currently parseable data from one xml | |||
| @@ -64,18 +64,18 @@ read_one <- function(name, path) { | |||
| cs <- xml_children(x) | |||
| verlauf <- xml_find_first(x, "sitzungsverlauf") | |||
| rednerl <- xml_find_first(x, "rednerliste") | |||
| speakerl <- xml_find_first(x, "rednerliste") | |||
| xml_children(rednerl) %>% | |||
| parse_rednerliste() -> | |||
| redner | |||
| xml_children(speakerl) %>% | |||
| parse_speakerlist() -> | |||
| speaker | |||
| xml_children(verlauf) %>% | |||
| xml_find_all("rede") %>% | |||
| parse_redenliste(date) -> | |||
| parse_speechlist(date) -> | |||
| res | |||
| list(redner = redner, reden = res$reden, talks = res$talks, comments = res$comments) | |||
| list(speaker = speaker, speeches = res$speeches, talks = res$talks, comments = res$comments) | |||
| } | |||
| xml_get <- function(node, name) { | |||
| @@ -84,10 +84,10 @@ xml_get <- function(node, name) { | |||
| else res | |||
| } | |||
| # parse one redner | |||
| parse_redner <- function(redner_xml) { | |||
| redner_id <- xml_attr(redner_xml, "id") | |||
| nm <- xml_child(redner_xml) | |||
| # parse one speaker | |||
| parse_speaker <- function(speaker_xml) { | |||
| speaker_id <- xml_attr(speaker_xml, "id") | |||
| nm <- xml_child(speaker_xml) | |||
| vorname <- xml_get(nm, "vorname") | |||
| nachname <- xml_get(nm, "nachname") | |||
| fraktion <- xml_get(nm, "fraktion") | |||
| @@ -97,39 +97,39 @@ parse_redner <- function(redner_xml) { | |||
| rolle_lang <- xml_get(rolle, "rolle_lang") | |||
| rolle_kurz <- xml_get(rolle, "rolle_kurz") | |||
| } else rolle_kurz <- rolle_lang <- NA_character_ | |||
| c(id = redner_id, vorname = vorname, nachname = nachname, fraktion = fraktion, titel = titel, | |||
| c(id = speaker_id, vorname = vorname, nachname = nachname, fraktion = fraktion, titel = titel, | |||
| rolle_kurz = rolle_kurz, rolle_lang = rolle_lang) | |||
| } | |||
| # parse one rede | |||
| # returns: - a rede (with rede id and redner id) | |||
| # - all talks appearing in the rede (with corresponding content) | |||
| parse_rede <- function(rede_xml, date) { | |||
| rede_id <- xml_attr(rede_xml, "id") | |||
| cs <- xml_children(rede_xml) | |||
| cur_redner <- NA_character_ | |||
| principal_redner <- NA_character_ | |||
| # parse one speech | |||
| # returns: - a speech (with speech id and speaker id) | |||
| # - all talks appearing in the speech (with corresponding content) | |||
| parse_speech <- function(speech_xml, date) { | |||
| speech_id <- xml_attr(speech_xml, "id") | |||
| cs <- xml_children(speech_xml) | |||
| cur_speaker <- NA_character_ | |||
| principal_speaker <- NA_character_ | |||
| cur_content <- "" | |||
| reden <- list() | |||
| speeches <- list() | |||
| comments <- list() | |||
| for (node in cs) { | |||
| if (xml_name(node) == "p" || xml_name(node) == "name") { | |||
| klasse <- xml_attr(node, "klasse") | |||
| if ((!is.na(klasse) && klasse == "redner") || xml_name(node) == "name") { | |||
| if (!is.na(cur_redner)) { | |||
| rede <- c(rede_id = rede_id, | |||
| redner = cur_redner, | |||
| if ((!is.na(klasse) && klasse == "speaker") || xml_name(node) == "name") { | |||
| if (!is.na(cur_speaker)) { | |||
| speech <- c(speech_id = speech_id, | |||
| speaker = cur_speaker, | |||
| content = cur_content) | |||
| reden <- c(reden, list(rede)) | |||
| speeches <- c(speeches, list(speech)) | |||
| cur_content <- "" | |||
| } | |||
| if (is.na(principal_redner) && xml_name(node) != "name") { | |||
| principal_redner <- xml_child(node) %>% xml_attr("id") | |||
| if (is.na(principal_speaker) && xml_name(node) != "name") { | |||
| principal_speaker <- xml_child(node) %>% xml_attr("id") | |||
| } | |||
| if (xml_name(node) == "name") { | |||
| cur_redner <- "BTP" | |||
| cur_speaker <- "BTP" | |||
| } else { | |||
| cur_redner <- xml_child(node) %>% xml_attr("id") | |||
| cur_speaker <- xml_child(node) %>% xml_attr("id") | |||
| } | |||
| } else { | |||
| cur_content <- paste0(cur_content, xml_text(node), sep="\n") | |||
| @@ -141,25 +141,25 @@ parse_rede <- function(rede_xml, date) { | |||
| str_sub(2, -2) %>% | |||
| str_split("–") %>% | |||
| `[[`(1) %>% | |||
| lapply(parse_comment, rede_id = rede_id, on_redner = cur_redner) -> | |||
| lapply(parse_comment, speech_id = speech_id, on_speaker = cur_speaker) -> | |||
| cs | |||
| comments <- c(comments, cs) | |||
| } | |||
| } | |||
| rede <- c(rede_id = rede_id, | |||
| redner = cur_redner, | |||
| speech <- c(speech_id = speech_id, | |||
| speaker = cur_speaker, | |||
| content = cur_content) | |||
| reden <- c(reden, list(rede)) | |||
| list(rede = c(id = rede_id, redner = principal_redner, date = date), | |||
| parts = reden, | |||
| speeches <- c(speeches, list(speech)) | |||
| list(speech = c(id = speech_id, speaker = principal_speaker, date = date), | |||
| parts = speeches, | |||
| comments = comments) | |||
| } | |||
| fraktionspattern <- "BÜNDNIS(SES)?\\W*90/DIE\\W*GRÜNEN|CDU/CSU|AfD|SPD|DIE LINKE|FDP|LINKEN" | |||
| fraktionsnames <- c("BÜNDNIS 90/DIE GRÜNEN", "CDU/CSU", "AfD", "SPD", "DIE LINKE", "FDP") | |||
| parse_comment <- function(comment, rede_id, on_redner) { | |||
| base <- c(rede_id = rede_id, on_redner = on_redner) | |||
| parse_comment <- function(comment, speech_id, on_speaker) { | |||
| base <- c(speech_id = speech_id, on_speaker = on_speaker) | |||
| # classify comment | |||
| if(str_detect(comment, "Beifall")) { | |||
| str_extract_all(comment, fraktionspattern) %>% | |||
| @@ -174,28 +174,28 @@ parse_comment <- function(comment, rede_id, on_redner) { | |||
| } | |||
| } | |||
| # creates a tibble of reden and a tibble of talks from a list of xml nodes representing reden | |||
| parse_redenliste <- function(redenliste_xml, date) { | |||
| d <- sapply(redenliste_xml, parse_rede, date = date) | |||
| reden <- simplify2array(d["rede", ]) | |||
| # creates a tibble of speeches and a tibble of talks from a list of xml nodes representing speeches | |||
| parse_speechlist <- function(speechlist_xml, date) { | |||
| d <- sapply(speechlist_xml, parse_speech, date = date) | |||
| speeches <- simplify2array(d["speech", ]) | |||
| parts <- simplify2array %$% unlist(d["parts", ], recursive=FALSE) | |||
| comments <- simplify2array %$% unlist(d["comments", ], recursive=FALSE) | |||
| list(reden = tibble(id = reden["id",], redner = reden["redner",], | |||
| date = reden["date",]), | |||
| talks = tibble(rede_id = parts["rede_id", ], | |||
| redner = parts["redner", ], | |||
| list(speeches = tibble(id = speeches["id",], speaker = speeches["speaker",], | |||
| date = speeches["date",]), | |||
| talks = tibble(speech_id = parts["speech_id", ], | |||
| speaker = parts["speaker", ], | |||
| content = parts["content", ]), | |||
| comments = tibble(rede_id = comments["rede_id",], | |||
| on_redner = comments["on_redner",], | |||
| comments = tibble(speech_id = comments["speech_id",], | |||
| on_speaker = comments["on_speaker",], | |||
| type = comments["type",], | |||
| fraktion = comments["fraktion",], | |||
| kommentator = comments["kommentator",], | |||
| content = comments["content", ])) | |||
| } | |||
| # create a tibble of redner from a list of xml nodes representing redner | |||
| parse_rednerliste <- function(rednerliste_xml) { | |||
| d <- sapply(rednerliste_xml, parse_redner) | |||
| # create a tibble of speaker from a list of xml nodes representing speaker | |||
| parse_speakerliste <- function(speakerliste_xml) { | |||
| d <- sapply(speakerliste_xml, parse_speaker) | |||
| tibble(id = d["id",], | |||
| vorname = d["vorname",], | |||
| nachname = d["nachname",], | |||
| @@ -208,8 +208,8 @@ parse_rednerliste <- function(rednerliste_xml) { | |||
| #' @export | |||
| write_to_csv <- function(tables, path="csv/", create=F) { | |||
| check_directory(path, create) | |||
| write.table(tables$redner, str_c(path, "redner.csv")) | |||
| write.table(tables$reden, str_c(path, "reden.csv")) | |||
| write.table(tables$speaker, str_c(path, "speaker.csv")) | |||
| write.table(tables$speeches, str_c(path, "speeches.csv")) | |||
| write.table(tables$talks, str_c(path, "talks.csv")) | |||
| write.table(tables$comments, str_c(path, "comments.csv")) | |||
| write.table(tables$applause, str_c(path, "applause.csv")) | |||
| @@ -217,12 +217,12 @@ write_to_csv <- function(tables, path="csv/", create=F) { | |||
| #' @export | |||
| read_from_csv <- function(path="csv/") { | |||
| list(redner = read.table(str_c(path, "redner.csv")) %>% | |||
| list(speaker = read.table(str_c(path, "speaker.csv")) %>% | |||
| tibble() %>% | |||
| mutate(id = as.character(id)), | |||
| reden = read.table(str_c(path, "reden.csv")) %>% | |||
| speeches = read.table(str_c(path, "speeches.csv")) %>% | |||
| tibble() %>% | |||
| mutate(redner = as.character(redner)), | |||
| mutate(speaker = as.character(speaker)), | |||
| talks = tibble %$% read.table(str_c(path, "talks.csv")), | |||
| comments = tibble %$% read.table(str_c(path, "comments.csv")), | |||
| applause = tibble %$% read.table(str_c(path, "applause.csv"))) | |||
| @@ -234,8 +234,8 @@ read_from_csv <- function(path="csv/") { | |||
| # make sure data ist downloaded via fetch.R | |||
| # res <- read_one("records/19126-data.xml") | |||
| # | |||
| # res$redner | |||
| # res$reden | |||
| # res$speaker | |||
| # res$speeches | |||
| # res$talks | |||
| # ------------------------------- | |||
| @@ -21,10 +21,10 @@ longest_titel <- function(titel) { | |||
| # takes character vector, removes duplicates and collapses | |||
| collect_unique <- function(xs) xs %>% clear_na() %>% unique() %>% str_c(collapse="&") %>% na_if("") | |||
| # expects a tibble of redner and repairs | |||
| repair_redner <- function(redner) { | |||
| if (nrow(redner) == 0) return(redner) | |||
| redner %>% | |||
| # expects a tibble of speaker and repairs | |||
| repair_speaker <- function(speaker) { | |||
| if (nrow(speaker) == 0) return(speaker) | |||
| speaker %>% | |||
| filter(id != "10000") %>% # invalid id's | |||
| mutate(fraktion = Vectorize(repair_fraktion)(fraktion)) %>% # fix fraktion | |||
| group_by(id) %>% | |||
| @@ -39,10 +39,10 @@ repair_redner <- function(redner) { | |||
| # distinct(vorname, nachname, fraktion, titel) | |||
| } | |||
| repair_reden <- function(reden) { | |||
| if (nrow(reden) == 0) return(reden) | |||
| repair_speeches <- function(speeches) { | |||
| if (nrow(speeches) == 0) return(speeches) | |||
| # TODO: fill with content | |||
| reden | |||
| speeches | |||
| } | |||
| repair_talks <- function(talks) { | |||
| @@ -51,13 +51,13 @@ repair_talks <- function(talks) { | |||
| filter(talks, str_length(content) > 0) | |||
| } | |||
| # tries to find the correct redner id given a name | |||
| # tries to find the correct speaker id given a name | |||
| # this is sufficient since every prename lastname combination in the bundestag is | |||
| # unique (luckily :D) | |||
| # returns a lookup table | |||
| lookup_redner <- function(comments, redner) { | |||
| lookup_speaker <- function(comments, speaker) { | |||
| tobereplaced <- "[-–—‑- ]" | |||
| redner %>% | |||
| speaker %>% | |||
| unite(name, vorname, nachname, sep=".*") %>% | |||
| mutate(name = str_replace_all(name, tobereplaced, ".*")) -> | |||
| rs | |||
| @@ -70,14 +70,14 @@ lookup_redner <- function(comments, redner) { | |||
| } | |||
| comments %>% | |||
| distinct(kommentator) %>% | |||
| mutate(redner = Vectorize(find_match)(str_replace_all(kommentator, tobereplaced, ""))) | |||
| mutate(speaker = Vectorize(find_match)(str_replace_all(kommentator, tobereplaced, ""))) | |||
| } | |||
| repair_comments <- function(comments, redner) { | |||
| # try to find a redner id for each actual comment | |||
| repair_comments <- function(comments, speaker) { | |||
| # try to find a speaker id for each actual comment | |||
| comments %>% | |||
| filter(!is.na(kommentator)) %>% | |||
| lookup_redner(redner) %>% | |||
| lookup_speaker(speaker) %>% | |||
| left_join(comments, ., by="kommentator") %>% | |||
| select(-kommentator) | |||
| } | |||
| @@ -86,8 +86,8 @@ repair_comments <- function(comments, redner) { | |||
| #' | |||
| #' @export | |||
| repair <- function(parse_output) { | |||
| list(redner = repair_redner(parse_output$redner), | |||
| reden = repair_reden(parse_output$reden), | |||
| list(speaker = repair_speaker(parse_output$speaker), | |||
| speeches = repair_speeches(parse_output$speeches), | |||
| talks = repair_talks(parse_output$talks), | |||
| #comments = repair_comments(parse_output$comments) | |||
| comments = parse_output$comments, | |||
| @@ -42,8 +42,8 @@ res <- read_from_csv('../csv/') | |||
| and unpack our tibbles | |||
| ```{r} | |||
| comments <- res$comments | |||
| reden <- res$reden | |||
| redner <- res$redner | |||
| speeches <- res$speeches | |||
| speaker <- res$speaker | |||
| talks <- res$talks | |||
| ``` | |||
| @@ -51,11 +51,11 @@ talks <- res$talks | |||
| Now we can start analysing our parsed dataset, e.g. find out which party gives the most talks: | |||
| ```{r, fig.width=7} | |||
| join_redner(res$reden, res) %>% | |||
| join_speaker(res$speeches, res) %>% | |||
| group_by(fraktion) %>% | |||
| summarize(n = n()) %>% | |||
| arrange(n) %>% | |||
| bar_plot_fraktionen(title="Number of speeches given by fraction", | |||
| bar_plot_fractions(title="Number of speeches given by fraction", | |||
| ylab="Number of speeches") | |||
| ``` | |||
| @@ -64,13 +64,13 @@ or counting the occurences of a given word: | |||
| ```{r, fig.width=7} | |||
| find_word(res, "Kohleausstieg") %>% | |||
| filter(occurences > 0) %>% | |||
| join_redner(res) %>% | |||
| join_speaker(res) %>% | |||
| select(content, fraktion) %>% | |||
| filter(!is.na(fraktion)) %>% | |||
| group_by(fraktion) %>% | |||
| summarize(n = n()) %>% | |||
| arrange(desc(n)) %>% | |||
| bar_plot_fraktionen(title = "Parties using the word 'Kohleausstieg' the most (absolutely)", | |||
| bar_plot_fractions(title = "Parties using the word 'Kohleausstieg' the most (absolutely)", | |||
| ylab = "Number of uses of 'Kohleausstieg'", | |||
| flipped = F) | |||
| ``` | |||
| @@ -78,11 +78,11 @@ find_word(res, "Kohleausstieg") %>% | |||
| ### Who gives the most speeches? | |||
| ```{r} | |||
| res$reden %>% | |||
| group_by(redner) %>% | |||
| res$speeches %>% | |||
| group_by(speaker) %>% | |||
| summarize(n = n()) %>% | |||
| arrange(-n) %>% | |||
| left_join(res$redner, by=c("redner" = "id")) %>% | |||
| left_join(res$speaker, by=c("speaker" = "id")) %>% | |||
| head(10) | |||
| ``` | |||
| @@ -91,10 +91,10 @@ res$reden %>% | |||
| ```{r} | |||
| res$talks %>% | |||
| mutate(content_len = str_length(content)) %>% | |||
| group_by(redner) %>% | |||
| group_by(speaker) %>% | |||
| summarize(avg_content_len = mean(content_len)) %>% | |||
| arrange(-avg_content_len) %>% | |||
| left_join(res$redner, by=c("redner" = "id")) %>% | |||
| left_join(res$speaker, by=c("speaker" = "id")) %>% | |||
| head(10) | |||
| ``` | |||
| @@ -102,7 +102,7 @@ res$talks %>% | |||
| ```{r} | |||
| res$applause %>% | |||
| left_join(res$redner, by=c("on_redner" = "id")) %>% | |||
| left_join(res$speaker, by=c("on_speaker" = "id")) %>% | |||
| select(on_fraktion = fraktion, where(is.logical)) %>% | |||
| group_by(on_fraktion) %>% | |||
| arrange(on_fraktion) %>% | |||
| @@ -119,7 +119,7 @@ For plotting our results we reorganize them a bit and produce a bar plot: | |||
| ```{r, fig.width=7} | |||
| pivot_longer(tb, where(is.numeric), "by_fraktion", "count") %>% | |||
| filter(!is.na(on_fraktion)) %>% | |||
| bar_plot_fraktionen(x_variable = on_fraktion, | |||
| bar_plot_fractions(x_variable = on_fraktion, | |||
| y_variable = value, | |||
| fill = by_fraktion, | |||
| title = "Number of rounds of applauses from fractions to fractions", | |||
| @@ -134,7 +134,7 @@ pivot_longer(tb, where(is.numeric), "by_fraktion", "count") %>% | |||
| ```{r} | |||
| res$comments %>% | |||
| left_join(res$redner, by=c("on_redner" = "id")) %>% | |||
| left_join(res$speaker, by=c("on_speaker" = "id")) %>% | |||
| select(by_fraktion = fraktion.x, on_fraktion = fraktion.y) %>% | |||
| group_by(on_fraktion) %>% | |||
| summarize(`AfD` = sum(str_detect(by_fraktion, "AfD"), na.rm=T), | |||
| @@ -149,7 +149,7 @@ Analogously we plot the results: | |||
| ```{r, fig.width=7} | |||
| pivot_longer(tb, where(is.numeric), "by_fraktion", "count") %>% | |||
| filter(!is.na(on_fraktion)) %>% | |||
| bar_plot_fraktionen(x_variable = on_fraktion, | |||
| bar_plot_fractions(x_variable = on_fraktion, | |||
| y_variable = value, | |||
| fill = by_fraktion, | |||
| title = "Number of comments from fractions to fractions", | |||
| @@ -31,8 +31,8 @@ Second, those `.xml` files, need to be parsed into `R` `tibbles`. This is accomp | |||
| ```r | |||
| read_all("../records/") %>% repair() -> res | |||
| reden <- res$reden | |||
| redner <- res$redner | |||
| speeches <- res$speeches | |||
| speaker <- res$speaker | |||
| talks <- res$talks | |||
| ``` | |||
| We also used `repair` to fix a bunch of formatting issues in the records and unpacked | |||
| @@ -43,8 +43,8 @@ For development purposes, we load the tables from csv files. | |||
| tables <- read_from_csv('../csv/') | |||
| comments <- tables$comments | |||
| reden <- tables$reden | |||
| redner <- tables$redner | |||
| speeches <- tables$speeches | |||
| speaker <- tables$speaker | |||
| talks <- tables$talks | |||
| ``` | |||
| @@ -60,7 +60,7 @@ hitlerwords <- tibble(Worte) | |||
| Now we extract the words that were used with higher frequency by one party and compare them with `hitlerwords`. | |||
| ```{r} | |||
| talks %>% | |||
| left_join(redner, by=c(redner='id')) %>% | |||
| left_join(speaker, by=c(speaker='id')) %>% | |||
| group_by(fraktion) %>% | |||
| summarize(full_text=str_c(content, collapse="\n")) -> talks_by_fraktion | |||
| ``` | |||
| @@ -169,5 +169,5 @@ hitler_comparison | |||
| ``` | |||
| Finally, we want to plot our results: | |||
| ```{r, fig.width=7} | |||
| bar_plot_fraktionen(hitler_comparison, y_variable = percent, title="Coincidence of party vocabulary with nazi vocabulary", ylab="unique 'nazi' words per total (unique) fraction words [%]") | |||
| bar_plot_fractions(hitler_comparison, y_variable = percent, title="Coincidence of party vocabulary with nazi vocabulary", ylab="unique 'nazi' words per total (unique) fraction words [%]") | |||
| ``` | |||