From 1463f6092e131bc7c29d3113d9819a7cac24271c Mon Sep 17 00:00:00 2001 From: flavis Date: Wed, 28 Jul 2021 21:23:06 +0200 Subject: [PATCH 1/2] add basic analysing utils --- NAMESPACE | 2 ++ R/analyze.R | 13 +++++++++++++ vignettes/funwithdata.Rmd | 12 ++++++++++++ 3 files changed, 27 insertions(+) create mode 100644 R/analyze.R diff --git a/NAMESPACE b/NAMESPACE index b52ac75..9b21836 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -1,6 +1,8 @@ # Generated by roxygen2: do not edit by hand export(fetch_all) +export(find_word) +export(join_redner) export(read_all) export(read_from_csv) export(repair) diff --git a/R/analyze.R b/R/analyze.R new file mode 100644 index 0000000..b818005 --- /dev/null +++ b/R/analyze.R @@ -0,0 +1,13 @@ +#' @export +find_word <- function(res, word) { + talks <- res$talks + mutate(talks, occurences = sapply(str_match_all(talks$content, regex(word, ignore_case = TRUE)), + nrow)) +} + +#' @export +join_redner <- function(tb, res, fraktion_only = F) { + joined <- left_join(tb, res$redner, by=c("redner" = "id")) + if (fraktion_only) select(joined, "fraktion") + else joined +} diff --git a/vignettes/funwithdata.Rmd b/vignettes/funwithdata.Rmd index 098ce4c..17c6ff4 100644 --- a/vignettes/funwithdata.Rmd +++ b/vignettes/funwithdata.Rmd @@ -57,3 +57,15 @@ left_join(reden, redner, by=c("redner" = "id")) %>% ggplot(aes(x = fraktion, y = n)) + geom_bar(stat = "identity") ``` + +### Count a word occurence + +```{r} +find_word(res, "hitler") %>% + filter(occurences > 0) %>% + join_redner(res) %>% + select(content, fraktion) %>% + group_by(fraktion) %>% + summarize(n = n()) %>% + arrange(desc(n)) +``` From d01cea9d525bd56e9eb18dc0d0339c8b972123d2 Mon Sep 17 00:00:00 2001 From: flavis Date: Wed, 28 Jul 2021 22:46:17 +0200 Subject: [PATCH 2/2] improve vignette --- R/analyze.R | 18 ++++++++++++++++++ README.md | 5 +++++ vignettes/funwithdata.Rmd | 31 +++++++++++++++---------------- 3 files changed, 38 insertions(+), 16 deletions(-) diff --git a/R/analyze.R b/R/analyze.R index b818005..dfb12b7 100644 --- a/R/analyze.R +++ b/R/analyze.R @@ -11,3 +11,21 @@ join_redner <- function(tb, res, fraktion_only = F) { if (fraktion_only) select(joined, "fraktion") else joined } + +party_colors <- c( + SPD="#DF0B25", + "CDU/CSU"="#000000", + AfD="#1A9FDD", + "AfD&Fraktionslos"="#1A9FDD", + "DIE LINKE"="#BC3475", + "BÜNDNIS 90 / DIE GRÜNEN"="#4A932B", + FDP="#FEEB34", + Fraktionslos="#FEEB34" +) + +#' @export +bar_plot_fraktionen <- function(tb) { + ggplot(tb, aes(x = reorder(fraktion, -n), y = n, fill = fraktion)) + + scale_fill_manual(values = party_colors) + + geom_bar(stat = "identity") +} diff --git a/README.md b/README.md index 5be96b0..4e5ed98 100644 --- a/README.md +++ b/README.md @@ -22,6 +22,11 @@ Um dokumentationen neu zu laden / zu erstellen (ruft roxgen auf) document() ``` +Baue vignetten +```r +rmarkdown::render("vignettes/bla.Rmd") +``` + # Herunterladen Bevor analysiert werden kann, muss fetch.R ausgeführt werden, um alle Protokolle herunterzuladen. diff --git a/vignettes/funwithdata.Rmd b/vignettes/funwithdata.Rmd index 17c6ff4..2e6bc28 100644 --- a/vignettes/funwithdata.Rmd +++ b/vignettes/funwithdata.Rmd @@ -29,43 +29,42 @@ fetch_all("../records/") # path to directory where records should be stored Second, those `.xml` files, need to be parsed into `R` `tibbles`. This is accomplished by: ```r read_all("../records/") %>% repair() -> res - -reden <- res$reden -redner <- res$redner -talks <- res$talks ``` We also used `repair` to fix a bunch of formatting issues in the records and unpacked the result into more descriptive variables. For development purposes, we load the tables from csv files. ```{r} -tables <- read_from_csv('../csv/') - -comments <- tables$comments -reden <- tables$reden -redner <- tables$redner -talks <- tables$talks +res <- read_from_csv('../csv/') +``` +and unpack our tibbles +```{r} +comments <- res$comments +reden <- res$reden +redner <- res$redner +talks <- res$talks ``` ## Analysis Now we can start analysing our parsed dataset, e.g. find out which party gives the most talks: -```{r} -left_join(reden, redner, by=c("redner" = "id")) %>% +```{r, fig.width=10} +join_redner(reden, res) %>% group_by(fraktion) %>% summarize(n = n()) %>% - ggplot(aes(x = fraktion, y = n)) + - geom_bar(stat = "identity") + arrange(n) %>% + bar_plot_fraktionen() ``` ### Count a word occurence -```{r} +```{r, fig.width=10} find_word(res, "hitler") %>% filter(occurences > 0) %>% join_redner(res) %>% select(content, fraktion) %>% group_by(fraktion) %>% summarize(n = n()) %>% - arrange(desc(n)) + arrange(desc(n)) %>% + bar_plot_fraktionen() ```