diff --git a/NAMESPACE b/NAMESPACE
index b52ac75..9b21836 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -1,6 +1,9 @@
 # Generated by roxygen2: do not edit by hand
 
+export(bar_plot_fraktionen)
 export(fetch_all)
+export(find_word)
+export(join_redner)
 export(read_all)
 export(read_from_csv)
 export(repair)
diff --git a/R/analyze.R b/R/analyze.R
new file mode 100644
index 0000000..dfb12b7
--- /dev/null
+++ b/R/analyze.R
@@ -0,0 +1,62 @@
+#' Count how often a word occurs in each talk
+#'
+#' @param res List of parsed tables; only `res$talks` is used, which needs a
+#'   `content` column holding the text to search.
+#' @param word Pattern to look for. It is passed to `stringr::regex()`, so it
+#'   is treated as a regular expression, and it is matched case-insensitively.
+#' @return `res$talks` with an added column `occurences` giving the number of
+#'   matches per row (`NA` where `content` is `NA`).
+#' @export
+find_word <- function(res, word) {
+  talks <- res$talks
+  pattern <- stringr::regex(word, ignore_case = TRUE)
+  # str_count() returns one integer per row even for an empty table and keeps
+  # NA for missing content instead of reporting it as one match.
+  counts <- stringr::str_count(talks$content, pattern)
+  dplyr::mutate(talks, occurences = counts)
+}
+
+#' Attach speaker information to a table
+#'
+#' @param tb Table with a `redner` column that is matched against
+#'   `res$redner$id`.
+#' @param res List of parsed tables; only `res$redner` is used.
+#' @param fraktion_only If `TRUE`, keep only the `fraktion` column of the
+#'   joined table.
+#' @return The left join of `tb` with `res$redner`, or a one-column table
+#'   with `fraktion` when `fraktion_only` is `TRUE`.
+#' @export
+join_redner <- function(tb, res, fraktion_only = FALSE) {
+  joined <- dplyr::left_join(tb, res$redner, by = c("redner" = "id"))
+  if (fraktion_only) {
+    dplyr::select(joined, "fraktion")
+  } else {
+    joined
+  }
+}
+
+# Fill colours used by bar_plot_fraktionen(), keyed by `fraktion` value.
+party_colors <- c(
+  SPD = "#DF0B25",
+  "CDU/CSU" = "#000000",
+  AfD = "#1A9FDD",
+  "AfD&Fraktionslos" = "#1A9FDD",
+  "DIE LINKE" = "#BC3475",
+  "BÜNDNIS 90 / DIE GRÜNEN" = "#4A932B",
+  FDP = "#FEEB34",
+  Fraktionslos = "#FEEB34"
+)
+
+#' Bar plot of counts per fraktion
+#'
+#' @param tb Table with the columns `fraktion` and `n`.
+#' @return A ggplot object with one bar per `fraktion`, positioned via
+#'   `reorder(fraktion, -n)` and filled with the colours in `party_colors`.
+#' @export
+bar_plot_fraktionen <- function(tb) {
+  ggplot2::ggplot(
+    tb,
+    ggplot2::aes(x = stats::reorder(fraktion, -n), y = n, fill = fraktion)
+  ) +
+    ggplot2::scale_fill_manual(values = party_colors) +
+    ggplot2::geom_bar(stat = "identity")
+}
diff --git a/README.md b/README.md
index 5be96b0..4e5ed98 100644
--- a/README.md
+++ b/README.md
@@ -22,6 +22,11 @@ Um dokumentationen neu zu laden / zu erstellen (ruft roxgen auf)
 document()
 ```
 
+Baue vignetten
+```r
+rmarkdown::render("vignettes/bla.Rmd")
+```
+
 # Herunterladen
 Bevor analysiert werden kann, muss fetch.R ausgeführt werden, um alle Protokolle herunterzuladen.
 
diff --git a/vignettes/funwithdata.Rmd b/vignettes/funwithdata.Rmd
index 1217c0d..2e6bc28 100644
--- a/vignettes/funwithdata.Rmd
+++ b/vignettes/funwithdata.Rmd
@@ -29,32 +29,42 @@ fetch_all("../records/") # path to directory where records should be stored
 Second, those `.xml` files, need to be parsed into `R` `tibbles`.
 This is accomplished by:
 ```r
 read_all("../records/") %>% repair() -> res
-
-reden <- res$reden
-redner <- res$redner
-talks <- res$talks
 ```
 We also used `repair` to fix a bunch of formatting issues in the records and unpacked the result into more descriptive variables.
 
 For development purposes, we load the tables from csv files.
 ```{r}
-tables <- read_from_csv('../csv/')
-
-comments <- tables$comments
-reden <- tables$reden
-redner <- tables$redner
-talks <- tables$talks
+res <- read_from_csv('../csv/')
+```
+and unpack our tibbles
+```{r}
+comments <- res$comments
+reden <- res$reden
+redner <- res$redner
+talks <- res$talks
 ```
 
 ## Analysis
 
 Now we can start analysing our parsed dataset, e.g. find out which party gives the most talks:
-```{r}
-left_join(reden, redner, by=c("redner" = "id")) %>%
+```{r, fig.width=10}
+join_redner(reden, res) %>%
   group_by(fraktion) %>%
   summarize(n = n()) %>%
-  ggplot(aes(x = fraktion, y = n)) +
-  geom_bar(stat = "identity")
+  arrange(n) %>%
+  bar_plot_fraktionen()
 ```
 
+### Count a word occurrence
+
+```{r, fig.width=10}
+find_word(res, "hitler") %>%
+  filter(occurences > 0) %>%
+  join_redner(res) %>%
+  select(content, fraktion) %>%
+  group_by(fraktion) %>%
+  summarize(n = n()) %>%
+  arrange(desc(n)) %>%
+  bar_plot_fraktionen()
+```