diff --git a/R/analyze.R b/R/analyze.R index 917cf68..0f4e8e7 100644 --- a/R/analyze.R +++ b/R/analyze.R @@ -20,11 +20,11 @@ find_word <- function(res, word) { #' add information from speaker table to a tibble containing speaker id #' #' @param tb tibble -#' @param res tibble -#' @param fraction_only bool +#' @param res list of tibbles +#' @param fraction_only if TRUE, only select fraction from the resulting joined tibble #' #' left join speaker information from res$speaker into tb. -#' if fraction_only is TRUE, only fraction is selected from the resulting joined tibble +#' if fraction_only #' #' @export join_speaker <- function(tb, res, fraction_only = F) { @@ -33,7 +33,7 @@ join_speaker <- function(tb, res, fraction_only = F) { else joined } -#' lookup table for party colors +#' lookup table for official party colors #' #' @export party_colors <- c( @@ -51,27 +51,27 @@ party_order <- factor(c("Fraktionslos", "AfD&Fraktionslos", "DIE LINKE", "BÜNDNIS 90 / DIE GRÜNEN", "SPD", "CDU/CSU", "FDP", "AfD", NA_character_)) -#' plot data depending on fractions in a standardized, configurable way +#' Bar chart visualizing fraction based data +#' +#' Can be configured to also visualize data not related to fractions. #' #' @param tb tibble -#' @param x_variable column in tb -#' @param y_variable column in tb -#' @param fill column in tb -#' @param title char -#' @param xlab char -#' @param ylab char -#' @param filllab char -#' @param flipped bool -#' @param position char -#' @param reorder bool +#' @param x_variable column in tb, default is fraction +#' @param y_variable column in tb, default is n +#' @param fill column in tb, default is fraction +#' @param title plot title +#' @param xlab label for x axis, default is fraction +#' @param ylab label for y axis, default is n +#' @param filllab default is 'Fraction' +#' @param flipped if TRUE draw bars horizontally, else vertically. 
Default is TRUE +#' @param position default is 'dodge' +#' @param reorder Either reorder fraction factor by variable value or reorder fraction factor by party seat order in parliament (default). #' #' plot data from tb in the following way: for each item in x_variable show the corresponding value in y_variable. -#' Then color the plot depending on the fill value -#' Give the plot a title, an x-label xlab as well as an y-label ylab -#' Color the legend according to filllab -#' Setting flipped to TRUE makes the bars horizontal -#' Improve positioning details according to position -#' and finally reorder x_variable (default ist to order fractions according to seat order) +#' Then color the plot depending on the fill value. +#' Give the plot a title and a label for x-axis and y-axis, +#' color the legend according to filllab and finally +#' improve positioning details according to position #' #' @export bar_plot_fractions <- function(tb, @@ -116,16 +116,15 @@ bar_plot_fractions <- function(tb, if (flipped) plt + coord_flip() else plt } -#' Counts how many talks do match a given pattern and summarises by date -#' -#' @param res tibble -#' @param patterns char list -#' @param name char ? what is name needed for?? -#' @param tidy bool, default F -#' -#' shorter summary if tidy=F -#' if tidy is set to T, the resulting tibble is tidy -#' +#' Word usage summarised by date +#' +#' Counts how many talks do match a given pattern and summarises by date. +#' +#' @param res List of Tibbles to be analysed. +#' @param patterns Words to look up. +#' @param name ? +#' @param tidy default is FALSE. 
+#' #' @export word_usage_by_date <- function(res, patterns, name, tidy=F) { tb <- res$talks diff --git a/R/parse.R b/R/parse.R index e9c75e9..498dae7 100644 --- a/R/parse.R +++ b/R/parse.R @@ -205,13 +205,11 @@ parse_speakerlist <- function(speakerliste_xml) { rolle_lang = d["rolle_lang",]) } -#' Write the parsed and repaired results into a csv file to make loading and developing faster and easier +#' Write the parsed and repaired results into separate csv files #' -#' @param tables tibble list -#' @param path char -#' @param create bool -#' -#' if create is set to TRUE, the directory given in path is created +#' @param tables list of tables to convert into a csv files. +#' @param path where to put the csv files. +#' @param create set TRUE if the path does not exist yet and you want to create it #' #' @export write_to_csv <- function(tables, path="data/csv/", create=F) { @@ -223,11 +221,12 @@ write_to_csv <- function(tables, path="data/csv/", create=F) { write.table(tables$applause, str_c(path, "applause.csv")) } -#' Read the needed tables for developing from a csv file. 
-#' -#' @param path char -#' -#' Reading the tables from a csv is way faster than reading and repairing the data every single time + +#' create a tibble from the csv file +#' +#' @param path directory to read files from +#' +#' reading the tables from a csv is way faster than reading and repairing the data every single time #' #' @export read_from_csv <- function(path="data/csv/") { diff --git a/man/bar_plot_fractions.Rd b/man/bar_plot_fractions.Rd index ff3a512..4bc1122 100644 --- a/man/bar_plot_fractions.Rd +++ b/man/bar_plot_fractions.Rd @@ -2,7 +2,7 @@ % Please edit documentation in R/analyze.R \name{bar_plot_fractions} \alias{bar_plot_fractions} -\title{plot data depending on fractions in a standardized, configurable way} +\title{Bar chart visualizing fraction based data} \usage{ bar_plot_fractions( tb, @@ -21,34 +21,32 @@ bar_plot_fractions( \arguments{ \item{tb}{tibble} -\item{x_variable}{column in tb} +\item{x_variable}{column in tb, default is fraction} -\item{y_variable}{column in tb} +\item{y_variable}{column in tb, default is n} -\item{fill}{column in tb} +\item{fill}{column in tb, default is fraction} -\item{title}{char} +\item{title}{plot title} -\item{xlab}{char} +\item{xlab}{label for x axis, default is fraction} -\item{ylab}{char} +\item{ylab}{label for y axis, default is n} -\item{filllab}{char} +\item{filllab}{default is 'Fraction'} -\item{flipped}{bool} +\item{flipped}{if TRUE draw bars horizontally, else vertically. Default is TRUE} -\item{position}{char} +\item{position}{default is 'dodge'} -\item{reorder}{bool +\item{reorder}{Either reorder fraction factor by variable value or reorder fraction factor by party seat order in parliament (default). plot data from tb in the following way: for each item in x_variable show the corresponding value in y_variable. 
-Then color the plot depending on the fill value -Give the plot a title, an x-label xlab as well as an y-label ylab -Color the legend according to filllab -Setting flipped to TRUE makes the bars horizontal -Improve positioning details according to position -and finally reorder x_variable (default ist to order fractions according to seat order)} +Then color the plot depending on the fill value. +Give the plot a title and a label for x-axis and y-axis, +color the legend according to filllab and finally +improve positioning details according to position} } \description{ -plot data depending on fractions in a standardized, configurable way +Can be configured to also visualize data not related to fractions. } diff --git a/man/fetch_all.Rd b/man/fetch_all.Rd index 694e97d..0b52206 100644 --- a/man/fetch_all.Rd +++ b/man/fetch_all.Rd @@ -8,6 +8,10 @@ fetch_all(download_dir = "data/records/", create = FALSE) } \arguments{ \item{download_dir}{character} + +\item{create}{bool + +if create is TRUE, the directory given in download_dir is created} } \description{ This fetches all available records of the 19th legislative period of the german Bundestag. diff --git a/man/join_speaker.Rd b/man/join_speaker.Rd index e03ec2d..5b6413b 100644 --- a/man/join_speaker.Rd +++ b/man/join_speaker.Rd @@ -9,12 +9,12 @@ join_speaker(tb, res, fraction_only = F) \arguments{ \item{tb}{tibble} -\item{res}{tibble} +\item{res}{list of tibbles} -\item{fraction_only}{bool +\item{fraction_only}{if TRUE, only select fraction from the resulting joined tibble left join speaker information from res$speaker into tb. 
-if fraction_only is TRUE, only fraction is selected from the resulting joined tibble} +if fraction_only} } \description{ add information from speaker table to a tibble containing speaker id diff --git a/man/party_colors.Rd b/man/party_colors.Rd index b0e5a18..1fd75b7 100644 --- a/man/party_colors.Rd +++ b/man/party_colors.Rd @@ -3,7 +3,7 @@ \docType{data} \name{party_colors} \alias{party_colors} -\title{lookup table for party colors} +\title{lookup table for official party colors} \format{ An object of class \code{character} of length 8. } @@ -11,6 +11,6 @@ An object of class \code{character} of length 8. party_colors } \description{ -lookup table for party colors +lookup table for official party colors } \keyword{datasets} diff --git a/man/read_from_csv.Rd b/man/read_from_csv.Rd index 3244c82..cd3fbc4 100644 --- a/man/read_from_csv.Rd +++ b/man/read_from_csv.Rd @@ -2,15 +2,15 @@ % Please edit documentation in R/parse.R \name{read_from_csv} \alias{read_from_csv} -\title{Read the needed tables for developing from a csv file.} +\title{create a tibble from the csv file} \usage{ read_from_csv(path = "data/csv/") } \arguments{ -\item{path}{char +\item{path}{directory to read files from -Reading the tables from a csv is way faster than reading and repairing the data every single time} +reading the tables from a csv is way faster than reading and repairing the data every single time} } \description{ -Read the needed tables for developing from a csv file. 
+create a tibble from the csv file } diff --git a/man/word_usage_by_date.Rd b/man/word_usage_by_date.Rd index ab96b2d..661d9c5 100644 --- a/man/word_usage_by_date.Rd +++ b/man/word_usage_by_date.Rd @@ -2,22 +2,19 @@ % Please edit documentation in R/analyze.R \name{word_usage_by_date} \alias{word_usage_by_date} -\title{Counts how many talks do match a given pattern and summarises by date} +\title{Word usage summarised by date} \usage{ word_usage_by_date(res, patterns, name, tidy = F) } \arguments{ -\item{res}{tibble} +\item{res}{List of Tibbles to be analysed.} -\item{patterns}{char list} +\item{patterns}{Words to look up.} -\item{name}{char ? what is name needed for??} +\item{name}{?} -\item{tidy}{bool, default F - -shorter summary if tidy=F -if tidy is set to T, the resulting tibble is tidy} +\item{tidy}{default is FALSE.} } \description{ -Counts how many talks do match a given pattern and summarises by date +Counts how many talks do match a given pattern and summarises by date. } diff --git a/man/write_to_csv.Rd b/man/write_to_csv.Rd index cd7f200..5cd1af7 100644 --- a/man/write_to_csv.Rd +++ b/man/write_to_csv.Rd @@ -2,19 +2,17 @@ % Please edit documentation in R/parse.R \name{write_to_csv} \alias{write_to_csv} -\title{Write the parsed and repaired results into a csv file to make loading and developing faster and easier} +\title{Write the parsed and repaired results into separate csv files} \usage{ write_to_csv(tables, path = "data/csv/", create = F) } \arguments{ -\item{tables}{tibble list} +\item{tables}{list of tables to convert into a csv files.} -\item{path}{char} +\item{path}{where to put the csv files.} -\item{create}{bool - -if create is set to TRUE, the directory given in path is created} +\item{create}{set TRUE if the path does not exist yet and you want to create it} } \description{ -Write the parsed and repaired results into a csv file to make loading and developing faster and easier +Write the parsed and repaired results into separate csv files } 
diff --git a/vignettes/genderequality.Rmd b/vignettes/genderequality.Rmd new file mode 100644 index 0000000..d4aa64d --- /dev/null +++ b/vignettes/genderequality.Rmd @@ -0,0 +1,96 @@ +--- +title: "genderequality" +output: rmarkdown::html_vignette +vignette: > + %\VignetteIndexEntry{genderequality} + %\VignetteEngine{knitr::rmarkdown} + %\VignetteEncoding{UTF-8} +--- + +```{r, include = FALSE} +knitr::opts_chunk$set( + collapse = TRUE, + comment = "#>" +) +``` + +```{r setup} +library(hateimparlament) +library(dplyr) +library(ggplot2) +library(stringr) +library(tidyr) +library(rvest) +``` + +## Preparation of data + +First, you need to download all records of the current legislative period. +```r +fetch_all("../records/") # path to directory where records should be stored +``` +Second, those `.xml` files need to be parsed into `R` `tibbles`. This is accomplished by: +```r +read_all("../records/") %>% repair() -> res +``` +We also used `repair` to fix a bunch of formatting issues in the records and unpacked +the result into more descriptive variables. + +For development purposes, we load the tables from csv files. +```{r} +res <- read_from_csv('../csv/') +``` +and unpack our tibbles +```{r} +comments <- res$comments +speeches <- res$speeches +speaker <- res$speaker +talks <- res$talks +``` + +Before we can do our analysis, we have to assign a gender to our politicians. 
+
+```{r}
+# Return the href attribute of the first node in `html` matching the CSS selector `sel`.
+extract_href <- function(sel, html) {
+  html %>%
+    html_node(sel) %>%
+    html_attr("href")
+}
+
+# Text of the first <p> that is a direct child of `div.mw-parser-output` (NA if there is none).
+# The page is downloaded once; html_node() returns the first match in document order.
+first_content_p_text <- function(url) {
+  read_html(url) %>%
+    html_node("#mw-content-text > div.mw-parser-output > p") %>%
+    html_text()
+}
+
+abgeordneten_list_html <- read_html(
+  "https://de.wikipedia.org/wiki/Liste_der_Mitglieder_des_Deutschen_Bundestages_(19._Wahlperiode)")
+
+# NOTE(review): table position (20) and row range (2:709) depend on the page layout -- verify.
+selectors <- str_glue("#mw-content-text > div.mw-parser-output > table:nth-child(20) > tbody > tr:nth-child({2:709}) > td:nth-child(2) > a")
+link_part2 <- vapply(selectors, extract_href, character(1), html = abgeordneten_list_html)
+link <- str_c("https://de.wikipedia.org", link_part2)
+
+# One request per linked page; vapply() enforces exactly one string (or NA) per link.
+text <- vapply(link, first_content_p_text, character(1))
+# Map the indefinite article following "ist" ("eine" / "ein ") to "female" / "male".
+text %>%
+  str_extract(" ist ein.") %>%
+  str_replace(" ist eine", "female") %>%
+  str_replace(" ist ein ", "male") ->
+  gender
+
+# The leading run of words at the start of the paragraph is used as the speaker name.
+text %>%
+  str_extract("^([:upper:]?[:lower:]+[\\s\\-]?)*") %>%
+  str_trim() ->
+  names
+
+gender <- tibble(speaker = names,
+                 gender = gender)
+
+```
+