| @@ -20,11 +20,11 @@ find_word <- function(res, word) { | |||||
| #' add information from speaker table to a tibble containing speaker id | #' add information from speaker table to a tibble containing speaker id | ||||
| #' | #' | ||||
| #' @param tb tibble | #' @param tb tibble | ||||
| #' @param res tibble | |||||
| #' @param fraction_only bool | |||||
| #' @param res list of tibbles | |||||
| #' @param fraction_only if TRUE, only select fraction from the resulting joined tibble | |||||
| #' | #' | ||||
| #' left join speaker information from res$speaker into tb. | #' left join speaker information from res$speaker into tb. | ||||
| #' if fraction_only is TRUE, only fraction is selected from the resulting joined tibble | |||||
| #' if fraction_only | |||||
| #' | #' | ||||
| #' @export | #' @export | ||||
| join_speaker <- function(tb, res, fraction_only = F) { | join_speaker <- function(tb, res, fraction_only = F) { | ||||
| @@ -33,7 +33,7 @@ join_speaker <- function(tb, res, fraction_only = F) { | |||||
| else joined | else joined | ||||
| } | } | ||||
| #' lookup table for party colors | |||||
| #' lookup table for official party colors | |||||
| #' | #' | ||||
| #' @export | #' @export | ||||
| party_colors <- c( | party_colors <- c( | ||||
| @@ -51,27 +51,27 @@ party_order <- factor(c("Fraktionslos", "AfD&Fraktionslos", | |||||
| "DIE LINKE", "BÜNDNIS 90 / DIE GRÜNEN", "SPD", "CDU/CSU", | "DIE LINKE", "BÜNDNIS 90 / DIE GRÜNEN", "SPD", "CDU/CSU", | ||||
| "FDP", "AfD", NA_character_)) | "FDP", "AfD", NA_character_)) | ||||
| #' plot data depending on fractions in a standardized, configurable way | |||||
| #' Bar chart visualizing fraction based data | |||||
| #' | |||||
| #' Can be configured to also visualize data not related to fractions. | |||||
| #' | #' | ||||
| #' @param tb tibble | #' @param tb tibble | ||||
| #' @param x_variable column in tb | |||||
| #' @param y_variable column in tb | |||||
| #' @param fill column in tb | |||||
| #' @param title char | |||||
| #' @param xlab char | |||||
| #' @param ylab char | |||||
| #' @param filllab char | |||||
| #' @param flipped bool | |||||
| #' @param position char | |||||
| #' @param reorder bool | |||||
| #' @param x_variable column in tb, default is fraction | |||||
| #' @param y_variable column in tb, default is n | |||||
| #' @param fill column in tb, default is fraction | |||||
| #' @param title plot title | |||||
| #' @param xlab label for x axis, default is fraction | |||||
| #' @param ylab label for y axis, default is n | |||||
| #' @param filllab default is 'Fraction' | |||||
| #' @param flipped if TRUE draw bars horizontally, else vertically. Default is TRUE | |||||
| #' @param position default is 'dodge' | |||||
| #' @param reorder Either reorder fraction factor by variable value or reorder fraction factor by party seat order in parliament (default). | |||||
| #' | #' | ||||
| #' plot data from tb in the following way: for each item in x_variable show the corresponding value in y_variable. | #' plot data from tb in the following way: for each item in x_variable show the corresponding value in y_variable. | ||||
| #' Then color the plot depending on the fill value | |||||
| #' Give the plot a title, an x-label xlab as well as an y-label ylab | |||||
| #' Color the legend according to filllab | |||||
| #' Setting flipped to TRUE makes the bars horizontal | |||||
| #' Improve positioning details according to position | |||||
| #' and finally reorder x_variable (default ist to order fractions according to seat order) | |||||
| #' Then color the plot depending on the fill value. | |||||
| #' Give the plot a title and a label for x-axis and y-axis, | |||||
| #' color the legend according to filllab and finally | |||||
| #' improve positioning details according to position | |||||
| #' | #' | ||||
| #' @export | #' @export | ||||
| bar_plot_fractions <- function(tb, | bar_plot_fractions <- function(tb, | ||||
| @@ -116,16 +116,15 @@ bar_plot_fractions <- function(tb, | |||||
| if (flipped) plt + coord_flip() else plt | if (flipped) plt + coord_flip() else plt | ||||
| } | } | ||||
| #' Counts how many talks do match a given pattern and summarises by date | |||||
| #' | |||||
| #' @param res tibble | |||||
| #' @param patterns char list | |||||
| #' @param name char ? what is name needed for?? | |||||
| #' @param tidy bool, default F | |||||
| #' | |||||
| #' shorter summary if tidy=F | |||||
| #' if tidy is set to T, the resulting tibble is tidy | |||||
| #' | |||||
| #' Word usage summarised by date | |||||
| #' | |||||
| #' Counts how many talks do match a given pattern and summarises by date. | |||||
| #' | |||||
| #' @param res List of Tibbles to be analysed. | |||||
| #' @param patterns Words to look up. | |||||
| #' @param name ? | |||||
| #' @param tidy default is FALSE. | |||||
| #' | |||||
| #' @export | #' @export | ||||
| word_usage_by_date <- function(res, patterns, name, tidy=F) { | word_usage_by_date <- function(res, patterns, name, tidy=F) { | ||||
| tb <- res$talks | tb <- res$talks | ||||
| @@ -205,13 +205,11 @@ parse_speakerlist <- function(speakerliste_xml) { | |||||
| rolle_lang = d["rolle_lang",]) | rolle_lang = d["rolle_lang",]) | ||||
| } | } | ||||
| #' Write the parsed and repaired results into a csv file to make loading and developing faster and easier | |||||
| #' Write the parsed and repaired results into separate csv files | |||||
| #' | #' | ||||
| #' @param tables tibble list | |||||
| #' @param path char | |||||
| #' @param create bool | |||||
| #' | |||||
| #' if create is set to TRUE, the directory given in path is created | |||||
| #' @param tables list of tables to convert into a csv files. | |||||
| #' @param path where to put the csv files. | |||||
| #' @param create set TRUE if the path does not exist yet and you want to create it | |||||
| #' | #' | ||||
| #' @export | #' @export | ||||
| write_to_csv <- function(tables, path="data/csv/", create=F) { | write_to_csv <- function(tables, path="data/csv/", create=F) { | ||||
| @@ -223,11 +221,12 @@ write_to_csv <- function(tables, path="data/csv/", create=F) { | |||||
| write.table(tables$applause, str_c(path, "applause.csv")) | write.table(tables$applause, str_c(path, "applause.csv")) | ||||
| } | } | ||||
| #' Read the needed tables for developing from a csv file. | |||||
| #' | |||||
| #' @param path char | |||||
| #' | |||||
| #' Reading the tables from a csv is way faster than reading and repairing the data every single time | |||||
| #' create a tibble from the csv file | |||||
| #' | |||||
| #' @param path directory to read files from | |||||
| #' | |||||
| #' reading the tables from a csv is way faster than reading and repairing the data every single time | |||||
| #' | #' | ||||
| #' @export | #' @export | ||||
| read_from_csv <- function(path="data/csv/") { | read_from_csv <- function(path="data/csv/") { | ||||
| @@ -2,7 +2,7 @@ | |||||
| % Please edit documentation in R/analyze.R | % Please edit documentation in R/analyze.R | ||||
| \name{bar_plot_fractions} | \name{bar_plot_fractions} | ||||
| \alias{bar_plot_fractions} | \alias{bar_plot_fractions} | ||||
| \title{plot data depending on fractions in a standardized, configurable way} | |||||
| \title{Bar chart visualizing fraction based data} | |||||
| \usage{ | \usage{ | ||||
| bar_plot_fractions( | bar_plot_fractions( | ||||
| tb, | tb, | ||||
| @@ -21,34 +21,32 @@ bar_plot_fractions( | |||||
| \arguments{ | \arguments{ | ||||
| \item{tb}{tibble} | \item{tb}{tibble} | ||||
| \item{x_variable}{column in tb} | |||||
| \item{x_variable}{column in tb, default is fraction} | |||||
| \item{y_variable}{column in tb} | |||||
| \item{y_variable}{column in tb, default is n} | |||||
| \item{fill}{column in tb} | |||||
| \item{fill}{column in tb, default is fraction} | |||||
| \item{title}{char} | |||||
| \item{title}{plot title} | |||||
| \item{xlab}{char} | |||||
| \item{xlab}{label for x axis, default is fraction} | |||||
| \item{ylab}{char} | |||||
| \item{ylab}{label for y axis, default is n} | |||||
| \item{filllab}{char} | |||||
| \item{filllab}{default is 'Fraction'} | |||||
| \item{flipped}{bool} | |||||
| \item{flipped}{if TRUE draw bars horizontally, else vertically. Default is TRUE} | |||||
| \item{position}{char} | |||||
| \item{position}{default is 'dodge'} | |||||
| \item{reorder}{bool | |||||
| \item{reorder}{Either reorder fraction factor by variable value or reorder fraction factor by party seat order in parliament (default). | |||||
| plot data from tb in the following way: for each item in x_variable show the corresponding value in y_variable. | plot data from tb in the following way: for each item in x_variable show the corresponding value in y_variable. | ||||
| Then color the plot depending on the fill value | |||||
| Give the plot a title, an x-label xlab as well as an y-label ylab | |||||
| Color the legend according to filllab | |||||
| Setting flipped to TRUE makes the bars horizontal | |||||
| Improve positioning details according to position | |||||
| and finally reorder x_variable (default ist to order fractions according to seat order)} | |||||
| Then color the plot depending on the fill value. | |||||
| Give the plot a title and a label for x-axis and y-axis, | |||||
| color the legend according to filllab and finally | |||||
| improve positioning details according to position} | |||||
| } | } | ||||
| \description{ | \description{ | ||||
| plot data depending on fractions in a standardized, configurable way | |||||
| Can be configured to also visualize data not related to fractions. | |||||
| } | } | ||||
| @@ -8,6 +8,10 @@ fetch_all(download_dir = "data/records/", create = FALSE) | |||||
| } | } | ||||
| \arguments{ | \arguments{ | ||||
| \item{download_dir}{character} | \item{download_dir}{character} | ||||
| \item{create}{bool | |||||
| if create is TRUE, the directory given in download_dir is created} | |||||
| } | } | ||||
| \description{ | \description{ | ||||
| This fetches all available records of the 19th legislative period of the german Bundestag. | This fetches all available records of the 19th legislative period of the german Bundestag. | ||||
| @@ -9,12 +9,12 @@ join_speaker(tb, res, fraction_only = F) | |||||
| \arguments{ | \arguments{ | ||||
| \item{tb}{tibble} | \item{tb}{tibble} | ||||
| \item{res}{tibble} | |||||
| \item{res}{list of tibbles} | |||||
| \item{fraction_only}{bool | |||||
| \item{fraction_only}{if TRUE, only select fraction from the resulting joined tibble | |||||
| left join speaker information from res$speaker into tb. | left join speaker information from res$speaker into tb. | ||||
| if fraction_only is TRUE, only fraction is selected from the resulting joined tibble} | |||||
| if fraction_only} | |||||
| } | } | ||||
| \description{ | \description{ | ||||
| add information from speaker table to a tibble containing speaker id | add information from speaker table to a tibble containing speaker id | ||||
| @@ -3,7 +3,7 @@ | |||||
| \docType{data} | \docType{data} | ||||
| \name{party_colors} | \name{party_colors} | ||||
| \alias{party_colors} | \alias{party_colors} | ||||
| \title{lookup table for party colors} | |||||
| \title{lookup table for official party colors} | |||||
| \format{ | \format{ | ||||
| An object of class \code{character} of length 8. | An object of class \code{character} of length 8. | ||||
| } | } | ||||
| @@ -11,6 +11,6 @@ An object of class \code{character} of length 8. | |||||
| party_colors | party_colors | ||||
| } | } | ||||
| \description{ | \description{ | ||||
| lookup table for party colors | |||||
| lookup table for official party colors | |||||
| } | } | ||||
| \keyword{datasets} | \keyword{datasets} | ||||
| @@ -2,15 +2,15 @@ | |||||
| % Please edit documentation in R/parse.R | % Please edit documentation in R/parse.R | ||||
| \name{read_from_csv} | \name{read_from_csv} | ||||
| \alias{read_from_csv} | \alias{read_from_csv} | ||||
| \title{Read the needed tables for developing from a csv file.} | |||||
| \title{create a tibble from the csv file} | |||||
| \usage{ | \usage{ | ||||
| read_from_csv(path = "data/csv/") | read_from_csv(path = "data/csv/") | ||||
| } | } | ||||
| \arguments{ | \arguments{ | ||||
| \item{path}{char | |||||
| \item{path}{directory to read files from | |||||
| Reading the tables from a csv is way faster than reading and repairing the data every single time} | |||||
| reading the tables from a csv is way faster than reading and repairing the data every single time} | |||||
| } | } | ||||
| \description{ | \description{ | ||||
| Read the needed tables for developing from a csv file. | |||||
| create a tibble from the csv file | |||||
| } | } | ||||
| @@ -2,22 +2,19 @@ | |||||
| % Please edit documentation in R/analyze.R | % Please edit documentation in R/analyze.R | ||||
| \name{word_usage_by_date} | \name{word_usage_by_date} | ||||
| \alias{word_usage_by_date} | \alias{word_usage_by_date} | ||||
| \title{Counts how many talks do match a given pattern and summarises by date} | |||||
| \title{Word usage summarised by date} | |||||
| \usage{ | \usage{ | ||||
| word_usage_by_date(res, patterns, name, tidy = F) | word_usage_by_date(res, patterns, name, tidy = F) | ||||
| } | } | ||||
| \arguments{ | \arguments{ | ||||
| \item{res}{tibble} | |||||
| \item{res}{List of Tibbles to be analysed.} | |||||
| \item{patterns}{char list} | |||||
| \item{patterns}{Words to look up.} | |||||
| \item{name}{char ? what is name needed for??} | |||||
| \item{name}{?} | |||||
| \item{tidy}{bool, default F | |||||
| shorter summary if tidy=F | |||||
| if tidy is set to T, the resulting tibble is tidy} | |||||
| \item{tidy}{default is FALSE.} | |||||
| } | } | ||||
| \description{ | \description{ | ||||
| Counts how many talks do match a given pattern and summarises by date | |||||
| Counts how many talks do match a given pattern and summarises by date. | |||||
| } | } | ||||
| @@ -2,19 +2,17 @@ | |||||
| % Please edit documentation in R/parse.R | % Please edit documentation in R/parse.R | ||||
| \name{write_to_csv} | \name{write_to_csv} | ||||
| \alias{write_to_csv} | \alias{write_to_csv} | ||||
| \title{Write the parsed and repaired results into a csv file to make loading and developing faster and easier} | |||||
| \title{Write the parsed and repaired results into separate csv files} | |||||
| \usage{ | \usage{ | ||||
| write_to_csv(tables, path = "data/csv/", create = F) | write_to_csv(tables, path = "data/csv/", create = F) | ||||
| } | } | ||||
| \arguments{ | \arguments{ | ||||
| \item{tables}{tibble list} | |||||
| \item{tables}{list of tables to convert into a csv files.} | |||||
| \item{path}{char} | |||||
| \item{path}{where to put the csv files.} | |||||
| \item{create}{bool | |||||
| if create is set to TRUE, the directory given in path is created} | |||||
| \item{create}{set TRUE if the path does not exist yet and you want to create it} | |||||
| } | } | ||||
| \description{ | \description{ | ||||
| Write the parsed and repaired results into a csv file to make loading and developing faster and easier | |||||
| Write the parsed and repaired results into separate csv files | |||||
| } | } | ||||
| @@ -0,0 +1,96 @@ | |||||
| --- | |||||
| title: "genderequality" | |||||
| output: rmarkdown::html_vignette | |||||
| vignette: > | |||||
| %\VignetteIndexEntry{genderequality} | |||||
| %\VignetteEngine{knitr::rmarkdown} | |||||
| %\VignetteEncoding{UTF-8} | |||||
| --- | |||||
| ```{r, include = FALSE} | |||||
| knitr::opts_chunk$set( | |||||
| collapse = TRUE, | |||||
| comment = "#>" | |||||
| ) | |||||
| ``` | |||||
| ```{r setup} | |||||
| library(hateimparlament) | |||||
| library(dplyr) | |||||
| library(ggplot2) | |||||
| library(stringr) | |||||
| library(tidyr) | |||||
| library(rvest) | |||||
| ``` | |||||
| ## Preparation of data | |||||
| First, you need to download all records of the current legislative period. | |||||
| ```r | |||||
| fetch_all("../records/") # path to directory where records should be stored | |||||
| ``` | |||||
| Second, those `.xml` files, need to be parsed into `R` `tibbles`. This is accomplished by: | |||||
| ```r | |||||
| read_all("../records/") %>% repair() -> res | |||||
| ``` | |||||
| We also used `repair` to fix a bunch of formatting issues in the records and unpacked | |||||
| the result into more descriptive variables. | |||||
| For development purposes, we load the tables from csv files. | |||||
| ```{r} | |||||
| res <- read_from_csv('../csv/') | |||||
| ``` | |||||
| and unpack our tibbles | |||||
| ```{r} | |||||
| comments <- res$comments | |||||
| speeches <- res$speeches | |||||
| speaker <- res$speaker | |||||
| talks <- res$talks | |||||
| ``` | |||||
| Bevor we can do our analysis, we have to assign a gender to our politicans. | |||||
| ```{r} | |||||
| extract_href <- function(sel, html) { | |||||
| html %>% | |||||
| html_node(sel) %>% | |||||
| html_attr("href") | |||||
| } | |||||
| first_content_p_text <- function(url) { | |||||
| res <- NA | |||||
| i <- 1 | |||||
| while(is.na(res)) { | |||||
| read_html(url) %>% | |||||
| html_node(str_glue("#mw-content-text > div.mw-parser-output > p:nth-child({i})")) %>% | |||||
| html_text() -> res | |||||
| i <- i + 1 | |||||
| } | |||||
| res | |||||
| } | |||||
| abgeordneten_list_html <- read_html( | |||||
| "https://de.wikipedia.org/wiki/Liste_der_Mitglieder_des_Deutschen_Bundestages_(19._Wahlperiode)") | |||||
| selectors <- str_glue("#mw-content-text > div.mw-parser-output > table:nth-child(20) > tbody > tr:nth-child({2:709}) > td:nth-child(2) > a") | |||||
| link_part2 <- sapply(selectors, extract_href, abgeordneten_list_html) | |||||
| link <- str_c("https://de.wikipedia.org", link_part2) | |||||
| text <- sapply(link, first_content_p_text) | |||||
| text %>% | |||||
| str_extract(" ist ein.") %>% | |||||
| str_replace(" ist eine", "female") %>% | |||||
| str_replace(" ist ein ", "male") -> | |||||
| gender | |||||
| text %>% | |||||
| str_extract("^([:upper:]?[:lower:]+[\\s\\-]?)*") %>% | |||||
| str_trim() -> | |||||
| names | |||||
| gender <- tibble(speaker = names, | |||||
| gender = gender) | |||||
| ``` | |||||