Merge branch 'master' of https://git.flavigny.de/christian/hateimparlament

hace 4 años · feed583fa9
--- a/R/analyze.R
+++ b/R/analyze.R
@@ -20,11 +20,11 @@ find_word <- function(res, word) {
 #' add information from speaker table to a tibble containing speaker id
 #' 
 #' @param tb tibble
 #' @param res tibble
 #' @param fraction_only bool
 #' @param res list of tibbles
 #' @param fraction_only if TRUE, only select fraction from the resulting joined tibble
 #' 
 #' left join speaker information from res$speaker into tb.
 #' if fraction_only is TRUE, only fraction is selected from the resulting joined tibble
 #' if fraction_only is TRUE, only fraction is selected from the resulting joined tibble
 #'
 #' @export
 join_speaker <- function(tb, res, fraction_only = F) {
@@ -33,7 +33,7 @@ join_speaker <- function(tb, res, fraction_only = F) {
    else joined
 }

 #' lookup table for party colors
 #' lookup table for official party colors
 #'
 #' @export
 party_colors <- c(
@@ -51,27 +51,27 @@ party_order <- factor(c("Fraktionslos", "AfD&Fraktionslos",
                        "DIE LINKE", "BÜNDNIS 90 / DIE GRÜNEN", "SPD", "CDU/CSU",
                        "FDP", "AfD", NA_character_))

 #' plot data depending on fractions in a standardized, configurable way
 #' Bar chart visualizing fraction based data
 #' 
 #' Can be configured to also visualize data not related to fractions.
 #' 
 #' @param tb tibble
 #' @param x_variable column in tb
 #' @param y_variable column in tb
 #' @param fill column in tb
 #' @param title char
 #' @param xlab char
 #' @param ylab char
 #' @param filllab char
 #' @param flipped bool
 #' @param position char
 #' @param reorder bool
 #' @param x_variable column in tb, default is fraction
 #' @param y_variable column in tb, default is n
 #' @param fill column in tb, default is fraction
 #' @param title plot title
 #' @param xlab label for x axis, default is fraction
 #' @param ylab label for y axis, default is n
 #' @param filllab default is 'Fraction'
 #' @param flipped if TRUE draw bars horizontally, else vertically. Default is TRUE
 #' @param position default is 'dodge'
 #' @param reorder Either reorder fraction factor by variable value or reorder fraction factor by party seat order in parliament (default).
 #' 
 #' plot data from tb in the following way: for each item in x_variable show the corresponding value in y_variable.
 #' Then color the plot depending on the fill value
 #' Give the plot a title, an x-label xlab as well as an y-label ylab
 #' Color the legend according to filllab
 #' Setting flipped to TRUE makes the bars horizontal
 #' Improve positioning details according to position
 #' and finally reorder x_variable (default ist to order fractions according to seat order)
 #' Then color the plot depending on the fill value.
 #' Give the plot a title and a label for x-axis and y-axis,
 #' color the legend according to filllab and finally
 #' improve positioning details according to position
 #' 
 #' @export
 bar_plot_fractions <- function(tb,
@@ -116,16 +116,15 @@ bar_plot_fractions <- function(tb,
    if (flipped) plt + coord_flip() else plt
 }

 #' Counts how many talks do match a given pattern and summarises by date
 #' 
 #' @param res tibble
 #' @param patterns char list
 #' @param name char ? what is name needed for??
 #' @param tidy bool, default F
 #' 
 #' shorter summary if tidy=F
 #' if tidy is set to T, the resulting tibble is tidy
 #' 
 #' Word usage summarised by date
 #'
 #' Counts how many talks do match a given pattern and summarises by date.
 #'
 #' @param res List of Tibbles to be analysed.
 #' @param patterns Words to look up.
 #' @param name ?
 #' @param tidy default is FALSE.
 #'
 #' @export
 word_usage_by_date <- function(res, patterns, name, tidy=F) {
    tb <- res$talks
--- a/R/parse.R
+++ b/R/parse.R
@@ -205,13 +205,11 @@ parse_speakerlist <- function(speakerliste_xml) {
           rolle_lang = d["rolle_lang",])
 }

 #' Write the parsed and repaired results into a csv file to make loading and developing faster and easier
 #' Write the parsed and repaired results into separate csv files
 #' 
 #' @param tables tibble list
 #' @param path char
 #' @param create bool
 #' 
 #' if create is set to TRUE, the directory given in path is created
 #' @param tables list of tables to convert into csv files.
 #' @param path where to put the csv files.
 #' @param create set TRUE if the path does not exist yet and you want to create it
 #' 
 #' @export
 write_to_csv <- function(tables, path="data/csv/", create=F) {
@@ -223,11 +221,12 @@ write_to_csv <- function(tables, path="data/csv/", create=F) {
    write.table(tables$applause, str_c(path, "applause.csv"))
 }

 #' Read the needed tables for developing from a csv file.
 #' 
 #' @param path char
 #' 
 #' Reading the tables from a csv is way faster than reading and repairing the data every single time

 #' create a tibble from the csv file
 #'
 #' @param path directory to read files from
 #'
 #' reading the tables from a csv is way faster than reading and repairing the data every single time
 #' 
 #' @export
 read_from_csv <- function(path="data/csv/") {
--- a/man/bar_plot_fractions.Rd
+++ b/man/bar_plot_fractions.Rd
@@ -2,7 +2,7 @@
 % Please edit documentation in R/analyze.R
 \name{bar_plot_fractions}
 \alias{bar_plot_fractions}
 \title{plot data depending on fractions in a standardized, configurable way}
 \title{Bar chart visualizing fraction based data}
 \usage{
 bar_plot_fractions(
  tb,
@@ -21,34 +21,32 @@ bar_plot_fractions(
 \arguments{
 \item{tb}{tibble}

 \item{x_variable}{column in tb}
 \item{x_variable}{column in tb, default is fraction}

 \item{y_variable}{column in tb}
 \item{y_variable}{column in tb, default is n}

 \item{fill}{column in tb}
 \item{fill}{column in tb, default is fraction}

 \item{title}{char}
 \item{title}{plot title}

 \item{xlab}{char}
 \item{xlab}{label for x axis, default is fraction}

 \item{ylab}{char}
 \item{ylab}{label for y axis, default is n}

 \item{filllab}{char}
 \item{filllab}{default is 'Fraction'}

 \item{flipped}{bool}
 \item{flipped}{if TRUE draw bars horizontally, else vertically. Default is TRUE}

 \item{position}{char}
 \item{position}{default is 'dodge'}

 \item{reorder}{bool
 \item{reorder}{Either reorder fraction factor by variable value or reorder fraction factor by party seat order in parliament (default).

 plot data from tb in the following way: for each item in x_variable show the corresponding value in y_variable.
 Then color the plot depending on the fill value
 Give the plot a title, an x-label xlab as well as an y-label ylab
 Color the legend according to filllab
 Setting flipped to TRUE makes the bars horizontal
 Improve positioning details according to position
 and finally reorder x_variable (default ist to order fractions according to seat order)}
 Then color the plot depending on the fill value.
 Give the plot a title and a label for x-axis and y-axis,
 color the legend according to filllab and finally
 improve positioning details according to position}
 }
 \description{
 plot data depending on fractions in a standardized, configurable way
 Can be configured to also visualize data not related to fractions.
 }
--- a/man/fetch_all.Rd
+++ b/man/fetch_all.Rd
@@ -8,6 +8,10 @@ fetch_all(download_dir = "data/records/", create = FALSE)
 }
 \arguments{
 \item{download_dir}{character}

 \item{create}{bool

 if create is TRUE, the directory given in download_dir is created}
 }
 \description{
 This fetches all available records of the 19th legislative period of the german Bundestag.
--- a/man/join_speaker.Rd
+++ b/man/join_speaker.Rd
@@ -9,12 +9,12 @@ join_speaker(tb, res, fraction_only = F)
 \arguments{
 \item{tb}{tibble}

 \item{res}{tibble}
 \item{res}{list of tibbles}

 \item{fraction_only}{bool
 \item{fraction_only}{if TRUE, only select fraction from the resulting joined tibble

 left join speaker information from res$speaker into tb.
 if fraction_only is TRUE, only fraction is selected from the resulting joined tibble}
 if fraction_only}
 }
 \description{
 add information from speaker table to a tibble containing speaker id
--- a/man/party_colors.Rd
+++ b/man/party_colors.Rd
@@ -3,7 +3,7 @@
 \docType{data}
 \name{party_colors}
 \alias{party_colors}
 \title{lookup table for party colors}
 \title{lookup table for official party colors}
 \format{
 An object of class \code{character} of length 8.
 }
@@ -11,6 +11,6 @@ An object of class \code{character} of length 8.
 party_colors
 }
 \description{
 lookup table for party colors
 lookup table for official party colors
 }
 \keyword{datasets}
--- a/man/read_from_csv.Rd
+++ b/man/read_from_csv.Rd
@@ -2,15 +2,15 @@
 % Please edit documentation in R/parse.R
 \name{read_from_csv}
 \alias{read_from_csv}
 \title{Read the needed tables for developing from a csv file.}
 \title{create a tibble from the csv file}
 \usage{
 read_from_csv(path = "data/csv/")
 }
 \arguments{
 \item{path}{char
 \item{path}{directory to read files from

 Reading the tables from a csv is way faster than reading and repairing the data every single time}
 reading the tables from a csv is way faster than reading and repairing the data every single time}
 }
 \description{
 Read the needed tables for developing from a csv file.
 create a tibble from the csv file
 }
--- a/man/word_usage_by_date.Rd
+++ b/man/word_usage_by_date.Rd
@@ -2,22 +2,19 @@
 % Please edit documentation in R/analyze.R
 \name{word_usage_by_date}
 \alias{word_usage_by_date}
 \title{Counts how many talks do match a given pattern and summarises by date}
 \title{Word usage summarised by date}
 \usage{
 word_usage_by_date(res, patterns, name, tidy = F)
 }
 \arguments{
 \item{res}{tibble}
 \item{res}{List of Tibbles to be analysed.}

 \item{patterns}{char list}
 \item{patterns}{Words to look up.}

 \item{name}{char ? what is name needed for??}
 \item{name}{?}

 \item{tidy}{bool, default F

 shorter summary if tidy=F
 if tidy is set to T, the resulting tibble is tidy}
 \item{tidy}{default is FALSE.}
 }
 \description{
 Counts how many talks do match a given pattern and summarises by date
 Counts how many talks do match a given pattern and summarises by date.
 }
--- a/man/write_to_csv.Rd
+++ b/man/write_to_csv.Rd
@@ -2,19 +2,17 @@
 % Please edit documentation in R/parse.R
 \name{write_to_csv}
 \alias{write_to_csv}
 \title{Write the parsed and repaired results into a csv file to make loading and developing faster and easier}
 \title{Write the parsed and repaired results into separate csv files}
 \usage{
 write_to_csv(tables, path = "data/csv/", create = F)
 }
 \arguments{
 \item{tables}{tibble list}
 \item{tables}{list of tables to convert into csv files.}

 \item{path}{char}
 \item{path}{where to put the csv files.}

 \item{create}{bool

 if create is set to TRUE, the directory given in path is created}
 \item{create}{set TRUE if the path does not exist yet and you want to create it}
 }
 \description{
 Write the parsed and repaired results into a csv file to make loading and developing faster and easier
 Write the parsed and repaired results into separate csv files
 }
--- a/vignettes/genderequality.Rmd
+++ b/vignettes/genderequality.Rmd
@@ -0,0 +1,96 @@
 ---
 title: "genderequality"
 output: rmarkdown::html_vignette
 vignette: >
  %\VignetteIndexEntry{genderequality}
  %\VignetteEngine{knitr::rmarkdown}
  %\VignetteEncoding{UTF-8}
 ---

 ```{r, include = FALSE}
 # Chunk defaults used by every code chunk of this vignette.
 knitr::opts_chunk$set(collapse = TRUE, comment = "#>")
 ```

 ```{r setup}
 library(hateimparlament)
 library(dplyr)
 library(ggplot2)
 library(stringr)
 library(tidyr)
 library(rvest)
 ```

 ## Preparation of data

 First, you need to download all records of the current legislative period.
 ```r
 fetch_all("../records/") # path to directory where records should be stored
 ```
 Second, those `.xml` files need to be parsed into `R` `tibbles`. This is accomplished by:
 ```r
 read_all("../records/") %>% repair() -> res
 ```
 We also used `repair` to fix a bunch of formatting issues in the records and unpacked
 the result into more descriptive variables.

 For development purposes, we load the tables from csv files.
 ```{r}
 # Load the previously exported tables from csv instead of parsing the records again.
 res <- read_from_csv("../csv/")
 ```
 and unpack our tibbles
 ```{r}
 # Pull the individual tables out of `res` so later chunks can refer to them
 # by short names.
 comments <- res$comments
 speeches <- res$speeches
 speaker <- res$speaker
 talks <- res$talks
 ```

 Before we can do our analysis, we have to assign a gender to our politicians.

 ```{r}
 # Return the `href` attribute of the node selected by a CSS selector.
 #
 # sel:  CSS selector passed to html_node().
 # html: parsed document to search in.
 extract_href <- function(sel, html) {
   node <- html_node(html, sel)
   html_attr(node, "href")
 }

 # Return the text of the first paragraph (`<p>`) that is a direct child of
 # `#mw-content-text > div.mw-parser-output` on the page at `url`.
 #
 # url: address of the page to download.
 #
 # Returns a character scalar; NA if the page contains no such paragraph.
 #
 # The page is downloaded exactly once and the first matching paragraph is
 # selected directly (html_node() yields the first match). This needs a single
 # request per page and also terminates, with NA, when no paragraph exists.
 first_content_p_text <- function(url) {
   page <- read_html(url)
   first_p <- html_node(page, "#mw-content-text > div.mw-parser-output > p")
   html_text(first_p)
 }

 # Download the Wikipedia list of members of the 19th Bundestag.
 abgeordneten_list_html <- read_html(
   "https://de.wikipedia.org/wiki/Liste_der_Mitglieder_des_Deutschen_Bundestages_(19._Wahlperiode)")

 # One CSS selector per table row (rows 2 to 709), each addressing the link in
 # the second column of the row.
 selectors <- str_glue("#mw-content-text > div.mw-parser-output > table:nth-child(20) > tbody > tr:nth-child({2:709}) > td:nth-child(2) > a")

 # Relative link taken from every row, completed to an absolute URL.
 link_part2 <- sapply(selectors, extract_href, abgeordneten_list_html)
 link <- str_c("https://de.wikipedia.org", link_part2)

 # First paragraph of each linked article.
 text <- sapply(link, first_content_p_text)

 # The word following " ist" is mapped to a gender:
 # " ist eine" becomes "female", " ist ein " becomes "male".
 gender <- str_extract(text, " ist ein.")
 gender <- str_replace(gender, " ist eine", "female")
 gender <- str_replace(gender, " ist ein ", "male")

 # Leading run of (optionally capitalised) lowercase words, separated by
 # whitespace or hyphens, at the start of the paragraph; used as speaker name.
 names <- str_trim(str_extract(text, "^([:upper:]?[:lower:]+[\\s\\-]?)*"))

 # Lookup table pairing each extracted name with the extracted gender.
 gender <- tibble(speaker = names, gender = gender)


 ```