Przeglądaj źródła

Merge branch 'master' of https://git.flavigny.de/christian/hateimparlament

genderequality-alternative
JosuaKugler 4 lat temu
rodzic
commit
feed583fa9
10 zmienionych plików z 176 dodań i 85 usunięć
  1. +30
    -31
      R/analyze.R
  2. +10
    -11
      R/parse.R
  3. +16
    -18
      man/bar_plot_fractions.Rd
  4. +4
    -0
      man/fetch_all.Rd
  5. +3
    -3
      man/join_speaker.Rd
  6. +2
    -2
      man/party_colors.Rd
  7. +4
    -4
      man/read_from_csv.Rd
  8. +6
    -9
      man/word_usage_by_date.Rd
  9. +5
    -7
      man/write_to_csv.Rd
  10. +96
    -0
      vignettes/genderequality.Rmd

+ 30
- 31
R/analyze.R Wyświetl plik

@@ -20,11 +20,11 @@ find_word <- function(res, word) {
#' add information from speaker table to a tibble containing speaker id
#'
#' @param tb tibble
#' @param res tibble
#' @param fraction_only bool
#' @param res list of tibbles
#' @param fraction_only if TRUE, only select fraction from the resulting joined tibble
#'
#' left join speaker information from res$speaker into tb.
#' if fraction_only is TRUE, only fraction is selected from the resulting joined tibble
#' if fraction_only
#'
#' @export
join_speaker <- function(tb, res, fraction_only = F) {
@@ -33,7 +33,7 @@ join_speaker <- function(tb, res, fraction_only = F) {
else joined
}

#' lookup table for party colors
#' lookup table for official party colors
#'
#' @export
party_colors <- c(
@@ -51,27 +51,27 @@ party_order <- factor(c("Fraktionslos", "AfD&Fraktionslos",
"DIE LINKE", "BÜNDNIS 90 / DIE GRÜNEN", "SPD", "CDU/CSU",
"FDP", "AfD", NA_character_))

#' plot data depending on fractions in a standardized, configurable way
#' Bar chart visualizing fraction based data
#'
#' Can be configured to also visualize data not related to fractions.
#'
#' @param tb tibble
#' @param x_variable column in tb
#' @param y_variable column in tb
#' @param fill column in tb
#' @param title char
#' @param xlab char
#' @param ylab char
#' @param filllab char
#' @param flipped bool
#' @param position char
#' @param reorder bool
#' @param x_variable column in tb, default is fraction
#' @param y_variable column in tb, default is n
#' @param fill column in tb, default is fraction
#' @param title plot title
#' @param xlab label for x axis, default is fraction
#' @param ylab label for y axis, default is n
#' @param filllab default is 'Fraction'
#' @param flipped if TRUE draw bars horizontally, else vertically. Default is TRUE
#' @param position default is 'dodge'
#' @param reorder Either reorder fraction factor by variable value or reorder fraction factor by party seat order in parliament (default).
#'
#' plot data from tb in the following way: for each item in x_variable show the corresponding value in y_variable.
#' Then color the plot depending on the fill value
#' Give the plot a title, an x-label xlab as well as an y-label ylab
#' Color the legend according to filllab
#' Setting flipped to TRUE makes the bars horizontal
#' Improve positioning details according to position
#' and finally reorder x_variable (default ist to order fractions according to seat order)
#' Then color the plot depending on the fill value.
#' Give the plot a title and a label for x-axis and y-axis,
#' color the legend according to filllab and finally
#' improve positioning details according to position
#'
#' @export
bar_plot_fractions <- function(tb,
@@ -116,16 +116,15 @@ bar_plot_fractions <- function(tb,
if (flipped) plt + coord_flip() else plt
}

#' Counts how many talks do match a given pattern and summarises by date
#'
#' @param res tibble
#' @param patterns char list
#' @param name char ? what is name needed for??
#' @param tidy bool, default F
#'
#' shorter summary if tidy=F
#' if tidy is set to T, the resulting tibble is tidy
#'
#' Word usage summarised by date
#'
#' Counts how many talks do match a given pattern and summarises by date.
#'
#' @param res List of Tibbles to be analysed.
#' @param patterns Words to look up.
#' @param name ?
#' @param tidy default is FALSE.
#'
#' @export
word_usage_by_date <- function(res, patterns, name, tidy=F) {
tb <- res$talks


+ 10
- 11
R/parse.R Wyświetl plik

@@ -205,13 +205,11 @@ parse_speakerlist <- function(speakerliste_xml) {
rolle_lang = d["rolle_lang",])
}

#' Write the parsed and repaired results into a csv file to make loading and developing faster and easier
#' Write the parsed and repaired results into separate csv files
#'
#' @param tables tibble list
#' @param path char
#' @param create bool
#'
#' if create is set to TRUE, the directory given in path is created
#' @param tables list of tables to convert into a csv files.
#' @param path where to put the csv files.
#' @param create set TRUE if the path does not exist yet and you want to create it
#'
#' @export
write_to_csv <- function(tables, path="data/csv/", create=F) {
@@ -223,11 +221,12 @@ write_to_csv <- function(tables, path="data/csv/", create=F) {
write.table(tables$applause, str_c(path, "applause.csv"))
}

#' Read the needed tables for developing from a csv file.
#'
#' @param path char
#'
#' Reading the tables from a csv is way faster than reading and repairing the data every single time

#' create a tibble from the csv file
#'
#' @param path directory to read files from
#'
#' reading the tables from a csv is way faster than reading and repairing the data every single time
#'
#' @export
read_from_csv <- function(path="data/csv/") {


+ 16
- 18
man/bar_plot_fractions.Rd Wyświetl plik

@@ -2,7 +2,7 @@
% Please edit documentation in R/analyze.R
\name{bar_plot_fractions}
\alias{bar_plot_fractions}
\title{plot data depending on fractions in a standardized, configurable way}
\title{Bar chart visualizing fraction based data}
\usage{
bar_plot_fractions(
tb,
@@ -21,34 +21,32 @@ bar_plot_fractions(
\arguments{
\item{tb}{tibble}

\item{x_variable}{column in tb}
\item{x_variable}{column in tb, default is fraction}

\item{y_variable}{column in tb}
\item{y_variable}{column in tb, default is n}

\item{fill}{column in tb}
\item{fill}{column in tb, default is fraction}

\item{title}{char}
\item{title}{plot title}

\item{xlab}{char}
\item{xlab}{label for x axis, default is fraction}

\item{ylab}{char}
\item{ylab}{label for y axis, default is n}

\item{filllab}{char}
\item{filllab}{default is 'Fraction'}

\item{flipped}{bool}
\item{flipped}{if TRUE draw bars horizontally, else vertically. Default is TRUE}

\item{position}{char}
\item{position}{default is 'dodge'}

\item{reorder}{bool
\item{reorder}{Either reorder fraction factor by variable value or reorder fraction factor by party seat order in parliament (default).

plot data from tb in the following way: for each item in x_variable show the corresponding value in y_variable.
Then color the plot depending on the fill value
Give the plot a title, an x-label xlab as well as an y-label ylab
Color the legend according to filllab
Setting flipped to TRUE makes the bars horizontal
Improve positioning details according to position
and finally reorder x_variable (default ist to order fractions according to seat order)}
Then color the plot depending on the fill value.
Give the plot a title and a label for x-axis and y-axis,
color the legend according to filllab and finally
improve positioning details according to position}
}
\description{
plot data depending on fractions in a standardized, configurable way
Can be configured to also visualize data not related to fractions.
}

+ 4
- 0
man/fetch_all.Rd Wyświetl plik

@@ -8,6 +8,10 @@ fetch_all(download_dir = "data/records/", create = FALSE)
}
\arguments{
\item{download_dir}{character}

\item{create}{bool

if create is TRUE, the directory given in download_dir is created}
}
\description{
This fetches all available records of the 19th legislative period of the german Bundestag.


+ 3
- 3
man/join_speaker.Rd Wyświetl plik

@@ -9,12 +9,12 @@ join_speaker(tb, res, fraction_only = F)
\arguments{
\item{tb}{tibble}

\item{res}{tibble}
\item{res}{list of tibbles}

\item{fraction_only}{bool
\item{fraction_only}{if TRUE, only select fraction from the resulting joined tibble

left join speaker information from res$speaker into tb.
if fraction_only is TRUE, only fraction is selected from the resulting joined tibble}
if fraction_only}
}
\description{
add information from speaker table to a tibble containing speaker id


+ 2
- 2
man/party_colors.Rd Wyświetl plik

@@ -3,7 +3,7 @@
\docType{data}
\name{party_colors}
\alias{party_colors}
\title{lookup table for party colors}
\title{lookup table for official party colors}
\format{
An object of class \code{character} of length 8.
}
@@ -11,6 +11,6 @@ An object of class \code{character} of length 8.
party_colors
}
\description{
lookup table for party colors
lookup table for official party colors
}
\keyword{datasets}

+ 4
- 4
man/read_from_csv.Rd Wyświetl plik

@@ -2,15 +2,15 @@
% Please edit documentation in R/parse.R
\name{read_from_csv}
\alias{read_from_csv}
\title{Read the needed tables for developing from a csv file.}
\title{create a tibble from the csv file}
\usage{
read_from_csv(path = "data/csv/")
}
\arguments{
\item{path}{char
\item{path}{directory to read files from

Reading the tables from a csv is way faster than reading and repairing the data every single time}
reading the tables from a csv is way faster than reading and repairing the data every single time}
}
\description{
Read the needed tables for developing from a csv file.
create a tibble from the csv file
}

+ 6
- 9
man/word_usage_by_date.Rd Wyświetl plik

@@ -2,22 +2,19 @@
% Please edit documentation in R/analyze.R
\name{word_usage_by_date}
\alias{word_usage_by_date}
\title{Counts how many talks do match a given pattern and summarises by date}
\title{Word usage summarised by date}
\usage{
word_usage_by_date(res, patterns, name, tidy = F)
}
\arguments{
\item{res}{tibble}
\item{res}{List of Tibbles to be analysed.}

\item{patterns}{char list}
\item{patterns}{Words to look up.}

\item{name}{char ? what is name needed for??}
\item{name}{?}

\item{tidy}{bool, default F

shorter summary if tidy=F
if tidy is set to T, the resulting tibble is tidy}
\item{tidy}{default is FALSE.}
}
\description{
Counts how many talks do match a given pattern and summarises by date
Counts how many talks do match a given pattern and summarises by date.
}

+ 5
- 7
man/write_to_csv.Rd Wyświetl plik

@@ -2,19 +2,17 @@
% Please edit documentation in R/parse.R
\name{write_to_csv}
\alias{write_to_csv}
\title{Write the parsed and repaired results into a csv file to make loading and developing faster and easier}
\title{Write the parsed and repaired results into separate csv files}
\usage{
write_to_csv(tables, path = "data/csv/", create = F)
}
\arguments{
\item{tables}{tibble list}
\item{tables}{list of tables to convert into a csv files.}

\item{path}{char}
\item{path}{where to put the csv files.}

\item{create}{bool

if create is set to TRUE, the directory given in path is created}
\item{create}{set TRUE if the path does not exist yet and you want to create it}
}
\description{
Write the parsed and repaired results into a csv file to make loading and developing faster and easier
Write the parsed and repaired results into separate csv files
}

+ 96
- 0
vignettes/genderequality.Rmd Wyświetl plik

@@ -0,0 +1,96 @@
---
title: "genderequality"
output: rmarkdown::html_vignette
vignette: >
%\VignetteIndexEntry{genderequality}
%\VignetteEngine{knitr::rmarkdown}
%\VignetteEncoding{UTF-8}
---

```{r, include = FALSE}
knitr::opts_chunk$set(
collapse = TRUE,
comment = "#>"
)
```

```{r setup}
library(hateimparlament)
library(dplyr)
library(ggplot2)
library(stringr)
library(tidyr)
library(rvest)
```

## Preparation of data

First, you need to download all records of the current legislative period.
```r
fetch_all("../records/") # path to directory where records should be stored
```
Second, those `.xml` files, need to be parsed into `R` `tibbles`. This is accomplished by:
```r
read_all("../records/") %>% repair() -> res
```
We also used `repair` to fix a bunch of formatting issues in the records and unpacked
the result into more descriptive variables.

For development purposes, we load the tables from csv files.
```{r}
res <- read_from_csv('../csv/')
```
and unpack our tibbles
```{r}
comments <- res$comments
speeches <- res$speeches
speaker <- res$speaker
talks <- res$talks
```

Bevor we can do our analysis, we have to assign a gender to our politicans.

```{r}
extract_href <- function(sel, html) {
html %>%
html_node(sel) %>%
html_attr("href")
}

first_content_p_text <- function(url) {
res <- NA
i <- 1
while(is.na(res)) {
read_html(url) %>%
html_node(str_glue("#mw-content-text > div.mw-parser-output > p:nth-child({i})")) %>%
html_text() -> res
i <- i + 1
}
res
}

abgeordneten_list_html <- read_html(
"https://de.wikipedia.org/wiki/Liste_der_Mitglieder_des_Deutschen_Bundestages_(19._Wahlperiode)")

selectors <- str_glue("#mw-content-text > div.mw-parser-output > table:nth-child(20) > tbody > tr:nth-child({2:709}) > td:nth-child(2) > a")
link_part2 <- sapply(selectors, extract_href, abgeordneten_list_html)
link <- str_c("https://de.wikipedia.org", link_part2)

text <- sapply(link, first_content_p_text)
text %>%
str_extract(" ist ein.") %>%
str_replace(" ist eine", "female") %>%
str_replace(" ist ein ", "male") ->
gender

text %>%
str_extract("^([:upper:]?[:lower:]+[\\s\\-]?)*") %>%
str_trim() ->
names

gender <- tibble(speaker = names,
gender = gender)


```


Ładowanie…
Anuluj
Zapisz