|
|
|
@@ -0,0 +1,94 @@ |
|
|
|
--- |
|
|
|
title: "genderequality" |
|
|
|
output: rmarkdown::html_vignette |
|
|
|
vignette: > |
|
|
|
%\VignetteIndexEntry{genderequality} |
|
|
|
%\VignetteEngine{knitr::rmarkdown} |
|
|
|
%\VignetteEncoding{UTF-8} |
|
|
|
--- |
|
|
|
|
|
|
|
```{r, include = FALSE} |
|
|
|
# Chunk defaults for this vignette: collapse source and output into one block
# and prefix printed output with "#>".
knitr::opts_chunk$set(collapse = TRUE, comment = "#>")
|
|
|
``` |
|
|
|
|
|
|
|
```{r setup} |
|
|
|
library(hateimparlament) |
|
|
|
library(dplyr) |
|
|
|
library(ggplot2) |
|
|
|
library(stringr) |
|
|
|
library(tidyr) |
|
|
|
library(rvest) |
|
|
|
``` |
|
|
|
|
|
|
|
## Preparation of data |
|
|
|
|
|
|
|
First, you need to download all records of the current legislative period. |
|
|
|
```r |
|
|
|
fetch_all("../records/") # path to directory where records should be stored |
|
|
|
``` |
|
|
|
Second, those `.xml` files need to be parsed into `R` `tibbles`. This is accomplished by:
|
|
|
```r |
|
|
|
# Parse every downloaded record and repair formatting issues.
res <- read_all("../records/") %>%
  repair()
|
|
|
``` |
|
|
|
We also used `repair` to fix a bunch of formatting issues in the records and unpacked |
|
|
|
the result into more descriptive variables. |
|
|
|
|
|
|
|
For development purposes, we load the tables from csv files. |
|
|
|
```{r} |
|
|
|
# Load the pre-parsed tables from the csv directory.
res <- read_from_csv("../csv/")
|
|
|
``` |
|
|
|
and unpack our tibbles |
|
|
|
```{r} |
|
|
|
# Pull the individual tables out of the result so they can be used by name.
comments <- res[["comments"]]
speeches <- res[["speeches"]]
speaker <- res[["speaker"]]
talks <- res[["talks"]]
|
|
|
``` |
|
|
|
|
|
|
|
Before we can do our analysis, we have to assign a gender to our politicians.
|
|
|
|
|
|
|
```{r} |
|
|
|
# Return the `href` attribute of the first node in `html` that matches the
# CSS selector `sel`.
extract_href <- function(sel, html) {
  matched_node <- html_node(html, sel)
  html_attr(matched_node, "href")
}
|
|
|
|
|
|
|
# Download a Wikipedia article and return the text of its first content
# paragraph.
#
# `url` is the address of the article. The result is a length-one character
# vector holding the text of the first `<p>` that is a direct child of the
# `div.mw-parser-output` container, or `NA_character_` when `url` is missing
# or the page contains no such paragraph.
first_content_p_text <- function(url) {
  # Without a url there is nothing to download.
  if (is.na(url)) {
    return(NA_character_)
  }

  # Fetch the page exactly once; every lookup below works on this document.
  page <- read_html(url)

  # html_node() yields the first match in document order, so selecting the
  # direct-child paragraphs gives the earliest `<p>` without probing child
  # positions one by one. When nothing matches, html_text() returns NA, so
  # a page without paragraphs cannot stall the scrape.
  page %>%
    html_node("#mw-content-text > div.mw-parser-output > p") %>%
    html_text()
}
|
|
|
|
|
|
|
# Build a lookup table `gender` (columns `name`, `gender`) by scraping the
# German Wikipedia list of members of the 19th Bundestag and the first
# paragraph of every member's own article.
abgeordneten_list_html <- read_html(
  "https://de.wikipedia.org/wiki/Liste_der_Mitglieder_des_Deutschen_Bundestages_(19._Wahlperiode)"
)

# One CSS selector per table row, pointing at the link in the second column.
# NOTE(review): the table position (20th child) and the row range 2:709 are
# hard-coded and depend on the current layout of the page -- verify before
# re-running.
selectors <- str_glue(
  "#mw-content-text > div.mw-parser-output > table:nth-child(20) > tbody > tr:nth-child({2:709}) > td:nth-child(2) > a"
)

# vapply() guarantees one character value per selector (NA when no link is
# found), independent of the input.
link_part2 <- vapply(
  selectors,
  extract_href,
  character(1),
  abgeordneten_list_html
)
link <- str_c("https://de.wikipedia.org", link_part2)

# First content paragraph of every member's article.
text <- vapply(link, first_content_p_text, character(1))

# The opening sentence reads "<name> ist ein ..." (male) or
# "<name> ist eine ..." (female). The word boundary keeps other forms such as
# "einer" or "einem" from being read as "eine", and accepts punctuation or a
# line break after the article.
gender_match <- str_extract(text, " ist eine?\\b")

# The leading run of name-like words (optionally capitalised, separated by
# whitespace or hyphens) is taken as the member's name.
names <- text %>%
  str_extract("^([:upper:]?[:lower:]+[\\s\\-]?)*") %>%
  str_trim()

gender <- tibble(
  name = names,
  gender = case_when(
    gender_match == " ist eine" ~ "female",
    gender_match == " ist ein" ~ "male",
    TRUE ~ NA_character_
  )
)
|
|
|
``` |
|
|
|
|