|
|
|
@@ -1,8 +1,8 @@ |
|
|
|
--- |
|
|
|
title: "genderequality" |
|
|
|
title: "Differences in gender" |
|
|
|
output: rmarkdown::html_vignette |
|
|
|
vignette: > |
|
|
|
%\VignetteIndexEntry{genderequality} |
|
|
|
%\VignetteIndexEntry{Differences in gender} |
|
|
|
%\VignetteEngine{knitr::rmarkdown} |
|
|
|
%\VignetteEncoding{UTF-8} |
|
|
|
--- |
|
|
|
@@ -20,7 +20,7 @@ library(dplyr) |
|
|
|
library(ggplot2) |
|
|
|
library(stringr) |
|
|
|
library(tidyr) |
|
|
|
library(rvest) |
|
|
|
library(xml2) |
|
|
|
``` |
|
|
|
|
|
|
|
## Preparation of data |
|
|
|
@@ -33,8 +33,7 @@ Second, those `.xml` files, need to be parsed into `R` `tibbles`. This is accomp |
|
|
|
```r |
|
|
|
read_all("../records/") %>% repair() -> res |
|
|
|
``` |
|
|
|
We also used `repair` to fix a bunch of formatting issues in the records and unpacked |
|
|
|
the result into more descriptive variables. |
|
|
|
We also used `repair` to fix a bunch of formatting issues in the records. |
|
|
|
|
|
|
|
For development purposes, we load the tables from csv files. |
|
|
|
```{r} |
|
|
|
@@ -48,53 +47,33 @@ speaker <- res$speaker |
|
|
|
talks <- res$talks |
|
|
|
``` |
|
|
|
|
|
|
|
Before we can do our analysis, we have to assign a gender to our politicians. |
|
|
|
Before we can do our analysis, we have to assign a gender to our politicians. We do this |
|
|
|
by reading the gender from the master data of all members of parliament, which is |
|
|
|
fetched from bundestag.de. |
|
|
|
|
|
|
|
```{r} |
|
|
|
extract_href <- function(sel, html) { |
|
|
|
html %>% |
|
|
|
html_node(sel) %>% |
|
|
|
html_attr("href") |
|
|
|
xml_get <- function(node, name) { |
|
|
|
res <- xml_text(xml_find_all(node, name)) |
|
|
|
if (length(res) == 0) NA_character_ |
|
|
|
else res |
|
|
|
} |
|
|
|
|
|
|
|
first_content_p_text <- function(url) { |
|
|
|
res <- NA |
|
|
|
i <- 1 |
|
|
|
while(is.na(res)) { |
|
|
|
read_html(url) %>% |
|
|
|
html_node(str_glue("#mw-content-text > div.mw-parser-output > p:nth-child({i})")) %>% |
|
|
|
html_text() -> res |
|
|
|
i <- i + 1 |
|
|
|
} |
|
|
|
res |
|
|
|
x <- read_xml("../inst/masterdata.xml") |
|
|
|
mdbs <- xml_find_all(x, "MDB") |
|
|
|
|
|
|
|
ids <- c() |
|
|
|
genders <- c() |
|
|
|
for (mdb in mdbs) { |
|
|
|
xml_get(mdb, "ID") -> mdb_id |
|
|
|
xml_find_first(mdb, "BIOGRAFISCHE_ANGABEN") %>% |
|
|
|
xml_get("GESCHLECHT") -> |
|
|
|
mdb_gender |
|
|
|
ids <- c(ids, mdb_id) |
|
|
|
genders <- c(genders, if (mdb_gender == "männlich") "male" else "female") |
|
|
|
} |
|
|
|
|
|
|
|
abgeordneten_list_html <- read_html( |
|
|
|
"https://de.wikipedia.org/wiki/Liste_der_Mitglieder_des_Deutschen_Bundestages_(19._Wahlperiode)") |
|
|
|
|
|
|
|
selectors <- str_glue("#mw-content-text > div.mw-parser-output > table:nth-child(20) > tbody > tr:nth-child({2:709}) > td:nth-child(2) > a") |
|
|
|
link_part2 <- sapply(selectors, extract_href, abgeordneten_list_html) |
|
|
|
link <- str_c("https://de.wikipedia.org", link_part2) |
|
|
|
|
|
|
|
text <- sapply(link, first_content_p_text) |
|
|
|
text %>% |
|
|
|
str_extract(" ist ein.") %>% |
|
|
|
str_replace(" ist eine", "female") %>% |
|
|
|
str_replace(" ist ein ", "male") -> |
|
|
|
gender |
|
|
|
|
|
|
|
text %>% |
|
|
|
str_extract("^([:upper:]?[:lower:]+[\\s\\-]?)*") %>% |
|
|
|
str_trim() -> |
|
|
|
names |
|
|
|
|
|
|
|
gender <- tibble(speaker = names, |
|
|
|
gender = gender) |
|
|
|
|
|
|
|
speaker %>% |
|
|
|
unite("speaker", vorname, nachname, sep = " ") %>% |
|
|
|
right_join(gender, by = "speaker") -> |
|
|
|
speaker_with_gender |
|
|
|
gender <- tibble(id = ids, gender = genders) |
|
|
|
speaker_with_gender <- left_join(res$speaker, gender) |
|
|
|
``` |
|
|
|
|
|
|
|
## Analysis |
|
|
|
@@ -161,7 +140,7 @@ speeches %>% |
|
|
|
|
|
|
|
|
|
|
|
party_order <- factor(c("Fraktionslos", "AfD&Fraktionslos", |
|
|
|
"DIE LINKE", "BÜNDNIS 90 / DIE GRÜNEN", "SPD", "CDU/CSU", |
|
|
|
"DIE LINKE", "BÜNDNIS 90/DIE GRÜNEN", "SPD", "CDU/CSU", |
|
|
|
"FDP", "AfD", NA_character_)) |
|
|
|
|
|
|
|
speech_distribution %>% |
|
|
|
@@ -179,9 +158,8 @@ speeches %>% |
|
|
|
summarize(n = n()) %>% |
|
|
|
ungroup() %>% |
|
|
|
arrange(-n) %>% |
|
|
|
left_join(speaker, by=c("speaker" = "id")) %>% |
|
|
|
unite(name, vorname, nachname, sep = " ") %>% |
|
|
|
inner_join(gender, by=c("name"= "speaker")) %>% |
|
|
|
join_speaker(res) %>% |
|
|
|
left_join(gender, by=c("speaker"="id")) %>% |
|
|
|
group_by(gender) %>% |
|
|
|
summarise(absolute=sum(n)) %>% |
|
|
|
filter(gender %in% c("female", "male")) %>% |
|
|
|
|