| @@ -1,8 +1,8 @@ | |||
| --- | |||
| title: "genderequality" | |||
| title: "Differences in gender" | |||
| output: rmarkdown::html_vignette | |||
| vignette: > | |||
| %\VignetteIndexEntry{genderequality} | |||
| %\VignetteIndexEntry{Differences in gender} | |||
| %\VignetteEngine{knitr::rmarkdown} | |||
| %\VignetteEncoding{UTF-8} | |||
| --- | |||
| @@ -20,7 +20,7 @@ library(dplyr) | |||
| library(ggplot2) | |||
| library(stringr) | |||
| library(tidyr) | |||
| library(rvest) | |||
| library(xml2) | |||
| ``` | |||
| ## Preparation of data | |||
| @@ -33,8 +33,7 @@ Second, those `.xml` files, need to be parsed into `R` `tibbles`. This is accomp | |||
| ```r | |||
| read_all("../records/") %>% repair() -> res | |||
| ``` | |||
| We also used `repair` to fix a bunch of formatting issues in the records and unpacked | |||
| the result into more descriptive variables. | |||
| We also used `repair` to fix a bunch of formatting issues in the records. | |||
| For development purposes, we load the tables from csv files. | |||
| ```{r} | |||
| @@ -48,53 +47,33 @@ speaker <- res$speaker | |||
| talks <- res$talks | |||
| ``` | |||
| Before we can do our analysis, we have to assign a gender to our politicians. | |||
| Before we can do our analysis, we have to assign a gender to our politicians. We do this | |||
| by reading the gender from the master data of all members of parliament, which is | |||
| fetched from bundestag.de. | |||
| ```{r} | |||
| extract_href <- function(sel, html) { | |||
| html %>% | |||
| html_node(sel) %>% | |||
| html_attr("href") | |||
| # Text content of all nodes below `node` that match the XPath `name`. | |||
| # Yields NA_character_ when nothing matches, so callers always get a value. | |||
| xml_get <- function(node, name) { | |||
|   values <- node %>% | |||
|     xml_find_all(name) %>% | |||
|     xml_text() | |||
|   if (length(values) > 0) { | |||
|     return(values) | |||
|   } | |||
|   NA_character_ | |||
| } | |||
| first_content_p_text <- function(url) { | |||
| res <- NA | |||
| i <- 1 | |||
| while(is.na(res)) { | |||
| read_html(url) %>% | |||
| html_node(str_glue("#mw-content-text > div.mw-parser-output > p:nth-child({i})")) %>% | |||
| html_text() -> res | |||
| i <- i + 1 | |||
| } | |||
| res | |||
| # Read the master data of all members of parliament and derive one id and one | |||
| # gender per <MDB> entry. | |||
| x <- read_xml("../inst/masterdata.xml") | |||
| mdbs <- xml_find_all(x, "MDB") | |||
| # xml_find_first() on a node set yields exactly one result per <MDB> (a missing | |||
| # node when nothing matches), so `ids` and `genders` always stay aligned. | |||
| ids <- xml_text(xml_find_first(mdbs, "ID")) | |||
| raw_genders <- xml_text( | |||
|   xml_find_first(mdbs, "BIOGRAFISCHE_ANGABEN/GESCHLECHT") | |||
| ) | |||
| # A missing GESCHLECHT stays NA instead of aborting the chunk; as before, every | |||
| # recorded value other than "männlich" is mapped to "female". | |||
| genders <- if_else(raw_genders == "männlich", "male", "female") | |||
| abgeordneten_list_html <- read_html( | |||
| "https://de.wikipedia.org/wiki/Liste_der_Mitglieder_des_Deutschen_Bundestages_(19._Wahlperiode)") | |||
| selectors <- str_glue("#mw-content-text > div.mw-parser-output > table:nth-child(20) > tbody > tr:nth-child({2:709}) > td:nth-child(2) > a") | |||
| link_part2 <- sapply(selectors, extract_href, abgeordneten_list_html) | |||
| link <- str_c("https://de.wikipedia.org", link_part2) | |||
| text <- sapply(link, first_content_p_text) | |||
| text %>% | |||
| str_extract(" ist ein.") %>% | |||
| str_replace(" ist eine", "female") %>% | |||
| str_replace(" ist ein ", "male") -> | |||
| gender | |||
| text %>% | |||
| str_extract("^([:upper:]?[:lower:]+[\\s\\-]?)*") %>% | |||
| str_trim() -> | |||
| names | |||
| gender <- tibble(speaker = names, | |||
| gender = gender) | |||
| speaker %>% | |||
| unite("speaker", vorname, nachname, sep = " ") %>% | |||
| right_join(gender, by = "speaker") -> | |||
| speaker_with_gender | |||
| # Lookup table: one row per member of parliament with the derived gender. | |||
| gender <- tibble(id = ids, gender = genders) | |||
| # Join on the `id` key explicitly so no column is matched by accident and the | |||
| # vignette output is free of the "Joining with `by = ...`" message. | |||
| speaker_with_gender <- left_join(res$speaker, gender, by = "id") | |||
| ``` | |||
| ## Analysis | |||
| @@ -161,7 +140,7 @@ speeches %>% | |||
| party_order <- factor(c("Fraktionslos", "AfD&Fraktionslos", | |||
| "DIE LINKE", "BÜNDNIS 90 / DIE GRÜNEN", "SPD", "CDU/CSU", | |||
| "DIE LINKE", "BÜNDNIS 90/DIE GRÜNEN", "SPD", "CDU/CSU", | |||
| "FDP", "AfD", NA_character_)) | |||
| speech_distribution %>% | |||
| @@ -179,9 +158,8 @@ speeches %>% | |||
| summarize(n = n()) %>% | |||
| ungroup() %>% | |||
| arrange(-n) %>% | |||
| left_join(speaker, by=c("speaker" = "id")) %>% | |||
| unite(name, vorname, nachname, sep = " ") %>% | |||
| inner_join(gender, by=c("name"= "speaker")) %>% | |||
| join_speaker(res) %>% | |||
| left_join(gender, by=c("speaker"="id")) %>% | |||
| group_by(gender) %>% | |||
| summarise(absolute=sum(n)) %>% | |||
| filter(gender %in% c("female", "male")) %>% | |||