| @@ -20,7 +20,7 @@ library(dplyr) | |||||
| library(ggplot2) | library(ggplot2) | ||||
| library(stringr) | library(stringr) | ||||
| library(tidyr) | library(tidyr) | ||||
| library(rvest) | |||||
| library(xml2) | |||||
| ``` | ``` | ||||
| ## Preparation of data | ## Preparation of data | ||||
| @@ -33,8 +33,7 @@ Second, those `.xml` files, need to be parsed into `R` `tibbles`. This is accomp | |||||
| ```r | ```r | ||||
| read_all("../records/") %>% repair() -> res | read_all("../records/") %>% repair() -> res | ||||
| ``` | ``` | ||||
| We also used `repair` to fix a bunch of formatting issues in the records and unpacked | |||||
| the result into more descriptive variables. | |||||
| We also used `repair` to fix a bunch of formatting issues in the records. | |||||
| For development purposes, we load the tables from csv files. | For development purposes, we load the tables from csv files. | ||||
| ```{r} | ```{r} | ||||
| @@ -48,53 +47,27 @@ speaker <- res$speaker | |||||
| talks <- res$talks | talks <- res$talks | ||||
| ``` | ``` | ||||
| Before we can do our analysis, we have to assign a gender to our politicians. | |||||
| Before we can do our analysis, we have to assign a gender to our politicians. We do this | |||||
| by reading the gender from the master data of all members of parliament, which is | |||||
| fetched from bundestag.de. | |||||
| ```{r} | ```{r} | ||||
| extract_href <- function(sel, html) { | |||||
| html %>% | |||||
| html_node(sel) %>% | |||||
| html_attr("href") | |||||
| x <- read_xml("../inst/masterdata.xml") | |||||
| mdbs <- xml_find_all(x, "MDB") | |||||
| ids <- c() | |||||
| genders <- c() | |||||
| for (mdb in mdbs) { | |||||
| xml_get(mdb, "ID") -> mdb_id | |||||
| xml_find_first(mdb, "BIOGRAFISCHE_ANGABEN") %>% | |||||
| xml_get("GESCHLECHT") -> | |||||
| mdb_gender | |||||
| ids <- c(ids, mdb_id) | |||||
| genders <- c(genders, if (mdb_gender == "männlich") "male" else "female") | |||||
| } | } | ||||
| first_content_p_text <- function(url) { | |||||
| res <- NA | |||||
| i <- 1 | |||||
| while(is.na(res)) { | |||||
| read_html(url) %>% | |||||
| html_node(str_glue("#mw-content-text > div.mw-parser-output > p:nth-child({i})")) %>% | |||||
| html_text() -> res | |||||
| i <- i + 1 | |||||
| } | |||||
| res | |||||
| } | |||||
| abgeordneten_list_html <- read_html( | |||||
| "https://de.wikipedia.org/wiki/Liste_der_Mitglieder_des_Deutschen_Bundestages_(19._Wahlperiode)") | |||||
| selectors <- str_glue("#mw-content-text > div.mw-parser-output > table:nth-child(20) > tbody > tr:nth-child({2:709}) > td:nth-child(2) > a") | |||||
| link_part2 <- sapply(selectors, extract_href, abgeordneten_list_html) | |||||
| link <- str_c("https://de.wikipedia.org", link_part2) | |||||
| text <- sapply(link, first_content_p_text) | |||||
| text %>% | |||||
| str_extract(" ist ein.") %>% | |||||
| str_replace(" ist eine", "female") %>% | |||||
| str_replace(" ist ein ", "male") -> | |||||
| gender | |||||
| text %>% | |||||
| str_extract("^([:upper:]?[:lower:]+[\\s\\-]?)*") %>% | |||||
| str_trim() -> | |||||
| names | |||||
| gender <- tibble(speaker = names, | |||||
| gender = gender) | |||||
| speaker %>% | |||||
| unite("speaker", vorname, nachname, sep = " ") %>% | |||||
| right_join(gender, by = "speaker") -> | |||||
| speaker_with_gender | |||||
| gender <- tibble(id = ids, gender = genders) | |||||
| speaker_with_gender <- left_join(res$speaker, gender) | |||||
| ``` | ``` | ||||
| ## Analyse | ## Analyse | ||||
| @@ -161,7 +134,7 @@ speeches %>% | |||||
| party_order <- factor(c("Fraktionslos", "AfD&Fraktionslos", | party_order <- factor(c("Fraktionslos", "AfD&Fraktionslos", | ||||
| "DIE LINKE", "BÜNDNIS 90 / DIE GRÜNEN", "SPD", "CDU/CSU", | |||||
| "DIE LINKE", "BÜNDNIS 90/DIE GRÜNEN", "SPD", "CDU/CSU", | |||||
| "FDP", "AfD", NA_character_)) | "FDP", "AfD", NA_character_)) | ||||
| speech_distribution %>% | speech_distribution %>% | ||||
| @@ -179,9 +152,8 @@ speeches %>% | |||||
| summarize(n = n()) %>% | summarize(n = n()) %>% | ||||
| ungroup() %>% | ungroup() %>% | ||||
| arrange(-n) %>% | arrange(-n) %>% | ||||
| left_join(speaker, by=c("speaker" = "id")) %>% | |||||
| unite(name, vorname, nachname, sep = " ") %>% | |||||
| inner_join(gender, by=c("name"= "speaker")) %>% | |||||
| join_speaker(res) %>% | |||||
| left_join(gender, by=c("speaker"="id")) %>% | |||||
| group_by(gender) %>% | group_by(gender) %>% | ||||
| summarise(absolute=sum(n)) %>% | summarise(absolute=sum(n)) %>% | ||||
| filter(gender %in% c("female", "male")) %>% | filter(gender %in% c("female", "male")) %>% | ||||