---
title: "genderequality"
output: rmarkdown::html_vignette
vignette: >
  %\VignetteIndexEntry{genderequality}
  %\VignetteEngine{knitr::rmarkdown}
  %\VignetteEncoding{UTF-8}
---

```{r, include = FALSE}
knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>"
)
```

```{r setup}
library(hateimparlament)
library(dplyr)
library(ggplot2)
library(stringr)
library(tidyr)
library(rvest)
```

## Preparation of data

First, you need to download all records of the current legislative period.

```r
fetch_all("../records/") # path to directory where records should be stored
```

Second, those `.xml` files need to be parsed into `R` tibbles. This is
accomplished by:

```r
res <- read_all("../records/") %>%
  repair()
```

We also used `repair` to fix a bunch of formatting issues in the records and
unpacked the result into more descriptive variables.

For development purposes, we load the tables from csv files.

```{r}
# Load the pre-parsed tables shipped with the package sources.
res <- read_from_csv("../inst/csv/")
```

and unpack our tibbles:

```{r}
# One variable per table for the analysis below.
comments <- res$comments
speeches <- res$speeches
speaker <- res$speaker
talks <- res$talks
```

Before we can do our analysis, we have to assign a gender to each politician.
```{r}
# Return the `href` attribute of the first node matching the CSS selector
# `sel` inside the parsed document `html`.
extract_href <- function(sel, html) {
  html %>%
    html_node(sel) %>%
    html_attr("href")
}

# Return the text of the first `<p>` that is a direct child of the article
# body of the page at `url`.
#
# The page is downloaded once; the children of the content container are then
# probed in order until one of them is a paragraph. `max_children` bounds the
# search so that a page without any such paragraph yields `NA` instead of
# looping forever.
first_content_p_text <- function(url, max_children = 50L) {
  page <- read_html(url)
  for (i in seq_len(max_children)) {
    paragraph <- page %>%
      html_node(str_glue("#mw-content-text > div.mw-parser-output > p:nth-child({i})")) %>%
      html_text()
    if (!is.na(paragraph)) {
      return(paragraph)
    }
  }
  NA_character_
}

abgeordneten_list_html <- read_html(
  "https://de.wikipedia.org/wiki/Liste_der_Mitglieder_des_Deutschen_Bundestages_(19._Wahlperiode)"
)

# One selector per table row; each points at the link in the second column.
selectors <- str_glue("#mw-content-text > div.mw-parser-output > table:nth-child(20) > tbody > tr:nth-child({2:709}) > td:nth-child(2) > a")

link_part2 <- vapply(
  selectors,
  extract_href,
  character(1),
  html = abgeordneten_list_html,
  USE.NAMES = FALSE
)
# Rows without a link would otherwise produce an NA url and abort the scrape.
link_part2 <- link_part2[!is.na(link_part2)]
link <- str_c("https://de.wikipedia.org", link_part2)

# Fetch the introductory paragraph of every linked article. A page that cannot
# be read is reported and skipped instead of aborting the whole run.
text <- vapply(
  link,
  function(url) {
    tryCatch(
      first_content_p_text(url),
      error = function(e) {
        warning("Could not read ", url, ": ", conditionMessage(e), call. = FALSE)
        NA_character_
      }
    )
  },
  character(1),
  USE.NAMES = FALSE
)
text <- text[!is.na(text)]

# German articles introduce a person with "... ist ein ..." (male) or
# "... ist eine ..." (female); anything else stays unclassified.
gender_label <- text %>%
  str_extract(" ist ein.") %>%
  str_replace(" ist eine", "female") %>%
  str_replace(" ist ein ", "male")

# The leading run of words in the paragraph is taken as the person's name.
speaker_names <- text %>%
  str_extract("^([:upper:]?[:lower:]+[\\s\\-]?)*") %>%
  str_trim()

gender <- tibble(speaker = speaker_names, gender = gender_label)

speaker_with_gender <- speaker %>%
  unite("speaker", vorname, nachname, sep = " ") %>%
  right_join(gender, by = "speaker")
```

## Analysis

First, let's look at the relative distribution of the sexes throughout the
whole Bundestag.

```{r}
# Share (in percent) of each classified gender.
plot1 <- speaker_with_gender %>%
  count(gender, name = "count") %>%
  filter(gender %in% c("male", "female")) %>%
  mutate(portion = 100 * count / sum(count))

bp <- ggplot(plot1, aes(x = "", y = portion, fill = gender)) +
  geom_bar(width = 1, stat = "identity")
pie <- bp + coord_polar("y", start = 0)
pie +
  # Named values keep the colour mapping independent of the level order.
  scale_fill_manual(values = c(female = "pink", male = "blue")) +
  ggtitle("Relative distribution of sexes") +
  xlab("") +
  ylab("")
```

Next, we look at the distribution of men and women within the individual
parties.
```{r}
# Share (in percent) of each gender within every parliamentary group.
plot2 <- speaker_with_gender %>%
  filter(gender %in% c("male", "female"), !is.na(fraction)) %>%
  count(fraction, gender, name = "count") %>%
  group_by(fraction) %>%
  mutate(portion = 100 * count / sum(count)) %>%
  ungroup()

# Build the gender pie chart for one parliamentary group.
#
# data:  table with columns `fraction`, `gender` and `portion`.
# party: value of `fraction` to plot.
# title: plot title; defaults to `party`.
fraction_pie <- function(data, party, title = party) {
  data %>%
    filter(fraction == party) %>%
    ggplot(aes(x = "", y = portion, fill = gender)) +
    geom_bar(width = 1, stat = "identity") +
    coord_polar("y", start = 0) +
    ggtitle(title) +
    xlab("") +
    ylab("")
}

pie1 <- fraction_pie(plot2, "AfD")
pie2 <- fraction_pie(plot2, "BÜNDNIS 90 / DIE GRÜNEN", title = "DIE GRÜNEN")
pie3 <- fraction_pie(plot2, "CDU/CSU")
pie4 <- fraction_pie(plot2, "DIE LINKE")
pie5 <- fraction_pie(plot2, "FDP")
pie6 <- fraction_pie(plot2, "SPD")

gridExtra::grid.arrange(pie1, pie2, pie3, pie4, pie5, pie6, nrow = 2)
```

Now let's analyze whether there are any differences in the number of speeches
given.
```{r}
# Share of each gender among all classified members. Computed from the data
# rather than hard-coded, and later matched by gender instead of row position.
gender_share <- gender %>%
  filter(gender %in% c("female", "male")) %>%
  count(gender, name = "members") %>%
  mutate(portion = members / sum(members)) %>%
  select(gender, portion)

plot3 <- speeches %>%
  count(speaker, name = "n") %>%
  left_join(speaker, by = c("speaker" = "id")) %>%
  unite(name, vorname, nachname, sep = " ") %>%
  inner_join(gender, by = c("name" = "speaker")) %>%
  group_by(gender) %>%
  summarise(absolute = sum(n)) %>%
  filter(gender %in% c("female", "male")) %>%
  mutate(absolute2 = absolute / sum(absolute)) %>%
  inner_join(gender_share, by = "gender") %>%
  # The bar labels below rely on the order female, male.
  arrange(gender) %>%
  # With two groups, weighting by the other group's share and normalising is
  # the same as normalising speeches per unit of own share.
  mutate(relative = absolute * (1 - portion)) %>%
  mutate(relative2 = relative / sum(relative))
```

First, let's take a look at the absolute difference in the share of speeches
given by the two sexes.

```{r}
barplot(
  plot3$absolute2,
  ylab = "share of speeches",
  main = "Absolute comparison of speech shares",
  las = 1,
  names.arg = c("women", "men"),
  col = c("pink", "darkblue"),
  font.main = 4,
  cex.axis = 0.7
)
```

Since more men than women are represented in the German Bundestag, we now
consider the share of speeches relative to each gender's share of members.

```{r}
barplot(
  plot3$relative2,
  ylab = "share of speeches",
  main = "Relative comparison of speech shares",
  las = 1,
  names.arg = c("women", "men"),
  col = c("pink", "darkblue"),
  font.main = 4,
  cex.axis = 0.7
)
```