add masterdata from bundestag.de and use this for genderequality

4 年之前 · f753920d34
--- a/inst/masterdata.xml
+++ b/inst/masterdata.xml
--- a/vignettes/genderequality.Rmd
+++ b/vignettes/genderequality.Rmd
@@ -20,7 +20,7 @@ library(dplyr)
 library(ggplot2)
 library(stringr)
 library(tidyr)
 library(rvest)
 library(xml2)
 ```

 ## Preparation of data
@@ -33,8 +33,7 @@ Second, those `.xml` files, need to be parsed into `R` `tibbles`. This is accomp
 ```r
 read_all("../records/") %>% repair() -> res
 ```
 We also used `repair` to fix a bunch of formatting issues in the records and unpacked
 the result into more descriptive variables.
 We also used `repair` to fix a bunch of formatting issues in the records.

 For development purposes, we load the tables from csv files.
 ```{r}
@@ -48,53 +47,27 @@ speaker <- res$speaker
 talks <- res$talks
 ```

 Before we can do our analysis, we have to assign a gender to our politicians.
 Before we can do our analysis, we have to assign a gender to our politicians. We do this
 by reading the gender from the master data of all members of parliament, which is
 fetched from bundestag.de.

 ```{r}
 extract_href <- function(sel, html) {
  html %>%
    html_node(sel) %>%
    html_attr("href")
 x <- read_xml("../inst/masterdata.xml")
 mdbs <- xml_find_all(x, "MDB")

 # Collect the id and an English gender label for every node in `mdbs`.
 # Results are written into pre-allocated containers instead of being grown
 # with c() on every iteration.
 # NOTE(review): `xml_get` is not defined in the visible part of this file; it
 # is presumably a helper returning the text of the named child -- confirm.
 n_mdbs <- length(mdbs)
 id_list <- vector("list", n_mdbs)
 genders <- character(n_mdbs)
 for (i in seq_len(n_mdbs)) {
   mdb <- mdbs[[i]]
   mdb_id <- xml_get(mdb, "ID")
   mdb_gender <- xml_find_first(mdb, "BIOGRAFISCHE_ANGABEN") %>%
     xml_get("GESCHLECHT")
   # Wrapping in list() keeps the slot even if the id is NULL.
   id_list[i] <- list(mdb_id)
   # "männlich" maps to "male" and every other label to "female", as before.
   # A missing label becomes NA instead of aborting the `if` with an error.
   genders[i] <- if (length(mdb_gender) != 1 || is.na(mdb_gender)) {
     NA_character_
   } else if (mdb_gender == "männlich") {
     "male"
   } else {
     "female"
   }
 }
 # unlist() concatenates the collected ids the same way repeated c() did, so
 # the resulting vector keeps whatever type `xml_get` returns.
 ids <- unlist(id_list)

 # Return the text of the first `<p>` element that is a direct child of
 # `#mw-content-text > div.mw-parser-output` on the page at `url`.
 #
 # url: address of the page to download.
 # Returns a length-one character vector, or NA_character_ when no direct
 # child of the container is a `<p>`.
 first_content_p_text <- function(url) {
   # Download and parse the page once; fetching it inside the loop would hit
   # the network again for every candidate position.
   page <- read_html(url)
   container <- "#mw-content-text > div.mw-parser-output"
   # The number of element children bounds the positions worth probing, so the
   # search always terminates.
   child_selector <- str_glue("{container} > *")
   n_children <- length(html_nodes(page, child_selector))
   for (i in seq_len(n_children)) {
     p_selector <- str_glue("{container} > p:nth-child({i})")
     res <- page %>%
       html_node(p_selector) %>%
       html_text()
     if (!is.na(res)) {
       return(res)
     }
   }
   NA_character_
 }

 # Download the German Wikipedia list of members of the 19th Bundestag.
 abgeordneten_list_html <- read_html(
  "https://de.wikipedia.org/wiki/Liste_der_Mitglieder_des_Deutschen_Bundestages_(19._Wahlperiode)")

 # One CSS selector per table row 2..709, each pointing at the link inside the
 # row's second cell. The table position (20) and the row range are hard-coded.
 selectors <- str_glue("#mw-content-text > div.mw-parser-output > table:nth-child(20) > tbody > tr:nth-child({2:709}) > td:nth-child(2) > a")
 # Read the href of every selected link and prefix the Wikipedia host.
 link_part2 <- sapply(selectors, extract_href, abgeordneten_list_html)
 link <- str_c("https://de.wikipedia.org", link_part2)

 # Fetch the first content paragraph of every linked page.
 text <- sapply(link, first_content_p_text)
 # Derive a gender label from the German wording of the paragraph:
 # " ist eine" becomes "female", " ist ein " becomes "male".
 text %>% 
  str_extract(" ist ein.") %>% 
  str_replace(" ist eine", "female") %>% 
  str_replace(" ist ein ", "male") ->
  gender

 # Take the leading run of words (optional capital, lower-case letters,
 # separated by whitespace or hyphens) of each paragraph as the name.
 text %>% 
  str_extract("^([:upper:]?[:lower:]+[\\s\\-]?)*") %>% 
  str_trim() -> 
  names

 # Pair each extracted name with its gender label.
 gender <- tibble(speaker = names,
                 gender = gender)

 # Combine first and last name into one `speaker` column and join on it;
 # right_join keeps every row of `gender`.
 speaker %>% 
  unite("speaker", vorname, nachname, sep = " ") %>% 
  right_join(gender, by = "speaker") -> 
  speaker_with_gender
 # Rebuild `gender` keyed by id from `ids` / `genders` (this overwrites the
 # name-keyed tibble above) and attach it to `res$speaker`.
 # NOTE(review): no `by` is given, so the join uses every column the two tables
 # share -- presumably only `id`; confirm.
 gender <- tibble(id = ids, gender = genders)
 speaker_with_gender <- left_join(res$speaker, gender)
 ```

 ## Analyse
@@ -161,7 +134,7 @@ speeches %>%


 # Party labels wrapped in a factor. The vector lists "DIE LINKE", "SPD" and
 # "CDU/CSU" twice and contains two spellings of the Greens label.
 # NOTE(review): without an explicit `levels =` argument factor() sorts its
 # levels, so the order written here is not the level order -- confirm how
 # `party_order` is used further down.
 party_order <- factor(c("Fraktionslos", "AfD&Fraktionslos",
                        "DIE LINKE", "BÜNDNIS 90 / DIE GRÜNEN", "SPD", "CDU/CSU",
                        "DIE LINKE", "BÜNDNIS 90/DIE GRÜNEN", "SPD", "CDU/CSU",
                        "FDP", "AfD", NA_character_))

 speech_distribution %>% 
@@ -179,9 +152,8 @@ speeches %>%
  summarize(n = n()) %>%
  ungroup() %>% 
  arrange(-n) %>%
  left_join(speaker, by=c("speaker" = "id")) %>% 
  unite(name, vorname, nachname, sep = " ") %>% 
  inner_join(gender, by=c("name"= "speaker")) %>% 
  join_speaker(res) %>%
  left_join(gender, by=c("speaker"="id")) %>% 
  group_by(gender) %>% 
  summarise(absolute=sum(n)) %>%
  filter(gender %in% c("female", "male")) %>%