瀏覽代碼

add masterdata from bundestag.de and use this for genderequality

genderequality-alternative
flavis 4 年之前
父節點
當前提交
f753920d34
共有 2 個文件被更改,包括 109011 次插入50 次删除
  1. +108989
    -0
      inst/masterdata.xml
  2. +22
    -50
      vignettes/genderequality.Rmd

+ 108989
- 0
inst/masterdata.xml
文件差異過大導致無法顯示
查看文件


+ 22
- 50
vignettes/genderequality.Rmd 查看文件

@@ -20,7 +20,7 @@ library(dplyr)
library(ggplot2)
library(stringr)
library(tidyr)
library(rvest)
library(xml2)
```

## Preparation of data
@@ -33,8 +33,7 @@ Second, those `.xml` files, need to be parsed into `R` `tibbles`. This is accomp
```r
read_all("../records/") %>% repair() -> res
```
We also used `repair` to fix a bunch of formatting issues in the records and unpacked
the result into more descriptive variables.
We also used `repair` to fix a bunch of formatting issues in the records.

For development purposes, we load the tables from csv files.
```{r}
@@ -48,53 +47,27 @@ speaker <- res$speaker
talks <- res$talks
```

Before we can do our analysis, we have to assign a gender to our politicians.
Before we can do our analysis, we have to assign a gender to our politicians. We do this
by reading the gender from the master data of all members of parliament, which is
fetched from bundestag.de.

```{r}
extract_href <- function(sel, html) {
html %>%
html_node(sel) %>%
html_attr("href")
# Read the master data file and derive one id and one gender label per
# <MDB> entry (one entry per member of parliament).
x <- read_xml("../inst/masterdata.xml")
mdbs <- xml_find_all(x, "MDB")

# xml_find_first() applied to a node set returns exactly one result per
# input node (a missing node when nothing matches), so both vectors stay
# aligned with `mdbs` without growing them inside a loop.
ids <- xml_text(xml_find_first(mdbs, "./ID"), trim = TRUE)
mdb_gender <- xml_text(
  xml_find_first(mdbs, "./BIOGRAFISCHE_ANGABEN/GESCHLECHT"),
  trim = TRUE
)

# Map the German labels explicitly. Anything that is neither "männlich" nor
# "weiblich" (missing element, empty string, other value) stays NA instead
# of being silently counted as "female".
genders <- rep(NA_character_, length(mdbs))
genders[mdb_gender %in% "männlich"] <- "male"
genders[mdb_gender %in% "weiblich"] <- "female"

# Return the text of the first `p` element found at child position 1, 2, ...
# below `#mw-content-text > div.mw-parser-output` on the page at `url`.
#
# url:          address of the page to download.
# max_children: highest child position to probe before giving up.
#
# Returns a character string, or NA_character_ if no paragraph was found
# within the first `max_children` positions.
first_content_p_text <- function(url, max_children = 1000L) {
  # Download and parse the page once instead of once per probed position.
  page <- read_html(url)

  for (i in seq_len(max_children)) {
    selector <- str_glue("#mw-content-text > div.mw-parser-output > p:nth-child({i})")
    res <- html_text(html_node(page, selector))
    # html_text() yields NA when the selector matched nothing.
    if (!is.na(res)) {
      return(res)
    }
  }

  # Bounded search: avoid looping forever on pages without such a paragraph.
  NA_character_
}

# Download the Wikipedia list page named in the URL (members of the German
# Bundestag, 19th electoral term).
abgeordneten_list_html <- read_html(
  "https://de.wikipedia.org/wiki/Liste_der_Mitglieder_des_Deutschen_Bundestages_(19._Wahlperiode)")

# One CSS selector per table row (rows 2 to 709), each pointing at the link
# in the second column; resolve every selector to the link's href.
selectors <- str_glue("#mw-content-text > div.mw-parser-output > table:nth-child(20) > tbody > tr:nth-child({2:709}) > td:nth-child(2) > a")
link_part2 <- sapply(selectors, extract_href, abgeordneten_list_html)
link <- str_c("https://de.wikipedia.org", link_part2)

# First content paragraph of every linked page.
text <- sapply(link, first_content_p_text)

# " ist eine" becomes "female", " ist ein " becomes "male".
gender <- str_replace(
  str_replace(str_extract(text, " ist ein."), " ist eine", "female"),
  " ist ein ",
  "male"
)

# Leading run of words at the start of the paragraph, used as the name.
names <- str_trim(str_extract(text, "^([:upper:]?[:lower:]+[\\s\\-]?)*"))

gender <- tibble(speaker = names,
                 gender = gender)

# Build "vorname nachname" in the speaker table and keep every scraped row.
speaker_with_gender <- right_join(
  unite(speaker, "speaker", vorname, nachname, sep = " "),
  gender,
  by = "speaker"
)
# Lookup table: one row per master-data entry with its id and gender label.
gender <- tibble(id = ids, gender = genders)
# Name the join key explicitly instead of relying on the implicit natural
# join over all shared column names; every speaker row is kept.
speaker_with_gender <- left_join(res$speaker, gender, by = "id")
```

## Analyse
@@ -161,7 +134,7 @@ speeches %>%


party_order <- factor(c("Fraktionslos", "AfD&Fraktionslos",
"DIE LINKE", "BÜNDNIS 90 / DIE GRÜNEN", "SPD", "CDU/CSU",
"DIE LINKE", "BÜNDNIS 90/DIE GRÜNEN", "SPD", "CDU/CSU",
"FDP", "AfD", NA_character_))

speech_distribution %>%
@@ -179,9 +152,8 @@ speeches %>%
summarize(n = n()) %>%
ungroup() %>%
arrange(-n) %>%
left_join(speaker, by=c("speaker" = "id")) %>%
unite(name, vorname, nachname, sep = " ") %>%
inner_join(gender, by=c("name"= "speaker")) %>%
join_speaker(res) %>%
left_join(gender, by=c("speaker"="id")) %>%
group_by(gender) %>%
summarise(absolute=sum(n)) %>%
filter(gender %in% c("female", "male")) %>%


Loading…
取消
儲存