An R package to analyze the parliamentary records of the 19th legislative period of the Bundestag, the German parliament.
Вы не можете выбрать более 25 тем Темы должны начинаться с буквы или цифры, могут содержать дефисы(-) и должны содержать не более 35 символов.

225 строки
6.1KB

  1. ---
  2. title: "genderequality"
  3. output: rmarkdown::html_vignette
  4. vignette: >
  5. %\VignetteIndexEntry{genderequality}
  6. %\VignetteEngine{knitr::rmarkdown}
  7. %\VignetteEncoding{UTF-8}
  8. ---
  9. ```{r, include = FALSE}
  # Print chunk output directly beneath its source, each line prefixed by "#>".
  knitr::opts_chunk$set(collapse = TRUE, comment = "#>")
  14. ```
  15. ```{r setup}
  16. library(hateimparlament)
  17. library(dplyr)
  18. library(ggplot2)
  19. library(stringr)
  20. library(tidyr)
  21. library(rvest)
  22. ```
  23. ## Preparation of data
  24. First, you need to download all records of the current legislative period.
  25. ```r
  # The argument is the path to the directory where records should be stored.
  fetch_all("../records/")
  27. ```
  28. Second, those `.xml` files need to be parsed into `R` tibbles. This is accomplished by:
  29. ```r
  # Parse every record in the directory, then repair formatting issues.
  res <- read_all("../records/") %>%
    repair()
  31. ```
  32. We also used `repair` to fix a bunch of formatting issues in the records and unpacked
  33. the result into more descriptive variables.
  34. For development purposes, we load the tables from csv files.
  35. ```{r}
  # Load the tables from the csv directory instead of re-parsing the records.
  res <- read_from_csv("../inst/csv/")
  37. ```
  38. and unpack our tibbles
  39. ```{r}
  # Pull the individual tables out of the result list by exact name.
  comments <- res[["comments"]]
  speeches <- res[["speeches"]]
  speaker <- res[["speaker"]]
  talks <- res[["talks"]]
  44. ```
  45. Before we can do our analysis, we have to assign a gender to each politician.
  46. ```{r}
  # Look up the first node of `html` that matches the CSS selector `sel`
  # and return the value of its `href` attribute.
  #
  # sel:  a CSS selector string.
  # html: a parsed HTML document (as returned by `read_html()`).
  extract_href <- function(sel, html) {
    matched_node <- html_node(html, sel)
    html_attr(matched_node, "href")
  }
  # Return the text of the first paragraph that is a direct child of the
  # article body (`div.mw-parser-output`) of a Wikipedia page.
  #
  # url:          address of the article to fetch.
  # max_children: highest `nth-child` position that is probed before giving
  #               up; keeps the search finite on pages without a matching
  #               paragraph.
  #
  # Returns the paragraph text, or `NA_character_` when none of the first
  # `max_children` child positions holds a `<p>` element.
  first_content_p_text <- function(url, max_children = 50L) {
    # Download and parse once; only the selector changes between probes.
    page <- read_html(url)
    for (i in seq_len(max_children)) {
      selector <- str_glue("#mw-content-text > div.mw-parser-output > p:nth-child({i})")
      paragraph <- html_text(html_node(page, selector))
      # A position that is not a <p> yields a missing node, whose text is NA.
      if (!is.na(paragraph)) {
        return(paragraph)
      }
    }
    NA_character_
  }
  # Scrape the Wikipedia member list of the 19th Bundestag, follow the link
  # to every member's article and derive name and gender from the article's
  # first paragraph.
  abgeordneten_list_html <- read_html(
    "https://de.wikipedia.org/wiki/Liste_der_Mitglieder_des_Deutschen_Bundestages_(19._Wahlperiode)"
  )

  # One selector per table row. The table position (20) and the row range
  # (2:709) are hard-coded against the layout of the page.
  selectors <- str_glue("#mw-content-text > div.mw-parser-output > table:nth-child(20) > tbody > tr:nth-child({2:709}) > td:nth-child(2) > a")

  # vapply() guarantees a character vector with one entry per selector.
  link_part2 <- vapply(
    selectors,
    extract_href,
    character(1),
    html = abgeordneten_list_html
  )
  link <- str_c("https://de.wikipedia.org", link_part2)
  text <- vapply(link, first_content_p_text, character(1))

  gender <- tibble(
    # The leading run of capitalised words is taken as the member's name.
    speaker = text %>%
      str_extract("^([:upper:]?[:lower:]+[\\s\\-]?)*") %>%
      str_trim(),
    # " ist ein." captures the article together with the character after it:
    # " ist eine" (feminine) or " ist ein " (masculine). Any other match is
    # left untouched and filtered out by the later analysis steps.
    gender = text %>%
      str_extract(" ist ein.") %>%
      str_replace(" ist eine", "female") %>%
      str_replace(" ist ein ", "male")
  )

  # Keep every scraped member, including those without a match in `speaker`.
  speaker_with_gender <- speaker %>%
    unite("speaker", vorname, nachname, sep = " ") %>%
    right_join(gender, by = "speaker")
  84. ```
  85. ## Analysis
  86. First, let's look at the relative distribution of the sexes throughout the whole Bundestag.
  87. ```{r}
  # Percentage of women and men among all members with a detected gender.
  plot1 <- speaker_with_gender %>%
    count(gender, name = "count") %>%
    filter(gender %in% c("male", "female")) %>%
    mutate(portion = 100 * count / sum(count))

  # A single stacked bar wrapped onto polar coordinates gives a pie chart.
  bp <- ggplot(plot1, aes(x = "", y = portion, fill = gender)) +
    geom_col(width = 1)
  pie <- bp + coord_polar("y", start = 0)

  pie +
    scale_fill_manual(values = c("pink", "blue")) +
    ggtitle("Relative distribution of sexes") +
    xlab("") +
    ylab("")
  103. ```
  104. Next, we look at the distribution of men and women within each of the individual parties.
  105. ```{r}
  # Percentage of women and men within each parliamentary group.
  plot2 <- speaker_with_gender %>%
    count(fraction, gender, name = "count") %>%
    filter(gender %in% c("male", "female")) %>%
    filter(!is.na(fraction)) %>%
    group_by(fraction) %>%
    mutate(portion = 100 * count / sum(count)) %>%
    ungroup()

  # Build the gender pie chart of a single parliamentary group.
  #
  # data:          table with the columns `fraction`, `gender` and `portion`.
  # fraction_name: value of `fraction` whose rows are plotted.
  # title:         heading shown above the pie.
  fraction_pie <- function(data, fraction_name, title) {
    data %>%
      filter(fraction == fraction_name) %>%
      ggplot(aes(x = "", y = portion, fill = gender)) +
      geom_bar(width = 1, stat = "identity") +
      coord_polar("y", start = 0) +
      ggtitle(title) +
      xlab("") +
      ylab("")
  }

  # Value of `fraction` in the data (name) mapped to the plot title (value).
  pie_titles <- c(
    "AfD" = "AfD",
    "BÜNDNIS 90 / DIE GRÜNEN" = "DIE GRÜNEN",
    "CDU/CSU" = "CDU/CSU",
    "DIE LINKE" = "DIE LINKE",
    "FDP" = "FDP",
    "SPD" = "SPD"
  )

  pies <- lapply(names(pie_titles), function(fraction_name) {
    fraction_pie(plot2, fraction_name, pie_titles[[fraction_name]])
  })

  gridExtra::grid.arrange(grobs = pies, nrow = 2)
  152. ```
  153. Now let's analyze whether there are any differences in the number of speeches given.
  154. ```{r}
  # Share of seats held by each gender, used to weight the speech counts.
  # Keyed by gender so the values cannot be attached to the wrong row.
  member_share <- c(female = 0.32, male = 0.68)

  # Number of speeches per gender, as absolute counts and as shares.
  plot3 <- speeches %>%
    count(speaker) %>%
    left_join(speaker, by = c("speaker" = "id")) %>%
    unite(name, vorname, nachname, sep = " ") %>%
    inner_join(gender, by = c("name" = "speaker")) %>%
    group_by(gender) %>%
    summarise(absolute = sum(n)) %>%
    filter(gender %in% c("female", "male")) %>%
    mutate(
      absolute2 = absolute / sum(absolute),
      portion = unname(member_share[gender]),
      # With two groups, weighting by the other group's share and
      # renormalising equals dividing each count by its own group's share.
      relative = absolute * (1 - portion),
      relative2 = relative / sum(relative)
    )
  171. ```
  172. First, let's take a look at the absolute difference in the number of speeches given by the two sexes.
  173. ```{r}
  # Bar chart of each gender's share of all speeches (women first, then men).
  barplot(
    plot3$absolute2,
    names.arg = c("women", "men"),
    col = c("pink", "darkblue"),
    main = "Absolute comparison of speech shares",
    ylab = "amount of speeches",
    font.main = 4,
    cex.axis = 0.7,
    las = 1
  )
  182. ```
  183. Since more men than women are represented in the German Bundestag, we now consider the share of speeches relative to the ratio of men and women.
  184. ```{r}
  # Bar chart of the speech shares after weighting by the gender ratio
  # (women first, then men).
  barplot(
    plot3$relative2,
    names.arg = c("women", "men"),
    col = c("pink", "darkblue"),
    main = "Relative comparison of speech shares",
    ylab = "amount of speeches",
    font.main = 4,
    cex.axis = 0.7,
    las = 1
  )
  193. ```