An R package to analyze the parliamentary records of the 19th legislative period of the Bundestag, the German parliament.
25개 이상의 토픽을 선택하실 수 없습니다. Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

237 lines
6.5KB

  1. ---
  2. title: "genderequality"
  3. output: rmarkdown::html_vignette
  4. vignette: >
  5. %\VignetteIndexEntry{genderequality}
  6. %\VignetteEngine{knitr::rmarkdown}
  7. %\VignetteEncoding{UTF-8}
  8. ---
  9. ```{r, include = FALSE}
  # Chunk defaults for this vignette: merge source and output into one block
  # and prefix every output line with "#>".
  knitr::opts_chunk$set(collapse = TRUE, comment = "#>")
  14. ```
  15. ```{r setup}
  16. library(hateimparlament)
  17. library(dplyr)
  18. library(ggplot2)
  19. library(stringr)
  20. library(tidyr)
  21. library(rvest)
  22. ```
  23. ## Preparation of data
  24. First, you need to download all records of the current legislative period.
  25. ```r
  26. fetch_all("../records/") # download the records; the argument is the path to the directory where they should be stored
  27. ```
  28. Second, those `.xml` files need to be parsed into `R` `tibbles`. This is accomplished by:
  29. ```r
  # Parse every record in the directory, then run `repair()` on the result.
  res <- read_all("../records/") %>% repair()
  31. ```
  32. We also used `repair` to fix a bunch of formatting issues in the records and unpacked
  33. the result into more descriptive variables.
  34. For development purposes, we load the tables from csv files.
  35. ```{r}
  # Load the already-parsed tables from the CSV directory instead of the records.
  res <- read_from_csv("../inst/csv/")
  37. ```
  38. We then unpack the individual tibbles:
  39. ```{r}
  # Pull the individual tables out of the result so later chunks can refer to
  # them by short names.
  comments <- res[["comments"]]
  speeches <- res[["speeches"]]
  speaker <- res[["speaker"]]
  talks <- res[["talks"]]
  44. ```
  45. Before we can do our analysis, we have to assign a gender to each politician.
  46. ```{r}
  # Return the `href` attribute of the first node in `html` that matches the
  # CSS selector `sel`.
  extract_href <- function(sel, html) {
    node <- html_node(html, sel)
    html_attr(node, "href")
  }
  # Return the text of the first `<p>` element that is a direct child of
  # `#mw-content-text > div.mw-parser-output` on the page at `url`.
  #
  # Child positions `p:nth-child(i)` are probed for i = 1, 2, ... until one
  # of them matches a paragraph.
  #
  # url:          address of the page to read.
  # max_children: upper bound on the child positions probed. Without a bound
  #               the search would never terminate on a page that has no
  #               matching paragraph.
  #
  # Returns the paragraph text, or `NA_character_` if none of the first
  # `max_children` children is a `<p>`.
  first_content_p_text <- function(url, max_children = 200L) {
    # Download and parse the page once; every probe below reuses the parsed
    # document instead of fetching the URL again.
    page <- read_html(url)
    for (i in seq_len(max_children)) {
      selector <- str_glue(
        "#mw-content-text > div.mw-parser-output > p:nth-child({i})"
      )
      paragraph <- html_text(html_node(page, selector))
      if (!is.na(paragraph)) {
        return(paragraph)
      }
    }
    NA_character_
  }
  # Scrape the German Wikipedia list of members of the 19th Bundestag and
  # derive a gender for every member from the first paragraph of their article.
  abgeordneten_list_html <- read_html(
    "https://de.wikipedia.org/wiki/Liste_der_Mitglieder_des_Deutschen_Bundestages_(19._Wahlperiode)"
  )

  # One CSS selector per table row 2..709: the link in the second cell of each
  # row of the table that is the 20th child of the article body.
  # NOTE(review): table position and row range are hard-coded and depend on
  # the current layout of the page.
  selectors <- str_glue(
    "#mw-content-text > div.mw-parser-output > table:nth-child(20) > tbody > tr:nth-child({2:709}) > td:nth-child(2) > a"
  )

  # vapply() instead of sapply(): the result is guaranteed to be a character
  # vector, and a lookup that returns anything else fails loudly.
  link_part2 <- vapply(
    selectors, extract_href, character(1), abgeordneten_list_html
  )
  link <- str_c("https://de.wikipedia.org", link_part2)
  text <- vapply(link, first_content_p_text, character(1))

  # Classify by the German indefinite article following "ist":
  # " ist eine" -> "female", " ist ein " -> "male". Paragraphs without a
  # match yield NA; any other match is kept as extracted.
  gender_label <- text %>%
    str_extract(" ist ein.") %>%
    str_replace(" ist eine", "female") %>%
    str_replace(" ist ein ", "male")

  # The leading run of words (optional upper-case letter, lower-case letters,
  # optional space or hyphen) at the start of the paragraph is used as the name.
  names <- text %>%
    str_extract("^([:upper:]?[:lower:]+[\\s\\-]?)*") %>%
    str_trim()

  gender <- tibble(speaker = names, gender = gender_label)

  # right_join keeps every scraped name, whether or not it matches a speaker.
  speaker_with_gender <- speaker %>%
    unite("speaker", vorname, nachname, sep = " ") %>%
    right_join(gender, by = "speaker")
  84. ```
  85. ## Analysis
  86. First, let's look at the relative distribution of the sexes throughout the whole Bundestag.
  87. ```{r}
  # Percentage of members per gender. Rows whose gender is neither "male" nor
  # "female" are dropped before the percentages are computed.
  plot1 <- speaker_with_gender %>%
    group_by(gender) %>%
    summarise(count = n()) %>%
    filter(gender %in% c("male", "female")) %>%
    mutate(portion = 100 * count / sum(count))

  # A single stacked bar, bent into a pie by the polar coordinate system.
  bp <- ggplot(plot1, aes(x = "", y = portion, fill = gender)) +
    geom_bar(stat = "identity", width = 1)
  pie <- bp + coord_polar(theta = "y", start = 0)

  pie +
    scale_fill_manual(values = c("pink", "blue")) +
    labs(title = "Relative distribution of sexes", x = "", y = "")
  103. ```
  104. Next, we look at the distribution of men and women within each parliamentary group (the `fraction` column).
  105. ```{r}
  # Number of members per parliamentary group.
  fraction_size <- speaker_with_gender %>%
    count(fraction, name = "n")

  # Share of women per group (`q`). Groups without any woman do not appear in
  # the result, because the filter runs before the counting.
  women_per_fraction <- speaker_with_gender %>%
    filter(gender == "female") %>%
    count(fraction, name = "n_female") %>%
    # Explicit key: do not rely on whichever columns happen to share a name,
    # and avoid the "Joining, by = ..." message in the rendered vignette.
    left_join(fraction_size, by = "fraction") %>%
    mutate(q = n_female / n)

  bar_plot_fractions(
    women_per_fraction,
    x_variable = fraction,
    y_variable = q,
    title = "Frauenanteil nach Partei"
  )
  117. ```
  118. ```r
  # Percentage of men and women within each parliamentary group. Rows with a
  # missing group or a gender other than "male"/"female" are excluded.
  plot2 <- speaker_with_gender %>%
    count(fraction, gender, name = "count") %>%
    filter(gender %in% c("male", "female"), !is.na(fraction)) %>%
    group_by(fraction) %>%
    mutate(portion = 100 * count / sum(count))

  # Build the gender pie chart for one parliamentary group.
  #
  # shares:        table with columns `fraction`, `gender` and `portion`.
  # fraction_name: value of `fraction` to plot.
  # title:         plot title; defaults to the group name.
  gender_pie <- function(shares, fraction_name, title = fraction_name) {
    shares %>%
      filter(fraction == fraction_name) %>%
      ggplot(aes(x = "", y = portion, fill = gender)) +
      geom_bar(width = 1, stat = "identity") +
      coord_polar("y", start = 0) +
      ggtitle(title) +
      xlab("") +
      ylab("")
  }

  pie1 <- gender_pie(plot2, "AfD")
  pie2 <- gender_pie(plot2, "BÜNDNIS 90 / DIE GRÜNEN", title = "DIE GRÜNEN")
  pie3 <- gender_pie(plot2, "CDU/CSU")
  pie4 <- gender_pie(plot2, "DIE LINKE")
  pie5 <- gender_pie(plot2, "FDP")
  pie6 <- gender_pie(plot2, "SPD")

  gridExtra::grid.arrange(pie1, pie2, pie3, pie4, pie5, pie6, nrow = 2)
  165. ```
  166. Now let's analyze whether there are any differences in the number of speeches given.
  167. ```{r}
  # Speeches per gender.
  #
  # absolute:  number of speeches given by members of that gender.
  # absolute2: that number as a share of all speeches by men and women.
  # portion:   hard-coded share of each gender among the members
  #            (NOTE(review): presumably the rounded result of the overall
  #            pie chart -- confirm, or derive it from the data).
  # relative:  speech count weighted by the other gender's member share.
  #            With two groups whose portions sum to 1, normalising this
  #            gives the same shares as normalising `absolute / portion`.
  # relative2: `relative` as a share of its total.
  plot3 <- speeches %>%
    count(speaker, name = "n") %>%
    left_join(speaker, by = c("speaker" = "id")) %>%
    unite(name, vorname, nachname, sep = " ") %>%
    inner_join(gender, by = c("name" = "speaker")) %>%
    group_by(gender) %>%
    summarise(absolute = sum(n)) %>%
    filter(gender %in% c("female", "male")) %>%
    mutate(
      absolute2 = absolute / sum(absolute),
      # Keyed on the gender value instead of the row position, so the
      # shares cannot be swapped or fail when a row is missing.
      portion = if_else(gender == "female", 0.32, 0.68),
      relative = absolute * (1 - portion),
      relative2 = relative / sum(relative)
    )
  184. ```
  185. First, let's take a look at the absolute difference in the number of speeches given by the two sexes.
  186. ```{r}
  # Bar chart of each gender's share of all speeches (`absolute2`).
  # Labels and colours are positional -- presumably the rows of `plot3` are
  # ordered female, male; verify against the table.
  barplot(
    plot3$absolute2,
    names.arg = c("women", "men"),
    col = c("pink", "darkblue"),
    main = "Absolute comparison of speech shares",
    ylab = "amount of speeches",
    font.main = 4,
    cex.axis = 0.7,
    las = 1
  )
  195. ```
  196. Since men outnumber women in the German Bundestag, we now consider the speech shares relative to each gender's share of the members.
  197. ```{r}
  # Bar chart of the speech shares after weighting by member share
  # (`relative2`).
  # Labels and colours are positional -- presumably the rows of `plot3` are
  # ordered female, male; verify against the table.
  barplot(
    plot3$relative2,
    names.arg = c("women", "men"),
    col = c("pink", "darkblue"),
    main = "Relative comparison of speech shares",
    ylab = "amount of speeches",
    font.main = 4,
    cex.axis = 0.7,
    las = 1
  )
  206. ```