An R package to analyze the parliamentary records of the 19th legislative period of the Bundestag, the German parliament.
Nelze vybrat více než 25 témat Téma musí začínat písmenem nebo číslem, může obsahovat pomlčky („-“) a může být dlouhé až 35 znaků.

225 řádky
6.1KB

  1. ---
  2. title: "genderequality"
  3. output: rmarkdown::html_vignette
  4. vignette: >
  5. %\VignetteIndexEntry{genderequality}
  6. %\VignetteEngine{knitr::rmarkdown}
  7. %\VignetteEncoding{UTF-8}
  8. ---
  9. ```{r, include = FALSE}
  # Chunk defaults for the whole vignette: prefix printed output with "#>" and
  # collapse source and output into a single block.
  knitr::opts_chunk$set(comment = "#>", collapse = TRUE)
  14. ```
  15. ```{r setup}
  16. library(hateimparlament)
  17. library(dplyr)
  18. library(ggplot2)
  19. library(stringr)
  20. library(tidyr)
  21. library(rvest)
  22. ```
  23. ## Preparation of data
  24. First, you need to download all records of the current legislative period.
  25. ```r
  # The argument is the path to the directory where the records should be stored.
  fetch_all("../records/")
  27. ```
  28. Second, those `.xml` files need to be parsed into `R` tibbles. This is accomplished by:
  29. ```r
  # Parse every record in the directory, then pass the result through repair().
  res <- repair(read_all("../records/"))
  31. ```
  32. We also use `repair` to fix a number of formatting issues in the records; below, we unpack
  33. the result into more descriptive variables.
  34. For development purposes, we load the tables from csv files.
  35. ```{r}
  # Load the tables from the csv directory shipped with the package sources.
  res <- read_from_csv("../inst/csv/")
  37. ```
  38. and unpack our tibbles
  39. ```{r}
  # Pull the individual tables out of `res` under their own names.
  comments <- res[["comments"]]
  speeches <- res[["speeches"]]
  speaker <- res[["speaker"]]
  talks <- res[["talks"]]
  44. ```
  45. Before we can do our analysis, we have to assign a gender to each politician.
  46. ```{r}
  # Return the `href` attribute of the node in `html` selected by the CSS
  # selector `sel`.
  extract_href <- function(sel, html) {
    matched_node <- html_node(html, sel)
    html_attr(matched_node, "href")
  }
  # Return the text of the first `<p>` that is a direct child of the article
  # body (`div.mw-parser-output`) of the page at `url`.
  #
  # Child positions 1, 2, ... are probed in turn until one of them is a
  # paragraph. The page is downloaded once, before the search starts; the
  # search stops after `max_children` positions so that a page without any
  # matching paragraph cannot loop forever.
  #
  # @param url Address of the page to read.
  # @param max_children Highest child position to probe before giving up.
  # @return The paragraph text, or `NA_character_` (with a warning) when no
  #   paragraph is found among the first `max_children` children.
  first_content_p_text <- function(url, max_children = 100L) {
    page <- read_html(url)
    for (i in seq_len(max_children)) {
      res <- page %>%
        html_node(str_glue("#mw-content-text > div.mw-parser-output > p:nth-child({i})")) %>%
        html_text()
      # html_text() yields NA when the selector matched nothing at position i.
      if (!is.na(res)) {
        return(res)
      }
    }
    warning("No content paragraph found in ", url, call. = FALSE)
    NA_character_
  }
  # Wikipedia's list of the members of the 19th Bundestag.
  abgeordneten_list_html <- read_html(
    "https://de.wikipedia.org/wiki/Liste_der_Mitglieder_des_Deutschen_Bundestages_(19._Wahlperiode)")

  # One CSS selector per table row (rows 2 to 709), each pointing at the link
  # in the second cell of that row.
  selectors <- str_glue("#mw-content-text > div.mw-parser-output > table:nth-child(20) > tbody > tr:nth-child({2:709}) > td:nth-child(2) > a")

  # vapply() enforces exactly one string per selector / page, so an unexpected
  # result fails here instead of silently changing the shape of the output.
  link_part2 <- vapply(selectors, extract_href, character(1), abgeordneten_list_html)
  link <- str_c("https://de.wikipedia.org", link_part2)
  text <- vapply(link, first_content_p_text, character(1))

  # The German article after "ist" carries the grammatical gender:
  # "ist eine ..." -> female, "ist ein ..." -> male; anything else stays NA.
  gender_label <- text %>%
    str_extract(" ist ein.") %>%
    str_replace(" ist eine", "female") %>%
    str_replace(" ist ein ", "male")

  # The leading run of (optionally capitalised) words, separated by spaces or
  # hyphens, is taken as the person's name.
  names <- text %>%
    str_extract("^([:upper:]?[:lower:]+[\\s\\-]?)*") %>%
    str_trim()

  gender <- tibble(speaker = names,
                   gender = gender_label)

  # Attach the detected gender to the speaker table, matching on "first last".
  speaker_with_gender <- speaker %>%
    unite("speaker", vorname, nachname, sep = " ") %>%
    right_join(gender, by = "speaker")
  84. ```
  85. ## Analysis
  86. First, let's look at the relative distribution of the sexes throughout the whole Bundestag.
  87. ```{r}
  # Percentage of each sex among all members with a detected gender.
  plot1 <- speaker_with_gender %>%
    count(gender, name = "count") %>%
    filter(gender %in% c("male", "female")) %>%
    mutate(portion = 100 * count / sum(count))

  # A single stacked bar drawn in polar coordinates gives the pie chart.
  bp <- ggplot(plot1, aes(x = "", y = portion, fill = gender)) +
    geom_bar(width = 1, stat = "identity")
  pie <- bp + coord_polar("y", start = 0)

  pie +
    scale_fill_manual(values = c("pink", "blue")) +
    labs(title = "Relative distribution of sexes", x = "", y = "")
  103. ```
  104. Next, we look at the distribution of men and women within each individual party.
  105. ```{r}
  # Percentage of each sex within every parliamentary group.
  plot2 <- speaker_with_gender %>%
    count(fraction, gender, name = "count") %>%
    filter(gender %in% c("male", "female"), !is.na(fraction)) %>%
    group_by(fraction) %>%
    mutate(portion = 100 * count / sum(count)) %>%
    ungroup()

  # Pie chart of the gender shares of a single parliamentary group.
  #
  # @param shares Table with the columns `fraction`, `gender` and `portion`.
  # @param fraction_name Value of `fraction` whose rows are plotted.
  # @param title Heading of the chart; defaults to `fraction_name`.
  # @return A ggplot object.
  fraction_pie <- function(shares, fraction_name, title = fraction_name) {
    shares %>%
      filter(fraction == fraction_name) %>%
      ggplot(aes(x = "", y = portion, fill = gender)) +
      geom_bar(width = 1, stat = "identity") +
      coord_polar("y", start = 0) +
      ggtitle(title) +
      xlab("") +
      ylab("")
  }

  pie1 <- fraction_pie(plot2, "AfD")
  pie2 <- fraction_pie(plot2, "BÜNDNIS 90 / DIE GRÜNEN", title = "DIE GRÜNEN")
  pie3 <- fraction_pie(plot2, "CDU/CSU")
  pie4 <- fraction_pie(plot2, "DIE LINKE")
  pie5 <- fraction_pie(plot2, "FDP")
  pie6 <- fraction_pie(plot2, "SPD")

  gridExtra::grid.arrange(pie1, pie2, pie3, pie4, pie5, pie6, nrow = 2)
  152. ```
  153. Now let's analyze whether there are any differences in the number of speeches given.
  154. ```{r}
  # Share of seats held by each sex, computed from the scraped member list
  # instead of being typed in by hand.
  seat_share <- gender %>%
    filter(gender %in% c("female", "male")) %>%
    count(gender, name = "members") %>%
    mutate(portion = members / sum(members)) %>%
    select(gender, portion)

  # Speeches per sex:
  #   absolute  - number of speeches
  #   absolute2 - share of all speeches
  #   portion   - share of seats held by that sex
  #   relative  - speeches divided by the seat share
  #   relative2 - `relative` normalised to sum to 1
  # The seat share is joined by `gender`, so the result does not depend on the
  # row order. With two groups, dividing by the seat share gives the same
  # `relative2` as weighting each group by the other group's share.
  plot3 <- speeches %>%
    count(speaker, name = "n") %>%
    left_join(speaker, by = c("speaker" = "id")) %>%
    unite(name, vorname, nachname, sep = " ") %>%
    inner_join(gender, by = c("name" = "speaker")) %>%
    group_by(gender) %>%
    summarise(absolute = sum(n)) %>%
    filter(gender %in% c("female", "male")) %>%
    mutate(absolute2 = absolute / sum(absolute)) %>%
    left_join(seat_share, by = "gender") %>%
    mutate(relative = absolute / portion,
           relative2 = relative / sum(relative))
  171. ```
  172. First, let's take a look at the absolute difference in the number of speeches given by the two sexes.
  173. ```{r}
  # Bar chart of the `absolute2` column of `plot3`, one bar per sex.
  barplot(
    height = plot3[["absolute2"]],
    names.arg = c("women", "men"),
    col = c("pink", "darkblue"),
    main = "Absolute comparison of speech shares",
    ylab = "amount of speeches",
    font.main = 4,
    cex.axis = 0.7,
    las = 1
  )
  182. ```
  183. Since more men than women are represented in the German Bundestag, we now consider each sex's share of speeches relative to its share of seats.
  184. ```{r}
  # Bar chart of the `relative2` column of `plot3`, one bar per sex.
  barplot(
    height = plot3[["relative2"]],
    names.arg = c("women", "men"),
    col = c("pink", "darkblue"),
    main = "Relative comparison of speech shares",
    ylab = "amount of speeches",
    font.main = 4,
    cex.axis = 0.7,
    las = 1
  )
  193. ```