create genderequality, add documentation read_from_csv

4 лет назад · 1433159e09
--- a/R/analyze.R
+++ b/R/analyze.R
@@ -12,6 +12,8 @@ join_speaker <- function(tb, res, fraction_only = F) {
    else joined
 }

 #'Assignment of the official colors to the parties
 #'
 #' @export
 party_colors <- c(
  AfD="#1A9FDD",
--- a/R/parse.R
+++ b/R/parse.R
@@ -223,6 +223,10 @@ write_to_csv <- function(tables, path="csv/", create=F) {
    write.table(tables$applause, str_c(path, "applause.csv"))
 }

 #'Create a Tibble out of the CSV-Document
 #'
 #'@param path The path that shows which Documents you want to use.
 #'
 #' @export
 read_from_csv <- function(path="csv/") {
    list(speaker = read.table(str_c(path, "speaker.csv")) %>%
--- a/man/read_from_csv.Rd
+++ b/man/read_from_csv.Rd
@@ -0,0 +1,14 @@
 % Generated by roxygen2: do not edit by hand
 % Please edit documentation in R/parse.R
 \name{read_from_csv}
 \alias{read_from_csv}
 \title{Create a Tibble out of the CSV-Document}
 \usage{
 read_from_csv(path = "csv/")
 }
 \arguments{
 \item{path}{The path that shows which Documents you want to use.}
 }
 \description{
 Create a Tibble out of the CSV-Document
 }
--- a/vignettes/genderequality.Rmd
+++ b/vignettes/genderequality.Rmd
@@ -0,0 +1,94 @@
 ---
 title: "genderequality"
 output: rmarkdown::html_vignette
 vignette: >
  %\VignetteIndexEntry{genderequality}
  %\VignetteEngine{knitr::rmarkdown}
  %\VignetteEncoding{UTF-8}
 ---

 ```{r, include = FALSE}
 # Global knitr chunk options for this vignette: `collapse = TRUE` renders
 # source and output of a chunk in one block, and `comment = "#>"` sets the
 # prefix that is put in front of every printed output line.
 knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>"
 )
 ```

 ```{r setup}
 library(hateimparlament)
 library(dplyr)
 library(ggplot2)
 library(stringr)
 library(tidyr)
 library(rvest)
 ```

 ## Preparation of data

 First, you need to download all records of the current legislative period.
 ```r
 fetch_all("../records/") # path to directory where records should be stored
 ```
 Second, those `.xml` files need to be parsed into `R` `tibbles`. This is accomplished by:
 ```r
 read_all("../records/") %>% repair() -> res
 ```
 We also used `repair` to fix a bunch of formatting issues in the records and unpacked
 the result into more descriptive variables.

 For development purposes, we load the tables from csv files.
 ```{r}
 # Load the previously exported tables from the csv directory next to the
 # vignette folder instead of re-parsing the records.
 res <- read_from_csv('../csv/')
 ```
 and unpack our tibbles
 ```{r}
 # Pull the individual tables out of the list returned by `read_from_csv()`
 # so that they can be referenced by name below.
 comments <- res$comments
 speeches <- res$speeches
 speaker <- res$speaker
 talks <- res$talks
 ```

 Before we can do our analysis, we have to assign a gender to our politicians.

 ```{r}
 # Look up the first node in the parsed document `html` that matches the CSS
 # selector `sel` and return the value of its `href` attribute.
 #
 # sel:  CSS selector (single string).
 # html: parsed HTML document, as produced by `read_html()`.
 extract_href <- function(sel, html) {
   matched_node <- html_node(html, sel)
   html_attr(matched_node, "href")
 }

 # Download `url` and return the text of the first `<p>` element that is a
 # direct child of the MediaWiki content container
 # (`#mw-content-text > div.mw-parser-output`).
 #
 # The previous implementation probed `p:nth-child(i)` for i = 1, 2, ... and
 # re-downloaded the page on every probe; it also never terminated when the
 # container held no paragraph at all. Probing positions in ascending order
 # finds the first `<p>` child, so selecting that child directly is equivalent
 # (assuming a single content container), needs only one download, and always
 # terminates.
 #
 # url: address of the article to fetch (single string).
 #
 # Returns a character scalar; `NA` when the page has no matching paragraph.
 first_content_p_text <- function(url) {
   page <- read_html(url)
   first_paragraph <- html_node(
     page,
     "#mw-content-text > div.mw-parser-output > p"
   )
   html_text(first_paragraph)
 }

 # Fetch the German Wikipedia list of members of the 19th Bundestag.
 abgeordneten_list_html <- read_html(
  "https://de.wikipedia.org/wiki/Liste_der_Mitglieder_des_Deutschen_Bundestages_(19._Wahlperiode)")

 # One CSS selector per table row (rows 2 to 709), each pointing at the link
 # in the second column. NOTE(review): the table position (`nth-child(20)`)
 # and the row range are hard-coded and depend on the current page layout.
 selectors <- str_glue("#mw-content-text > div.mw-parser-output > table:nth-child(20) > tbody > tr:nth-child({2:709}) > td:nth-child(2) > a")

 # `vapply()` guarantees a character vector (one entry per selector) instead
 # of relying on the input-dependent simplification done by `sapply()`.
 link_part2 <- vapply(selectors, extract_href, FUN.VALUE = character(1),
                      html = abgeordneten_list_html)
 link <- str_c("https://de.wikipedia.org", link_part2)

 # First content paragraph of every linked article (one request per link).
 text <- vapply(link, first_content_p_text, FUN.VALUE = character(1))

 # The paragraph is searched for " ist ein" plus one following character:
 # " ist eine" is mapped to "female", " ist ein " to "male".
 gender <- text %>%
   str_extract(" ist ein.") %>%
   str_replace(" ist eine", "female") %>%
   str_replace(" ist ein ", "male")

 # The leading run of words (separated by whitespace or hyphens) at the start
 # of the paragraph is used as the name.
 names <- text %>%
   str_extract("^([:upper:]?[:lower:]+[\\s\\-]?)*") %>%
   str_trim()

 # Final lookup table: one row per scraped article.
 gender <- tibble(name = names,
                  gender = gender)
 ```