From e36498da69e01ff71e2c41ac3fe6b6e5cb6671e3 Mon Sep 17 00:00:00 2001 From: flavis Date: Thu, 24 Jun 2021 21:45:52 +0200 Subject: [PATCH] make hateimparlament a package, create stub vignette with funwithdata content --- DESCRIPTION | 26 ++++++++++++++++++++++++ NAMESPACE | 7 +++++++ {scraping => R}/fetch.R | 25 +++++++++++------------ R/hateimparlament-package.R | 15 ++++++++++++++ {utils => R}/helpers.R | 0 {scraping => R}/parse.R | 20 ++++++------------- {scraping => R}/repair.R | 2 -- analysis/funwithdata.R | 19 ------------------ man/hateimparlament-package.Rd | 18 +++++++++++++++++ scraping/config.R | 2 -- vignettes/.gitignore | 2 ++ vignettes/funwithdata.Rmd | 36 ++++++++++++++++++++++++++++++++++ 12 files changed, 121 insertions(+), 51 deletions(-) create mode 100644 DESCRIPTION create mode 100644 NAMESPACE rename {scraping => R}/fetch.R (66%) create mode 100644 R/hateimparlament-package.R rename {utils => R}/helpers.R (100%) rename {scraping => R}/parse.R (91%) rename {scraping => R}/repair.R (98%) delete mode 100644 analysis/funwithdata.R create mode 100644 man/hateimparlament-package.Rd delete mode 100644 scraping/config.R create mode 100644 vignettes/.gitignore create mode 100644 vignettes/funwithdata.Rmd diff --git a/DESCRIPTION b/DESCRIPTION new file mode 100644 index 0000000..cee53fb --- /dev/null +++ b/DESCRIPTION @@ -0,0 +1,26 @@ +Package: hateimparlament +Title: Protocolanalysis of German Bundestag +Version: 0.0.0.9000 +Authors@R: + person(given = "First", + family = "Last", + role = c("aut", "cre"), + email = "first.last@example.com", + comment = c(ORCID = "YOUR-ORCID-ID")) +Description: Downloads, parses and analyses protocols of the current German parliament (Bundestag). +License: `use_mit_license()`, `use_gpl3_license()` or friends to pick a + license +Encoding: UTF-8 +LazyData: true +Roxygen: list(markdown = TRUE) +RoxygenNote: 7.1.1 +Imports: + dplyr, + pbapply, + rvest, + stringr, + xml2 +Suggests: + rmarkdown, + knitr +VignetteBuilder: knitr diff --git a/NAMESPACE b/NAMESPACE new file mode 100644 index 0000000..d2b8265 --- /dev/null +++ b/NAMESPACE @@ -0,0 +1,7 @@ +# Generated by roxygen2: do not edit by hand + +import(dplyr) +import(pbapply) +import(stringr) +import(tibble) +import(xml2) diff --git a/scraping/fetch.R b/R/fetch.R similarity index 66% rename from scraping/fetch.R rename to R/fetch.R index dd3bb9b..766eaaa 100644 --- a/scraping/fetch.R +++ b/R/fetch.R @@ -1,9 +1,3 @@ -source("../utils/helpers.R") -source("config.R") -library(rvest) -library(stringr) -library(pbapply) - mk_absolute_url <- function(path) paste0("https://www.bundestag.de", path) mk_url <- function(offset) { @@ -11,30 +5,33 @@ mk_url <- function(offset) { offset) } -download_protocol <- function(path, name) { - fp <- paste0(DOWNLOAD_DIR, name) +download_protocol <- function(path, name, download_dir) { + fp <- paste0(download_dir, name) try %$% download.file(mk_absolute_url(path), fp, quiet=T) progress <<- progress + 1 setTimerProgressBar(pb, progress) } -fetch_batch <- function(offset) { +fetch_batch <- function(offset, download_dir) { stopifnot("Offset must be numeric" = is.numeric(offset)) mk_url(offset) %>% - read_html() %>% + rvest::read_html() %>% as.character() %>% str_match_all("/resource/blob/.*?/([0-9]*-data\\.xml)") %>% `[[`(1) -> paths - mapply(download_protocol, paths[,1], paths[,2]) + mapply(download_protocol, + paths[,1], + paths[,2], + MoreArgs=list(download_dir = download_dir)) return(length(paths) > 0) } # TODO: error handling # - what if: page not reachable # - wrong format, etc. -fetch_all <- function() { - cat("Fetching all available protocols from bundestag.de. This may take a while ...\n") +fetch_all <- function(download_dir="records/") { + cat("Fetching all available records from bundestag.de. This may take a while ...\n") # create progress bar pb <<- timerProgressBar(min=0, max=250, width=40, char="+") @@ -43,7 +40,7 @@ fetch_all <- function() { on.exit({close(pb); cat("Done.\n")}) # fetch batch by batch offset <- 0 - while(fetch_batch(offset)) offset <- offset + 10 + while(fetch_batch(offset, download_dir)) offset <- offset + 10 # if successful, set progressbar to 100% setTimerProgressBar(pb, 250) } diff --git a/R/hateimparlament-package.R b/R/hateimparlament-package.R new file mode 100644 index 0000000..b2cb3a3 --- /dev/null +++ b/R/hateimparlament-package.R @@ -0,0 +1,15 @@ +#' @details +#' hateimparlament ist ein großartiges Paket! +#' @import tibble +#' @import dplyr +#' @import pbapply +#' @import stringr +#' @import xml2 +#' @keywords internal +"_PACKAGE" + +# The following block is used by usethis to automatically manage +# roxygen namespace tags. Modify with care! +## usethis namespace: start +## usethis namespace: end +NULL diff --git a/utils/helpers.R b/R/helpers.R similarity index 100% rename from utils/helpers.R rename to R/helpers.R diff --git a/scraping/parse.R b/R/parse.R similarity index 91% rename from scraping/parse.R rename to R/parse.R index cf5dd3c..ebe97c3 100644 --- a/scraping/parse.R +++ b/R/parse.R @@ -1,17 +1,9 @@ -source("config.R") -source("../utils/helpers.R") -library("xml2") -library(tibble) -library(dplyr) -library(magrittr) -library(pbapply) - # for usage see the example at the end -read_all <- function() { - cat("Reading all protocols from", DOWNLOAD_DIR, "\n") - available_protocols <- list.files(DOWNLOAD_DIR) - res <- pblapply(available_protocols, read_one) +read_all <- function(path="records/") { + cat("Reading all records from", path, "\n") + available_protocols <- list.files(path) + res <- pblapply(available_protocols, read_one, path=path) lapply(res, `[[`, "redner") %>% bind_rows() %>% @@ -32,8 +24,8 @@ read_all <- function() { } # this reads all currently parseable data from one xml -read_one <- function(name) { - x <- tryCatch(read_xml(paste0(DOWNLOAD_DIR, name)), +read_one <- function(name, path) { + x <- tryCatch(read_xml(paste0(path, name)), error = function(c) NULL) if (is.null(x)) return(NULL) cs <- xml_children(x) diff --git a/scraping/repair.R b/R/repair.R similarity index 98% rename from scraping/repair.R rename to R/repair.R index 76d1f21..bffc2ed 100644 --- a/scraping/repair.R +++ b/R/repair.R @@ -1,4 +1,3 @@ -source("../utils/helpers.R") fraktionen <- c("AFD" = "AfD", "BÜNDNIS90/" = "BÜNDNIS 90 / DIE GRÜNEN", "BÜNDNIS90/DIEGRÜNEN" = "BÜNDNIS 90 / DIE GRÜNEN", @@ -48,4 +47,3 @@ repair <- function(parse_output) { reden = repair_reden(parse_output$reden), talks = repair_talks(parse_output$talks)) } - diff --git a/analysis/funwithdata.R b/analysis/funwithdata.R deleted file mode 100644 index 9277361..0000000 --- a/analysis/funwithdata.R +++ /dev/null @@ -1,19 +0,0 @@ -library(tidyverse) -source("../scraping/fetch.R") -source("../scraping/parse.R") -source("../scraping/repair.R") - -# fetch_all() -read_all() %>% repair() -> res - -reden <- res$reden -redner <- res$redner -talks <- res$talks - -# first tries - -left_join(reden, redner, by=c("redner" = "id")) %>% - group_by(fraktion) %>% - summarize(n = n()) %>% - ggplot(aes(x = fraktion, y = n)) + - geom_bar(stat = "identity") diff --git a/man/hateimparlament-package.Rd b/man/hateimparlament-package.Rd new file mode 100644 index 0000000..44ff202 --- /dev/null +++ b/man/hateimparlament-package.Rd @@ -0,0 +1,18 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/hateimparlament-package.R +\docType{package} +\name{hateimparlament-package} +\alias{hateimparlament} +\alias{hateimparlament-package} +\title{hateimparlament: Protocolanalysis of German Bundestag} +\description{ +Downloads, parses and analyses protocols of the current German parliament (Bundestag). +} +\details{ +hateimparlament ist ein großartiges Paket! +} +\author{ +\strong{Maintainer}: First Last \email{first.last@example.com} (\href{https://orcid.org/YOUR-ORCID-ID}{ORCID}) + +} +\keyword{internal} diff --git a/scraping/config.R b/scraping/config.R deleted file mode 100644 index 740a1e2..0000000 --- a/scraping/config.R +++ /dev/null @@ -1,2 +0,0 @@ -DOWNLOAD_DIR = "../data/" # warning: this is not created (should maybe) - diff --git a/vignettes/.gitignore b/vignettes/.gitignore new file mode 100644 index 0000000..097b241 --- /dev/null +++ b/vignettes/.gitignore @@ -0,0 +1,2 @@ +*.html +*.R diff --git a/vignettes/funwithdata.Rmd b/vignettes/funwithdata.Rmd new file mode 100644 index 0000000..1658939 --- /dev/null +++ b/vignettes/funwithdata.Rmd @@ -0,0 +1,36 @@ +--- +title: "funwithdata" +output: rmarkdown::html_vignette +vignette: > + %\VignetteIndexEntry{funwithdata} + %\VignetteEngine{knitr::rmarkdown} + %\VignetteEncoding{UTF-8} +--- + +```{r, include = FALSE} +knitr::opts_chunk$set( + collapse = TRUE, + comment = "#>" +) +``` + +```r +read_all() %>% repair() -> res + +reden <- res$reden +redner <- res$redner +talks <- res$talks + +# first tries + +left_join(reden, redner, by=c("redner" = "id")) %>% + group_by(fraktion) %>% + summarize(n = n()) %>% + ggplot(aes(x = fraktion, y = n)) + + geom_bar(stat = "identity") +``` + + +```{r setup} +library(hateimparlament) +```