From e36498da69e01ff71e2c41ac3fe6b6e5cb6671e3 Mon Sep 17 00:00:00 2001
From: flavis <christian@flavigny.de>
Date: Thu, 24 Jun 2021 21:45:52 +0200
Subject: [PATCH] make hateimparlament a package, create stub vignette with
 funwithdata content

---
 DESCRIPTION                    | 26 ++++++++++++++++++++++++
 NAMESPACE                      |  7 +++++++
 {scraping => R}/fetch.R        | 25 +++++++++++------------
 R/hateimparlament-package.R    | 15 ++++++++++++++
 {utils => R}/helpers.R         |  0
 {scraping => R}/parse.R        | 20 ++++++-------------
 {scraping => R}/repair.R       |  2 --
 analysis/funwithdata.R         | 19 ------------------
 man/hateimparlament-package.Rd | 18 +++++++++++++++++
 scraping/config.R              |  2 --
 vignettes/.gitignore           |  2 ++
 vignettes/funwithdata.Rmd      | 36 ++++++++++++++++++++++++++++++++++
 12 files changed, 121 insertions(+), 51 deletions(-)
 create mode 100644 DESCRIPTION
 create mode 100644 NAMESPACE
 rename {scraping => R}/fetch.R (66%)
 create mode 100644 R/hateimparlament-package.R
 rename {utils => R}/helpers.R (100%)
 rename {scraping => R}/parse.R (91%)
 rename {scraping => R}/repair.R (98%)
 delete mode 100644 analysis/funwithdata.R
 create mode 100644 man/hateimparlament-package.Rd
 delete mode 100644 scraping/config.R
 create mode 100644 vignettes/.gitignore
 create mode 100644 vignettes/funwithdata.Rmd

diff --git a/DESCRIPTION b/DESCRIPTION
new file mode 100644
index 0000000..cee53fb
--- /dev/null
+++ b/DESCRIPTION
@@ -0,0 +1,27 @@
+Package: hateimparlament
+Title: Protocolanalysis of German Bundestag
+Version: 0.0.0.9000
+Authors@R: 
+    person(given = "First",
+           family = "Last",
+           role = c("aut", "cre"),
+           email = "first.last@example.com",
+           comment = c(ORCID = "YOUR-ORCID-ID"))
+Description: Downloads, parses and analyses protocols of the current German parliament (Bundestag).
+License: `use_mit_license()`, `use_gpl3_license()` or friends to pick a
+    license
+Encoding: UTF-8
+LazyData: true
+Roxygen: list(markdown = TRUE)
+RoxygenNote: 7.1.1
+Imports: 
+    dplyr,
+    pbapply,
+    rvest,
+    stringr,
+    tibble,
+    xml2
+Suggests: 
+    rmarkdown,
+    knitr
+VignetteBuilder: knitr
diff --git a/NAMESPACE b/NAMESPACE
new file mode 100644
index 0000000..d2b8265
--- /dev/null
+++ b/NAMESPACE
@@ -0,0 +1,7 @@
+# Generated by roxygen2: do not edit by hand
+
+import(dplyr)
+import(pbapply)
+import(stringr)
+import(tibble)
+import(xml2)
diff --git a/scraping/fetch.R b/R/fetch.R
similarity index 66%
rename from scraping/fetch.R
rename to R/fetch.R
index dd3bb9b..766eaaa 100644
--- a/scraping/fetch.R
+++ b/R/fetch.R
@@ -1,9 +1,3 @@
-source("../utils/helpers.R")
-source("config.R")
-library(rvest)
-library(stringr)
-library(pbapply)
-
 mk_absolute_url <- function(path) paste0("https://www.bundestag.de", path)
 
 mk_url <- function(offset) {
@@ -11,30 +5,33 @@ mk_url <- function(offset) {
                                 offset)
 }
 
-download_protocol <- function(path, name) {
-    fp <- paste0(DOWNLOAD_DIR, name)
+download_protocol <- function(path, name, download_dir) { # save the file at `path` as `name` inside `download_dir`
+    fp <- file.path(download_dir, name) # file.path() adds the separator, so download_dir may omit the trailing "/"
     try %$% download.file(mk_absolute_url(path), fp, quiet=T)
     progress <<- progress + 1
     setTimerProgressBar(pb, progress)
 }
 
-fetch_batch <- function(offset) {
+fetch_batch <- function(offset, download_dir) { # download every record linked on the listing page at `offset`; TRUE if any link was found
     stopifnot("Offset must be numeric" = is.numeric(offset))
     mk_url(offset) %>%
-        read_html() %>%
+        rvest::read_html() %>% # qualified call: rvest is not imported via NAMESPACE
         as.character() %>%
         str_match_all("/resource/blob/.*?/([0-9]*-data\\.xml)") %>%
         `[[`(1) ->
         paths
-    mapply(download_protocol, paths[,1], paths[,2])
+    mapply(download_protocol,
+           paths[,1], # column 1: full matched resource path
+           paths[,2], # column 2: captured file name (*-data.xml)
+           MoreArgs=list(download_dir = download_dir)) # same target directory for every download
     return(length(paths) > 0)
 }
 
 # TODO: error handling
 # - what if: page not reachable
 # - wrong format, etc.
-fetch_all <- function() {
-    cat("Fetching all available protocols from bundestag.de. This may take a while ...\n")
+fetch_all <- function(download_dir="records/") {
+    cat("Fetching all available records from bundestag.de. This may take a while ...\n")
     
     # create progress bar
     pb <<- timerProgressBar(min=0, max=250, width=40, char="+")
@@ -43,7 +40,7 @@ fetch_all <- function() {
     on.exit({close(pb); cat("Done.\n")})
     # fetch batch by batch
     offset <- 0
-    while(fetch_batch(offset)) offset <- offset + 10
+    while(fetch_batch(offset, download_dir)) offset <- offset + 10
     # if successful, set progressbar to 100%
     setTimerProgressBar(pb, 250)
 }
diff --git a/R/hateimparlament-package.R b/R/hateimparlament-package.R
new file mode 100644
index 0000000..b2cb3a3
--- /dev/null
+++ b/R/hateimparlament-package.R
@@ -0,0 +1,15 @@
+#' @details
+#' hateimparlament ist ein großartiges Paket!
+#' @import tibble
+#' @import dplyr
+#' @import pbapply
+#' @import stringr
+#' @import xml2
+#' @keywords internal
+"_PACKAGE"
+
+# The following block is used by usethis to automatically manage
+# roxygen namespace tags. Modify with care!
+## usethis namespace: start
+## usethis namespace: end
+NULL
diff --git a/utils/helpers.R b/R/helpers.R
similarity index 100%
rename from utils/helpers.R
rename to R/helpers.R
diff --git a/scraping/parse.R b/R/parse.R
similarity index 91%
rename from scraping/parse.R
rename to R/parse.R
index cf5dd3c..ebe97c3 100644
--- a/scraping/parse.R
+++ b/R/parse.R
@@ -1,17 +1,9 @@
-source("config.R")
-source("../utils/helpers.R")
-library("xml2")
-library(tibble)
-library(dplyr)
-library(magrittr)
-library(pbapply)
-
 # for usage see the example at the end
 
-read_all <- function() {
-    cat("Reading all protocols from", DOWNLOAD_DIR, "\n")
-    available_protocols <- list.files(DOWNLOAD_DIR)
-    res <- pblapply(available_protocols, read_one)
+read_all <- function(path="records/") {
+    cat("Reading all records from", path, "\n")
+    available_protocols <- list.files(path)
+    res <- pblapply(available_protocols, read_one, path=path)
 
     lapply(res, `[[`, "redner") %>%
         bind_rows() %>%
@@ -32,8 +24,8 @@ read_all <- function() {
 }
 
 # this reads all currently parseable data from one xml
-read_one <- function(name) {
-    x <- tryCatch(read_xml(paste0(DOWNLOAD_DIR, name)),
+read_one <- function(name, path) {
+    x <- tryCatch(read_xml(paste0(path, name)),
                   error = function(c) NULL)
     if (is.null(x)) return(NULL)
     cs <- xml_children(x)
diff --git a/scraping/repair.R b/R/repair.R
similarity index 98%
rename from scraping/repair.R
rename to R/repair.R
index 76d1f21..bffc2ed 100644
--- a/scraping/repair.R
+++ b/R/repair.R
@@ -1,4 +1,3 @@
-source("../utils/helpers.R")
 fraktionen <- c("AFD" = "AfD",
                 "BÜNDNIS90/" = "BÜNDNIS 90 / DIE GRÜNEN",
                 "BÜNDNIS90/DIEGRÜNEN" = "BÜNDNIS 90 / DIE GRÜNEN",
@@ -48,4 +47,3 @@ repair <- function(parse_output) {
          reden = repair_reden(parse_output$reden),
          talks = repair_talks(parse_output$talks))
 }
-
diff --git a/analysis/funwithdata.R b/analysis/funwithdata.R
deleted file mode 100644
index 9277361..0000000
--- a/analysis/funwithdata.R
+++ /dev/null
@@ -1,19 +0,0 @@
-library(tidyverse)
-source("../scraping/fetch.R")
-source("../scraping/parse.R")
-source("../scraping/repair.R")
-
-# fetch_all()
-read_all() %>% repair() -> res
-
-reden <- res$reden
-redner <- res$redner
-talks <- res$talks
-
-# first tries
-
-left_join(reden, redner, by=c("redner" = "id")) %>%
-    group_by(fraktion) %>%
-    summarize(n = n()) %>%
-    ggplot(aes(x = fraktion, y = n)) +
-    geom_bar(stat = "identity")
diff --git a/man/hateimparlament-package.Rd b/man/hateimparlament-package.Rd
new file mode 100644
index 0000000..44ff202
--- /dev/null
+++ b/man/hateimparlament-package.Rd
@@ -0,0 +1,18 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/hateimparlament-package.R
+\docType{package}
+\name{hateimparlament-package}
+\alias{hateimparlament}
+\alias{hateimparlament-package}
+\title{hateimparlament: Protocolanalysis of German Bundestag}
+\description{
+Downloads, parses and analyses protocols of the current German parliament (Bundestag).
+}
+\details{
+hateimparlament ist ein großartiges Paket!
+}
+\author{
+\strong{Maintainer}: First Last \email{first.last@example.com} (\href{https://orcid.org/YOUR-ORCID-ID}{ORCID})
+
+}
+\keyword{internal}
diff --git a/scraping/config.R b/scraping/config.R
deleted file mode 100644
index 740a1e2..0000000
--- a/scraping/config.R
+++ /dev/null
@@ -1,2 +0,0 @@
-DOWNLOAD_DIR = "../data/" # warning: this is not created (should maybe)
-
diff --git a/vignettes/.gitignore b/vignettes/.gitignore
new file mode 100644
index 0000000..097b241
--- /dev/null
+++ b/vignettes/.gitignore
@@ -0,0 +1,2 @@
+*.html
+*.R
diff --git a/vignettes/funwithdata.Rmd b/vignettes/funwithdata.Rmd
new file mode 100644
index 0000000..1658939
--- /dev/null
+++ b/vignettes/funwithdata.Rmd
@@ -0,0 +1,36 @@
+---
+title: "funwithdata"
+output: rmarkdown::html_vignette
+vignette: >
+  %\VignetteIndexEntry{funwithdata}
+  %\VignetteEngine{knitr::rmarkdown}
+  %\VignetteEncoding{UTF-8}
+---
+
+```{r, include = FALSE}
+knitr::opts_chunk$set(
+  collapse = TRUE,
+  comment = "#>"
+)
+```
+
+```r
+read_all() %>% repair() -> res
+
+reden <- res$reden
+redner <- res$redner
+talks <- res$talks
+
+# first tries
+
+left_join(reden, redner, by=c("redner" = "id")) %>%
+    group_by(fraktion) %>%
+    summarize(n = n()) %>%
+    ggplot(aes(x = fraktion, y = n)) +
+    geom_bar(stat = "identity")
+```
+
+
+```{r setup}
+library(hateimparlament)
+```