| @@ -0,0 +1,26 @@ | |||||
| Package: hateimparlament | |||||
| Title: Protocolanalysis of German Bundestag | |||||
| Version: 0.0.0.9000 | |||||
| Authors@R: | |||||
| person(given = "First", | |||||
| family = "Last", | |||||
| role = c("aut", "cre"), | |||||
| email = "first.last@example.com", | |||||
| comment = c(ORCID = "YOUR-ORCID-ID")) | |||||
| Description: Downloads, parses and analyses protocols of the current German parliament (Bundestag). | |||||
| License: `use_mit_license()`, `use_gpl3_license()` or friends to pick a | |||||
| license | |||||
| Encoding: UTF-8 | |||||
| LazyData: true | |||||
| Roxygen: list(markdown = TRUE) | |||||
| RoxygenNote: 7.1.1 | |||||
| Imports: | |||||
| dplyr, | |||||
| pbapply, | |||||
| rvest, | |||||
| stringr, | |||||
| xml2 | |||||
| Suggests: | |||||
| rmarkdown, | |||||
| knitr | |||||
| VignetteBuilder: knitr | |||||
| @@ -0,0 +1,7 @@ | |||||
| # Generated by roxygen2: do not edit by hand | |||||
| import(dplyr) | |||||
| import(pbapply) | |||||
| import(stringr) | |||||
| import(tibble) | |||||
| import(xml2) | |||||
| @@ -1,9 +1,3 @@ | |||||
| source("../utils/helpers.R") | |||||
| source("config.R") | |||||
| library(rvest) | |||||
| library(stringr) | |||||
| library(pbapply) | |||||
| mk_absolute_url <- function(path) paste0("https://www.bundestag.de", path) | mk_absolute_url <- function(path) paste0("https://www.bundestag.de", path) | ||||
| mk_url <- function(offset) { | mk_url <- function(offset) { | ||||
| @@ -11,30 +5,33 @@ mk_url <- function(offset) { | |||||
| offset) | offset) | ||||
| } | } | ||||
| download_protocol <- function(path, name) { | |||||
| fp <- paste0(DOWNLOAD_DIR, name) | |||||
| download_protocol <- function(path, name, download_dir) { | |||||
| fp <- paste0(download_dir, name) | |||||
| try %$% download.file(mk_absolute_url(path), fp, quiet=T) | try %$% download.file(mk_absolute_url(path), fp, quiet=T) | ||||
| progress <<- progress + 1 | progress <<- progress + 1 | ||||
| setTimerProgressBar(pb, progress) | setTimerProgressBar(pb, progress) | ||||
| } | } | ||||
| fetch_batch <- function(offset) { | |||||
| fetch_batch <- function(offset, download_dir) { | |||||
| stopifnot("Offset must be numeric" = is.numeric(offset)) | stopifnot("Offset must be numeric" = is.numeric(offset)) | ||||
| mk_url(offset) %>% | mk_url(offset) %>% | ||||
| read_html() %>% | |||||
| rvest::read_html() %>% | |||||
| as.character() %>% | as.character() %>% | ||||
| str_match_all("/resource/blob/.*?/([0-9]*-data\\.xml)") %>% | str_match_all("/resource/blob/.*?/([0-9]*-data\\.xml)") %>% | ||||
| `[[`(1) -> | `[[`(1) -> | ||||
| paths | paths | ||||
| mapply(download_protocol, paths[,1], paths[,2]) | |||||
| mapply(download_protocol, | |||||
| paths[,1], | |||||
| paths[,2], | |||||
| MoreArgs=list(download_dir = download_dir)) | |||||
| return(length(paths) > 0) | return(length(paths) > 0) | ||||
| } | } | ||||
| # TODO: error handling | # TODO: error handling | ||||
| # - what if: page not reachable | # - what if: page not reachable | ||||
| # - wrong format, etc. | # - wrong format, etc. | ||||
| fetch_all <- function() { | |||||
| cat("Fetching all available protocols from bundestag.de. This may take a while ...\n") | |||||
| fetch_all <- function(download_dir="records/") { | |||||
| cat("Fetching all available records from bundestag.de. This may take a while ...\n") | |||||
| # create progress bar | # create progress bar | ||||
| pb <<- timerProgressBar(min=0, max=250, width=40, char="+") | pb <<- timerProgressBar(min=0, max=250, width=40, char="+") | ||||
| @@ -43,7 +40,7 @@ fetch_all <- function() { | |||||
| on.exit({close(pb); cat("Done.\n")}) | on.exit({close(pb); cat("Done.\n")}) | ||||
| # fetch batch by batch | # fetch batch by batch | ||||
| offset <- 0 | offset <- 0 | ||||
| while(fetch_batch(offset)) offset <- offset + 10 | |||||
| while(fetch_batch(offset, download_dir)) offset <- offset + 10 | |||||
| # if successful, set progressbar to 100% | # if successful, set progressbar to 100% | ||||
| setTimerProgressBar(pb, 250) | setTimerProgressBar(pb, 250) | ||||
| } | } | ||||
| @@ -0,0 +1,15 @@ | |||||
| #' @details | |||||
| #' hateimparlament ist ein großartiges Paket! | |||||
| #' @import tibble | |||||
| #' @import dplyr | |||||
| #' @import pbapply | |||||
| #' @import stringr | |||||
| #' @import xml2 | |||||
| #' @keywords internal | |||||
| "_PACKAGE" | |||||
| # The following block is used by usethis to automatically manage | |||||
| # roxygen namespace tags. Modify with care! | |||||
| ## usethis namespace: start | |||||
| ## usethis namespace: end | |||||
| NULL | |||||
| @@ -1,17 +1,9 @@ | |||||
| source("config.R") | |||||
| source("../utils/helpers.R") | |||||
| library("xml2") | |||||
| library(tibble) | |||||
| library(dplyr) | |||||
| library(magrittr) | |||||
| library(pbapply) | |||||
| # for usage see the example at the end | # for usage see the example at the end | ||||
| read_all <- function() { | |||||
| cat("Reading all protocols from", DOWNLOAD_DIR, "\n") | |||||
| available_protocols <- list.files(DOWNLOAD_DIR) | |||||
| res <- pblapply(available_protocols, read_one) | |||||
| read_all <- function(path="records/") { | |||||
| cat("Reading all records from", path, "\n") | |||||
| available_protocols <- list.files(path) | |||||
| res <- pblapply(available_protocols, read_one, path=path) | |||||
| lapply(res, `[[`, "redner") %>% | lapply(res, `[[`, "redner") %>% | ||||
| bind_rows() %>% | bind_rows() %>% | ||||
| @@ -32,8 +24,8 @@ read_all <- function() { | |||||
| } | } | ||||
| # this reads all currently parseable data from one xml | # this reads all currently parseable data from one xml | ||||
| read_one <- function(name) { | |||||
| x <- tryCatch(read_xml(paste0(DOWNLOAD_DIR, name)), | |||||
| read_one <- function(name, path) { | |||||
| x <- tryCatch(read_xml(paste0(path, name)), | |||||
| error = function(c) NULL) | error = function(c) NULL) | ||||
| if (is.null(x)) return(NULL) | if (is.null(x)) return(NULL) | ||||
| cs <- xml_children(x) | cs <- xml_children(x) | ||||
| @@ -1,4 +1,3 @@ | |||||
| source("../utils/helpers.R") | |||||
| fraktionen <- c("AFD" = "AfD", | fraktionen <- c("AFD" = "AfD", | ||||
| "BÜNDNIS90/" = "BÜNDNIS 90 / DIE GRÜNEN", | "BÜNDNIS90/" = "BÜNDNIS 90 / DIE GRÜNEN", | ||||
| "BÜNDNIS90/DIEGRÜNEN" = "BÜNDNIS 90 / DIE GRÜNEN", | "BÜNDNIS90/DIEGRÜNEN" = "BÜNDNIS 90 / DIE GRÜNEN", | ||||
| @@ -48,4 +47,3 @@ repair <- function(parse_output) { | |||||
| reden = repair_reden(parse_output$reden), | reden = repair_reden(parse_output$reden), | ||||
| talks = repair_talks(parse_output$talks)) | talks = repair_talks(parse_output$talks)) | ||||
| } | } | ||||
| @@ -1,19 +0,0 @@ | |||||
| library(tidyverse) | |||||
| source("../scraping/fetch.R") | |||||
| source("../scraping/parse.R") | |||||
| source("../scraping/repair.R") | |||||
| # fetch_all() | |||||
| read_all() %>% repair() -> res | |||||
| reden <- res$reden | |||||
| redner <- res$redner | |||||
| talks <- res$talks | |||||
| # first tries | |||||
| left_join(reden, redner, by=c("redner" = "id")) %>% | |||||
| group_by(fraktion) %>% | |||||
| summarize(n = n()) %>% | |||||
| ggplot(aes(x = fraktion, y = n)) + | |||||
| geom_bar(stat = "identity") | |||||
| @@ -0,0 +1,18 @@ | |||||
| % Generated by roxygen2: do not edit by hand | |||||
| % Please edit documentation in R/hateimparlament-package.R | |||||
| \docType{package} | |||||
| \name{hateimparlament-package} | |||||
| \alias{hateimparlament} | |||||
| \alias{hateimparlament-package} | |||||
| \title{hateimparlament: Protocolanalysis of German Bundestag} | |||||
| \description{ | |||||
| Downloads, parses and analyses protocols of the current German parliament (Bundestag). | |||||
| } | |||||
| \details{ | |||||
| hateimparlament ist ein großartiges Paket! | |||||
| } | |||||
| \author{ | |||||
| \strong{Maintainer}: First Last \email{first.last@example.com} (\href{https://orcid.org/YOUR-ORCID-ID}{ORCID}) | |||||
| } | |||||
| \keyword{internal} | |||||
| @@ -1,2 +0,0 @@ | |||||
| DOWNLOAD_DIR = "../data/" # warning: this is not created (should maybe) | |||||
| @@ -0,0 +1,2 @@ | |||||
| *.html | |||||
| *.R | |||||
| @@ -0,0 +1,36 @@ | |||||
| --- | |||||
| title: "funwithdata" | |||||
| output: rmarkdown::html_vignette | |||||
| vignette: > | |||||
| %\VignetteIndexEntry{funwithdata} | |||||
| %\VignetteEngine{knitr::rmarkdown} | |||||
| %\VignetteEncoding{UTF-8} | |||||
| --- | |||||
| ```{r, include = FALSE} | |||||
| knitr::opts_chunk$set( | |||||
| collapse = TRUE, | |||||
| comment = "#>" | |||||
| ) | |||||
| ``` | |||||
| ```r | |||||
| read_all() %>% repair() -> res | |||||
| reden <- res$reden | |||||
| redner <- res$redner | |||||
| talks <- res$talks | |||||
| # first tries | |||||
| left_join(reden, redner, by=c("redner" = "id")) %>% | |||||
| group_by(fraktion) %>% | |||||
| summarize(n = n()) %>% | |||||
| ggplot(aes(x = fraktion, y = n)) + | |||||
| geom_bar(stat = "identity") | |||||
| ``` | |||||
| ```{r setup} | |||||
| library(hateimparlament) | |||||
| ``` | |||||