Преглед изворни кода

make hateimparlament a package, create stub vignette with funwithdata content

package
flavis пре 4 година
родитељ
комит
e36498da69
12 измењених фајлова са 121 додато и 51 уклоњено
  1. +26
    -0
      DESCRIPTION
  2. +7
    -0
      NAMESPACE
  3. +11
    -14
      R/fetch.R
  4. +15
    -0
      R/hateimparlament-package.R
  5. +0
    -0
      R/helpers.R
  6. +6
    -14
      R/parse.R
  7. +0
    -2
      R/repair.R
  8. +0
    -19
      analysis/funwithdata.R
  9. +18
    -0
      man/hateimparlament-package.Rd
  10. +0
    -2
      scraping/config.R
  11. +2
    -0
      vignettes/.gitignore
  12. +36
    -0
      vignettes/funwithdata.Rmd

+ 26
- 0
DESCRIPTION Прегледај датотеку

@@ -0,0 +1,26 @@
Package: hateimparlament
Title: Protocolanalysis of German Bundestag
Version: 0.0.0.9000
Authors@R:
person(given = "First",
family = "Last",
role = c("aut", "cre"),
email = "first.last@example.com",
comment = c(ORCID = "YOUR-ORCID-ID"))
Description: Downloads, parses and analyses protocols of the current German parliament (Bundestag).
License: `use_mit_license()`, `use_gpl3_license()` or friends to pick a
license
Encoding: UTF-8
LazyData: true
Roxygen: list(markdown = TRUE)
RoxygenNote: 7.1.1
Imports:
dplyr,
pbapply,
rvest,
stringr,
xml2
Suggests:
rmarkdown,
knitr
VignetteBuilder: knitr

+ 7
- 0
NAMESPACE Прегледај датотеку

@@ -0,0 +1,7 @@
# Generated by roxygen2: do not edit by hand

import(dplyr)
import(pbapply)
import(stringr)
import(tibble)
import(xml2)

scraping/fetch.R → R/fetch.R Прегледај датотеку

@@ -1,9 +1,3 @@
source("../utils/helpers.R")
source("config.R")
library(rvest)
library(stringr)
library(pbapply)

mk_absolute_url <- function(path) paste0("https://www.bundestag.de", path)

mk_url <- function(offset) {
@@ -11,30 +5,33 @@ mk_url <- function(offset) {
offset)
}

download_protocol <- function(path, name) {
fp <- paste0(DOWNLOAD_DIR, name)
download_protocol <- function(path, name, download_dir) {
fp <- paste0(download_dir, name)
try %$% download.file(mk_absolute_url(path), fp, quiet=T)
progress <<- progress + 1
setTimerProgressBar(pb, progress)
}

fetch_batch <- function(offset) {
fetch_batch <- function(offset, download_dir) {
stopifnot("Offset must be numeric" = is.numeric(offset))
mk_url(offset) %>%
read_html() %>%
rvest::read_html() %>%
as.character() %>%
str_match_all("/resource/blob/.*?/([0-9]*-data\\.xml)") %>%
`[[`(1) ->
paths
mapply(download_protocol, paths[,1], paths[,2])
mapply(download_protocol,
paths[,1],
paths[,2],
MoreArgs=list(download_dir = download_dir))
return(length(paths) > 0)
}

# TODO: error handling
# - what if: page not reachable
# - wrong format, etc.
fetch_all <- function() {
cat("Fetching all available protocols from bundestag.de. This may take a while ...\n")
fetch_all <- function(download_dir="records/") {
cat("Fetching all available records from bundestag.de. This may take a while ...\n")
# create progress bar
pb <<- timerProgressBar(min=0, max=250, width=40, char="+")
@@ -43,7 +40,7 @@ fetch_all <- function() {
on.exit({close(pb); cat("Done.\n")})
# fetch batch by batch
offset <- 0
while(fetch_batch(offset)) offset <- offset + 10
while(fetch_batch(offset, download_dir)) offset <- offset + 10
# if successful, set progressbar to 100%
setTimerProgressBar(pb, 250)
}

+ 15
- 0
R/hateimparlament-package.R Прегледај датотеку

@@ -0,0 +1,15 @@
#' @details
#' hateimparlament ist ein großartiges Paket!
#' @import tibble
#' @import dplyr
#' @import pbapply
#' @import stringr
#' @import xml2
#' @keywords internal
"_PACKAGE"

# The following block is used by usethis to automatically manage
# roxygen namespace tags. Modify with care!
## usethis namespace: start
## usethis namespace: end
NULL

utils/helpers.R → R/helpers.R Прегледај датотеку


scraping/parse.R → R/parse.R Прегледај датотеку

@@ -1,17 +1,9 @@
source("config.R")
source("../utils/helpers.R")
library("xml2")
library(tibble)
library(dplyr)
library(magrittr)
library(pbapply)

# for usage see the example at the end

read_all <- function() {
cat("Reading all protocols from", DOWNLOAD_DIR, "\n")
available_protocols <- list.files(DOWNLOAD_DIR)
res <- pblapply(available_protocols, read_one)
read_all <- function(path="records/") {
cat("Reading all records from", path, "\n")
available_protocols <- list.files(path)
res <- pblapply(available_protocols, read_one, path=path)

lapply(res, `[[`, "redner") %>%
bind_rows() %>%
@@ -32,8 +24,8 @@ read_all <- function() {
}

# this reads all currently parseable data from one xml
read_one <- function(name) {
x <- tryCatch(read_xml(paste0(DOWNLOAD_DIR, name)),
read_one <- function(name, path) {
x <- tryCatch(read_xml(paste0(path, name)),
error = function(c) NULL)
if (is.null(x)) return(NULL)
cs <- xml_children(x)

scraping/repair.R → R/repair.R Прегледај датотеку

@@ -1,4 +1,3 @@
source("../utils/helpers.R")
fraktionen <- c("AFD" = "AfD",
"BÜNDNIS90/" = "BÜNDNIS 90 / DIE GRÜNEN",
"BÜNDNIS90/DIEGRÜNEN" = "BÜNDNIS 90 / DIE GRÜNEN",
@@ -48,4 +47,3 @@ repair <- function(parse_output) {
reden = repair_reden(parse_output$reden),
talks = repair_talks(parse_output$talks))
}


+ 0
- 19
analysis/funwithdata.R Прегледај датотеку

@@ -1,19 +0,0 @@
library(tidyverse)
source("../scraping/fetch.R")
source("../scraping/parse.R")
source("../scraping/repair.R")

# fetch_all()
read_all() %>% repair() -> res

reden <- res$reden
redner <- res$redner
talks <- res$talks

# first tries

left_join(reden, redner, by=c("redner" = "id")) %>%
group_by(fraktion) %>%
summarize(n = n()) %>%
ggplot(aes(x = fraktion, y = n)) +
geom_bar(stat = "identity")

+ 18
- 0
man/hateimparlament-package.Rd Прегледај датотеку

@@ -0,0 +1,18 @@
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/hateimparlament-package.R
\docType{package}
\name{hateimparlament-package}
\alias{hateimparlament}
\alias{hateimparlament-package}
\title{hateimparlament: Protocolanalysis of German Bundestag}
\description{
Downloads, parses and analyses protocols of the current German parliament (Bundestag).
}
\details{
hateimparlament ist ein großartiges Paket!
}
\author{
\strong{Maintainer}: First Last \email{first.last@example.com} (\href{https://orcid.org/YOUR-ORCID-ID}{ORCID})

}
\keyword{internal}

+ 0
- 2
scraping/config.R Прегледај датотеку

@@ -1,2 +0,0 @@
DOWNLOAD_DIR = "../data/" # warning: this is not created (should maybe)


+ 2
- 0
vignettes/.gitignore Прегледај датотеку

@@ -0,0 +1,2 @@
*.html
*.R

+ 36
- 0
vignettes/funwithdata.Rmd Прегледај датотеку

@@ -0,0 +1,36 @@
---
title: "funwithdata"
output: rmarkdown::html_vignette
vignette: >
%\VignetteIndexEntry{funwithdata}
%\VignetteEngine{knitr::rmarkdown}
%\VignetteEncoding{UTF-8}
---

```{r, include = FALSE}
knitr::opts_chunk$set(
collapse = TRUE,
comment = "#>"
)
```

```r
read_all() %>% repair() -> res

reden <- res$reden
redner <- res$redner
talks <- res$talks

# first tries

left_join(reden, redner, by=c("redner" = "id")) %>%
group_by(fraktion) %>%
summarize(n = n()) %>%
ggplot(aes(x = fraktion, y = n)) +
geom_bar(stat = "identity")
```


```{r setup}
library(hateimparlament)
```

Loading…
Откажи
Сачувај