source("../utils/helpers.R")
source("config.R")

library(rvest)
library(stringr)
library(pbapply)

# Prefix a site-relative path with the bundestag.de host.
#
# path: character, site-relative path starting with "/".
# Returns the absolute URL as a character vector.
mk_absolute_url <- function(path) {
  paste0("https://www.bundestag.de", path)
}

# Build the absolute URL of the open-data filter list at a given offset.
#
# offset: number substituted into the `offset` query parameter via "%d".
# Returns the absolute URL as a character string.
mk_url <- function(offset) {
  mk_absolute_url(
    sprintf("/ajax/filterlist/de/services/opendata/543410-543410?offset=%d", offset)
  )
}

# Download a single protocol file and advance the global progress bar.
#
# path: site-relative path of the file to download.
# name: file name appended to DOWNLOAD_DIR to form the local destination.
#
# DOWNLOAD_DIR is not defined in this file (presumably it comes from
# config.R); it is concatenated with `name` without a separator.
# Relies on the globals `pb` and `progress` that fetch_all() creates.
download_protocol <- function(path, name) {
  fp <- paste0(DOWNLOAD_DIR, name)

  # Best effort: a failed download is reported and skipped so that the
  # remaining files are still fetched.
  tryCatch(
    download.file(mk_absolute_url(path), fp, quiet = TRUE),
    error = function(e) {
      message("Download of ", name, " failed: ", conditionMessage(e))
    }
  )

  # Count the file as processed regardless of the download outcome.
  progress <<- progress + 1
  setTimerProgressBar(pb, progress)
}

# Fetch one page of the filter list and download every protocol linked on it.
#
# offset: numeric offset passed to mk_url().
# Returns TRUE if the page contained at least one protocol link, else FALSE.
fetch_batch <- function(offset) {
  stopifnot("Offset must be numeric" = is.numeric(offset))

  # Column 1 holds the full matched path, column 2 the captured file name.
  paths <- mk_url(offset) %>%
    read_html() %>%
    as.character() %>%
    str_match_all("/resource/blob/.*?/([0-9]*-data\\.xml)") %>%
    `[[`(1)

  mapply(download_protocol, paths[, 1], paths[, 2])

  nrow(paths) > 0
}

# TODO: error handling
# - what if: page not reachable
# - wrong format, etc.

# Download all protocols by walking the filter list batch by batch until a
# page yields no more links.
#
# Creates the globals `pb` (progress bar) and `progress` (counter) that
# download_protocol() updates.
fetch_all <- function() {
  cat("Fetching all available protocols from bundestag.de. This may take a while ...\n")

  # Upper bound of the progress bar; presumably an estimate of the number of
  # available protocols -- TODO confirm.
  progress_max <- 250

  # create progress bar
  pb <<- timerProgressBar(min = 0, max = progress_max, width = 40, char = "+")
  progress <<- 0

  # close progress bar on exit (also on error)
  on.exit(
    {
      close(pb)
      cat("Done.\n")
    },
    add = TRUE
  )

  # fetch batch by batch; the offset advances in steps of 10
  offset <- 0
  while (fetch_batch(offset)) {
    offset <- offset + 10
  }

  # if successful, set progressbar to 100%
  setTimerProgressBar(pb, progress_max)
}