source("../utils/helpers.R")
source("config.R")

library(rvest)
library(stringr)

# Prefix a site-relative path with the Bundestag host.
#
# path: character, a path beginning with "/".
# Returns the absolute URL as a character vector.
mk_absolute_url <- function(path) {
  paste0("https://www.bundestag.de", path)
}

# Build the absolute URL of the open-data filter-list page that starts at
# `offset`.
#
# offset: whole number, formatted into the query string with "%d".
mk_url <- function(offset) {
  # NOTE(review): `%$%` is not defined in this file. It is presumably a
  # function-application operator (`f %$% x` == `f(x)`) provided by
  # ../utils/helpers.R -- confirm.
  mk_absolute_url %$%
    sprintf("/ajax/filterlist/de/services/opendata/543410-543410?offset=%d", offset)
}

# Download a single protocol file into DOWNLOAD_DIR.
#
# path: site-relative path of the resource.
# name: file name to store the download under.
#
# Returns (invisibly) an integer status following the download.file()
# convention: 0 on success, non-zero on failure. A failed download raises a
# warning naming the URL instead of aborting, so one bad file does not stop
# the rest of the batch.
download_protocol <- function(path, name) {
  # DOWNLOAD_DIR is not defined in this file (presumably set in config.R).
  # paste0() inserts no separator, so it has to end with a path separator.
  fp <- paste0(DOWNLOAD_DIR, name)
  url <- mk_absolute_url(path)

  status <- tryCatch(
    # mode = "wb" keeps the XML byte-for-byte identical on every platform.
    download.file(url, fp, quiet = TRUE, mode = "wb"),
    error = function(e) {
      warning(
        "Failed to download ", url, ": ", conditionMessage(e),
        call. = FALSE
      )
      1L
    }
  )

  invisible(status)
}

# Fetch one page of the protocol listing and download every protocol file
# linked from it.
#
# offset: offset of the listing page to request.
#
# Returns TRUE if the page contained at least one protocol link, FALSE
# otherwise. Stops with an error that names the offset and URL when the
# listing page itself cannot be read.
fetch_batch <- function(offset) {
  url <- mk_url(offset)

  page <- tryCatch(
    read_html(url),
    error = function(e) {
      stop(
        "Could not read protocol listing at offset ", offset,
        " (", url, "): ", conditionMessage(e),
        call. = FALSE
      )
    }
  )

  # One row per match: column 1 is the full resource path, column 2 the
  # captured file name.
  paths <- str_match_all(
    as.character(page),
    "/resource/blob/.*?/([0-9]*-data\\.xml)"
  )[[1]]

  for (i in seq_len(nrow(paths))) {
    download_protocol(paths[i, 1], paths[i, 2])
  }

  nrow(paths) > 0
}

# TODO: error handling
#  - an unreachable listing page now stops with a descriptive error; decide
#    whether it should be retried instead
#  - a page in an unexpected format yields no matches and is currently
#    indistinguishable from the end of the listing

# Download all protocols by walking the listing page by page until a page
# yields no protocol links.
fetch_all <- function() {
  offset <- 0
  # The offset advances by 10 per request (presumably the listing's page
  # size -- confirm against the endpoint).
  while (fetch_batch(offset)) {
    offset <- offset + 10
  }
}