|
- source("../utils/helpers.R")
- source("config.R")
- library(rvest)
- library(stringr)
-
# Build an absolute bundestag.de URL from a site-relative path.
#
# path: character vector of site-relative paths, e.g. "/resource/blob/...".
#
# Returns a character vector of absolute URLs. A non-empty path that lacks
# its leading "/" gets one inserted, so the host name and the first path
# segment are never fused together; paths that already start with "/" (and
# empty strings) are concatenated unchanged.
mk_absolute_url <- function(path) {
  path <- as.character(path)
  needs_slash <- !is.na(path) & nzchar(path) & !startsWith(path, "/")
  paste0("https://www.bundestag.de", ifelse(needs_slash, "/", ""), path)
}
-
# Build the absolute URL of the open-data listing page that starts at
# `offset`.
#
# offset: number substituted into the "offset" query parameter via "%d".
#
# NOTE(review): `%$%` is defined outside this file's visible part (presumably
# in helpers.R); it is used here exactly as before, to hand the formatted
# path to mk_absolute_url().
mk_url <- function(offset) {
  listing_path <- sprintf(
    "/ajax/filterlist/de/services/opendata/543410-543410?offset=%d",
    offset
  )
  mk_absolute_url %$% listing_path
}
-
# Download one protocol file (best effort).
#
# path: site-relative path of the file, as accepted by mk_absolute_url().
# name: file name appended to DOWNLOAD_DIR to form the local target.
#
# A failed download does not abort the caller: it is reported as a warning
# naming the URL, and a file that this call created but could not complete is
# removed. Returns (invisibly) the status code from download.file(), or the
# "try-error" object when the download raised an error.
download_protocol <- function(path, name) {
  # NOTE(review): plain concatenation, so DOWNLOAD_DIR (defined outside this
  # file's visible part) presumably ends with a path separator -- confirm.
  fp <- paste0(DOWNLOAD_DIR, name)
  url <- mk_absolute_url(path)
  existed_before <- file.exists(fp)

  # mode = "wb" stores the bytes exactly as served; text mode may rewrite
  # line endings on Windows.
  result <- try(
    download.file(url, fp, quiet = TRUE, mode = "wb"),
    silent = TRUE
  )

  errored <- inherits(result, "try-error")
  failed <- errored || !identical(as.integer(result), 0L)
  if (failed) {
    # Only clean up files created by this attempt; never delete an older copy.
    if (!existed_before) {
      unlink(fp)
    }
    reason <- if (errored) {
      conditionMessage(attr(result, "condition"))
    } else {
      paste("exit status", result)
    }
    warning("Download failed for ", url, ": ", reason, call. = FALSE)
  }
  invisible(result)
}
-
# Fetch one listing page and download every protocol file linked from it.
#
# offset: a single, non-missing number passed to mk_url() to select the page.
#
# Returns TRUE when the page contained at least one protocol link, FALSE
# otherwise. Raises an error when `offset` is invalid or the page cannot be
# read.
fetch_batch <- function(offset) {
  stopifnot(
    "Offset must be numeric" = is.numeric(offset),
    "Offset must be a single non-missing number" =
      length(offset) == 1L && !is.na(offset)
  )

  page <- as.character(read_html(mk_url(offset)))

  # Column 1 holds the full matched path, column 2 the captured file name.
  hits <- str_match_all(page, "/resource/blob/.*?/([0-9]*-data\\.xml)")[[1]]

  # A link that appears several times on the page is downloaded only once.
  hits <- hits[!duplicated(hits[, 1]), , drop = FALSE]

  for (i in seq_len(nrow(hits))) {
    download_protocol(hits[i, 1], hits[i, 2])
  }
  nrow(hits) > 0
}
-
# Download all protocols by requesting listing pages at increasing offsets
# until fetch_batch() reports a page without protocol links.
#
# step: amount added to the offset after each page. The default of 10 is the
#   increment this script has always used (presumably the listing's page
#   size -- confirm before changing it).
#
# An error raised while fetching a page is re-raised with the failing offset
# in its message, so an unreachable page is easy to locate.
# TODO: detect unexpected response formats explicitly; today a page in an
# unknown format simply looks like "no more results" and ends the loop.
fetch_all <- function(step = 10) {
  stopifnot(
    "step must be a single positive number" =
      is.numeric(step) && length(step) == 1L && !is.na(step) && step > 0
  )

  offset <- 0
  repeat {
    has_more <- tryCatch(
      fetch_batch(offset),
      error = function(e) {
        stop(
          sprintf(
            "Fetching batch at offset %s failed: %s",
            format(offset), conditionMessage(e)
          ),
          call. = FALSE
        )
      }
    )
    if (!has_more) {
      break
    }
    offset <- offset + step
  }
  invisible(NULL)
}
|