commit 65573634b1ec185ef6c68d37eae0c234bbca0eb7
Author: flavis
Date:   Wed Jun 16 23:45:47 2021 +0200

    add first fetching utilities

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..6722cd9
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1 @@
+*.xml
diff --git a/scraping/fetch.R b/scraping/fetch.R
new file mode 100644
index 0000000..cfbf6e4
--- /dev/null
+++ b/scraping/fetch.R
@@ -0,0 +1,83 @@
+source("../utils/helpers.R")
+library(RCurl)
+library(stringr)
+
+# Directory the downloaded protocol files are written to.
+# download_protocol() creates it on demand.
+DOWNLOAD_DIR <- "../data/"
+
+# Turn a site-relative path into an absolute www.bundestag.de URL.
+mk_absolute_url <- function(path) paste0("https://www.bundestag.de", path)
+
+# Build the URL of the open-data listing page that starts at `offset`.
+mk_url <- function(offset) {
+  mk_absolute_url(sprintf(
+    "/ajax/filterlist/de/services/opendata/543410-543410?offset=%d",
+    offset
+  ))
+}
+
+# Download one protocol file into DOWNLOAD_DIR (best effort).
+#
+# path: site-relative path of the resource to download.
+# name: file name to store it under inside DOWNLOAD_DIR.
+#
+# A failed download is reported as a warning and does not abort the run.
+# Invisibly returns the result of try(): the download.file() status on
+# success, a "try-error" object on failure.
+download_protocol <- function(path, name) {
+  # download.file() cannot write into a directory that does not exist.
+  dir.create(DOWNLOAD_DIR, showWarnings = FALSE, recursive = TRUE)
+  fp <- paste0(DOWNLOAD_DIR, name)
+  status <- try(
+    download.file(mk_absolute_url(path), fp, quiet = TRUE, mode = "wb"),
+    silent = TRUE
+  )
+  errored <- inherits(status, "try-error")
+  if (errored || status != 0) {
+    reason <- if (errored) {
+      conditionMessage(attr(status, "condition"))
+    } else {
+      paste("status", status)
+    }
+    warning("Download of ", path, " failed: ", reason, call. = FALSE)
+    unlink(fp) # do not leave a partial file behind
+  }
+  invisible(status)
+}
+
+# Fetch the listing page at `offset` and download every protocol it links.
+#
+# Returns TRUE if the page contained at least one protocol link, FALSE
+# otherwise. Stops with an error if the listing page cannot be fetched.
+fetch_batch <- function(offset) {
+  res <- tryCatch(
+    getURL(mk_url(offset)),
+    error = function(e) {
+      stop(
+        "Could not fetch protocol listing at offset ", offset, ": ",
+        conditionMessage(e),
+        call. = FALSE
+      )
+    }
+  )
+  # Column 1 is the full resource path, column 2 the captured file name.
+  paths <- str_match_all(res, "/resource/blob/.*?/([0-9]*-data\\.xml)")[[1]]
+  for (i in seq_len(nrow(paths))) {
+    download_protocol(paths[i, 1], paths[i, 2])
+  }
+  nrow(paths) > 0
+}
+
+# Download all protocols: walk the listing in steps of 10 until a page
+# without protocol links comes back.
+#
+# TODO: error handling
+# - wrong format, etc.
+fetch_all <- function() {
+  offset <- 0
+  while (fetch_batch(offset)) {
+    offset <- offset + 10
+  }
+  invisible(NULL)
+}
diff --git a/utils/helpers.R b/utils/helpers.R
new file mode 100644
index 0000000..ed6a09c
--- /dev/null
+++ b/utils/helpers.R
@@ -0,0 +1,4 @@
+# Function application: `f %$% x` is `f(x)`.
+`%$%` <- function(f, x) f(x)
+# Function composition: `(f %.% g)(...)` is `f(g(...))`.
+`%.%` <- function(f, g) function(...) f(g(...))