Sfoglia il codice sorgente

add first parsing utilities

package
flavis 4 anni fa
parent
commit
f0f6a9ca3f
1 ha cambiato i file con 120 aggiunte e 0 eliminazioni
  1. +120
    -0
      scraping/parse.R

+ 120
- 0
scraping/parse.R Vedi File

@@ -0,0 +1,120 @@
source("config.R")
source("../utils/helpers.R")
library("xml2")
library(tibble)
library(magrittr)

# for usage see the example at the end

# this reads all currently parseable data from one xml
read_one <- function(name) {
x <- read_xml(paste0(DOWNLOAD_DIR, name))
cs <- xml_children(x)

verlauf <- xml_find_first(x, "sitzungsverlauf")
rednerl <- xml_find_first(x, "rednerliste")

xml_children(rednerl) %>%
parse_rednerliste() ->
redner

xml_children(verlauf) %>%
xml_find_all("rede") %>%
parse_redenliste() ->
res
list(redner = redner, reden = res$reden, talks = res$talks)
}

xml_get <- function(node, name) {
res <- xml_text %$% xml_find_all(node, name)
if (length(res) == 0) NA_character_
else res
}

# parse one redner
parse_redner <- function(redner_xml) {
redner_id <- xml_attr(redner_xml, "id")
nm <- xml_child(redner_xml)
vorname <- xml_get(nm, "vorname")
nachname <- xml_get(nm, "nachname")
fraktion <- xml_get(nm, "fraktion")
titel <- xml_get(nm, "titel")
rolle <- xml_find_all(nm, "rolle")
if (length(rolle) > 0) {
rolle_lang <- xml_get(rolle, "rolle_lang")
rolle_kurz <- xml_get(rolle, "rolle_kurz")
} else rolle_kurz <- rolle_lang <- NA_character_
c(id = redner_id, vorname = vorname, nachname = nachname, fraktion = fraktion, titel = titel,
rolle_kurz = rolle_kurz, rolle_lang = rolle_lang)
}

# parse one rede
# returns: - a rede (with rede id and redner id)
# - all talks appearing in the rede (with corresponding content)
parse_rede <- function(rede_xml) {
rede_id <- xml_attr(rede_xml, "id")
cs <- xml_children(rede_xml)
cur_redner <- NA_character_
principal_redner <- NA_character_
cur_content <- ""
reden <- list()
for (node in cs) {
if (xml_name(node) == "p") {
if (xml_attr(node, "klasse") == "redner") {
if (!is.na(cur_redner)) {
rede <- c(rede_id = rede_id,
redner = cur_redner,
content = cur_content)
reden <- c(reden, list(rede))
cur_content <- ""
} else {
principal_redner <- xml_child(node) %>% xml_attr("id")
}
cur_redner <- xml_child(node) %>% xml_attr("id")
} else {
cur_content <- paste0(cur_content, xml_text(node), sep="\n")
}
}
}
rede <- c(rede_id = rede_id,
redner = cur_redner,
content = cur_content)
reden <- c(reden, list(rede))
list(rede = c(id = rede_id, redner = principal_redner),
parts = reden)
}

# creates a tibble of reden and a tibble of talks from a list of xml nodes representing reden
parse_redenliste <- function(redenliste_xml) {
d <- sapply(redenliste_xml, parse_rede)
reden <- simplify2array(d["rede", ])
parts <- simplify2array %$% unlist(d["parts", ], recursive=FALSE)
list(reden = tibble(id = reden["id",], redner = reden["redner",]),
talks = tibble(rede_id = parts["rede_id", ],
redner = parts["redner", ],
content = parts["content", ]))
}

# create a tibble of redner from a list of xml nodes representing redner
parse_rednerliste <- function(rednerliste_xml) {
d <- sapply(rednerliste_xml, parse_redner)
tibble(id = d["id",],
vorname = d["vorname",],
nachname = d["nachname",],
fraktion = d["fraktion",],
titel = d["titel",],
rolle_kurz = d["rolle_kurz",],
rolle_lang = d["rolle_lang",])
}

# -------------------------------
# EXAMPLE USE

# make sure data ist downloaded via fetch.R
res <- read_one("19038-data.xml")

res$redner
res$reden
res$talks

# -------------------------------

Loading…
Annulla
Salva