|
# For usage, see the example at the end.

#' Parse xml records
#'
#' Creates a list of tibbles containing relevant information from all records
#' stored in the input directory.
#'
#' Every file found in `path` is handed to `read_one()`. Files that cannot be
#' read are skipped. The per-file tables are stacked and de-duplicated.
#'
#' @param path character. Directory containing the xml records.
#'
#' @return A named list of tibbles: `speaker`, `speeches` (with `date`
#'   converted to `Date` using the format `%d.%m.%Y`), `talks`, `comments`
#'   and `applause` (one logical column per parliamentary group).
#'
#' @export
read_all <- function(path = "data/records/") {
  cat("Reading all records from", path, "\n")
  available_protocols <- list.files(path)

  # Validate before doing any work: list.files() returns character(0) both for
  # an empty and for a missing directory.
  if (length(available_protocols) == 0) {
    stop("The given directory is empty or does not exist.")
  }

  res <- pblapply(available_protocols, read_one, path = path)

  # read_one() returns NULL for files it could not read. Without any usable
  # result the tables below would have no columns at all, so fail clearly.
  res <- Filter(Negate(is.null), res)
  if (length(res) == 0) {
    stop("None of the files in the given directory could be parsed.")
  }

  # Stack one table across all records and drop duplicated rows.
  collect <- function(field) {
    lapply(res, `[[`, field) %>%
      bind_rows() %>%
      distinct()
  }

  speaker <- collect("speaker")
  speeches <- collect("speeches") %>%
    mutate(date = as.Date(date, format = "%d.%m.%Y"))
  talks <- collect("talks")
  commentsandapplause <- collect("comments")

  comments <- commentsandapplause %>%
    filter(type == "comment") %>%
    select(-type)

  # For applause only the applauding groups matter: turn the comma-separated
  # `fraction` string into one logical column per group.
  applause <- commentsandapplause %>%
    filter(type == "applause") %>%
    select(-type, -kommentator, -content) %>%
    mutate(
      "CDU_CSU" = str_detect(fraction, "CDU/CSU"),
      "SPD" = str_detect(fraction, "SPD"),
      "FDP" = str_detect(fraction, "FDP"),
      "DIE_LINKE" = str_detect(fraction, "DIE LINKE"),
      "BUENDNIS_90_DIE_GRUENEN" = str_detect(fraction, "BÜNDNIS 90/DIE GRÜNEN"),
      "AfD" = str_detect(fraction, "AfD")
    ) %>%
    select(-fraction)

  list(
    speaker = speaker,
    speeches = speeches,
    talks = talks,
    comments = comments,
    applause = applause
  )
}

# Reads all currently parseable data from one xml record.
#
# name: file name of the record inside `path`
# path: directory containing the record, with or without a trailing separator
#
# Returns NULL (after a warning) when the file cannot be read as xml.
# Otherwise a list with the tibbles `speaker`, `speeches`, `talks` and
# `comments`.
read_one <- function(name, path) {
  # Plain concatenation only works when `path` already ends in a separator,
  # so add one when it is missing.
  if (nzchar(path) && !grepl("[/\\\\]$", path)) {
    path <- paste0(path, "/")
  }
  file <- paste0(path, name)

  x <- tryCatch(
    read_xml(file),
    error = function(e) {
      # Still best effort, but leave a trace of which file was skipped and why.
      warning("Skipping ", file, ": ", conditionMessage(e), call. = FALSE)
      NULL
    }
  )
  if (is.null(x)) {
    return(NULL)
  }

  # extract date of session
  date <- xml_attr(x, "sitzung-datum")

  speaker <- parse_speakerlist(xml_children(xml_find_first(x, "rednerliste")))

  rede_nodes <- xml_find_all(
    xml_children(xml_find_first(x, "sitzungsverlauf")),
    "rede"
  )
  res <- parse_speechlist(rede_nodes, date)

  list(
    speaker = speaker,
    speeches = res$speeches,
    talks = res$talks,
    comments = res$comments
  )
}

# Text of all nodes matching `name` below `node`, or NA_character_ when
# nothing matches.
xml_get <- function(node, name) {
  # `%$%` is defined outside this block; presumably it applies the function
  # on its left to the value on its right.
  texts <- xml_text %$% xml_find_all(node, name)
  if (length(texts) > 0) {
    return(texts)
  }
  NA_character_
}

# Parse one speaker node.
#
# Returns a named character vector with the entries id, vorname, nachname,
# fraction, titel, rolle_kurz and rolle_lang. Fields without a matching
# element are NA.
parse_speaker <- function(speaker_xml) {
  name_node <- xml_child(speaker_xml)
  role_nodes <- xml_find_all(name_node, "rolle")
  has_role <- length(role_nodes) > 0

  c(
    id = xml_attr(speaker_xml, "id"),
    vorname = xml_get(name_node, "vorname"),
    nachname = xml_get(name_node, "nachname"),
    fraction = xml_get(name_node, "fraktion"),
    titel = xml_get(name_node, "titel"),
    # The role fields are only looked up when a "rolle" element exists.
    rolle_kurz = if (has_role) xml_get(role_nodes, "rolle_kurz") else NA_character_,
    rolle_lang = if (has_role) xml_get(role_nodes, "rolle_lang") else NA_character_
  )
}

# Parse one speech.
#
# Walks the children of the speech in document order and cuts it into
# "talks": runs of paragraphs that belong to the same speaker. A paragraph
# with klasse "redner" or a "name" node starts a new talk. "kommentar" nodes
# are split into fragments and parsed by parse_comment().
#
# speech_xml: xml node of the speech; its "id" attribute becomes the speech id
# date:       session date, stored unchanged in the returned speech record
#
# Returns a list with
#   - speech:   c(id, speaker, date); speaker is the id taken from the first
#               "redner" paragraph, NA if there is none
#   - parts:    list of talks, each c(speech_id, speaker, content)
#   - comments: list of comments as returned by parse_comment()
parse_speech <- function(speech_xml, date) {
  speech_id <- xml_attr(speech_xml, "id")
  cur_speaker <- NA_character_
  principal_speaker <- NA_character_
  cur_content <- ""
  speeches <- list()
  comments <- list()

  new_talk <- function(speaker, content) {
    c(speech_id = speech_id, speaker = speaker, content = content)
  }

  for (node in xml_children(speech_xml)) {
    node_name <- xml_name(node)

    if (node_name == "p" || node_name == "name") {
      klasse <- xml_attr(node, "klasse")

      if ((!is.na(klasse) && klasse == "redner") || node_name == "name") {
        # Speaker change: close the running talk, if one has been opened.
        if (!is.na(cur_speaker)) {
          speeches <- c(speeches, list(new_talk(cur_speaker, cur_content)))
          cur_content <- ""
        }
        if (node_name == "name") {
          # "name" nodes carry no speaker id and are booked under the fixed
          # id "BTP" (presumably the presiding officer; confirm).
          cur_speaker <- "BTP"
        } else {
          cur_speaker <- xml_attr(xml_child(node), "id")
          # The first "redner" paragraph determines the speech's main speaker.
          if (is.na(principal_speaker)) {
            principal_speaker <- cur_speaker
          }
        }
      } else {
        # Ordinary paragraph: append its text, one paragraph per line.
        cur_content <- paste0(cur_content, xml_text(node), "\n")
      }
    } else if (node_name == "kommentar") {
      # comments are of the form
      # <kommentar>(blabla [Fraktion] – blabla liasdf – bla)</kommentar>
      # Drop the surrounding parentheses and split at the dashes.
      fragments <- str_split(str_sub(xml_text(node), 2, -2), "–")[[1]]
      parsed <- lapply(
        fragments,
        parse_comment,
        speech_id = speech_id,
        on_speaker = cur_speaker
      )
      comments <- c(comments, parsed)
    }
  }

  # Close the talk that was still running at the end of the speech.
  speeches <- c(speeches, list(new_talk(cur_speaker, cur_content)))

  list(
    speech = c(id = speech_id, speaker = principal_speaker, date = date),
    parts = speeches,
    comments = comments
  )
}

# Regular expression for the spellings of the parliamentary groups that
# parse_comment() looks for in applause comments, including the variants
# "BÜNDNISSES 90/DIE GRÜNEN" and "LINKEN".
fractionpattern <- paste(
  c(
    "BÜNDNIS(SES)?\\W*90/DIE\\W*GRÜNEN",
    "CDU/CSU",
    "AfD",
    "SPD",
    "DIE LINKE",
    "FDP",
    "LINKEN"
  ),
  collapse = "|"
)
# Canonical group names onto which the spellings found are mapped.
fractionnames <- c(
  "BÜNDNIS 90/DIE GRÜNEN",
  "CDU/CSU",
  "AfD",
  "SPD",
  "DIE LINKE",
  "FDP"
)

# Parse one fragment of a <kommentar> node.
#
# comment:    text of the fragment
# speech_id:  id of the speech the fragment belongs to
# on_speaker: id of the person speaking when the comment was made
#
# Returns a named character vector with the entries speech_id, on_speaker,
# type, fraction, kommentator and content.
#   - Fragments containing "Beifall" get type "applause"; `fraction` holds
#     the comma-separated canonical names of the groups mentioned ("" when no
#     group is mentioned) and `content` the whole fragment.
#   - Everything else gets type "comment" and is expected to look like
#     "Name [Fraktion]: text"; the three entries are NA if it does not.
parse_comment <- function(comment, speech_id, on_speaker) {
  base <- c(speech_id = speech_id, on_speaker = on_speaker)

  # classify comment
  if (str_detect(comment, "Beifall")) {
    found <- str_extract_all(comment, fractionpattern)[[1]]
    # Map every spelling found in the text onto the first canonical group
    # name it fuzzily matches. Spellings without a match contribute nothing.
    groups <- lapply(found, function(spelling) {
      head(
        agrep(spelling, fractionnames, max.distance = 0.2, value = TRUE),
        1
      )
    })
    # paste() yields "" for zero groups, so the result always has six entries.
    by <- paste(unlist(groups), collapse = ",")
    return(c(
      base,
      type = "applause",
      fraction = by,
      kommentator = NA_character_,
      content = comment
    ))
  }

  ps <- str_match(comment, "(.*) \\[(.*?)\\]: (.*)")[1, ]
  c(
    base,
    type = "comment",
    fraction = ps[3],
    kommentator = ps[2],
    content = ps[4]
  )
}

# Creates the tibbles `speeches`, `talks` and `comments` from a list of xml
# nodes representing speeches.
#
# speechlist_xml: the speech nodes of one record
# date:           session date, passed on to parse_speech()
#
# Returns a list with the tibbles
#   - speeches: id, speaker, date
#   - talks:    speech_id, speaker, content
#   - comments: speech_id, on_speaker, type, fraction, kommentator, content
# All columns are character. A record without speeches or without comments
# yields tibbles with zero rows.
parse_speechlist <- function(speechlist_xml, date) {
  parsed <- lapply(speechlist_xml, parse_speech, date = date)

  speeches <- lapply(parsed, `[[`, "speech")
  # Talks and comments come as one list per speech; flatten them by one level.
  parts <- unlist(lapply(parsed, `[[`, "parts"), recursive = FALSE)
  comments <- unlist(lapply(parsed, `[[`, "comments"), recursive = FALSE)

  # Turn a list of named character vectors into a tibble with one column per
  # field. vapply() keeps the column type fixed even when `rows` is empty.
  as_table <- function(rows, fields) {
    columns <- lapply(fields, function(field) {
      vapply(rows, function(row) row[[field]], character(1), USE.NAMES = FALSE)
    })
    names(columns) <- fields
    do.call(tibble, columns)
  }

  list(
    speeches = as_table(speeches, c("id", "speaker", "date")),
    talks = as_table(parts, c("speech_id", "speaker", "content")),
    comments = as_table(
      comments,
      c("speech_id", "on_speaker", "type", "fraction", "kommentator", "content")
    )
  )
}

# Creates a tibble of speakers from a list of xml nodes representing
# speakers.
#
# Returns a tibble with the character columns id, vorname, nachname,
# fraction, titel, rolle_kurz and rolle_lang, one row per speaker. An empty
# list yields a tibble with zero rows.
parse_speakerlist <- function(speakerliste_xml) {
  speakers <- lapply(speakerliste_xml, parse_speaker)
  fields <- c(
    "id", "vorname", "nachname", "fraction", "titel",
    "rolle_kurz", "rolle_lang"
  )

  # One column per field. vapply() keeps the column type fixed even when
  # there are no speakers at all.
  columns <- lapply(fields, function(field) {
    vapply(speakers, function(s) s[[field]], character(1), USE.NAMES = FALSE)
  })
  names(columns) <- fields
  do.call(tibble, columns)
}

#' Write the parsed and repaired results into separate csv files
#'
#' Writes `speaker.csv`, `speeches.csv`, `talks.csv`, `comments.csv` and
#' `applause.csv` with `write.table()`.
#'
#' @param tables list of tables to convert into csv files.
#' @param path where to put the csv files, with or without a trailing
#'   separator.
#' @param create set TRUE if the path does not exist yet and you want to
#'   create it
#'
#' @return `NULL`, invisibly.
#'
#' @export
write_to_csv <- function(tables, path = "data/csv/", create = FALSE) {
  check_directory(path, create)

  # Plain concatenation only works when `path` already ends in a separator,
  # so add one when it is missing.
  if (nzchar(path) && !grepl("[/\\\\]$", path)) {
    path <- paste0(path, "/")
  }

  for (name in c("speaker", "speeches", "talks", "comments", "applause")) {
    write.table(tables[[name]], paste0(path, name, ".csv"))
  }
}


#' Read the tables back from csv files
#'
#' Reading the tables from a csv is way faster than reading and repairing the
#' data every single time.
#'
#' @param path directory to read files from, with or without a trailing
#'   separator
#'
#' @return A named list of tibbles: `speaker` (with `id` as character),
#'   `speeches` (with `speaker` as character), `talks`, `comments` and
#'   `applause`.
#'
#' @export
read_from_csv <- function(path = "data/csv/") {
  # Plain concatenation only works when `path` already ends in a separator,
  # so add one when it is missing.
  if (nzchar(path) && !grepl("[/\\\\]$", path)) {
    path <- paste0(path, "/")
  }

  load_table <- function(name) {
    tibble(read.table(paste0(path, name, ".csv")))
  }

  list(
    # read.table() guesses column types; force the ids back to character.
    speaker = mutate(load_table("speaker"), id = as.character(id)),
    speeches = mutate(load_table("speeches"), speaker = as.character(speaker)),
    talks = load_table("talks"),
    comments = load_table("comments"),
    applause = load_table("applause")
  )
}
|