#' Parse xml records #' #' Creates a list of tibbles containing relevant information from all records #' stored in the input directory. #' #' @param path path to records directory #' @param pattern search pattern to find records in directory #' #' @export read_all <- function(path="inst/records/", pattern="-data\\.xml") { # append file separator if needed path <- make_directory_path(path) cat("Reading all records from", path, "\n") # list all files in directory and filter by search pattern fs <- list.files(path) available_protocols <- fs[str_detect(fs, pattern)] if (length(available_protocols) == 0) stop(paste0("The given directory does not exist or does not contain files matching \"", pattern, "\".")) # parse records one by one and remove null entries res <- compact %$% pblapply(available_protocols, read_one, path=path) if (length(res) == 0) stop("No valid records found. Did you fetch successfully?") lapply(res, `[[`, "speaker") %>% bind_rows() %>% distinct() -> speaker lapply(res, `[[`, "speeches") %>% bind_rows() %>% distinct() %>% mutate(date = as.Date(date, format="%d.%m.%Y")) -> speeches lapply(res, `[[`, "talks") %>% bind_rows() %>% distinct() -> talks lapply(res, `[[`, "comments") %>% bind_rows() %>% distinct() -> commentsandapplause filter(commentsandapplause, type == "comment") %>% select(-type) -> comments filter(commentsandapplause, type == "applause") %>% select(-type, -commenter, -content) %>% mutate("CDU_CSU" = str_detect(fraction, "CDU/CSU"), "SPD" = str_detect(fraction, "SPD"), "FDP" = str_detect(fraction, "FDP"), "DIE_LINKE" = str_detect(fraction, "DIE LINKE"), "BUENDNIS_90_DIE_GRUENEN" = str_detect(fraction, "B\u00DCNDNIS 90/DIE GR\u00DCNEN"), "AfD" = str_detect(fraction, "AfD")) %>% select(-fraction) -> applause list(speaker = speaker, speeches = speeches, talks = talks, comments = comments, applause = applause) } # this reads all currently parseable data from one xml read_one <- function(name, path) { x <- tryCatch(read_xml(paste0(path, name)), error = function(c) NULL, warning = function(c) NULL) if (is.null(x)) return(NULL) # extract date of session date <- xml_attr(x, "sitzung-datum") cs <- xml_children(x) verlauf <- xml_find_first(x, "sitzungsverlauf") speakerl <- xml_find_first(x, "rednerliste") # check if record is invalid or empty (every record should have at least # one speech, a speaker and a date if (is.na(date) || length(verlauf) == 0 || length(speakerl) == 0) { warning("Invalid record found. Skipping.") return(NULL) } xml_children(speakerl) %>% parse_speakerlist() -> speaker xml_children(verlauf) %>% xml_find_all("rede") %>% parse_speechlist(date) -> res list(speaker = speaker, speeches = res$speeches, talks = res$talks, comments = res$comments) } xml_get <- function(node, name) { res <- xml_text %$% xml_find_all(node, name) if (length(res) == 0) NA_character_ else res } # parse one speaker parse_speaker <- function(speaker_xml) { speaker_id <- xml_attr(speaker_xml, "id") nm <- xml_child(speaker_xml) prename <- xml_get(nm, "vorname") lastname <- xml_get(nm, "nachname") fraction <- xml_get(nm, "fraktion") title <- xml_get(nm, "titel") role <- xml_find_all(nm, "rolle") if (length(role) > 0) { role_long <- xml_get(role, "rolle_lang") role_short <- xml_get(role, "rolle_kurz") } else role_short <- role_long <- NA_character_ c(id = speaker_id, prename = prename, lastname = lastname, fraction = fraction, title = title, role_short = role_short, role_long = role_long) } # parse one speech # returns: - a speech (with speech id and speaker id) # - all talks appearing in the speech (with corresponding content) parse_speech <- function(speech_xml, date) { speech_id <- xml_attr(speech_xml, "id") cs <- xml_children(speech_xml) cur_speaker <- NA_character_ principal_speaker <- NA_character_ cur_content <- "" speeches <- list() comments <- list() for (node in cs) { if (xml_name(node) == "p" || xml_name(node) == "name") { klasse <- xml_attr(node, "klasse") if ((!is.na(klasse) && klasse == "redner") || xml_name(node) == "name") { if (!is.na(cur_speaker)) { speech <- c(speech_id = speech_id, speaker = cur_speaker, content = cur_content) speeches <- c(speeches, list(speech)) cur_content <- "" } if (is.na(principal_speaker) && xml_name(node) != "name") { principal_speaker <- xml_child(node) %>% xml_attr("id") } if (xml_name(node) == "name") { cur_speaker <- "BTP" } else { cur_speaker <- xml_child(node) %>% xml_attr("id") } } else { cur_content <- paste0(cur_content, xml_text(node), sep="\n") } } else if (xml_name(node) == "kommentar") { # comments are of the form # (blabla [Fraktion] \u2013 blabla liasdf \u2013 bla) xml_text(node) %>% str_sub(2, -2) %>% str_split("\u2013") %>% `[[`(1) %>% lapply(parse_comment, speech_id = speech_id, on_speaker = cur_speaker) -> cs comments <- c(comments, cs) } } speech <- c(speech_id = speech_id, speaker = cur_speaker, content = cur_content) speeches <- c(speeches, list(speech)) list(speech = c(id = speech_id, speaker = principal_speaker, date = date), parts = speeches, comments = comments) } fractionpattern <- "B\u00DCNDNIS(SES)?\\W*90/DIE\\W*GR\u00DCNEN|CDU/CSU|AfD|SPD|DIE LINKE|FDP|LINKEN" fractionnames <- c("B\u00DCNDNIS 90/DIE GR\u00DCNEN", "CDU/CSU", "AfD", "SPD", "DIE LINKE", "FDP", "Fraktionslos") parse_comment <- function(comment, speech_id, on_speaker) { base <- c(speech_id = speech_id, on_speaker = on_speaker) # classify comment if(str_detect(comment, "Beifall")) { str_extract_all(comment, fractionpattern) %>% `[[`(1) %>% sapply(partial(flip(head), 1) %.% agrep, x=fractionnames, max=0.2, value=T) %>% str_c(collapse=",") -> by c(base, type = "applause", fraction = by, commenter = NA_character_, content = comment) } else { ps <- str_match(comment, "(.*) \\[(.*?)\\]: (.*)")[1,] fraction <- agrep(ps[3], fractionnames, max=0.2, value=T) if (all(is.na(fraction)) || length(fraction) == 0) fraction <- NA_character_ c(base, type = "comment", fraction = fraction, commenter = ps[2], content = ps[4]) } } # creates a tibble of speeches and a tibble of talks from a list of xml nodes representing speeches parse_speechlist <- function(speechlist_xml, date) { d <- sapply(speechlist_xml, parse_speech, date = date) speeches <- simplify2array(d["speech", ]) parts <- simplify2array %$% unlist(d["parts", ], recursive=FALSE) comments <- simplify2array %$% unlist(d["comments", ], recursive=FALSE) list(speeches = tibble(id = speeches["id",], speaker = speeches["speaker",], date = speeches["date",]), talks = tibble(speech_id = parts["speech_id", ], speaker = parts["speaker", ], content = parts["content", ]), comments = tibble(speech_id = comments["speech_id",], on_speaker = comments["on_speaker",], type = comments["type",], fraction = comments["fraction",], commenter = comments["commenter",], content = comments["content", ])) } # create a tibble of speaker from a list of xml nodes representing speaker parse_speakerlist <- function(speakerliste_xml) { d <- sapply(speakerliste_xml, parse_speaker) tibble(id = d["id",], prename = d["prename",], lastname = d["lastname",], fraction = d["fraction",], title = d["title",], role_short = d["role_short",], role_long = d["role_long",]) } #' Write the parsed and repaired results into separate csv files #' #' @param tables list of tables to convert into a csv files. #' @param path where to put the csv files. #' @param create set TRUE if the path does not exist yet and you want to create it #' #' @export write_to_csv <- function(tables, path="inst/csv/", create=F) { check_directory(path, create) write.table(tables$speaker, str_c(path, "speaker.csv")) write.table(tables$speeches, str_c(path, "speeches.csv")) write.table(tables$talks, str_c(path, "talks.csv")) write.table(tables$comments, str_c(path, "comments.csv")) write.table(tables$applause, str_c(path, "applause.csv")) } #' create a tibble from the csv file #' #' @param path directory to read files from #' #' reading the tables from a csv is way faster than reading and repairing the data every single time #' #' @export read_from_csv <- function(path="inst/csv/") { list(speaker = read.table(str_c(path, "speaker.csv")) %>% tibble() %>% mutate(id = as.character(id)), speeches = read.table(str_c(path, "speeches.csv")) %>% tibble() %>% mutate(speaker = as.character(speaker), date = as.Date(date)), talks = tibble %$% read.table(str_c(path, "talks.csv")), comments = tibble %$% read.table(str_c(path, "comments.csv")), applause = tibble %$% read.table(str_c(path, "applause.csv"))) }