christian
/
hateimparlament


			
							#' Parse xml records
#'
#' Creates a list of tibbles containing relevant information from all records
#' stored in the input directory.
#'
#' @param path path to records directory
#' @param pattern search pattern to find records in directory
#'
#' @export
read_all <- function(path="inst/records/", pattern="-data\\.xml") {
    # append file separator if needed
    path <- make_directory_path(path)
    
    cat("Reading all records from", path, "\n")

    # list all files in directory and filter by search pattern
    fs <- list.files(path)
    available_protocols <- fs[str_detect(fs, pattern)]
    if (length(available_protocols) == 0)
        stop(paste0("The given directory does not exist or does not contain files matching \"",
                    pattern,
                    "\"."))

    # parse records one by one and remove null entries
    res <- compact %$% pblapply(available_protocols, read_one, path=path)
    if (length(res) == 0) stop("No valid records found. Did you fetch successfully?")

    lapply(res, `[[`, "speaker") %>%
        bind_rows() %>%
        distinct() ->
        speaker
    
    lapply(res, `[[`, "speeches") %>%
        bind_rows() %>%
        distinct() %>%
        mutate(date = as.Date(date, format="%d.%m.%Y")) ->
        speeches

    lapply(res, `[[`, "talks") %>%
        bind_rows() %>%
        distinct() ->
        talks

    lapply(res, `[[`, "comments") %>%
        bind_rows() %>%
        distinct() ->
        commentsandapplause

    filter(commentsandapplause, type == "comment") %>%
        select(-type) ->
        comments
    filter(commentsandapplause, type == "applause") %>%
        select(-type, -commenter, -content) %>%
        mutate("CDU_CSU" = str_detect(fraction, "CDU/CSU"),
               "SPD" = str_detect(fraction, "SPD"),
               "FDP" = str_detect(fraction, "FDP"),
               "DIE_LINKE" = str_detect(fraction, "DIE LINKE"),
               "BUENDNIS_90_DIE_GRUENEN" = str_detect(fraction, "B\u00DCNDNIS 90/DIE GR\u00DCNEN"),
               "AfD" = str_detect(fraction, "AfD")) %>%
        select(-fraction) ->
        applause

    list(speaker = speaker, speeches = speeches, talks = talks, comments = comments, applause = applause)
}

# this reads all currently parseable data from one xml
read_one <- function(name, path) {
    x <- tryCatch(read_xml(paste0(path, name)),
                  error = function(c) NULL,
                  warning = function(c) NULL)
    if (is.null(x)) return(NULL)
    # extract date of session
    date <- xml_attr(x, "sitzung-datum")
    cs <- xml_children(x)

    verlauf <- xml_find_first(x, "sitzungsverlauf")
    speakerl <- xml_find_first(x, "rednerliste")

    # check if record is invalid or empty (every record should have at least
    # one speech, a speaker and a date
    if (is.na(date) || length(verlauf) == 0 || length(speakerl) == 0) {
        warning("Invalid record found. Skipping.")
        return(NULL)
    }

    xml_children(speakerl) %>%
        parse_speakerlist() ->
        speaker

    xml_children(verlauf) %>%
        xml_find_all("rede") %>%
        parse_speechlist(date) ->
        res

    list(speaker = speaker, speeches = res$speeches, talks = res$talks, comments = res$comments)
}

xml_get <- function(node, name) {
    res <- xml_text %$% xml_find_all(node, name)
    if (length(res) == 0) NA_character_
    else res
}

# parse one speaker
parse_speaker <- function(speaker_xml) {
    speaker_id <- xml_attr(speaker_xml, "id")
    nm <- xml_child(speaker_xml)
    prename <- xml_get(nm, "vorname")
    lastname <- xml_get(nm, "nachname")
    fraction <- xml_get(nm, "fraktion")
    title <- xml_get(nm, "titel")
    role <- xml_find_all(nm, "rolle")
    if (length(role) > 0) {
        role_long <- xml_get(role, "rolle_lang")
        role_short <- xml_get(role, "rolle_kurz")
    } else role_short <- role_long <- NA_character_
    c(id = speaker_id, prename = prename, lastname = lastname, fraction = fraction, title = title,
      role_short = role_short, role_long = role_long)
}

# parse one speech
# returns: - a speech (with speech id and speaker id)
#          - all talks appearing in the speech (with corresponding content)
parse_speech <- function(speech_xml, date) {
    speech_id <- xml_attr(speech_xml, "id")
    cs <- xml_children(speech_xml)
    cur_speaker <- NA_character_
    principal_speaker <- NA_character_
    cur_content <- ""
    speeches <- list()
    comments <- list()
    for (node in cs) {
        if (xml_name(node) == "p" || xml_name(node) == "name") {
            klasse <- xml_attr(node, "klasse")
            if ((!is.na(klasse) && klasse == "redner") || xml_name(node) == "name") {
                if (!is.na(cur_speaker)) {
                    speech <- c(speech_id = speech_id,
                              speaker = cur_speaker,
                              content = cur_content)
                    speeches <- c(speeches, list(speech))
                    cur_content <- ""
                } 
                if (is.na(principal_speaker) && xml_name(node) != "name") {
                    principal_speaker <- xml_child(node) %>% xml_attr("id")
                }
                if (xml_name(node) == "name") {
                    cur_speaker <- "BTP"
                } else {
                    cur_speaker <- xml_child(node) %>% xml_attr("id")
                }
            } else {
                cur_content <- paste0(cur_content, xml_text(node), sep="\n")
            }
        } else if (xml_name(node) == "kommentar") {
            # comments are of the form
            # <kommentar>(blabla [Fraktion] – blabla liasdf – bla)</kommentar>
            xml_text(node) %>%
                str_sub(2, -2) %>%
                str_split("–") %>%
                `[[`(1) %>%
                lapply(parse_comment, speech_id = speech_id, on_speaker = cur_speaker) ->
                cs
            comments <- c(comments, cs)
        }
    }
    speech <- c(speech_id = speech_id,
              speaker = cur_speaker,
              content = cur_content)
    speeches <- c(speeches, list(speech))
    list(speech = c(id = speech_id, speaker = principal_speaker, date = date),
         parts = speeches,
         comments = comments)
}

fractionpattern <- "B\u00DCNDNIS(SES)?\\W*90/DIE\\W*GR\u00DCNEN|CDU/CSU|AfD|SPD|DIE LINKE|FDP|LINKEN"
fractionnames <- c("B\u00DCNDNIS 90/DIE GR\u00DCNEN", "CDU/CSU", "AfD", "SPD", "DIE LINKE", "FDP",
                   "Fraktionslos")

parse_comment <- function(comment, speech_id, on_speaker) {
    base <- c(speech_id = speech_id, on_speaker = on_speaker)
    # classify comment
    if(str_detect(comment, "Beifall")) {
        str_extract_all(comment, fractionpattern) %>%
            `[[`(1) %>%
            sapply(partial(flip(head), 1) %.% agrep, x=fractionnames, max=0.2, value=T) %>%
            str_c(collapse=",") ->
            by
        c(base, type = "applause", fraction = by, commenter = NA_character_, content = comment)
    } else {
        ps <- str_match(comment, "(.*) \\[(.*?)\\]: (.*)")[1,]
        fraction <- agrep(ps[3], fractionnames, max=0.2, value=T)
        if (all(is.na(fraction)) || length(fraction) == 0) fraction <- NA_character_
        c(base, type = "comment", fraction = fraction, commenter = ps[2], content = ps[4])
    }
}

# creates a tibble of speeches and a tibble of talks from a list of xml nodes representing speeches
parse_speechlist <- function(speechlist_xml, date) {
    d <- sapply(speechlist_xml, parse_speech, date = date)
    speeches <- simplify2array(d["speech", ])
    parts <- simplify2array %$% unlist(d["parts", ], recursive=FALSE)
    comments <- simplify2array %$% unlist(d["comments", ], recursive=FALSE)
    list(speeches = tibble(id = speeches["id",], speaker = speeches["speaker",],
                        date = speeches["date",]),
         talks = tibble(speech_id = parts["speech_id", ],
                        speaker = parts["speaker", ],
                        content = parts["content", ]),
         comments = tibble(speech_id = comments["speech_id",],
                           on_speaker = comments["on_speaker",],
                           type = comments["type",],
                           fraction = comments["fraction",],
                           commenter = comments["commenter",],
                           content = comments["content", ]))
}

# create a tibble of speaker from a list of xml nodes representing speaker
parse_speakerlist <- function(speakerliste_xml) {
    d <- sapply(speakerliste_xml, parse_speaker)
    tibble(id = d["id",],
           prename = d["prename",],
           lastname = d["lastname",],
           fraction = d["fraction",],
           title = d["title",],
           role_short = d["role_short",],
           role_long = d["role_long",])
}

#' Write the parsed and repaired results into separate csv files
#' 
#' @param tables list of tables to convert into a csv files.
#' @param path where to put the csv files.
#' @param create set TRUE if the path does not exist yet and you want to create it
#' 
#' @export
write_to_csv <- function(tables, path="inst/csv/", create=F) {
    check_directory(path, create)
    write.table(tables$speaker, str_c(path, "speaker.csv"))
    write.table(tables$speeches, str_c(path, "speeches.csv"))
    write.table(tables$talks, str_c(path, "talks.csv"))
    write.table(tables$comments, str_c(path, "comments.csv"))
    write.table(tables$applause, str_c(path, "applause.csv"))
}


#' create a tibble from the csv file
#'
#' @param path directory to read files from
#'
#' reading the tables from a csv is way faster than reading and repairing the data every single time
#' 
#' @export
read_from_csv <- function(path="inst/csv/") {
    list(speaker = read.table(str_c(path, "speaker.csv")) %>%
             tibble() %>%
             mutate(id = as.character(id)),
         speeches = read.table(str_c(path, "speeches.csv")) %>%
             tibble() %>%
             mutate(speaker = as.character(speaker),
                    date = as.Date(date)),
         talks = tibble %$% read.table(str_c(path, "talks.csv")),
         comments = tibble %$% read.table(str_c(path, "comments.csv")),
         applause = tibble %$% read.table(str_c(path, "applause.csv")))
}