diff --git a/R/parse.R b/R/parse.R index 7785995..1abf1d0 100644 --- a/R/parse.R +++ b/R/parse.R @@ -39,7 +39,7 @@ read_all <- function(path="inst/records/") { select(-type) -> comments filter(commentsandapplause, type == "applause") %>% - select(-type, -kommentator, -content) %>% + select(-type, -commenter, -content) %>% mutate("CDU_CSU" = str_detect(fraction, "CDU/CSU"), "SPD" = str_detect(fraction, "SPD"), "FDP" = str_detect(fraction, "FDP"), @@ -86,17 +86,17 @@ xml_get <- function(node, name) { parse_speaker <- function(speaker_xml) { speaker_id <- xml_attr(speaker_xml, "id") nm <- xml_child(speaker_xml) - vorname <- xml_get(nm, "vorname") - nachname <- xml_get(nm, "nachname") + prename <- xml_get(nm, "vorname") + lastname <- xml_get(nm, "nachname") fraction <- xml_get(nm, "fraktion") - titel <- xml_get(nm, "titel") - rolle <- xml_find_all(nm, "rolle") - if (length(rolle) > 0) { - rolle_lang <- xml_get(rolle, "rolle_lang") - rolle_kurz <- xml_get(rolle, "rolle_kurz") - } else rolle_kurz <- rolle_lang <- NA_character_ - c(id = speaker_id, vorname = vorname, nachname = nachname, fraction = fraction, titel = titel, - rolle_kurz = rolle_kurz, rolle_lang = rolle_lang) + title <- xml_get(nm, "titel") + role <- xml_find_all(nm, "rolle") + if (length(role) > 0) { + role_long <- xml_get(role, "rolle_lang") + role_short <- xml_get(role, "rolle_kurz") + } else role_short <- role_long <- NA_character_ + c(id = speaker_id, prename = prename, lastname = lastname, fraction = fraction, title = title, + role_short = role_short, role_long = role_long) } # parse one speech @@ -165,10 +165,10 @@ parse_comment <- function(comment, speech_id, on_speaker) { sapply(partial(flip(head), 1) %.% agrep, x=fractionnames, max=0.2, value=T) %>% str_c(collapse=",") -> by - c(base, type = "applause", fraction = by, kommentator = NA_character_, content = comment) + c(base, type = "applause", fraction = by, commenter = NA_character_, content = comment) } else { ps <- str_match(comment, "(.*) \\[(.*?)\\]: (.*)")[1,] - c(base, type = "comment", fraction = ps[3], kommentator = ps[2], content = ps[4]) + c(base, type = "comment", fraction = ps[3], commenter = ps[2], content = ps[4]) } } @@ -187,7 +187,7 @@ parse_speechlist <- function(speechlist_xml, date) { on_speaker = comments["on_speaker",], type = comments["type",], fraction = comments["fraction",], - kommentator = comments["kommentator",], + commenter = comments["commenter",], content = comments["content", ])) } @@ -195,12 +195,12 @@ parse_speechlist <- function(speechlist_xml, date) { parse_speakerlist <- function(speakerliste_xml) { d <- sapply(speakerliste_xml, parse_speaker) tibble(id = d["id",], - vorname = d["vorname",], - nachname = d["nachname",], + prename = d["prename",], + lastname = d["lastname",], fraction = d["fraction",], - titel = d["titel",], - rolle_kurz = d["rolle_kurz",], - rolle_lang = d["rolle_lang",]) + title = d["title",], + role_short = d["role_short",], + role_long = d["role_long",]) } #' Write the parsed and repaired results into separate csv files diff --git a/R/repair.R b/R/repair.R index 07e43f0..b1fb8b8 100644 --- a/R/repair.R +++ b/R/repair.R @@ -29,15 +29,13 @@ repair_speaker <- function(speaker) { filter(id != "10000") %>% # invalid id's mutate(fraction = Vectorize(repair_fraction)(fraction)) %>% # fix fraction group_by(id) %>% - summarize(vorname = head(vorname, 1), - nachname = head(nachname, 1), + summarize(prename = head(prename, 1), + lastname = head(lastname, 1), fraction = collect_unique(fraction), - titel = longest_titel(titel), - rolle_kurz = collect_unique(str_squish(rolle_kurz)), - rolle_lang = collect_unique(str_squish(rolle_lang))) %>% + title = longest_titel(title), + role_short = collect_unique(str_squish(role_short)), + role_long = collect_unique(str_squish(role_long))) %>% ungroup() #%>% - # arrange(id) %>% - # distinct(vorname, nachname, fraction, titel) } repair_speeches <- function(speeches) { @@ -68,7 +66,7 @@ repair_talks <- function(talks) { lookup_speaker <- function(tb, speaker, name_variable) { tobereplaced <- "[-–—‑­­-­­­ ]" speaker %>% - unite(name, vorname, nachname, sep=".*") %>% + unite(name, prename, lastname, sep=".*") %>% mutate(name = str_replace_all(name, tobereplaced, ".*")) -> rs find_match <- function(komm) { @@ -88,10 +86,10 @@ repair_comments <- function(comments, speaker) { "Use repair(, repair_commments = FALSE) to skip this.\n")) # try to find a speaker id for each actual comment comments %>% - filter(!is.na(kommentator)) %>% - lookup_speaker(speaker, kommentator) %>% - left_join(comments, ., by="kommentator") %>% - select(-kommentator) + filter(!is.na(commenter)) %>% + lookup_speaker(speaker, commenter) %>% + left_join(comments, ., by="commenter") %>% + select(-commenter) } #' Repair parsed tables diff --git a/inst/reports/implementierung.pdf b/inst/reports/implementierung.pdf index f35108c..5ad94f2 100644 Binary files a/inst/reports/implementierung.pdf and b/inst/reports/implementierung.pdf differ diff --git a/inst/reports/implementierung.tex b/inst/reports/implementierung.tex index bfc9c18..8a627e5 100644 --- a/inst/reports/implementierung.tex +++ b/inst/reports/implementierung.tex @@ -99,7 +99,7 @@ Funktion: \lstinline{fetch_all(download_dir)} \begin{lstlisting}[language=R,basicstyle=\tiny\ttfamily] > res$comments # A tibble: 83,649 x 5 - speech_id on_speaker fraction commentator content + speech_id on_speaker fraction commenter content 1 ID19100300 11003218 BUENDNIS 90/D Katrin Goering Was? 2 ID19100300 11003218 CDU/CSU Volker Kauder Warum habt ihr das bei Ge