From 01ec0de76fd7ce1186a374c6bc4b0a51ebc7e93a Mon Sep 17 00:00:00 2001 From: flavis Date: Tue, 10 Aug 2021 20:18:23 +0200 Subject: [PATCH] replace non ascii characters by unicode points --- R/parse.R | 4 ++-- R/repair.R | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/R/parse.R b/R/parse.R index 520c211..d7fb267 100644 --- a/R/parse.R +++ b/R/parse.R @@ -153,10 +153,10 @@ parse_speech <- function(speech_xml, date) { } } else if (xml_name(node) == "kommentar") { # comments are of the form - # (blabla [Fraktion] – blabla liasdf – bla) + # (blabla [Fraktion] \u2013 blabla liasdf \u2013 bla) xml_text(node) %>% str_sub(2, -2) %>% - str_split("–") %>% + str_split("\u2013") %>% `[[`(1) %>% lapply(parse_comment, speech_id = speech_id, on_speaker = cur_speaker) -> cs diff --git a/R/repair.R b/R/repair.R index c52db56..01be4a0 100644 --- a/R/repair.R +++ b/R/repair.R @@ -64,7 +64,7 @@ repair_talks <- function(talks) { #' #' returns a lookup table lookup_speaker <- function(tb, speaker, name_variable) { - tobereplaced <- "[-–—‑­­-­­­ ]" + tobereplaced <- "[\u002D\u2013\u2014\u2011\u00AD ]" speaker %>% unite(name, prename, lastname, sep=".*") %>% mutate(name = str_replace_all(name, tobereplaced, ".*")) ->