From a7e9ba9655224db25c21086bd0176d0a78917c9a Mon Sep 17 00:00:00 2001
From: flavis
Date: Thu, 24 Jun 2021 19:57:26 +0200
Subject: [PATCH] combine duplicate redner by collapsing all data

---
Review notes (free-form text between the "---" marker and the first
"diff --git" line is not part of the commit message or of the diff):

 * This patch was restored from a copy in which all newlines and
   indentation had been collapsed. Indentation of context lines is a
   reconstruction (TODO confirm against the target files); if it does
   not match, apply with `git apply --ignore-whitespace`.
 * repair_redner() now passes `.groups = "drop"` to summarize(). Without
   it the result of group_by(id, vorname, nachname) %>% summarize(...)
   is still grouped by id and vorname. This adds one line, so the hunk
   headers and the diffstat below were adjusted; the blob ids on the
   "index" line are those of the original commit.

 scraping/repair.R | 22 ++++++++++++++++++----
 utils/helpers.R   |  2 ++
 2 files changed, 20 insertions(+), 4 deletions(-)

diff --git a/scraping/repair.R b/scraping/repair.R
index ef2d5fa..76d1f21 100644
--- a/scraping/repair.R
+++ b/scraping/repair.R
@@ -8,16 +8,29 @@ fraktionen <- c("AFD" = "AfD",
                 "CDU/CSU" = "CDU/CSU",
                 "FDP" = "FDP")
 
-
-# expects a tibble of redner and repairs
 repair_fraktion <- function(fraktion) {
   cleaned <- str_to_upper %$% str_replace_all(fraktion, "\\s", "")
   fraktionen[cleaned]
 }
 
+# takes vector of titel and keeps longest
+longest_titel <- function(titel) {
+  if (all(is.na(titel))) NA_character_
+  else titel[which.max %$% str_length(titel)]
+}
+
+# takes character vector, removes duplicates and collapses
+collect_unique <- function(xs) xs %>% clear_na() %>% unique() %>% str_c(collapse="&") %>% na_if("")
+
+# expects a tibble of redner and repairs
 repair_redner <- function(redner) {
-  # fix fraktionsnames
-  redner %>% mutate(fraktion = Vectorize(repair_fraktion)(fraktion))
+  redner %>% mutate(fraktion = Vectorize(repair_fraktion)(fraktion)) %>% # fix fraktion
+    group_by(id, vorname, nachname) %>%
+    summarize(fraktion = collect_unique(fraktion),
+              titel = longest_titel(titel),
+              rolle_kurz = collect_unique(str_squish(rolle_kurz)),
+              rolle_lang = collect_unique(str_squish(rolle_lang)),
+              .groups = "drop") # return an ungrouped tibble
 }
 
 repair_reden <- function(reden) {
@@ -36,3 +49,4 @@ repair <- function(parse_output) {
        reden = repair_reden(parse_output$reden),
        talks = repair_talks(parse_output$talks))
 }
+
diff --git a/utils/helpers.R b/utils/helpers.R
index ed6a09c..148507f 100644
--- a/utils/helpers.R
+++ b/utils/helpers.R
@@ -1,2 +1,4 @@
 `%$%` <- function(f, x) f(x)
 `%.%` <- function(f, g) function(...) f(g(...))
+
+clear_na <- function(xs) xs[!is.na(xs)]