Kaynağa Gözat

Rename some columns to English

genderequality-alternative
flavis 4 yıl önce
ebeveyn
işleme
4649658fa7
4 değiştirilmiş dosya ile 30 ekleme ve 32 silme
  1. +19
    -19
      R/parse.R
  2. +10
    -12
      R/repair.R
  3. BIN
      inst/reports/implementierung.pdf
  4. +1
    -1
      inst/reports/implementierung.tex

+ 19
- 19
R/parse.R Dosyayı Görüntüle

@@ -39,7 +39,7 @@ read_all <- function(path="inst/records/") {
select(-type) -> select(-type) ->
comments comments
filter(commentsandapplause, type == "applause") %>% filter(commentsandapplause, type == "applause") %>%
select(-type, -kommentator, -content) %>%
select(-type, -commenter, -content) %>%
mutate("CDU_CSU" = str_detect(fraction, "CDU/CSU"), mutate("CDU_CSU" = str_detect(fraction, "CDU/CSU"),
"SPD" = str_detect(fraction, "SPD"), "SPD" = str_detect(fraction, "SPD"),
"FDP" = str_detect(fraction, "FDP"), "FDP" = str_detect(fraction, "FDP"),
@@ -86,17 +86,17 @@ xml_get <- function(node, name) {
parse_speaker <- function(speaker_xml) { parse_speaker <- function(speaker_xml) {
speaker_id <- xml_attr(speaker_xml, "id") speaker_id <- xml_attr(speaker_xml, "id")
nm <- xml_child(speaker_xml) nm <- xml_child(speaker_xml)
vorname <- xml_get(nm, "vorname")
nachname <- xml_get(nm, "nachname")
prename <- xml_get(nm, "vorname")
lastname <- xml_get(nm, "nachname")
fraction <- xml_get(nm, "fraktion") fraction <- xml_get(nm, "fraktion")
titel <- xml_get(nm, "titel")
rolle <- xml_find_all(nm, "rolle")
if (length(rolle) > 0) {
rolle_lang <- xml_get(rolle, "rolle_lang")
rolle_kurz <- xml_get(rolle, "rolle_kurz")
} else rolle_kurz <- rolle_lang <- NA_character_
c(id = speaker_id, vorname = vorname, nachname = nachname, fraction = fraction, titel = titel,
rolle_kurz = rolle_kurz, rolle_lang = rolle_lang)
title <- xml_get(nm, "titel")
role <- xml_find_all(nm, "rolle")
if (length(role) > 0) {
role_long <- xml_get(role, "rolle_lang")
role_short <- xml_get(role, "rolle_kurz")
} else role_short <- role_long <- NA_character_
c(id = speaker_id, prename = prename, lastname = lastname, fraction = fraction, title = title,
role_short = role_short, role_long = role_long)
} }


# parse one speech # parse one speech
@@ -165,10 +165,10 @@ parse_comment <- function(comment, speech_id, on_speaker) {
sapply(partial(flip(head), 1) %.% agrep, x=fractionnames, max=0.2, value=T) %>% sapply(partial(flip(head), 1) %.% agrep, x=fractionnames, max=0.2, value=T) %>%
str_c(collapse=",") -> str_c(collapse=",") ->
by by
c(base, type = "applause", fraction = by, kommentator = NA_character_, content = comment)
c(base, type = "applause", fraction = by, commenter = NA_character_, content = comment)
} else { } else {
ps <- str_match(comment, "(.*) \\[(.*?)\\]: (.*)")[1,] ps <- str_match(comment, "(.*) \\[(.*?)\\]: (.*)")[1,]
c(base, type = "comment", fraction = ps[3], kommentator = ps[2], content = ps[4])
c(base, type = "comment", fraction = ps[3], commenter = ps[2], content = ps[4])
} }
} }


@@ -187,7 +187,7 @@ parse_speechlist <- function(speechlist_xml, date) {
on_speaker = comments["on_speaker",], on_speaker = comments["on_speaker",],
type = comments["type",], type = comments["type",],
fraction = comments["fraction",], fraction = comments["fraction",],
kommentator = comments["kommentator",],
commenter = comments["commenter",],
content = comments["content", ])) content = comments["content", ]))
} }


@@ -195,12 +195,12 @@ parse_speechlist <- function(speechlist_xml, date) {
parse_speakerlist <- function(speakerliste_xml) { parse_speakerlist <- function(speakerliste_xml) {
d <- sapply(speakerliste_xml, parse_speaker) d <- sapply(speakerliste_xml, parse_speaker)
tibble(id = d["id",], tibble(id = d["id",],
vorname = d["vorname",],
nachname = d["nachname",],
prename = d["prename",],
lastname = d["lastname",],
fraction = d["fraction",], fraction = d["fraction",],
titel = d["titel",],
rolle_kurz = d["rolle_kurz",],
rolle_lang = d["rolle_lang",])
title = d["title",],
role_short = d["role_short",],
role_long = d["role_long",])
} }


#' Write the parsed and repaired results into separate csv files #' Write the parsed and repaired results into separate csv files


+ 10
- 12
R/repair.R Dosyayı Görüntüle

@@ -29,15 +29,13 @@ repair_speaker <- function(speaker) {
filter(id != "10000") %>% # invalid id's filter(id != "10000") %>% # invalid id's
mutate(fraction = Vectorize(repair_fraction)(fraction)) %>% # fix fraction mutate(fraction = Vectorize(repair_fraction)(fraction)) %>% # fix fraction
group_by(id) %>% group_by(id) %>%
summarize(vorname = head(vorname, 1),
nachname = head(nachname, 1),
summarize(prename = head(prename, 1),
lastname = head(lastname, 1),
fraction = collect_unique(fraction), fraction = collect_unique(fraction),
titel = longest_titel(titel),
rolle_kurz = collect_unique(str_squish(rolle_kurz)),
rolle_lang = collect_unique(str_squish(rolle_lang))) %>%
title = longest_titel(title),
role_short = collect_unique(str_squish(role_short)),
role_long = collect_unique(str_squish(role_long))) %>%
ungroup() #%>% ungroup() #%>%
# arrange(id) %>%
# distinct(vorname, nachname, fraction, titel)
} }


repair_speeches <- function(speeches) { repair_speeches <- function(speeches) {
@@ -68,7 +66,7 @@ repair_talks <- function(talks) {
lookup_speaker <- function(tb, speaker, name_variable) { lookup_speaker <- function(tb, speaker, name_variable) {
tobereplaced <- "[-–—‑­­-­­­ ]" tobereplaced <- "[-–—‑­­-­­­ ]"
speaker %>% speaker %>%
unite(name, vorname, nachname, sep=".*") %>%
unite(name, prename, lastname, sep=".*") %>%
mutate(name = str_replace_all(name, tobereplaced, ".*")) -> mutate(name = str_replace_all(name, tobereplaced, ".*")) ->
rs rs
find_match <- function(komm) { find_match <- function(komm) {
@@ -88,10 +86,10 @@ repair_comments <- function(comments, speaker) {
"Use repair(, repair_commments = FALSE) to skip this.\n")) "Use repair(, repair_commments = FALSE) to skip this.\n"))
# try to find a speaker id for each actual comment # try to find a speaker id for each actual comment
comments %>% comments %>%
filter(!is.na(kommentator)) %>%
lookup_speaker(speaker, kommentator) %>%
left_join(comments, ., by="kommentator") %>%
select(-kommentator)
filter(!is.na(commenter)) %>%
lookup_speaker(speaker, commenter) %>%
left_join(comments, ., by="commenter") %>%
select(-commenter)
} }


#' Repair parsed tables #' Repair parsed tables


BIN
inst/reports/implementierung.pdf Dosyayı Görüntüle


+ 1
- 1
inst/reports/implementierung.tex Dosyayı Görüntüle

@@ -99,7 +99,7 @@ Funktion: \lstinline{fetch_all(download_dir)}
\begin{lstlisting}[language=R,basicstyle=\tiny\ttfamily] \begin{lstlisting}[language=R,basicstyle=\tiny\ttfamily]
> res$comments > res$comments
# A tibble: 83,649 x 5 # A tibble: 83,649 x 5
speech_id on_speaker fraction commentator content
speech_id on_speaker fraction commenter content
<chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
1 ID19100300 11003218 BUENDNIS 90/D Katrin Goering Was? 1 ID19100300 11003218 BUENDNIS 90/D Katrin Goering Was?
2 ID19100300 11003218 CDU/CSU Volker Kauder Warum habt ihr das bei Ge 2 ID19100300 11003218 CDU/CSU Volker Kauder Warum habt ihr das bei Ge


Yükleniyor…
İptal
Kaydet