From 4649658fa73257c49faafc92656f618f75196878 Mon Sep 17 00:00:00 2001 From: flavis Date: Mon, 9 Aug 2021 15:31:20 +0200 Subject: [PATCH] rename some columns to english --- R/parse.R | 38 +++++++++++++++---------------- R/repair.R | 22 ++++++++---------- inst/reports/implementierung.pdf | Bin 127501 -> 127490 bytes inst/reports/implementierung.tex | 2 +- 4 files changed, 30 insertions(+), 32 deletions(-) diff --git a/R/parse.R b/R/parse.R index 7785995..1abf1d0 100644 --- a/R/parse.R +++ b/R/parse.R @@ -39,7 +39,7 @@ read_all <- function(path="inst/records/") { select(-type) -> comments filter(commentsandapplause, type == "applause") %>% - select(-type, -kommentator, -content) %>% + select(-type, -commenter, -content) %>% mutate("CDU_CSU" = str_detect(fraction, "CDU/CSU"), "SPD" = str_detect(fraction, "SPD"), "FDP" = str_detect(fraction, "FDP"), @@ -86,17 +86,17 @@ xml_get <- function(node, name) { parse_speaker <- function(speaker_xml) { speaker_id <- xml_attr(speaker_xml, "id") nm <- xml_child(speaker_xml) - vorname <- xml_get(nm, "vorname") - nachname <- xml_get(nm, "nachname") + prename <- xml_get(nm, "vorname") + lastname <- xml_get(nm, "nachname") fraction <- xml_get(nm, "fraktion") - titel <- xml_get(nm, "titel") - rolle <- xml_find_all(nm, "rolle") - if (length(rolle) > 0) { - rolle_lang <- xml_get(rolle, "rolle_lang") - rolle_kurz <- xml_get(rolle, "rolle_kurz") - } else rolle_kurz <- rolle_lang <- NA_character_ - c(id = speaker_id, vorname = vorname, nachname = nachname, fraction = fraction, titel = titel, - rolle_kurz = rolle_kurz, rolle_lang = rolle_lang) + title <- xml_get(nm, "titel") + role <- xml_find_all(nm, "rolle") + if (length(role) > 0) { + role_long <- xml_get(role, "rolle_lang") + role_short <- xml_get(role, "rolle_kurz") + } else role_short <- role_long <- NA_character_ + c(id = speaker_id, prename = prename, lastname = lastname, fraction = fraction, title = title, + role_short = role_short, role_long = role_long) } # parse one speech @@ -165,10 +165,10 @@ parse_comment <- function(comment, speech_id, on_speaker) { sapply(partial(flip(head), 1) %.% agrep, x=fractionnames, max=0.2, value=T) %>% str_c(collapse=",") -> by - c(base, type = "applause", fraction = by, kommentator = NA_character_, content = comment) + c(base, type = "applause", fraction = by, commenter = NA_character_, content = comment) } else { ps <- str_match(comment, "(.*) \\[(.*?)\\]: (.*)")[1,] - c(base, type = "comment", fraction = ps[3], kommentator = ps[2], content = ps[4]) + c(base, type = "comment", fraction = ps[3], commenter = ps[2], content = ps[4]) } } @@ -187,7 +187,7 @@ parse_speechlist <- function(speechlist_xml, date) { on_speaker = comments["on_speaker",], type = comments["type",], fraction = comments["fraction",], - kommentator = comments["kommentator",], + commenter = comments["commenter",], content = comments["content", ])) } @@ -195,12 +195,12 @@ parse_speechlist <- function(speechlist_xml, date) { parse_speakerlist <- function(speakerliste_xml) { d <- sapply(speakerliste_xml, parse_speaker) tibble(id = d["id",], - vorname = d["vorname",], - nachname = d["nachname",], + prename = d["prename",], + lastname = d["lastname",], fraction = d["fraction",], - titel = d["titel",], - rolle_kurz = d["rolle_kurz",], - rolle_lang = d["rolle_lang",]) + title = d["title",], + role_short = d["role_short",], + role_long = d["role_long",]) } #' Write the parsed and repaired results into separate csv files diff --git a/R/repair.R b/R/repair.R index 07e43f0..b1fb8b8 100644 --- a/R/repair.R +++ b/R/repair.R @@ -29,15 +29,13 @@ repair_speaker <- function(speaker) { filter(id != "10000") %>% # invalid id's mutate(fraction = Vectorize(repair_fraction)(fraction)) %>% # fix fraction group_by(id) %>% - summarize(vorname = head(vorname, 1), - nachname = head(nachname, 1), + summarize(prename = head(prename, 1), + lastname = head(lastname, 1), fraction = collect_unique(fraction), - titel = longest_titel(titel), - rolle_kurz = collect_unique(str_squish(rolle_kurz)), - rolle_lang = collect_unique(str_squish(rolle_lang))) %>% + title = longest_titel(title), + role_short = collect_unique(str_squish(role_short)), + role_long = collect_unique(str_squish(role_long))) %>% ungroup() #%>% - # arrange(id) %>% - # distinct(vorname, nachname, fraction, titel) } repair_speeches <- function(speeches) { @@ -68,7 +66,7 @@ repair_talks <- function(talks) { lookup_speaker <- function(tb, speaker, name_variable) { tobereplaced <- "[-–—‑­­-­­­ ]" speaker %>% - unite(name, vorname, nachname, sep=".*") %>% + unite(name, prename, lastname, sep=".*") %>% mutate(name = str_replace_all(name, tobereplaced, ".*")) -> rs find_match <- function(komm) { @@ -88,10 +86,10 @@ repair_comments <- function(comments, speaker) { "Use repair(, repair_commments = FALSE) to skip this.\n")) # try to find a speaker id for each actual comment comments %>% - filter(!is.na(kommentator)) %>% - lookup_speaker(speaker, kommentator) %>% - left_join(comments, ., by="kommentator") %>% - select(-kommentator) + filter(!is.na(commenter)) %>% + lookup_speaker(speaker, commenter) %>% + left_join(comments, ., by="commenter") %>% + select(-commenter) } #' Repair parsed tables diff --git a/inst/reports/implementierung.pdf b/inst/reports/implementierung.pdf index f35108c8386b545a42e8d3ef4fa2abf3fc447554..5ad94f2fc3a695fd0b038239e57d3f21ef2ccf0f 100644 GIT binary patch delta 5020 zcmai#WmFX0_QoBgr9lKq>1G&mNU5PahlZh~hDLG-ks1kU7*Zrf1f)Ae!l5OHP)ejk zO8T$wdhdJhhyPvo!&%RH_I}n``@=r#_hKC{avkrTr9GCAXljZ%S89r@Faj&0JTMEE z0lYIotdVhoPj{Q2T z*~eko{;{RO8@PQ5x0eBa{w9GW$E4;dIoC3}{9q+L3S2ABd9UDbC|QEG?|ewisVE)Y zCkv&hoz;ITshYfGx0!)3#k83@UuP2TK{`!oZyV_Sk4T0ZV#F}x<5o+y)>E`C#p_iF zrMO;=_sbd^EwSv|l$1_sMHvu|G5pv3muQt5QCjDYlhRLR>q%VIUO1^zNqmpSjyR?w z^`>{!s^YPE%!g8n1Cyi+sQE#q<=M)7*`WJ138g{SX_oYXMDX4uy%;LKQ4o6%Sg|+cY&&dE{-QkUjU``;hyL0tBj2(-$QM{7%-5kq#nD8}d-uFr0wSk7lG z%Xlv4B-Q5=f6VTwd6t4->9~+Iaq5XNQ3v{c(6~`+^t^3$p^NW-H|87s((Ndt(AAM< z3@F?Gi+EMV=3*LFofcby9fLe z)qPC^G{LGyt1}Rz4{xL} z)4YJBpAEW_5|aIdEBMkmEL?EqgppyD>cBYD9)Ez-q`f2yh*FRT5B<8oo( zsYCXRqWE3J6484Q8n7yN|G78NWtRTYgVE%Bcr-dfH?fCm=_+ZW8HCPVqY_DPej0fM zt{Z>c3nm(BT=q*8S`{t?6;5_Y()#F$oywO-ISS(EY}SpZ+$glM&GefSPW1N?$Ag{= za0G3ZvSTfh4>sU`%Ngfw!XJJX-#tx{Wkb9+pSAE5(RA^>`Yn_4<;QVM*`8v3)HoN? zKr{mPIDhBNk?NMY>+xVsh2|iwHd_7d{917DJ2BtQ z4bw`Dq&PX7`TOpk2VWG~)&MpE-1$Z(Yjk4j;%AjnQ17T!HE>odczKg35pCW4iNv#3 zFM}I~$XO8}GWQ!h#+j;N5DneaOPvt!>ifjd?W9oz40=TMI49qgbTn`P3U@o=l)i4{ zk-Ig_WVcq}d8FA&I>p&-a-?K7DS0UiGSaj`YHF;}-uuEEK;acb9zyUdH>A?O@#?_gsb|qthj>+wB8XNXRXb~&?Ks#jx=@a?iYj7eK+AYEFYY7P+ z+Q{W#WZWJOXhwdboesSB6!&R!OV)V3zn!G39MTH=ysb^Hod@R1*O7!ORv~MucZdku zF?M;NFM}HGe`<{x$tH>%xcoXuN)+^L0_M~)U}_oQg!%j{VDL16rog!2mU12g(XGd)ntdFR;>Gr9T2wA0~UD?R{7Tc`H`W>&H$#p z4+h;%;cd=WloG5Gf1OEW&RE0*36f(XV~u6|Nr}mzMcEpvtKfPee^z{0>BwX1zITvC z10BO9Sr>8#&7Ko_rkooZob9-Bj1H^J4UNeaJ>|dE@;IfuP2zexyGu%Z6JB5uvFRe^ zenEMI;Gt1vNtJcw#u5_$8#mNs?97D<$o6xTg33 zuUFN!-Q$&)0^SnO-iPf72~1FH|1r9=+PfG}Tgoeljaskm-^}r!CwA+kSjd`TW7l+r zkxk9d;MF!L@k5j-hD*NnJN57}DY-T1fF^=ev)Cq`oGm1q25bzU7_~2Ti6Snz^xcj<6XhpKx5_Qyn-?pB)VMpyqN@ zqsM;PtD=x!uIzK`$A>p<3;5#ZT)Cwg*akg>d^Gs>L!r*sM$!JP)^CuvW+ZKaG3yJ8 zbRN#mU6?<3I)@Cs{s&Lz6dIG99HU%{{4X@~$<@tIbCaxGLE}L?TG={?AtpVuHA0R6 zMyNvmmR2dwtMb`Y_+oer*$R#~huw-`gB zrA!I!gR%Iwog-G%wp&C z9eVJHIZskWZzoT8K8O|}##{bI=gTOe)0ZeU9^hU$H%{50>6^#>sdGG*=0@d)?y9?} z+PMZ_0v|ui_BHaJqJw-I4U&6_xGhJW^<92D`6N^hB{MfgpE`Aj=*y2ri@I#jLrWJ^ z?Q;XLErt7@K0gEuzyWetQ*diAtOpaPXCMB6(j(fP(`t+xG}dm@#-49g!8E$@urF>a z)yCZ?7*<+tljD*IuqK}luLJ|6B}~|Ac^8hhUYrrt#l@D+=W}J|6aWwL>pw_XLN>cH z53|h4QRjpH6BUTNuEu%5VNJ#PMU^9>&XAR0-*AI&ai7Sbu1-@0mk~X}%D@iI8Z0K8 zpS_ELQJeFb4rZiFM83k~!ylwR8=S&L`9#nZS$s_w?_1I}E)XYrn4utY z=owXvEb3Jn!@O6rk-}=B2BZ^ocqbHUmg8&kM-a>a^IufF2&mG48--ErF)1E}H21J_4Dw>r*cu+XEP=-AIXnt!A&i3o8no_wSq$H8UWYU?ELNenW3G<19hn z!gzKj3{l~|gHcL$o4~?*(5s?4kQr{NCta<+X8wmyRe$lx>kpq6{^AptqoCd#*GQTW z8vfvUb*1oi^!99GPUmU)(6`(b+fB>=f~U8wVK)`=><=ELlW{D?Y*Vi81hViy$qQx5 z=0R`(A*HVmkVT%*&AcOF~>K-B;#5&6EeJkhcf4OFM?mfnMCB!sefV0iv5b<;U7*N z-)YYKn^TxnQcow@fnVeUQmM{fj9gudmj&;IP)E{I&)VB&ukz=W2aOo`X4=4cauX-hS)AnTlImwRGg(OIS z8xZgfvJf48v1`^cmr52r9tM2Hs% z;sgRYxd{ce;Py(kz7EV>N|GQTNC+qa1Ph7&i8M8C0H|;jA@X35h=LdxqyPj1Ma9JB zMU}d5F4? zE1N+QijIjT7{~L)@|b%2)8oD#DMap?tq!8G%PrL{gTKz`;^C1)1YeqRooU9PK9pm` zA^J_4QJvY$pc#~N$05osEf4(q!?nA6>Rw%+%1?DGNzBbIn6xISs;#|=cKH*XP6&&W zL(^_bN1e{;2ms6#W6vD35lf#ALwH+g52}IL%Qle6A)NX2DP+xsUS5Eq?9_$T1wh6p z`5G3)qOf@G<UZ4H#+Kj2Y7%WkB(JPdQ|+wqhDA@hNT#nkA5DzCJ^6_>WITt<^HX+s@ zBVWj@OU8Bm?LSI@x5qu!!%seD&+D?p;!O*Tp- zuO=akjn^aZNk;$w%Cqf{$Jw^KCZS99qj)tN)ilo1k)HL(48D18*n(f^$-}Xk0jhT> zehcD{h9dr50I1rKZ~3%dl`?&XC7h*vq0u(#(U1tlrt5*cBX|2-`h$bjUOFf_#*U1} zKtblMv7F!~*~!~Lkus)^9rOROcUpBun;Rl0Yn|)V)=`MCl0}D@U8>Af(l&>r-Sp5p zYmz~EFxy=)pUQeUed?OGg7z@&e8!a1?8Voz*9DQI!(gyr!7xq>SLS-1MM{;o*4vRk z-^N?RV}zlFm(-rGj0t(>cc5W1izoL;XT3r?bqmC8@KeERm^Y9gzFyrNV?u>J?UI{-%ZW8BaY9+ey)X8vZ+6Q+r3YduMiuppXTlEvG+MZdiiPN*ZO)Zb z_F8Ah=|+4y_uTGr8)xAAFgW0^-0!;x%|jyM=eYk^?1fEKSYx4JqOx%LfIRHg3j*>N zu@EKJ%$x(jL)s{Ro-%8<{e+b-%bE2(<9D8r;v_s*?kl#Jyt zXA5;BvE1YsPd6#wMV!a_3S_c56ar#Osj$Cc_WJS|;dX?Ojcu zA>j4O?Irqa2yl*P(Y~$ufPOd4vd$c1N_sjhJ1>a0UP%8kfhKi0ChuAD>sNyk@tVln@sKU;T_m5w{C7?`j-DpO1OT{W_R+$;8(3j&2LW zLYH)0!Ph5Up(CqRJtk2#xYop-5C5KMM@ifIx~$9Xg5tU>N$P3NQR&#_Og(DsrV1`I zc3yvQRy~A5934^?44;n)UQpBZLBY32A(?Lv0UTtrn^#G2gVEct)EiRiKo!}{^+NzN Pfv^}MJG-(Dgz*0WJhXyZ delta 5034 zcmajgXHXN&x(9FqBE5<+^co>_LXAQw(z_G^DJmW5q1OlqNQVRnC4>$tB1#A8C=z;6 zx`Id$q!|I}aP(Z~%zI|;o%>Em1spxY!=v`d;O8U0OG%zDg6TkvN2g{5x!dcXIBSADB`t}!42%rS z18iGcjcUpjm`xVuQ8Y#xMjE3BZ!>y=KWTn_Oy6ogmsCgQ+r00nv!1VHO;9(&1}<-E zi`L>&-HwDpnH?%+-iHjhA(wt+jrG-Ehdj8;9+NZD_&KQ>>?UQ8vb?S#2&cIdB4J+@ zflBCzXAxkRkpl6{Z-A5wIXkAnS{02F zlifI#=zQs8e~NX`hYClR3cnLI6LG!}A6bM5=mE{$^^3k-WnP)S0+okB~6I$Bk>Tt(^3Myxz{`xHyj{#5=?czL+GTZ#xaS+O?u6epMqOyzqaSS<`( zo6$|>81`E^a3k{hCFak*H4!o;R?M*W%$J5MKMhSuTDi-LygG6pG6tFObIQBxLACr`2OHxTrYM!n#=H{mf}9%3TG11COlL{x(t!% zb&SnAgUro-@d&5XAIz>KJ~gRj!tv46FAD=aCUGWV}CSdTPb`rbK~Kx z;g9Jp9iIwVsn$dKM#0;1+`xc1Fd_spHgMSY>ZTmvjSl+2LR6Egw!FUrU2pCr>(goQLHmw_JUfTjOC-(E7^x zBX#MB%!l>}S!Eu~&T%rmKlQQmh;iZcH9dXL7F|`@8VtdFiCo`Gp@6~FE@HM~|Hp8b zJXwuCa+i6pF!7cWsKK3noW^A>76USpI)6g+{iyXpHak!!Jl(d?toiDUGy zu1Ti0D-PeP41Y{VhKq`eG7A**6aPNc!4m-3A}9Y4uI^vbIVPH*M9dzLC2A*>Y!@3O zzHD>T=?Wx8hymuHdjpVYXfd;713A!=KbXD(N&mU}b6i!m_V}rk%)#wU&JvuJjNwO1 z;P;lALCc>5I*heN^xPgsbPBDr zNNB~;R!^;Ym?1LwxS^_|ru`xGcs1y83)!*9GZsyWOj7+x_bpzSnJ;NkZ*AdX{Z@%d!LjLGwYw8RBN2|za*9A!*?rP-J@Y#P_R9Rn)O-9Vr7fu|Q|!&; zEX*_EwwliEq#Gonp9}G)N*FFH z$(!`=U;MB4Ng=#lpdhyQ{TUMM{=B70qoF)jw(`2$BPT&?zk)!8W(rGJ*5bM3ku|ZX zUc&9#6XoI+vr@-+=zA`yd)IVR*^3`c4T0Y5F+cLeTsCt5s)N* z)mN?(91__CJqIZcdzblhw|$bD=VyAh%7BeJ@a2N{9P;+P3IA0X-zi%3*t+W;{}G#* zzS>VM-Z@u8ai$!ADF0Dq#5tZF*{d|_9sjDyZ#XD6E;^=>R($mtHMMBd(Di=B8 zF7L(Y&+8tejhrNrr=%3ea^&b4GQ?0waY~u9KxTOjUp}<2zExfY!X>N}SK=8WoNK#Z zHxrs*9PptoxgQP=JxrpXr_`15{Wj2sf7DuGx!A95x?I7iIDExlzbto?-?%OR_F|_9 zg`d5*ArSr*&Fc$|+eLehsEOpqN7M?~*OdS8Pv?2_Zb%IIy7|DVc;DjcTuS(B=(#&= zbCnb2&hXos;xj~ign`QnI~a4$1g4f4|6`5bZ)X z9E|*9P0m7yBUK1aJ{Oo=VrO*MTwtsrivyB((7)?8&gYPrXZ02ZF(tP4vkuMkBbA>b zkSQ_0vAMc^ACmstnupoGsi*b#`=R$aI(nVj4hxvQQTuc_Cnu^T7+eg~DNY=8?{yma z6wT#zVnXJ#BI^B2v4Z;KIZ4=nuC%{7=1Bj_wm*P9Z>_uTnQ3bY@k&}`0sp>a3j59Oto@l8}LdyI^ArE4h zPHNunYIC!Ne76-jo;loNR}z)G`fn9N7kmg6Jhag!Kom=Ur{J}-tO?&+TS zh);tI|5_uhK|X2bM|y}Vx@qX7>-&)$8pRfKB+toe^~XxD^Z@!yla=kmO$ba*DdoZm0Z%p4>x4-! zKcVZdrrah80eqdAoP_%iYV}KYvHF0FojyWWONKVdOen_e7*&QX)U}RjwosN=ydgGM z7%)NuT*4-MD+YR(8u`=RnT+vcB2AP7j}yB-Fz4Jy(tVxHnG0@erL`#69zXw04O1p( zH177lsab>lp$4zRQ~QXXrZN69$hO+VZR~@!OwgmS%M0RtX_B8I4m4Rdb?;?+U&;BO z6D;QzZpcFftn0hFHHf^-yCknf(p}7R!+SN?SM35M^>B=e5DHWI_w(4{*GDw5TO=CD zRHz~A-i29T^73Mr@<%22%RrmJ?PA%tuh4w7->Me$ zItJX9zhrC+Ghi}cF*m#cqa+OS*RF(48S1-i#gD6{cm9Tk<_|R1ct|Ltue&wy>8MwE zAn(WCb&p`Iq423=lyiOB_CX$Pe#*gk2DiYTYw7#((3JYtZNt_Ls?WuIZMS+4>%U)x z10vVLHX3fM`y3h0ofY0#Hw&7ee~=ns5n)^4zwi_txELl_eJ5bT`30JO%%Emm=QlY@ zNA@XUC~s~VWwhu+Hkt+Wfv?N|=z_NO_)?&~|`4^fgV-b@8dHF=nW!)IPZd&QKxYdzM1UNR_{dGS?v**h%7o)IX zy!EH;c7t7KyRhXeFz`*s%1zX#TCHCL@L35m7!L&@odcQ8k1$Kc`@(nyooq%1eqk#j zCeywrm;TJ13YuV8ONiOzp5zY^{YHqkVcBsQacbeSAmrI#*b$~)c+VF8AOdayL-Eu2 z`bsW#&}|cQHio-F>n9T7HA;wW%e0taI7n&B3Ob*4r?Du1Q}QgC`j%5%x@kXTg+-Sd zT%MFOWaRewTZ9A+V(y}@gIpU>Mq)w&dBmbZ%|o6}`nS9fl%&ZaABxshE%(Usv|n|) zZCDk#fO%Z3A7N^8*qLL3T0CK-VTygCrp{Y}x!43;Ln3245qC%WVY3C3$DMl)Q58PM zj6#%|hOAN1Dyl_1e+rgM)THE56KP!kSX$EMK0U%Vj%+3=6R$$Y)}-@TK-9G4aiq7R zio1;O3rRlaR7Z-@0&aCNN?!U;*o0(HzTAkQ@bk+9ZCQ`7tLh*Xye|QxH10E5{vQFj zH8i@u_Wm7!3e=O!D@5fIx5T4BNgo~azlP*l&JF40i~5?dfup6YHYTzZ3P!K=j+mF| zlqE2hpER9BfP_lrZjoo;S0;>9D_82V z8AgN`SoAJJReS*4B5JYn+y=dYxaBn<)*^P`s94DmbkZ7BUbPWZn6CBH64!4UavlBC z#;#v4L=;>@NNKw|=^ePJ-+82O&3$l&?YKExxYoq&Zd-zWky!;hTfU79)OW^vL}A2t z0Gu2##k^zAt9!lU9Rk4N>6_wKdp<&O%k z9`(MwB~0T_T-9ax*f8lk03gzLEcM$S`cnkadf@~i2V&#d>bO7t%GFXvlnqFi6sOT! z^5DPA766++Z4lyk*@ioOWPLE9Lsr;Jo8nB)zakkgnk(k#_;Lt+RI36U;Ik6db9^R> zCe*6k8LY=f?!WI?5r*;DF5j?{V1Ijb|CNs1;P3YrVKdj~o#LBcbN%z{3b-SV_v1Sb zRbhNugd7gc6@*D2;filHsJWZ7xsao|lN+1mhp-dx*Ju&I5E5-Ly|A-D+k4YB3Ec)# zhu#16aWfQwO*Ao;KlQH@TAHtNH$%9apVJ*Td#~)BEYj`=oM?Z6)nVz0iQv9R( zz4^R*ItzBTo)ig^G>fT`{LmZyxAX78Iqz%7-q>?A>3^=E{M|=^IR0*O?*L(V@1n+= zHC#*f@Zpb-OI6(#F*TR((RHwe7xb^^U;M%t*p^)E`ckC%* res$comments # A tibble: 83,649 x 5 - speech_id on_speaker fraction commentator content + speech_id on_speaker fraction commenter content 1 ID19100300 11003218 BUENDNIS 90/D Katrin Goering Was? 2 ID19100300 11003218 CDU/CSU Volker Kauder Warum habt ihr das bei Ge