An R package to analyze the parliamentary records of the 19th legislative period of the Bundestag, the German parliament.
Du kannst nicht mehr als 25 Themen auswählen Themen müssen entweder mit einem Buchstaben oder einer Ziffer beginnen. Sie können Bindestriche („-“) enthalten und bis zu 35 Zeichen lang sein.

243 Zeilen
9.0KB

  1. # for usage see the example at the end
  2. #' Parse xml records
  3. #'
  4. #' Creates a list of tibbles containing relevant information from all records
  5. #' stored in the input directory.
  6. #'
  7. #' @param path character
  8. #'
  9. #' @export
  10. read_all <- function(path="data/records/") {
  11. cat("Reading all records from", path, "\n")
  12. available_protocols <- list.files(path)
  13. res <- pblapply(available_protocols, read_one, path=path)
  14. if (length(available_protocols) == 0)
  15. stop("The given directory is empty or does not exist.")
  16. lapply(res, `[[`, "speaker") %>%
  17. bind_rows() %>%
  18. distinct() ->
  19. speaker
  20. lapply(res, `[[`, "speeches") %>%
  21. bind_rows() %>%
  22. distinct() %>%
  23. mutate(date = as.Date(date, format="%d.%m.%Y")) ->
  24. speeches
  25. lapply(res, `[[`, "talks") %>%
  26. bind_rows() %>%
  27. distinct() ->
  28. talks
  29. lapply(res, `[[`, "comments") %>%
  30. bind_rows() %>%
  31. distinct() ->
  32. commentsandapplause
  33. filter(commentsandapplause, type == "comment") %>%
  34. select(-type) ->
  35. comments
  36. filter(commentsandapplause, type == "applause") %>%
  37. select(-type, -kommentator, -content) %>%
  38. mutate("CDU_CSU" = str_detect(fraction, "CDU/CSU"),
  39. "SPD" = str_detect(fraction, "SPD"),
  40. "FDP" = str_detect(fraction, "FDP"),
  41. "DIE_LINKE" = str_detect(fraction, "DIE LINKE"),
  42. "BUENDNIS_90_DIE_GRUENEN" = str_detect(fraction, "BÜNDNIS 90/DIE GRÜNEN"),
  43. "AfD" = str_detect(fraction, "AfD")) %>%
  44. select(-fraction) ->
  45. applause
  46. list(speaker = speaker, speeches = speeches, talks = talks, comments = comments, applause = applause)
  47. }
  48. # this reads all currently parseable data from one xml
  49. read_one <- function(name, path) {
  50. x <- tryCatch(read_xml(paste0(path, name)),
  51. error = function(c) NULL)
  52. if (is.null(x)) return(NULL)
  53. # extract date of session
  54. date <- xml_attr(x, "sitzung-datum")
  55. cs <- xml_children(x)
  56. verlauf <- xml_find_first(x, "sitzungsverlauf")
  57. speakerl <- xml_find_first(x, "rednerliste")
  58. xml_children(speakerl) %>%
  59. parse_speakerlist() ->
  60. speaker
  61. xml_children(verlauf) %>%
  62. xml_find_all("rede") %>%
  63. parse_speechlist(date) ->
  64. res
  65. list(speaker = speaker, speeches = res$speeches, talks = res$talks, comments = res$comments)
  66. }
  67. xml_get <- function(node, name) {
  68. res <- xml_text %$% xml_find_all(node, name)
  69. if (length(res) == 0) NA_character_
  70. else res
  71. }
  72. # parse one speaker
  73. parse_speaker <- function(speaker_xml) {
  74. speaker_id <- xml_attr(speaker_xml, "id")
  75. nm <- xml_child(speaker_xml)
  76. vorname <- xml_get(nm, "vorname")
  77. nachname <- xml_get(nm, "nachname")
  78. fraction <- xml_get(nm, "fraktion")
  79. titel <- xml_get(nm, "titel")
  80. rolle <- xml_find_all(nm, "rolle")
  81. if (length(rolle) > 0) {
  82. rolle_lang <- xml_get(rolle, "rolle_lang")
  83. rolle_kurz <- xml_get(rolle, "rolle_kurz")
  84. } else rolle_kurz <- rolle_lang <- NA_character_
  85. c(id = speaker_id, vorname = vorname, nachname = nachname, fraction = fraction, titel = titel,
  86. rolle_kurz = rolle_kurz, rolle_lang = rolle_lang)
  87. }
  88. # parse one speech
  89. # returns: - a speech (with speech id and speaker id)
  90. # - all talks appearing in the speech (with corresponding content)
  91. parse_speech <- function(speech_xml, date) {
  92. speech_id <- xml_attr(speech_xml, "id")
  93. cs <- xml_children(speech_xml)
  94. cur_speaker <- NA_character_
  95. principal_speaker <- NA_character_
  96. cur_content <- ""
  97. speeches <- list()
  98. comments <- list()
  99. for (node in cs) {
  100. if (xml_name(node) == "p" || xml_name(node) == "name") {
  101. klasse <- xml_attr(node, "klasse")
  102. if ((!is.na(klasse) && klasse == "redner") || xml_name(node) == "name") {
  103. if (!is.na(cur_speaker)) {
  104. speech <- c(speech_id = speech_id,
  105. speaker = cur_speaker,
  106. content = cur_content)
  107. speeches <- c(speeches, list(speech))
  108. cur_content <- ""
  109. }
  110. if (is.na(principal_speaker) && xml_name(node) != "name") {
  111. principal_speaker <- xml_child(node) %>% xml_attr("id")
  112. }
  113. if (xml_name(node) == "name") {
  114. cur_speaker <- "BTP"
  115. } else {
  116. cur_speaker <- xml_child(node) %>% xml_attr("id")
  117. }
  118. } else {
  119. cur_content <- paste0(cur_content, xml_text(node), sep="\n")
  120. }
  121. } else if (xml_name(node) == "kommentar") {
  122. # comments are of the form
  123. # <kommentar>(blabla [Fraktion] – blabla liasdf – bla)</kommentar>
  124. xml_text(node) %>%
  125. str_sub(2, -2) %>%
  126. str_split("–") %>%
  127. `[[`(1) %>%
  128. lapply(parse_comment, speech_id = speech_id, on_speaker = cur_speaker) ->
  129. cs
  130. comments <- c(comments, cs)
  131. }
  132. }
  133. speech <- c(speech_id = speech_id,
  134. speaker = cur_speaker,
  135. content = cur_content)
  136. speeches <- c(speeches, list(speech))
  137. list(speech = c(id = speech_id, speaker = principal_speaker, date = date),
  138. parts = speeches,
  139. comments = comments)
  140. }
  141. fractionpattern <- "BÜNDNIS(SES)?\\W*90/DIE\\W*GRÜNEN|CDU/CSU|AfD|SPD|DIE LINKE|FDP|LINKEN"
  142. fractionnames <- c("BÜNDNIS 90/DIE GRÜNEN", "CDU/CSU", "AfD", "SPD", "DIE LINKE", "FDP")
  143. parse_comment <- function(comment, speech_id, on_speaker) {
  144. base <- c(speech_id = speech_id, on_speaker = on_speaker)
  145. # classify comment
  146. if(str_detect(comment, "Beifall")) {
  147. str_extract_all(comment, fractionpattern) %>%
  148. `[[`(1) %>%
  149. sapply(partial(flip(head), 1) %.% agrep, x=fractionnames, max=0.2, value=T) %>%
  150. str_c(collapse=",") ->
  151. by
  152. c(base, type = "applause", fraction = by, kommentator = NA_character_, content = comment)
  153. } else {
  154. ps <- str_match(comment, "(.*) \\[(.*?)\\]: (.*)")[1,]
  155. c(base, type = "comment", fraction = ps[3], kommentator = ps[2], content = ps[4])
  156. }
  157. }
  158. # creates a tibble of speeches and a tibble of talks from a list of xml nodes representing speeches
  159. parse_speechlist <- function(speechlist_xml, date) {
  160. d <- sapply(speechlist_xml, parse_speech, date = date)
  161. speeches <- simplify2array(d["speech", ])
  162. parts <- simplify2array %$% unlist(d["parts", ], recursive=FALSE)
  163. comments <- simplify2array %$% unlist(d["comments", ], recursive=FALSE)
  164. list(speeches = tibble(id = speeches["id",], speaker = speeches["speaker",],
  165. date = speeches["date",]),
  166. talks = tibble(speech_id = parts["speech_id", ],
  167. speaker = parts["speaker", ],
  168. content = parts["content", ]),
  169. comments = tibble(speech_id = comments["speech_id",],
  170. on_speaker = comments["on_speaker",],
  171. type = comments["type",],
  172. fraction = comments["fraction",],
  173. kommentator = comments["kommentator",],
  174. content = comments["content", ]))
  175. }
  176. # create a tibble of speaker from a list of xml nodes representing speaker
  177. parse_speakerlist <- function(speakerliste_xml) {
  178. d <- sapply(speakerliste_xml, parse_speaker)
  179. tibble(id = d["id",],
  180. vorname = d["vorname",],
  181. nachname = d["nachname",],
  182. fraction = d["fraction",],
  183. titel = d["titel",],
  184. rolle_kurz = d["rolle_kurz",],
  185. rolle_lang = d["rolle_lang",])
  186. }
  187. #' Write the parsed and repaired results into separate csv files
  188. #'
  189. #' @param tables list of tables to convert into a csv files.
  190. #' @param path where to put the csv files.
  191. #' @param create set TRUE if the path does not exist yet and you want to create it
  192. #'
  193. #' @export
  194. write_to_csv <- function(tables, path="data/csv/", create=F) {
  195. check_directory(path, create)
  196. write.table(tables$speaker, str_c(path, "speaker.csv"))
  197. write.table(tables$speeches, str_c(path, "speeches.csv"))
  198. write.table(tables$talks, str_c(path, "talks.csv"))
  199. write.table(tables$comments, str_c(path, "comments.csv"))
  200. write.table(tables$applause, str_c(path, "applause.csv"))
  201. }
  202. #' create a tibble from the csv file
  203. #'
  204. #' @param path directory to read files from
  205. #'
  206. #' reading the tables from a csv is way faster than reading and repairing the data every single time
  207. #'
  208. #' @export
  209. read_from_csv <- function(path="data/csv/") {
  210. list(speaker = read.table(str_c(path, "speaker.csv")) %>%
  211. tibble() %>%
  212. mutate(id = as.character(id)),
  213. speeches = read.table(str_c(path, "speeches.csv")) %>%
  214. tibble() %>%
  215. mutate(speaker = as.character(speaker)),
  216. talks = tibble %$% read.table(str_c(path, "talks.csv")),
  217. comments = tibble %$% read.table(str_c(path, "comments.csv")),
  218. applause = tibble %$% read.table(str_c(path, "applause.csv")))
  219. }