An R package to analyze the parliamentary records of the 19th legislative period of the Bundestag, the German parliament.
Du kannst nicht mehr als 25 Themen auswählen Themen müssen entweder mit einem Buchstaben oder einer Ziffer beginnen. Sie können Bindestriche („-“) enthalten und bis zu 35 Zeichen lang sein.

254 Zeilen
9.6KB

  1. #' Parse xml records
  2. #'
  3. #' Creates a list of tibbles containing relevant information from all records
  4. #' stored in the input directory.
  5. #'
  6. #' @param path path to records directory
  7. #' @param pattern search pattern to find records in directory
  8. #'
  9. #' @export
  10. read_all <- function(path="inst/records/", pattern="-data\\.xml") {
  11. # append file separator if needed
  12. path <- make_directory_path(path)
  13. cat("Reading all records from", path, "\n")
  14. # list all files in directory and filter by search pattern
  15. fs <- list.files(path)
  16. available_protocols <- fs[str_detect(fs, pattern)]
  17. if (length(available_protocols) == 0)
  18. stop(paste0("The given directory does not exist or does not contain files matching \"",
  19. pattern,
  20. "\"."))
  21. # parse records one by one and remove null entries
  22. res <- compact %$% pblapply(available_protocols, read_one, path=path)
  23. if (length(res) == 0) stop("No valid records found. Did you fetch successfully?")
  24. lapply(res, `[[`, "speaker") %>%
  25. bind_rows() %>%
  26. distinct() ->
  27. speaker
  28. lapply(res, `[[`, "speeches") %>%
  29. bind_rows() %>%
  30. distinct() %>%
  31. mutate(date = as.Date(date, format="%d.%m.%Y")) ->
  32. speeches
  33. lapply(res, `[[`, "talks") %>%
  34. bind_rows() %>%
  35. distinct() ->
  36. talks
  37. lapply(res, `[[`, "comments") %>%
  38. bind_rows() %>%
  39. distinct() ->
  40. commentsandapplause
  41. filter(commentsandapplause, type == "comment") %>%
  42. select(-type) ->
  43. comments
  44. filter(commentsandapplause, type == "applause") %>%
  45. select(-type, -commenter, -content) %>%
  46. mutate("CDU_CSU" = str_detect(fraction, "CDU/CSU"),
  47. "SPD" = str_detect(fraction, "SPD"),
  48. "FDP" = str_detect(fraction, "FDP"),
  49. "DIE_LINKE" = str_detect(fraction, "DIE LINKE"),
  50. "BUENDNIS_90_DIE_GRUENEN" = str_detect(fraction, "BÜNDNIS 90/DIE GRÜNEN"),
  51. "AfD" = str_detect(fraction, "AfD")) %>%
  52. select(-fraction) ->
  53. applause
  54. list(speaker = speaker, speeches = speeches, talks = talks, comments = comments, applause = applause)
  55. }
  56. # this reads all currently parseable data from one xml
  57. read_one <- function(name, path) {
  58. x <- tryCatch(read_xml(paste0(path, name)),
  59. error = function(c) NULL,
  60. warning = function(c) NULL)
  61. if (is.null(x)) return(NULL)
  62. # extract date of session
  63. date <- xml_attr(x, "sitzung-datum")
  64. cs <- xml_children(x)
  65. verlauf <- xml_find_first(x, "sitzungsverlauf")
  66. speakerl <- xml_find_first(x, "rednerliste")
  67. xml_children(speakerl) %>%
  68. parse_speakerlist() ->
  69. speaker
  70. xml_children(verlauf) %>%
  71. xml_find_all("rede") %>%
  72. parse_speechlist(date) ->
  73. res
  74. list(speaker = speaker, speeches = res$speeches, talks = res$talks, comments = res$comments)
  75. }
  76. xml_get <- function(node, name) {
  77. res <- xml_text %$% xml_find_all(node, name)
  78. if (length(res) == 0) NA_character_
  79. else res
  80. }
  81. # parse one speaker
  82. parse_speaker <- function(speaker_xml) {
  83. speaker_id <- xml_attr(speaker_xml, "id")
  84. nm <- xml_child(speaker_xml)
  85. prename <- xml_get(nm, "vorname")
  86. lastname <- xml_get(nm, "nachname")
  87. fraction <- xml_get(nm, "fraktion")
  88. title <- xml_get(nm, "titel")
  89. role <- xml_find_all(nm, "rolle")
  90. if (length(role) > 0) {
  91. role_long <- xml_get(role, "rolle_lang")
  92. role_short <- xml_get(role, "rolle_kurz")
  93. } else role_short <- role_long <- NA_character_
  94. c(id = speaker_id, prename = prename, lastname = lastname, fraction = fraction, title = title,
  95. role_short = role_short, role_long = role_long)
  96. }
  97. # parse one speech
  98. # returns: - a speech (with speech id and speaker id)
  99. # - all talks appearing in the speech (with corresponding content)
  100. parse_speech <- function(speech_xml, date) {
  101. speech_id <- xml_attr(speech_xml, "id")
  102. cs <- xml_children(speech_xml)
  103. cur_speaker <- NA_character_
  104. principal_speaker <- NA_character_
  105. cur_content <- ""
  106. speeches <- list()
  107. comments <- list()
  108. for (node in cs) {
  109. if (xml_name(node) == "p" || xml_name(node) == "name") {
  110. klasse <- xml_attr(node, "klasse")
  111. if ((!is.na(klasse) && klasse == "redner") || xml_name(node) == "name") {
  112. if (!is.na(cur_speaker)) {
  113. speech <- c(speech_id = speech_id,
  114. speaker = cur_speaker,
  115. content = cur_content)
  116. speeches <- c(speeches, list(speech))
  117. cur_content <- ""
  118. }
  119. if (is.na(principal_speaker) && xml_name(node) != "name") {
  120. principal_speaker <- xml_child(node) %>% xml_attr("id")
  121. }
  122. if (xml_name(node) == "name") {
  123. cur_speaker <- "BTP"
  124. } else {
  125. cur_speaker <- xml_child(node) %>% xml_attr("id")
  126. }
  127. } else {
  128. cur_content <- paste0(cur_content, xml_text(node), sep="\n")
  129. }
  130. } else if (xml_name(node) == "kommentar") {
  131. # comments are of the form
  132. # <kommentar>(blabla [Fraktion] – blabla liasdf – bla)</kommentar>
  133. xml_text(node) %>%
  134. str_sub(2, -2) %>%
  135. str_split("–") %>%
  136. `[[`(1) %>%
  137. lapply(parse_comment, speech_id = speech_id, on_speaker = cur_speaker) ->
  138. cs
  139. comments <- c(comments, cs)
  140. }
  141. }
  142. speech <- c(speech_id = speech_id,
  143. speaker = cur_speaker,
  144. content = cur_content)
  145. speeches <- c(speeches, list(speech))
  146. list(speech = c(id = speech_id, speaker = principal_speaker, date = date),
  147. parts = speeches,
  148. comments = comments)
  149. }
  150. fractionpattern <- "BÜNDNIS(SES)?\\W*90/DIE\\W*GRÜNEN|CDU/CSU|AfD|SPD|DIE LINKE|FDP|LINKEN"
  151. fractionnames <- c("BÜNDNIS 90/DIE GRÜNEN", "CDU/CSU", "AfD", "SPD", "DIE LINKE", "FDP")
  152. parse_comment <- function(comment, speech_id, on_speaker) {
  153. base <- c(speech_id = speech_id, on_speaker = on_speaker)
  154. # classify comment
  155. if(str_detect(comment, "Beifall")) {
  156. str_extract_all(comment, fractionpattern) %>%
  157. `[[`(1) %>%
  158. sapply(partial(flip(head), 1) %.% agrep, x=fractionnames, max=0.2, value=T) %>%
  159. str_c(collapse=",") ->
  160. by
  161. c(base, type = "applause", fraction = by, commenter = NA_character_, content = comment)
  162. } else {
  163. ps <- str_match(comment, "(.*) \\[(.*?)\\]: (.*)")[1,]
  164. c(base, type = "comment", fraction = ps[3], commenter = ps[2], content = ps[4])
  165. }
  166. }
  167. # creates a tibble of speeches and a tibble of talks from a list of xml nodes representing speeches
  168. parse_speechlist <- function(speechlist_xml, date) {
  169. d <- sapply(speechlist_xml, parse_speech, date = date)
  170. speeches <- simplify2array(d["speech", ])
  171. parts <- simplify2array %$% unlist(d["parts", ], recursive=FALSE)
  172. comments <- simplify2array %$% unlist(d["comments", ], recursive=FALSE)
  173. list(speeches = tibble(id = speeches["id",], speaker = speeches["speaker",],
  174. date = speeches["date",]),
  175. talks = tibble(speech_id = parts["speech_id", ],
  176. speaker = parts["speaker", ],
  177. content = parts["content", ]),
  178. comments = tibble(speech_id = comments["speech_id",],
  179. on_speaker = comments["on_speaker",],
  180. type = comments["type",],
  181. fraction = comments["fraction",],
  182. commenter = comments["commenter",],
  183. content = comments["content", ]))
  184. }
  185. # create a tibble of speaker from a list of xml nodes representing speaker
  186. parse_speakerlist <- function(speakerliste_xml) {
  187. d <- sapply(speakerliste_xml, parse_speaker)
  188. tibble(id = d["id",],
  189. prename = d["prename",],
  190. lastname = d["lastname",],
  191. fraction = d["fraction",],
  192. title = d["title",],
  193. role_short = d["role_short",],
  194. role_long = d["role_long",])
  195. }
  196. #' Write the parsed and repaired results into separate csv files
  197. #'
  198. #' @param tables list of tables to convert into a csv files.
  199. #' @param path where to put the csv files.
  200. #' @param create set TRUE if the path does not exist yet and you want to create it
  201. #'
  202. #' @export
  203. write_to_csv <- function(tables, path="inst/csv/", create=F) {
  204. check_directory(path, create)
  205. write.table(tables$speaker, str_c(path, "speaker.csv"))
  206. write.table(tables$speeches, str_c(path, "speeches.csv"))
  207. write.table(tables$talks, str_c(path, "talks.csv"))
  208. write.table(tables$comments, str_c(path, "comments.csv"))
  209. write.table(tables$applause, str_c(path, "applause.csv"))
  210. }
  211. #' create a tibble from the csv file
  212. #'
  213. #' @param path directory to read files from
  214. #'
  215. #' reading the tables from a csv is way faster than reading and repairing the data every single time
  216. #'
  217. #' @export
  218. read_from_csv <- function(path="inst/csv/") {
  219. list(speaker = read.table(str_c(path, "speaker.csv")) %>%
  220. tibble() %>%
  221. mutate(id = as.character(id)),
  222. speeches = read.table(str_c(path, "speeches.csv")) %>%
  223. tibble() %>%
  224. mutate(speaker = as.character(speaker),
  225. date = as.Date(date)),
  226. talks = tibble %$% read.table(str_c(path, "talks.csv")),
  227. comments = tibble %$% read.table(str_c(path, "comments.csv")),
  228. applause = tibble %$% read.table(str_c(path, "applause.csv")))
  229. }