An R package to analyze the parliamentary records of the 19th legislative period of the Bundestag, the German parliament.
Du kannst nicht mehr als 25 Themen auswählen Themen müssen entweder mit einem Buchstaben oder einer Ziffer beginnen. Sie können Bindestriche („-“) enthalten und bis zu 35 Zeichen lang sein.

294 Zeilen
11KB

  1. #' Parse xml records
  2. #'
  3. #' Creates a list of tibbles containing relevant information from all records
  4. #' stored in the input directory.
  5. #'
  6. #' @param path path to records directory
  7. #' @param pattern search pattern to find records in directory
  8. #'
  9. #' @export
  10. read_all <- function(path="inst/records/", pattern="-data\\.xml") {
  11. # append file separator if needed
  12. path <- make_directory_path(path)
  13. cat("Reading all records from", path, "\n")
  14. # list all files in directory and filter by search pattern
  15. fs <- list.files(path)
  16. available_protocols <- fs[str_detect(fs, pattern)]
  17. if (length(available_protocols) == 0)
  18. stop(paste0("The given directory does not exist or does not contain files matching \"",
  19. pattern,
  20. "\"."))
  21. # parse records one by one and remove null entries
  22. res <- compact %$% pblapply(available_protocols, read_one, path=path)
  23. if (length(res) == 0) stop("No valid records found. Did you fetch successfully?")
  24. lapply(res, `[[`, "speaker") %>%
  25. bind_rows() %>%
  26. distinct() ->
  27. speaker
  28. lapply(res, `[[`, "speeches") %>%
  29. bind_rows() %>%
  30. distinct() %>%
  31. mutate(date = as.Date(date, format="%d.%m.%Y")) ->
  32. speeches
  33. lapply(res, `[[`, "talks") %>%
  34. bind_rows() %>%
  35. distinct() ->
  36. talks
  37. lapply(res, `[[`, "comments") %>%
  38. bind_rows() %>%
  39. distinct() ->
  40. commentsandapplause
  41. filter(commentsandapplause, type == "comment") %>%
  42. select(-type) ->
  43. comments
  44. filter(commentsandapplause, type == "applause") %>%
  45. select(-type, -commenter, -content) %>%
  46. mutate("CDU_CSU" = str_detect(fraction, "CDU/CSU"),
  47. "SPD" = str_detect(fraction, "SPD"),
  48. "FDP" = str_detect(fraction, "FDP"),
  49. "DIE_LINKE" = str_detect(fraction, "DIE LINKE"),
  50. "BUENDNIS_90_DIE_GRUENEN" = str_detect(fraction, "B\u00DCNDNIS 90/DIE GR\u00DCNEN"),
  51. "AfD" = str_detect(fraction, "AfD")) %>%
  52. select(-fraction) ->
  53. applause
  54. list(speaker = speaker, speeches = speeches, talks = talks, comments = comments, applause = applause)
  55. }
  56. # this reads all currently parseable data from one xml
  57. read_one <- function(name, path) {
  58. x <- tryCatch(read_xml(paste0(path, name)),
  59. error = function(c) NULL,
  60. warning = function(c) NULL)
  61. if (is.null(x)) return(NULL)
  62. # extract date of session
  63. date <- xml_attr(x, "sitzung-datum")
  64. cs <- xml_children(x)
  65. verlauf <- xml_find_first(x, "sitzungsverlauf")
  66. speakerl <- xml_find_first(x, "rednerliste")
  67. # check if record is invalid or empty (every record should have at least
  68. # one speech, a speaker and a date
  69. if (is.na(date) || length(verlauf) == 0 || length(speakerl) == 0) {
  70. warning("Invalid record found. Skipping.")
  71. return(NULL)
  72. }
  73. xml_children(speakerl) %>%
  74. parse_speakerlist() ->
  75. speaker
  76. xml_children(verlauf) %>%
  77. xml_find_all("rede") %>%
  78. parse_speechlist(date) ->
  79. res
  80. list(speaker = speaker, speeches = res$speeches, talks = res$talks, comments = res$comments)
  81. }
  82. xml_get <- function(node, name) {
  83. res <- xml_text %$% xml_find_all(node, name)
  84. if (length(res) == 0) NA_character_
  85. else res
  86. }
  87. # parse one speaker
  88. parse_speaker <- function(speaker_xml) {
  89. speaker_id <- xml_attr(speaker_xml, "id")
  90. nm <- xml_child(speaker_xml)
  91. prename <- xml_get(nm, "vorname")
  92. lastname <- xml_get(nm, "nachname")
  93. fraction <- xml_get(nm, "fraktion")
  94. title <- xml_get(nm, "titel")
  95. role <- xml_find_all(nm, "rolle")
  96. if (length(role) > 0) {
  97. role_long <- xml_get(role, "rolle_lang")
  98. role_short <- xml_get(role, "rolle_kurz")
  99. } else role_short <- role_long <- NA_character_
  100. c(id = speaker_id, prename = prename, lastname = lastname, fraction = fraction, title = title,
  101. role_short = role_short, role_long = role_long)
  102. }
  103. # parse one speech
  104. # returns: - a speech (with speech id and speaker id)
  105. # - all talks appearing in the speech (with corresponding content)
  106. parse_speech <- function(speech_xml, date) {
  107. speech_id <- xml_attr(speech_xml, "id")
  108. cs <- xml_children(speech_xml)
  109. cur_speaker <- NA_character_
  110. principal_speaker <- NA_character_
  111. cur_content <- ""
  112. speeches <- list()
  113. comments <- list()
  114. for (node in cs) {
  115. if (xml_name(node) == "p" || xml_name(node) == "name") {
  116. klasse <- xml_attr(node, "klasse")
  117. if ((!is.na(klasse) && klasse == "redner") || xml_name(node) == "name") {
  118. if (!is.na(cur_speaker)) {
  119. speech <- c(speech_id = speech_id,
  120. speaker = cur_speaker,
  121. content = cur_content)
  122. speeches <- c(speeches, list(speech))
  123. cur_content <- ""
  124. }
  125. if (is.na(principal_speaker) && xml_name(node) != "name") {
  126. principal_speaker <- xml_child(node) %>% xml_attr("id")
  127. }
  128. if (xml_name(node) == "name") {
  129. cur_speaker <- "BTP"
  130. } else {
  131. cur_speaker <- xml_child(node) %>% xml_attr("id")
  132. }
  133. } else {
  134. cur_content <- paste0(cur_content, xml_text(node), sep="\n")
  135. }
  136. } else if (xml_name(node) == "kommentar") {
  137. # comments are of the form
  138. # <kommentar>(blabla [Fraktion] \u2013 blabla liasdf \u2013 bla)</kommentar>
  139. xml_text(node) %>%
  140. str_sub(2, -2) %>%
  141. str_split("\u2013") %>%
  142. `[[`(1) %>%
  143. lapply(parse_comment, speech_id = speech_id, on_speaker = cur_speaker) ->
  144. cs
  145. comments <- c(comments, cs)
  146. }
  147. }
  148. speech <- c(speech_id = speech_id,
  149. speaker = cur_speaker,
  150. content = cur_content)
  151. speeches <- c(speeches, list(speech))
  152. list(speech = c(id = speech_id, speaker = principal_speaker, date = date),
  153. parts = speeches,
  154. comments = comments)
  155. }
  156. fractionpattern <- "B\u00DCNDNIS(SES)?\\W*90/DIE\\W*GR\u00DCNEN|CDU/CSU|AfD|SPD|DIE LINKE|FDP|LINKEN"
  157. fractionnames <- c("B\u00DCNDNIS 90/DIE GR\u00DCNEN", "CDU/CSU", "AfD", "SPD", "DIE LINKE", "FDP",
  158. "Fraktionslos")
  159. parse_comment <- function(comment, speech_id, on_speaker) {
  160. base <- c(speech_id = speech_id, on_speaker = on_speaker)
  161. # classify comment
  162. if(str_detect(comment, "Beifall")) {
  163. str_extract_all(comment, fractionpattern) %>%
  164. `[[`(1) %>%
  165. sapply(partial(flip(head), 1) %.% agrep, x=fractionnames, max=0.2, value=T) %>%
  166. str_c(collapse=",") ->
  167. by
  168. c(base, type = "applause", fraction = by, commenter = NA_character_, content = comment)
  169. } else {
  170. ps <- str_match(comment, "(.*) \\[(.*?)\\]: (.*)")[1,]
  171. fraction <- agrep(ps[3], fractionnames, max=0.2, value=T)
  172. if (all(is.na(fraction)) || length(fraction) == 0) fraction <- NA_character_
  173. c(base, type = "comment", fraction = fraction, commenter = ps[2], content = ps[4])
  174. }
  175. }
  176. # creates a tibble of speeches and a tibble of talks from a list of xml nodes representing speeches
  177. parse_speechlist <- function(speechlist_xml, date) {
  178. d <- sapply(speechlist_xml, parse_speech, date = date)
  179. speeches <- simplify2array(d["speech", ])
  180. parts <- simplify2array %$% unlist(d["parts", ], recursive=FALSE)
  181. comments <- simplify2array %$% unlist(d["comments", ], recursive=FALSE)
  182. list(speeches = tibble(id = speeches["id",], speaker = speeches["speaker",],
  183. date = speeches["date",]),
  184. talks = tibble(speech_id = parts["speech_id", ],
  185. speaker = parts["speaker", ],
  186. content = parts["content", ]),
  187. comments = tibble(speech_id = comments["speech_id",],
  188. on_speaker = comments["on_speaker",],
  189. type = comments["type",],
  190. fraction = comments["fraction",],
  191. commenter = comments["commenter",],
  192. content = comments["content", ]))
  193. }
  194. # create a tibble of speaker from a list of xml nodes representing speaker
  195. parse_speakerlist <- function(speakerliste_xml) {
  196. d <- sapply(speakerliste_xml, parse_speaker)
  197. tibble(id = d["id",],
  198. prename = d["prename",],
  199. lastname = d["lastname",],
  200. fraction = d["fraction",],
  201. title = d["title",],
  202. role_short = d["role_short",],
  203. role_long = d["role_long",])
  204. }
  205. #' Write the parsed and repaired results into separate csv files
  206. #'
  207. #' @param tables list of tables to convert into a csv files.
  208. #' @param path where to put the csv files.
  209. #' @param create set TRUE if the path does not exist yet and you want to create it
  210. #'
  211. #' @export
  212. write_to_csv <- function(tables, path="inst/csv/", create=F) {
  213. is_valid_res(tables)
  214. stopifnot("path must be of type character" = is.character(path))
  215. stopifnot("create must be of type logical" = is.logical(create))
  216. path <- make_directory_path(path)
  217. check_directory(path, create)
  218. write.table(tables$speaker, str_c(path, "speaker.csv"))
  219. write.table(tables$speeches, str_c(path, "speeches.csv"))
  220. write.table(tables$talks, str_c(path, "talks.csv"))
  221. write.table(tables$comments, str_c(path, "comments.csv"))
  222. write.table(tables$applause, str_c(path, "applause.csv"))
  223. }
  224. #' create a tibble from the csv file
  225. #'
  226. #' @param path directory to read files from
  227. #'
  228. #' reading the tables from a csv is way faster than reading and repairing the data every single time
  229. #'
  230. #' @export
  231. read_from_csv <- function(path="inst/csv/") {
  232. stopifnot("path must be of type character" = is.character(path))
  233. path <- make_directory_path(path)
  234. list(speaker = read.table(str_c(path, "speaker.csv")) %>%
  235. tibble() %>%
  236. mutate(id = as.character(id)),
  237. speeches = read.table(str_c(path, "speeches.csv")) %>%
  238. tibble() %>%
  239. mutate(speaker = as.character(speaker),
  240. date = as.Date(date)),
  241. talks = tibble %$% read.table(str_c(path, "talks.csv")),
  242. comments = tibble %$% read.table(str_c(path, "comments.csv")),
  243. applause = tibble %$% read.table(str_c(path, "applause.csv"))) -> res
  244. is_valid_res(res)
  245. res
  246. }
  247. #' Read records from csv or fetch
  248. #'
  249. #' @param path base directory where csv files are expected under path/csv
  250. #' and possibly records fetched and stored under path/records
  251. #'
  252. #' @export
  253. read_from_csv_or_fetch <- function(path="inst/") {
  254. path <- make_directory_path(path)
  255. res <- tryCatch(read_from_csv(str_c(path, "csv/")),
  256. error = function(c) NULL)
  257. if (!is.null(res)) return(res)
  258. fetch_all(str_c(path, "records/"), create=T)
  259. read_all(str_c(path, "records/")) %>%
  260. repair() ->
  261. res
  262. write_to_csv(res, str_c(path, "csv/"), create=T)
  263. res
  264. }