An R package to analyze the parliamentary records of the 19th legislative period of the Bundestag, the German parliament.
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

241 lines
9.0KB

  1. #' Parse xml records
  2. #'
  3. #' Creates a list of tibbles containing relevant information from all records
  4. #' stored in the input directory.
  5. #'
  6. #' @param path character
  7. #'
  8. #' @export
  9. read_all <- function(path="inst/records/") {
  10. cat("Reading all records from", path, "\n")
  11. available_protocols <- list.files(path)
  12. res <- pblapply(available_protocols, read_one, path=path)
  13. if (length(available_protocols) == 0)
  14. stop("The given directory is empty or does not exist.")
  15. lapply(res, `[[`, "speaker") %>%
  16. bind_rows() %>%
  17. distinct() ->
  18. speaker
  19. lapply(res, `[[`, "speeches") %>%
  20. bind_rows() %>%
  21. distinct() %>%
  22. mutate(date = as.Date(date, format="%d.%m.%Y")) ->
  23. speeches
  24. lapply(res, `[[`, "talks") %>%
  25. bind_rows() %>%
  26. distinct() ->
  27. talks
  28. lapply(res, `[[`, "comments") %>%
  29. bind_rows() %>%
  30. distinct() ->
  31. commentsandapplause
  32. filter(commentsandapplause, type == "comment") %>%
  33. select(-type) ->
  34. comments
  35. filter(commentsandapplause, type == "applause") %>%
  36. select(-type, -kommentator, -content) %>%
  37. mutate("CDU_CSU" = str_detect(fraction, "CDU/CSU"),
  38. "SPD" = str_detect(fraction, "SPD"),
  39. "FDP" = str_detect(fraction, "FDP"),
  40. "DIE_LINKE" = str_detect(fraction, "DIE LINKE"),
  41. "BUENDNIS_90_DIE_GRUENEN" = str_detect(fraction, "BÜNDNIS 90/DIE GRÜNEN"),
  42. "AfD" = str_detect(fraction, "AfD")) %>%
  43. select(-fraction) ->
  44. applause
  45. list(speaker = speaker, speeches = speeches, talks = talks, comments = comments, applause = applause)
  46. }
  47. # this reads all currently parseable data from one xml
  48. read_one <- function(name, path) {
  49. x <- tryCatch(read_xml(paste0(path, name)),
  50. error = function(c) NULL)
  51. if (is.null(x)) return(NULL)
  52. # extract date of session
  53. date <- xml_attr(x, "sitzung-datum")
  54. cs <- xml_children(x)
  55. verlauf <- xml_find_first(x, "sitzungsverlauf")
  56. speakerl <- xml_find_first(x, "rednerliste")
  57. xml_children(speakerl) %>%
  58. parse_speakerlist() ->
  59. speaker
  60. xml_children(verlauf) %>%
  61. xml_find_all("rede") %>%
  62. parse_speechlist(date) ->
  63. res
  64. list(speaker = speaker, speeches = res$speeches, talks = res$talks, comments = res$comments)
  65. }
  66. xml_get <- function(node, name) {
  67. res <- xml_text %$% xml_find_all(node, name)
  68. if (length(res) == 0) NA_character_
  69. else res
  70. }
  71. # parse one speaker
  72. parse_speaker <- function(speaker_xml) {
  73. speaker_id <- xml_attr(speaker_xml, "id")
  74. nm <- xml_child(speaker_xml)
  75. vorname <- xml_get(nm, "vorname")
  76. nachname <- xml_get(nm, "nachname")
  77. fraction <- xml_get(nm, "fraktion")
  78. titel <- xml_get(nm, "titel")
  79. rolle <- xml_find_all(nm, "rolle")
  80. if (length(rolle) > 0) {
  81. rolle_lang <- xml_get(rolle, "rolle_lang")
  82. rolle_kurz <- xml_get(rolle, "rolle_kurz")
  83. } else rolle_kurz <- rolle_lang <- NA_character_
  84. c(id = speaker_id, vorname = vorname, nachname = nachname, fraction = fraction, titel = titel,
  85. rolle_kurz = rolle_kurz, rolle_lang = rolle_lang)
  86. }
  87. # parse one speech
  88. # returns: - a speech (with speech id and speaker id)
  89. # - all talks appearing in the speech (with corresponding content)
  90. parse_speech <- function(speech_xml, date) {
  91. speech_id <- xml_attr(speech_xml, "id")
  92. cs <- xml_children(speech_xml)
  93. cur_speaker <- NA_character_
  94. principal_speaker <- NA_character_
  95. cur_content <- ""
  96. speeches <- list()
  97. comments <- list()
  98. for (node in cs) {
  99. if (xml_name(node) == "p" || xml_name(node) == "name") {
  100. klasse <- xml_attr(node, "klasse")
  101. if ((!is.na(klasse) && klasse == "redner") || xml_name(node) == "name") {
  102. if (!is.na(cur_speaker)) {
  103. speech <- c(speech_id = speech_id,
  104. speaker = cur_speaker,
  105. content = cur_content)
  106. speeches <- c(speeches, list(speech))
  107. cur_content <- ""
  108. }
  109. if (is.na(principal_speaker) && xml_name(node) != "name") {
  110. principal_speaker <- xml_child(node) %>% xml_attr("id")
  111. }
  112. if (xml_name(node) == "name") {
  113. cur_speaker <- "BTP"
  114. } else {
  115. cur_speaker <- xml_child(node) %>% xml_attr("id")
  116. }
  117. } else {
  118. cur_content <- paste0(cur_content, xml_text(node), sep="\n")
  119. }
  120. } else if (xml_name(node) == "kommentar") {
  121. # comments are of the form
  122. # <kommentar>(blabla [Fraktion] – blabla liasdf – bla)</kommentar>
  123. xml_text(node) %>%
  124. str_sub(2, -2) %>%
  125. str_split("–") %>%
  126. `[[`(1) %>%
  127. lapply(parse_comment, speech_id = speech_id, on_speaker = cur_speaker) ->
  128. cs
  129. comments <- c(comments, cs)
  130. }
  131. }
  132. speech <- c(speech_id = speech_id,
  133. speaker = cur_speaker,
  134. content = cur_content)
  135. speeches <- c(speeches, list(speech))
  136. list(speech = c(id = speech_id, speaker = principal_speaker, date = date),
  137. parts = speeches,
  138. comments = comments)
  139. }
  140. fractionpattern <- "BÜNDNIS(SES)?\\W*90/DIE\\W*GRÜNEN|CDU/CSU|AfD|SPD|DIE LINKE|FDP|LINKEN"
  141. fractionnames <- c("BÜNDNIS 90/DIE GRÜNEN", "CDU/CSU", "AfD", "SPD", "DIE LINKE", "FDP")
  142. parse_comment <- function(comment, speech_id, on_speaker) {
  143. base <- c(speech_id = speech_id, on_speaker = on_speaker)
  144. # classify comment
  145. if(str_detect(comment, "Beifall")) {
  146. str_extract_all(comment, fractionpattern) %>%
  147. `[[`(1) %>%
  148. sapply(partial(flip(head), 1) %.% agrep, x=fractionnames, max=0.2, value=T) %>%
  149. str_c(collapse=",") ->
  150. by
  151. c(base, type = "applause", fraction = by, kommentator = NA_character_, content = comment)
  152. } else {
  153. ps <- str_match(comment, "(.*) \\[(.*?)\\]: (.*)")[1,]
  154. c(base, type = "comment", fraction = ps[3], kommentator = ps[2], content = ps[4])
  155. }
  156. }
  157. # creates a tibble of speeches and a tibble of talks from a list of xml nodes representing speeches
  158. parse_speechlist <- function(speechlist_xml, date) {
  159. d <- sapply(speechlist_xml, parse_speech, date = date)
  160. speeches <- simplify2array(d["speech", ])
  161. parts <- simplify2array %$% unlist(d["parts", ], recursive=FALSE)
  162. comments <- simplify2array %$% unlist(d["comments", ], recursive=FALSE)
  163. list(speeches = tibble(id = speeches["id",], speaker = speeches["speaker",],
  164. date = speeches["date",]),
  165. talks = tibble(speech_id = parts["speech_id", ],
  166. speaker = parts["speaker", ],
  167. content = parts["content", ]),
  168. comments = tibble(speech_id = comments["speech_id",],
  169. on_speaker = comments["on_speaker",],
  170. type = comments["type",],
  171. fraction = comments["fraction",],
  172. kommentator = comments["kommentator",],
  173. content = comments["content", ]))
  174. }
  175. # create a tibble of speaker from a list of xml nodes representing speaker
  176. parse_speakerlist <- function(speakerliste_xml) {
  177. d <- sapply(speakerliste_xml, parse_speaker)
  178. tibble(id = d["id",],
  179. vorname = d["vorname",],
  180. nachname = d["nachname",],
  181. fraction = d["fraction",],
  182. titel = d["titel",],
  183. rolle_kurz = d["rolle_kurz",],
  184. rolle_lang = d["rolle_lang",])
  185. }
  186. #' Write the parsed and repaired results into separate csv files
  187. #'
  188. #' @param tables list of tables to convert into a csv files.
  189. #' @param path where to put the csv files.
  190. #' @param create set TRUE if the path does not exist yet and you want to create it
  191. #'
  192. #' @export
  193. write_to_csv <- function(tables, path="inst/csv/", create=F) {
  194. check_directory(path, create)
  195. write.table(tables$speaker, str_c(path, "speaker.csv"))
  196. write.table(tables$speeches, str_c(path, "speeches.csv"))
  197. write.table(tables$talks, str_c(path, "talks.csv"))
  198. write.table(tables$comments, str_c(path, "comments.csv"))
  199. write.table(tables$applause, str_c(path, "applause.csv"))
  200. }
  201. #' create a tibble from the csv file
  202. #'
  203. #' @param path directory to read files from
  204. #'
  205. #' reading the tables from a csv is way faster than reading and repairing the data every single time
  206. #'
  207. #' @export
  208. read_from_csv <- function(path="inst/csv/") {
  209. list(speaker = read.table(str_c(path, "speaker.csv")) %>%
  210. tibble() %>%
  211. mutate(id = as.character(id)),
  212. speeches = read.table(str_c(path, "speeches.csv")) %>%
  213. tibble() %>%
  214. mutate(speaker = as.character(speaker)),
  215. talks = tibble %$% read.table(str_c(path, "talks.csv")),
  216. comments = tibble %$% read.table(str_c(path, "comments.csv")),
  217. applause = tibble %$% read.table(str_c(path, "applause.csv")))
  218. }