An R package to analyze the parliamentary records of the 19th legislative period of the Bundestag, the German parliament.
Du kan inte välja fler än 25 ämnen Ämnen måste starta med en bokstav eller siffra, kan innehålla bindestreck ('-') och vara max 35 tecken långa.

241 lines
9.0KB

  1. #' Parse xml records
  2. #'
  3. #' Creates a list of tibbles containing relevant information from all records
  4. #' stored in the input directory.
  5. #'
  6. #' @param path character
  7. #'
  8. #' @export
  9. read_all <- function(path="inst/records/") {
  10. cat("Reading all records from", path, "\n")
  11. available_protocols <- list.files(path)
  12. res <- pblapply(available_protocols, read_one, path=path)
  13. if (length(available_protocols) == 0)
  14. stop("The given directory is empty or does not exist.")
  15. lapply(res, `[[`, "speaker") %>%
  16. bind_rows() %>%
  17. distinct() ->
  18. speaker
  19. lapply(res, `[[`, "speeches") %>%
  20. bind_rows() %>%
  21. distinct() %>%
  22. mutate(date = as.Date(date, format="%d.%m.%Y")) ->
  23. speeches
  24. lapply(res, `[[`, "talks") %>%
  25. bind_rows() %>%
  26. distinct() ->
  27. talks
  28. lapply(res, `[[`, "comments") %>%
  29. bind_rows() %>%
  30. distinct() ->
  31. commentsandapplause
  32. filter(commentsandapplause, type == "comment") %>%
  33. select(-type) ->
  34. comments
  35. filter(commentsandapplause, type == "applause") %>%
  36. select(-type, -commenter, -content) %>%
  37. mutate("CDU_CSU" = str_detect(fraction, "CDU/CSU"),
  38. "SPD" = str_detect(fraction, "SPD"),
  39. "FDP" = str_detect(fraction, "FDP"),
  40. "DIE_LINKE" = str_detect(fraction, "DIE LINKE"),
  41. "BUENDNIS_90_DIE_GRUENEN" = str_detect(fraction, "BÜNDNIS 90/DIE GRÜNEN"),
  42. "AfD" = str_detect(fraction, "AfD")) %>%
  43. select(-fraction) ->
  44. applause
  45. list(speaker = speaker, speeches = speeches, talks = talks, comments = comments, applause = applause)
  46. }
  47. # this reads all currently parseable data from one xml
  48. read_one <- function(name, path) {
  49. x <- tryCatch(read_xml(paste0(path, name)),
  50. error = function(c) NULL)
  51. if (is.null(x)) return(NULL)
  52. # extract date of session
  53. date <- xml_attr(x, "sitzung-datum")
  54. cs <- xml_children(x)
  55. verlauf <- xml_find_first(x, "sitzungsverlauf")
  56. speakerl <- xml_find_first(x, "rednerliste")
  57. xml_children(speakerl) %>%
  58. parse_speakerlist() ->
  59. speaker
  60. xml_children(verlauf) %>%
  61. xml_find_all("rede") %>%
  62. parse_speechlist(date) ->
  63. res
  64. list(speaker = speaker, speeches = res$speeches, talks = res$talks, comments = res$comments)
  65. }
  66. xml_get <- function(node, name) {
  67. res <- xml_text %$% xml_find_all(node, name)
  68. if (length(res) == 0) NA_character_
  69. else res
  70. }
  71. # parse one speaker
  72. parse_speaker <- function(speaker_xml) {
  73. speaker_id <- xml_attr(speaker_xml, "id")
  74. nm <- xml_child(speaker_xml)
  75. prename <- xml_get(nm, "vorname")
  76. lastname <- xml_get(nm, "nachname")
  77. fraction <- xml_get(nm, "fraktion")
  78. title <- xml_get(nm, "titel")
  79. role <- xml_find_all(nm, "rolle")
  80. if (length(role) > 0) {
  81. role_long <- xml_get(role, "rolle_lang")
  82. role_short <- xml_get(role, "rolle_kurz")
  83. } else role_short <- role_long <- NA_character_
  84. c(id = speaker_id, prename = prename, lastname = lastname, fraction = fraction, title = title,
  85. role_short = role_short, role_long = role_long)
  86. }
  87. # parse one speech
  88. # returns: - a speech (with speech id and speaker id)
  89. # - all talks appearing in the speech (with corresponding content)
  90. parse_speech <- function(speech_xml, date) {
  91. speech_id <- xml_attr(speech_xml, "id")
  92. cs <- xml_children(speech_xml)
  93. cur_speaker <- NA_character_
  94. principal_speaker <- NA_character_
  95. cur_content <- ""
  96. speeches <- list()
  97. comments <- list()
  98. for (node in cs) {
  99. if (xml_name(node) == "p" || xml_name(node) == "name") {
  100. klasse <- xml_attr(node, "klasse")
  101. if ((!is.na(klasse) && klasse == "redner") || xml_name(node) == "name") {
  102. if (!is.na(cur_speaker)) {
  103. speech <- c(speech_id = speech_id,
  104. speaker = cur_speaker,
  105. content = cur_content)
  106. speeches <- c(speeches, list(speech))
  107. cur_content <- ""
  108. }
  109. if (is.na(principal_speaker) && xml_name(node) != "name") {
  110. principal_speaker <- xml_child(node) %>% xml_attr("id")
  111. }
  112. if (xml_name(node) == "name") {
  113. cur_speaker <- "BTP"
  114. } else {
  115. cur_speaker <- xml_child(node) %>% xml_attr("id")
  116. }
  117. } else {
  118. cur_content <- paste0(cur_content, xml_text(node), sep="\n")
  119. }
  120. } else if (xml_name(node) == "kommentar") {
  121. # comments are of the form
  122. # <kommentar>(blabla [Fraktion] – blabla liasdf – bla)</kommentar>
  123. xml_text(node) %>%
  124. str_sub(2, -2) %>%
  125. str_split("–") %>%
  126. `[[`(1) %>%
  127. lapply(parse_comment, speech_id = speech_id, on_speaker = cur_speaker) ->
  128. cs
  129. comments <- c(comments, cs)
  130. }
  131. }
  132. speech <- c(speech_id = speech_id,
  133. speaker = cur_speaker,
  134. content = cur_content)
  135. speeches <- c(speeches, list(speech))
  136. list(speech = c(id = speech_id, speaker = principal_speaker, date = date),
  137. parts = speeches,
  138. comments = comments)
  139. }
  140. fractionpattern <- "BÜNDNIS(SES)?\\W*90/DIE\\W*GRÜNEN|CDU/CSU|AfD|SPD|DIE LINKE|FDP|LINKEN"
  141. fractionnames <- c("BÜNDNIS 90/DIE GRÜNEN", "CDU/CSU", "AfD", "SPD", "DIE LINKE", "FDP")
  142. parse_comment <- function(comment, speech_id, on_speaker) {
  143. base <- c(speech_id = speech_id, on_speaker = on_speaker)
  144. # classify comment
  145. if(str_detect(comment, "Beifall")) {
  146. str_extract_all(comment, fractionpattern) %>%
  147. `[[`(1) %>%
  148. sapply(partial(flip(head), 1) %.% agrep, x=fractionnames, max=0.2, value=T) %>%
  149. str_c(collapse=",") ->
  150. by
  151. c(base, type = "applause", fraction = by, commenter = NA_character_, content = comment)
  152. } else {
  153. ps <- str_match(comment, "(.*) \\[(.*?)\\]: (.*)")[1,]
  154. c(base, type = "comment", fraction = ps[3], commenter = ps[2], content = ps[4])
  155. }
  156. }
  157. # creates a tibble of speeches and a tibble of talks from a list of xml nodes representing speeches
  158. parse_speechlist <- function(speechlist_xml, date) {
  159. d <- sapply(speechlist_xml, parse_speech, date = date)
  160. speeches <- simplify2array(d["speech", ])
  161. parts <- simplify2array %$% unlist(d["parts", ], recursive=FALSE)
  162. comments <- simplify2array %$% unlist(d["comments", ], recursive=FALSE)
  163. list(speeches = tibble(id = speeches["id",], speaker = speeches["speaker",],
  164. date = speeches["date",]),
  165. talks = tibble(speech_id = parts["speech_id", ],
  166. speaker = parts["speaker", ],
  167. content = parts["content", ]),
  168. comments = tibble(speech_id = comments["speech_id",],
  169. on_speaker = comments["on_speaker",],
  170. type = comments["type",],
  171. fraction = comments["fraction",],
  172. commenter = comments["commenter",],
  173. content = comments["content", ]))
  174. }
  175. # create a tibble of speaker from a list of xml nodes representing speaker
  176. parse_speakerlist <- function(speakerliste_xml) {
  177. d <- sapply(speakerliste_xml, parse_speaker)
  178. tibble(id = d["id",],
  179. prename = d["prename",],
  180. lastname = d["lastname",],
  181. fraction = d["fraction",],
  182. title = d["title",],
  183. role_short = d["role_short",],
  184. role_long = d["role_long",])
  185. }
  186. #' Write the parsed and repaired results into separate csv files
  187. #'
  188. #' @param tables list of tables to convert into a csv files.
  189. #' @param path where to put the csv files.
  190. #' @param create set TRUE if the path does not exist yet and you want to create it
  191. #'
  192. #' @export
  193. write_to_csv <- function(tables, path="inst/csv/", create=F) {
  194. check_directory(path, create)
  195. write.table(tables$speaker, str_c(path, "speaker.csv"))
  196. write.table(tables$speeches, str_c(path, "speeches.csv"))
  197. write.table(tables$talks, str_c(path, "talks.csv"))
  198. write.table(tables$comments, str_c(path, "comments.csv"))
  199. write.table(tables$applause, str_c(path, "applause.csv"))
  200. }
  201. #' create a tibble from the csv file
  202. #'
  203. #' @param path directory to read files from
  204. #'
  205. #' reading the tables from a csv is way faster than reading and repairing the data every single time
  206. #'
  207. #' @export
  208. read_from_csv <- function(path="inst/csv/") {
  209. list(speaker = read.table(str_c(path, "speaker.csv")) %>%
  210. tibble() %>%
  211. mutate(id = as.character(id)),
  212. speeches = read.table(str_c(path, "speeches.csv")) %>%
  213. tibble() %>%
  214. mutate(speaker = as.character(speaker)),
  215. talks = tibble %$% read.table(str_c(path, "talks.csv")),
  216. comments = tibble %$% read.table(str_c(path, "comments.csv")),
  217. applause = tibble %$% read.table(str_c(path, "applause.csv")))
  218. }