An R package to analyze the parliamentary records of the 19th legislative period of the Bundestag, the German parliament.
No puede seleccionar más de 25 temas Los temas deben comenzar con una letra o número, pueden incluir guiones ('-') y pueden tener hasta 35 caracteres de largo.

244 líneas
9.0KB

  1. # for usage see the example at the end
  2. #' Parse xml records
  3. #'
  4. #' Creates a list of tibbles containing relevant information from all records
  5. #' stored in the input directory.
  6. #'
  7. #' @param path character
  8. #'
  9. #' @export
  10. read_all <- function(path="data/records/") {
  11. cat("Reading all records from", path, "\n")
  12. available_protocols <- list.files(path)
  13. res <- pblapply(available_protocols, read_one, path=path)
  14. if (length(available_protocols) == 0)
  15. stop("The given directory is empty or does not exist.")
  16. lapply(res, `[[`, "speaker") %>%
  17. bind_rows() %>%
  18. distinct() ->
  19. speaker
  20. lapply(res, `[[`, "speeches") %>%
  21. bind_rows() %>%
  22. distinct() %>%
  23. mutate(date = as.Date(date, format="%d.%m.%Y")) ->
  24. speeches
  25. lapply(res, `[[`, "talks") %>%
  26. bind_rows() %>%
  27. distinct() ->
  28. talks
  29. lapply(res, `[[`, "comments") %>%
  30. bind_rows() %>%
  31. distinct() ->
  32. commentsandapplause
  33. filter(commentsandapplause, type == "comment") %>%
  34. select(-type) ->
  35. comments
  36. filter(commentsandapplause, type == "applause") %>%
  37. select(-type, -kommentator, -content) %>%
  38. mutate("CDU_CSU" = str_detect(fraction, "CDU/CSU"),
  39. "SPD" = str_detect(fraction, "SPD"),
  40. "FDP" = str_detect(fraction, "FDP"),
  41. "DIE_LINKE" = str_detect(fraction, "DIE LINKE"),
  42. "BUENDNIS_90_DIE_GRUENEN" = str_detect(fraction, "BÜNDNIS 90/DIE GRÜNEN"),
  43. "AfD" = str_detect(fraction, "AfD")) %>%
  44. select(-fraction) ->
  45. applause
  46. list(speaker = speaker, speeches = speeches, talks = talks, comments = comments, applause = applause)
  47. }
  48. # this reads all currently parseable data from one xml
  49. read_one <- function(name, path) {
  50. x <- tryCatch(read_xml(paste0(path, name)),
  51. error = function(c) NULL)
  52. if (is.null(x)) return(NULL)
  53. # extract date of session
  54. date <- xml_attr(x, "sitzung-datum")
  55. cs <- xml_children(x)
  56. verlauf <- xml_find_first(x, "sitzungsverlauf")
  57. speakerl <- xml_find_first(x, "rednerliste")
  58. xml_children(speakerl) %>%
  59. parse_speakerlist() ->
  60. speaker
  61. xml_children(verlauf) %>%
  62. xml_find_all("rede") %>%
  63. parse_speechlist(date) ->
  64. res
  65. list(speaker = speaker, speeches = res$speeches, talks = res$talks, comments = res$comments)
  66. }
  67. xml_get <- function(node, name) {
  68. res <- xml_text %$% xml_find_all(node, name)
  69. if (length(res) == 0) NA_character_
  70. else res
  71. }
  72. # parse one speaker
  73. parse_speaker <- function(speaker_xml) {
  74. speaker_id <- xml_attr(speaker_xml, "id")
  75. nm <- xml_child(speaker_xml)
  76. vorname <- xml_get(nm, "vorname")
  77. nachname <- xml_get(nm, "nachname")
  78. fraction <- xml_get(nm, "fraktion")
  79. titel <- xml_get(nm, "titel")
  80. rolle <- xml_find_all(nm, "rolle")
  81. if (length(rolle) > 0) {
  82. rolle_lang <- xml_get(rolle, "rolle_lang")
  83. rolle_kurz <- xml_get(rolle, "rolle_kurz")
  84. } else rolle_kurz <- rolle_lang <- NA_character_
  85. c(id = speaker_id, vorname = vorname, nachname = nachname, fraction = fraction, titel = titel,
  86. rolle_kurz = rolle_kurz, rolle_lang = rolle_lang)
  87. }
  88. # parse one speech
  89. # returns: - a speech (with speech id and speaker id)
  90. # - all talks appearing in the speech (with corresponding content)
  91. parse_speech <- function(speech_xml, date) {
  92. speech_id <- xml_attr(speech_xml, "id")
  93. cs <- xml_children(speech_xml)
  94. cur_speaker <- NA_character_
  95. principal_speaker <- NA_character_
  96. cur_content <- ""
  97. speeches <- list()
  98. comments <- list()
  99. for (node in cs) {
  100. if (xml_name(node) == "p" || xml_name(node) == "name") {
  101. klasse <- xml_attr(node, "klasse")
  102. if ((!is.na(klasse) && klasse == "redner") || xml_name(node) == "name") {
  103. if (!is.na(cur_speaker)) {
  104. speech <- c(speech_id = speech_id,
  105. speaker = cur_speaker,
  106. content = cur_content)
  107. speeches <- c(speeches, list(speech))
  108. cur_content <- ""
  109. }
  110. if (is.na(principal_speaker) && xml_name(node) != "name") {
  111. principal_speaker <- xml_child(node) %>% xml_attr("id")
  112. }
  113. if (xml_name(node) == "name") {
  114. cur_speaker <- "BTP"
  115. } else {
  116. cur_speaker <- xml_child(node) %>% xml_attr("id")
  117. }
  118. } else {
  119. cur_content <- paste0(cur_content, xml_text(node), sep="\n")
  120. }
  121. } else if (xml_name(node) == "kommentar") {
  122. # comments are of the form
  123. # <kommentar>(blabla [Fraktion] – blabla liasdf – bla)</kommentar>
  124. xml_text(node) %>%
  125. str_sub(2, -2) %>%
  126. str_split("–") %>%
  127. `[[`(1) %>%
  128. lapply(parse_comment, speech_id = speech_id, on_speaker = cur_speaker) ->
  129. cs
  130. comments <- c(comments, cs)
  131. }
  132. }
  133. speech <- c(speech_id = speech_id,
  134. speaker = cur_speaker,
  135. content = cur_content)
  136. speeches <- c(speeches, list(speech))
  137. list(speech = c(id = speech_id, speaker = principal_speaker, date = date),
  138. parts = speeches,
  139. comments = comments)
  140. }
  141. fractionpattern <- "BÜNDNIS(SES)?\\W*90/DIE\\W*GRÜNEN|CDU/CSU|AfD|SPD|DIE LINKE|FDP|LINKEN"
  142. fractionnames <- c("BÜNDNIS 90/DIE GRÜNEN", "CDU/CSU", "AfD", "SPD", "DIE LINKE", "FDP")
  143. parse_comment <- function(comment, speech_id, on_speaker) {
  144. base <- c(speech_id = speech_id, on_speaker = on_speaker)
  145. # classify comment
  146. if(str_detect(comment, "Beifall")) {
  147. str_extract_all(comment, fractionpattern) %>%
  148. `[[`(1) %>%
  149. sapply(partial(flip(head), 1) %.% agrep, x=fractionnames, max=0.2, value=T) %>%
  150. str_c(collapse=",") ->
  151. by
  152. c(base, type = "applause", fraction = by, kommentator = NA_character_, content = comment)
  153. } else {
  154. ps <- str_match(comment, "(.*) \\[(.*?)\\]: (.*)")[1,]
  155. c(base, type = "comment", fraction = ps[3], kommentator = ps[2], content = ps[4])
  156. }
  157. }
  158. # creates a tibble of speeches and a tibble of talks from a list of xml nodes representing speeches
  159. parse_speechlist <- function(speechlist_xml, date) {
  160. d <- sapply(speechlist_xml, parse_speech, date = date)
  161. speeches <- simplify2array(d["speech", ])
  162. parts <- simplify2array %$% unlist(d["parts", ], recursive=FALSE)
  163. comments <- simplify2array %$% unlist(d["comments", ], recursive=FALSE)
  164. list(speeches = tibble(id = speeches["id",], speaker = speeches["speaker",],
  165. date = speeches["date",]),
  166. talks = tibble(speech_id = parts["speech_id", ],
  167. speaker = parts["speaker", ],
  168. content = parts["content", ]),
  169. comments = tibble(speech_id = comments["speech_id",],
  170. on_speaker = comments["on_speaker",],
  171. type = comments["type",],
  172. fraction = comments["fraction",],
  173. kommentator = comments["kommentator",],
  174. content = comments["content", ]))
  175. }
  176. # create a tibble of speaker from a list of xml nodes representing speaker
  177. parse_speakerlist <- function(speakerliste_xml) {
  178. d <- sapply(speakerliste_xml, parse_speaker)
  179. tibble(id = d["id",],
  180. vorname = d["vorname",],
  181. nachname = d["nachname",],
  182. fraction = d["fraction",],
  183. titel = d["titel",],
  184. rolle_kurz = d["rolle_kurz",],
  185. rolle_lang = d["rolle_lang",])
  186. }
  187. #' Write the parsed and repaired results into a csv file to make loading and developing faster and easier
  188. #'
  189. #' @param tables tibble list
  190. #' @param path char
  191. #' @param create bool
  192. #'
  193. #' if create is set to TRUE, the directory given in path is created
  194. #'
  195. #' @export
  196. write_to_csv <- function(tables, path="data/csv/", create=F) {
  197. check_directory(path, create)
  198. write.table(tables$speaker, str_c(path, "speaker.csv"))
  199. write.table(tables$speeches, str_c(path, "speeches.csv"))
  200. write.table(tables$talks, str_c(path, "talks.csv"))
  201. write.table(tables$comments, str_c(path, "comments.csv"))
  202. write.table(tables$applause, str_c(path, "applause.csv"))
  203. }
  204. #' Read the needed tables for developing from a csv file.
  205. #'
  206. #' @param path char
  207. #'
  208. #' Reading the tables from a csv is way faster than reading and repairing the data every single time
  209. #'
  210. #' @export
  211. read_from_csv <- function(path="data/csv/") {
  212. list(speaker = read.table(str_c(path, "speaker.csv")) %>%
  213. tibble() %>%
  214. mutate(id = as.character(id)),
  215. speeches = read.table(str_c(path, "speeches.csv")) %>%
  216. tibble() %>%
  217. mutate(speaker = as.character(speaker)),
  218. talks = tibble %$% read.table(str_c(path, "talks.csv")),
  219. comments = tibble %$% read.table(str_c(path, "comments.csv")),
  220. applause = tibble %$% read.table(str_c(path, "applause.csv")))
  221. }