An R package to analyze the parliamentary records of the 19th legislative period of the Bundestag, the German parliament.
Nelze vybrat více než 25 témat Téma musí začínat písmenem nebo číslem, může obsahovat pomlčky („-“) a může být dlouhé až 35 znaků.

230 řádky
8.6KB

  1. # for usage see the example at the end
  2. #' Parse xml records
  3. #'
  4. #' Creates a list of tibbles containing relevant information from all records
  5. #' stored in the input directory.
  6. #'
  7. #' @param path character
  8. #'
  9. #' @export
  10. read_all <- function(path="records/") {
  11. cat("Reading all records from", path, "\n")
  12. available_protocols <- list.files(path)
  13. res <- pblapply(available_protocols, read_one, path=path)
  14. if (length(available_protocols) == 0)
  15. stop("The given directory is empty or does not exist.")
  16. lapply(res, `[[`, "speaker") %>%
  17. bind_rows() %>%
  18. distinct() ->
  19. speaker
  20. lapply(res, `[[`, "speeches") %>%
  21. bind_rows() %>%
  22. distinct() %>%
  23. mutate(date = as.Date(date, format="%d.%m.%Y")) ->
  24. speeches
  25. lapply(res, `[[`, "talks") %>%
  26. bind_rows() %>%
  27. distinct() ->
  28. talks
  29. lapply(res, `[[`, "comments") %>%
  30. bind_rows() %>%
  31. distinct() ->
  32. commentsandapplause
  33. filter(commentsandapplause, type == "comment") %>%
  34. select(-type) ->
  35. comments
  36. filter(commentsandapplause, type == "applause") %>%
  37. select(-type, -kommentator, -content) %>%
  38. mutate("CDU_CSU" = str_detect(fraction, "CDU/CSU"),
  39. "SPD" = str_detect(fraction, "SPD"),
  40. "FDP" = str_detect(fraction, "FDP"),
  41. "DIE_LINKE" = str_detect(fraction, "DIE LINKE"),
  42. "BUENDNIS_90_DIE_GRUENEN" = str_detect(fraction, "BÜNDNIS 90/DIE GRÜNEN"),
  43. "AfD" = str_detect(fraction, "AfD")) %>%
  44. select(-fraction) ->
  45. applause
  46. list(speaker = speaker, speeches = speeches, talks = talks, comments = comments, applause = applause)
  47. }
  48. # this reads all currently parseable data from one xml
  49. read_one <- function(name, path) {
  50. x <- tryCatch(read_xml(paste0(path, name)),
  51. error = function(c) NULL)
  52. if (is.null(x)) return(NULL)
  53. # extract date of session
  54. date <- xml_attr(x, "sitzung-datum")
  55. cs <- xml_children(x)
  56. verlauf <- xml_find_first(x, "sitzungsverlauf")
  57. speakerl <- xml_find_first(x, "rednerliste")
  58. xml_children(speakerl) %>%
  59. parse_speakerlist() ->
  60. speaker
  61. xml_children(verlauf) %>%
  62. xml_find_all("rede") %>%
  63. parse_speechlist(date) ->
  64. res
  65. list(speaker = speaker, speeches = res$speeches, talks = res$talks, comments = res$comments)
  66. }
  67. xml_get <- function(node, name) {
  68. res <- xml_text %$% xml_find_all(node, name)
  69. if (length(res) == 0) NA_character_
  70. else res
  71. }
  72. # parse one speaker
  73. parse_speaker <- function(speaker_xml) {
  74. speaker_id <- xml_attr(speaker_xml, "id")
  75. nm <- xml_child(speaker_xml)
  76. vorname <- xml_get(nm, "vorname")
  77. nachname <- xml_get(nm, "nachname")
  78. fraction <- xml_get(nm, "fraction")
  79. titel <- xml_get(nm, "titel")
  80. rolle <- xml_find_all(nm, "rolle")
  81. if (length(rolle) > 0) {
  82. rolle_lang <- xml_get(rolle, "rolle_lang")
  83. rolle_kurz <- xml_get(rolle, "rolle_kurz")
  84. } else rolle_kurz <- rolle_lang <- NA_character_
  85. c(id = speaker_id, vorname = vorname, nachname = nachname, fraction = fraction, titel = titel,
  86. rolle_kurz = rolle_kurz, rolle_lang = rolle_lang)
  87. }
  88. # parse one speech
  89. # returns: - a speech (with speech id and speaker id)
  90. # - all talks appearing in the speech (with corresponding content)
  91. parse_speech <- function(speech_xml, date) {
  92. speech_id <- xml_attr(speech_xml, "id")
  93. cs <- xml_children(speech_xml)
  94. cur_speaker <- NA_character_
  95. principal_speaker <- NA_character_
  96. cur_content <- ""
  97. speeches <- list()
  98. comments <- list()
  99. for (node in cs) {
  100. if (xml_name(node) == "p" || xml_name(node) == "name") {
  101. klasse <- xml_attr(node, "klasse")
  102. if ((!is.na(klasse) && klasse == "speaker") || xml_name(node) == "name") {
  103. if (!is.na(cur_speaker)) {
  104. speech <- c(speech_id = speech_id,
  105. speaker = cur_speaker,
  106. content = cur_content)
  107. speeches <- c(speeches, list(speech))
  108. cur_content <- ""
  109. }
  110. if (is.na(principal_speaker) && xml_name(node) != "name") {
  111. principal_speaker <- xml_child(node) %>% xml_attr("id")
  112. }
  113. if (xml_name(node) == "name") {
  114. cur_speaker <- "BTP"
  115. } else {
  116. cur_speaker <- xml_child(node) %>% xml_attr("id")
  117. }
  118. } else {
  119. cur_content <- paste0(cur_content, xml_text(node), sep="\n")
  120. }
  121. } else if (xml_name(node) == "kommentar") {
  122. # comments are of the form
  123. # <kommentar>(blabla [Fraktion] – blabla liasdf – bla)</kommentar>
  124. xml_text(node) %>%
  125. str_sub(2, -2) %>%
  126. str_split("–") %>%
  127. `[[`(1) %>%
  128. lapply(parse_comment, speech_id = speech_id, on_speaker = cur_speaker) ->
  129. cs
  130. comments <- c(comments, cs)
  131. }
  132. }
  133. speech <- c(speech_id = speech_id,
  134. speaker = cur_speaker,
  135. content = cur_content)
  136. speeches <- c(speeches, list(speech))
  137. list(speech = c(id = speech_id, speaker = principal_speaker, date = date),
  138. parts = speeches,
  139. comments = comments)
  140. }
  141. fractionpattern <- "BÜNDNIS(SES)?\\W*90/DIE\\W*GRÜNEN|CDU/CSU|AfD|SPD|DIE LINKE|FDP|LINKEN"
  142. fractionnames <- c("BÜNDNIS 90/DIE GRÜNEN", "CDU/CSU", "AfD", "SPD", "DIE LINKE", "FDP")
  143. parse_comment <- function(comment, speech_id, on_speaker) {
  144. base <- c(speech_id = speech_id, on_speaker = on_speaker)
  145. # classify comment
  146. if(str_detect(comment, "Beifall")) {
  147. str_extract_all(comment, fractionpattern) %>%
  148. `[[`(1) %>%
  149. sapply(partial(flip(head), 1) %.% agrep, x=fractionnames, max=0.2, value=T) %>%
  150. str_c(collapse=",") ->
  151. by
  152. c(base, type = "applause", fraction = by, kommentator = NA_character_, content = comment)
  153. } else {
  154. ps <- str_match(comment, "(.*) \\[(.*?)\\]: (.*)")[1,]
  155. c(base, type = "comment", fraction = ps[3], kommentator = ps[2], content = ps[4])
  156. }
  157. }
  158. # creates a tibble of speeches and a tibble of talks from a list of xml nodes representing speeches
  159. parse_speechlist <- function(speechlist_xml, date) {
  160. d <- sapply(speechlist_xml, parse_speech, date = date)
  161. speeches <- simplify2array(d["speech", ])
  162. parts <- simplify2array %$% unlist(d["parts", ], recursive=FALSE)
  163. comments <- simplify2array %$% unlist(d["comments", ], recursive=FALSE)
  164. list(speeches = tibble(id = speeches["id",], speaker = speeches["speaker",],
  165. date = speeches["date",]),
  166. talks = tibble(speech_id = parts["speech_id", ],
  167. speaker = parts["speaker", ],
  168. content = parts["content", ]),
  169. comments = tibble(speech_id = comments["speech_id",],
  170. on_speaker = comments["on_speaker",],
  171. type = comments["type",],
  172. fraction = comments["fraction",],
  173. kommentator = comments["kommentator",],
  174. content = comments["content", ]))
  175. }
  176. # create a tibble of speaker from a list of xml nodes representing speaker
  177. parse_speakerliste <- function(speakerliste_xml) {
  178. d <- sapply(speakerliste_xml, parse_speaker)
  179. tibble(id = d["id",],
  180. vorname = d["vorname",],
  181. nachname = d["nachname",],
  182. fraction = d["fraction",],
  183. titel = d["titel",],
  184. rolle_kurz = d["rolle_kurz",],
  185. rolle_lang = d["rolle_lang",])
  186. }
  187. #' @export
  188. write_to_csv <- function(tables, path="csv/", create=F) {
  189. check_directory(path, create)
  190. write.table(tables$speaker, str_c(path, "speaker.csv"))
  191. write.table(tables$speeches, str_c(path, "speeches.csv"))
  192. write.table(tables$talks, str_c(path, "talks.csv"))
  193. write.table(tables$comments, str_c(path, "comments.csv"))
  194. write.table(tables$applause, str_c(path, "applause.csv"))
  195. }
  196. #' @export
  197. read_from_csv <- function(path="csv/") {
  198. list(speaker = read.table(str_c(path, "speaker.csv")) %>%
  199. tibble() %>%
  200. mutate(id = as.character(id)),
  201. speeches = read.table(str_c(path, "speeches.csv")) %>%
  202. tibble() %>%
  203. mutate(speaker = as.character(speaker)),
  204. talks = tibble %$% read.table(str_c(path, "talks.csv")),
  205. comments = tibble %$% read.table(str_c(path, "comments.csv")),
  206. applause = tibble %$% read.table(str_c(path, "applause.csv")))
  207. }