An R package to analyze the parliamentary records of the 19th legislative period of the Bundestag, the German parliament.
Nevar pievienot vairāk kā 25 tēmas Tēmai ir jāsākas ar burtu vai ciparu, tā var saturēt domu zīmes ('-') un var būt līdz 35 simboliem gara.

242 rindas
8.8KB

  1. # for usage see the example at the end
  2. #' Parse xml records
  3. #'
  4. #' Creates a list of tibbles containing relevant information from all records
  5. #' stored in the input directory.
  6. #'
  7. #' @param path character
  8. #'
  9. #' @export
  10. read_all <- function(path="records/") {
  11. cat("Reading all records from", path, "\n")
  12. available_protocols <- list.files(path)
  13. res <- pblapply(available_protocols, read_one, path=path)
  14. lapply(res, `[[`, "speaker") %>%
  15. bind_rows() %>%
  16. distinct() ->
  17. speaker
  18. lapply(res, `[[`, "speeches") %>%
  19. bind_rows() %>%
  20. distinct() %>%
  21. mutate(date = as.Date(date, format="%d.%m.%Y")) ->
  22. speeches
  23. lapply(res, `[[`, "talks") %>%
  24. bind_rows() %>%
  25. distinct() ->
  26. talks
  27. lapply(res, `[[`, "comments") %>%
  28. bind_rows() %>%
  29. distinct() ->
  30. commentsandapplause
  31. if (length(available_protocols) == 0)
  32. warning("The given directory is empty or does not exist.")
  33. filter(commentsandapplause, type == "comment") %>%
  34. select(-type) ->
  35. comments
  36. filter(commentsandapplause, type == "applause") %>%
  37. select(-type, -kommentator, -content) %>%
  38. mutate("CDU_CSU" = str_detect(fraction, "CDU/CSU"),
  39. "SPD" = str_detect(fraction, "SPD"),
  40. "FDP" = str_detect(fraction, "FDP"),
  41. "DIE_LINKE" = str_detect(fraction, "DIE LINKE"),
  42. "BÜNDNIS_90_DIE_GRÜNEN" = str_detect(fraction, "BÜNDNIS 90/DIE GRÜNEN"),
  43. "AfD" = str_detect(fraction, "AfD")) %>%
  44. select(-fraction) ->
  45. applause
  46. list(speaker = speaker, speeches = speeches, talks = talks, comments = comments, applause = applause)
  47. }
  48. # this reads all currently parseable data from one xml
  49. read_one <- function(name, path) {
  50. x <- tryCatch(read_xml(paste0(path, name)),
  51. error = function(c) NULL)
  52. if (is.null(x)) return(NULL)
  53. # extract date of session
  54. date <- xml_attr(x, "sitzung-datum")
  55. cs <- xml_children(x)
  56. verlauf <- xml_find_first(x, "sitzungsverlauf")
  57. speakerl <- xml_find_first(x, "rednerliste")
  58. xml_children(speakerl) %>%
  59. parse_speakerlist() ->
  60. speaker
  61. xml_children(verlauf) %>%
  62. xml_find_all("rede") %>%
  63. parse_speechlist(date) ->
  64. res
  65. list(speaker = speaker, speeches = res$speeches, talks = res$talks, comments = res$comments)
  66. }
  67. xml_get <- function(node, name) {
  68. res <- xml_text %$% xml_find_all(node, name)
  69. if (length(res) == 0) NA_character_
  70. else res
  71. }
  72. # parse one speaker
  73. parse_speaker <- function(speaker_xml) {
  74. speaker_id <- xml_attr(speaker_xml, "id")
  75. nm <- xml_child(speaker_xml)
  76. vorname <- xml_get(nm, "vorname")
  77. nachname <- xml_get(nm, "nachname")
  78. fraction <- xml_get(nm, "fraction")
  79. titel <- xml_get(nm, "titel")
  80. rolle <- xml_find_all(nm, "rolle")
  81. if (length(rolle) > 0) {
  82. rolle_lang <- xml_get(rolle, "rolle_lang")
  83. rolle_kurz <- xml_get(rolle, "rolle_kurz")
  84. } else rolle_kurz <- rolle_lang <- NA_character_
  85. c(id = speaker_id, vorname = vorname, nachname = nachname, fraction = fraction, titel = titel,
  86. rolle_kurz = rolle_kurz, rolle_lang = rolle_lang)
  87. }
  88. # parse one speech
  89. # returns: - a speech (with speech id and speaker id)
  90. # - all talks appearing in the speech (with corresponding content)
  91. parse_speech <- function(speech_xml, date) {
  92. speech_id <- xml_attr(speech_xml, "id")
  93. cs <- xml_children(speech_xml)
  94. cur_speaker <- NA_character_
  95. principal_speaker <- NA_character_
  96. cur_content <- ""
  97. speeches <- list()
  98. comments <- list()
  99. for (node in cs) {
  100. if (xml_name(node) == "p" || xml_name(node) == "name") {
  101. klasse <- xml_attr(node, "klasse")
  102. if ((!is.na(klasse) && klasse == "speaker") || xml_name(node) == "name") {
  103. if (!is.na(cur_speaker)) {
  104. speech <- c(speech_id = speech_id,
  105. speaker = cur_speaker,
  106. content = cur_content)
  107. speeches <- c(speeches, list(speech))
  108. cur_content <- ""
  109. }
  110. if (is.na(principal_speaker) && xml_name(node) != "name") {
  111. principal_speaker <- xml_child(node) %>% xml_attr("id")
  112. }
  113. if (xml_name(node) == "name") {
  114. cur_speaker <- "BTP"
  115. } else {
  116. cur_speaker <- xml_child(node) %>% xml_attr("id")
  117. }
  118. } else {
  119. cur_content <- paste0(cur_content, xml_text(node), sep="\n")
  120. }
  121. } else if (xml_name(node) == "kommentar") {
  122. # comments are of the form
  123. # <kommentar>(blabla [Fraktion] – blabla liasdf – bla)</kommentar>
  124. xml_text(node) %>%
  125. str_sub(2, -2) %>%
  126. str_split("–") %>%
  127. `[[`(1) %>%
  128. lapply(parse_comment, speech_id = speech_id, on_speaker = cur_speaker) ->
  129. cs
  130. comments <- c(comments, cs)
  131. }
  132. }
  133. speech <- c(speech_id = speech_id,
  134. speaker = cur_speaker,
  135. content = cur_content)
  136. speeches <- c(speeches, list(speech))
  137. list(speech = c(id = speech_id, speaker = principal_speaker, date = date),
  138. parts = speeches,
  139. comments = comments)
  140. }
  141. fractionpattern <- "BÜNDNIS(SES)?\\W*90/DIE\\W*GRÜNEN|CDU/CSU|AfD|SPD|DIE LINKE|FDP|LINKEN"
  142. fractionnames <- c("BÜNDNIS 90/DIE GRÜNEN", "CDU/CSU", "AfD", "SPD", "DIE LINKE", "FDP")
  143. parse_comment <- function(comment, speech_id, on_speaker) {
  144. base <- c(speech_id = speech_id, on_speaker = on_speaker)
  145. # classify comment
  146. if(str_detect(comment, "Beifall")) {
  147. str_extract_all(comment, fractionpattern) %>%
  148. `[[`(1) %>%
  149. sapply(partial(flip(head), 1) %.% agrep, x=fractionnames, max=0.2, value=T) %>%
  150. str_c(collapse=",") ->
  151. by
  152. c(base, type = "applause", fraction = by, kommentator = NA_character_, content = comment)
  153. } else {
  154. ps <- str_match(comment, "(.*) \\[(.*?)\\]: (.*)")[1,]
  155. c(base, type = "comment", fraction = ps[3], kommentator = ps[2], content = ps[4])
  156. }
  157. }
  158. # creates a tibble of speeches and a tibble of talks from a list of xml nodes representing speeches
  159. parse_speechlist <- function(speechlist_xml, date) {
  160. d <- sapply(speechlist_xml, parse_speech, date = date)
  161. speeches <- simplify2array(d["speech", ])
  162. parts <- simplify2array %$% unlist(d["parts", ], recursive=FALSE)
  163. comments <- simplify2array %$% unlist(d["comments", ], recursive=FALSE)
  164. list(speeches = tibble(id = speeches["id",], speaker = speeches["speaker",],
  165. date = speeches["date",]),
  166. talks = tibble(speech_id = parts["speech_id", ],
  167. speaker = parts["speaker", ],
  168. content = parts["content", ]),
  169. comments = tibble(speech_id = comments["speech_id",],
  170. on_speaker = comments["on_speaker",],
  171. type = comments["type",],
  172. fraction = comments["fraction",],
  173. kommentator = comments["kommentator",],
  174. content = comments["content", ]))
  175. }
  176. # create a tibble of speaker from a list of xml nodes representing speaker
  177. parse_speakerliste <- function(speakerliste_xml) {
  178. d <- sapply(speakerliste_xml, parse_speaker)
  179. tibble(id = d["id",],
  180. vorname = d["vorname",],
  181. nachname = d["nachname",],
  182. fraction = d["fraction",],
  183. titel = d["titel",],
  184. rolle_kurz = d["rolle_kurz",],
  185. rolle_lang = d["rolle_lang",])
  186. }
  187. #' @export
  188. write_to_csv <- function(tables, path="csv/", create=F) {
  189. check_directory(path, create)
  190. write.table(tables$speaker, str_c(path, "speaker.csv"))
  191. write.table(tables$speeches, str_c(path, "speeches.csv"))
  192. write.table(tables$talks, str_c(path, "talks.csv"))
  193. write.table(tables$comments, str_c(path, "comments.csv"))
  194. write.table(tables$applause, str_c(path, "applause.csv"))
  195. }
  196. #' @export
  197. read_from_csv <- function(path="csv/") {
  198. list(speaker = read.table(str_c(path, "speaker.csv")) %>%
  199. tibble() %>%
  200. mutate(id = as.character(id)),
  201. speeches = read.table(str_c(path, "speeches.csv")) %>%
  202. tibble() %>%
  203. mutate(speaker = as.character(speaker)),
  204. talks = tibble %$% read.table(str_c(path, "talks.csv")),
  205. comments = tibble %$% read.table(str_c(path, "comments.csv")),
  206. applause = tibble %$% read.table(str_c(path, "applause.csv")))
  207. }
  208. # -------------------------------
  209. # EXAMPLE USE
  210. # make sure data ist downloaded via fetch.R
  211. # res <- read_one("records/19126-data.xml")
  212. #
  213. # res$speaker
  214. # res$speeches
  215. # res$talks
  216. # -------------------------------