An R package to analyze the parliamentary records of the 19th legislative period of the Bundestag, the German parliament.
您最多选择25个主题 主题必须以字母或数字开头,可以包含连字符 (-),并且长度不得超过35个字符

264 行
10KB

  1. #' Parse xml records
  2. #'
  3. #' Creates a list of tibbles containing relevant information from all records
  4. #' stored in the input directory.
  5. #'
  6. #' @param path path to records directory
  7. #' @param pattern search pattern to find records in directory
  8. #'
  9. #' @export
  10. read_all <- function(path="inst/records/", pattern="-data\\.xml") {
  11. # append file separator if needed
  12. path <- make_directory_path(path)
  13. cat("Reading all records from", path, "\n")
  14. # list all files in directory and filter by search pattern
  15. fs <- list.files(path)
  16. available_protocols <- fs[str_detect(fs, pattern)]
  17. if (length(available_protocols) == 0)
  18. stop(paste0("The given directory does not exist or does not contain files matching \"",
  19. pattern,
  20. "\"."))
  21. # parse records one by one and remove null entries
  22. res <- compact %$% pblapply(available_protocols, read_one, path=path)
  23. if (length(res) == 0) stop("No valid records found. Did you fetch successfully?")
  24. lapply(res, `[[`, "speaker") %>%
  25. bind_rows() %>%
  26. distinct() ->
  27. speaker
  28. lapply(res, `[[`, "speeches") %>%
  29. bind_rows() %>%
  30. distinct() %>%
  31. mutate(date = as.Date(date, format="%d.%m.%Y")) ->
  32. speeches
  33. lapply(res, `[[`, "talks") %>%
  34. bind_rows() %>%
  35. distinct() ->
  36. talks
  37. lapply(res, `[[`, "comments") %>%
  38. bind_rows() %>%
  39. distinct() ->
  40. commentsandapplause
  41. filter(commentsandapplause, type == "comment") %>%
  42. select(-type) ->
  43. comments
  44. filter(commentsandapplause, type == "applause") %>%
  45. select(-type, -commenter, -content) %>%
  46. mutate("CDU_CSU" = str_detect(fraction, "CDU/CSU"),
  47. "SPD" = str_detect(fraction, "SPD"),
  48. "FDP" = str_detect(fraction, "FDP"),
  49. "DIE_LINKE" = str_detect(fraction, "DIE LINKE"),
  50. "BUENDNIS_90_DIE_GRUENEN" = str_detect(fraction, "B\u00DCNDNIS 90/DIE GR\u00DCNEN"),
  51. "AfD" = str_detect(fraction, "AfD")) %>%
  52. select(-fraction) ->
  53. applause
  54. list(speaker = speaker, speeches = speeches, talks = talks, comments = comments, applause = applause)
  55. }
  56. # this reads all currently parseable data from one xml
  57. read_one <- function(name, path) {
  58. x <- tryCatch(read_xml(paste0(path, name)),
  59. error = function(c) NULL,
  60. warning = function(c) NULL)
  61. if (is.null(x)) return(NULL)
  62. # extract date of session
  63. date <- xml_attr(x, "sitzung-datum")
  64. cs <- xml_children(x)
  65. verlauf <- xml_find_first(x, "sitzungsverlauf")
  66. speakerl <- xml_find_first(x, "rednerliste")
  67. # check if record is invalid or empty (every record should have at least
  68. # one speech, a speaker and a date
  69. if (is.na(date) || length(verlauf) == 0 || length(speakerl) == 0) {
  70. warning("Invalid record found. Skipping.")
  71. return(NULL)
  72. }
  73. xml_children(speakerl) %>%
  74. parse_speakerlist() ->
  75. speaker
  76. xml_children(verlauf) %>%
  77. xml_find_all("rede") %>%
  78. parse_speechlist(date) ->
  79. res
  80. list(speaker = speaker, speeches = res$speeches, talks = res$talks, comments = res$comments)
  81. }
  82. xml_get <- function(node, name) {
  83. res <- xml_text %$% xml_find_all(node, name)
  84. if (length(res) == 0) NA_character_
  85. else res
  86. }
  87. # parse one speaker
  88. parse_speaker <- function(speaker_xml) {
  89. speaker_id <- xml_attr(speaker_xml, "id")
  90. nm <- xml_child(speaker_xml)
  91. prename <- xml_get(nm, "vorname")
  92. lastname <- xml_get(nm, "nachname")
  93. fraction <- xml_get(nm, "fraktion")
  94. title <- xml_get(nm, "titel")
  95. role <- xml_find_all(nm, "rolle")
  96. if (length(role) > 0) {
  97. role_long <- xml_get(role, "rolle_lang")
  98. role_short <- xml_get(role, "rolle_kurz")
  99. } else role_short <- role_long <- NA_character_
  100. c(id = speaker_id, prename = prename, lastname = lastname, fraction = fraction, title = title,
  101. role_short = role_short, role_long = role_long)
  102. }
  103. # parse one speech
  104. # returns: - a speech (with speech id and speaker id)
  105. # - all talks appearing in the speech (with corresponding content)
  106. parse_speech <- function(speech_xml, date) {
  107. speech_id <- xml_attr(speech_xml, "id")
  108. cs <- xml_children(speech_xml)
  109. cur_speaker <- NA_character_
  110. principal_speaker <- NA_character_
  111. cur_content <- ""
  112. speeches <- list()
  113. comments <- list()
  114. for (node in cs) {
  115. if (xml_name(node) == "p" || xml_name(node) == "name") {
  116. klasse <- xml_attr(node, "klasse")
  117. if ((!is.na(klasse) && klasse == "redner") || xml_name(node) == "name") {
  118. if (!is.na(cur_speaker)) {
  119. speech <- c(speech_id = speech_id,
  120. speaker = cur_speaker,
  121. content = cur_content)
  122. speeches <- c(speeches, list(speech))
  123. cur_content <- ""
  124. }
  125. if (is.na(principal_speaker) && xml_name(node) != "name") {
  126. principal_speaker <- xml_child(node) %>% xml_attr("id")
  127. }
  128. if (xml_name(node) == "name") {
  129. cur_speaker <- "BTP"
  130. } else {
  131. cur_speaker <- xml_child(node) %>% xml_attr("id")
  132. }
  133. } else {
  134. cur_content <- paste0(cur_content, xml_text(node), sep="\n")
  135. }
  136. } else if (xml_name(node) == "kommentar") {
  137. # comments are of the form
  138. # <kommentar>(blabla [Fraktion] – blabla liasdf – bla)</kommentar>
  139. xml_text(node) %>%
  140. str_sub(2, -2) %>%
  141. str_split("–") %>%
  142. `[[`(1) %>%
  143. lapply(parse_comment, speech_id = speech_id, on_speaker = cur_speaker) ->
  144. cs
  145. comments <- c(comments, cs)
  146. }
  147. }
  148. speech <- c(speech_id = speech_id,
  149. speaker = cur_speaker,
  150. content = cur_content)
  151. speeches <- c(speeches, list(speech))
  152. list(speech = c(id = speech_id, speaker = principal_speaker, date = date),
  153. parts = speeches,
  154. comments = comments)
  155. }
  156. fractionpattern <- "B\u00DCNDNIS(SES)?\\W*90/DIE\\W*GR\u00DCNEN|CDU/CSU|AfD|SPD|DIE LINKE|FDP|LINKEN"
  157. fractionnames <- c("B\u00DCNDNIS 90/DIE GR\u00DCNEN", "CDU/CSU", "AfD", "SPD", "DIE LINKE", "FDP",
  158. "Fraktionslos")
  159. parse_comment <- function(comment, speech_id, on_speaker) {
  160. base <- c(speech_id = speech_id, on_speaker = on_speaker)
  161. # classify comment
  162. if(str_detect(comment, "Beifall")) {
  163. str_extract_all(comment, fractionpattern) %>%
  164. `[[`(1) %>%
  165. sapply(partial(flip(head), 1) %.% agrep, x=fractionnames, max=0.2, value=T) %>%
  166. str_c(collapse=",") ->
  167. by
  168. c(base, type = "applause", fraction = by, commenter = NA_character_, content = comment)
  169. } else {
  170. ps <- str_match(comment, "(.*) \\[(.*?)\\]: (.*)")[1,]
  171. fraction <- agrep(ps[3], fractionnames, max=0.2, value=T)
  172. if (all(is.na(fraction)) || length(fraction) == 0) fraction <- NA_character_
  173. c(base, type = "comment", fraction = fraction, commenter = ps[2], content = ps[4])
  174. }
  175. }
  176. # creates a tibble of speeches and a tibble of talks from a list of xml nodes representing speeches
  177. parse_speechlist <- function(speechlist_xml, date) {
  178. d <- sapply(speechlist_xml, parse_speech, date = date)
  179. speeches <- simplify2array(d["speech", ])
  180. parts <- simplify2array %$% unlist(d["parts", ], recursive=FALSE)
  181. comments <- simplify2array %$% unlist(d["comments", ], recursive=FALSE)
  182. list(speeches = tibble(id = speeches["id",], speaker = speeches["speaker",],
  183. date = speeches["date",]),
  184. talks = tibble(speech_id = parts["speech_id", ],
  185. speaker = parts["speaker", ],
  186. content = parts["content", ]),
  187. comments = tibble(speech_id = comments["speech_id",],
  188. on_speaker = comments["on_speaker",],
  189. type = comments["type",],
  190. fraction = comments["fraction",],
  191. commenter = comments["commenter",],
  192. content = comments["content", ]))
  193. }
  194. # create a tibble of speaker from a list of xml nodes representing speaker
  195. parse_speakerlist <- function(speakerliste_xml) {
  196. d <- sapply(speakerliste_xml, parse_speaker)
  197. tibble(id = d["id",],
  198. prename = d["prename",],
  199. lastname = d["lastname",],
  200. fraction = d["fraction",],
  201. title = d["title",],
  202. role_short = d["role_short",],
  203. role_long = d["role_long",])
  204. }
  205. #' Write the parsed and repaired results into separate csv files
  206. #'
  207. #' @param tables list of tables to convert into a csv files.
  208. #' @param path where to put the csv files.
  209. #' @param create set TRUE if the path does not exist yet and you want to create it
  210. #'
  211. #' @export
  212. write_to_csv <- function(tables, path="inst/csv/", create=F) {
  213. check_directory(path, create)
  214. write.table(tables$speaker, str_c(path, "speaker.csv"))
  215. write.table(tables$speeches, str_c(path, "speeches.csv"))
  216. write.table(tables$talks, str_c(path, "talks.csv"))
  217. write.table(tables$comments, str_c(path, "comments.csv"))
  218. write.table(tables$applause, str_c(path, "applause.csv"))
  219. }
  220. #' create a tibble from the csv file
  221. #'
  222. #' @param path directory to read files from
  223. #'
  224. #' reading the tables from a csv is way faster than reading and repairing the data every single time
  225. #'
  226. #' @export
  227. read_from_csv <- function(path="inst/csv/") {
  228. list(speaker = read.table(str_c(path, "speaker.csv")) %>%
  229. tibble() %>%
  230. mutate(id = as.character(id)),
  231. speeches = read.table(str_c(path, "speeches.csv")) %>%
  232. tibble() %>%
  233. mutate(speaker = as.character(speaker),
  234. date = as.Date(date)),
  235. talks = tibble %$% read.table(str_c(path, "talks.csv")),
  236. comments = tibble %$% read.table(str_c(path, "comments.csv")),
  237. applause = tibble %$% read.table(str_c(path, "applause.csv")))
  238. }