An R package to analyze the parliamentary records of the 19th legislative period of the Bundestag, the German parliament.
選択できるのは25トピックまでです。 トピックは、先頭が英数字で、英数字とダッシュ('-')を使用した35文字以内のものにしてください。

52 行
1.4KB

  1. import re
  2. german_words = []
  3. with open('/home/josua/deu_mixed-typical_2011_1M/deu_mixed-typical_2011_1M-words.txt') as f:
  4. lines = f.readlines()
  5. for line in lines:
  6. #print(line.split(sep="\t"))
  7. index, word, count = line.split(sep="\t")
  8. if int(index) > 100:
  9. german_words.append(word.lower())
  10. def get_words_from_line(line):
  11. words = line.split(sep=" ")
  12. ret_list = []
  13. for word in words:
  14. word = re.sub("[^a-zA-ZüöäÜÖÄßẞ]", "", word)
  15. ret_list.append(word.lower())
  16. return ret_list
  17. hitler_words = []
  18. for i in range(1, 7):
  19. with open(f'hitler_rede_{i}') as f:
  20. lines = f.readlines()
  21. for line in lines:
  22. hitler_words.extend(get_words_from_line(line))
  23. with open(f'goebbels_sportpalast') as f:
  24. lines = f.readlines()
  25. for line in lines:
  26. hitler_words.extend(get_words_from_line(line))
  27. with open(f'mein_kampf') as f:
  28. lines = f.readlines()
  29. for line in lines:
  30. hitler_words.extend(get_words_from_line(line))
  31. hitler_words = set(hitler_words) #unique
  32. #filter_words = hitler_words.intersection(set(german_words))
  33. only_hitler_words = list(hitler_words.difference(german_words))
  34. with open("german_words", "w") as f:
  35. for word in german_words:
  36. word += "\n"
  37. f.write(word)
  38. with open("hitler_words", "w") as f:
  39. for word in only_hitler_words:
  40. word += "\n"
  41. f.write(word)