An R package to analyze the parliamentary records of the 19th legislative period of the Bundestag, the German parliament.
Nevar pievienot vairāk kā 25 tēmas. Tēmai ir jāsākas ar burtu vai ciparu, tā var saturēt domu zīmes ('-') un var būt līdz 35 simboliem gara.

63 rindas
1.8KB

  1. import re
  2. german_words = []
  3. with open('/home/josua/deu_mixed-typical_2011_1M/deu_mixed-typical_2011_1M-words.txt') as f:
  4. lines = f.readlines()
  5. for line in lines:
  6. #print(line.split(sep="\t"))
  7. index, word, count = line.split(sep="\t")
  8. if int(index) > 100 and int(count) > 5:
  9. german_words.append(word.lower())
  10. with open('/home/josua/deu_mixed-typical_2011_1M/deu_news_1995_1M-words.txt') as f:
  11. lines = f.readlines()
  12. for line in lines:
  13. #print(line.split(sep="\t"))
  14. index, word, count = line.split(sep="\t")
  15. if int(index) > 100 and int(count) > 5:# only words that are used more than 5 times
  16. german_words.append(word.lower())
  17. def get_words_from_line(line):
  18. words = line.split(sep=" ")
  19. ret_list = []
  20. for word in words:
  21. word = re.sub("[^a-zA-ZüöäÜÖÄßẞ]", "", word)
  22. ret_list.append(word.lower())
  23. return ret_list
  24. hitler_words = []
  25. for i in range(1, 7):
  26. with open(f'hitler_rede_{i}') as f:
  27. lines = f.readlines()
  28. for line in lines:
  29. hitler_words.extend(get_words_from_line(line))
  30. with open(f'goebbels_sportpalast') as f:
  31. lines = f.readlines()
  32. for line in lines:
  33. hitler_words.extend(get_words_from_line(line))
  34. with open(f'mein_kampf') as f:
  35. lines = f.readlines()
  36. for line in lines:
  37. hitler_words.extend(get_words_from_line(line))
  38. german_words = set(german_words)
  39. hitler_words = set(hitler_words) #unique
  40. #filter_words = hitler_words.intersection(set(german_words))
  41. only_hitler_words = list(hitler_words.difference(german_words))
  42. print(only_hitler_words)
  43. with open("german_words", "w") as f:
  44. for word in german_words:
  45. word += "\n"
  46. f.write(word)
  47. with open("hitler_words", "w") as f:
  48. for word in only_hitler_words:
  49. word += "\n"
  50. f.write(word)