An R package to analyze the parliamentary records of the 19th legislative period of the Bundestag, the German parliament.
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

63 lines
1.8KB

  1. import re
  2. german_words = []
  3. with open('/home/josua/deu_mixed-typical_2011_1M/deu_mixed-typical_2011_1M-words.txt') as f:
  4. lines = f.readlines()
  5. for line in lines:
  6. #print(line.split(sep="\t"))
  7. index, word, count = line.split(sep="\t")
  8. if int(index) > 100 and int(count) > 5:
  9. german_words.append(word.lower())
  10. with open('/home/josua/deu_mixed-typical_2011_1M/deu_news_1995_1M-words.txt') as f:
  11. lines = f.readlines()
  12. for line in lines:
  13. #print(line.split(sep="\t"))
  14. index, word, count = line.split(sep="\t")
  15. if int(index) > 100 and int(count) > 5:# only words that are used more than 5 times
  16. german_words.append(word.lower())
  17. def get_words_from_line(line):
  18. words = line.split(sep=" ")
  19. ret_list = []
  20. for word in words:
  21. word = re.sub("[^a-zA-ZüöäÜÖÄßẞ]", "", word)
  22. ret_list.append(word.lower())
  23. return ret_list
  24. hitler_words = []
  25. for i in range(1, 7):
  26. with open(f'hitler_rede_{i}') as f:
  27. lines = f.readlines()
  28. for line in lines:
  29. hitler_words.extend(get_words_from_line(line))
  30. with open(f'goebbels_sportpalast') as f:
  31. lines = f.readlines()
  32. for line in lines:
  33. hitler_words.extend(get_words_from_line(line))
  34. with open(f'mein_kampf') as f:
  35. lines = f.readlines()
  36. for line in lines:
  37. hitler_words.extend(get_words_from_line(line))
  38. german_words = set(german_words)
  39. hitler_words = set(hitler_words) #unique
  40. #filter_words = hitler_words.intersection(set(german_words))
  41. only_hitler_words = list(hitler_words.difference(german_words))
  42. print(only_hitler_words)
  43. with open("german_words", "w") as f:
  44. for word in german_words:
  45. word += "\n"
  46. f.write(word)
  47. with open("hitler_words", "w") as f:
  48. for word in only_hitler_words:
  49. word += "\n"
  50. f.write(word)