An R package to analyze the parliamentary records of the 19th legislative period of the Bundestag, the German parliament.
Você não pode selecionar mais de 25 tópicos Os tópicos devem começar com uma letra ou um número, podem incluir traços ('-') e podem ter até 35 caracteres.

52 linhas
1.4KB

  1. import re
  # Build the reference list of German words from a tab-separated word file.
  # Every line must hold exactly three fields: index, word, count.
  # NOTE(review): the file name suggests a Leipzig Corpora word list - confirm.
  german_words = []
  # Explicit UTF-8 so umlauts and sharp s decode the same way under any locale.
  with open(
      '/home/josua/deu_mixed-typical_2011_1M/deu_mixed-typical_2011_1M-words.txt',
      encoding='utf-8',
  ) as f:
      # Stream the file line by line instead of loading it into memory first.
      for line in f:
          index, word, _count = line.split(sep="\t")
          # Entries with an index of 100 or below are skipped
          # (presumably non-word or very frequent tokens - TODO confirm).
          if int(index) > 100:
              german_words.append(word.lower())
  def get_words_from_line(line):
      """Split a line of text into cleaned, lower-cased words.

      The line is split on runs of whitespace. Within each token every
      character other than ASCII letters, German umlauts and sharp s is
      removed, so punctuation and digits vanish and hyphenated compounds
      are joined into a single word. Tokens that end up empty after the
      clean-up are dropped rather than returned as empty strings.

      Args:
          line: A line of text, with or without a trailing newline.

      Returns:
          A list with the lower-cased words in their order of appearance;
          an empty list if the line contains no letters.
      """
      # str.split() without arguments also treats tabs and newlines as
      # separators and never yields empty tokens for repeated blanks.
      cleaned = (re.sub("[^a-zA-ZüöäÜÖÄßẞ]", "", token) for token in line.split())
      return [word.lower() for word in cleaned if word]
  # Collect the vocabulary of all source texts: the six numbered files
  # hitler_rede_1 .. hitler_rede_6 plus the two individually named texts.
  source_files = [f'hitler_texts/hitler_rede_{i}' for i in range(1, 7)]
  source_files += ['hitler_texts/goebbels_sportpalast', 'hitler_texts/mein_kampf']

  # A set keeps every word only once.
  hitler_words = set()
  for source_file in source_files:
      # Explicit UTF-8 so umlauts decode the same way under any locale.
      with open(source_file, encoding='utf-8') as f:
          for line in f:
              hitler_words.update(get_words_from_line(line))

  # Words from the source texts that are absent from the reference word list.
  # NOTE(review): this list is computed but never written out below; the
  # "hitler_words" file receives the unfiltered set. Confirm that is intended.
  only_hitler_words = list(hitler_words.difference(german_words))

  # Write both vocabularies, one word per line.
  with open("german_words", "w", encoding='utf-8') as f:
      f.writelines(f"{word}\n" for word in german_words)

  with open("hitler_words", "w", encoding='utf-8') as f:
      # Sorted so the output is reproducible; set iteration order varies
      # between interpreter runs.
      f.writelines(f"{word}\n" for word in sorted(hitler_words))