"""Derive the words that occur in a set of text files but not in a reference list.

A tab-separated German word list is loaded as the reference vocabulary, the
words of several plain-text source files are collected, and both the
reference list and the words found only in the source files are written out.
"""
import re

# Reference vocabulary, lower-cased, in the order it appears in the word list.
german_words = []
# Explicit UTF-8: the list contains umlauts and must not depend on the locale.
with open('/home/josua/deu_mixed-typical_2011_1M/deu_mixed-typical_2011_1M-words.txt', encoding="utf-8") as f:
    for line in f:
        line = line.rstrip("\n")
        if not line:
            # A blank line (e.g. at the end of the file) has no fields to unpack.
            continue
        # Each entry is ``index<TAB>word<TAB>count``; the count is not used.
        index, word, _count = line.split("\t")
        # Only entries whose index is above 100 are kept.
        # NOTE(review): presumably the first 100 entries are punctuation or
        # other special tokens -- confirm against the word list.
        if int(index) > 100:
            german_words.append(word.lower())


# Matches every character that is not an ASCII letter, a German umlaut or a sharp s.
_NON_LETTERS = re.compile("[^a-zA-ZüöäÜÖÄßẞ]")


def get_words_from_line(line):
    """Return the lower-cased words contained in *line*.

    The line is split on whitespace; every character that is not an ASCII
    letter, an umlaut or a sharp s is removed from each token, so a hyphenated
    token such as ``Sport-Palast`` collapses into the single word
    ``sportpalast``.

    Args:
        line: One line of text, with or without a trailing newline.

    Returns:
        A list of non-empty lower-cased words in their order of appearance.
        Tokens that contain no letters at all (numbers, bare punctuation)
        are dropped instead of being returned as empty strings.
    """
    words = []
    # split() without arguments treats tabs and runs of blanks as separators,
    # so neighbouring words are never glued together by the clean-up below.
    for token in line.split():
        cleaned = _NON_LETTERS.sub("", token)
        if cleaned:
            words.append(cleaned.lower())
    return words


# Plain-text source files whose vocabulary is collected: six numbered speech
# files followed by two individually named files.
_SOURCE_FILES = [f'hitler_rede_{i}' for i in range(1, 7)]
_SOURCE_FILES += ['goebbels_sportpalast', 'mein_kampf']

# Unique words found across all source files.
hitler_words = set()
for source_name in _SOURCE_FILES:
    # Explicit UTF-8 so umlauts decode the same way regardless of the locale.
    with open(source_name, encoding="utf-8") as f:
        for line in f:
            hitler_words.update(get_words_from_line(line))

# An empty string is not a word; make sure none is carried into the result.
hitler_words.discard("")

# Words that occur in the source files but not in the reference vocabulary.
# Sorted because set iteration order differs between interpreter runs, which
# would otherwise make the order of the output file unpredictable.
only_hitler_words = sorted(hitler_words.difference(german_words))

# Write the reference vocabulary, one word per line.
# Explicit UTF-8 so words with umlauts are written identically on every platform.
with open("german_words", "w", encoding="utf-8") as f:
    f.writelines(f"{word}\n" for word in german_words)

# Write the words found only in the source files, one word per line.
with open("hitler_words", "w", encoding="utf-8") as f:
    f.writelines(f"{word}\n" for word in only_hitler_words)