| @@ -1,8 +1,52 @@ | |||||
| import os | |||||
| words = [] | |||||
| import re | |||||
# Load the reference German vocabulary from a tab-separated word list.
# Every non-empty row is expected to hold exactly three fields:
# "<index>\t<word>\t<count>".
german_words = []
with open(
    '/home/josua/deu_mixed-typical_2011_1M/deu_mixed-typical_2011_1M-words.txt',
    encoding="utf-8",  # the list contains umlauts; do not rely on the locale
) as f:
    # Stream the file instead of materialising every line up front.
    for line in f:
        line = line.rstrip("\n")
        if not line:
            # A blank (e.g. trailing) line would break the 3-way unpack below.
            continue
        index, word, _count = line.split("\t")
        # Entries with an index of 100 or less are skipped.
        # NOTE(review): presumably those ids are punctuation/special tokens
        # in this corpus format -- confirm against the corpus documentation.
        if int(index) > 100:
            german_words.append(word.lower())
# Anything that is not an ASCII letter, a German umlaut or an Eszett.
_NON_LETTER_RE = re.compile(r"[^a-zA-ZüöäÜÖÄßẞ]")


def get_words_from_line(line):
    """Return the normalised words contained in one line of text.

    The line is split on any run of whitespace. Each token is stripped of
    every character that is not a letter from the German alphabet (so
    punctuation, digits and hyphens inside a token are removed, not used as
    separators) and is then lower-cased.

    Args:
        line: A single line of text; a trailing newline is fine.

    Returns:
        A list of lower-case words in their original order. Tokens that
        contain no letters at all (numbers, bare punctuation) are dropped
        rather than returned as empty strings.
    """
    # str.split() without arguments treats tabs/newlines/repeated blanks as
    # separators, so neighbouring words are never glued together.
    cleaned = (_NON_LETTER_RE.sub("", token).lower() for token in line.split())
    return [word for word in cleaned if word]
# Collect the normalised words of every source text into one flat list.
# The six numbered speeches plus the two extra texts are all read the same
# way, so they share a single loop.
hitler_words = []
source_paths = [f'hitler_texts/hitler_rede_{i}' for i in range(1, 7)]
source_paths += ['hitler_texts/goebbels_sportpalast', 'hitler_texts/mein_kampf']
for path in source_paths:
    # NOTE(review): assumes the text files are UTF-8 encoded -- confirm.
    with open(path, encoding="utf-8") as f:
        for line in f:
            hitler_words.extend(get_words_from_line(line))
# Reduce the collected words to the unique vocabulary of the source texts.
hitler_words = set(hitler_words)
# A token without any letters normalises to "", which is not a word.
hitler_words.discard("")
# Words used in the source texts that are absent from the reference list.
# Sorted so the result is identical from run to run (plain set iteration
# order depends on per-process string hash randomisation).
only_hitler_words = sorted(hitler_words.difference(german_words))
# Dump both vocabularies, one word per line.
# The encoding is explicit because the words contain umlauts and the
# platform default is not guaranteed to be able to encode them.
with open("german_words", "w", encoding="utf-8") as f:
    # Keeps the order (and any duplicates) of the reference list.
    f.writelines(f"{word}\n" for word in german_words)
with open("hitler_words", "w", encoding="utf-8") as f:
    # Sorted so that repeated runs produce byte-identical files.
    f.writelines(f"{word}\n" for word in sorted(hitler_words))