| @@ -6,7 +6,15 @@ with open('/home/josua/deu_mixed-typical_2011_1M/deu_mixed-typical_2011_1M-words | |||||
| for line in lines: | for line in lines: | ||||
| #print(line.split(sep="\t")) | #print(line.split(sep="\t")) | ||||
| index, word, count = line.split(sep="\t") | index, word, count = line.split(sep="\t") | ||||
| if int(index) > 100: | |||||
| if int(index) > 100 and int(count) > 5: | |||||
| german_words.append(word.lower()) | |||||
| with open('/home/josua/deu_mixed-typical_2011_1M/deu_news_1995_1M-words.txt') as f: | |||||
| lines = f.readlines() | |||||
| for line in lines: | |||||
| #print(line.split(sep="\t")) | |||||
| index, word, count = line.split(sep="\t") | |||||
| if int(index) > 100 and int(count) > 5:# only words that are used more than 5 times | |||||
| german_words.append(word.lower()) | german_words.append(word.lower()) | ||||
| @@ -36,11 +44,14 @@ with open(f'mein_kampf') as f: | |||||
| for line in lines: | for line in lines: | ||||
| hitler_words.extend(get_words_from_line(line)) | hitler_words.extend(get_words_from_line(line)) | ||||
| german_words = set(german_words) | |||||
| hitler_words = set(hitler_words) #unique | hitler_words = set(hitler_words) #unique | ||||
| #filter_words = hitler_words.intersection(set(german_words)) | #filter_words = hitler_words.intersection(set(german_words)) | ||||
| only_hitler_words = list(hitler_words.difference(german_words)) | only_hitler_words = list(hitler_words.difference(german_words)) | ||||
| print(only_hitler_words) | |||||
| with open("german_words", "w") as f: | with open("german_words", "w") as f: | ||||
| for word in german_words: | for word in german_words: | ||||
| word += "\n" | word += "\n" | ||||