|
|
|
@@ -1,8 +1,52 @@ |
|
|
|
import os |
|
|
|
words = [] |
|
|
|
import re |
|
|
|
|
|
|
|
# Load the German reference vocabulary from a tab-separated word list whose
# lines have three columns: index, word, count.
german_words = []
# The encoding is stated explicitly so that umlauts and ß are decoded the same
# way regardless of the locale the script runs under.
with open('/home/josua/deu_mixed-typical_2011_1M/deu_mixed-typical_2011_1M-words.txt', encoding="utf-8") as f:
    for line in f:
        # A blank line (e.g. at the very end of the file) has no columns to
        # unpack and would otherwise abort the whole load.
        if not line.strip():
            continue
        index, word, _count = line.split(sep="\t")
        # Entries with an index of 100 or below are left out.
        # NOTE(review): presumably the first entries of the list are
        # punctuation or other non-word tokens -- confirm against the file.
        if int(index) > 100:
            german_words.append(word.lower())
|
|
|
|
|
|
|
|
|
|
|
def get_words_from_line(line):
    """Split a line of text into lowercase words made only of letters.

    The line is split on whitespace. Within each token every character that
    is not an ASCII letter, a German umlaut or an Eszett is removed, and what
    remains is lowercased.

    Args:
        line: One line of raw text.

    Returns:
        A list of the cleaned, lowercased words in their original order.
        Tokens that contain no letters at all are left out.
    """
    cleaned = []
    # split() without arguments breaks on any run of whitespace, so tabs and
    # the trailing newline separate words instead of being stripped by the
    # regex and gluing two neighbouring words together.
    for token in line.split():
        letters = re.sub("[^a-zA-ZüöäÜÖÄßẞ]", "", token)
        # Pure punctuation or digits collapse to "", which is not a word.
        if letters:
            cleaned.append(letters.lower())
    return cleaned
|
|
|
|
|
|
|
|
|
|
|
# All source texts are read from hitler_texts/: six numbered speech files
# followed by two further files.
source_paths = [f'hitler_texts/hitler_rede_{i}' for i in range(1, 7)]
source_paths += ['hitler_texts/goebbels_sportpalast', 'hitler_texts/mein_kampf']

# Collect the cleaned words of every source text into one flat list.
hitler_words = []
for source_path in source_paths:
    # Explicit encoding: the texts are German and must decode identically
    # regardless of the locale the script runs under.
    with open(source_path, encoding="utf-8") as f:
        for line in f:
            hitler_words.extend(get_words_from_line(line))
|
|
|
|
|
|
|
# Collapse the collected words to their distinct values.
hitler_words = {*hitler_words}

# Words from the texts that do not occur in the German reference list.
# NOTE(review): this list is not referenced by the rest of the visible
# script -- confirm whether it was meant to be written out.
only_hitler_words = [*hitler_words.difference(german_words)]
|
|
|
|
|
|
|
# Write both vocabularies as plain text, one word per line. The encoding is
# stated explicitly because the words may contain umlauts and ß, which a
# non-UTF-8 locale default could fail to encode.
with open("german_words", "w", encoding="utf-8") as f:
    f.writelines(f"{word}\n" for word in german_words)

with open("hitler_words", "w", encoding="utf-8") as f:
    f.writelines(f"{word}\n" for word in hitler_words)