import re

def _load_frequent_words(path, min_index=100, min_count=5):
    """Return the lower-cased words of one tab-separated frequency list.

    Each non-blank line of *path* must consist of exactly three
    tab-separated fields: ``index``, ``word`` and ``count``.

    Args:
        path: Location of the word-frequency file.
        min_index: Only entries whose index is strictly greater than this
            value are kept.
        min_count: Only entries whose count is strictly greater than this
            value are kept (i.e. words used more than ``min_count`` times).

    Returns:
        A list of lower-cased words in file order (duplicates preserved).

    Raises:
        ValueError: If a non-blank line does not have exactly three fields
            or its index/count field is not an integer.
    """
    words = []
    # NOTE(review): the word lists are assumed to be UTF-8 encoded; the
    # encoding is given explicitly so that reading umlauts does not depend
    # on the platform's locale default - confirm against the corpus files.
    with open(path, encoding="utf-8") as f:
        for line in f:
            if not line.strip():
                # Tolerate blank lines (e.g. a trailing newline at EOF).
                continue
            index, word, count = line.split("\t")
            # int() ignores the trailing newline that is still attached to
            # the last field.
            # NOTE(review): presumably the first 100 ids are skipped because
            # they hold punctuation / special tokens - confirm.
            if int(index) > min_index and int(count) > min_count:
                words.append(word.lower())
    return words


# Reference vocabulary: frequent words from both corpus word lists.
german_words = []
for _word_list_path in (
    '/home/josua/deu_mixed-typical_2011_1M/deu_mixed-typical_2011_1M-words.txt',
    '/home/josua/deu_mixed-typical_2011_1M/deu_news_1995_1M-words.txt',
):
    german_words.extend(_load_frequent_words(_word_list_path))


# Matches every character that is not a German letter (ASCII letters,
# umlauts and both forms of sharp s).
_NON_LETTER_RE = re.compile("[^a-zA-ZüöäÜÖÄßẞ]")


def get_words_from_line(line):
    """Split *line* into lower-cased words consisting of German letters only.

    The line is split on any run of whitespace. Within each token every
    character that is not an ASCII letter, an umlaut or a sharp s is
    removed (so punctuation and digits vanish and a hyphenated compound
    is fused into one word). Tokens that contain no letters at all are
    dropped instead of being returned as empty strings.

    Args:
        line: One line of text; a trailing newline is fine.

    Returns:
        A list of lower-cased words in their order of appearance. The list
        is empty when the line contains no letters.
    """
    ret_list = []
    # split() without arguments treats tabs, newlines and repeated spaces
    # as a single separator, so neighbouring words are never glued together
    # and no empty tokens arise from consecutive spaces.
    for token in line.split():
        word = _NON_LETTER_RE.sub("", token)
        if word:
            ret_list.append(word.lower())
    return ret_list


# Text files that make up the corpus: six numbered speech files plus two
# further texts. All are opened relative to the current working directory.
_corpus_files = [f'hitler_rede_{i}' for i in range(1, 7)]
_corpus_files += ['goebbels_sportpalast', 'mein_kampf']

# Every word occurrence from all corpus files, in reading order
# (duplicates are still present at this point).
hitler_words = []
for _corpus_file in _corpus_files:
    # NOTE(review): the texts are assumed to be UTF-8 encoded; the encoding
    # is given explicitly so that reading umlauts does not depend on the
    # platform's locale default - confirm against the actual files.
    with open(_corpus_file, encoding="utf-8") as f:
        # Iterate the file lazily instead of loading all lines at once.
        for line in f:
            hitler_words.extend(get_words_from_line(line))

# Collapse both word collections to sets: removes duplicates and makes the
# difference below cheap to compute.
german_words = set(german_words)
hitler_words = set(hitler_words)

# Words that occur in the corpus texts but not in the reference vocabulary.
# Sorted so that the printed list and the output files are identical from
# run to run (plain set iteration order varies with string hash
# randomization).
only_hitler_words = sorted(hitler_words - german_words)

print(only_hitler_words)

# Write one word per line. UTF-8 is requested explicitly so that umlauts
# can be written regardless of the platform's locale default.
with open("german_words", "w", encoding="utf-8") as f:
    f.writelines(f"{word}\n" for word in sorted(german_words))

with open("hitler_words", "w", encoding="utf-8") as f:
    f.writelines(f"{word}\n" for word in only_hitler_words)