"""Find words that occur in a set of speech/book texts but not in German reference lists.

The script loads two tab-separated word-frequency lists into a reference
vocabulary, tokenizes several local text files, and writes out

* ``german_words``  - the reference vocabulary, one word per line
* ``hitler_words``  - words found in the texts that are missing from the
  reference vocabulary, one word per line

All text files are read and written as UTF-8 so that umlauts and ``ß`` do not
depend on the locale of the machine running the script.
"""
import re

# Tab-separated frequency lists; each line is expected to hold exactly three
# columns: index, word, count.
GERMAN_WORD_LISTS = (
    '/home/josua/deu_mixed-typical_2011_1M/deu_mixed-typical_2011_1M-words.txt',
    '/home/josua/deu_mixed-typical_2011_1M/deu_news_1995_1M-words.txt',
)

# Plain-text sources whose vocabulary is compared against the reference lists.
SOURCE_TEXTS = tuple(f'hitler_rede_{i}' for i in range(1, 7)) + (
    'goebbels_sportpalast',
    'mein_kampf',
)

# Entries with an index of 100 or lower are skipped.
# NOTE(review): presumably the head of the list holds punctuation / special
# tokens - confirm against the corpus format.
MIN_INDEX = 100
# Only words that are used more than 5 times are kept.
MIN_COUNT = 5

# Everything that is not an ASCII letter, a German umlaut or an eszett.
_NON_LETTER = re.compile("[^a-zA-ZüöäÜÖÄßẞ]")


def get_words_from_line(line):
    """Return the lower-cased words contained in *line*.

    The line is split on any whitespace. Every character that is not a letter
    (a-z, A-Z, ü, ö, ä, Ü, Ö, Ä, ß, ẞ) is removed from each token, so
    ``"Volks-Genosse"`` becomes ``"volksgenosse"``. Tokens that contain no
    letters at all (numbers, dashes, ...) are dropped instead of being
    returned as empty strings.

    Args:
        line: One line of text.

    Returns:
        A list of cleaned, lower-cased words in order of appearance.
    """
    cleaned = []
    # split() without arguments also separates on tabs / non-breaking spaces,
    # which would otherwise be stripped by the regex and fuse two words.
    for token in line.split():
        word = _NON_LETTER.sub("", token)
        if word:
            cleaned.append(word.lower())
    return cleaned


def _load_reference_words(path):
    """Return the set of lower-cased words from one frequency list file.

    A word is kept only if its index is greater than ``MIN_INDEX`` and its
    count is greater than ``MIN_COUNT``. Blank lines are ignored; any other
    line that does not have exactly three tab-separated columns raises
    ``ValueError``, as before.
    """
    words = set()
    with open(path, encoding="utf-8") as f:
        for line in f:
            line = line.rstrip("\n")
            if not line:
                continue
            index, word, count = line.split(sep="\t")
            if int(index) > MIN_INDEX and int(count) > MIN_COUNT:
                words.add(word.lower())
    return words


def _load_text_words(path):
    """Return the set of distinct words found in the text file at *path*."""
    words = set()
    with open(path, encoding="utf-8") as f:
        for line in f:
            words.update(get_words_from_line(line))
    return words


def _write_words(path, words):
    """Write *words* to *path*, one word per line."""
    with open(path, "w", encoding="utf-8") as f:
        f.writelines(word + "\n" for word in words)


def main():
    """Build both vocabularies, print the difference and write the result files."""
    german_words = set()
    for path in GERMAN_WORD_LISTS:
        german_words |= _load_reference_words(path)

    hitler_words = set()
    for path in SOURCE_TEXTS:
        hitler_words |= _load_text_words(path)

    # Sorted so that the printed list and the output files are identical
    # between runs (set iteration order is not stable for strings).
    only_hitler_words = sorted(hitler_words - german_words)
    print(only_hitler_words)

    _write_words("german_words", sorted(german_words))
    _write_words("hitler_words", only_hitler_words)


if __name__ == "__main__":
    main()