ソースを参照

improve german word selection

genderequality-alternative
JosuaKugler 4年前
コミット
ef29269d45
3個のファイルの変更138220行の追加517451行の削除
  1. +129500
    -509801
      hitler_texts/german_words
  2. +8708
    -7649
      hitler_texts/hitler_words
  3. +12
    -1
      hitler_texts/parse.py

+ 129500
- 509801
hitler_texts/german_words
ファイル差分が大きすぎるため省略します
ファイルの表示


+ 8708
- 7649
hitler_texts/hitler_words
ファイル差分が大きすぎるため省略します
ファイルの表示


+ 12
- 1
hitler_texts/parse.py ファイルの表示

@@ -6,7 +6,15 @@ with open('/home/josua/deu_mixed-typical_2011_1M/deu_mixed-typical_2011_1M-words
for line in lines: for line in lines:
#print(line.split(sep="\t")) #print(line.split(sep="\t"))
index, word, count = line.split(sep="\t") index, word, count = line.split(sep="\t")
if int(index) > 100:
if int(index) > 100 and int(count) > 5:
german_words.append(word.lower())

with open('/home/josua/deu_mixed-typical_2011_1M/deu_news_1995_1M-words.txt') as f:
lines = f.readlines()
for line in lines:
#print(line.split(sep="\t"))
index, word, count = line.split(sep="\t")
if int(index) > 100 and int(count) > 5:# only words that are used more than 5 times
german_words.append(word.lower()) german_words.append(word.lower())




@@ -36,11 +44,14 @@ with open(f'mein_kampf') as f:
for line in lines: for line in lines:
hitler_words.extend(get_words_from_line(line)) hitler_words.extend(get_words_from_line(line))


german_words = set(german_words)
hitler_words = set(hitler_words) #unique hitler_words = set(hitler_words) #unique
#filter_words = hitler_words.intersection(set(german_words)) #filter_words = hitler_words.intersection(set(german_words))


only_hitler_words = list(hitler_words.difference(german_words)) only_hitler_words = list(hitler_words.difference(german_words))


print(only_hitler_words)

with open("german_words", "w") as f: with open("german_words", "w") as f:
for word in german_words: for word in german_words:
word += "\n" word += "\n"


読み込み中…
キャンセル
保存