Explorar el Código

parse german words and extract hitler words

genderequality-alternative
JosuaKugler hace 4 años
padre
commit
88a62a22d7
Se han modificado 3 ficheros con 534979 adiciones y 5 borrados
  1. +510023
    -0
      hitler_texts/german_words
  2. +24907
    -0
      hitler_texts/hitler_words
  3. +49
    -5
      hitler_texts/parse.py

+ 510023
- 0
hitler_texts/german_words
La diferencia del archivo ha sido suprimida porque es demasiado grande
Ver fichero


+ 24907
- 0
hitler_texts/hitler_words
La diferencia del archivo ha sido suprimida porque es demasiado grande
Ver fichero


+ 49
- 5
hitler_texts/parse.py Ver fichero

@@ -1,8 +1,52 @@
import os
words = []
import re

# Reference word list: one entry per line, tab-separated as
# "<index>\t<word>\t<count>".
# NOTE(review): the path is machine-specific (the author's home directory);
# the script only runs where this corpus file exists at that location.
GERMAN_WORDS_PATH = '/home/josua/deu_mixed-typical_2011_1M/deu_mixed-typical_2011_1M-words.txt'

german_words = []
# Decode explicitly as UTF-8 so umlauts are read the same way regardless of
# the platform's default locale encoding.
with open(GERMAN_WORDS_PATH, encoding="utf-8") as f:
    # Iterate the file lazily instead of materializing every line first.
    for line in f:
        if not line.strip():
            # Tolerate blank lines (e.g. a trailing newline at end of file).
            continue
        index, word, count = line.split(sep="\t")
        # Entries with index <= 100 are skipped.
        # NOTE(review): presumably these are punctuation / non-word tokens at
        # the head of the list -- confirm against the corpus file.
        if int(index) > 100:
            german_words.append(word.lower())


# Matches every character that is not an ASCII letter, a German umlaut or an
# eszett; compiled once because the function is called for every input line.
_NON_LETTER_RE = re.compile("[^a-zA-ZüöäÜÖÄßẞ]")


def get_words_from_line(line):
    """Split a line of text into lowercase, letters-only words.

    The line is split on any run of whitespace (spaces, tabs, newlines), every
    character that is not a German letter is removed from each token, and the
    result is lowercased.  Tokens that contain no letters at all (numbers,
    stand-alone punctuation) are dropped rather than returned as empty
    strings.

    Args:
        line: One line of raw text; may be empty and may include the
            trailing newline.

    Returns:
        A list of normalized words in their original order; empty if the line
        contains no letters.
    """
    # split() without arguments also collapses repeated whitespace, so no
    # empty tokens are produced by double spaces.
    stripped = (_NON_LETTER_RE.sub("", token) for token in line.split())
    return [word.lower() for word in stripped if word]


def _read_words(path):
    """Return the normalized words of every line of the text file at *path*."""
    # Explicit UTF-8 keeps umlauts intact regardless of the default locale.
    with open(path, encoding="utf-8") as f:
        return [word for line in f for word in get_words_from_line(line)]


def _write_lines(path, words):
    """Write each item of *words* on its own line to the file at *path*."""
    with open(path, "w", encoding="utf-8") as f:
        f.writelines(f"{word}\n" for word in words)


# Input texts: six numbered speeches plus two further documents, all read
# from the hitler_texts/ directory relative to the working directory.
source_paths = [f'hitler_texts/hitler_rede_{i}' for i in range(1, 7)]
source_paths.append('hitler_texts/goebbels_sportpalast')
source_paths.append('hitler_texts/mein_kampf')

# Unique vocabulary across all input texts.
hitler_words = set()
for path in source_paths:
    hitler_words.update(_read_words(path))
# An empty string is not a word; make sure none ends up in the output.
hitler_words.discard("")

# Words that occur in the texts but not in the reference word list.
# NOTE(review): this result is computed but never written to disk -- the
# output below contains the full vocabulary. Confirm which one is intended.
only_hitler_words = sorted(hitler_words.difference(german_words))

# Output files are created in the current working directory.
_write_lines("german_words", german_words)
# A set has no stable iteration order; sort so repeated runs produce
# identical files.
_write_lines("hitler_words", sorted(hitler_words))

Cargando…
Cancelar
Guardar