Load word frequencies from EPUB files

This commit is contained in:
gabriel becker 2023-09-22 22:27:27 +10:00
parent 9aff3d7d43
commit 521c7d694d
5 changed files with 693 additions and 7 deletions

3
.gitignore vendored
View File

@ -161,3 +161,6 @@ cython_debug/
# Project Specific
scripts/
.vscode
.vscode
data/

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

View File

@ -31,6 +31,7 @@ setup(
"nltk",
"EbookLib",
"BeautifulSoup",
"wordfreq",
],
long_description_content_type='text/markdown',
)

View File

@ -62,14 +62,14 @@ def epub2text(epub_path):
return ttext
def generate_corpus_from_epub_file(input_path):
def generate_corpus_from_epub_file(input_path, language):
epub_doc = epub2text(input_path)
german_corpus = []
sw = nltk.corpus.stopwords.words('german')
corpus = []
sw = nltk.corpus.stopwords.words(language)
for content in epub_doc:
for w in nltk.word_tokenize(content, language='german'):
for w in nltk.word_tokenize(content, language=language):
w = w.lower()
if w not in sw and len(w) > 1:
german_corpus.append(w)
return german_corpus
corpus.append(w)
return corpus