load frequencies of epub
This commit is contained in:
parent
9aff3d7d43
commit
521c7d694d
3
.gitignore
vendored
3
.gitignore
vendored
@ -161,3 +161,6 @@ cython_debug/
|
||||
|
||||
# Project Specific
|
||||
scripts/
|
||||
.vscode
|
||||
.vscode
|
||||
data/
|
||||
|
341
notebooks/french_epub_stats.ipynb
Normal file
341
notebooks/french_epub_stats.ipynb
Normal file
File diff suppressed because one or more lines are too long
341
notebooks/german_epub_stats.ipynb
Normal file
341
notebooks/german_epub_stats.ipynb
Normal file
File diff suppressed because one or more lines are too long
1
setup.py
1
setup.py
@ -31,6 +31,7 @@ setup(
|
||||
"nltk",
|
||||
"EbookLib",
|
||||
"BeautifulSoup",
|
||||
"wordfreq",
|
||||
],
|
||||
long_description_content_type='text/markdown',
|
||||
)
|
||||
|
@ -62,14 +62,14 @@ def epub2text(epub_path):
|
||||
return ttext
|
||||
|
||||
|
||||
def generate_corpus_from_epub_file(input_path):
|
||||
def generate_corpus_from_epub_file(input_path, language):
|
||||
epub_doc = epub2text(input_path)
|
||||
german_corpus = []
|
||||
sw = nltk.corpus.stopwords.words('german')
|
||||
corpus = []
|
||||
sw = nltk.corpus.stopwords.words(language)
|
||||
for content in epub_doc:
|
||||
for w in nltk.word_tokenize(content, language='german'):
|
||||
for w in nltk.word_tokenize(content, language=language):
|
||||
w = w.lower()
|
||||
if w not in sw and len(w) > 1:
|
||||
german_corpus.append(w)
|
||||
return german_corpus
|
||||
corpus.append(w)
|
||||
return corpus
|
||||
|
||||
|
Loading…
x
Reference in New Issue
Block a user