load frequencies of epub
This commit is contained in:
parent
9aff3d7d43
commit
521c7d694d
5
.gitignore
vendored
5
.gitignore
vendored
@ -160,4 +160,7 @@ cython_debug/
|
|||||||
.idea/
|
.idea/
|
||||||
|
|
||||||
# Project Specific
|
# Project Specific
|
||||||
scripts/
|
scripts/
|
||||||
|
.vscode
|
||||||
|
.vscode
|
||||||
|
data/
|
||||||
|
341
notebooks/french_epub_stats.ipynb
Normal file
341
notebooks/french_epub_stats.ipynb
Normal file
File diff suppressed because one or more lines are too long
341
notebooks/german_epub_stats.ipynb
Normal file
341
notebooks/german_epub_stats.ipynb
Normal file
File diff suppressed because one or more lines are too long
1
setup.py
1
setup.py
@ -31,6 +31,7 @@ setup(
|
|||||||
"nltk",
|
"nltk",
|
||||||
"EbookLib",
|
"EbookLib",
|
||||||
"BeautifulSoup",
|
"BeautifulSoup",
|
||||||
|
"wordfreq",
|
||||||
],
|
],
|
||||||
long_description_content_type='text/markdown',
|
long_description_content_type='text/markdown',
|
||||||
)
|
)
|
||||||
|
@ -62,14 +62,14 @@ def epub2text(epub_path):
|
|||||||
return ttext
|
return ttext
|
||||||
|
|
||||||
|
|
||||||
def generate_corpus_from_epub_file(input_path):
|
def generate_corpus_from_epub_file(input_path, language):
|
||||||
epub_doc = epub2text(input_path)
|
epub_doc = epub2text(input_path)
|
||||||
german_corpus = []
|
corpus = []
|
||||||
sw = nltk.corpus.stopwords.words('german')
|
sw = nltk.corpus.stopwords.words(language)
|
||||||
for content in epub_doc:
|
for content in epub_doc:
|
||||||
for w in nltk.word_tokenize(content, language='german'):
|
for w in nltk.word_tokenize(content, language=language):
|
||||||
w = w.lower()
|
w = w.lower()
|
||||||
if w not in sw and len(w) > 1:
|
if w not in sw and len(w) > 1:
|
||||||
german_corpus.append(w)
|
corpus.append(w)
|
||||||
return german_corpus
|
return corpus
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user