Browse Source

load frequencies of epub

feature/epub
gabriel becker 1 year ago
parent
commit
521c7d694d
  1. 5
      .gitignore
  2. 341
      notebooks/french_epub_stats.ipynb
  3. 341
      notebooks/german_epub_stats.ipynb
  4. 1
      setup.py
  5. 12
      src/ankimaker/tasks/epub/load_epub.py

5
.gitignore vendored

@ -160,4 +160,7 @@ cython_debug/
.idea/
# Project Specific
scripts/
scripts/
.vscode
.vscode
data/

341
notebooks/french_epub_stats.ipynb

File diff suppressed because one or more lines are too long

341
notebooks/german_epub_stats.ipynb

File diff suppressed because one or more lines are too long

1
setup.py

@ -31,6 +31,7 @@ setup(
"nltk",
"EbookLib",
"BeautifulSoup",
"wordfreq",
],
long_description_content_type='text/markdown',
)

12
src/ankimaker/tasks/epub/load_epub.py

@ -62,14 +62,14 @@ def epub2text(epub_path):
return ttext
def generate_corpus_from_epub_file(input_path, language='german'):
    """Build a flat list of content words from an EPUB file.

    Each text chunk produced by ``epub2text`` is tokenized with NLTK, every
    token is lower-cased, and tokens that are stopwords for ``language`` or
    that are only one character long are dropped.

    Args:
        input_path: Location of the EPUB file; handed unchanged to
            ``epub2text``.
        language: Language name passed both to the NLTK stopword corpus and
            to ``nltk.word_tokenize``. Defaults to ``'german'``, the value
            that was hard-coded before this parameter existed, so callers
            that pass only ``input_path`` keep their previous behaviour.

    Returns:
        A list of lower-cased tokens in the order they were encountered;
        repeated words are kept.
    """
    # Build the stopword set once: membership gives the same answers as the
    # list NLTK returns, but each per-token check is a hash lookup instead of
    # a linear scan.
    stop_words = frozenset(nltk.corpus.stopwords.words(language))

    corpus = []
    for content in epub2text(input_path):
        for token in nltk.word_tokenize(content, language=language):
            token = token.lower()
            if len(token) > 1 and token not in stop_words:
                corpus.append(token)
    return corpus

Loading…
Cancel
Save