Browse Source

load frequencies of epub

feature/epub
gabriel becker 1 year ago
parent
commit
521c7d694d
  1. 5
      .gitignore
  2. 341
      notebooks/french_epub_stats.ipynb
  3. 341
      notebooks/german_epub_stats.ipynb
  4. 1
      setup.py
  5. 12
      src/ankimaker/tasks/epub/load_epub.py

5
.gitignore vendored

@@ -160,4 +160,7 @@ cython_debug/
 .idea/
 # Project Specific
 scripts/
-.vscode
+.vscode
+data/

341
notebooks/french_epub_stats.ipynb

File diff suppressed because one or more lines are too long

341
notebooks/german_epub_stats.ipynb

File diff suppressed because one or more lines are too long

1
setup.py

@@ -31,6 +31,7 @@ setup(
         "nltk",
         "EbookLib",
         "BeautifulSoup",
+        "wordfreq",
     ],
     long_description_content_type='text/markdown',
 )

12
src/ankimaker/tasks/epub/load_epub.py

@@ -62,14 +62,14 @@ def epub2text(epub_path):
     return ttext
-def generate_corpus_from_epub_file(input_path):
+def generate_corpus_from_epub_file(input_path, language):
     epub_doc = epub2text(input_path)
-    german_corpus = []
-    sw = nltk.corpus.stopwords.words('german')
+    corpus = []
+    sw = nltk.corpus.stopwords.words(language)
     for content in epub_doc:
-        for w in nltk.word_tokenize(content, language='german'):
+        for w in nltk.word_tokenize(content, language=language):
             w = w.lower()
             if w not in sw and len(w) > 1:
-                german_corpus.append(w)
-    return german_corpus
+                corpus.append(w)
+    return corpus

Loading…
Cancel
Save