load frequencies of epub

2023-09-22 22:27:27 +10:00 · 2023-09-22 22:27:27 +10:00 · 521c7d694d
commit 521c7d694d
parent 9aff3d7d43
5 changed files with 693 additions and 7 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -161,3 +161,6 @@ cython_debug/

 # Project Specific
 scripts/
+.vscode
+.vscode
+data/
--- a/notebooks/french_epub_stats.ipynb
+++ b/notebooks/french_epub_stats.ipynb
--- a/notebooks/german_epub_stats.ipynb
+++ b/notebooks/german_epub_stats.ipynb
--- a/setup.py
+++ b/setup.py
@@ -31,6 +31,7 @@ setup(
        "nltk",
        "EbookLib",
        "BeautifulSoup",
+        "wordfreq",
    ],
    long_description_content_type='text/markdown',
 )
--- a/src/ankimaker/tasks/epub/load_epub.py
+++ b/src/ankimaker/tasks/epub/load_epub.py
@@ -62,14 +62,14 @@ def epub2text(epub_path):
    return ttext


-def generate_corpus_from_epub_file(input_path):
+def generate_corpus_from_epub_file(input_path, language):
    epub_doc = epub2text(input_path)
-    german_corpus = []
-    sw = nltk.corpus.stopwords.words('german')
+    corpus = []
+    sw = nltk.corpus.stopwords.words(language)
    for content in epub_doc:
-        for w in nltk.word_tokenize(content, language='german'):
+        for w in nltk.word_tokenize(content, language=language):
            w = w.lower()
            if w not in sw and len(w) > 1:
-                german_corpus.append(w)
-    return german_corpus
+                corpus.append(w)
+    return corpus