gabriel becker
2 years ago
7 changed files with 106 additions and 3 deletions
@ -0,0 +1,14 @@
|
||||
import click |
||||
|
||||
from ankimaker.commands import cli |
||||
from ankimaker.tasks import process_epub |
||||
|
||||
|
||||
@cli.command('epub')
@click.option('-i', '--input', 'input_file', type=click.Path(exists=True))
@click.option('-o', '--output', 'output_file', type=click.Path(exists=False))
@click.option('-l', '--lang', 'language', default=None, type=click.STRING)
@click.option('-n', '--name', 'name', default=None, type=click.STRING)
def generate_anki(input_file, output_file, language, name):
    """Create an Anki deck from an EPUB file.

    Reads ``input_file``, extracts its vocabulary, and writes the generated
    deck to ``output_file``; ``language`` and ``name`` are forwarded to the
    EPUB-processing task.
    """
    # Bug fix: the original raised NotImplementedError *after* the call
    # below, so every successful run still terminated with an error.
    process_epub(
        input_file=input_file,
        output_file=output_file,
        language=language,
        deck_name=name,
    )
@ -1,2 +1,3 @@
|
||||
from .basic_csv_to_anki import basic_pandas_to_anki |
||||
from .config_tasks import create_config, enhance_config |
||||
from .epub import process_epub |
||||
|
@ -0,0 +1 @@
|
||||
from .process_epub import process_epub |
@ -0,0 +1,75 @@
|
||||
from collections import Counter

import nltk
import ebooklib
import pandas as pd
from bs4 import BeautifulSoup
from ebooklib import epub
||||
|
||||
# Fetch the NLTK data required below: 'stopwords' for corpus filtering and
# 'punkt' for word tokenization. Runs at import time; nltk caches the data
# locally, so repeated imports only perform a cheap up-to-date check.
nltk.download('stopwords')
nltk.download('punkt')

# HTML/EPUB elements whose text content is ignored when extracting readable
# text (document wrappers, markup metadata, scripts, styling).
blacklist = (
    '[document]',
    'noscript',
    'header',
    'html',
    'meta',
    'head',
    'input',
    'script',
    'style',
    # there may be more elements you don't want, such as "style", etc.
)
||||
|
||||
|
||||
def make_word_frequency_series(corpus):
    """Return the relative frequency of each token in *corpus*.

    Parameters
    ----------
    corpus : iterable of str
        Tokens to count.

    Returns
    -------
    pd.Series
        Indexed by unique token; each value is that token's count divided
        by the total number of tokens, sorted largest first.
    """
    # collections.Counter counts identically to nltk.FreqDist (FreqDist is
    # a Counter subclass), so the nltk dependency is unnecessary here.
    occurrences = pd.Series(Counter(corpus))
    frequencies = occurrences / occurrences.sum()
    return frequencies.sort_values(ascending=False)
||||
|
||||
|
||||
def epub2thtml(epub_path):
    """Read the EPUB at *epub_path* and return its chapters as raw HTML.

    Returns a list holding the content of every document item in the book,
    in the order ebooklib yields them.
    """
    book = epub.read_epub(epub_path)
    return [
        item.get_content()
        for item in book.get_items()
        if item.get_type() == ebooklib.ITEM_DOCUMENT
    ]
||||
|
||||
|
||||
def chap2text(chap):
    """Extract the visible text from one chapter's HTML.

    Parameters
    ----------
    chap : str or bytes
        Raw HTML markup of a single chapter.

    Returns
    -------
    str
        Every text node whose parent tag is not in ``blacklist``, each
        followed by a single space (so a non-empty result ends with a
        trailing space, matching the original behavior).
    """
    soup = BeautifulSoup(chap, 'html.parser')
    # NOTE(review): bs4 renamed the `text` keyword to `string`; `text=True`
    # is kept for compatibility with older bs4 versions — confirm the
    # project's minimum bs4 before switching.
    nodes = soup.find_all(text=True)
    # str.join instead of repeated `+=`: avoids quadratic string building.
    return ''.join(
        '{} '.format(t) for t in nodes if t.parent.name not in blacklist
    )
||||
|
||||
|
||||
def thtml2ttext(thtml):
    """Convert a list of chapter HTML blobs into a list of plain-text strings.

    Parameters
    ----------
    thtml : list
        Raw HTML of each chapter, as produced by ``epub2thtml``.

    Returns
    -------
    list of str
        ``chap2text`` applied to each chapter, in order.
    """
    # The original accumulated into a local named `Output` (PEP 8 violation)
    # with a manual append loop; a comprehension is equivalent and clearer.
    return [chap2text(html) for html in thtml]
||||
|
||||
|
||||
def epub2text(epub_path):
    """Return the plain text of every chapter in the EPUB at *epub_path*."""
    # Compose the two extraction steps directly instead of naming each
    # intermediate result.
    return thtml2ttext(epub2thtml(epub_path))
||||
|
||||
|
||||
def generate_corpus_from_epub_file(input_path, language='german'):
    """Build a cleaned word corpus from an EPUB file.

    Tokenizes every chapter, lowercases each token, and discards stopwords
    and one-character tokens.

    Parameters
    ----------
    input_path : str
        Path to the EPUB file.
    language : str, optional
        NLTK language used for both tokenization and stopword filtering.
        Defaults to ``'german'``, the behavior the original hard-coded.

    Returns
    -------
    list of str
        The filtered, lowercased tokens.
    """
    chapters = epub2text(input_path)
    # A set gives O(1) stopword membership tests inside the token loop.
    stop_words = set(nltk.corpus.stopwords.words(language))
    corpus = []
    for content in chapters:
        for token in nltk.word_tokenize(content, language=language):
            token = token.lower()
            # Drop stopwords and single characters (punctuation, digits...).
            if token not in stop_words and len(token) > 1:
                corpus.append(token)
    # Bug fix: the original returned the imported `epub` module instead of
    # the corpus it had just built, discarding all the work above.
    return corpus
||||
|
Loading…
Reference in new issue