diff --git a/requirements.txt b/requirements.txt
index 35c4c73..4be9a21 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -2,4 +2,7 @@ click
 genanki
 pandas
 pyyaml
-bullet
\ No newline at end of file
+bullet
+nltk
+EbookLib
+BeautifulSoup4
\ No newline at end of file
diff --git a/setup.py b/setup.py
index dd33cfe..91ed412 100644
--- a/setup.py
+++ b/setup.py
@@ -8,7 +8,7 @@ def readme():
 
 setup(
     name='ankimaker',
-    version='0.0.5',
+    version='0.0.6',
     description='Makes anki with files',
     url="https://git.lgoon.xyz/gabriel/ankimaker",
     license="BSD-3-Clause",
@@ -27,7 +27,10 @@ setup(
         "genanki",
         "pandas",
         "pyyaml",
-        "bullet"
+        "bullet",
+        "nltk",
+        "EbookLib",
+        "BeautifulSoup4",
     ],
     long_description_content_type='text/markdown',
 )
diff --git a/src/ankimaker/commands/from_epub.py b/src/ankimaker/commands/from_epub.py
new file mode 100644
index 0000000..90c18d7
--- /dev/null
+++ b/src/ankimaker/commands/from_epub.py
@@ -0,0 +1,14 @@
+import click
+
+from ankimaker.commands import cli
+from ankimaker.tasks import process_epub
+
+
+@cli.command('epub')
+@click.option('-i', '--input', 'input_file', type=click.Path(exists=True))
+@click.option('-o', '--output', 'output_file', type=click.Path(exists=False))
+@click.option('-l', '--lang', 'language', default=None, type=click.STRING)
+@click.option('-n', '--name', 'name', default=None, type=click.STRING)
+def generate_anki(input_file, output_file, language, name):
+    process_epub(input_file=input_file, output_file=output_file, language=language, deck_name=name)
+    raise NotImplementedError()
diff --git a/src/ankimaker/tasks/__init__.py b/src/ankimaker/tasks/__init__.py
index c2d44d0..d529eb7 100644
--- a/src/ankimaker/tasks/__init__.py
+++ b/src/ankimaker/tasks/__init__.py
@@ -1,2 +1,3 @@
 from .basic_csv_to_anki import basic_pandas_to_anki
 from .config_tasks import create_config, enhance_config
+from .epub import process_epub
diff --git a/src/ankimaker/tasks/epub/__init__.py b/src/ankimaker/tasks/epub/__init__.py
new file mode 100644
index 0000000..7613e7f
--- /dev/null
+++ b/src/ankimaker/tasks/epub/__init__.py
@@ -0,0 +1 @@
+from .process_epub import process_epub
diff --git a/src/ankimaker/tasks/epub/load_epub.py b/src/ankimaker/tasks/epub/load_epub.py
new file mode 100644
index 0000000..dfb1c12
--- /dev/null
+++ b/src/ankimaker/tasks/epub/load_epub.py
@@ -0,0 +1,81 @@
+import nltk
+import ebooklib
+import pandas as pd
+from ebooklib import epub
+from bs4 import BeautifulSoup
+
+nltk.download('stopwords')
+nltk.download('punkt')
+
+blacklist = (
+    '[document]',
+    'noscript',
+    'header',
+    'html',
+    'meta',
+    'head',
+    'input',
+    'script',
+    'style',
+    # there may be more elements you don't want, such as "style", etc.
+)
+
+
+def make_word_frequency_series(corpus):
+    nltk_occurrences = nltk.FreqDist(corpus)
+    occurrences: pd.Series = pd.Series(dict(nltk_occurrences))
+    frequencies = occurrences / sum(occurrences)
+    frequencies = frequencies.sort_values(ascending=False)
+    return frequencies
+
+
+def epub2thtml(epub_path):
+    book = epub.read_epub(epub_path)
+    chapters = []
+    for item in book.get_items():
+        if item.get_type() == ebooklib.ITEM_DOCUMENT:
+            chapters.append(item.get_content())
+    return chapters
+
+
+def chap2text(chap):
+    output = ''
+    soup = BeautifulSoup(chap, 'html.parser')
+    text = soup.find_all(text=True)
+    for t in text:
+        if t.parent.name not in blacklist:
+            output += '{} '.format(t)
+    return output
+
+
+def thtml2ttext(thtml):
+    Output = []
+    for html in thtml:
+        text = chap2text(html)
+        Output.append(text)
+    return Output
+
+
+def epub2text(epub_path):
+    chapters = epub2thtml(epub_path)
+    ttext = thtml2ttext(chapters)
+    return ttext
+
+
+def generate_corpus_from_epub_file(input_path):
+    """Build a lower-cased German word list from an EPUB file.
+
+    Each document item is tokenized with NLTK's German tokenizer; German
+    stopwords and single-character tokens are dropped.
+    """
+    epub_doc = epub2text(input_path)
+    german_corpus = []
+    sw = nltk.corpus.stopwords.words('german')
+    for content in epub_doc:
+        for w in nltk.word_tokenize(content, language='german'):
+            w = w.lower()
+            if w not in sw and len(w) > 1:
+                german_corpus.append(w)
+    # Return the collected tokens, not the imported ``epub`` module.
+    return german_corpus
+
diff --git a/src/ankimaker/tasks/epub/process_epub.py b/src/ankimaker/tasks/epub/process_epub.py
new file mode 100644
index 0000000..65053de
--- /dev/null
+++ b/src/ankimaker/tasks/epub/process_epub.py
@@ -0,0 +1,6 @@
+from .load_epub import generate_corpus_from_epub_file
+
+
+def process_epub(input_file, output_file, language, deck_name):
+    corpus = generate_corpus_from_epub_file(input_file)
+    raise NotImplementedError()