Browse Source

Create epub command.

feature/epub
gabriel becker 2 years ago
parent
commit
0c452726db
  1. 5
      requirements.txt
  2. 7
      setup.py
  3. 14
      src/ankimaker/commands/from_epub.py
  4. 1
      src/ankimaker/tasks/__init__.py
  5. 1
      src/ankimaker/tasks/epub/__init__.py
  6. 75
      src/ankimaker/tasks/epub/load_epub.py
  7. 6
      src/ankimaker/tasks/epub/process_epub.py

5
requirements.txt

@ -2,4 +2,7 @@ click
genanki genanki
pandas pandas
pyyaml pyyaml
bullet bullet
nltk
EbookLib
BeautifulSoup4

7
setup.py

@ -8,7 +8,7 @@ def readme():
setup( setup(
name='ankimaker', name='ankimaker',
version='0.0.5', version='0.0.6',
description='Makes anki with files', description='Makes anki with files',
url="https://git.lgoon.xyz/gabriel/ankimaker", url="https://git.lgoon.xyz/gabriel/ankimaker",
license="BSD-3-Clause", license="BSD-3-Clause",
@ -27,7 +27,10 @@ setup(
"genanki", "genanki",
"pandas", "pandas",
"pyyaml", "pyyaml",
"bullet" "bullet",
"nltk",
"EbookLib",
"BeautifulSoup",
], ],
long_description_content_type='text/markdown', long_description_content_type='text/markdown',
) )

14
src/ankimaker/commands/from_epub.py

@ -0,0 +1,14 @@
import click
from ankimaker.commands import cli
from ankimaker.tasks import process_epub
# CLI entry point for the `epub` sub-command.
# NOTE: documented with comments rather than a docstring on purpose, since
# click would otherwise surface a docstring as the command's --help text.
@cli.command('epub')
@click.option('-i', '--input', 'input_file', type=click.Path(exists=True))
@click.option('-o', '--output', 'output_file', type=click.Path(exists=False))
@click.option('-l', '--lang', 'language', default=None, type=click.STRING)
@click.option('-n', '--name', 'name', default=None, type=click.STRING)
def generate_anki(input_file, output_file, language, name):
    # Forward the parsed options to the epub task; the --name option is
    # passed on as the deck name.
    process_epub(
        input_file=input_file,
        output_file=output_file,
        language=language,
        deck_name=name,
    )
    # The command is still a work in progress: signal that explicitly if
    # the task above ever returns.
    raise NotImplementedError

1
src/ankimaker/tasks/__init__.py

@ -1,2 +1,3 @@
from .basic_csv_to_anki import basic_pandas_to_anki from .basic_csv_to_anki import basic_pandas_to_anki
from .config_tasks import create_config, enhance_config from .config_tasks import create_config, enhance_config
from .epub import process_epub

1
src/ankimaker/tasks/epub/__init__.py

@ -0,0 +1 @@
from .process_epub import process_epub

75
src/ankimaker/tasks/epub/load_epub.py

@ -0,0 +1,75 @@
import nltk
import ebooklib
import pandas as pd
from ebooklib import epub
from bs4 import BeautifulSoup
# Fetch the NLTK resources requested below ('stopwords' and 'punkt').
# NOTE: these calls sit at module level, so they run whenever this module
# is imported.
nltk.download('stopwords')
nltk.download('punkt')
# Names of parent elements whose text nodes chap2text leaves out of the
# extracted chapter text.
blacklist = (
'[document]',
'noscript',
'header',
'html',
'meta',
'head',
'input',
'script',
'style',
# Add further element names here if their text should be skipped as well.
)
def make_word_frequency_series(corpus):
    """Compute the relative frequency of each distinct token in ``corpus``.

    Args:
        corpus: Iterable of hashable tokens, as accepted by ``nltk.FreqDist``.

    Returns:
        A ``pandas.Series`` indexed by token. Each value is the token's
        count divided by the total count, sorted from highest to lowest.
    """
    counts: pd.Series = pd.Series(dict(nltk.FreqDist(corpus)))
    total = sum(counts)
    return (counts / total).sort_values(ascending=False)
def epub2thtml(epub_path):
    """Read an EPUB file and collect the content of its document items.

    Args:
        epub_path: Path of the EPUB file, passed to ``epub.read_epub``.

    Returns:
        A list with the ``get_content()`` result of every item whose type
        is ``ebooklib.ITEM_DOCUMENT``, in the order the book yields them.
    """
    book = epub.read_epub(epub_path)
    return [
        item.get_content()
        for item in book.get_items()
        if item.get_type() == ebooklib.ITEM_DOCUMENT
    ]
def chap2text(chap):
    """Extract the visible text of one chapter's HTML.

    Args:
        chap: HTML markup of a chapter, parsed with ``html.parser``.

    Returns:
        A single string made of every text node whose parent element is
        not named in ``blacklist``; each node is followed by one space.
    """
    soup = BeautifulSoup(chap, 'html.parser')
    kept_nodes = (
        node
        for node in soup.find_all(text=True)
        if node.parent.name not in blacklist
    )
    return ''.join('{} '.format(node) for node in kept_nodes)
def thtml2ttext(thtml):
    """Convert a sequence of chapter HTML documents to plain text.

    Args:
        thtml: Iterable of HTML documents, one per chapter.

    Returns:
        A list holding the ``chap2text`` result for each document, in the
        same order as the input.
    """
    return [chap2text(chapter_html) for chapter_html in thtml]
def epub2text(epub_path):
    """Return the plain text of every document item in an EPUB file.

    Args:
        epub_path: Path of the EPUB file to read.

    Returns:
        A list of strings, one per document item, produced by running
        ``epub2thtml`` and then ``thtml2ttext``.
    """
    return thtml2ttext(epub2thtml(epub_path))
def generate_corpus_from_epub_file(input_path, language='german'):
    """Build a lower-cased, stop-word-free token list from an EPUB file.

    Args:
        input_path: Path of the EPUB file to read.
        language: Language name handed to both the NLTK stop-word corpus
            and ``nltk.word_tokenize``. Defaults to ``'german'``, the
            value that used to be hard-coded.

    Returns:
        A list of tokens in reading order. Tokens are lower-cased; tokens
        of length 1 and tokens found in the stop-word list are dropped.
    """
    chapter_texts = epub2text(input_path)
    # A frozenset makes each membership test O(1) instead of a scan of the
    # whole stop-word list for every token.
    stop_words = frozenset(nltk.corpus.stopwords.words(language))
    corpus = []
    for chapter_text in chapter_texts:
        for token in nltk.word_tokenize(chapter_text, language=language):
            token = token.lower()
            if len(token) > 1 and token not in stop_words:
                corpus.append(token)
    # Return the collected tokens (the previous code returned the imported
    # ``epub`` module by mistake).
    return corpus

6
src/ankimaker/tasks/epub/process_epub.py

@ -0,0 +1,6 @@
from .load_epub import generate_corpus_from_epub_file
def process_epub(input_file, output_file, language, deck_name):
    """Run the EPUB-to-deck task (work in progress).

    Only the corpus-generation step exists so far; ``output_file``,
    ``language`` and ``deck_name`` are accepted but not used yet.

    Args:
        input_file: Path of the EPUB file to process.
        output_file: Destination path for the generated deck (unused).
        language: Language of the book (unused).
        deck_name: Name for the generated deck (unused).

    Raises:
        NotImplementedError: Always, once corpus generation has returned.
    """
    word_corpus = generate_corpus_from_epub_file(input_file)
    raise NotImplementedError
Loading…
Cancel
Save