Create epub command.

parent a2bbc28aa6
commit f9dde2da0b
@@ -2,4 +2,7 @@ click
 genanki
 pandas
 pyyaml
 bullet
+nltk
+EbookLib
+BeautifulSoup4
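For reference, the three requirements added here are imported under different names in the new code; a minimal sketch of the corresponding imports (the name mapping comes from the packages' PyPI metadata, not from this hunk):

    import nltk                    # installed by 'nltk'
    import ebooklib                # installed by 'EbookLib'
    from ebooklib import epub
    from bs4 import BeautifulSoup  # installed by 'BeautifulSoup4'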
setup.py (7 changed lines)
@@ -8,7 +8,7 @@ def readme():
 
 setup(
     name='ankimaker',
-    version='0.0.5',
+    version='0.0.6',
     description='Makes anki with files',
     url="https://git.lgoon.xyz/gabriel/ankimaker",
     license="BSD-3-Clause",
@@ -27,7 +27,10 @@ setup(
         "genanki",
         "pandas",
         "pyyaml",
-        "bullet"
+        "bullet",
+        "nltk",
+        "EbookLib",
+        "BeautifulSoup",
     ],
     long_description_content_type='text/markdown',
 )
src/ankimaker/commands/from_epub.py (new file, 14 lines)
@@ -0,0 +1,14 @@
+import click
+
+from ankimaker.commands import cli
+from ankimaker.tasks import process_epub
+
+
+@cli.command('epub')
+@click.option('-i', '--input', 'input_file', type=click.Path(exists=True))
+@click.option('-o', '--output', 'output_file', type=click.Path(exists=False))
+@click.option('-l', '--lang', 'language', default=None, type=click.STRING)
+@click.option('-n', '--name', 'name', default=None, type=click.STRING)
+def generate_anki(input_file, output_file, language, name):
+    process_epub(input_file=input_file, output_file=output_file, language=language, deck_name=name)
+    raise NotImplementedError()
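For reference, a minimal sketch of exercising the new 'epub' command through Click's test runner (the file paths and deck name are placeholders, click.Path(exists=True) requires the input file to actually exist, and as committed the command still ends in NotImplementedError):

    from click.testing import CliRunner
    from ankimaker.commands import cli

    runner = CliRunner()
    result = runner.invoke(
        cli,
        ['epub', '-i', 'book.epub', '-o', 'deck.apkg', '-l', 'german', '-n', 'Vocab'],
    )
    print(result.exception)  # NotImplementedError until the epub task is completed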
src/ankimaker/tasks/__init__.py
@@ -1,2 +1,3 @@
 from .basic_csv_to_anki import basic_pandas_to_anki
 from .config_tasks import create_config, enhance_config
+from .epub import process_epub
src/ankimaker/tasks/epub/__init__.py (new file, 1 line)
@@ -0,0 +1 @@
+from .process_epub import process_epub
src/ankimaker/tasks/epub/load_epub.py (new file, 75 lines)
@@ -0,0 +1,75 @@
+import nltk
+import ebooklib
+import pandas as pd
+from ebooklib import epub
+from bs4 import BeautifulSoup
+
+nltk.download('stopwords')
+nltk.download('punkt')
+
+blacklist = (
+    '[document]',
+    'noscript',
+    'header',
+    'html',
+    'meta',
+    'head',
+    'input',
+    'script',
+    'style',
+    # there may be more elements you don't want, such as "style", etc.
+)
+
+
+def make_word_frequency_series(corpus):
+    nltk_occurrences = nltk.FreqDist(corpus)
+    occurrences: pd.Series = pd.Series(dict(nltk_occurrences))
+    frequencies = occurrences / sum(occurrences)
+    frequencies = frequencies.sort_values(ascending=False)
+    return frequencies
+
+
+def epub2thtml(epub_path):
+    book = epub.read_epub(epub_path)
+    chapters = []
+    for item in book.get_items():
+        if item.get_type() == ebooklib.ITEM_DOCUMENT:
+            chapters.append(item.get_content())
+    return chapters
+
+
+def chap2text(chap):
+    output = ''
+    soup = BeautifulSoup(chap, 'html.parser')
+    text = soup.find_all(text=True)
+    for t in text:
+        if t.parent.name not in blacklist:
+            output += '{} '.format(t)
+    return output
+
+
+def thtml2ttext(thtml):
+    Output = []
+    for html in thtml:
+        text = chap2text(html)
+        Output.append(text)
+    return Output
+
+
+def epub2text(epub_path):
+    chapters = epub2thtml(epub_path)
+    ttext = thtml2ttext(chapters)
+    return ttext
+
+
+def generate_corpus_from_epub_file(input_path):
+    epub_doc = epub2text(input_path)
+    german_corpus = []
+    sw = nltk.corpus.stopwords.words('german')
+    for content in epub_doc:
+        for w in nltk.word_tokenize(content, language='german'):
+            w = w.lower()
+            if w not in sw and len(w) > 1:
+                german_corpus.append(w)
+    return german_corpus
+
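A minimal sketch of how the helpers in this file compose (hypothetical local file 'buch.epub'; assumes generate_corpus_from_epub_file returns the collected token list):

    from ankimaker.tasks.epub.load_epub import (
        generate_corpus_from_epub_file,
        make_word_frequency_series,
    )

    corpus = generate_corpus_from_epub_file('buch.epub')  # lower-cased German tokens, stopwords dropped
    frequencies = make_word_frequency_series(corpus)      # pandas Series of relative frequencies, descending
    print(frequencies.head(20))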
src/ankimaker/tasks/epub/process_epub.py (new file, 6 lines)
@@ -0,0 +1,6 @@
+from .load_epub import generate_corpus_from_epub_file
+
+
+def process_epub(input_file, output_file, language, deck_name):
+    corpus = generate_corpus_from_epub_file(input_file)
+    raise NotImplementedError()