gabriel becker
2 years ago
7 changed files with 106 additions and 3 deletions
@ -0,0 +1,14 @@
|
||||
import click |
||||
|
||||
from ankimaker.commands import cli |
||||
from ankimaker.tasks import process_epub |
||||
|
||||
|
||||
@cli.command('epub')
@click.option('-i', '--input', 'input_file', type=click.Path(exists=True))
@click.option('-o', '--output', 'output_file', type=click.Path(exists=False))
@click.option('-l', '--lang', 'language', default=None, type=click.STRING)
@click.option('-n', '--name', 'name', default=None, type=click.STRING)
def generate_anki(input_file, output_file, language, name):
    """Create an Anki deck from an EPUB file.

    Reads ``input_file``, extracts its vocabulary, and writes the generated
    deck to ``output_file``; ``language`` and ``name`` are forwarded to the
    EPUB-processing task.
    """
    # Bug fix: the original raised NotImplementedError *after* the call
    # below, so every successful run still terminated with an error.
    process_epub(
        input_file=input_file,
        output_file=output_file,
        language=language,
        deck_name=name,
    )
@ -1,2 +1,3 @@
|
||||
from .basic_csv_to_anki import basic_pandas_to_anki |
||||
from .config_tasks import create_config, enhance_config |
||||
from .epub import process_epub |
||||
|
@ -0,0 +1 @@
|
||||
from .process_epub import process_epub |
@ -0,0 +1,75 @@
|
||||
from collections import Counter

import nltk
import ebooklib
import pandas as pd
from bs4 import BeautifulSoup
from ebooklib import epub
||||
|
||||
# Fetch the NLTK data required below: 'stopwords' for corpus filtering and
# 'punkt' for word tokenization. Runs at import time; nltk caches the data
# locally, so repeated imports only perform a cheap up-to-date check.
nltk.download('stopwords')
nltk.download('punkt')

# HTML/EPUB elements whose text content is ignored when extracting readable
# text (document wrappers, markup metadata, scripts, styling).
blacklist = (
    '[document]',
    'noscript',
    'header',
    'html',
    'meta',
    'head',
    'input',
    'script',
    'style',
    # there may be more elements you don't want, such as "style", etc.
)
||||
|
||||
|
||||
def make_word_frequency_series(corpus):
    """Return the relative frequency of each token in *corpus*.

    Parameters
    ----------
    corpus : iterable of str
        Tokens to count.

    Returns
    -------
    pd.Series
        Indexed by unique token; each value is that token's count divided
        by the total number of tokens, sorted largest first.
    """
    # collections.Counter counts identically to nltk.FreqDist (FreqDist is
    # a Counter subclass), so the nltk dependency is unnecessary here.
    occurrences = pd.Series(Counter(corpus))
    frequencies = occurrences / occurrences.sum()
    return frequencies.sort_values(ascending=False)
||||
|
||||
|
||||
def epub2thtml(epub_path):
    """Read the EPUB at *epub_path* and return its chapters as raw HTML.

    Returns a list holding the content of every document item in the book,
    in the order ebooklib yields them.
    """
    book = epub.read_epub(epub_path)
    return [
        item.get_content()
        for item in book.get_items()
        if item.get_type() == ebooklib.ITEM_DOCUMENT
    ]
||||
|
||||
|
||||
def chap2text(chap):
    """Extract the visible text from one chapter's HTML.

    Parameters
    ----------
    chap : str or bytes
        Raw HTML markup of a single chapter.

    Returns
    -------
    str
        Every text node whose parent tag is not in ``blacklist``, each
        followed by a single space (so a non-empty result ends with a
        trailing space, matching the original behavior).
    """
    soup = BeautifulSoup(chap, 'html.parser')
    # NOTE(review): bs4 renamed the `text` keyword to `string`; `text=True`
    # is kept for compatibility with older bs4 versions — confirm the
    # project's minimum bs4 before switching.
    nodes = soup.find_all(text=True)
    # str.join instead of repeated `+=`: avoids quadratic string building.
    return ''.join(
        '{} '.format(t) for t in nodes if t.parent.name not in blacklist
    )
||||
|
||||
|
||||
def thtml2ttext(thtml):
    """Convert a list of chapter HTML blobs into a list of plain-text strings.

    Parameters
    ----------
    thtml : list
        Raw HTML of each chapter, as produced by ``epub2thtml``.

    Returns
    -------
    list of str
        ``chap2text`` applied to each chapter, in order.
    """
    # The original accumulated into a local named `Output` (PEP 8 violation)
    # with a manual append loop; a comprehension is equivalent and clearer.
    return [chap2text(html) for html in thtml]
||||
|
||||
|
||||
def epub2text(epub_path):
    """Return the plain text of every chapter in the EPUB at *epub_path*."""
    # Compose the two extraction steps directly instead of naming each
    # intermediate result.
    return thtml2ttext(epub2thtml(epub_path))
||||
|
||||
|
||||
def generate_corpus_from_epub_file(input_path, language='german'):
    """Build a cleaned word corpus from an EPUB file.

    Tokenizes every chapter, lowercases each token, and discards stopwords
    and one-character tokens.

    Parameters
    ----------
    input_path : str
        Path to the EPUB file.
    language : str, optional
        NLTK language used for both tokenization and stopword filtering.
        Defaults to ``'german'``, the behavior the original hard-coded.

    Returns
    -------
    list of str
        The filtered, lowercased tokens.
    """
    chapters = epub2text(input_path)
    # A set gives O(1) stopword membership tests inside the token loop.
    stop_words = set(nltk.corpus.stopwords.words(language))
    corpus = []
    for content in chapters:
        for token in nltk.word_tokenize(content, language=language):
            token = token.lower()
            # Drop stopwords and single characters (punctuation, digits...).
            if token not in stop_words and len(token) > 1:
                corpus.append(token)
    # Bug fix: the original returned the imported `epub` module instead of
    # the corpus it had just built, discarding all the work above.
    return corpus
||||
|
Loading…
Reference in new issue