Compare commits
No commits in common. 'feature/epub' and 'main' have entirely different histories.
feature/epub ... main
17 changed files with 14 additions and 905 deletions
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
@@ -1,10 +1,5 @@
 click
 genanki
-pandas==1.5.2
+pandas
 pyyaml
 bullet
-nltk
-EbookLib
-BeautifulSoup4
-PyMultiDictionary
-translate
@@ -1,13 +0,0 @@
-import click
-
-from ankimaker.commands import cli
-from ankimaker.tasks.epub_to_anki import process_epub
-
-
-@cli.command('epub')
-@click.option('-i', '--input', 'input_file', type=click.Path(exists=True))
-@click.option('-o', '--output', 'output_file', type=click.Path(exists=False))
-@click.option('-l', '--lang', 'language', default=None, type=click.STRING)
-@click.option('-n', '--name', 'name', required=False, type=click.STRING)
-def generate_anki(input_file, output_file, language, name):
-    process_epub(input_file=input_file, output_file=output_file, language=language, deck_name=name)
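
Note: the deleted command above wired the epub pipeline into the shared click group. A rough sketch of how it could be smoke-tested, assuming `cli` is importable as shown in the deleted file; the epub path and deck name are placeholders:

    # Hypothetical smoke test for the removed 'epub' command.
    from click.testing import CliRunner

    from ankimaker.commands import cli

    runner = CliRunner()
    result = runner.invoke(
        cli,
        ['epub', '-i', 'book.epub', '-o', 'deck.apkg', '-l', 'english', '-n', 'Book Vocabulary'],
    )
    assert result.exit_code == 0
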
@@ -1,17 +0,0 @@
-import genanki
-from typing import Collection, List
-
-from ankimaker import generator
-
-
-class QuestionAnswerGenerator:
-    def __init__(self):
-        self.__model = generator.create_model()
-
-    def get_cards(self, questions: Collection[str], answers: Collection[str]) -> List[genanki.Note]:
-        assert len(questions) == len(answers)
-        cards = list()
-        for content_fields in zip(questions, answers):
-            card = generator.create_note(self.__model, fields=content_fields)
-            cards.append(card)
-        return cards
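
Note: `get_cards` zips questions with answers one-to-one, so the two collections must be the same length (the `assert` enforces it). A usage sketch, mirroring how `process_epub` further down calls it; the word/definition pairs are illustrative:

    # Hypothetical usage of the removed QuestionAnswerGenerator.
    from ankimaker import generator

    qa_generator = generator.QuestionAnswerGenerator()
    cards = qa_generator.get_cards(
        questions=['serendipity', 'ephemeral'],
        answers=['finding something good without looking for it', 'lasting a very short time'],
    )
    assert len(cards) == 2  # one genanki note per question/answer pair
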
@@ -1,28 +0,0 @@
-import genanki
-from translate import Translator
-from typing import Collection, List
-
-from ankimaker import generator
-
-
-class TranslatorGenerator:
-    def __init__(self, original_language, destination_language):
-        """
-        :param original_language: Language of the inserted text, following https://en.wikipedia.org/wiki/ISO_639-1
-        :param destination_language: Language you want to translate to, following https://en.wikipedia.org/wiki/ISO_639-1
-        """
-        self.__translator = Translator(from_lang=original_language, to_lang=destination_language)
-        self.__model = generator.model.create_model()
-
-    def get_cards(self, content_collection: Collection[str]) -> List[genanki.Note]:
-        cards = list()
-        for content in content_collection:
-            card = self._create_card(content)
-            cards.append(card)
-        return cards
-
-    def _create_card(self, content):
-        translation = self.__translator.translate(content)
-        fields = (content, translation)
-        card = generator.create_note(self.__model, fields)
-        return card
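
Note: unlike the dictionary-backed generator, this one derives the answer side by machine translation. A usage sketch, assuming `TranslatorGenerator` was exported from `ankimaker.generator` the same way as `QuestionAnswerGenerator`; the 'en'/'pt' pair is an illustrative ISO 639-1 combination:

    # Hypothetical usage of the removed TranslatorGenerator.
    from ankimaker import generator

    translator_generator = generator.TranslatorGenerator('en', 'pt')
    # front of each card: original text, back: its translation
    cards = translator_generator.get_cards(['house', 'to read'])
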
@@ -1,24 +0,0 @@
-from multiprocessing import Pool
-from itertools import repeat
-from typing import Iterable, Optional
-from http.client import RemoteDisconnected as HttpClientRemoteDisconnected
-
-from PyMultiDictionary import MultiDictionary
-
-
-def get_and_process_word_definition(language: str, word: str) -> Optional[str]:
-    try:
-        dictionary = MultiDictionary()
-        definition = dictionary.meaning(lang=language, word=word)
-        if len(definition[1]) <= 1:
-            return None
-        definition = definition[1].split('.')[0]
-    except HttpClientRemoteDisconnected:
-        return None
-    return definition
-
-
-def get_word_definitions_from_dictionary(language: str, word_collection: Iterable[str]) -> Iterable[str]:
-    with Pool(7) as p:
-        definitions = p.starmap(get_and_process_word_definition, zip(repeat(language), word_collection))
-    return definitions
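
Note: `meaning()` returns a tuple whose second element is the definition string; the helper keeps only its first sentence and returns `None` when the lookup is empty or the remote end drops the connection, so callers must filter. Be aware that PyMultiDictionary takes ISO 639-1 codes such as 'en', while the nltk calls elsewhere in this diff take full names such as 'english'. A usage sketch with an illustrative word list:

    # Hypothetical call into the removed dictionary helpers.
    words = ['serendipity', 'ephemeral']
    definitions = get_word_definitions_from_dictionary('en', words)
    for word, definition in zip(words, definitions):
        print(word, '->', definition)  # definition is None for failed lookups
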
@@ -1 +0,0 @@
-from .load_epub import generate_corpus_from_epub_file
@@ -1,75 +0,0 @@
-import nltk
-import ebooklib
-import pandas as pd
-from ebooklib import epub
-from bs4 import BeautifulSoup
-
-nltk.download('stopwords')
-nltk.download('punkt')
-
-blacklist = (
-    '[document]',
-    'noscript',
-    'header',
-    'html',
-    'meta',
-    'head',
-    'input',
-    'script',
-    'style',
-    # there may be more elements you don't want, such as "style", etc.
-)
-
-
-def make_word_frequency_series(corpus):
-    nltk_occurrences = nltk.FreqDist(corpus)
-    occurrences: pd.Series = pd.Series(dict(nltk_occurrences))
-    frequencies = occurrences / sum(occurrences)
-    frequencies = frequencies.sort_values(ascending=False)
-    return frequencies
-
-
-def epub2thtml(epub_path):
-    book = epub.read_epub(epub_path)
-    chapters = []
-    for item in book.get_items():
-        if item.get_type() == ebooklib.ITEM_DOCUMENT:
-            chapters.append(item.get_content())
-    return chapters
-
-
-def chap2text(chap):
-    output = ''
-    soup = BeautifulSoup(chap, 'html.parser')
-    text = soup.find_all(text=True)
-    for t in text:
-        if t.parent.name not in blacklist:
-            output += '{} '.format(t)
-    return output
-
-
-def thtml2ttext(thtml):
-    output = []
-    for html in thtml:
-        text = chap2text(html)
-        output.append(text)
-    return output
-
-
-def epub2text(epub_path):
-    chapters = epub2thtml(epub_path)
-    ttext = thtml2ttext(chapters)
-    return ttext
-
-
-def generate_corpus_from_epub_file(input_path, language):
-    epub_doc = epub2text(input_path)
-    corpus = []
-    sw = nltk.corpus.stopwords.words(language)
-    for content in epub_doc:
-        for w in nltk.word_tokenize(content, language=language):
-            w = w.lower()
-            if w not in sw and len(w) > 1:
-                corpus.append(w)
-    return corpus
-
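
Note: the deleted module extracts document items from the epub, strips markup with BeautifulSoup (skipping blacklisted tags), then tokenizes, lowercases, and drops stopwords and single characters; `make_word_frequency_series` turns the resulting token list into relative frequencies sorted descending. A usage sketch with a placeholder epub path ('english' is the nltk language name):

    # Hypothetical corpus build from the removed load_epub module.
    corpus = generate_corpus_from_epub_file('book.epub', 'english')
    frequencies = make_word_frequency_series(corpus)
    print(frequencies.head(10))  # the ten most frequent non-stopword tokens
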
@@ -1,24 +0,0 @@
-from ankimaker import generator
-
-from ankimaker.tasks import epub
-from ankimaker.tasks import dictionary
-
-
-def create_collection_and_filter_out_on_empty_definitions(words_from_epub, definitions):
-    collection = [(words, defi) for words, defi in zip(words_from_epub, definitions) if defi is not None]
-    return collection
-
-
-def process_epub(input_file, output_file, language, deck_name):
-    words_from_epub = epub.generate_corpus_from_epub_file(input_file, language)
-    definitions = dictionary.get_word_definitions_from_dictionary(language, words_from_epub)
-    collection = create_collection_and_filter_out_on_empty_definitions(words_from_epub, definitions)
-    generator_engine = generator.QuestionAnswerGenerator()
-
-    deck = generator.deck.create_deck(deck_name)
-
-    words_from_epub, definitions = map(list, zip(*collection))
-    cards = generator_engine.get_cards(words_from_epub, definitions)
-    for card in cards:
-        deck.add_note(card)
-    generator.deck.save_deck(deck, output_file)
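
Note: `process_epub` is the end-to-end task: build the corpus from the epub, look up definitions, drop words whose lookup returned `None`, then emit one Anki note per surviving (word, definition) pair into a saved deck. A usage sketch with placeholder file and deck names:

    # Hypothetical end-to-end invocation of the removed task.
    process_epub(
        input_file='book.epub',
        output_file='deck.apkg',
        language='english',
        deck_name='Book Vocabulary',
    )
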