Compare commits
6 Commits
main...feature/ep
Author | SHA1 | Date |
---|---|---|
gabriel becker | 521c7d694d | 1 year ago |
gabriel becker | 9aff3d7d43 | 2 years ago |
gabriel becker | 4166ee1424 | 2 years ago |
gabriel becker | 972661e92a | 2 years ago |
gabriel becker | 72c9dae6b4 | 2 years ago |
gabriel becker | 0c452726db | 2 years ago |
17 changed files with 905 additions and 14 deletions
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
requirements.txt
@@ -1,5 +1,10 @@
 click
 genanki
-pandas
+pandas==1.5.2
 pyyaml
 bullet
+nltk
+EbookLib
+BeautifulSoup4
+PyMultiDictionary
+translate
ankimaker/commands/__init__.py
@@ -1,3 +1,4 @@
 from .base_click import cli
 from .from_csv import generate_anki
 from .make_config import make_csv_config
+from .from_epub import process_epub
ankimaker/commands/from_epub.py
@@ -0,0 +1,13 @@
+import click
+
+from ankimaker.commands import cli
+from ankimaker.tasks.epub_to_anki import process_epub
+
+
+@cli.command('epub')
+@click.option('-i', '--input', 'input_file', type=click.Path(exists=True))
+@click.option('-o', '--output', 'output_file', type=click.Path(exists=False))
+@click.option('-l', '--lang', 'language', default=None, type=click.STRING)
+@click.option('-n', '--name', 'name', required=False, type=click.STRING)
+def generate_anki(input_file, output_file, language, name):
+    process_epub(input_file=input_file, output_file=output_file, language=language, deck_name=name)
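Since the new `epub` subcommand registers on the shared `cli` group, it can be exercised without a console entry point via click's test runner. A minimal sketch, assuming placeholder file paths and deck name (none of these values appear in the diff):

```python
# Minimal sketch: invoking the new 'epub' command through click's CliRunner.
# The 'cli' group and option names come from the diff above; the paths and
# deck name are placeholders.
from click.testing import CliRunner

from ankimaker.commands import cli

runner = CliRunner()
result = runner.invoke(cli, [
    'epub',
    '-i', 'book.epub',        # must exist (click.Path(exists=True))
    '-o', 'book_vocab.apkg',  # output deck path
    '-l', 'en',               # forwarded as 'language'
    '-n', 'Book Vocabulary',  # forwarded as 'deck_name'
])
print(result.exit_code, result.output)
```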
ankimaker/generator/…
@@ -0,0 +1,17 @@
+import genanki
+from typing import Collection, List
+
+from ankimaker import generator
+
+
+class QuestionAnswerGenerator:
+    def __init__(self):
+        self.__model = generator.create_model()
+
+    def get_cards(self, questions: Collection[str], answers: Collection[str]) -> List[genanki.Note]:
+        assert len(questions) == len(answers)
+        cards = list()
+        for content_fields in zip(questions, answers):
+            card = generator.create_note(self.__model, fields=content_fields)
+            cards.append(card)
+        return cards
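Because `epub_to_anki.py` below accesses this class as `generator.QuestionAnswerGenerator`, a usage sketch with made-up word/definition pairs looks like this:

```python
# Usage sketch for QuestionAnswerGenerator; the question/answer pairs are
# sample data, and the class is accessed the way epub_to_anki.py does.
from ankimaker import generator

questions = ['serendipity', 'ephemeral']
answers = ['finding something good without looking for it', 'lasting a very short time']

qa_generator = generator.QuestionAnswerGenerator()
cards = qa_generator.get_cards(questions, answers)  # one genanki note per (q, a) pair
```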
ankimaker/generator/…
@@ -0,0 +1,28 @@
+import genanki
+from translate import Translator
+from typing import Collection, List
+
+from ankimaker import generator
+
+
+class TranslatorGenerator:
+    def __init__(self, original_language, destination_language):
+        """
+        :param original_language: Language of the inserted text, following https://en.wikipedia.org/wiki/ISO_639-1
+        :param destination_language: Language you want to translate to, following https://en.wikipedia.org/wiki/ISO_639-1
+        """
+        self.__translator = Translator(from_lang=original_language, to_lang=destination_language)
+        self.__model = generator.model.create_model()
+
+    def get_cards(self, content_collection: Collection[str]) -> List[genanki.Note]:
+        cards = list()
+        for content in content_collection:
+            card = self._create_card(content)
+            cards.append(card)
+        return cards
+
+    def _create_card(self, content):
+        translation = self.__translator.translate(content)
+        fields = (content, translation)
+        card = generator.create_note(self.__model, fields)
+        return card
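`TranslatorGenerator` follows the same `get_cards` contract but derives the answer field by machine translation. A sketch, assuming the class is exported from `ankimaker.generator` like its sibling (the diff does not show this); language codes follow ISO 639-1 per the docstring, and the word list is sample data:

```python
# Sketch: English-to-Portuguese vocabulary cards via the 'translate' package.
from ankimaker import generator  # export of TranslatorGenerator is assumed

translator_generator = generator.TranslatorGenerator('en', 'pt')
cards = translator_generator.get_cards(['book', 'library'])  # front: word, back: translation
```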
ankimaker/tasks/__init__.py
@@ -1,2 +1,3 @@
 from .basic_csv_to_anki import basic_pandas_to_anki
 from .config_tasks import create_config, enhance_config
+from . import dictionary
ankimaker/tasks/dictionary.py
@@ -0,0 +1,24 @@
+from multiprocessing import Pool
+from itertools import repeat
+from typing import Iterable, Optional
+from http.client import RemoteDisconnected as HttpClientRemoteDisconnected
+
+from PyMultiDictionary import MultiDictionary
+
+
+def get_and_process_word_definition(language: str, word: str) -> Optional[str]:
+    try:
+        dictionary = MultiDictionary()
+        definition = dictionary.meaning(lang=language, word=word)
+        if len(definition[1]) <= 1:
+            return None
+        definition = definition[1].split('.')[0]
+    except HttpClientRemoteDisconnected:
+        return None
+    return definition
+
+
+def get_word_definitions_from_dictionary(language: str, word_collection: Iterable[str]) -> Iterable[Optional[str]]:
+    with Pool(7) as p:
+        definitions = p.starmap(get_and_process_word_definition, zip(repeat(language), word_collection))
+    return definitions
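`MultiDictionary.meaning()` returns a tuple whose second element is the definition text; the helper keeps only its first sentence and returns `None` for empty or failed lookups, which the caller later filters out. A usage sketch (network access is required, and the words are sample data):

```python
# Usage sketch for the dictionary helpers; requires network access and the
# __main__ guard that multiprocessing needs on spawn-based platforms.
from ankimaker.tasks import dictionary

if __name__ == '__main__':
    # Single lookup: first sentence of the definition, or None on failure.
    print(dictionary.get_and_process_word_definition('en', 'serendipity'))

    # Batch lookup: a Pool of 7 workers queries in parallel, order preserved.
    print(dictionary.get_word_definitions_from_dictionary('en', ['book', 'library']))
```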
ankimaker/tasks/epub/__init__.py
@@ -0,0 +1 @@
+from .load_epub import generate_corpus_from_epub_file
ankimaker/tasks/epub/load_epub.py
@@ -0,0 +1,75 @@
+import nltk
+import ebooklib
+import pandas as pd
+from ebooklib import epub
+from bs4 import BeautifulSoup
+
+nltk.download('stopwords')
+nltk.download('punkt')
+
+blacklist = (
+    '[document]',
+    'noscript',
+    'header',
+    'html',
+    'meta',
+    'head',
+    'input',
+    'script',
+    'style',
+    # there may be more elements you don't want, such as "style", etc.
+)
+
+
+def make_word_frequency_series(corpus):
+    nltk_occurrences = nltk.FreqDist(corpus)
+    occurrences: pd.Series = pd.Series(dict(nltk_occurrences))
+    frequencies = occurrences / sum(occurrences)
+    frequencies = frequencies.sort_values(ascending=False)
+    return frequencies
+
+
+def epub2thtml(epub_path):
+    book = epub.read_epub(epub_path)
+    chapters = []
+    for item in book.get_items():
+        if item.get_type() == ebooklib.ITEM_DOCUMENT:
+            chapters.append(item.get_content())
+    return chapters
+
+
+def chap2text(chap):
+    output = ''
+    soup = BeautifulSoup(chap, 'html.parser')
+    text = soup.find_all(text=True)
+    for t in text:
+        if t.parent.name not in blacklist:
+            output += '{} '.format(t)
+    return output
+
+
+def thtml2ttext(thtml):
+    output = []
+    for html in thtml:
+        text = chap2text(html)
+        output.append(text)
+    return output
+
+
+def epub2text(epub_path):
+    chapters = epub2thtml(epub_path)
+    ttext = thtml2ttext(chapters)
+    return ttext
+
+
+def generate_corpus_from_epub_file(input_path, language):
+    epub_doc = epub2text(input_path)
+    corpus = []
+    sw = nltk.corpus.stopwords.words(language)
+    for content in epub_doc:
+        for w in nltk.word_tokenize(content, language=language):
+            w = w.lower()
+            if w not in sw and len(w) > 1:
+                corpus.append(w)
+    return corpus
+
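The module is a pipeline: `epub2thtml` extracts the HTML chapters, `chap2text`/`thtml2ttext` strip markup outside the blacklist, and `generate_corpus_from_epub_file` tokenizes, lowercases, and drops stopwords and single-character tokens. A sketch using the re-export from the package `__init__` above ('book.epub' is a placeholder path, and nltk expects full language names such as 'english'):

```python
# Sketch of the corpus pipeline; 'book.epub' is a placeholder path.
from ankimaker.tasks import epub

corpus = epub.generate_corpus_from_epub_file('book.epub', 'english')
print(corpus[:10])  # lowercased tokens, stopwords and 1-char tokens removed
```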
ankimaker/tasks/epub_to_anki.py
@@ -0,0 +1,24 @@
+from ankimaker import generator
+
+from ankimaker.tasks import epub
+from ankimaker.tasks import dictionary
+
+
+def create_collection_and_filter_out_on_empty_definitions(words_from_epub, definitions):
+    collection = [(words, defi) for words, defi in zip(words_from_epub, definitions) if defi is not None]
+    return collection
+
+
+def process_epub(input_file, output_file, language, deck_name):
+    words_from_epub = epub.generate_corpus_from_epub_file(input_file, language)
+    definitions = dictionary.get_word_definitions_from_dictionary(language, words_from_epub)
+    collection = create_collection_and_filter_out_on_empty_definitions(words_from_epub, definitions)
+    generator_engine = generator.QuestionAnswerGenerator()
+
+    deck = generator.deck.create_deck(deck_name)
+
+    words_from_epub, definitions = map(list, zip(*collection))
+    cards = generator_engine.get_cards(words_from_epub, definitions)
+    for card in cards:
+        deck.add_note(card)
+    generator.deck.save_deck(deck, output_file)
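End to end, `process_epub` builds the corpus, looks up definitions, filters out empty ones, and writes the deck. Note that the single `language` value reaches both nltk (which expects names such as 'english') and PyMultiDictionary (which expects ISO 639-1 codes such as 'en'), so the two consumers appear to disagree about its format. A sketch with placeholder paths and names:

```python
# End-to-end sketch of the new task; paths and deck name are placeholders.
from ankimaker.tasks.epub_to_anki import process_epub

if __name__ == '__main__':  # dictionary lookups use multiprocessing
    process_epub(
        input_file='book.epub',
        output_file='book_vocab.apkg',
        language='en',  # see note above on 'en' vs 'english'
        deck_name='Book Vocabulary',
    )
```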