Compare commits

6 commits · main...feature/ep

| Author | SHA1 | Date |
|---|---|---|
| | 1947a0e9f2 | |
| | d3d67e88df | |
| | 695a87127b | |
| | 61eb9180b7 | |
| | 0a2224e9a1 | |
| | f9dde2da0b | |
.gitignore (vendored, 4 lines changed)

@@ -169,6 +169,6 @@ cython_debug/

 # Project Specific
 scripts/

-.vscode
+.vscode
 data/
notebooks/french_epub_stats.ipynb (new file, 341 lines)
File diff suppressed because one or more lines are too long

notebooks/german_epub_stats.ipynb (new file, 341 lines)
File diff suppressed because one or more lines are too long
@@ -1,5 +1,10 @@
 click
 genanki
-pandas
+pandas==1.5.2
 pyyaml
 bullet
+nltk
+EbookLib
+BeautifulSoup4
+PyMultiDictionary
+translate
setup.py (8 lines changed)

@@ -8,7 +8,7 @@ def readme():

 setup(
     name='ankimaker',
-    version='0.0.5',
+    version='0.0.6',
     description='Makes anki with files',
     url="https://git.lgoon.xyz/gabriel/ankimaker",
     license="BSD-3-Clause",
@@ -27,7 +27,11 @@ setup(
         "genanki",
         "pandas",
         "pyyaml",
-        "bullet"
+        "bullet",
+        "nltk",
+        "EbookLib",
+        "BeautifulSoup",
+        "wordfreq",
     ],
     long_description_content_type='text/markdown',
 )
@@ -1,3 +1,4 @@
 from .base_click import cli
 from .from_csv import generate_anki
 from .make_config import make_csv_config
+from .from_epub import process_epub
src/ankimaker/commands/from_epub.py (new file, 13 lines)

@@ -0,0 +1,13 @@
+import click
+
+from ankimaker.commands import cli
+from ankimaker.tasks.epub_to_anki import process_epub
+
+
+@cli.command('epub')
+@click.option('-i', '--input', 'input_file', type=click.Path(exists=True))
+@click.option('-o', '--output', 'output_file', type=click.Path(exists=False))
+@click.option('-l', '--lang', 'language', default=None, type=click.STRING)
+@click.option('-n', '--name', 'name', required=False, type=click.STRING)
+def generate_anki(input_file, output_file, language, name):
+    process_epub(input_file=input_file, output_file=output_file, language=language, deck_name=name)
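A minimal sketch of exercising the new command in-process via click's test runner; the paths and deck name are illustrative, and the input file must exist because of click.Path(exists=True):

from click.testing import CliRunner

from ankimaker.commands.from_epub import generate_anki

runner = CliRunner()
result = runner.invoke(generate_anki, [
    '-i', 'books/novel.epub',   # illustrative; must point at a real epub
    '-o', 'decks/vocab.apkg',
    '-l', 'french',
    '-n', 'Novel Vocabulary',
])
print(result.exit_code)         # 0 on success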
@@ -12,16 +12,20 @@ class AnkimakerConfig(yaml.YAMLObject):
     question_column = None
     answer_column = None
     separators = ','
+    input_language = None,
+    output_language = None,
     filters: List[List[FilterConfig]] = list()

     def __init__(
             self, separators=',', header=None, answer_column=None, question_column=None,
-            filters=tuple(), *args, **karhs
+            filters=tuple(), input_language=None, output_language=None, *args, **karhs
     ):
         self.answer_column = answer_column
         self.question_column = question_column
         self.header = header
         self.separators = separators
+        self.input_language = input_language
+        self.output_language = output_language
         self.filters = _conditionally_create_new_filters(filters)

     @staticmethod
@@ -34,6 +38,8 @@ class AnkimakerConfig(yaml.YAMLObject):
         AnkimakerConfig.question_column = content.question_column
         AnkimakerConfig.answer_column = content.answer_column
         AnkimakerConfig.separators = content.separators
+        AnkimakerConfig.input_language = content.input_language
+        AnkimakerConfig.output_language = content.output_language
         AnkimakerConfig.filters = _conditionally_create_new_filters(content.filters)

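Since loading a config now also copies input_language and output_language onto the class (second hunk), a config that leaves answer_column unset can drive the translator path shown in basic_csv_to_anki.py below. A minimal sketch with illustrative values; language codes follow ISO 639-1 as documented in TranslatorGenerator:

config = AnkimakerConfig(
    separators=';',
    question_column='word',   # illustrative column name
    answer_column=None,       # no answer column: translate the questions instead
    input_language='fr',
    output_language='en',
)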
@@ -3,3 +3,5 @@ from . import (
 )
 from .card import create_note
 from .model import create_model
+from .translator_generator import TranslatorGenerator
+from .question_answer_generator import QuestionAnswerGenerator
src/ankimaker/generator/question_answer_generator.py (new file, 17 lines)

@@ -0,0 +1,17 @@
+import genanki
+from typing import Collection, List
+
+from ankimaker import generator
+
+
+class QuestionAnswerGenerator:
+    def __init__(self):
+        self.__model = generator.create_model()
+
+    def get_cards(self, questions: Collection[str], answers: Collection[str]) -> List[genanki.Model]:
+        assert len(questions) == len(answers)
+        cards = list()
+        for content_fields in zip(questions, answers):
+            card = generator.create_note(self.__model, fields=content_fields)
+            cards.append(card)
+        return cards
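A usage sketch for the new generator, reusing the deck helpers that epub_to_anki.py calls below; the words and deck name are illustrative:

from ankimaker import generator

engine = generator.QuestionAnswerGenerator()
cards = engine.get_cards(
    questions=['bonjour', 'merci'],
    answers=['hello', 'thank you'],
)
deck = generator.deck.create_deck('Sample Deck')
for card in cards:
    deck.add_note(card)
generator.deck.save_deck(deck, 'sample.apkg')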
src/ankimaker/generator/translator_generator.py (new file, 28 lines)

@@ -0,0 +1,28 @@
+import genanki
+from translate import Translator
+from typing import Collection, List
+
+from ankimaker import generator
+
+
+class TranslatorGenerator:
+    def __init__(self, original_language, destination_language):
+        """
+        :param original_language: Language of the inserted text, following https://en.wikipedia.org/wiki/ISO_639-1
+        :param destination_language: Language you want to translate to, following https://en.wikipedia.org/wiki/ISO_639-1
+        """
+        self.__translator = Translator(from_lang=original_language, to_lang=destination_language)
+        self.__model = generator.model.create_model()
+
+    def get_cards(self, content_collection: Collection[str]) -> List[genanki.Model]:
+        cards = list()
+        for content in content_collection:
+            card = self._create_card(content)
+            cards.append(card)
+        return cards
+
+    def _create_card(self, content):
+        translation = self.__translator.translate(content)
+        fields = (content, translation)
+        card = generator.create_note(self.__model, fields)
+        return card
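The translator variant in use; the translate package contacts an online service, so this sketch needs network access, and the sample words are illustrative:

from ankimaker.generator import TranslatorGenerator

engine = TranslatorGenerator(original_language='fr', destination_language='en')
cards = engine.get_cards(['bonjour', 'merci', 'fromage'])   # one note per word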
@@ -1,2 +1,3 @@
 from .basic_csv_to_anki import basic_pandas_to_anki
 from .config_tasks import create_config, enhance_config
+from . import dictionary
@@ -18,14 +18,19 @@ def load_csv(path: str) -> pd.DataFrame:


 def add_df_to_deck(df: pd.DataFrame, deck: genanki.Deck) -> genanki.Deck:
-    model = generator.create_model()
-
-    for entry in df.to_dict('records'):
-        question = entry[Config.question_column]
-        answer = entry[Config.answer_column]
-        content_fields = (question, answer)
-        note = generator.create_note(model, fields=content_fields)
-        deck.add_note(note)
+    questions = df[Config.question_column].to_list()
+    if Config.answer_column is None:
+        generator_engine = generator.TranslatorGenerator(
+            original_language=Config.input_language,
+            destination_language=Config.output_language,
+        )
+        cards = generator_engine.get_cards(questions)
+    else:
+        answers = df[Config.answer_column]
+        generator_engine = generator.QuestionAnswerGenerator()
+        cards = generator_engine.get_cards(questions, answers)
+    for card in cards:
+        deck.add_note(card)
     return deck

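A sketch of the rewritten helper on a toy DataFrame. Config attributes are set directly here for brevity, the same way AnkimakerConfig's loader assigns them at class level; the deck id and all names are illustrative:

import genanki
import pandas as pd

df = pd.DataFrame({'word': ['bonjour', 'merci'], 'meaning': ['hello', 'thank you']})
Config.question_column = 'word'
Config.answer_column = 'meaning'   # set to None to use the translator branch instead

deck = genanki.Deck(2059400110, 'CSV Deck')   # illustrative deck id and name
deck = add_df_to_deck(df, deck)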
src/ankimaker/tasks/dictionary.py (new file, 24 lines)

@@ -0,0 +1,24 @@
+from multiprocessing import Pool
+from itertools import repeat
+from typing import Iterable, Optional
+from http.client import RemoteDisconnected as HttpClientRemoteDisconnected
+
+from PyMultiDictionary import MultiDictionary
+
+
+def get_and_process_word_definition(language: str, word: str) -> Optional[str]:
+    try:
+        dictionary = MultiDictionary()
+        definition = dictionary.meaning(lang=language, word=word)
+        if len(definition[1]) <= 1:
+            return None
+        definition = definition[1].split('.')[0]
+    except HttpClientRemoteDisconnected:
+        return None
+    return definition
+
+
+def get_word_definitions_from_dictionary(language: str, word_collection: Iterable[str]) -> Iterable[str]:
+    with Pool(7) as p:
+        definitions = p.starmap(get_and_process_word_definition, zip(repeat(language), word_collection))
+    return definitions
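PyMultiDictionary's meaning() returns a tuple whose second element is the definition text, which is what the [1] indexing above relies on. A quick sketch of the parallel lookup; the words are illustrative and every lookup goes over the network:

words = ['maison', 'chat', 'fromage']
definitions = get_word_definitions_from_dictionary('fr', words)
# None marks words with no usable definition (or a dropped connection)
defined = {word: d for word, d in zip(words, definitions) if d is not None}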
src/ankimaker/tasks/epub/__init__.py (new file, 1 line)

@@ -0,0 +1 @@
+from .load_epub import generate_corpus_from_epub_file
src/ankimaker/tasks/epub/load_epub.py (new file, 75 lines)

@@ -0,0 +1,75 @@
+import nltk
+import ebooklib
+import pandas as pd
+from ebooklib import epub
+from bs4 import BeautifulSoup
+
+nltk.download('stopwords')
+nltk.download('punkt')
+
+blacklist = (
+    '[document]',
+    'noscript',
+    'header',
+    'html',
+    'meta',
+    'head',
+    'input',
+    'script',
+    'style',
+    # there may be more elements you don't want, such as "style", etc.
+)
+
+
+def make_word_frequency_series(corpus):
+    nltk_occurrences = nltk.FreqDist(corpus)
+    occurrences: pd.Series = pd.Series(dict(nltk_occurrences))
+    frequencies = occurrences / sum(occurrences)
+    frequencies = frequencies.sort_values(ascending=False)
+    return frequencies
+
+
+def epub2thtml(epub_path):
+    book = epub.read_epub(epub_path)
+    chapters = []
+    for item in book.get_items():
+        if item.get_type() == ebooklib.ITEM_DOCUMENT:
+            chapters.append(item.get_content())
+    return chapters
+
+
+def chap2text(chap):
+    output = ''
+    soup = BeautifulSoup(chap, 'html.parser')
+    text = soup.find_all(text=True)
+    for t in text:
+        if t.parent.name not in blacklist:
+            output += '{} '.format(t)
+    return output
+
+
+def thtml2ttext(thtml):
+    Output = []
+    for html in thtml:
+        text = chap2text(html)
+        Output.append(text)
+    return Output
+
+
+def epub2text(epub_path):
+    chapters = epub2thtml(epub_path)
+    ttext = thtml2ttext(chapters)
+    return ttext
+
+
+def generate_corpus_from_epub_file(input_path, language):
+    epub_doc = epub2text(input_path)
+    corpus = []
+    sw = nltk.corpus.stopwords.words(language)
+    for content in epub_doc:
+        for w in nltk.word_tokenize(content, language=language):
+            w = w.lower()
+            if w not in sw and len(w) > 1:
+                corpus.append(w)
+    return corpus
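A sketch tying the helpers together: extract a corpus from an epub and rank its words by relative frequency. The path is illustrative; the download() calls above fetch NLTK's stopword and tokenizer data at import time:

corpus = generate_corpus_from_epub_file('books/novel.epub', language='french')
frequencies = make_word_frequency_series(corpus)
print(frequencies.head(20))   # 20 most frequent non-stopword tokens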
src/ankimaker/tasks/epub_to_anki.py (new file, 24 lines)

@@ -0,0 +1,24 @@
+from ankimaker import generator
+
+from ankimaker.tasks import epub
+from ankimaker.tasks import dictionary
+
+
+def create_collection_and_filter_out_on_empty_definitions(words_from_epub, definitions):
+    collection = [(words, defi) for words, defi in zip(words_from_epub, definitions) if defi is not None]
+    return collection
+
+
+def process_epub(input_file, output_file, language, deck_name):
+    words_from_epub = epub.generate_corpus_from_epub_file(input_file)
+    definitions = dictionary.get_word_definitions_from_dictionary(language, words_from_epub)
+    collection = create_collection_and_filter_out_on_empty_definitions(words_from_epub, definitions)
+    generator_engine = generator.QuestionAnswerGenerator()
+
+    deck = generator.deck.create_deck(deck_name)
+
+    words_from_epub, definitions = map(list, zip(*collection))
+    cards = generator_engine.get_cards(words_from_epub, definitions)
+    for card in cards:
+        deck.add_note(card)
+    generator.deck.save_deck(deck, output_file)
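A sketch of driving the pipeline without the CLI, mirroring the click options in commands/from_epub.py; paths and deck name are illustrative. (Note that as committed, process_epub passes only input_file to epub.generate_corpus_from_epub_file, whose signature also expects a language.)

process_epub(
    input_file='books/novel.epub',
    output_file='decks/vocab.apkg',
    language='french',
    deck_name='Novel Vocabulary',
)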