Compare commits


6 Commits

17 changed files (changed line count in parentheses):

  1. .gitignore (3)
  2. notebooks/french_epub_stats.ipynb (341)
  3. notebooks/german_epub_stats.ipynb (341)
  4. requirements.txt (7)
  5. setup.py (8)
  6. src/ankimaker/commands/__init__.py (1)
  7. src/ankimaker/commands/from_epub.py (13)
  8. src/ankimaker/config/configuration.py (8)
  9. src/ankimaker/generator/__init__.py (2)
  10. src/ankimaker/generator/question_answer_generator.py (17)
  11. src/ankimaker/generator/translator_generator.py (28)
  12. src/ankimaker/tasks/__init__.py (1)
  13. src/ankimaker/tasks/basic_csv_to_anki.py (21)
  14. src/ankimaker/tasks/dictionary.py (24)
  15. src/ankimaker/tasks/epub/__init__.py (1)
  16. src/ankimaker/tasks/epub/load_epub.py (75)
  17. src/ankimaker/tasks/epub_to_anki.py (24)

.gitignore (vendored): 3 changed lines

@@ -161,3 +161,6 @@ cython_debug/
 # Project Specific
 scripts/
+.vscode
+.vscode
+data/

notebooks/french_epub_stats.ipynb: 341 changed lines

File diff suppressed because one or more lines are too long

notebooks/german_epub_stats.ipynb: 341 changed lines

File diff suppressed because one or more lines are too long

requirements.txt: 7 changed lines

@@ -1,5 +1,10 @@
 click
 genanki
-pandas
+pandas==1.5.2
 pyyaml
 bullet
+nltk
+EbookLib
+BeautifulSoup4
+PyMultiDictionary
+translate

setup.py: 8 changed lines

@@ -8,7 +8,7 @@ def readme():
 setup(
     name='ankimaker',
-    version='0.0.5',
+    version='0.0.6',
     description='Makes anki with files',
     url="https://git.lgoon.xyz/gabriel/ankimaker",
     license="BSD-3-Clause",
@@ -27,7 +27,11 @@ setup(
         "genanki",
         "pandas",
         "pyyaml",
-        "bullet"
+        "bullet",
+        "nltk",
+        "EbookLib",
+        "BeautifulSoup",
+        "wordfreq",
     ],
     long_description_content_type='text/markdown',
 )

src/ankimaker/commands/__init__.py: 1 changed line

@@ -1,3 +1,4 @@
 from .base_click import cli
 from .from_csv import generate_anki
 from .make_config import make_csv_config
+from .from_epub import process_epub

src/ankimaker/commands/from_epub.py: 13 added lines (new file)

@@ -0,0 +1,13 @@
+import click
+
+from ankimaker.commands import cli
+from ankimaker.tasks.epub_to_anki import process_epub
+
+
+@cli.command('epub')
+@click.option('-i', '--input', 'input_file', type=click.Path(exists=True))
+@click.option('-o', '--output', 'output_file', type=click.Path(exists=False))
+@click.option('-l', '--lang', 'language', default=None, type=click.STRING)
+@click.option('-n', '--name', 'name', required=False, type=click.STRING)
+def generate_anki(input_file, output_file, language, name):
+    process_epub(input_file=input_file, output_file=output_file, language=language, deck_name=name)
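Usage sketch (not part of the diff): the new 'epub' command can be exercised in-process with click's test runner, assuming the package is installed. The paths, language code, and deck name below are hypothetical.

    from click.testing import CliRunner
    from ankimaker.commands import cli

    runner = CliRunner()
    # Hypothetical arguments; '-l' is forwarded to process_epub as the dictionary lookup language.
    result = runner.invoke(cli, ['epub', '-i', 'book.epub', '-o', 'book.apkg', '-l', 'fr', '-n', 'Vocabulary'])
    print(result.exit_code)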

src/ankimaker/config/configuration.py: 8 changed lines

@@ -12,16 +12,20 @@ class AnkimakerConfig(yaml.YAMLObject):
     question_column = None
     answer_column = None
     separators = ','
+    input_language = None,
+    output_language = None,
     filters: List[List[FilterConfig]] = list()
 
     def __init__(
             self, separators=',', header=None, answer_column=None, question_column=None,
-            filters=tuple(), *args, **karhs
+            filters=tuple(), input_language=None, output_language=None, *args, **karhs
     ):
         self.answer_column = answer_column
         self.question_column = question_column
         self.header = header
         self.separators = separators
+        self.input_language = input_language
+        self.output_language = output_language
         self.filters = _conditionally_create_new_filters(filters)
 
     @staticmethod
@@ -34,6 +38,8 @@ class AnkimakerConfig(yaml.YAMLObject):
         AnkimakerConfig.question_column = content.question_column
         AnkimakerConfig.answer_column = content.answer_column
         AnkimakerConfig.separators = content.separators
+        AnkimakerConfig.input_language = content.input_language
+        AnkimakerConfig.output_language = content.output_language
         AnkimakerConfig.filters = _conditionally_create_new_filters(content.filters)
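Illustrative sketch (not part of the diff): constructing the config with the new language fields, based only on the __init__ signature shown above. The column name and language codes are hypothetical.

    from ankimaker.config.configuration import AnkimakerConfig

    # Hypothetical values; answer_column is left as None, so the CSV task below
    # would fall back to the translator-based generator using these languages.
    config = AnkimakerConfig(
        question_column='word',
        input_language='fr',
        output_language='en',
    )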

src/ankimaker/generator/__init__.py: 2 changed lines

@@ -3,3 +3,5 @@ from . import (
 )
 from .card import create_note
 from .model import create_model
+from .translator_generator import TranslatorGenerator
+from .question_answer_generator import QuestionAnswerGenerator

src/ankimaker/generator/question_answer_generator.py: 17 added lines (new file)

@@ -0,0 +1,17 @@
+import genanki
+from typing import Collection, List
+
+from ankimaker import generator
+
+
+class QuestionAnswerGenerator:
+    def __init__(self):
+        self.__model = generator.create_model()
+
+    def get_cards(self, questions: Collection[str], answers: Collection[str]) -> List[genanki.Model]:
+        assert len(questions) == len(answers)
+        cards = list()
+        for content_fields in zip(questions, answers):
+            card = generator.create_note(self.__model, fields=content_fields)
+            cards.append(card)
+        return cards
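Usage sketch (not part of the diff): pairing question and answer lists into cards. The word lists are hypothetical, and the note/model details come from generator.create_model()/create_note, which are not shown in this diff.

    from ankimaker.generator import QuestionAnswerGenerator

    questions = ['bonjour', 'merci']     # hypothetical front-side fields
    answers = ['hello', 'thank you']     # hypothetical back-side fields
    cards = QuestionAnswerGenerator().get_cards(questions, answers)
    # Each card can then be added to a genanki deck with deck.add_note(card).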

src/ankimaker/generator/translator_generator.py: 28 added lines (new file)

@@ -0,0 +1,28 @@
+import genanki
+from translate import Translator
+from typing import Collection, List
+
+from ankimaker import generator
+
+
+class TranslatorGenerator:
+    def __init__(self, original_language, destination_language):
+        """
+        :param original_language: Language of the inserted text, following https://en.wikipedia.org/wiki/ISO_639-1
+        :param destination_language: Language you want to translate to, following https://en.wikipedia.org/wiki/ISO_639-1
+        """
+        self.__translator = Translator(from_lang=original_language, to_lang=destination_language)
+        self.__model = generator.model.create_model()
+
+    def get_cards(self, content_collection: Collection[str]) -> List[genanki.Model]:
+        cards = list()
+        for content in content_collection:
+            card = self._create_card(content)
+            cards.append(card)
+        return cards
+
+    def _create_card(self, content):
+        translation = self.__translator.translate(content)
+        fields = (content, translation)
+        card = generator.create_note(self.__model, fields)
+        return card
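Usage sketch (not part of the diff): generating translation cards from a word list. The language pair and words are hypothetical, and the translate package typically relies on an online provider, so network access is needed.

    from ankimaker.generator import TranslatorGenerator

    # Hypothetical ISO 639-1 language pair and word list.
    gen = TranslatorGenerator(original_language='de', destination_language='en')
    cards = gen.get_cards(['Haus', 'Baum', 'Wasser'])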

src/ankimaker/tasks/__init__.py: 1 changed line

@@ -1,2 +1,3 @@
 from .basic_csv_to_anki import basic_pandas_to_anki
 from .config_tasks import create_config, enhance_config
+from . import dictionary

src/ankimaker/tasks/basic_csv_to_anki.py: 21 changed lines

@@ -18,14 +18,19 @@ def load_csv(path: str) -> pd.DataFrame:
 def add_df_to_deck(df: pd.DataFrame, deck: genanki.Deck) -> genanki.Deck:
-    model = generator.create_model()
-
-    for entry in df.to_dict('records'):
-        question = entry[Config.question_column]
-        answer = entry[Config.answer_column]
-        content_fields = (question, answer)
-        note = generator.create_note(model, fields=content_fields)
-        deck.add_note(note)
+    questions = df[Config.question_column].to_list()
+    if Config.answer_column is None:
+        generator_engine = generator.TranslatorGenerator(
+            original_language=Config.input_language,
+            destination_language=Config.output_language,
+        )
+        cards = generator_engine.get_cards(questions)
+    else:
+        answers = df[Config.answer_column]
+        generator_engine = generator.QuestionAnswerGenerator()
+        cards = generator_engine.get_cards(questions, answers)
+    for card in cards:
+        deck.add_note(card)
     return deck

src/ankimaker/tasks/dictionary.py: 24 added lines (new file)

@@ -0,0 +1,24 @@
+from multiprocessing import Pool
+from itertools import repeat
+from typing import Iterable, Optional
+from http.client import RemoteDisconnected as HttpClientRemoteDisconnected
+
+from PyMultiDictionary import MultiDictionary
+
+
+def get_and_process_word_definition(language: str, word: str) -> Optional[str]:
+    try:
+        dictionary = MultiDictionary()
+        definition = dictionary.meaning(lang=language, word=word)
+        if len(definition[1]) <= 1:
+            return None
+        definition = definition[1].split('.')[0]
+    except HttpClientRemoteDisconnected:
+        return None
+    return definition
+
+
+def get_word_definitions_from_dictionary(language: str, word_collection: Iterable[str]) -> Iterable[str]:
+    with Pool(7) as p:
+        definitions = p.starmap(get_and_process_word_definition, zip(repeat(language), word_collection))
+    return definitions
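Usage sketch (not part of the diff): looking up definitions for a handful of words. The language code and words are hypothetical, each lookup goes over the network via PyMultiDictionary, and the pool of 7 workers is hard-coded above.

    from ankimaker.tasks import dictionary

    # On platforms that spawn worker processes, run this under an
    # `if __name__ == '__main__':` guard because of multiprocessing.Pool.
    words = ['maison', 'arbre']   # hypothetical words
    definitions = dictionary.get_word_definitions_from_dictionary('fr', words)
    # Each entry is a short definition string, or None when a lookup fails
    # or returns an empty meaning.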

src/ankimaker/tasks/epub/__init__.py: 1 added line (new file)

@@ -0,0 +1 @@
+from . load_epub import generate_corpus_from_epub_file

src/ankimaker/tasks/epub/load_epub.py: 75 added lines (new file)

@@ -0,0 +1,75 @@
+import nltk
+import ebooklib
+import pandas as pd
+
+from ebooklib import epub
+from bs4 import BeautifulSoup
+
+nltk.download('stopwords')
+nltk.download('punkt')
+
+blacklist = (
+    '[document]',
+    'noscript',
+    'header',
+    'html',
+    'meta',
+    'head',
+    'input',
+    'script',
+    'style',
+    # there may be more elements you don't want, such as "style", etc.
+)
+
+
+def make_word_frequency_series(corpus):
+    nltk_occurrences = nltk.FreqDist(corpus)
+    occurrences: pd.Series = pd.Series(dict(nltk_occurrences))
+    frequencies = occurrences / sum(occurrences)
+    frequencies = frequencies.sort_values(ascending=False)
+    return frequencies
+
+
+def epub2thtml(epub_path):
+    book = epub.read_epub(epub_path)
+    chapters = []
+    for item in book.get_items():
+        if item.get_type() == ebooklib.ITEM_DOCUMENT:
+            chapters.append(item.get_content())
+    return chapters
+
+
+def chap2text(chap):
+    output = ''
+    soup = BeautifulSoup(chap, 'html.parser')
+    text = soup.find_all(text=True)
+    for t in text:
+        if t.parent.name not in blacklist:
+            output += '{} '.format(t)
+    return output
+
+
+def thtml2ttext(thtml):
+    Output = []
+    for html in thtml:
+        text = chap2text(html)
+        Output.append(text)
+    return Output
+
+
+def epub2text(epub_path):
+    chapters = epub2thtml(epub_path)
+    ttext = thtml2ttext(chapters)
+    return ttext
+
+
+def generate_corpus_from_epub_file(input_path, language):
+    epub_doc = epub2text(input_path)
+    corpus = []
+    sw = nltk.corpus.stopwords.words(language)
+    for content in epub_doc:
+        for w in nltk.word_tokenize(content, language=language):
+            w = w.lower()
+            if w not in sw and len(w) > 1:
+                corpus.append(w)
+    return corpus
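Usage sketch (not part of the diff): building a corpus and a frequency table from an EPUB. The file name is hypothetical; note that NLTK expects full lowercase language names such as 'german' or 'french' here rather than ISO codes.

    from ankimaker.tasks.epub.load_epub import (
        generate_corpus_from_epub_file,
        make_word_frequency_series,
    )

    # Hypothetical file; 'german' must be a language NLTK ships stopwords for.
    corpus = generate_corpus_from_epub_file('buch.epub', 'german')
    frequencies = make_word_frequency_series(corpus)
    print(frequencies.head(20))   # most frequent non-stopword tokens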

src/ankimaker/tasks/epub_to_anki.py: 24 added lines (new file)

@@ -0,0 +1,24 @@
+from ankimaker import generator
+from ankimaker.tasks import epub
+from ankimaker.tasks import dictionary
+
+
+def create_collection_and_filter_out_on_empty_definitions(words_from_epub, definitions):
+    collection = [(words, defi) for words, defi in zip(words_from_epub, definitions) if defi is not None]
+    return collection
+
+
+def process_epub(input_file, output_file, language, deck_name):
+    words_from_epub = epub.generate_corpus_from_epub_file(input_file)
+    definitions = dictionary.get_word_definitions_from_dictionary(language, words_from_epub)
+    collection = create_collection_and_filter_out_on_empty_definitions(words_from_epub, definitions)
+
+    generator_engine = generator.QuestionAnswerGenerator()
+    deck = generator.deck.create_deck(deck_name)
+
+    words_from_epub, definitions = map(list, zip(*collection))
+    cards = generator_engine.get_cards(words_from_epub, definitions)
+    for card in cards:
+        deck.add_note(card)
+
+    generator.deck.save_deck(deck, output_file)
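Usage sketch (not part of the diff): calling the new task directly with the same keyword arguments the 'epub' command forwards. All values are hypothetical, and the run needs network access for the dictionary lookups.

    from ankimaker.tasks.epub_to_anki import process_epub

    process_epub(
        input_file='book.epub',         # EPUB to mine for vocabulary
        output_file='book.apkg',        # where the generated deck is written
        language='en',                  # dictionary lookup language
        deck_name='Book vocabulary',    # name of the resulting Anki deck
    )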