more features

This commit is contained in:
Claudio Santini 2025-01-31 21:45:37 +01:00
parent b50f9bdc12
commit 178d100596
2 changed files with 27 additions and 18 deletions

View file

@ -12,8 +12,8 @@ import sys
import time import time
import shutil import shutil
import subprocess import subprocess
import warnings
import re import re
from tabulate import tabulate
from pathlib import Path from pathlib import Path
from string import Formatter from string import Formatter
from yaspin import yaspin from yaspin import yaspin
@ -31,19 +31,18 @@ sample_rate = 24000
def main(file_path, voice, pick_manually, speed, max_chapters=None): def main(file_path, voice, pick_manually, speed, max_chapters=None):
filename = Path(file_path).name filename = Path(file_path).name
warnings.simplefilter("ignore")
book = epub.read_epub(file_path) book = epub.read_epub(file_path)
meta_title = book.get_metadata('DC', 'title') meta_title = book.get_metadata('DC', 'title')
title = meta_title[0][0] if meta_title else '' title = meta_title[0][0] if meta_title else ''
meta_creator = book.get_metadata('DC', 'creator') meta_creator = book.get_metadata('DC', 'creator')
by_creator = 'by ' + meta_creator[0][0] if meta_creator else '' creator = meta_creator[0][0] if meta_creator else ''
cover_maybe = [c for c in book.get_items() if c.get_type() == ebooklib.ITEM_COVER] cover_maybe = [c for c in book.get_items() if c.get_type() == ebooklib.ITEM_COVER]
cover_image = cover_maybe[0].get_content() if cover_maybe else b"" cover_image = cover_maybe[0].get_content() if cover_maybe else b""
if cover_maybe: if cover_maybe:
print(f'Found cover image {cover_maybe[0].file_name} in {cover_maybe[0].media_type} format') print(f'Found cover image {cover_maybe[0].file_name} in {cover_maybe[0].media_type} format')
intro = f'{title} {by_creator}' intro = f'{title} {creator}.\n\n'
print(intro) print(intro)
document_chapters = find_document_chapters_and_extract_texts(book) document_chapters = find_document_chapters_and_extract_texts(book)
@ -102,7 +101,7 @@ def main(file_path, voice, pick_manually, speed, max_chapters=None):
chapter_wav_files.remove(chapter_filename) chapter_wav_files.remove(chapter_filename)
if has_ffmpeg: if has_ffmpeg:
create_index_file(title, by_creator, chapter_wav_files) create_index_file(title, creator, chapter_wav_files)
create_m4b(chapter_wav_files, filename, cover_image) create_m4b(chapter_wav_files, filename, cover_image)
@ -151,25 +150,32 @@ def is_chapter(c):
return has_min_len and title_looks_like_chapter return has_min_len and title_looks_like_chapter
def chapter_beginning_one_liner(c, chars=20):
    """Return a short single-line preview of the start of a chapter's text.

    Args:
        c: Object exposing the chapter text as a string attribute
            ``extracted_text``.
        chars: Maximum number of characters taken from the start of the text.

    Returns:
        Up to ``chars`` characters from the beginning of the text, with
        surrounding whitespace removed and line breaks replaced by spaces;
        an empty string when the text is empty or whitespace-only.
    """
    # Drop leading whitespace *before* truncating. Slicing first lets leading
    # blank lines consume the whole ``chars`` budget, which produced an empty
    # (or needlessly short) preview for texts that start with newlines.
    preview = c.extracted_text.lstrip()[:chars].strip()
    # Flatten any remaining line breaks so the preview stays on one line.
    return preview.replace('\n', ' ').replace('\r', ' ')
def find_good_chapters(document_chapters): def find_good_chapters(document_chapters):
chapters = [c for c in document_chapters if c.get_type() == ebooklib.ITEM_DOCUMENT and is_chapter(c)] chapters = [c for c in document_chapters if c.get_type() == ebooklib.ITEM_DOCUMENT and is_chapter(c)]
from tabulate import tabulate
if len(chapters) == 0: if len(chapters) == 0:
print('Not easy to recognize the chapters, defaulting to all available documents.') print('Not easy to recognize the chapters, defaulting to all non-empty documents.')
chapters = [c for c in document_chapters if c.get_type() == ebooklib.ITEM_DOCUMENT] chapters = [c for c in document_chapters if c.get_type() == ebooklib.ITEM_DOCUMENT and len(c.extracted_text) > 10]
print(tabulate([ print(tabulate([
[i, c.get_name(), len(c.extracted_text), '' if c in chapters else ''] [i, c.get_name(), len(c.extracted_text), '' if c in chapters else '', chapter_beginning_one_liner(c)]
for i, c in enumerate(document_chapters, start=1) for i, c in enumerate(document_chapters, start=1)
], headers=['#', 'Chapter', 'Text Length', 'Selected'])) ], headers=['#', 'Chapter', 'Text Length', 'Selected', 'First words']))
return chapters return chapters
def pick_chapters(chapters): def pick_chapters(chapters):
all_chapters_names = [c.get_name() for c in chapters if c.get_type() == ebooklib.ITEM_DOCUMENT] # Display the document name, the length and first 50 characters of the text
chapters_by_names = {
f'{c.get_name()}\t({len(c.extracted_text)} chars)\t[{chapter_beginning_one_liner(c, 50)}]': c
for c in chapters}
title = 'Select which chapters to read in the audiobook' title = 'Select which chapters to read in the audiobook'
selected_chapters_names = pick(all_chapters_names, title, multiselect=True, min_selection_count=1) ret = pick(list(chapters_by_names.keys()), title, multiselect=True, min_selection_count=1)
selected_chapters_names = [c[0] for c in selected_chapters_names] selected_chapters_out_of_order = [chapters_by_names[r[0]] for r in ret]
selected_chapters = [c for c in chapters if c.get_name() in selected_chapters_names] selected_chapters = [c for c in chapters if c in selected_chapters_out_of_order]
return selected_chapters return selected_chapters

View file

@ -16,26 +16,29 @@ class MainTest(unittest.TestCase):
merged_args = dict(voice='af_sky', pick_manually=False, speed=1.0, max_chapters=2) merged_args = dict(voice='af_sky', pick_manually=False, speed=1.0, max_chapters=2)
merged_args.update(kwargs) merged_args.update(kwargs)
main(f'{name}.epub', **merged_args) main(f'{name}.epub', **merged_args)
self.assertTrue(Path(f'{name}.m4b').exists()) m4b_file = Path(f'{name}.m4b')
self.assertTrue(m4b_file.exists())
self.assertTrue(m4b_file.stat().st_size > 256 * 1024)
chapter_1_wav = Path(f'{name}_chapter_1.wav') chapter_1_wav = Path(f'{name}_chapter_1.wav')
self.assertTrue(chapter_1_wav.exists()) self.assertTrue(chapter_1_wav.exists())
self.assertTrue(chapter_1_wav.stat().st_size > 256 * 1024) self.assertTrue(chapter_1_wav.stat().st_size > 256 * 1024)
def test_poe(self): def test_poe(self):
url = 'https://www.gutenberg.org/ebooks/1064.epub.images' url = 'https://www.gutenberg.org/ebooks/1064.epub.images'
self.base('poe') self.base('poe', url)
def test_orwell(self): def test_orwell(self):
url = 'https://archive.org/download/AnimalFarmByGeorgeOrwell/Animal%20Farm%20by%20George%20Orwell.epub' url = 'https://archive.org/download/AnimalFarmByGeorgeOrwell/Animal%20Farm%20by%20George%20Orwell.epub'
self.base('orwell', url) self.base('orwell', url)
def test_italian_pirandello(self): def test_italian_pirandello(self):
self.base('pirandello', voice='im_nicola') url = 'https://www.liberliber.eu/mediateca/libri/p/pirandello/cosi_e_se_vi_pare_1925/epub/pirandello_cosi_e_se_vi_pare_1925.epub'
self.base('pirandello', url, voice='im_nicola')
self.assertTrue(Path('pirandello.m4b').exists()) self.assertTrue(Path('pirandello.m4b').exists())
def test_italian_manzoni(self): def test_italian_manzoni(self):
url = 'https://www.liberliber.eu/mediateca/libri/m/manzoni/i_promessi_sposi/epub/manzoni_i_promessi_sposi.epub' url = 'https://www.liberliber.eu/mediateca/libri/m/manzoni/i_promessi_sposi/epub/manzoni_i_promessi_sposi.epub'
self.base('manzoni', url, voice='im_nicola') self.base('manzoni', url, voice='im_nicola', max_chapters=1)
def test_french_baudelaire(self): def test_french_baudelaire(self):
url = 'http://gallica.bnf.fr/ark:/12148/bpt6k70861t.epub' url = 'http://gallica.bnf.fr/ark:/12148/bpt6k70861t.epub'