audiblez/audiblez.py
Claudio Santini 1ef26f893b voices
2025-02-01 11:23:50 +01:00

314 lines
12 KiB
Python
Executable file
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env python3
# audiblez - A program to convert e-books into audiobooks using
# Kokoro-82M model for high-quality text-to-speech synthesis.
# by Claudio Santini 2025 - https://claudio.uk
import torch
import spacy
import ebooklib
import soundfile
import numpy as np
import argparse
import sys
import time
import shutil
import subprocess
import re
from tabulate import tabulate
from pathlib import Path
from string import Formatter
from yaspin import yaspin
from bs4 import BeautifulSoup
from kokoro import KPipeline
from ebooklib import epub
from pydub import AudioSegment
from pick import pick
from tempfile import NamedTemporaryFile
from voices import voices, available_voices_str
# Sample rate passed to soundfile.write when the chapter WAV files are saved.
sample_rate = 24000
def main(file_path, voice, pick_manually, speed, max_chapters=None):
    """Convert an EPUB file into per-chapter WAV files and, if ffmpeg is installed, an M4B audiobook.

    Args:
        file_path: Path of the EPUB file to read.
        voice: Voice name; its first character is used as the KPipeline language code.
        pick_manually: When True the chapters are chosen interactively, otherwise
            they are detected by find_good_chapters.
        speed: Speech speed forwarded to the text-to-speech pipeline.
        max_chapters: Optional limit on how many of the selected chapters are processed.
    """
    if not spacy.util.is_package("xx_ent_wiki_sm"):
        print("Downloading Spacy model xx_ent_wiki_sm...")
        spacy.cli.download("xx_ent_wiki_sm")

    filename = Path(file_path).name
    # Output names are built from the stem instead of str.replace('.epub', ...):
    # replace() is a no-op for names without a literal '.epub' (e.g. 'BOOK.EPUB'),
    # which made every chapter file name equal to the input file name.
    output_stem = Path(filename).stem

    book = epub.read_epub(file_path)
    meta_title = book.get_metadata('DC', 'title')
    title = meta_title[0][0] if meta_title else ''
    meta_creator = book.get_metadata('DC', 'creator')
    creator = meta_creator[0][0] if meta_creator else ''

    cover_maybe = find_cover(book)
    cover_image = cover_maybe.get_content() if cover_maybe else b""
    if cover_maybe:
        print(f'Found cover image {cover_maybe.file_name} in {cover_maybe.media_type} format')

    intro = f'{title} {creator}.\n\n'
    print(intro)

    document_chapters = find_document_chapters_and_extract_texts(book)
    if pick_manually is True:
        selected_chapters = pick_chapters(document_chapters)
    else:
        selected_chapters = find_good_chapters(document_chapters)
    print_selected_chapters(document_chapters, selected_chapters)
    texts = [c.extracted_text for c in selected_chapters]

    has_ffmpeg = shutil.which('ffmpeg') is not None
    if not has_ffmpeg:
        print('\033[91m' + 'ffmpeg not found. Please install ffmpeg to create mp3 and m4b audiobook files.' + '\033[0m')

    total_chars, processed_chars = sum(map(len, texts)), 0
    print('Started at:', time.strftime('%H:%M:%S'))
    print(f'Total characters: {total_chars:,}')
    print('Total words:', len(' '.join(texts).split()))
    chars_per_sec = 500 if torch.cuda.is_available() else 50
    print(f'Estimated time remaining (assuming {chars_per_sec} chars/sec): {strfdelta((total_chars - processed_chars) / chars_per_sec)}')

    # The pipeline is built lazily on first use and then shared by all chapters,
    # so nothing is loaded when every chapter file already exists.
    pipeline = None
    chapter_wav_files = []
    for i, chapter in enumerate(selected_chapters, start=1):
        if max_chapters and i > max_chapters:
            break
        text = chapter.extracted_text
        # Progress is tracked with the chapter's own length (without the intro) so that
        # processed_chars can never exceed total_chars.
        chapter_chars = len(text)
        xhtml_file_name = chapter.get_name().replace(' ', '_').replace('/', '_').replace('\\', '_')
        chapter_filename = f'{output_stem}_chapter_{i}_{voice}_{xhtml_file_name}.wav'
        if Path(chapter_filename).exists():
            print(f'File for chapter {i} already exists. Skipping')
            chapter_wav_files.append(chapter_filename)
            processed_chars += chapter_chars  # already done: do not count it as remaining work
            continue
        if len(text.strip()) < 10:
            print(f'Skipping empty chapter {i}')
            processed_chars += chapter_chars
            continue
        if i == 1:
            text = intro + '.\n\n' + text
        start_time = time.time()
        if pipeline is None:
            pipeline = KPipeline(lang_code=voice[0])  # a for american or b for british etc.

        with yaspin(text=f'Reading chapter {i} ({len(text):,} characters)...', color="yellow") as spinner:
            audio_segments = gen_audio_segments(pipeline, text, voice, speed)
            if audio_segments:
                final_audio = np.concatenate(audio_segments)
                soundfile.write(chapter_filename, final_audio, sample_rate)
                chapter_wav_files.append(chapter_filename)
                end_time = time.time()
                delta_seconds = end_time - start_time
                chars_per_sec = len(text) / delta_seconds
                processed_chars += chapter_chars
                spinner.ok("")
                print(f'Estimated time remaining: {strfdelta((total_chars - processed_chars) / chars_per_sec)}')
                print('Chapter written to', chapter_filename)
                print(f'Chapter {i} read in {delta_seconds:.2f} seconds ({chars_per_sec:.0f} characters per second)')
                progress = processed_chars * 100 // total_chars
                print('Progress:', f'{progress}%\n')
            else:
                spinner.fail("")
                print(f'Warning: No audio generated for chapter {i}')

    if has_ffmpeg:
        if chapter_wav_files:
            create_index_file(title, creator, chapter_wav_files)
            create_m4b(chapter_wav_files, filename, cover_image)
        else:
            # Without any chapter audio there is nothing for ffmpeg to assemble.
            print('No chapter audio was produced; skipping M4B creation.')
def find_cover(book):
    """Return the item of ``book`` that holds the cover image, or None when none is found.

    Candidates are examined in this order and the first one whose media type
    starts with 'image/' wins:
      1. items of type ITEM_COVER,
      2. items referenced by OPF 'cover' metadata,
      3. the item whose id is 'cover',
      4. ITEM_IMAGE items whose name contains 'cover' (case-insensitive).
    """

    def is_image(item):
        return item is not None and item.media_type.startswith('image/')

    def candidates():
        # A generator keeps the lookups lazy: later sources are only queried
        # when the earlier ones did not provide an image.
        yield from book.get_items_of_type(ebooklib.ITEM_COVER)

        # https://idpf.org/forum/topic-715
        for meta in book.get_metadata('OPF', 'cover'):
            yield book.get_item_with_id(meta[1]['content'])

        yield book.get_item_with_id('cover')

        for image in book.get_items_of_type(ebooklib.ITEM_IMAGE):
            if 'cover' in image.get_name().lower():
                yield image

    return next((candidate for candidate in candidates() if is_image(candidate)), None)
def print_selected_chapters(document_chapters, chapters):
    """Print a table of all document chapters and mark the ones selected for narration.

    Args:
        document_chapters: Every document item of the book, in order.
        chapters: The subset of ``document_chapters`` that will be read.
    """
    # Both branches of the marker expression used to be empty strings, which left the
    # 'Selected' column blank for every row; selected chapters now get a check mark.
    rows = [
        [i, c.get_name(), len(c.extracted_text), '✔' if c in chapters else '', chapter_beginning_one_liner(c)]
        for i, c in enumerate(document_chapters, start=1)
    ]
    print(tabulate(rows, headers=['#', 'Chapter', 'Text Length', 'Selected', 'First words']))
def gen_audio_segments(pipeline, text, voice, speed):
    """Synthesize ``text`` sentence by sentence and return the list of audio chunks.

    The text is split into sentences with the spaCy 'xx_ent_wiki_sm' model plus a
    'sentencizer' component; each sentence is passed to ``pipeline`` and the third
    element of every tuple it yields is collected, in order.
    """
    sentence_splitter = spacy.load('xx_ent_wiki_sm')
    sentence_splitter.add_pipe('sentencizer')
    return [
        audio
        for sentence in list(sentence_splitter(text).sents)
        for _gs, _ps, audio in pipeline(sentence.text, voice=voice, speed=speed, split_pattern=r'\n\n\n')
    ]
def find_document_chapters_and_extract_texts(book):
    """Return every ITEM_DOCUMENT item of ``book``, each enriched with ``extracted_text``.

    ``extracted_text`` holds the stripped text of the title, paragraph, heading and
    list-item tags of the document body, one entry per line.

    Args:
        book: The e-book whose items are scanned.

    Returns:
        The list of document items, in the order ``book.get_items()`` yields them.
    """
    html_content_tags = ['title', 'p', 'h1', 'h2', 'h3', 'h4', 'li']
    document_chapters = []
    for chapter in book.get_items():
        if chapter.get_type() != ebooklib.ITEM_DOCUMENT:
            continue
        soup = BeautifulSoup(chapter.get_body_content(), features='lxml')
        lines = []
        for tag in soup.find_all(html_content_tags):
            # Content tags can be nested (<p> inside <li>, nested lists). Using tag.text for
            # each of them emitted the inner text once per enclosing tag, so it was narrated
            # several times. Every string is attributed only to its nearest content-tag ancestor.
            own_text = ''.join(
                string for string in tag.strings
                if string.find_parent(html_content_tags) is tag
            )
            inner_text = own_text.strip()
            if inner_text:
                lines.append(inner_text + '\n')
        chapter.extracted_text = ''.join(lines)
        document_chapters.append(chapter)
    return document_chapters
def is_chapter(c):
    """Tell whether document item ``c`` looks like a real chapter.

    An item qualifies when its extracted text is longer than 100 characters and its
    lower-cased name either contains 'chapter' or matches one of the numbered
    patterns part/split/ch/chap (optional underscore, then one to three digits).
    """
    name = c.get_name().lower()
    if len(c.extracted_text) <= 100:
        return False
    if 'chapter' in name:
        return True
    numbered_patterns = (
        r'part_?\d{1,3}',
        r'split_?\d{1,3}',
        r'ch_?\d{1,3}',
        r'chap_?\d{1,3}',
    )
    return any(re.search(pattern, name) for pattern in numbered_patterns)
def chapter_beginning_one_liner(c, chars=20):
    """Return a one-line excerpt of the beginning of a chapter's extracted text.

    Args:
        c: Object exposing the chapter text as ``extracted_text``.
        chars: Number of leading characters to take before stripping.

    Returns:
        The excerpt with line breaks replaced by spaces and an ellipsis appended,
        or an empty string when the excerpt is empty.
    """
    excerpt = c.extracted_text[:chars].strip().replace('\n', ' ').replace('\r', ' ')
    if not excerpt:
        return ''
    # The suffix used to be an empty string, which made the concatenation and the
    # conditional no-ops; the ellipsis marks the text as an excerpt.
    return excerpt + '…'
def find_good_chapters(document_chapters):
    """Select the document items that should be narrated.

    Items accepted by ``is_chapter`` are preferred. When none qualifies, a notice is
    printed and every document item with more than 10 characters of text is used.
    """
    documents = [item for item in document_chapters if item.get_type() == ebooklib.ITEM_DOCUMENT]
    recognized = [item for item in documents if is_chapter(item)]
    if recognized:
        return recognized
    print('Not easy to recognize the chapters, defaulting to all non-empty documents.')
    return [item for item in documents if len(item.extracted_text) > 10]
def pick_chapters(chapters):
    """Let the user choose interactively which chapters to read.

    Each menu entry shows the document name, the text length and the first
    50 characters of the text. At least one entry must be selected; the chosen
    chapters are returned in their original book order.
    """
    label_to_chapter = {}
    for chapter in chapters:
        label = f'{chapter.get_name()}\t({len(chapter.extracted_text)} chars)\t[{chapter_beginning_one_liner(chapter, 50)}]'
        label_to_chapter[label] = chapter

    title = 'Select which chapters to read in the audiobook'
    picked_entries = pick(list(label_to_chapter), title, multiselect=True, min_selection_count=1)

    # The first element of each returned entry is the label that was shown.
    picked = [label_to_chapter[entry[0]] for entry in picked_entries]
    # Filtering the original list restores the book order of the selection.
    return list(filter(lambda chapter: chapter in picked, chapters))
def strfdelta(tdelta, fmt='{D:02}d {H:02}h {M:02}m {S:02}s'):
    """Format a duration using the fields W, D, H, M and S (weeks, days, hours, minutes, seconds).

    Only the fields that appear in ``fmt`` take part in the breakdown, so e.g.
    '{M}m {S}s' expresses the whole duration in minutes and seconds.

    Args:
        tdelta: Duration in seconds (int or float, truncated towards zero) or a
            ``datetime.timedelta``.
        fmt: ``str.format``-style template referencing any of W, D, H, M, S.

    Returns:
        The formatted duration. Negative durations are rendered as the formatted
        absolute value prefixed with '-'.
    """
    from datetime import timedelta

    seconds = tdelta.total_seconds() if isinstance(tdelta, timedelta) else tdelta
    remainder = int(seconds)
    # divmod() floors, so feeding it a negative value yields misleading fields
    # (-5 s would render as '-1d 23h 59m 55s'); split the magnitude and re-attach the sign.
    sign = '-' if remainder < 0 else ''
    remainder = abs(remainder)

    formatter = Formatter()
    desired_fields = {field_name for _, field_name, _, _ in formatter.parse(fmt)}
    # Largest unit first so every requested field receives what the bigger ones left over.
    units = (('W', 604800), ('D', 86400), ('H', 3600), ('M', 60), ('S', 1))
    values = {}
    for field, unit_seconds in units:
        if field in desired_fields:
            values[field], remainder = divmod(remainder, unit_seconds)
    return sign + formatter.format(fmt, **values)
def create_m4b(chapter_files, filename, cover_image):
    """Merge the chapter WAV files into a single M4B audiobook with ffmpeg.

    The chapters are first concatenated into a temporary AAC/MP4 file (reused if it
    already exists); ffmpeg then combines it with the metadata in ``chapters.txt``
    and, when given, the cover image.

    Args:
        chapter_files: Paths of the chapter WAV files, in playback order.
        filename: Name of the source e-book; the output names are derived from it.
        cover_image: Raw bytes of the cover image; a falsy value means no cover.
    """
    source = Path(filename)
    # Derive names from the stem: str.replace('.epub', ...) leaves names without a literal
    # '.epub' unchanged, which would make the "temporary" file the input file itself and
    # delete it at the end of this function.
    tmp_filename = str(source.with_name(source.stem + '.tmp.mp4'))
    final_filename = str(source.with_name(source.stem + '.m4b'))

    if not Path(tmp_filename).exists():
        combined_audio = AudioSegment.empty()
        for wav_file in chapter_files:
            combined_audio += AudioSegment.from_wav(wav_file)
        print('Converting to Mp4...')
        combined_audio.export(tmp_filename, format="mp4", codec="aac", bitrate="64k")

    print('Creating M4B file...')

    cover_path = None
    cover_image_args = []
    if cover_image:
        # delete=False plus leaving the ``with`` block guarantees the bytes are flushed and the
        # handle is closed before ffmpeg opens the file (an open NamedTemporaryFile may hold
        # buffered data and cannot be reopened by another process on Windows).
        with NamedTemporaryFile("wb", delete=False) as cover_image_file:
            cover_image_file.write(cover_image)
        cover_path = cover_image_file.name
        cover_image_args = ["-i", cover_path, "-map", "0:a", "-map", "2:v"]

    try:
        proc = subprocess.run([
            'ffmpeg',
            '-i', f'{tmp_filename}',
            '-i', 'chapters.txt',
            *cover_image_args,
            '-map', '0',
            '-map_metadata', '1',
            '-c:a', 'copy',
            '-c:v', 'copy',
            '-disposition:v', 'attached_pic',
            '-c', 'copy',
            '-f', 'mp4',
            f'{final_filename}'
        ])
    finally:
        # The temporary cover file is no longer removed automatically, so clean it up here.
        if cover_path is not None:
            Path(cover_path).unlink(missing_ok=True)

    Path(tmp_filename).unlink()
    if proc.returncode == 0:
        print(f'{final_filename} created. Enjoy your audiobook.')
        print('Feel free to delete the intermediary .wav chapter files, the .m4b is all you need.')
def probe_duration(file_name):
    """Return the duration of ``file_name`` as a float, as reported by ffprobe's ``format=duration``.

    Raises:
        subprocess.CalledProcessError: If ffprobe exits with a non-zero status.
        ValueError: If the ffprobe output cannot be parsed as a float.
    """
    command = [
        'ffprobe',
        '-i', file_name,
        '-show_entries', 'format=duration',
        '-v', 'quiet',
        '-of', 'default=noprint_wrappers=1:nokey=1',
    ]
    completed = subprocess.run(command, capture_output=True, text=True, check=True)
    return float(completed.stdout.strip())
def create_index_file(title, creator, chapter_mp3_files):
    """Write ``chapters.txt``, an FFMETADATA1 file with the book tags and one chapter per audio file.

    Args:
        title: Book title, written as the ``title`` tag.
        creator: Book author, written as the ``artist`` tag.
        chapter_mp3_files: Paths of the chapter audio files in playback order; their
            durations (via ``probe_duration``) define the chapter boundaries.
    """

    def escape(value):
        # In the ffmetadata format '=', ';', '#', '\' and newlines inside a value must be
        # escaped with a backslash, otherwise they corrupt the key/value structure.
        return re.sub(r'([=;#\\\n])', r'\\\1', str(value))

    # Explicit UTF-8: the platform default encoding may be unable to encode non-ASCII
    # titles or author names.
    with open("chapters.txt", "w", encoding="utf-8") as f:
        f.write(f";FFMETADATA1\ntitle={escape(title)}\nartist={escape(creator)}\n\n")
        start = 0
        # NOTE(review): chapter titles are numbered from 0 here while main() numbers
        # chapters from 1 - kept as is, confirm whether this is intended.
        for i, chapter_file in enumerate(chapter_mp3_files):
            duration = probe_duration(chapter_file)
            end = start + int(duration * 1000)  # TIMEBASE below is 1/1000, i.e. milliseconds
            f.write(f"[CHAPTER]\nTIMEBASE=1/1000\nSTART={start}\nEND={end}\ntitle=Chapter {i}\n\n")
            start = end
def cli_main():
    """Command-line entry point: parse the arguments, optionally enable CUDA and run ``main``.

    Prints the help text to stderr and exits with status 1 when called without arguments.
    """
    voices_str = ', '.join(voices)
    # The example only uses options defined by the parser below; it used to show a
    # '-l en-us' flag that does not exist and made the documented command fail.
    epilog = ('example:\n' +
              ' audiblez book.epub -v af_sky\n\n' +
              'available voices:\n' +
              available_voices_str)
    default_voice = 'af_sky'
    parser = argparse.ArgumentParser(epilog=epilog, formatter_class=argparse.RawDescriptionHelpFormatter)
    parser.add_argument('epub_file_path', help='Path to the epub file')
    parser.add_argument('-v', '--voice', default=default_voice, help=f'Choose narrating voice: {voices_str}')
    parser.add_argument('-p', '--pick', default=False, help='Interactively select which chapters to read in the audiobook', action='store_true')
    parser.add_argument('-s', '--speed', default=1.0, help='Set speed from 0.5 to 2.0', type=float)
    parser.add_argument('-c', '--cuda', default=False, help='Use GPU via Cuda in Torch if available', action='store_true')

    if len(sys.argv) == 1:
        parser.print_help(sys.stderr)
        sys.exit(1)
    args = parser.parse_args()

    if args.cuda:
        if torch.cuda.is_available():
            print('CUDA GPU available')
            torch.set_default_device('cuda')
        else:
            print('CUDA GPU not available. Defaulting to CPU')

    main(args.epub_file_path, args.voice, args.pick, args.speed)
# Run the command-line interface only when this file is executed as a script.
if __name__ == "__main__":
    cli_main()