audiblez/audiblez.py

313 lines
12 KiB
Python
Raw Normal View History

2025-01-14 17:45:04 +01:00
#!/usr/bin/env python3
2025-01-14 22:57:31 +01:00
# audiblez - A program to convert e-books into audiobooks using
# Kokoro-82M model for high-quality text-to-speech synthesis.
# by Claudio Santini 2025 - https://claudio.uk
import torch
import spacy
import ebooklib
import soundfile
import numpy as np
2025-01-14 15:35:10 +01:00
import argparse
2025-01-14 17:45:04 +01:00
import sys
2025-01-14 15:35:10 +01:00
import time
import shutil
import subprocess
2025-01-15 09:31:50 +01:00
import re
2025-01-31 21:45:37 +01:00
from tabulate import tabulate
2025-01-14 15:35:10 +01:00
from pathlib import Path
from string import Formatter
from yaspin import yaspin
2025-01-14 15:35:10 +01:00
from bs4 import BeautifulSoup
2025-01-29 10:50:30 +01:00
from kokoro import KPipeline
2025-01-14 15:35:10 +01:00
from ebooklib import epub
2025-01-14 18:38:26 +01:00
from pydub import AudioSegment
2025-01-15 19:12:48 +01:00
from pick import pick
2025-01-22 22:17:02 +05:30
from tempfile import NamedTemporaryFile
2025-01-14 15:35:10 +01:00
2025-01-31 12:13:55 +01:00
from voices import voices, available_voices_str
2025-01-29 10:50:30 +01:00
# Sample rate (Hz) passed to soundfile.write() for every chapter WAV file.
sample_rate = 24000
def main(file_path, voice, pick_manually, speed, max_chapters=None):
    """Convert an EPUB into per-chapter WAV files and, when ffmpeg is installed, an M4B audiobook.

    Args:
        file_path: Path to the source EPUB file.
        voice: Voice name; its first character is used as the pipeline language code.
        pick_manually: When True the chapters are chosen interactively, otherwise
            they are auto-detected.
        speed: Speech speed forwarded to the synthesis pipeline.
        max_chapters: Optional cap on how many of the selected chapters are processed.
    """
    # This must be the same model that gen_audio_segments() loads.
    spacy_model = 'xx_ent_wiki_sm'
    if not spacy.util.is_package(spacy_model):
        print(f"Downloading Spacy model '{spacy_model}'...")
        spacy.cli.download(spacy_model)

    filename = Path(file_path).name
    # Derive output names from the base name instead of str.replace('.epub', ...):
    # for names such as "book.EPUB" the replace is a no-op and the "chapter file"
    # would be the source book itself.
    source = Path(filename)
    base_name = source.stem if source.suffix.lower() == '.epub' else source.name

    book = epub.read_epub(file_path)
    meta_title = book.get_metadata('DC', 'title')
    title = meta_title[0][0] if meta_title else ''
    meta_creator = book.get_metadata('DC', 'creator')
    creator = meta_creator[0][0] if meta_creator else ''

    cover_maybe = find_cover(book)
    cover_image = cover_maybe.get_content() if cover_maybe else b""
    if cover_maybe:
        print(f'Found cover image {cover_maybe.file_name} in {cover_maybe.media_type} format')

    intro = f'{title} {creator}.\n\n'
    print(intro)

    document_chapters = find_document_chapters_and_extract_texts(book)
    if pick_manually is True:
        chapters = pick_chapters(document_chapters)
    else:
        chapters = find_good_chapters(document_chapters)
    print_selected_chapters(document_chapters, chapters)
    texts = [c.extracted_text for c in chapters]

    has_ffmpeg = shutil.which('ffmpeg') is not None
    if not has_ffmpeg:
        print('\033[91m' + 'ffmpeg not found. Please install ffmpeg to create mp3 and m4b audiobook files.' + '\033[0m')

    total_chars, processed_chars = sum(map(len, texts)), 0
    print('Started at:', time.strftime('%H:%M:%S'))
    print(f'Total characters: {total_chars:,}')
    print('Total words:', len(' '.join(texts).split()))
    chars_per_sec = 500 if torch.cuda.is_available() else 50
    print(f'Estimated time remaining (assuming {chars_per_sec} chars/sec): {strfdelta((total_chars - processed_chars) / chars_per_sec)}')

    chapter_wav_files = []
    pipeline = None  # built lazily, once, the first time a chapter really needs synthesis
    for i, text in enumerate(texts, start=1):
        if max_chapters and i > max_chapters:
            break
        chapter_filename = f'{base_name}_chapter_{i}.wav'
        if Path(chapter_filename).exists():
            # A previous run already produced this chapter; reuse it in the final book.
            print(f'File for chapter {i} already exists. Skipping')
            chapter_wav_files.append(chapter_filename)
            continue
        if len(text.strip()) < 10:
            print(f'Skipping empty chapter {i}')
            continue
        if i == 1:
            # intro already ends with '.\n\n', so it is prepended as-is.
            text = intro + text
        if pipeline is None:
            pipeline = KPipeline(lang_code=voice[0])  # a for american or b for british etc.
        start_time = time.time()
        with yaspin(text=f'Reading chapter {i} ({len(text):,} characters)...', color="yellow") as spinner:
            audio_segments = gen_audio_segments(pipeline, text, voice, speed)
            if audio_segments:
                final_audio = np.concatenate(audio_segments)
                soundfile.write(chapter_filename, final_audio, sample_rate)
                chapter_wav_files.append(chapter_filename)
                end_time = time.time()
                delta_seconds = end_time - start_time
                chars_per_sec = len(text) / delta_seconds
                processed_chars += len(text)
                spinner.ok("")
                print(f'Estimated time remaining: {strfdelta((total_chars - processed_chars) / chars_per_sec)}')
                print('Chapter written to', chapter_filename)
                print(f'Chapter {i} read in {delta_seconds:.2f} seconds ({chars_per_sec:.0f} characters per second)')
                progress = processed_chars * 100 // total_chars
                print('Progress:', f'{progress}%\n')
            else:
                spinner.fail("")
                print(f'Warning: No audio generated for chapter {i}')

    if has_ffmpeg:
        create_index_file(title, creator, chapter_wav_files)
        # Pass a normalised "<base>.epub" name so the names create_m4b derives from it
        # can never coincide with the source file.
        create_m4b(chapter_wav_files, f'{base_name}.epub', cover_image)
2025-01-14 15:35:10 +01:00
2025-02-01 10:46:58 +01:00
2025-01-31 15:50:29 -05:00
def find_cover(book):
    """Return the first item of the book that looks like a cover image, or None.

    Candidates are tried in this order: items of type ITEM_COVER, items referenced
    by OPF 'cover' metadata, the item whose id is 'cover', and finally image items
    whose name contains 'cover'. A candidate is accepted only when its media type
    starts with 'image/'.
    """
    def looks_like_image(candidate):
        if candidate is None:
            return False
        return candidate.media_type.startswith('image/')

    def candidates():
        # Lazily yield candidates so later lookups only run when earlier ones fail.
        yield from book.get_items_of_type(ebooklib.ITEM_COVER)
        # https://idpf.org/forum/topic-715
        for meta in book.get_metadata('OPF', 'cover'):
            yield book.get_item_with_id(meta[1]['content'])
        yield book.get_item_with_id('cover')
        for image in book.get_items_of_type(ebooklib.ITEM_IMAGE):
            if 'cover' in image.get_name().lower():
                yield image

    return next((found for found in candidates() if looks_like_image(found)), None)
2025-01-31 12:13:55 +01:00
2025-02-01 10:46:58 +01:00
2025-01-31 21:57:21 +01:00
def print_selected_chapters(document_chapters, chapters):
    """Print a table of every document chapter, marking the ones selected for narration.

    Args:
        document_chapters: All document items of the book, each carrying `extracted_text`.
        chapters: The subset of `document_chapters` that will be read.
    """
    rows = [
        # 'x' marks a selected chapter; previously both branches were '' so the
        # "Selected" column carried no information.
        [i, c.get_name(), len(c.extracted_text), 'x' if c in chapters else '', chapter_beginning_one_liner(c)]
        for i, c in enumerate(document_chapters, start=1)
    ]
    print(tabulate(rows, headers=['#', 'Chapter', 'Text Length', 'Selected', 'First words']))
def gen_audio_segments(pipeline, text, voice, speed):
    """Synthesize `text` one sentence at a time and return the audio chunks in order.

    The text is split into sentences with spaCy's sentencizer; each sentence is
    passed to `pipeline`, and the audio element of every result it yields is collected.
    """
    sentence_splitter = spacy.load('xx_ent_wiki_sm')
    sentence_splitter.add_pipe('sentencizer')
    parsed = sentence_splitter(text)
    return [
        chunk
        for sentence in parsed.sents
        for _, _, chunk in pipeline(sentence.text, voice=voice, speed=speed, split_pattern=r'\n\n\n')
    ]
2025-01-31 14:39:32 +01:00
def find_document_chapters_and_extract_texts(book):
    """Return all ITEM_DOCUMENT items of the book, each given an `extracted_text` attribute.

    The extracted text is the stripped text of every title/p/h1-h4/li element of
    the item's body, one element per line; elements with no text are skipped.
    """
    content_tags = ['title', 'p', 'h1', 'h2', 'h3', 'h4', 'li']
    documents = [item for item in book.get_items() if item.get_type() == ebooklib.ITEM_DOCUMENT]
    for document in documents:
        soup = BeautifulSoup(document.get_body_content(), features='lxml')
        pieces = (tag.text.strip() if tag.text else "" for tag in soup.find_all(content_tags))
        document.extracted_text = ''.join(piece + '\n' for piece in pieces if piece)
    return documents
2025-01-14 15:35:10 +01:00
2025-01-15 09:31:50 +01:00
def is_chapter(c):
    """Tell whether a document item looks like a real chapter.

    True only when the extracted text is longer than 100 characters and the
    lower-cased item name contains 'chapter' or a part/split/ch/chap prefix
    followed (optionally after an underscore) by one to three digits.
    """
    lowered = c.get_name().lower()
    if len(c.extracted_text) <= 100:
        return False
    if 'chapter' in lowered:
        return True
    numbered_patterns = (
        r'part_?\d{1,3}',
        r'split_?\d{1,3}',
        r'ch_?\d{1,3}',
        r'chap_?\d{1,3}',
    )
    return any(re.search(pattern, lowered) for pattern in numbered_patterns)
2025-01-15 09:31:50 +01:00
2025-01-31 21:45:37 +01:00
def chapter_beginning_one_liner(c, chars=20):
    """Return a one-line preview of the beginning of a chapter's extracted text.

    Args:
        c: Object with an `extracted_text` string attribute.
        chars: Number of leading characters of the text to preview.

    Returns:
        The first `chars` characters, stripped, with newlines and carriage returns
        turned into spaces. '...' is appended when the text is longer than the
        preview; an empty preview stays empty.
    """
    text = c.extracted_text
    preview = text[:chars].strip().replace('\n', ' ').replace('\r', ' ')
    # The old `s + '' if len(s) > 0 else ''` was a no-op; mark truncation explicitly.
    if preview and len(text) > chars:
        return preview + '...'
    return preview
def find_good_chapters(document_chapters):
    """Pick the documents that look like chapters.

    When no document passes is_chapter(), a notice is printed and every document
    whose extracted text is longer than 10 characters is returned instead.
    """
    documents = [c for c in document_chapters if c.get_type() == ebooklib.ITEM_DOCUMENT]
    recognised = [c for c in documents if is_chapter(c)]
    if recognised:
        return recognised
    print('Not easy to recognize the chapters, defaulting to all non-empty documents.')
    return [c for c in documents if len(c.extracted_text) > 10]
2025-01-31 14:39:32 +01:00
def pick_chapters(chapters):
    """Let the user multi-select chapters interactively; return them in document order.

    Each option shows the document name, the text length and the first 50
    characters of the text. At least one chapter must be selected.
    """
    label_to_chapter = {}
    for chapter in chapters:
        label = f'{chapter.get_name()}\t({len(chapter.extracted_text)} chars)\t[{chapter_beginning_one_liner(chapter, 50)}]'
        label_to_chapter[label] = chapter

    prompt = 'Select which chapters to read in the audiobook'
    picked = pick(list(label_to_chapter.keys()), prompt, multiselect=True, min_selection_count=1)
    # Each result's first element is the chosen label; map it back to its chapter.
    picked_chapters = [label_to_chapter[entry[0]] for entry in picked]
    # Re-filter the original list so the result follows document order.
    return [chapter for chapter in chapters if chapter in picked_chapters]
2025-01-14 15:35:10 +01:00
def strfdelta(tdelta, fmt='{D:02}d {H:02}h {M:02}m {S:02}s'):
    """Format a duration in seconds using week/day/hour/minute/second fields.

    Args:
        tdelta: Duration in seconds (int or float). The fractional part is
            truncated and negative durations are clamped to zero.
        fmt: A str.format template that may use the fields W, D, H, M and S.
            Only the fields present in the template take part in the breakdown,
            so e.g. '{H}h {M}m' reports days as extra hours.

    Returns:
        The formatted duration string.
    """
    # Clamp: divmod on a negative value would give output such as "-1d 23h 59m 59s".
    remainder = max(int(tdelta), 0)
    formatter = Formatter()
    desired_fields = {field_name for _, field_name, _, _ in formatter.parse(fmt)}
    # Ordered from the largest unit to the smallest so each divmod feeds the next.
    unit_seconds = {'W': 604800, 'D': 86400, 'H': 3600, 'M': 60, 'S': 1}
    values = {}
    for field, seconds in unit_seconds.items():
        if field in desired_fields:
            values[field], remainder = divmod(remainder, seconds)
    return formatter.format(fmt, **values)
2025-01-30 09:24:21 +01:00
def create_m4b(chapter_files, filename, cover_image):
    """Merge chapter WAV files into a single M4B audiobook using pydub and ffmpeg.

    The chapters are concatenated and exported as AAC in a temporary MP4 file
    (skipped when that file already exists), which ffmpeg then remuxes together
    with the metadata in 'chapters.txt' and an optional cover image.

    Args:
        chapter_files: WAV file paths, in playback order.
        filename: Name of the source EPUB; output names are derived from it.
        cover_image: Raw cover image bytes, or an empty value for no cover.
    """
    # Strip the extension case-insensitively. With str.replace('.epub', ...) a name
    # such as "book.EPUB" was left unchanged, so the "temporary" file was the book
    # itself and got unlinked at the end.
    source = Path(filename)
    base = source.with_suffix('') if source.suffix.lower() == '.epub' else source
    tmp_filename = f'{base}.tmp.mp4'
    final_filename = f'{base}.m4b'

    if not Path(tmp_filename).exists():
        combined_audio = AudioSegment.empty()
        for wav_file in chapter_files:
            combined_audio += AudioSegment.from_wav(wav_file)
        print('Converting to Mp4...')
        combined_audio.export(tmp_filename, format="mp4", codec="aac", bitrate="64k")

    print('Creating M4B file...')
    cover_path = None
    try:
        if cover_image:
            # Write and close the file before ffmpeg opens it: an unflushed, still-open
            # temp file may be read as empty or truncated (and cannot be reopened by
            # name on Windows). delete=False keeps it on disk until we remove it below.
            with NamedTemporaryFile("wb", delete=False) as cover_image_file:
                cover_image_file.write(cover_image)
            cover_path = cover_image_file.name
            cover_image_args = ["-i", cover_path, "-map", "0:a", "-map", "2:v"]
        else:
            cover_image_args = []

        proc = subprocess.run([
            'ffmpeg',
            '-i', f'{tmp_filename}',
            '-i', 'chapters.txt',
            *cover_image_args,
            '-map', '0',
            '-map_metadata', '1',
            '-c:a', 'copy',
            '-c:v', 'copy',
            '-disposition:v', 'attached_pic',
            '-c', 'copy',
            '-f', 'mp4',
            f'{final_filename}'
        ])
    finally:
        if cover_path is not None:
            Path(cover_path).unlink(missing_ok=True)

    Path(tmp_filename).unlink()
    if proc.returncode == 0:
        print(f'{final_filename} created. Enjoy your audiobook.')
        print('Feel free to delete the intermediary .wav chapter files, the .m4b is all you need.')
    else:
        print(f'ffmpeg failed with exit code {proc.returncode}; {final_filename} was not created correctly.')
2025-01-14 15:35:10 +01:00
2025-01-23 21:44:26 +01:00
def probe_duration(file_name):
    """Return the duration of a media file, in seconds, as reported by ffprobe.

    Raises:
        subprocess.CalledProcessError: If ffprobe exits with a non-zero status.
        ValueError: If ffprobe's output cannot be parsed as a float.
    """
    command = [
        'ffprobe',
        '-i', file_name,
        '-show_entries', 'format=duration',
        '-v', 'quiet',
        '-of', 'default=noprint_wrappers=1:nokey=1',
    ]
    completed = subprocess.run(command, capture_output=True, text=True, check=True)
    return float(completed.stdout.strip())
2025-01-23 21:44:26 +01:00
2025-01-30 09:24:21 +01:00
def _escape_ffmetadata(value):
    """Backslash-escape the characters that are special inside an FFMETADATA value."""
    escaped = str(value)
    # The backslash must be handled first so the escapes added below are not doubled.
    for special in ('\\', '=', ';', '#', '\n'):
        escaped = escaped.replace(special, '\\' + special)
    return escaped


def create_index_file(title, creator, chapter_mp3_files):
    """Write 'chapters.txt', an FFMETADATA1 file with book metadata and chapter marks.

    Args:
        title: Book title, written as the `title` tag.
        creator: Book author, written as the `artist` tag.
        chapter_mp3_files: Audio files in playback order; each one becomes a
            chapter whose length is its probed duration, in milliseconds.
            Chapters are titled 'Chapter 0', 'Chapter 1', ...
    """
    # UTF-8 is written explicitly so non-ASCII titles do not depend on the
    # platform default encoding.
    with open("chapters.txt", "w", encoding="utf-8") as f:
        f.write(f";FFMETADATA1\ntitle={_escape_ffmetadata(title)}\nartist={_escape_ffmetadata(creator)}\n\n")
        start = 0
        for i, chapter_file in enumerate(chapter_mp3_files):
            end = start + int(probe_duration(chapter_file) * 1000)
            f.write(f"[CHAPTER]\nTIMEBASE=1/1000\nSTART={start}\nEND={end}\ntitle=Chapter {i}\n\n")
            start = end
2025-01-14 15:35:10 +01:00
2025-01-15 00:06:05 +01:00
def cli_main():
    """Command-line entry point: parse arguments, optionally enable CUDA, run main().

    With no arguments at all, the help text is printed to stderr and the process
    exits with status 1.
    """
    voices_str = ', '.join(voices)
    # The example must only use options the parser defines; it used to show a
    # non-existent `-l en-us` flag, which argparse rejects.
    epilog = ('example:\n' +
              ' audiblez book.epub -v af_sky\n\n' +
              'available voices:\n' +
              available_voices_str)
    default_voice = 'af_sky'
    parser = argparse.ArgumentParser(epilog=epilog, formatter_class=argparse.RawDescriptionHelpFormatter)
    parser.add_argument('epub_file_path', help='Path to the epub file')
    parser.add_argument('-v', '--voice', default=default_voice, help=f'Choose narrating voice: {voices_str}')
    parser.add_argument('-p', '--pick', default=False, help='Interactively select which chapters to read in the audiobook', action='store_true')
    parser.add_argument('-s', '--speed', default=1.0, help='Set speed from 0.5 to 2.0', type=float)
    parser.add_argument('-c', '--cuda', default=False, help='Use GPU via Cuda in Torch if available', action='store_true')

    if len(sys.argv) == 1:
        parser.print_help(sys.stderr)
        sys.exit(1)
    args = parser.parse_args()

    if args.cuda:
        if torch.cuda.is_available():
            print('CUDA GPU available')
            torch.set_default_device('cuda')
        else:
            print('CUDA GPU not available. Defaulting to CPU')

    main(args.epub_file_path, args.voice, args.pick, args.speed)
2025-01-15 00:06:05 +01:00
# Run the command-line interface only when this file is executed as a script.
if __name__ == '__main__':
    cli_main()