diff --git a/.gitignore b/.gitignore index 872316b..9dc7a0f 100644 --- a/.gitignore +++ b/.gitignore @@ -8,3 +8,4 @@ epub *.json *.onnx dist +.venv diff --git a/README.md b/README.md index 5102c88..4bb2509 100644 --- a/README.md +++ b/README.md @@ -33,14 +33,13 @@ audiblez book.epub -l en-gb -v af_sky It will first create a bunch of `book_chapter_1.wav`, `book_chapter_2.wav`, etc. files in the same directory, and at the end it will produce a `book.m4b` file with the whole book you can listen to with VLC or any - audiobook player. +audiobook player. It will only produce the `.m4b` file if you have `ffmpeg` installed on your machine. ## Supported Languages Use `-l` option to specify the language, available language codes are: 🇺🇸 `en-us`, 🇬🇧 `en-gb`, 🇫🇷 `fr-fr`, 🇯🇵 `ja`, 🇰🇷 `kr` and 🇨🇳 `cmn`. - ## Speed By default the audio is generated using a normal speed, but you can make it up to twice slower or faster by specifying a speed argument between 0.5 and 2.0: @@ -53,6 +52,40 @@ Use `-v` option to specify the voice: available voices are `af`, `af_bella`, `af_nicole`, `af_sarah`, `af_sky`, `am_adam`, `am_michael`, `bf_emma`, `bf_isabella`, `bm_george`, `bm_lewis`. You can try them here: [https://huggingface.co/spaces/hexgrad/Kokoro-TTS](https://huggingface.co/spaces/hexgrad/Kokoro-TTS) + +## How to run on GPU +By default audiblez runs on CPU. If you want to use a GPU for faster performance, install the GPU-enabled ONNX Runtime and specify a runtime provider with the `--providers` flag. By default, the CPU-enabled ONNX Runtime is installed. The GPU runtime must be installed manually. + +```bash +pip install onnxruntime-gpu +``` + +To specify ONNX providers, such as using an NVIDIA GPU, use the `--providers` flag. 
For example: + +```bash +audiblez book.epub -l en-gb -v af_sky --providers CUDAExecutionProvider +``` + +To see the list of available providers on your system, run the following: + +```bash +audiblez --help +``` + +or + +```bash +python -c "import onnxruntime as ort; print(ort.get_available_providers())" +``` + +This will display the ONNX providers that can be used, such as `CUDAExecutionProvider` for NVIDIA GPUs or `CPUExecutionProvider` for CPU-only execution. + +You can specify a provider hierarchy by listing multiple providers separated by spaces, in order of preference. + +```bash +audiblez book.epub -l en-gb -v af_sky --providers CUDAExecutionProvider CPUExecutionProvider +``` + ## Author by [Claudio Santini](https://claudio.uk) in 2025, distributed under MIT licence. diff --git a/audiblez.py b/audiblez.py index f31ea14..edbe0a1 100755 --- a/audiblez.py +++ b/audiblez.py @@ -15,18 +15,39 @@ import re from pathlib import Path from string import Formatter from bs4 import BeautifulSoup +from kokoro_onnx import config from kokoro_onnx import Kokoro from ebooklib import epub from pydub import AudioSegment from pick import pick +import onnxruntime as ort +from tempfile import NamedTemporaryFile + +config.MAX_PHONEME_LENGTH = 128 -def main(kokoro, file_path, lang, voice, pick_manually, speed): +def main(kokoro, file_path, lang, voice, pick_manually, speed, providers): + # Set ONNX providers if specified + if providers: + available_providers = ort.get_available_providers() + invalid_providers = [p for p in providers if p not in available_providers] + if invalid_providers: + print(f"Invalid ONNX providers: {', '.join(invalid_providers)}") + print(f"Available providers: {', '.join(available_providers)}") + sys.exit(1) + kokoro.sess.set_providers(providers) + print(f"Using ONNX providers: {', '.join(providers)}") filename = Path(file_path).name with warnings.catch_warnings(): book = epub.read_epub(file_path) title = book.get_metadata('DC', 'title')[0][0] creator = book.get_metadata('DC', 
'creator')[0][0] + + cover_maybe = [c for c in book.get_items() if c.get_type() == ebooklib.ITEM_COVER] + cover_image = cover_maybe[0].get_content() if cover_maybe else b"" + if cover_maybe: + print(f'Found cover image {cover_maybe[0].file_name} in {cover_maybe[0].media_type} format') + intro = f'{title} by {creator}' print(intro) print('Found Chapters:', [c.get_name() for c in book.get_items() if c.get_type() == ebooklib.ITEM_DOCUMENT]) @@ -36,18 +57,22 @@ def main(kokoro, file_path, lang, voice, pick_manually, speed): chapters = find_chapters(book) print('Selected chapters:', [c.get_name() for c in chapters]) texts = extract_texts(chapters) + has_ffmpeg = shutil.which('ffmpeg') is not None if not has_ffmpeg: print('\033[91m' + 'ffmpeg not found. Please install ffmpeg to create mp3 and m4b audiobook files.' + '\033[0m') - total_chars = sum([len(t) for t in texts]) + + total_chars, processed_chars = sum(map(len, texts)), 0 print('Started at:', time.strftime('%H:%M:%S')) print(f'Total characters: {total_chars:,}') - print('Total words:', len(' '.join(texts).split(' '))) + print('Total words:', len(' '.join(texts).split())) - i = 1 chapter_mp3_files = [] - for text in texts: - if len(text) == 0: + durations = {} + + for i, text in enumerate(texts, start=1): + if len(text.strip()) < 10: + print(f'Skipping empty chapter {i}') continue chapter_filename = filename.replace('.epub', f'_chapter_{i}.wav') chapter_mp3_files.append(chapter_filename) @@ -60,25 +85,30 @@ def main(kokoro, file_path, lang, voice, pick_manually, speed): i += 1 chapter_mp3_files.remove(chapter_filename) continue + print(f'Reading chapter {i} ({len(text):,} characters)...') if i == 1: text = intro + '.\n\n' + text + start_time = time.time() samples, sample_rate = kokoro.create(text, voice=voice, speed=speed, lang=lang) sf.write(f'{chapter_filename}', samples, sample_rate) + durations[chapter_filename] = len(samples)/sample_rate end_time = time.time() delta_seconds = end_time - start_time 
chars_per_sec = len(text) / delta_seconds - remaining_chars = sum([len(t) for t in texts[i - 1:]]) + processed_chars += len(text) + remaining_chars = total_chars - processed_chars remaining_time = remaining_chars / chars_per_sec print(f'Estimated time remaining: {strfdelta(remaining_time)}') print('Chapter written to', chapter_filename) print(f'Chapter {i} read in {delta_seconds:.2f} seconds ({chars_per_sec:.0f} characters per second)') - progress = int((total_chars - remaining_chars) / total_chars * 100) + progress = processed_chars * 100 // total_chars print('Progress:', f'{progress}%') - i += 1 + if has_ffmpeg: - create_m4b(chapter_mp3_files, filename, title, creator) + create_index_file(title, creator, chapter_mp3_files, durations) + create_m4b(chapter_mp3_files, filename, title, creator, cover_image) def extract_texts(chapters): @@ -98,17 +128,12 @@ def extract_texts(chapters): def is_chapter(c): name = c.get_name().lower() - part = r"part\d{1,3}" - if re.search(part, name): - return True - ch = r"ch\d{1,3}" - if re.search(ch, name): - return True - chap = r"chap\d{1,3}" - if re.search(chap, name): - return True - if 'chapter' in name: - return True + return bool( + 'chapter' in name.lower() + or re.search(r'part\d{1,3}', name) + or re.search(r'ch\d{1,3}', name) + or re.search(r'chap\d{1,3}', name) + ) def find_chapters(book, verbose=False): @@ -146,8 +171,8 @@ def strfdelta(tdelta, fmt='{D:02}d {H:02}h {M:02}m {S:02}s'): return f.format(fmt, **values) -def create_m4b(chapter_files, filename, title, author): - tmp_filename = filename.replace('.epub', '.tmp.m4a') +def create_m4b(chapter_files, filename, title, author, cover_image): + tmp_filename = filename.replace('.epub', '.tmp.mp4') if not Path(tmp_filename).exists(): combined_audio = AudioSegment.empty() for wav_file in chapter_files: @@ -157,10 +182,28 @@ def create_m4b(chapter_files, filename, title, author): combined_audio.export(tmp_filename, format="mp4", codec="aac", bitrate="64k") final_filename = 
filename.replace('.epub', '.m4b') print('Creating M4B file...') + + if cover_image: + cover_image_file = NamedTemporaryFile("wb") + cover_image_file.write(cover_image) + cover_image_args = ["-i", cover_image_file.name, "-map", "0:a", "-map", "1:v"] + else: + cover_image_args = [] + proc = subprocess.run([ - 'ffmpeg', '-i', f'{tmp_filename}', '-c', 'copy', '-f', 'mp4', - '-metadata', f'title={title}', - '-metadata', f'author={author}', + 'ffmpeg', + '-i', f'{tmp_filename}', + '-i', 'chapters.txt', + #'-map', '0', + #'-map_metadata', '1', + *cover_image_args, + '-c:a', 'copy', + '-c:v', 'copy', + '-disposition:v', 'attached_pic', + '-metadata:s:v', f'title={title}', + '-metadata', f'artist={author}', + '-c', 'copy', + '-f', 'mp4', f'{final_filename}' ]) Path(tmp_filename).unlink() @@ -168,19 +211,45 @@ def create_m4b(chapter_files, filename, title, author): print(f'{final_filename} created. Enjoy your audiobook.') print('Feel free to delete the intermediary .wav chapter files, the .m4b is all you need.') +def probe_duration(file_name): + args = ['ffprobe', '-i', file_name, '-show_entries', 'format=duration', '-v', 'quiet', '-of', 'default=noprint_wrappers=1:nokey=1'] + proc = subprocess.run(args, capture_output=True, text=True, check=True) + return float(proc.stdout.strip()) + +def create_index_file(title, creator, chapter_mp3_files, durations): + with open("chapters.txt", "w") as f: + f.write(f";FFMETADATA1\ntitle={title}\nartist={creator}\n\n") + start = 0 + i = 0 + for c in chapter_mp3_files: + if c not in durations: + durations[c] = probe_duration(c) + end = start + (int)(durations[c] * 1000) + f.write(f"[CHAPTER]\nTIMEBASE=1/1000\nSTART={start}\nEND={end}\ntitle=Chapter {i}\n\n") + i += 1 + start = end + def cli_main(): - if not Path('kokoro-v0_19.onnx').exists() or not Path('voices.json').exists(): + MODEL_NAME = 'kokoro-v0_19.onnx' + CUDA_PROVIDER = "CUDAExecutionProvider" + VOICES = 'voices.json' + if not Path(MODEL_NAME).exists() or not 
Path(VOICES).exists(): print('Error: kokoro-v0_19.onnx and voices.json must be in the current directory. Please download them with:') print('wget https://github.com/thewh1teagle/kokoro-onnx/releases/download/model-files/kokoro-v0_19.onnx') print('wget https://github.com/thewh1teagle/kokoro-onnx/releases/download/model-files/voices.json') sys.exit(1) - kokoro = Kokoro('kokoro-v0_19.onnx', 'voices.json') + kokoro = Kokoro(MODEL_NAME, VOICES) voices = list(kokoro.get_voices()) voices_str = ', '.join(voices) epilog = 'example:\n' + \ ' audiblez book.epub -l en-us -v af_sky' default_voice = 'af_sky' if 'af_sky' in voices else voices[0] + + # Get available ONNX providers + available_providers = ort.get_available_providers() + providers_help = f"Available ONNX providers: {', '.join(available_providers)}" + parser = argparse.ArgumentParser(epilog=epilog, formatter_class=argparse.RawDescriptionHelpFormatter) parser.add_argument('epub_file_path', help='Path to the epub file') parser.add_argument('-l', '--lang', default='en-gb', help='Language code: en-gb, en-us, fr-fr, ja, ko, cmn') @@ -188,11 +257,13 @@ def cli_main(): parser.add_argument('-p', '--pick', default=False, help=f'Interactively select which chapters to read in the audiobook', action='store_true') parser.add_argument('-s', '--speed', default=1.0, help=f'Set speed from 0.5 to 2.0', type=float) + parser.add_argument('--providers', nargs='+', metavar='PROVIDER', help=f"Specify ONNX providers. 
{providers_help}") + if len(sys.argv) == 1: parser.print_help(sys.stderr) sys.exit(1) args = parser.parse_args() - main(kokoro, args.epub_file_path, args.lang, args.voice, args.pick, args.speed) + main(kokoro, args.epub_file_path, args.lang, args.voice, args.pick, args.speed, args.providers) if __name__ == '__main__': diff --git a/pyproject.toml b/pyproject.toml index b15a37e..cb72ab9 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "audiblez" -version = "0.1.12" +version = "0.2.0" description = "Generate audiobooks from e-books (epub to wav/m4b)" authors = [ { name = "Claudio Santini", email = "hireclaudio@gmail.com" }