fix: improve audio processing by handling large text chunks and ensuring valid audio generation

Sameh Sayed 2025-01-31 00:41:52 +02:00
parent acea1ac31c
commit ec2e678c51


@@ -7,6 +7,8 @@ import sys
 import time
 import shutil
 import subprocess
+import numpy as np
 import soundfile
 import ebooklib
 import warnings
@@ -63,7 +65,7 @@ def main(pipeline, file_path, voice, pick_manually, speed):
     print('Started at:', time.strftime('%H:%M:%S'))
     print(f'Total characters: {total_chars:,}')
     print('Total words:', len(' '.join(texts).split()))
-    chars_per_sec = 500 if torch.cuda.is_available() else 50 # assume 50 or 500 chars per second at the beginning
+    chars_per_sec = 500 if torch.cuda.is_available() else 50
     print(f'Estimated time remaining (assuming {chars_per_sec} chars/sec): {strfdelta((total_chars - processed_chars) / chars_per_sec)}')
     chapter_mp3_files = []
@@ -81,24 +83,43 @@ def main(pipeline, file_path, voice, pick_manually, speed):
         if i == 1:
             text = intro + '.\n\n' + text
         start_time = time.time()
-        generator = pipeline(text, voice=voice, speed=speed)
-        for gs, ps, audio in generator:
-            soundfile.write(chapter_filename, audio, sample_rate)
-        end_time = time.time()
-        delta_seconds = end_time - start_time
-        chars_per_sec = len(text) / delta_seconds
-        processed_chars += len(text)
-        print(f'Estimated time remaining: {strfdelta((total_chars - processed_chars) / chars_per_sec)}')
-        print('Chapter written to', chapter_filename)
-        print(f'Chapter {i} read in {delta_seconds:.2f} seconds ({chars_per_sec:.0f} characters per second)')
-        progress = processed_chars * 100 // total_chars
-        print('Progress:', f'{progress}%\n')
+        audio_segments = []
+        chunk_size = 5000 # Adjust chunk size as needed
+
+        # Fixed the text processing loop
+        remaining_text = text
+        while remaining_text:
+            chunk = remaining_text[:chunk_size]
+            remaining_text = remaining_text[chunk_size:]
+
+            # Process the chunk
+            chunk_segments = []
+            for gs, ps, audio in pipeline(chunk, voice=voice, speed=speed):
+                chunk_segments.append(audio)
+            if chunk_segments: # Only append if we got valid audio segments
+                audio_segments.extend(chunk_segments)
+
+        if audio_segments: # Only concatenate if we have segments
+            final_audio = np.concatenate(audio_segments)
+            soundfile.write(chapter_filename, final_audio, sample_rate)
+            end_time = time.time()
+            delta_seconds = end_time - start_time
+            chars_per_sec = len(text) / delta_seconds
+            processed_chars += len(text)
+            print(f'Estimated time remaining: {strfdelta((total_chars - processed_chars) / chars_per_sec)}')
+            print('Chapter written to', chapter_filename)
+            print(f'Chapter {i} read in {delta_seconds:.2f} seconds ({chars_per_sec:.0f} characters per second)')
+            progress = processed_chars * 100 // total_chars
+            print('Progress:', f'{progress}%\n')
+        else:
+            print(f'Warning: No audio generated for chapter {i}')
+            chapter_mp3_files.remove(chapter_filename)
     if has_ffmpeg:
         create_index_file(title, by_creator, chapter_mp3_files)
         create_m4b(chapter_mp3_files, filename, cover_image)

 def extract_texts(chapters):
     texts = []
     for chapter in chapters:
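
For reference, the per-chapter flow introduced above can also be read as a standalone helper. This is a minimal sketch, not code from the repository: the name synthesize_in_chunks and its parameter list are hypothetical, and it assumes the pipeline yields (gs, ps, audio) tuples whose audio is a 1-D NumPy array at sample_rate, as the loop in the diff does.

import numpy as np
import soundfile


def synthesize_in_chunks(pipeline, text, voice, speed, out_path, sample_rate, chunk_size=5000):
    # Slice the chapter text into fixed-size chunks so very long chapters are
    # never passed to the pipeline in a single call (the pattern this commit adds).
    audio_segments = []
    remaining_text = text
    while remaining_text:
        chunk = remaining_text[:chunk_size]
        remaining_text = remaining_text[chunk_size:]
        for gs, ps, audio in pipeline(chunk, voice=voice, speed=speed):
            audio_segments.append(audio)
    if not audio_segments:
        return False  # nothing was generated; the caller can skip this chapter
    # Concatenate all chunk audio and write one file for the whole chapter.
    soundfile.write(out_path, np.concatenate(audio_segments), sample_rate)
    return True

A caller would check the return value and drop the chapter from chapter_mp3_files when it is False, which is what the else branch in the diff does for chapters that produce no audio.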