From ec2e678c51ca95aeea36488f0b9876e7d1e12a2c Mon Sep 17 00:00:00 2001
From: Sameh Sayed
Date: Fri, 31 Jan 2025 00:41:52 +0200
Subject: [PATCH] fix: improve audio processing by handling large text chunks
 and ensuring valid audio generation

---
 audiblez.py | 49 +++++++++++++++++++++++++++++++++++--------------
 1 file changed, 35 insertions(+), 14 deletions(-)

diff --git a/audiblez.py b/audiblez.py
index 3a2f7d4..9b2f38d 100755
--- a/audiblez.py
+++ b/audiblez.py
@@ -7,6 +7,8 @@ import sys
 import time
 import shutil
 import subprocess
+
+import numpy as np
 import soundfile
 import ebooklib
 import warnings
@@ -63,7 +65,7 @@ def main(pipeline, file_path, voice, pick_manually, speed):
     print('Started at:', time.strftime('%H:%M:%S'))
     print(f'Total characters: {total_chars:,}')
     print('Total words:', len(' '.join(texts).split()))
-    chars_per_sec = 500 if torch.cuda.is_available() else 50  # assume 50 or 500 chars per second at the beginning
+    chars_per_sec = 500 if torch.cuda.is_available() else 50
     print(f'Estimated time remaining (assuming {chars_per_sec} chars/sec): {strfdelta((total_chars - processed_chars) / chars_per_sec)}')
 
     chapter_mp3_files = []
@@ -81,24 +83,43 @@ def main(pipeline, file_path, voice, pick_manually, speed):
         if i == 1:
             text = intro + '.\n\n' + text
         start_time = time.time()
-        generator = pipeline(text, voice=voice, speed=speed)
-        for gs, ps, audio in generator:
-            soundfile.write(chapter_filename, audio, sample_rate)
-        end_time = time.time()
-        delta_seconds = end_time - start_time
-        chars_per_sec = len(text) / delta_seconds
-        processed_chars += len(text)
-        print(f'Estimated time remaining: {strfdelta((total_chars - processed_chars) / chars_per_sec)}')
-        print('Chapter written to', chapter_filename)
-        print(f'Chapter {i} read in {delta_seconds:.2f} seconds ({chars_per_sec:.0f} characters per second)')
-        progress = processed_chars * 100 // total_chars
-        print('Progress:', f'{progress}%\n')
+        audio_segments = []
+        chunk_size = 5000  # Adjust chunk size as needed
+
+        # Fixed the text processing loop
+        remaining_text = text
+        while remaining_text:
+            chunk = remaining_text[:chunk_size]
+            remaining_text = remaining_text[chunk_size:]
+
+            # Process the chunk
+            chunk_segments = []
+            for gs, ps, audio in pipeline(chunk, voice=voice, speed=speed):
+                chunk_segments.append(audio)
+
+            if chunk_segments:  # Only append if we got valid audio segments
+                audio_segments.extend(chunk_segments)
+
+        if audio_segments:  # Only concatenate if we have segments
+            final_audio = np.concatenate(audio_segments)
+            soundfile.write(chapter_filename, final_audio, sample_rate)
+            end_time = time.time()
+            delta_seconds = end_time - start_time
+            chars_per_sec = len(text) / delta_seconds
+            processed_chars += len(text)
+            print(f'Estimated time remaining: {strfdelta((total_chars - processed_chars) / chars_per_sec)}')
+            print('Chapter written to', chapter_filename)
+            print(f'Chapter {i} read in {delta_seconds:.2f} seconds ({chars_per_sec:.0f} characters per second)')
+            progress = processed_chars * 100 // total_chars
+            print('Progress:', f'{progress}%\n')
+        else:
+            print(f'Warning: No audio generated for chapter {i}')
+            chapter_mp3_files.remove(chapter_filename)
 
     if has_ffmpeg:
         create_index_file(title, by_creator, chapter_mp3_files)
         create_m4b(chapter_mp3_files, filename, cover_image)
 
-
 def extract_texts(chapters):
     texts = []
     for chapter in chapters: