diff --git a/api/src/inference/base.py b/api/src/inference/base.py
index 93fcf69..5cb84dc 100644
--- a/api/src/inference/base.py
+++ b/api/src/inference/base.py
@@ -11,7 +11,7 @@ class AudioChunk:
     def __init__(self,
                 audio: np.ndarray,
-                word_timestamps: Optional[List]=None
+                word_timestamps: Optional[List]=[]
                 ):
         self.audio=audio
         self.word_timestamps=word_timestamps
 
diff --git a/api/src/services/audio.py b/api/src/services/audio.py
index 8bc99bb..f8b765a 100644
--- a/api/src/services/audio.py
+++ b/api/src/services/audio.py
@@ -57,7 +57,6 @@ class AudioNormalizer:
         non_silent_index_start, non_silent_index_end = None,None
 
         for X in range(0,len(audio_data)):
-            #print(audio_data[X])
             if audio_data[X] > amplitude_threshold:
                 non_silent_index_start=X
                 break
@@ -149,11 +148,9 @@ class AudioService:
         if normalizer is None:
             normalizer = AudioNormalizer()
 
-        print("1")
         audio_chunk.audio = await normalizer.normalize(audio_chunk.audio)
-        print("2")
         audio_chunk = AudioService.trim_audio(audio_chunk,chunk_text,speed,is_last_chunk,normalizer)
-        print("3")
+
         # Get or create format-specific writer
         writer_key = f"{output_format}_{sample_rate}"
         if is_first_chunk or writer_key not in AudioService._writers:
@@ -161,18 +158,16 @@ class AudioService:
                 output_format, sample_rate
             )
         writer = AudioService._writers[writer_key]
-        print("4")
+
         # Write audio data first
         if len(audio_chunk.audio) > 0:
             chunk_data = writer.write_chunk(audio_chunk.audio)
 
-        print("5")
+
         # Then finalize if this is the last chunk
         if is_last_chunk:
-            print("6")
             final_data = writer.write_chunk(finalize=True)
-            print("7")
             del AudioService._writers[writer_key]
-            return final_data if final_data else b""
+            return final_data if final_data else b"", audio_chunk
 
         return chunk_data if chunk_data else b"", audio_chunk
@@ -206,8 +201,10 @@ class AudioService:
 
         start_index,end_index=normalizer.find_first_last_non_silent(audio_chunk.audio,chunk_text,speed,is_last_chunk=is_last_chunk)
         audio_chunk.audio=audio_chunk.audio[start_index:end_index]
-        for timestamp in audio_chunk.word_timestamps:
-            timestamp["start_time"]-=start_index * 24000
-            timestamp["end_time"]-=start_index * 24000
+
+        if audio_chunk.word_timestamps is not None:
+            for timestamp in audio_chunk.word_timestamps:
+                timestamp["start_time"]-=start_index / 24000
+                timestamp["end_time"]-=start_index / 24000
 
         return audio_chunk
\ No newline at end of file
diff --git a/api/src/services/tts_service.py b/api/src/services/tts_service.py
index 67e2641..0fcaac7 100644
--- a/api/src/services/tts_service.py
+++ b/api/src/services/tts_service.py
@@ -6,6 +6,7 @@ import tempfile
 import time
 from typing import AsyncGenerator, List, Optional, Tuple, Union
 
+from ..inference.base import AudioChunk
 import numpy as np
 import torch
 from kokoro import KPipeline
@@ -62,9 +63,8 @@ class TTSService:
             if not output_format:
                 yield np.array([], dtype=np.float32)
                 return
-
-            result = await AudioService.convert_audio(
-                np.array([0], dtype=np.float32),  # Dummy data for type checking
+            result, _ = await AudioService.convert_audio(
+                AudioChunk(np.array([0], dtype=np.float32)),  # Dummy data for type checking
                 24000,
                 output_format,
                 speed,
@@ -119,7 +119,7 @@ class TTSService:
                     print(chunk_data.word_timestamps)
                     yield chunk_data.audio
                 else:
-                    print("old backend")
+
                     # For legacy backends, load voice tensor
                     voice_tensor = await self._voice_manager.load_voice(
                         voice_name, device=backend.device
@@ -315,7 +315,8 @@ class TTSService:
 
         except Exception as e:
             logger.error(f"Error in phoneme audio generation: {str(e)}")
-            raise
+            raise e
+
 
     async def generate_audio(
         self,
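
Note on the trim_audio hunk above: the timestamp adjustment changes from start_index * 24000 to start_index / 24000, converting the number of trimmed samples into seconds at the hard-coded 24 kHz sample rate before shifting each word's timings. Below is a minimal standalone sketch of that conversion; it is not part of the patch, shift_timestamps is a hypothetical helper, and start_time/end_time are assumed to be in seconds, as the corrected division implies.

    SAMPLE_RATE = 24000  # Hz; matches the hard-coded 24000 in trim_audio

    def shift_timestamps(word_timestamps, start_index):
        # Shift word timings left after start_index samples of leading
        # silence are trimmed; the sample offset is divided by the sample
        # rate to express it in seconds, mirroring the patched hunk.
        offset = start_index / SAMPLE_RATE
        for ts in word_timestamps or []:  # tolerate None, like the new guard
            ts["start_time"] -= offset
            ts["end_time"] -= offset
        return word_timestamps

    # Trimming 6000 samples removes 0.25 s of leading silence:
    stamps = [{"word": "hello", "start_time": 0.75, "end_time": 1.0}]
    print(shift_timestamps(stamps, 6000))
    # [{'word': 'hello', 'start_time': 0.5, 'end_time': 0.75}]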