diff --git a/api/src/routers/development.py b/api/src/routers/development.py index 82d5c40..e1cdbab 100644 --- a/api/src/routers/development.py +++ b/api/src/routers/development.py @@ -294,6 +294,7 @@ async def create_captioned_speech( request.response_format, is_first_chunk=True, is_last_chunk=False, + trim_audio=False, ) # Convert to requested format with proper finalization diff --git a/api/src/routers/openai_compatible.py b/api/src/routers/openai_compatible.py index 73b3923..c4036d6 100644 --- a/api/src/routers/openai_compatible.py +++ b/api/src/routers/openai_compatible.py @@ -294,6 +294,7 @@ async def create_speech( request.response_format, is_first_chunk=True, is_last_chunk=False, + trim_audio=False, ) # Convert to requested format with proper finalization diff --git a/api/src/services/audio.py b/api/src/services/audio.py index 222b5b5..eb31c59 100644 --- a/api/src/services/audio.py +++ b/api/src/services/audio.py @@ -119,6 +119,7 @@ class AudioService: chunk_text: str = "", is_first_chunk: bool = True, is_last_chunk: bool = False, + trim_audio: bool = True, normalizer: AudioNormalizer = None, ) -> Tuple[AudioChunk]: """Convert audio data to specified format with streaming support @@ -147,7 +148,9 @@ normalizer = AudioNormalizer() audio_chunk.audio = normalizer.normalize(audio_chunk.audio) - audio_chunk = AudioService.trim_audio(audio_chunk,chunk_text,speed,is_last_chunk,normalizer) + + if trim_audio: + audio_chunk = AudioService.trim_audio(audio_chunk, chunk_text, speed, is_last_chunk, normalizer) # Get or create format-specific writer writer_key = f"{output_format}_{sample_rate}" diff --git a/api/src/services/tts_service.py b/api/src/services/tts_service.py index d2aaaf2..716c603 100644 --- a/api/src/services/tts_service.py +++ b/api/src/services/tts_service.py @@ -65,7 +65,7 @@ class TTSService: yield AudioChunk(np.array([], dtype=np.int16),output=b'') return chunk_data = await AudioService.convert_audio( - AudioChunk(np.array([0], 
dtype=np.float32)), # Dummy data for type checking + AudioChunk(np.array([], dtype=np.float32)), # Dummy data for type checking 24000, output_format, speed, @@ -225,6 +225,8 @@ class TTSService: return voice, combined_path else: # Single voice + if "(" in voice and ")" in voice: + voice = voice.split("(")[0].strip() path = await self._voice_manager.get_voice_path(voice) if not path: raise RuntimeError(f"Voice not found: {voice}") @@ -341,8 +343,9 @@ class TTSService: try: async for audio_stream_data in self.generate_audio_stream(text,voice,speed=speed,normalization_options=normalization_options,return_timestamps=return_timestamps,lang_code=lang_code,output_format=None): + if len(audio_stream_data.audio) > 0: + audio_data_chunks.append(audio_stream_data) - audio_data_chunks.append(audio_stream_data) combined_audio_data=AudioChunk.combine(audio_data_chunks) return combined_audio_data