diff --git a/api/src/inference/kokoro_v1.py b/api/src/inference/kokoro_v1.py
index 84f4f79..3361ade 100644
--- a/api/src/inference/kokoro_v1.py
+++ b/api/src/inference/kokoro_v1.py
@@ -295,7 +295,8 @@ class KokoroV1(BaseModelBackend):
                             logger.error(
                                 f"Failed to process timestamps for chunk: {e}"
                             )
-
+
+                    yield AudioChunk(result.audio.numpy(),word_timestamps=word_timestamps)
                 else:
                     logger.warning("No audio in chunk")
 
diff --git a/api/src/routers/openai_compatible.py b/api/src/routers/openai_compatible.py
index e285234..0eaa266 100644
--- a/api/src/routers/openai_compatible.py
+++ b/api/src/routers/openai_compatible.py
@@ -128,7 +128,7 @@ async def process_voices(
 
 async def stream_audio_chunks(
     tts_service: TTSService, request: OpenAISpeechRequest, client_request: Request
-) -> AsyncGenerator[Tuple[bytes,AudioChunk], None]:
+) -> AsyncGenerator[list, None]:
     """Stream audio chunks as they're generated with client disconnect handling"""
     voice_name = await process_voices(request.voice, tts_service)
 
@@ -140,8 +140,10 @@ async def stream_audio_chunks(
             speed=request.speed,
             output_format=request.response_format,
             lang_code=request.lang_code or request.voice[0],
-            normalization_options=request.normalization_options
+            normalization_options=request.normalization_options,
+            return_timestamps=True,
         ):
+
             # Check if client is still connected
             is_disconnected = client_request.is_disconnected
             if callable(is_disconnected):
@@ -149,7 +151,8 @@
             if is_disconnected:
                 logger.info("Client disconnected, stopping audio generation")
                 break
-            yield chunk, chunk_data
+
+            yield chunk
     except Exception as e:
         logger.error(f"Error in audio streaming: {str(e)}")
         # Let the exception propagate to trigger cleanup
@@ -158,6 +161,7 @@
 
 @router.post("/audio/speech")
 async def create_speech(
+
     request: OpenAISpeechRequest,
     client_request: Request,
     x_raw_response: str = Header(None, alias="x-raw-response"),
@@ -217,7 +221,7 @@ async def create_speech(
         async def dual_output():
             try:
                 # Write chunks to temp file and stream
-                async for chunk, chunk_data in generator:
+                async for chunk in generator:
                     if chunk:  # Skip empty chunks
                         await temp_writer.write(chunk)
                         #if return_json:
diff --git a/api/src/services/tts_service.py b/api/src/services/tts_service.py
index 470523e..172a133 100644
--- a/api/src/services/tts_service.py
+++ b/api/src/services/tts_service.py
@@ -247,7 +247,7 @@ class TTSService:
         """Generate and stream audio chunks."""
         stream_normalizer = AudioNormalizer()
         chunk_index = 0
-
+        current_offset=0.0
         try:
             # Get backend
             backend = self.model_manager.get_backend()
@@ -261,7 +261,8 @@
                 logger.info(
                     f"Using lang_code '{pipeline_lang_code}' for voice '{voice_name}' in audio stream"
                 )
-
+
+
             # Process text in chunks with smart splitting
             async for chunk_text, tokens in smart_split(text,normalization_options=normalization_options):
                 try:
@@ -277,8 +278,17 @@
                         is_last=False,  # We'll update the last chunk later
                         normalizer=stream_normalizer,
                         lang_code=pipeline_lang_code,  # Pass lang_code
+                        return_timestamps=return_timestamps,
                     ):
+                        if chunk_data.word_timestamps is not None:
+                            for timestamp in chunk_data.word_timestamps:
+                                timestamp["start_time"]+=current_offset
+                                timestamp["end_time"]+=current_offset
+
+                        current_offset+=len(chunk_data.audio) / 24000
+
                         if result is not None:
+                            print(chunk_data.word_timestamps)
                             yield result,chunk_data
                             chunk_index += 1
                         else:
diff --git a/output.mp3 b/output.mp3
index 580f1b9..1ecd8a3 100644
Binary files a/output.mp3 and b/output.mp3 differ
diff --git a/peaks/output.mp3.reapeaks b/peaks/output.mp3.reapeaks
new file mode 100644
index 0000000..46e2ba3
Binary files /dev/null and b/peaks/output.mp3.reapeaks differ
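
Note on the tts_service.py hunk above: inference yields word timestamps relative to the start of each audio chunk, so the new current_offset bookkeeping rebases them onto the whole-stream timeline, advancing the offset by each chunk's duration (sample count / 24000 Hz). Below is a minimal standalone sketch of that arithmetic; Chunk and rebase_timestamps are hypothetical stand-ins, not names from the codebase, and only the offset logic mirrors the diff.

# rebase_sketch.py -- illustrative only; Chunk and rebase_timestamps are
# hypothetical names, not part of the Kokoro-FastAPI codebase.
from dataclasses import dataclass, field
from typing import Dict, List

SAMPLE_RATE = 24000  # Kokoro emits 24 kHz audio, matching the /24000 in the diff

@dataclass
class Chunk:
    audio: List[float]  # raw samples for one chunk
    word_timestamps: List[Dict] = field(default_factory=list)  # chunk-relative seconds

def rebase_timestamps(chunks: List[Chunk]) -> List[Dict]:
    """Shift per-chunk word timestamps onto the whole-stream timeline."""
    current_offset = 0.0
    merged: List[Dict] = []
    for chunk in chunks:
        for ts in chunk.word_timestamps:
            # Same in-place mutation as the diff: add the running offset
            ts["start_time"] += current_offset
            ts["end_time"] += current_offset
            merged.append(ts)
        # Advance by this chunk's duration: samples / sample rate
        current_offset += len(chunk.audio) / SAMPLE_RATE
    return merged

# Two 1-second chunks: the second chunk's word lands at 1.10-1.40 s overall.
chunks = [
    Chunk([0.0] * 24000, [{"word": "hello", "start_time": 0.05, "end_time": 0.40}]),
    Chunk([0.0] * 24000, [{"word": "world", "start_time": 0.10, "end_time": 0.40}]),
]
print(rebase_timestamps(chunks))  # world: start_time 1.10, end_time 1.40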