More work on timestamps (Does not maintain accuracy over multiple chunks)

Fireblade 2025-02-12 21:36:35 -05:00
parent 6985f6ef99
commit 5b20602b8e
5 changed files with 22 additions and 7 deletions

View file

@@ -295,7 +295,8 @@ class KokoroV1(BaseModelBackend):
logger.error(
f"Failed to process timestamps for chunk: {e}"
)
yield AudioChunk(result.audio.numpy(),word_timestamps=word_timestamps)
else:
logger.warning("No audio in chunk")
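
For reference, the backend hunk above now attaches per-word timing data to each yielded chunk. A minimal sketch of such a chunk container, assuming only the `audio` and `word_timestamps` fields visible in this diff plus the `start_time`/`end_time` keys used by the offset logic later in this commit:

```python
# Hedged sketch of the AudioChunk container yielded above; only `audio` and
# `word_timestamps` appear in the diff, everything else here is an assumption.
from dataclasses import dataclass
from typing import Optional

import numpy as np


@dataclass
class AudioChunk:
    audio: np.ndarray                       # samples for this chunk (24 kHz assumed, per the divisor later in this commit)
    word_timestamps: Optional[list] = None  # e.g. [{"word": "...", "start_time": 0.0, "end_time": 0.2}, ...]
```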

View file

@@ -128,7 +128,7 @@ async def process_voices(
async def stream_audio_chunks(
tts_service: TTSService, request: OpenAISpeechRequest, client_request: Request
) -> AsyncGenerator[Tuple[bytes,AudioChunk], None]:
) -> AsyncGenerator[list, None]:
"""Stream audio chunks as they're generated with client disconnect handling"""
voice_name = await process_voices(request.voice, tts_service)
@@ -140,8 +140,10 @@ async def stream_audio_chunks(
speed=request.speed,
output_format=request.response_format,
lang_code=request.lang_code or request.voice[0],
normalization_options=request.normalization_options
normalization_options=request.normalization_options,
return_timestamps=True,
):
# Check if client is still connected
is_disconnected = client_request.is_disconnected
if callable(is_disconnected):
@@ -149,7 +151,8 @@ async def stream_audio_chunks(
if is_disconnected:
logger.info("Client disconnected, stopping audio generation")
break
yield chunk, chunk_data
yield chunk
except Exception as e:
logger.error(f"Error in audio streaming: {str(e)}")
# Let the exception propagate to trigger cleanup
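
The streaming hunks above poll the client connection before yielding each chunk. A self-contained sketch of that disconnect check, assuming a Starlette/FastAPI `Request` whose `is_disconnected()` coroutine reports the client state (the `callable()` guard mirrors the diff and also tolerates a plain boolean, e.g. in tests):

```python
# Hedged sketch of the per-chunk disconnect check used in stream_audio_chunks.
from typing import AsyncGenerator

from fastapi import Request
from loguru import logger


async def stream_until_disconnect(
    client_request: Request,
    chunks: AsyncGenerator[bytes, None],
) -> AsyncGenerator[bytes, None]:
    async for chunk in chunks:
        # Starlette exposes Request.is_disconnected() as a coroutine; the
        # callable() guard lets a pre-computed bool pass through unchanged.
        is_disconnected = client_request.is_disconnected
        if callable(is_disconnected):
            is_disconnected = await is_disconnected()
        if is_disconnected:
            logger.info("Client disconnected, stopping audio generation")
            break
        yield chunk
```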
@@ -158,6 +161,7 @@ async def stream_audio_chunks(
@router.post("/audio/speech")
async def create_speech(
request: OpenAISpeechRequest,
client_request: Request,
x_raw_response: str = Header(None, alias="x-raw-response"),
@@ -217,7 +221,7 @@ async def create_speech(
async def dual_output():
try:
# Write chunks to temp file and stream
async for chunk, chunk_data in generator:
async for chunk in generator:
if chunk: # Skip empty chunks
await temp_writer.write(chunk)
#if return_json:
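
`dual_output` above both persists and streams each generated chunk. A rough sketch of that pattern, assuming `temp_writer` exposes an async `write()` as in the hunk; finalization and the commented-out JSON path are omitted:

```python
# Hedged sketch of the dual-output loop in create_speech: write each chunk to a
# temp file for later download while also streaming it to the client.
from typing import AsyncGenerator


async def dual_output(
    generator: AsyncGenerator[bytes, None],
    temp_writer,  # assumed to expose `async def write(data: bytes)`
) -> AsyncGenerator[bytes, None]:
    async for chunk in generator:
        if chunk:                            # skip empty chunks
            await temp_writer.write(chunk)   # persist a copy on disk
            yield chunk                      # and stream it to the client
```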

View file

@@ -247,7 +247,7 @@ class TTSService:
"""Generate and stream audio chunks."""
stream_normalizer = AudioNormalizer()
chunk_index = 0
current_offset=0.0
try:
# Get backend
backend = self.model_manager.get_backend()
@@ -261,7 +261,8 @@
logger.info(
f"Using lang_code '{pipeline_lang_code}' for voice '{voice_name}' in audio stream"
)
# Process text in chunks with smart splitting
async for chunk_text, tokens in smart_split(text,normalization_options=normalization_options):
try:
@@ -277,8 +278,17 @@
is_last=False, # We'll update the last chunk later
normalizer=stream_normalizer,
lang_code=pipeline_lang_code, # Pass lang_code
return_timestamps=return_timestamps,
):
if chunk_data.word_timestamps is not None:
for timestamp in chunk_data.word_timestamps:
timestamp["start_time"]+=current_offset
timestamp["end_time"]+=current_offset
current_offset+=len(chunk_data.audio) / 24000
if result is not None:
print(chunk_data.word_timestamps)
yield result,chunk_data
chunk_index += 1
else:
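
The hunk above is the core of this commit: each chunk's word timestamps are chunk-relative, so they are shifted by a running `current_offset`, which then advances by the chunk's duration in seconds (`len(chunk_data.audio) / 24000`, i.e. samples over the 24 kHz sample rate). A compact sketch of that bookkeeping:

```python
# Hedged sketch of the cross-chunk timestamp offsetting added in this hunk.
SAMPLE_RATE = 24000  # matches the 24000 divisor used above


def shift_word_timestamps(word_timestamps, audio_len_samples: int,
                          current_offset: float) -> float:
    """Shift chunk-relative word timestamps to stream-absolute times.

    Returns the offset to use for the next chunk.
    """
    for ts in word_timestamps or []:
        ts["start_time"] += current_offset
        ts["end_time"] += current_offset
    return current_offset + audio_len_samples / SAMPLE_RATE
```

As the commit message warns, accuracy is not maintained over multiple chunks; one hedged guess is that the offset advances by the raw chunk length while the audio actually emitted may be trimmed or padded downstream, so the running offset can drift from the true playback position.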

Binary file not shown.

BIN  peaks/output.mp3.reapeaks (Normal file)

Binary file not shown.