More work on streaming timestamps (not working yet — hitting a weird error) :(

This commit is contained in:
Fireblade2534 2025-02-12 20:34:55 +00:00
parent 91d370d97f
commit 6985f6ef99
5 changed files with 21 additions and 23 deletions

View file

@@ -12,10 +12,7 @@ response = requests.post(
"input": "http://localhost:8880/web/",
"voice": "af_heart",
"response_format": "mp3", # Supported: mp3, wav, opus, flac
"speed": 1.0,
"normalization_options": {
"normalize": True
}
"speed": 1.0
}
)

View file

@@ -291,17 +291,6 @@ class KokoroV1(BaseModelBackend):
f"Added timestamp for word '{token.text}': {start_time:.3f}s - {end_time:.3f}s"
)
# Update offset for next chunk based on pred_dur
chunk_duration = (
float(result.pred_dur.sum()) / 80
) # Convert frames to seconds
current_offset = max(
current_offset + chunk_duration, end_time
)
logger.debug(
f"Updated time offset to {current_offset:.3f}s"
)
except Exception as e:
logger.error(
f"Failed to process timestamps for chunk: {e}"

View file

@@ -8,7 +8,7 @@ import tempfile
from typing import AsyncGenerator, Dict, List, Union, Tuple
import aiofiles
from inference.base import AudioChunk
from ..inference.base import AudioChunk
import torch
from fastapi import APIRouter, Depends, Header, HTTPException, Request, Response
from fastapi.responses import FileResponse, StreamingResponse
@@ -214,15 +214,15 @@ async def create_speech(
}
# Create async generator for streaming
async def dual_output(return_json:bool=False):
async def dual_output():
try:
# Write chunks to temp file and stream
async for chunk, chunk_data in generator:
if chunk: # Skip empty chunks
await temp_writer.write(chunk)
if return_json:
yield chunk, chunk_data
else:
#if return_json:
# yield chunk, chunk_data
#else:
yield chunk
# Finalize the temp file

View file

@@ -328,10 +328,21 @@ class TTSService:
) -> Tuple[Tuple[np.ndarray,AudioChunk]]:
"""Generate complete audio for text using streaming internally."""
start_time = time.time()
chunks = []
audio_chunks = []
audio_data_chunks=[]
word_timestamps = []
start_time = time.time()
chunks = []
word_timestamps = []
try:
async for audio_stream,audio_stream_data in self.generate_audio_stream(text,voice,speed=speed,return_timestamps=return_timestamps,lang_code=lang_code):
print("common")
audio_chunks.append(audio_stream_data.audio)
audio_data_chunks.append(audio_stream_data)
print(audio_data_chunks)
"""
# Get backend and voice path
backend = self.model_manager.get_backend()
voice_name, voice_path = await self._get_voice_path(voice)
@@ -574,11 +585,12 @@ class TTSService:
[],
) # Empty timestamps for legacy backends
return audio, processing_time
"""
except Exception as e:
logger.error(f"Error in audio generation: {str(e)}")
raise
async def combine_voices(self, voices: List[str]) -> torch.Tensor:
"""Combine multiple voices.

Binary file not shown.