More work on streaming timestamps (not working yet: weird error) :(

2025-08-31 21:59:28 +00:00 · 2025-02-12 20:34:55 +00:00 · 2025-02-12 20:34:55 +00:00 · 6985f6ef99
commit 6985f6ef99
parent 91d370d97f
5 changed files with 21 additions and 23 deletions
--- a/Test.py
+++ b/Test.py
@ -12,10 +12,7 @@ response = requests.post(
        "input": "http://localhost:8880/web/",
        "voice": "af_heart",
        "response_format": "mp3",  # Supported: mp3, wav, opus, flac
-        "speed": 1.0,
+        "speed": 1.0
        "normalization_options": {
            "normalize": True
        }
    }
 )
--- a/api/src/inference/kokoro_v1.py
+++ b/api/src/inference/kokoro_v1.py
@ -291,17 +291,6 @@ class KokoroV1(BaseModelBackend):
                                        f"Added timestamp for word '{token.text}': {start_time:.3f}s - {end_time:.3f}s"
                                    )
                                # Update offset for next chunk based on pred_dur
                                chunk_duration = (
                                    float(result.pred_dur.sum()) / 80
                                )  # Convert frames to seconds
                                current_offset = max(
                                    current_offset + chunk_duration, end_time
                                )
                                logger.debug(
                                    f"Updated time offset to {current_offset:.3f}s"
                                )
                            except Exception as e:
                                logger.error(
                                    f"Failed to process timestamps for chunk: {e}"
--- a/api/src/routers/openai_compatible.py
+++ b/api/src/routers/openai_compatible.py
@ -8,7 +8,7 @@ import tempfile
 from typing import AsyncGenerator, Dict, List, Union, Tuple
 import aiofiles
-from inference.base import AudioChunk
+from ..inference.base import AudioChunk
 import torch
 from fastapi import APIRouter, Depends, Header, HTTPException, Request, Response
 from fastapi.responses import FileResponse, StreamingResponse
@ -214,16 +214,16 @@ async def create_speech(
                }
                # Create async generator for streaming
-                async def dual_output(return_json:bool=False):
+                async def dual_output():
                    try:
                        # Write chunks to temp file and stream
                        async for chunk, chunk_data in generator:
                            if chunk:  # Skip empty chunks
                                await temp_writer.write(chunk)
-                                if return_json:
+                                #if return_json:
-                                    yield chunk, chunk_data
+                                #    yield chunk, chunk_data
-                                else:
+                                #else:
-                                    yield chunk
+                                yield chunk
                        # Finalize the temp file
                        await temp_writer.finalize()
--- a/api/src/services/tts_service.py
+++ b/api/src/services/tts_service.py
@ -327,11 +327,22 @@ class TTSService:
        lang_code: Optional[str] = None,
    ) -> Tuple[Tuple[np.ndarray,AudioChunk]]:
        """Generate complete audio for text using streaming internally."""
        start_time = time.time()
        audio_chunks = []
        audio_data_chunks=[]
        word_timestamps = []
        start_time = time.time()
        chunks = []
        word_timestamps = []
        try:
            async for audio_stream,audio_stream_data in self.generate_audio_stream(text,voice,speed=speed,return_timestamps=return_timestamps,lang_code=lang_code):
                print("common")
                audio_chunks.append(audio_stream_data.audio)
                audio_data_chunks.append(audio_stream_data)
            print(audio_data_chunks)
            """
            # Get backend and voice path
            backend = self.model_manager.get_backend()
            voice_name, voice_path = await self._get_voice_path(voice)
@ -574,10 +585,11 @@ class TTSService:
                        [],
                    )  # Empty timestamps for legacy backends
                return audio, processing_time
-
+        """
        except Exception as e:
            logger.error(f"Error in audio generation: {str(e)}")
            raise
    async def combine_voices(self, voices: List[str]) -> torch.Tensor:
        """Combine multiple voices.
--- a/output.mp3
+++ b/output.mp3