Started work on allowing streaming word level timestamps as well as transitioning the dev code so it uses a lot more from the open ai endpoint

2025-08-05 16:48:53 +00:00 · 2025-02-13 18:00:03 -05:00 · 2025-02-13 18:00:03 -05:00 · 4027768920
commit 4027768920
parent 7772dbc2e4
4 changed files with 184 additions and 11 deletions
--- a/api/src/routers/development.py
+++ b/api/src/routers/development.py
@ -7,6 +7,7 @@ from fastapi.responses import StreamingResponse, FileResponse
 from kokoro import KPipeline
 from loguru import logger
 from ..inference.base import AudioChunk
 from ..core.config import settings
 from ..services.audio import AudioNormalizer, AudioService
 from ..services.streaming_audio_writer import StreamingAudioWriter
@ -19,6 +20,7 @@ from ..structures.text_schemas import (
    PhonemeRequest,
    PhonemeResponse,
 )
 from .openai_compatible import process_voices, stream_audio_chunks
 import json
 import os
 from pathlib import Path
@ -194,6 +196,154 @@ async def create_captioned_speech(
    tts_service: TTSService = Depends(get_tts_service),
 ):
    """Generate audio with word-level timestamps using streaming approach"""
    try:
        # model_name = get_model_name(request.model)
        tts_service = await get_tts_service()
        voice_name = await process_voices(request.voice, tts_service)
        # Set content type based on format
        content_type = {
            "mp3": "audio/mpeg",
            "opus": "audio/opus",
            "aac": "audio/aac",
            "flac": "audio/flac",
            "wav": "audio/wav",
            "pcm": "audio/pcm",
        }.get(request.response_format, f"audio/{request.response_format}")
        # Check if streaming is requested (default for OpenAI client)
        if request.stream:
            # Create generator but don't start it yet
            generator = stream_audio_chunks(tts_service, request, client_request)
            # If download link requested, wrap generator with temp file writer
            if request.return_download_link:
                from ..services.temp_manager import TempFileWriter
                temp_writer = TempFileWriter(request.response_format)
                await temp_writer.__aenter__()  # Initialize temp file
                # Get download path immediately after temp file creation
                download_path = temp_writer.download_path
                # Create response headers with download path
                headers = {
                    "Content-Disposition": f"attachment; filename=speech.{request.response_format}",
                    "X-Accel-Buffering": "no",
                    "Cache-Control": "no-cache",
                    "Transfer-Encoding": "chunked",
                    "X-Download-Path": download_path,
                }
                # Create async generator for streaming
                async def dual_output():
                    try:
                        # Write chunks to temp file and stream
                        async for chunk in generator:
                            if chunk:  # Skip empty chunks
                                await temp_writer.write(chunk)
                                #if return_json:
                                #    yield chunk, chunk_data
                                #else:
                                yield chunk
                        # Finalize the temp file
                        await temp_writer.finalize()
                    except Exception as e:
                        logger.error(f"Error in dual output streaming: {e}")
                        await temp_writer.__aexit__(type(e), e, e.__traceback__)
                        raise
                    finally:
                        # Ensure temp writer is closed
                        if not temp_writer._finalized:
                            await temp_writer.__aexit__(None, None, None)
                # Stream with temp file writing
                return StreamingResponse(
                    dual_output(), media_type=content_type, headers=headers
                )
            # Standard streaming without download link
            return StreamingResponse(
                generator,
                media_type=content_type,
                headers={
                    "Content-Disposition": f"attachment; filename=speech.{request.response_format}",
                    "X-Accel-Buffering": "no",
                    "Cache-Control": "no-cache",
                    "Transfer-Encoding": "chunked",
                },
            )
        else:
            # Generate complete audio using public interface
            _, audio_data = await tts_service.generate_audio(
                text=request.input,
                voice=voice_name,
                speed=request.speed,
                lang_code=request.lang_code,
            )
            content, audio_data = await AudioService.convert_audio(
                audio_data,
                24000,
                request.response_format,
                is_first_chunk=True,
                is_last_chunk=False,
            )
            # Convert to requested format with proper finalization
            final, _ = await AudioService.convert_audio(
                AudioChunk(np.array([], dtype=np.int16)),
                24000,
                request.response_format,
                is_first_chunk=False,
                is_last_chunk=True,
            )
            output=content+final
            return Response(
                content=output,
                media_type=content_type,
                headers={
                    "Content-Disposition": f"attachment; filename=speech.{request.response_format}",
                    "Cache-Control": "no-cache",  # Prevent caching
                },
            )
    except ValueError as e:
        # Handle validation errors
        logger.warning(f"Invalid request: {str(e)}")
        raise HTTPException(
            status_code=400,
            detail={
                "error": "validation_error",
                "message": str(e),
                "type": "invalid_request_error",
            },
        )
    except RuntimeError as e:
        # Handle runtime/processing errors
        logger.error(f"Processing error: {str(e)}")
        raise HTTPException(
            status_code=500,
            detail={
                "error": "processing_error",
                "message": str(e),
                "type": "server_error",
            },
        )
    except Exception as e:
        # Handle unexpected errors
        logger.error(f"Unexpected error in speech generation: {str(e)}")
        raise HTTPException(
            status_code=500,
            detail={
                "error": "processing_error",
                "message": str(e),
                "type": "server_error",
            },
        )
    """
    try:
        # Set content type based on format
        content_type = {
@ -344,3 +494,4 @@ async def create_captioned_speech(
                "type": "server_error",
            },
        )
    """
--- a/api/src/routers/openai_compatible.py
+++ b/api/src/routers/openai_compatible.py
@ -10,12 +10,13 @@ from urllib import response
 import numpy as np
 import aiofiles
-from ..inference.base import AudioChunk
+
 import torch
 from fastapi import APIRouter, Depends, Header, HTTPException, Request, Response
 from fastapi.responses import FileResponse, StreamingResponse
 from loguru import logger
 from ..inference.base import AudioChunk
 from ..core.config import settings
 from ..services.audio import AudioService
 from ..services.tts_service import TTSService
@ -130,7 +131,7 @@ async def process_voices(
 async def stream_audio_chunks(
    tts_service: TTSService, request: OpenAISpeechRequest, client_request: Request
-) -> AsyncGenerator[list, None]:
+) -> AsyncGenerator[Tuple[Union[np.ndarray,bytes],AudioChunk], None]:
    """Stream audio chunks as they're generated with client disconnect handling"""
    voice_name = await process_voices(request.voice, tts_service)
@ -154,7 +155,7 @@ async def stream_audio_chunks(
                logger.info("Client disconnected, stopping audio generation")
                break
-            yield chunk
+            yield (chunk,chunk_data)
    except Exception as e:
        logger.error(f"Error in audio streaming: {str(e)}")
        # Let the exception propagate to trigger cleanup
@ -223,7 +224,7 @@ async def create_speech(
                async def dual_output():
                    try:
                        # Write chunks to temp file and stream
-                        async for chunk in generator:
+                        async for chunk,chunk_data in generator:
                            if chunk:  # Skip empty chunks
                                await temp_writer.write(chunk)
                                #if return_json:
@ -247,9 +248,19 @@ async def create_speech(
                    dual_output(), media_type=content_type, headers=headers
                )
            async def single_output():
                try:
                    # Stream chunks
                    async for chunk,chunk_data in generator:
                        if chunk:  # Skip empty chunks
                            yield chunk
                except Exception as e:
                    logger.error(f"Error in single output streaming: {e}")
                    raise
            # Standard streaming without download link
            return StreamingResponse(
-                generator,
+                single_output(),
                media_type=content_type,
                headers={
                    "Content-Disposition": f"attachment; filename=speech.{request.response_format}",
--- a/api/src/services/tts_service.py
+++ b/api/src/services/tts_service.py
@ -332,24 +332,23 @@ class TTSService:
        text: str,
        voice: str,
        speed: float = 1.0,
-        return_timestamps: bool = True,
+        return_timestamps: bool = False,
        lang_code: Optional[str] = None,
    ) -> Tuple[Tuple[np.ndarray,AudioChunk]]:
        """Generate complete audio for text using streaming internally."""
        start_time = time.time()
        audio_chunks = []
        audio_data_chunks=[]
        try:
-            async for audio_stream,audio_stream_data in self.generate_audio_stream(text,voice,speed=speed,return_timestamps=return_timestamps,lang_code=lang_code,output_format=None):
+            async for _,audio_stream_data in self.generate_audio_stream(text,voice,speed=speed,return_timestamps=return_timestamps,lang_code=lang_code,output_format=None):
-                audio_chunks.append(audio_stream_data.audio)
+
                audio_data_chunks.append(audio_stream_data)
-            combined_audio=np.concatenate(audio_chunks,dtype=np.int16)
+
            combined_audio_data=AudioChunk.combine(audio_data_chunks)
-            return combined_audio,combined_audio_data
+            return combined_audio_data.audio,combined_audio_data
            """
            # Get backend and voice path
            backend = self.model_manager.get_backend()
--- a/api/src/structures/schemas.py
+++ b/api/src/structures/schemas.py
@ -106,11 +106,23 @@ class CaptionedSpeechRequest(BaseModel):
        le=4.0,
        description="The speed of the generated audio. Select a value from 0.25 to 4.0.",
    )
    stream: bool = Field(
        default=True,  # Default to streaming for OpenAI compatibility
        description="If true (default), audio will be streamed as it's generated. Each chunk will be a complete sentence.",
    )
    return_timestamps: bool = Field(
        default=True,
        description="If true (default), returns word-level timestamps in the response",
    )
    return_download_link: bool = Field(
        default=False,
        description="If true, returns a download link in X-Download-Path header after streaming completes",
    )
    lang_code: Optional[str] = Field(
        default=None,
        description="Optional language code to use for text processing. If not provided, will use first letter of voice name.",
    )
    normalization_options: Optional[NormalizationOptions] = Field(
        default= NormalizationOptions(),
        description= "Options for the normalization system"
    )