diff --git a/api/src/routers/development.py b/api/src/routers/development.py
index 20c2206..e02c9b5 100644
--- a/api/src/routers/development.py
+++ b/api/src/routers/development.py
@@ -7,6 +7,7 @@ from fastapi.responses import StreamingResponse, FileResponse
 from kokoro import KPipeline
 from loguru import logger
 
+from ..inference.base import AudioChunk
 from ..core.config import settings
 from ..services.audio import AudioNormalizer, AudioService
 from ..services.streaming_audio_writer import StreamingAudioWriter
@@ -19,6 +20,7 @@ from ..structures.text_schemas import (
     PhonemeRequest,
     PhonemeResponse,
 )
+from .openai_compatible import process_voices, stream_audio_chunks
 import json
 import os
 from pathlib import Path
@@ -194,6 +196,164 @@ async def create_captioned_speech(
     tts_service: TTSService = Depends(get_tts_service),
 ):
     """Generate audio with word-level timestamps using streaming approach"""
+
+    try:
+        # model_name = get_model_name(request.model)
+        tts_service = await get_tts_service()
+        voice_name = await process_voices(request.voice, tts_service)
+
+        # Set content type based on format
+        content_type = {
+            "mp3": "audio/mpeg",
+            "opus": "audio/opus",
+            "aac": "audio/aac",
+            "flac": "audio/flac",
+            "wav": "audio/wav",
+            "pcm": "audio/pcm",
+        }.get(request.response_format, f"audio/{request.response_format}")
+
+        # Check if streaming is requested (default for OpenAI client)
+        if request.stream:
+            # Create generator but don't start it yet
+            generator = stream_audio_chunks(tts_service, request, client_request)
+
+            # If download link requested, wrap generator with temp file writer
+            if request.return_download_link:
+                from ..services.temp_manager import TempFileWriter
+
+                temp_writer = TempFileWriter(request.response_format)
+                await temp_writer.__aenter__()  # Initialize temp file
+
+                # Get download path immediately after temp file creation
+                download_path = temp_writer.download_path
+
+                # Create response headers with download path
+                headers = {
+                    "Content-Disposition": f"attachment; filename=speech.{request.response_format}",
+                    "X-Accel-Buffering": "no",
+                    "Cache-Control": "no-cache",
+                    "Transfer-Encoding": "chunked",
+                    "X-Download-Path": download_path,
+                }
+
+                # Create async generator for streaming
+                async def dual_output():
+                    try:
+                        # Write chunks to temp file and stream; the generator
+                        # yields (bytes, AudioChunk) pairs, so unpack them
+                        async for chunk, chunk_data in generator:
+                            if chunk:  # Skip empty chunks
+                                await temp_writer.write(chunk)
+                                #if return_json:
+                                #    yield chunk, chunk_data
+                                #else:
+                                yield chunk
+
+                        # Finalize the temp file
+                        await temp_writer.finalize()
+                    except Exception as e:
+                        logger.error(f"Error in dual output streaming: {e}")
+                        await temp_writer.__aexit__(type(e), e, e.__traceback__)
+                        raise
+                    finally:
+                        # Ensure temp writer is closed
+                        if not temp_writer._finalized:
+                            await temp_writer.__aexit__(None, None, None)
+
+                # Stream with temp file writing
+                return StreamingResponse(
+                    dual_output(), media_type=content_type, headers=headers
+                )
+
+            async def single_output():
+                try:
+                    # Stream audio bytes only, dropping the timing metadata
+                    async for chunk, chunk_data in generator:
+                        if chunk:  # Skip empty chunks
+                            yield chunk
+                except Exception as e:
+                    logger.error(f"Error in single output streaming: {e}")
+                    raise
+
+            # Standard streaming without download link
+            return StreamingResponse(
+                single_output(),
+                media_type=content_type,
+                headers={
+                    "Content-Disposition": f"attachment; filename=speech.{request.response_format}",
+                    "X-Accel-Buffering": "no",
+                    "Cache-Control": "no-cache",
+                    "Transfer-Encoding": "chunked",
+                },
+            )
+        else:
+            # Generate complete audio using public interface
+            _, audio_data = await tts_service.generate_audio(
+                text=request.input,
+                voice=voice_name,
+                speed=request.speed,
+                lang_code=request.lang_code,
+            )
+
+            content, audio_data = await AudioService.convert_audio(
+                audio_data,
+                24000,
+                request.response_format,
+                is_first_chunk=True,
+                is_last_chunk=False,
+            )
+
+            # Convert to requested format with proper finalization
+            final, _ = await AudioService.convert_audio(
+                AudioChunk(np.array([], dtype=np.int16)),
+                24000,
+                request.response_format,
+                is_first_chunk=False,
+                is_last_chunk=True,
+            )
+            output = content + final
+
+            return Response(
+                content=output,
+                media_type=content_type,
+                headers={
+                    "Content-Disposition": f"attachment; filename=speech.{request.response_format}",
+                    "Cache-Control": "no-cache",  # Prevent caching
+                },
+            )
+
+    except ValueError as e:
+        # Handle validation errors
+        logger.warning(f"Invalid request: {str(e)}")
+        raise HTTPException(
+            status_code=400,
+            detail={
+                "error": "validation_error",
+                "message": str(e),
+                "type": "invalid_request_error",
+            },
+        )
+    except RuntimeError as e:
+        # Handle runtime/processing errors
+        logger.error(f"Processing error: {str(e)}")
+        raise HTTPException(
+            status_code=500,
+            detail={
+                "error": "processing_error",
+                "message": str(e),
+                "type": "server_error",
+            },
+        )
+    except Exception as e:
+        # Handle unexpected errors
+        logger.error(f"Unexpected error in speech generation: {str(e)}")
+        raise HTTPException(
+            status_code=500,
+            detail={
+                "error": "processing_error",
+                "message": str(e),
+                "type": "server_error",
+            },
+        )
+
+    """
     try:
         # Set content type based on format
         content_type = {
@@ -344,3 +504,4 @@ async def create_captioned_speech(
                 "type": "server_error",
             },
         )
+    """
\ No newline at end of file
diff --git a/api/src/routers/openai_compatible.py b/api/src/routers/openai_compatible.py
index 0d06547..771830b 100644
--- a/api/src/routers/openai_compatible.py
+++ b/api/src/routers/openai_compatible.py
@@ -10,12 +10,13 @@ from urllib import response
 
 import numpy as np
 import aiofiles
-from ..inference.base import AudioChunk
+
 import torch
 from fastapi import APIRouter, Depends, Header, HTTPException, Request, Response
 from fastapi.responses import FileResponse, StreamingResponse
 from loguru import logger
 
+from ..inference.base import AudioChunk
 from ..core.config import settings
 from ..services.audio import AudioService
 from ..services.tts_service import TTSService
@@ -130,7 +131,7 @@ async def process_voices(
 
 async def stream_audio_chunks(
     tts_service: TTSService, request: OpenAISpeechRequest, client_request: Request
-) -> AsyncGenerator[list, None]:
+) -> AsyncGenerator[Tuple[Union[np.ndarray, bytes], AudioChunk], None]:
     """Stream audio chunks as they're generated with client disconnect handling"""
     voice_name = await process_voices(request.voice, tts_service)
 
@@ -154,7 +155,7 @@ async def stream_audio_chunks(
                     logger.info("Client disconnected, stopping audio generation")
                     break
 
-                yield chunk
+                yield (chunk, chunk_data)
         except Exception as e:
             logger.error(f"Error in audio streaming: {str(e)}")
             # Let the exception propagate to trigger cleanup
@@ -223,7 +224,7 @@ async def create_speech(
         async def dual_output():
             try:
                 # Write chunks to temp file and stream
-                async for chunk in generator:
+                async for chunk, chunk_data in generator:
                     if chunk:  # Skip empty chunks
                         await temp_writer.write(chunk)
                         #if return_json:
@@ -247,9 +248,19 @@ async def create_speech(
                 dual_output(), media_type=content_type, headers=headers
             )
 
+        async def single_output():
+            try:
+                # Stream chunks
+                async for chunk, chunk_data in generator:
+                    if chunk:  # Skip empty chunks
+                        yield chunk
+            except Exception as e:
+                logger.error(f"Error in single output streaming: {e}")
+                raise
+
         # Standard streaming without download link
         return StreamingResponse(
-            generator,
+            single_output(),
             media_type=content_type,
             headers={
                 "Content-Disposition": f"attachment; filename=speech.{request.response_format}",
diff --git a/api/src/services/tts_service.py b/api/src/services/tts_service.py
index b08f64d..a8e05ad 100644
--- a/api/src/services/tts_service.py
+++ b/api/src/services/tts_service.py
@@ -332,24 +332,23 @@ class TTSService:
         text: str,
         voice: str,
         speed: float = 1.0,
-        return_timestamps: bool = True,
+        return_timestamps: bool = False,
         lang_code: Optional[str] = None,
     ) -> Tuple[Tuple[np.ndarray,AudioChunk]]:
         """Generate complete audio for text using streaming internally."""
         start_time = time.time()
 
-        audio_chunks = []
         audio_data_chunks=[]
 
         try:
-            async for audio_stream,audio_stream_data in self.generate_audio_stream(text,voice,speed=speed,return_timestamps=return_timestamps,lang_code=lang_code,output_format=None):
-                audio_chunks.append(audio_stream_data.audio)
+            async for _, audio_stream_data in self.generate_audio_stream(text, voice, speed=speed, return_timestamps=return_timestamps, lang_code=lang_code, output_format=None):
+                audio_data_chunks.append(audio_stream_data)
 
-            combined_audio=np.concatenate(audio_chunks,dtype=np.int16)
+            combined_audio_data = AudioChunk.combine(audio_data_chunks)
 
-            return combined_audio,combined_audio_data
+            return combined_audio_data.audio, combined_audio_data
 
             """
             # Get backend and voice path
            backend = self.model_manager.get_backend()
diff --git a/api/src/structures/schemas.py b/api/src/structures/schemas.py
index 7a6484c..b838b38 100644
--- a/api/src/structures/schemas.py
+++ b/api/src/structures/schemas.py
@@ -106,11 +106,23 @@ class CaptionedSpeechRequest(BaseModel):
         le=4.0,
         description="The speed of the generated audio. Select a value from 0.25 to 4.0.",
     )
+    stream: bool = Field(
+        default=True,  # Default to streaming for OpenAI compatibility
+        description="If true (default), audio will be streamed as it's generated. Each chunk will be a complete sentence.",
+    )
     return_timestamps: bool = Field(
         default=True,
         description="If true (default), returns word-level timestamps in the response",
     )
+    return_download_link: bool = Field(
+        default=False,
+        description="If true, returns a download link in X-Download-Path header after streaming completes",
+    )
     lang_code: Optional[str] = Field(
         default=None,
         description="Optional language code to use for text processing. If not provided, will use first letter of voice name.",
     )
+    normalization_options: Optional[NormalizationOptions] = Field(
+        default=NormalizationOptions(),
+        description="Options for the normalization system",
+    )
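
Usage sketch (illustrative, not part of the patch): with this change the captioned-speech route accepts the new stream and return_download_link fields and, when a download link is requested, reports the server-side temp file via the X-Download-Path response header, mirroring the OpenAI-compatible route. A minimal client could look like the following; the base URL/port, route mount point, model, and voice values are assumptions about the deployment rather than anything this diff pins down.

import requests

payload = {
    "model": "kokoro",                # assumed model name
    "input": "Hello there! This is a captioned-speech test.",
    "voice": "af_heart",              # assumed installed voice
    "response_format": "mp3",
    "stream": True,                   # new field (defaults to True)
    "return_download_link": True,     # new field: request X-Download-Path
}

# Base URL and route path are hypothetical; adjust to where the API is served.
with requests.post(
    "http://localhost:8880/dev/captioned_speech", json=payload, stream=True
) as resp:
    resp.raise_for_status()
    # TempFileWriter publishes the download path in the headers, which arrive
    # before the streamed body.
    print("download path:", resp.headers.get("X-Download-Path"))
    with open("speech.mp3", "wb") as f:
        for chunk in resp.iter_content(chunk_size=8192):
            if chunk:
                f.write(chunk)

Note that stream_audio_chunks now yields (audio bytes, AudioChunk) pairs rather than bare chunks, so any server-side consumer of the generator must unpack the tuple, as dual_output and single_output do above.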