2025-01-23 04:11:31 -07:00
|
|
|
import json
|
|
|
|
import os
|
|
|
|
from typing import AsyncGenerator, Dict, List, Union
|
2024-12-31 10:30:12 -05:00
|
|
|
|
2025-02-09 22:41:42 -07:00
|
|
|
from fastapi import APIRouter, Depends, Header, HTTPException, Request, Response
|
2025-01-04 17:54:54 -07:00
|
|
|
from fastapi.responses import StreamingResponse
|
2025-01-13 20:15:46 -07:00
|
|
|
from loguru import logger
|
2025-01-09 18:41:44 -07:00
|
|
|
|
2025-01-01 21:50:41 -07:00
|
|
|
from ..services.audio import AudioService
|
2025-01-09 18:41:44 -07:00
|
|
|
from ..services.tts_service import TTSService
|
2025-01-13 20:15:46 -07:00
|
|
|
from ..structures.schemas import OpenAISpeechRequest
|
2025-01-23 04:11:31 -07:00
|
|
|
from ..core.config import settings
|
|
|
|
|
|
|
|
# Load OpenAI mappings
|
|
|
|
def load_openai_mappings() -> Dict:
    """Load OpenAI voice and model mappings from JSON.

    The file is looked up at ``core/openai_mappings.json`` relative to the
    parent of this module's directory.

    Returns:
        A dictionary that always contains the ``"models"`` and ``"voices"``
        keys. If the file cannot be read or parsed, or its top-level value is
        not a JSON object, the error is logged and empty mappings are returned
        so that importing this module never fails.
    """
    api_dir = os.path.dirname(os.path.dirname(__file__))
    mapping_path = os.path.join(api_dir, "core", "openai_mappings.json")
    try:
        # JSON is UTF-8 by specification; do not depend on the platform's
        # default text encoding.
        with open(mapping_path, "r", encoding="utf-8") as f:
            mappings = json.load(f)
    except Exception as e:
        # Deliberately broad: loading happens at import time and must degrade
        # to empty mappings instead of crashing the application.
        logger.error(f"Failed to load OpenAI mappings: {e}")
        return {"models": {}, "voices": {}}

    if not isinstance(mappings, dict):
        logger.error(
            f"Failed to load OpenAI mappings: expected a JSON object, "
            f"got {type(mappings).__name__}"
        )
        return {"models": {}, "voices": {}}

    # Callers index both sections unconditionally, so make sure they exist.
    mappings.setdefault("models", {})
    mappings.setdefault("voices", {})
    return mappings
|
|
|
|
|
|
|
|
# Global mappings
|
|
|
|
# Loaded once when the module is imported; the request handlers below read
# the "models" and "voices" sections of this dictionary.
_openai_mappings = load_openai_mappings()
|
2024-12-31 01:52:16 -07:00
|
|
|
|
2025-01-23 02:00:46 -07:00
|
|
|
|
2024-12-31 01:52:16 -07:00
|
|
|
# Router collecting the OpenAI-compatible text-to-speech endpoints defined
# in this module.
router = APIRouter(
    responses={404: {"description": "Not found"}},
    tags=["OpenAI Compatible TTS"],
)
|
|
|
|
|
2025-01-22 02:33:29 -07:00
|
|
|
# Global TTSService instance with lock
|
|
|
|
# Shared TTSService instance; stays None until get_tts_service() creates it.
_tts_service = None
|
|
|
|
# asyncio.Lock guarding creation of the shared service; created lazily by
# get_tts_service() on first use.
_init_lock = None
|
2024-12-31 01:57:00 -07:00
|
|
|
|
2025-01-23 02:00:46 -07:00
|
|
|
|
2025-01-22 02:33:29 -07:00
|
|
|
async def get_tts_service() -> TTSService:
    """Return the module-wide TTSService, creating it on first use.

    The lock that guards creation is itself created lazily on the first call.
    The service is checked once before and once after acquiring the lock so
    that callers waiting on the lock do not create a second instance.
    """
    global _tts_service, _init_lock

    # The lock is created on demand rather than at import time.
    if _init_lock is None:
        import asyncio

        _init_lock = asyncio.Lock()

    # Fast path: the service already exists, no locking required.
    if _tts_service is not None:
        return _tts_service

    async with _init_lock:
        # Re-check: another caller may have finished creating the service
        # while this one was waiting for the lock.
        if _tts_service is None:
            _tts_service = await TTSService.create()
            logger.info("Created global TTSService instance")

    return _tts_service
|
2024-12-31 01:57:00 -07:00
|
|
|
|
2024-12-31 01:52:16 -07:00
|
|
|
|
2025-01-23 04:11:31 -07:00
|
|
|
def get_model_name(model: str) -> str:
    """Translate an OpenAI model name into the internal model file name.

    The name is looked up in the "models" section of the loaded mappings and
    the file extension is chosen from ``settings.use_onnx``.

    Args:
        model: Model name as sent by the client.

    Returns:
        The mapped base name followed by ``.onnx`` or ``.pth``.

    Raises:
        ValueError: If the model has no (non-empty) entry in the mappings.
    """
    mapped_name = _openai_mappings["models"].get(model)
    if mapped_name:
        # Extension depends on the runtime configuration.
        suffix = ".pth"
        if settings.use_onnx:
            suffix = ".onnx"
        return mapped_name + suffix

    raise ValueError(f"Unsupported model: {model}")
|
|
|
|
|
2025-01-09 18:41:44 -07:00
|
|
|
async def process_voices(
    voice_input: Union[str, List[str]], tts_service: TTSService
) -> str:
    """Process voice input into a single voice name, combining when needed.

    OpenAI voice names are translated through the "voices" section of the
    loaded mappings. String and list inputs are treated the same way: every
    individual voice name is stripped of surrounding whitespace and then
    mapped, so ``"a+b"`` and ``["a", "b"]`` resolve identically.

    Args:
        voice_input: Either a string of voice names separated by ``+`` or a
            list of voice names.
        tts_service: Service used to list the available voices and to combine
            several voices into one.

    Returns:
        The single requested voice, or the value returned by
        ``tts_service.combine_voices`` when several voices were requested.

    Raises:
        ValueError: If no voice name remains after stripping, or if any
            requested voice is not among the available voices.
    """
    voice_map = _openai_mappings["voices"]

    if isinstance(voice_input, str):
        # A match on the whole string takes precedence, so a single OpenAI
        # name may map to a "+"-joined combination of internal voices.
        whole_match = voice_map.get(voice_input)
        if whole_match:
            voices = [part.strip() for part in whole_match.split("+") if part.strip()]
        else:
            # Otherwise map each "+"-separated part on its own, exactly as is
            # done for list input.
            names = [part.strip() for part in voice_input.split("+")]
            mapped = [voice_map.get(name, name).strip() for name in names if name]
            voices = [voice for voice in mapped if voice]
    else:
        # Strip before the lookup so padded names still match a mapping key.
        names = [item.strip() for item in voice_input]
        mapped = [voice_map.get(name, name).strip() for name in names if name]
        voices = [voice for voice in mapped if voice]

    if not voices:
        raise ValueError("No voices provided")

    # One lookup of the available voices serves both the single-voice and the
    # multi-voice case; only the wording of the error differs.
    available_voices = await tts_service.list_voices()
    label = "Voice" if len(voices) == 1 else "Base voice"
    for voice in voices:
        if voice not in available_voices:
            raise ValueError(
                f"{label} '{voice}' not found. Available voices: {', '.join(sorted(available_voices))}"
            )

    if len(voices) == 1:
        return voices[0]

    # Combine voices
    return await tts_service.combine_voices(voices=voices)
|
|
|
|
|
|
|
|
|
2025-01-09 18:41:44 -07:00
|
|
|
async def stream_audio_chunks(
    tts_service: TTSService,
    request: OpenAISpeechRequest,
    client_request: Request
) -> AsyncGenerator[bytes, None]:
    """Yield audio chunks as they are produced, stopping on client disconnect.

    The requested voice is resolved first. Before each chunk is yielded the
    client connection is checked; once the client is reported as
    disconnected, generation stops without yielding the pending chunk.

    Args:
        tts_service: Service that produces the audio stream.
        request: Speech request providing input text, voice, speed and
            response format.
        client_request: Incoming HTTP request, used for the disconnect check.

    Raises:
        Exception: Any error raised while streaming is logged and re-raised.
    """
    resolved_voice = await process_voices(request.voice, tts_service)

    try:
        audio_stream = tts_service.generate_audio_stream(
            text=request.input,
            voice=resolved_voice,
            speed=request.speed,
            output_format=request.response_format,
        )
        async for audio_chunk in audio_stream:
            # `is_disconnected` may be a plain value or an awaitable-returning
            # callable; call and await it only in the latter case.
            disconnected = client_request.is_disconnected
            if callable(disconnected):
                disconnected = await disconnected()

            if disconnected:
                logger.info("Client disconnected, stopping audio generation")
                break

            yield audio_chunk
    except Exception as e:
        logger.error(f"Error in audio streaming: {str(e)}")
        # Propagate so that the caller can clean up.
        raise
|
2025-01-04 17:54:54 -07:00
|
|
|
|
2025-01-04 17:55:36 -07:00
|
|
|
|
2024-12-31 01:52:16 -07:00
|
|
|
@router.post("/audio/speech")
async def create_speech(
    request: OpenAISpeechRequest,
    client_request: Request,
    x_raw_response: str = Header(None, alias="x-raw-response"),
):
    """OpenAI-compatible endpoint for text-to-speech.

    Args:
        request: Speech request (model, input text, voice, speed, response
            format, stream flag).
        client_request: Incoming HTTP request, forwarded to the streaming
            generator for client-disconnect detection.
        x_raw_response: Value of the ``x-raw-response`` header; accepted but
            not used by this handler.

    Returns:
        A ``StreamingResponse`` when ``request.stream`` is truthy, otherwise a
        ``Response`` carrying the complete converted audio.

    Raises:
        HTTPException:
            - 400: Unsupported model, or a ``ValueError`` raised while
              processing the request (including while starting the stream).
            - 500: ``RuntimeError`` or any other unexpected error.
    """
    # Validate model before processing request
    if request.model not in _openai_mappings["models"]:
        raise HTTPException(
            status_code=400,
            detail={
                "error": "invalid_model",
                "message": f"Unsupported model: {request.model}",
                "type": "invalid_request_error"
            }
        )

    try:
        # Called for validation only: raises ValueError (reported as 400) when
        # the model cannot be resolved. The returned name is not needed here.
        get_model_name(request.model)

        # Get global service instance
        tts_service = await get_tts_service()

        # Process voice combination and validate
        voice_to_use = await process_voices(request.voice, tts_service)

        # Set content type based on format
        content_type = {
            "mp3": "audio/mpeg",
            "opus": "audio/opus",
            "aac": "audio/aac",
            "flac": "audio/flac",
            "wav": "audio/wav",
            "pcm": "audio/pcm",
        }.get(request.response_format, f"audio/{request.response_format}")

        # Check if streaming is requested (default for OpenAI client)
        if request.stream:
            # Create generator but don't start it yet
            generator = stream_audio_chunks(tts_service, request, client_request)

            # Pull the first chunk before responding, so that start-up errors
            # can still be reported with a proper HTTP status code.
            try:
                first_chunk = await anext(generator)
            except StopAsyncIteration:
                first_chunk = b""  # Empty audio case
            except ValueError:
                # Validation errors must reach the 400 handler below, the same
                # way they do on the non-streaming path.
                raise
            except Exception as e:
                # Re-raise any other error to be caught by the outer try-except
                raise RuntimeError(f"Failed to initialize audio stream: {str(e)}") from e

            # If we got here, streaming can begin
            async def safe_stream():
                yield first_chunk
                try:
                    async for chunk in generator:
                        yield chunk
                except Exception as e:
                    # Log the error but don't yield anything - the connection will close
                    logger.error(f"Error during streaming: {str(e)}")
                    raise

            # Stream audio chunks as they're generated
            return StreamingResponse(
                safe_stream(),
                media_type=content_type,
                headers={
                    "Content-Disposition": f"attachment; filename=speech.{request.response_format}",
                    "X-Accel-Buffering": "no",  # Disable proxy buffering
                    "Cache-Control": "no-cache",  # Prevent caching
                    "Transfer-Encoding": "chunked",  # Enable chunked transfer encoding
                },
            )

        # Generate complete audio using public interface
        audio, _ = await tts_service.generate_audio(
            text=request.input,
            voice=voice_to_use,
            speed=request.speed,
            stitch_long_output=True
        )

        # Convert to requested format
        content = await AudioService.convert_audio(
            audio, 24000, request.response_format, is_first_chunk=True, stream=False
        )

        return Response(
            content=content,
            media_type=content_type,
            headers={
                "Content-Disposition": f"attachment; filename=speech.{request.response_format}",
                "Cache-Control": "no-cache",  # Prevent caching
            },
        )

    except ValueError as e:
        # Handle validation errors
        logger.warning(f"Invalid request: {str(e)}")
        raise HTTPException(
            status_code=400,
            detail={
                "error": "validation_error",
                "message": str(e),
                "type": "invalid_request_error"
            }
        ) from e
    except RuntimeError as e:
        # Handle runtime/processing errors
        logger.error(f"Processing error: {str(e)}")
        raise HTTPException(
            status_code=500,
            detail={
                "error": "processing_error",
                "message": str(e),
                "type": "server_error"
            }
        ) from e
    except Exception as e:
        # Handle unexpected errors
        logger.error(f"Unexpected error in speech generation: {str(e)}")
        raise HTTPException(
            status_code=500,
            detail={
                "error": "processing_error",
                "message": str(e),
                "type": "server_error"
            }
        ) from e
|
2024-12-31 01:52:16 -07:00
|
|
|
|
2024-12-31 01:57:00 -07:00
|
|
|
|
2024-12-31 01:52:16 -07:00
|
|
|
@router.get("/audio/voices")
async def list_voices():
    """Return the voices available for text-to-speech.

    Returns:
        A dictionary with a single ``"voices"`` entry holding the value
        returned by the TTS service's ``list_voices``.

    Raises:
        HTTPException: 500 with a generic message if the service cannot be
            obtained or the voices cannot be listed; the underlying error is
            logged.
    """
    try:
        service = await get_tts_service()
        available = await service.list_voices()
        return {"voices": available}
    except Exception as e:
        logger.error(f"Error listing voices: {str(e)}")
        failure = {
            "error": "server_error",
            "message": "Failed to retrieve voice list",
            "type": "server_error"
        }
        raise HTTPException(status_code=500, detail=failure)
|
2024-12-31 10:30:12 -05:00
|
|
|
|
|
|
|
|
|
|
|
@router.post("/audio/voices/combine")
async def combine_voices(request: Union[str, List[str]]):
    """Combine several voices into one and report the result.

    Args:
        request: Either a string of voice names joined with ``+``
            (e.g. "voice1+voice2") or a list of voice names.

    Returns:
        A dictionary with ``"voices"`` (the voices listed by the TTS service
        after processing) and ``"voice"`` (the name produced for the request).

    Raises:
        HTTPException:
            - 400: The request was rejected with a ``ValueError``; its
              message is returned to the client.
            - 500: A ``RuntimeError`` or any other unexpected error occurred;
              a generic message is returned and the error is logged.
    """
    try:
        service = await get_tts_service()
        combined = await process_voices(request, service)
        all_voices = await service.list_voices()
        return {"voices": all_voices, "voice": combined}

    except ValueError as e:
        logger.warning(f"Invalid voice combination request: {str(e)}")
        invalid_detail = {
            "error": "validation_error",
            "message": str(e),
            "type": "invalid_request_error"
        }
        raise HTTPException(status_code=400, detail=invalid_detail)

    except RuntimeError as e:
        logger.error(f"Voice combination processing error: {str(e)}")
        processing_detail = {
            "error": "processing_error",
            "message": "Failed to process voice combination request",
            "type": "server_error"
        }
        raise HTTPException(status_code=500, detail=processing_detail)

    except Exception as e:
        logger.error(f"Unexpected error in voice combination: {str(e)}")
        unexpected_detail = {
            "error": "server_error",
            "message": "An unexpected error occurred",
            "type": "server_error"
        }
        raise HTTPException(status_code=500, detail=unexpected_detail)
|