Kokoro-FastAPI/api/src/routers/openai_compatible.py

"""OpenAI-compatible router for text-to-speech"""
import io
import json
import os
import re
import tempfile
from typing import AsyncGenerator, Dict, List, Optional, Tuple, Union

import aiofiles
import numpy as np
import torch
from fastapi import APIRouter, Depends, Header, HTTPException, Request, Response
from fastapi.responses import FileResponse, StreamingResponse
from loguru import logger

from ..core.auth import verify_api_key
from ..core.config import settings
from ..inference.base import AudioChunk
from ..services.audio import AudioService
from ..services.tts_service import TTSService
from ..structures import OpenAISpeechRequest
from ..structures.schemas import CaptionedSpeechRequest

# Load OpenAI mappings
def load_openai_mappings() -> Dict:
"""Load OpenAI voice and model mappings from JSON"""
api_dir = os.path.dirname(os.path.dirname(__file__))
mapping_path = os.path.join(api_dir, "core", "openai_mappings.json")
try:
with open(mapping_path, "r") as f:
return json.load(f)
except Exception as e:
logger.error(f"Failed to load OpenAI mappings: {e}")
return {"models": {}, "voices": {}}
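
# Illustrative sketch of the file this loader reads. The top-level "models" and
# "voices" keys are what the code below relies on; the specific entries shown
# here are assumptions for illustration, not the project's actual mapping file.
#
#   {
#       "models": {"tts-1": "kokoro-v1_0", "tts-1-hd": "kokoro-v1_0", "kokoro": "kokoro-v1_0"},
#       "voices": {"alloy": "am_adam", "nova": "af_nova"}
#   }
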
# Global mappings
_openai_mappings = load_openai_mappings()
router = APIRouter(
tags=["OpenAI Compatible TTS"],
responses={404: {"description": "Not found"}},
dependencies=[Depends(verify_api_key)], # Apply authentication to all routes
)
# Global TTSService instance with lock
_tts_service = None
_init_lock = None
async def get_tts_service() -> TTSService:
"""Get global TTSService instance"""
global _tts_service, _init_lock
# Create lock if needed
if _init_lock is None:
import asyncio
_init_lock = asyncio.Lock()
# Initialize service if needed
if _tts_service is None:
async with _init_lock:
# Double check pattern
if _tts_service is None:
_tts_service = await TTSService.create()
logger.info("Created global TTSService instance")
return _tts_service
def get_model_name(model: str) -> str:
"""Get internal model name from OpenAI model name"""
base_name = _openai_mappings["models"].get(model)
if not base_name:
raise ValueError(f"Unsupported model: {model}")
return base_name + ".pth"
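
# Example (assuming a mapping such as {"tts-1": "kokoro-v1_0"} in openai_mappings.json):
#   get_model_name("tts-1")  ->  "kokoro-v1_0.pth"
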
async def process_voices(
voice_input: Union[str, List[str]], tts_service: TTSService
) -> str:
"""Process voice input, handling both string and list formats
Returns:
Voice name to use (with weights if specified)
"""
# Convert input to list of voices
if isinstance(voice_input, str):
# Check if it's an OpenAI voice name
mapped_voice = _openai_mappings["voices"].get(voice_input)
if mapped_voice:
voice_input = mapped_voice
# Split on + but preserve any parentheses
voices = []
for part in voice_input.split("+"):
part = part.strip()
if not part:
continue
# Extract voice name without weight
voice_name = part.split("(")[0].strip()
# Check if it's a valid voice
available_voices = await tts_service.list_voices()
if voice_name not in available_voices:
raise ValueError(
f"Voice '{voice_name}' not found. Available voices: {', '.join(sorted(available_voices))}"
)
voices.append(part)
else:
# For list input, map each voice if it's an OpenAI voice name
voices = []
for v in voice_input:
mapped = _openai_mappings["voices"].get(v, v)
voice_name = mapped.split("(")[0].strip()
# Check if it's a valid voice
available_voices = await tts_service.list_voices()
if voice_name not in available_voices:
raise ValueError(
f"Voice '{voice_name}' not found. Available voices: {', '.join(sorted(available_voices))}"
)
voices.append(mapped)
if not voices:
raise ValueError("No voices provided")
# For multiple voices, combine them with +
return "+".join(voices)
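
# Voice inputs accepted by process_voices (a sketch based on the parsing above;
# the concrete voice names are placeholders for whatever voice packs are installed):
#
#   "af_bella"               -> single voice
#   "af_bella+af_sky"        -> combined voices, joined with +
#   "af_bella(2)+af_sky(1)"  -> weighted combination; only the base name before
#                               "(" is validated, the "(weight)" suffix is passed
#                               through to TTSService as-is
#   ["alloy", "af_sky"]      -> list form; OpenAI voice names are mapped first
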
async def stream_audio_chunks(
    tts_service: TTSService,
    request: Union[OpenAISpeechRequest, CaptionedSpeechRequest],
    client_request: Request,
) -> AsyncGenerator[AudioChunk, None]:
    """Stream audio chunks as they're generated, stopping if the client disconnects"""
    voice_name = await process_voices(request.voice, tts_service)

    # Only CaptionedSpeechRequest is expected to carry return_timestamps
    return_timestamps = getattr(request, "return_timestamps", False)

    try:
        async for chunk_data in tts_service.generate_audio_stream(
            text=request.input,
            voice=voice_name,
            speed=request.speed,
            output_format=request.response_format,
            lang_code=request.lang_code,
            normalization_options=request.normalization_options,
            return_timestamps=return_timestamps,
        ):
            # Check if client is still connected
            is_disconnected = client_request.is_disconnected
            if callable(is_disconnected):
                is_disconnected = await is_disconnected()
            if is_disconnected:
                logger.info("Client disconnected, stopping audio generation")
                break
            yield chunk_data
    except Exception as e:
        logger.error(f"Error in audio streaming: {str(e)}")
        # Let the exception propagate to trigger cleanup
        raise

@router.post("/audio/speech")
async def create_speech(
request: OpenAISpeechRequest,
client_request: Request,
x_raw_response: str = Header(None, alias="x-raw-response"),
api_key: Optional[str] = Depends(verify_api_key),
):
"""OpenAI-compatible endpoint for text-to-speech"""
# Validate model before processing request
if request.model not in _openai_mappings["models"]:
raise HTTPException(
status_code=400,
detail={
"error": "invalid_model",
"message": f"Unsupported model: {request.model}",
"type": "invalid_request_error",
},
)
try:
# model_name = get_model_name(request.model)
tts_service = await get_tts_service()
voice_name = await process_voices(request.voice, tts_service)
# Set content type based on format
content_type = {
"mp3": "audio/mpeg",
"opus": "audio/opus",
"aac": "audio/aac",
"flac": "audio/flac",
"wav": "audio/wav",
"pcm": "audio/pcm",
}.get(request.response_format, f"audio/{request.response_format}")
# Determine language code with proper fallback
if not request.lang_code:
# Use default_voice_code from settings if available
request.lang_code = settings.default_voice_code
# Otherwise, use first letter of voice name
if not request.lang_code and voice_name:
request.lang_code = voice_name[0].lower()
# Log the language code being used
logger.info(f"Starting audio generation with lang_code: {request.lang_code}")
# Check if streaming is requested (default for OpenAI client)
if request.stream:
# Create generator but don't start it yet
generator = stream_audio_chunks(tts_service, request, client_request)
# If download link requested, wrap generator with temp file writer
if request.return_download_link:
from ..services.temp_manager import TempFileWriter
# Use download_format if specified, otherwise use response_format
output_format = request.download_format or request.response_format
temp_writer = TempFileWriter(output_format)
await temp_writer.__aenter__() # Initialize temp file
# Get download path immediately after temp file creation
download_path = temp_writer.download_path
# Create response headers with download path
headers = {
"Content-Disposition": f"attachment; filename=speech.{output_format}",
"X-Accel-Buffering": "no",
"Cache-Control": "no-cache",
"Transfer-Encoding": "chunked",
"X-Download-Path": download_path,
}
# Create async generator for streaming
async def dual_output():
try:
# Write chunks to temp file and stream
async for chunk_data in generator:
if chunk_data.output: # Skip empty chunks
await temp_writer.write(chunk_data.output)
yield chunk_data.output
# Finalize the temp file
await temp_writer.finalize()
except Exception as e:
logger.error(f"Error in dual output streaming: {e}")
await temp_writer.__aexit__(type(e), e, e.__traceback__)
raise
finally:
# Ensure temp writer is closed
if not temp_writer._finalized:
await temp_writer.__aexit__(None, None, None)
# Stream with temp file writing
return StreamingResponse(
dual_output(), media_type=content_type, headers=headers
)
async def single_output():
try:
# Stream chunks
is_first_chunk=True
async for chunk_data in generator:
if chunk_data.audio is not None and len(chunk_data.audio) > 0: # Skip empty chunks
# Convert to requested format with proper encoding
encoded_chunk = await AudioService.convert_audio(
chunk_data,
24000,
request.response_format,
is_first_chunk=is_first_chunk,
is_last_chunk=False,
trim_audio=False
)
if encoded_chunk.output:
yield encoded_chunk.output
is_first_chunk=False
except Exception as e:
logger.error(f"Error in single output streaming: {e}")
raise
# Standard streaming without download link
return StreamingResponse(
single_output(),
media_type=content_type,
headers={
"Content-Disposition": f"attachment; filename=speech.{request.response_format}",
"X-Accel-Buffering": "no",
"Cache-Control": "no-cache",
"Transfer-Encoding": "chunked",
},
)
else:
# Generate complete audio using public interface
audio_data = await tts_service.generate_audio(
text=request.input,
voice=voice_name,
speed=request.speed,
normalization_options=request.normalization_options,
lang_code=request.lang_code,
)
audio_data = await AudioService.convert_audio(
audio_data,
24000,
request.response_format,
is_first_chunk=True,
is_last_chunk=False,
trim_audio=False
)
# Convert to requested format with proper finalization
final = await AudioService.convert_audio(
AudioChunk(np.array([], dtype=np.int16)),
24000,
request.response_format,
is_first_chunk=False,
is_last_chunk=True,
)
output=audio_data.output + final.output
return Response(
content=output,
media_type=content_type,
headers={
"Content-Disposition": f"attachment; filename=speech.{request.response_format}",
"Cache-Control": "no-cache", # Prevent caching
},
)
except ValueError as e:
# Handle validation errors
logger.warning(f"Invalid request: {str(e)}")
raise HTTPException(
status_code=400,
detail={
"error": "validation_error",
"message": str(e),
"type": "invalid_request_error",
},
)
except RuntimeError as e:
# Handle runtime/processing errors
logger.error(f"Processing error: {str(e)}")
raise HTTPException(
status_code=500,
detail={
"error": "processing_error",
"message": str(e),
"type": "server_error",
},
)
except Exception as e:
# Handle unexpected errors
logger.error(f"Unexpected error in speech generation: {str(e)}")
raise HTTPException(
status_code=500,
detail={
"error": "processing_error",
"message": str(e),
"type": "server_error",
},
)
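
# Example client usage for the endpoint above (illustrative only: the host, port,
# and /v1 prefix are assumptions about how this router is mounted and served, and
# whether an API key is needed depends on the auth settings):
#
#   from openai import OpenAI
#
#   client = OpenAI(base_url="http://localhost:8880/v1", api_key="not-needed")
#   with client.audio.speech.with_streaming_response.create(
#       model="kokoro",
#       voice="af_bella",
#       input="Hello world!",
#       response_format="mp3",
#   ) as response:
#       response.stream_to_file("speech.mp3")
#
# Or as a non-streaming curl request returning a complete WAV file:
#
#   curl -X POST http://localhost:8880/v1/audio/speech \
#     -H "Content-Type: application/json" \
#     -d '{"model": "kokoro", "voice": "af_bella", "input": "Hello world!",
#          "response_format": "wav", "stream": false}' \
#     -o speech.wav
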
@router.get("/download/{filename}")
async def download_audio_file(
filename: str,
api_key: Optional[str] = Depends(verify_api_key),
):
"""Download a generated audio file from temp storage"""
try:
from ..core.paths import _find_file, get_content_type
# Search for file in temp directory
file_path = await _find_file(
filename=filename, search_paths=[settings.temp_file_dir]
)
# Get content type from path helper
content_type = await get_content_type(file_path)
return FileResponse(
file_path,
media_type=content_type,
filename=filename,
headers={
"Cache-Control": "no-cache",
"Content-Disposition": f"attachment; filename={filename}",
},
)
except Exception as e:
logger.error(f"Error serving download file {filename}: {e}")
raise HTTPException(
status_code=500,
detail={
"error": "server_error",
"message": "Failed to serve audio file",
"type": "server_error",
},
)
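
# How the download flow fits together (a sketch; the exact value of the
# X-Download-Path header comes from TempFileWriter): a streaming request with
# "return_download_link": true responds with an X-Download-Path header, and the
# finished file can then be fetched from this endpoint, e.g.
#
#   curl -OJ "http://localhost:8880/v1/download/<filename-from-X-Download-Path>"
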
@router.get("/models")
async def list_models(
api_key: Optional[str] = Depends(verify_api_key),
):
"""List all available models"""
try:
# Create standard model list
models = [
{
"id": "tts-1",
"object": "model",
"created": 1686935002,
"owned_by": "kokoro"
},
{
"id": "tts-1-hd",
"object": "model",
"created": 1686935002,
"owned_by": "kokoro"
},
{
"id": "kokoro",
"object": "model",
"created": 1686935002,
"owned_by": "kokoro"
}
]
return {
"object": "list",
"data": models
}
except Exception as e:
logger.error(f"Error listing models: {str(e)}")
raise HTTPException(
status_code=500,
detail={
"error": "server_error",
"message": "Failed to retrieve model list",
"type": "server_error",
},
)
@router.get("/models/{model}")
async def retrieve_model(
model: str,
api_key: Optional[str] = Depends(verify_api_key),
):
"""Retrieve a specific model"""
try:
# Define available models
models = {
"tts-1": {
"id": "tts-1",
"object": "model",
"created": 1686935002,
"owned_by": "kokoro"
},
"tts-1-hd": {
"id": "tts-1-hd",
"object": "model",
"created": 1686935002,
"owned_by": "kokoro"
},
"kokoro": {
"id": "kokoro",
"object": "model",
"created": 1686935002,
"owned_by": "kokoro"
}
}
# Check if requested model exists
if model not in models:
raise HTTPException(
status_code=404,
detail={
"error": "model_not_found",
"message": f"Model '{model}' not found",
"type": "invalid_request_error"
}
)
# Return the specific model
return models[model]
except HTTPException:
raise
except Exception as e:
logger.error(f"Error retrieving model {model}: {str(e)}")
raise HTTPException(
status_code=500,
detail={
"error": "server_error",
"message": "Failed to retrieve model information",
"type": "server_error",
},
)
@router.get("/audio/voices")
async def list_voices(
api_key: Optional[str] = Depends(verify_api_key),
):
"""List all available voices for text-to-speech"""
try:
tts_service = await get_tts_service()
voices = await tts_service.list_voices()
return {"voices": voices}
except Exception as e:
logger.error(f"Error listing voices: {str(e)}")
raise HTTPException(
status_code=500,
detail={
"error": "server_error",
"message": "Failed to retrieve voice list",
"type": "server_error",
},
)
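
# Example response shape (the endpoint returns exactly {"voices": [...]}; the
# names shown are placeholders for whatever TTSService.list_voices() reports):
#
#   {"voices": ["af_bella", "af_sky", "am_adam"]}
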
@router.post("/audio/voices/combine")
async def combine_voices(
request: Union[str, List[str]],
api_key: Optional[str] = Depends(verify_api_key),
):
"""Combine multiple voices into a new voice and return the .pt file.

    Args:
        request: Either a string with voices separated by + (e.g. "voice1+voice2")
            or a list of voice names to combine

    Returns:
        FileResponse with the combined voice .pt file

    Raises:
        HTTPException:
            - 400: Invalid request (voice not found, no voices provided)
            - 403: Local voice saving is disabled
            - 500: Server error (file system issues, combination failed)
    """
# Check if local voice saving is allowed
if not settings.allow_local_voice_saving:
raise HTTPException(
status_code=403,
detail={
"error": "permission_denied",
"message": "Local voice saving is disabled",
"type": "permission_error",
},
)
try:
# Convert input to list of voices
if isinstance(request, str):
# Check if it's an OpenAI voice name
mapped_voice = _openai_mappings["voices"].get(request)
if mapped_voice:
request = mapped_voice
voices = [v.strip() for v in request.split("+") if v.strip()]
else:
# For list input, map each voice if it's an OpenAI voice name
voices = [_openai_mappings["voices"].get(v, v) for v in request]
voices = [v.strip() for v in voices if v.strip()]
if not voices:
raise ValueError("No voices provided")
# For multiple voices, validate base voices exist
tts_service = await get_tts_service()
available_voices = await tts_service.list_voices()
for voice in voices:
if voice not in available_voices:
raise ValueError(
f"Base voice '{voice}' not found. Available voices: {', '.join(sorted(available_voices))}"
)
# Combine voices
combined_tensor = await tts_service.combine_voices(voices=voices)
combined_name = "+".join(voices)
# Save to temp file
temp_dir = tempfile.gettempdir()
voice_path = os.path.join(temp_dir, f"{combined_name}.pt")
buffer = io.BytesIO()
torch.save(combined_tensor, buffer)
async with aiofiles.open(voice_path, "wb") as f:
await f.write(buffer.getvalue())
return FileResponse(
voice_path,
media_type="application/octet-stream",
filename=f"{combined_name}.pt",
headers={
"Content-Disposition": f"attachment; filename={combined_name}.pt",
"Cache-Control": "no-cache",
},
)
except ValueError as e:
logger.warning(f"Invalid voice combination request: {str(e)}")
raise HTTPException(
status_code=400,
detail={
"error": "validation_error",
"message": str(e),
"type": "invalid_request_error",
},
)
except RuntimeError as e:
logger.error(f"Voice combination processing error: {str(e)}")
raise HTTPException(
status_code=500,
detail={
"error": "processing_error",
"message": "Failed to process voice combination request",
"type": "server_error",
},
)
except Exception as e:
logger.error(f"Unexpected error in voice combination: {str(e)}")
raise HTTPException(
status_code=500,
detail={
"error": "server_error",
"message": "An unexpected error occurred",
"type": "server_error",
},
)
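
# Example request for the endpoint above (illustrative; requires
# settings.allow_local_voice_saving to be enabled, and the voice names are
# placeholders). The body is a bare JSON string or a JSON list of voice names:
#
#   curl -X POST http://localhost:8880/v1/audio/voices/combine \
#     -H "Content-Type: application/json" \
#     -d '"af_bella+af_sky"' \
#     -o af_bella+af_sky.pt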