Kokoro-FastAPI/api/src/routers/openai_compatible.py

from typing import AsyncGenerator, List, Union

from fastapi import APIRouter, Depends, Header, HTTPException, Response
from fastapi.responses import StreamingResponse
from loguru import logger

from ..services.audio import AudioService
from ..services.tts_service import TTSService
from ..structures.schemas import OpenAISpeechRequest

# Router that collects the OpenAI-compatible text-to-speech endpoints in this module.
router = APIRouter(
    responses={404: {"description": "Not found"}},
    tags=["OpenAI Compatible TTS"],
)


def get_tts_service() -> TTSService:
    """Dependency provider: build a TTSService using its default constructor arguments."""
    service = TTSService()
    return service


async def process_voices(
    voice_input: Union[str, List[str]], tts_service: TTSService
) -> str:
    """Resolve a voice specification to the single voice name to synthesize with.

    Args:
        voice_input: Either a "+"-separated string of voice names
            (e.g. "voice1+voice2") or a list of voice names. In both forms
            surrounding whitespace is stripped and empty names are dropped.
        tts_service: Service used to list the available voices and, when more
            than one voice is requested, to combine them.

    Returns:
        The sole requested voice unchanged, or whatever
        ``tts_service.combine_voices`` returns when several voices are given.

    Raises:
        ValueError: If no voice names remain after normalization, or if a
            requested voice is not among ``tts_service.list_voices()``.
    """
    if isinstance(voice_input, str):
        candidates = voice_input.split("+")
    else:
        # A missing/None value falls through to the "No voices provided" error.
        candidates = voice_input or []

    # Normalize list entries the same way as "+"-separated strings, so that
    # [" af ", ""] and " af +" resolve identically. Non-string entries are kept
    # untouched and get rejected by the availability check below.
    voices = []
    for candidate in candidates:
        name = candidate.strip() if isinstance(candidate, str) else candidate
        if name != "":
            voices.append(name)

    if not voices:
        raise ValueError("No voices provided")

    # Every requested voice must be known to the service before it is used.
    available_voices = await tts_service.list_voices()
    for voice in voices:
        if voice not in available_voices:
            raise ValueError(
                f"Voice '{voice}' not found. Available voices: {', '.join(sorted(available_voices))}"
            )

    # A single voice needs no combination step.
    if len(voices) == 1:
        return voices[0]

    return await tts_service.combine_voices(voices=voices)


async def stream_audio_chunks(
    tts_service: TTSService, request: OpenAISpeechRequest
) -> AsyncGenerator[bytes, None]:
    """Yield audio chunks for ``request`` one at a time as the service produces them.

    The requested voice is first resolved through ``process_voices``; the
    resulting voice, together with the request's input text, speed and response
    format, is handed to ``tts_service.generate_audio_stream``. As this is an
    async generator, none of that work starts until iteration begins.
    """
    resolved_voice = await process_voices(request.voice, tts_service)
    audio_stream = tts_service.generate_audio_stream(
        text=request.input,
        voice=resolved_voice,
        speed=request.speed,
        output_format=request.response_format,
    )
    async for piece in audio_stream:
        yield piece


@router.post("/audio/speech")
async def create_speech(
    request: OpenAISpeechRequest,
    tts_service: TTSService = Depends(get_tts_service),
    x_raw_response: str = Header(None, alias="x-raw-response"),
):
    """OpenAI-compatible endpoint for text-to-speech.

    Args:
        request: Speech request; its ``voice``, ``input``, ``speed``,
            ``response_format`` and ``stream`` fields are read here.
        tts_service: Injected TTS service.
        x_raw_response: Value of the ``x-raw-response`` header. Accepted but
            not used by this handler.

    Returns:
        A ``StreamingResponse`` when ``request.stream`` is truthy, otherwise a
        ``Response`` carrying the complete converted audio.

    Raises:
        HTTPException: 400 when a ``ValueError`` is raised while the response
            is being prepared (e.g. an unknown voice), 500 for any other
            exception. The original exception is chained as the cause.
    """
    try:
        # Validate (and, if needed, combine) the requested voice up front so
        # that bad input becomes a 400 before any response is started.
        voice_to_use = await process_voices(request.voice, tts_service)

        # Known formats map to their MIME type; anything else falls back to
        # "audio/<format>".
        content_type = {
            "mp3": "audio/mpeg",
            "opus": "audio/opus",
            "aac": "audio/aac",
            "flac": "audio/flac",
            "wav": "audio/wav",
            "pcm": "audio/pcm",
        }.get(request.response_format, f"audio/{request.response_format}")

        # Shared by the streaming and non-streaming responses.
        content_disposition = f"attachment; filename=speech.{request.response_format}"

        if request.stream:
            # The generator is only consumed after this function has returned,
            # so exceptions raised while streaming are not seen by the
            # handlers below.
            return StreamingResponse(
                stream_audio_chunks(tts_service, request),
                media_type=content_type,
                headers={
                    "Content-Disposition": content_disposition,
                    "X-Accel-Buffering": "no",  # Disable proxy buffering
                    "Cache-Control": "no-cache",  # Prevent caching
                    "Transfer-Encoding": "chunked",  # Enable chunked transfer encoding
                },
            )

        # Non-streaming: generate the complete audio first.
        # NOTE(review): this is a plain (non-awaited) call inside an async
        # handler; if it is long-running it will hold up the event loop — confirm.
        audio, _ = tts_service._generate_audio(
            text=request.input,
            voice=voice_to_use,
            speed=request.speed,
            stitch_long_output=True,
        )

        # Convert to the requested format.
        # 24000 is presumably the sample rate in Hz — confirm against
        # AudioService.convert_audio.
        content = AudioService.convert_audio(
            audio, 24000, request.response_format, is_first_chunk=True, stream=False
        )

        return Response(
            content=content,
            media_type=content_type,
            headers={
                "Content-Disposition": content_disposition,
                "Cache-Control": "no-cache",  # Prevent caching
            },
        )

    except ValueError as e:
        logger.error(f"Invalid request: {str(e)}")
        raise HTTPException(
            status_code=400, detail={"error": "Invalid request", "message": str(e)}
        ) from e
    except Exception as e:
        logger.error(f"Error generating speech: {str(e)}")
        raise HTTPException(
            status_code=500, detail={"error": "Server error", "message": str(e)}
        ) from e


@router.get("/audio/voices")
async def list_voices(tts_service: TTSService = Depends(get_tts_service)):
    """List all available voices for text-to-speech.

    Returns:
        ``{"voices": ...}`` holding whatever ``tts_service.list_voices()``
        returns.

    Raises:
        HTTPException: 500 with the error text as detail if listing fails; the
            original exception is chained as the cause.
    """
    try:
        voices = await tts_service.list_voices()
    except Exception as e:
        logger.error(f"Error listing voices: {str(e)}")
        raise HTTPException(status_code=500, detail=str(e)) from e
    return {"voices": voices}


@router.post("/audio/voices/combine")
async def combine_voices(
    request: Union[str, List[str]], tts_service: TTSService = Depends(get_tts_service)
):
    """Combine multiple voices into a new voice.

    Args:
        request: Either a string with voices separated by + (e.g. "voice1+voice2")
                or a list of voice names to combine
        tts_service: Injected TTS service.

    Returns:
        Dict with the resulting voice name under "voice" and the list of all
        available voices under "voices". When only one voice is requested,
        ``process_voices`` returns that voice itself rather than combining.

    Raises:
        HTTPException:
            - 400: Invalid request (no voices provided, voice not found)
            - 500: Any other failure; the response carries a generic message
              and the underlying error is only logged
    """
    try:
        combined_voice = await process_voices(request, tts_service)
        # Listed after process_voices has run, so the response reflects the
        # service's voice list at that point.
        voices = await tts_service.list_voices()
        return {"voices": voices, "voice": combined_voice}

    except ValueError as e:
        logger.error(f"Invalid voice combination request: {str(e)}")
        raise HTTPException(
            status_code=400, detail={"error": "Invalid request", "message": str(e)}
        ) from e

    except Exception as e:
        logger.error(f"Server error during voice combination: {str(e)}")
        raise HTTPException(
            status_code=500, detail={"error": "Server error", "message": "Server error"}
        ) from e
Ruff checks, ci fix 2025-01-13 20:15:46 -07:00			`from typing import AsyncGenerator, List, Union`
add ability to combine voices 2024-12-31 10:30:12 -05:00
Ruff checks, ci fix 2025-01-13 20:15:46 -07:00			`from fastapi import APIRouter, Depends, Header, HTTPException, Response`
First streaming attempt 2025-01-04 17:54:54 -07:00			`from fastapi.responses import StreamingResponse`
Ruff checks, ci fix 2025-01-13 20:15:46 -07:00			`from loguru import logger`
Ruff format + fix 2025-01-09 18:41:44 -07:00
Ruff Check + Format 2025-01-01 21:50:41 -07:00			`from ..services.audio import AudioService`
Ruff format + fix 2025-01-09 18:41:44 -07:00			`from ..services.tts_service import TTSService`
Ruff checks, ci fix 2025-01-13 20:15:46 -07:00			`from ..structures.schemas import OpenAISpeechRequest`
- Complete TTS endpoint replacement with OpenAI compatible -Removed output directory, and update configuration settings - Added benchmarking for entire novel 2024-12-31 01:52:16 -07:00
			`router = APIRouter(`
			`tags=["OpenAI Compatible TTS"],`
			`responses={404: {"description": "Not found"}},`
			`)`

Enhance TTS API with logging, voice pack loading, and schema updates 2024-12-31 01:57:00 -07:00
- Complete TTS endpoint replacement with OpenAI compatible -Removed output directory, and update configuration settings - Added benchmarking for entire novel 2024-12-31 01:52:16 -07:00			`def get_tts_service() -> TTSService:`
			`"""Dependency to get TTSService instance with database session"""`
WIP, Functional for CPU: Updated for ONNX runtime support, Dockerfile and TTS Service 2025-01-03 00:53:41 -07:00			`return TTSService() # Initialize TTSService with default settings`
Enhance TTS API with logging, voice pack loading, and schema updates 2024-12-31 01:57:00 -07:00
- Complete TTS endpoint replacement with OpenAI compatible -Removed output directory, and update configuration settings - Added benchmarking for entire novel 2024-12-31 01:52:16 -07:00
Ruff format + fix 2025-01-09 18:41:44 -07:00			`async def process_voices(`
			`voice_input: Union[str, List[str]], tts_service: TTSService`
			`) -> str:`
- Added support for combining voices via any endpoint - Updated the `process_voices` function to handle both string and list formats for voice input. 2025-01-07 03:50:08 -07:00			`"""Process voice input into a combined voice, handling both string and list formats"""`
			`# Convert input to list of voices`
			`if isinstance(voice_input, str):`
			`voices = [v.strip() for v in voice_input.split("+") if v.strip()]`
			`else:`
			`voices = voice_input`

			`if not voices:`
			`raise ValueError("No voices provided")`

			`# Check if all voices exist`
			`available_voices = await tts_service.list_voices()`
			`for voice in voices:`
			`if voice not in available_voices:`
Ruff format + fix 2025-01-09 18:41:44 -07:00			`raise ValueError(`
			`f"Voice '{voice}' not found. Available voices: {', '.join(sorted(available_voices))}"`
			`)`
- Added support for combining voices via any endpoint - Updated the `process_voices` function to handle both string and list formats for voice input. 2025-01-07 03:50:08 -07:00
			`# If single voice, return it directly`
			`if len(voices) == 1:`
			`return voices[0]`

			`# Otherwise combine voices`
			`return await tts_service.combine_voices(voices=voices)`


Ruff format + fix 2025-01-09 18:41:44 -07:00			`async def stream_audio_chunks(`
			`tts_service: TTSService, request: OpenAISpeechRequest`
			`) -> AsyncGenerator[bytes, None]:`
First streaming attempt 2025-01-04 17:54:54 -07:00			`"""Stream audio chunks as they're generated"""`
- Added support for combining voices via any endpoint - Updated the `process_voices` function to handle both string and list formats for voice input. 2025-01-07 03:50:08 -07:00			`voice_to_use = await process_voices(request.voice, tts_service)`
First streaming attempt 2025-01-04 17:54:54 -07:00			`async for chunk in tts_service.generate_audio_stream(`
			`text=request.input,`
- Added support for combining voices via any endpoint - Updated the `process_voices` function to handle both string and list formats for voice input. 2025-01-07 03:50:08 -07:00			`voice=voice_to_use,`
First streaming attempt 2025-01-04 17:54:54 -07:00			`speed=request.speed,`
Ruff format + fix 2025-01-09 18:41:44 -07:00			`output_format=request.response_format,`
First streaming attempt 2025-01-04 17:54:54 -07:00			`):`
			`yield chunk`

WIP: open ai compatible streaming 2025-01-04 17:55:36 -07:00
- Complete TTS endpoint replacement with OpenAI compatible -Removed output directory, and update configuration settings - Added benchmarking for entire novel 2024-12-31 01:52:16 -07:00			`@router.post("/audio/speech")`
			`async def create_speech(`
Ruff format + fix 2025-01-09 18:41:44 -07:00			`request: OpenAISpeechRequest,`
WIP: open ai compatible streaming 2025-01-04 17:55:36 -07:00			`tts_service: TTSService = Depends(get_tts_service),`
			`x_raw_response: str = Header(None, alias="x-raw-response"),`
- Complete TTS endpoint replacement with OpenAI compatible -Removed output directory, and update configuration settings - Added benchmarking for entire novel 2024-12-31 01:52:16 -07:00			`):`
			`"""OpenAI-compatible endpoint for text-to-speech"""`
			`try:`
- Added support for combining voices via any endpoint - Updated the `process_voices` function to handle both string and list formats for voice input. 2025-01-07 03:50:08 -07:00			`# Process voice combination and validate`
			`voice_to_use = await process_voices(request.voice, tts_service)`
Ruff Check + Format 2025-01-01 21:50:41 -07:00
First streaming attempt 2025-01-04 17:54:54 -07:00			`# Set content type based on format`
			`content_type = {`
			`"mp3": "audio/mpeg",`
			`"opus": "audio/opus",`
			`"aac": "audio/aac",`
			`"flac": "audio/flac",`
			`"wav": "audio/wav",`
			`"pcm": "audio/pcm",`
			`}.get(request.response_format, f"audio/{request.response_format}")`

Swapped generator to preprocessing 2025-01-04 22:23:59 -07:00			`# Check if streaming is requested (default for OpenAI client)`
			`if request.stream:`
First streaming attempt 2025-01-04 17:54:54 -07:00			`# Stream audio chunks as they're generated`
			`return StreamingResponse(`
			`stream_audio_chunks(tts_service, request),`
			`media_type=content_type,`
			`headers={`
			`"Content-Disposition": f"attachment; filename=speech.{request.response_format}",`
			`"X-Accel-Buffering": "no", # Disable proxy buffering`
			`"Cache-Control": "no-cache", # Prevent caching`
Refactor Docker configurations and update test mocks for development routers 2025-01-10 22:03:16 -07:00			`"Transfer-Encoding": "chunked", # Enable chunked transfer encoding`
First streaming attempt 2025-01-04 17:54:54 -07:00			`},`
			`)`
			`else:`
			`# Generate complete audio`
			`audio, _ = tts_service._generate_audio(`
			`text=request.input,`
- Added support for combining voices via any endpoint - Updated the `process_voices` function to handle both string and list formats for voice input. 2025-01-07 03:50:08 -07:00			`voice=voice_to_use,`
First streaming attempt 2025-01-04 17:54:54 -07:00			`speed=request.speed,`
			`stitch_long_output=True,`
			`)`

			`# Convert to requested format`
			`content = AudioService.convert_audio(`
Ruff format + fix 2025-01-09 18:41:44 -07:00			`audio, 24000, request.response_format, is_first_chunk=True, stream=False`
			`)`
First streaming attempt 2025-01-04 17:54:54 -07:00
			`return Response(`
			`content=content,`
			`media_type=content_type,`
			`headers={`
			`"Content-Disposition": f"attachment; filename=speech.{request.response_format}",`
			`"Cache-Control": "no-cache", # Prevent caching`
			`},`
			`)`
Enhance TTS API with logging, voice pack loading, and schema updates 2024-12-31 01:57:00 -07:00
- modified voice loading to copy on init - adjustments to the combine voices functionality - error handling and analysis 2024-12-31 18:55:26 -07:00			`except ValueError as e:`
			`logger.error(f"Invalid request: {str(e)}")`
			`raise HTTPException(`
Ruff Check + Format 2025-01-01 21:50:41 -07:00			`status_code=400, detail={"error": "Invalid request", "message": str(e)}`
- modified voice loading to copy on init - adjustments to the combine voices functionality - error handling and analysis 2024-12-31 18:55:26 -07:00			`)`
- Complete TTS endpoint replacement with OpenAI compatible -Removed output directory, and update configuration settings - Added benchmarking for entire novel 2024-12-31 01:52:16 -07:00			`except Exception as e:`
			`logger.error(f"Error generating speech: {str(e)}")`
- modified voice loading to copy on init - adjustments to the combine voices functionality - error handling and analysis 2024-12-31 18:55:26 -07:00			`raise HTTPException(`
Ruff Check + Format 2025-01-01 21:50:41 -07:00			`status_code=500, detail={"error": "Server error", "message": str(e)}`
- modified voice loading to copy on init - adjustments to the combine voices functionality - error handling and analysis 2024-12-31 18:55:26 -07:00			`)`
- Complete TTS endpoint replacement with OpenAI compatible -Removed output directory, and update configuration settings - Added benchmarking for entire novel 2024-12-31 01:52:16 -07:00
Enhance TTS API with logging, voice pack loading, and schema updates 2024-12-31 01:57:00 -07:00
- Complete TTS endpoint replacement with OpenAI compatible -Removed output directory, and update configuration settings - Added benchmarking for entire novel 2024-12-31 01:52:16 -07:00			`@router.get("/audio/voices")`
Enhance TTS API with logging, voice pack loading, and schema updates 2024-12-31 01:57:00 -07:00			`async def list_voices(tts_service: TTSService = Depends(get_tts_service)):`
- Complete TTS endpoint replacement with OpenAI compatible -Removed output directory, and update configuration settings - Added benchmarking for entire novel 2024-12-31 01:52:16 -07:00			`"""List all available voices for text-to-speech"""`
			`try:`
- Added support for combining voices via any endpoint - Updated the `process_voices` function to handle both string and list formats for voice input. 2025-01-07 03:50:08 -07:00			`voices = await tts_service.list_voices()`
- Complete TTS endpoint replacement with OpenAI compatible -Removed output directory, and update configuration settings - Added benchmarking for entire novel 2024-12-31 01:52:16 -07:00			`return {"voices": voices}`
			`except Exception as e:`
			`logger.error(f"Error listing voices: {str(e)}")`
			`raise HTTPException(status_code=500, detail=str(e))`
add ability to combine voices 2024-12-31 10:30:12 -05:00

			`@router.post("/audio/voices/combine")`
Ruff Check + Format 2025-01-01 21:50:41 -07:00			`async def combine_voices(`
- Added support for combining voices via any endpoint - Updated the `process_voices` function to handle both string and list formats for voice input. 2025-01-07 03:50:08 -07:00			`request: Union[str, List[str]], tts_service: TTSService = Depends(get_tts_service)`
Ruff Check + Format 2025-01-01 21:50:41 -07:00			`):`
- modified voice loading to copy on init - adjustments to the combine voices functionality - error handling and analysis 2024-12-31 18:55:26 -07:00			`"""Combine multiple voices into a new voice.`
Ruff Check + Format 2025-01-01 21:50:41 -07:00
- modified voice loading to copy on init - adjustments to the combine voices functionality - error handling and analysis 2024-12-31 18:55:26 -07:00			`Args:`
- Added support for combining voices via any endpoint - Updated the `process_voices` function to handle both string and list formats for voice input. 2025-01-07 03:50:08 -07:00			`request: Either a string with voices separated by + (e.g. "voice1+voice2")`
			`or a list of voice names to combine`
Ruff Check + Format 2025-01-01 21:50:41 -07:00
- modified voice loading to copy on init - adjustments to the combine voices functionality - error handling and analysis 2024-12-31 18:55:26 -07:00			`Returns:`
			`Dict with combined voice name and list of all available voices`
Ruff Check + Format 2025-01-01 21:50:41 -07:00
- modified voice loading to copy on init - adjustments to the combine voices functionality - error handling and analysis 2024-12-31 18:55:26 -07:00			`Raises:`
Ruff Check + Format 2025-01-01 21:50:41 -07:00			`HTTPException:`
- modified voice loading to copy on init - adjustments to the combine voices functionality - error handling and analysis 2024-12-31 18:55:26 -07:00			`- 400: Invalid request (wrong number of voices, voice not found)`
			`- 500: Server error (file system issues, combination failed)`
			`"""`
add ability to combine voices 2024-12-31 10:30:12 -05:00			`try:`
- Added support for combining voices via any endpoint - Updated the `process_voices` function to handle both string and list formats for voice input. 2025-01-07 03:50:08 -07:00			`combined_voice = await process_voices(request, tts_service)`
			`voices = await tts_service.list_voices()`
- modified voice loading to copy on init - adjustments to the combine voices functionality - error handling and analysis 2024-12-31 18:55:26 -07:00			`return {"voices": voices, "voice": combined_voice}`
Ruff Check + Format 2025-01-01 21:50:41 -07:00
- modified voice loading to copy on init - adjustments to the combine voices functionality - error handling and analysis 2024-12-31 18:55:26 -07:00			`except ValueError as e:`
			`logger.error(f"Invalid voice combination request: {str(e)}")`
			`raise HTTPException(`
Ruff Check + Format 2025-01-01 21:50:41 -07:00			`status_code=400, detail={"error": "Invalid request", "message": str(e)}`
- modified voice loading to copy on init - adjustments to the combine voices functionality - error handling and analysis 2024-12-31 18:55:26 -07:00			`)`
Ruff Check + Format 2025-01-01 21:50:41 -07:00
add ability to combine voices 2024-12-31 10:30:12 -05:00			`except Exception as e:`
- Added support for combining voices via any endpoint - Updated the `process_voices` function to handle both string and list formats for voice input. 2025-01-07 03:50:08 -07:00			`logger.error(f"Server error during voice combination: {str(e)}")`
- modified voice loading to copy on init - adjustments to the combine voices functionality - error handling and analysis 2024-12-31 18:55:26 -07:00			`raise HTTPException(`
- Added support for combining voices via any endpoint - Updated the `process_voices` function to handle both string and list formats for voice input. 2025-01-07 03:50:08 -07:00			`status_code=500, detail={"error": "Server error", "message": "Server error"}`
- modified voice loading to copy on init - adjustments to the combine voices functionality - error handling and analysis 2024-12-31 18:55:26 -07:00			`)`