Kokoro-FastAPI/api/src/structures/schemas.py

from enum import Enum
from typing import Literal, Union, List

from pydantic import Field, BaseModel


class VoiceCombineRequest(BaseModel):
    """Request schema for voice combination endpoint that accepts either a string with + or a list."""

    voices: Union[str, List[str]] = Field(
        ...,
        description="Either a string with voices separated by + (e.g. 'voice1+voice2') or a list of voice names to combine",
    )
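
# Illustrative usage sketch (not part of the original module); the voice names are
# placeholders taken from the field description above. Both payload shapes validate:
#
#     VoiceCombineRequest(voices="voice1+voice2")        # '+'-joined string form
#     VoiceCombineRequest(voices=["voice1", "voice2"])   # explicit list form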


class TTSStatus(str, Enum):
    PENDING = "pending"
    PROCESSING = "processing"
    COMPLETED = "completed"
    FAILED = "failed"
    DELETED = "deleted"  # For files removed by cleanup


# OpenAI-compatible schemas
class OpenAISpeechRequest(BaseModel):
    model: Literal["tts-1", "tts-1-hd", "kokoro"] = "kokoro"
    input: str = Field(..., description="The text to generate audio for")
    voice: str = Field(
        default="af",
        description="The voice to use for generation. Can be a base voice or a combined voice name.",
    )
    response_format: Literal["mp3", "opus", "aac", "flac", "wav", "pcm"] = Field(
        default="mp3",
        description="The format to return audio in. Supported formats: mp3, opus, flac, wav, pcm. PCM format returns raw 16-bit samples without headers. AAC is not currently supported.",
    )
    speed: float = Field(
        default=1.0,
        ge=0.25,
        le=4.0,
        description="The speed of the generated audio. Select a value from 0.25 to 4.0.",
    )
    stream: bool = Field(
        default=True,  # Default to streaming for OpenAI compatibility
        description="If true (default), audio will be streamed as it's generated. Each chunk will be a complete sentence.",
    )
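

if __name__ == "__main__":
    # Minimal usage sketch (illustrative only, not part of the original module):
    # construct a request relying on the defaults declared above and inspect it.
    request = OpenAISpeechRequest(input="Hello world")
    print(request.model, request.voice, request.response_format, request.speed, request.stream)
    # With the defaults above this prints: kokoro af mp3 1.0 True

    # The combination schema accepts either payload shape shown earlier.
    combo = VoiceCombineRequest(voices=["voice1", "voice2"])
    print(combo.voices)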