# Kokoro-FastAPI/api/src/structures/schemas.py
from enum import Enum
from typing import List, Literal, Optional, Union

from pydantic import BaseModel, Field
class VoiceCombineRequest(BaseModel):
    """Request schema for the voice-combination endpoint.

    Accepts either a single '+'-delimited string (e.g. 'voice1+voice2')
    or an explicit list of voice names to merge into one combined voice.
    """

    # Union keeps backward compatibility: older clients send the '+' string
    # form, newer ones send a JSON list.
    voices: Union[str, List[str]] = Field(
        ...,
        description="Either a string with voices separated by + (e.g. 'voice1+voice2') or a list of voice names to combine",
    )
class TTSStatus(str, Enum):
    """Lifecycle states of a TTS generation job.

    Subclasses ``str`` so members compare equal to (and serialize as)
    their plain string values.
    """

    PENDING = "pending"
    PROCESSING = "processing"
    COMPLETED = "completed"
    FAILED = "failed"
    DELETED = "deleted"  # For files removed by cleanup
# OpenAI-compatible schemas
class WordTimestamp(BaseModel):
    """Timing information for a single word (or token) of generated speech."""

    # All required fields; times are expressed in seconds from audio start.
    word: str = Field(..., description="The word or token")
    start_time: float = Field(..., description="Start time in seconds")
    end_time: float = Field(..., description="End time in seconds")
class CaptionedSpeechResponse(BaseModel):
    """Response payload for the captioned-speech endpoint: audio plus timings."""

    audio: bytes = Field(..., description="The generated audio data")
    words: List[WordTimestamp] = Field(..., description="Word-level timestamps")
class OpenAISpeechRequest(BaseModel):
    """Request schema for the OpenAI-compatible ``/v1/audio/speech`` endpoint.

    Mirrors the OpenAI TTS request shape (model / input / voice /
    response_format / speed) and adds service-specific options for
    streaming, download links, and language selection.
    """

    model: str = Field(
        default="kokoro",
        description="The model to use for generation. Supported models: tts-1, tts-1-hd, kokoro"
    )
    input: str = Field(..., description="The text to generate audio for")
    voice: str = Field(
        default="af",
        description="The voice to use for generation. Can be a base voice or a combined voice name.",
    )
    # NOTE(review): the Literal accepts "aac" but the description states AAC
    # is not currently supported — confirm intended behavior with the handler.
    response_format: Literal["mp3", "opus", "aac", "flac", "wav", "pcm"] = Field(
        default="mp3",
        description="The format to return audio in. Supported formats: mp3, opus, flac, wav, pcm. PCM format returns raw 16-bit samples without headers. AAC is not currently supported.",
    )
    speed: float = Field(
        default=1.0,
        ge=0.25,
        le=4.0,
        description="The speed of the generated audio. Select a value from 0.25 to 4.0.",
    )
    stream: bool = Field(
        default=True,  # Default to streaming for OpenAI compatibility
        description="If true (default), audio will be streamed as it's generated. Each chunk will be a complete sentence.",
    )
    return_download_link: bool = Field(
        default=False,
        description="If true, returns a download link in X-Download-Path header after streaming completes",
    )
    lang_code: Optional[str] = Field(
        default=None,
        description="Optional language code to use for text processing. If not provided, will use first letter of voice name.",
    )
class CaptionedSpeechRequest(BaseModel):
    """Request schema for the captioned-speech endpoint.

    Same generation options as the OpenAI-compatible request, plus a flag
    to include word-level timestamps in the response.
    """

    model: str = Field(
        default="kokoro",
        description="The model to use for generation. Supported models: tts-1, tts-1-hd, kokoro"
    )
    input: str = Field(..., description="The text to generate audio for")
    voice: str = Field(
        default="af",
        description="The voice to use for generation. Can be a base voice or a combined voice name.",
    )
    response_format: Literal["mp3", "opus", "aac", "flac", "wav", "pcm"] = Field(
        default="mp3",
        description="The format to return audio in. Supported formats: mp3, opus, flac, wav, pcm. PCM format returns raw 16-bit samples without headers. AAC is not currently supported.",
    )
    speed: float = Field(
        default=1.0,
        ge=0.25,
        le=4.0,
        description="The speed of the generated audio. Select a value from 0.25 to 4.0.",
    )
    return_timestamps: bool = Field(
        default=True,
        description="If true (default), returns word-level timestamps in the response",
    )
    lang_code: Optional[str] = Field(
        default=None,
        description="Optional language code to use for text processing. If not provided, will use first letter of voice name.",
    )