# Mirror of https://github.com/remsky/Kokoro-FastAPI.git (synced 2025-08-05)
from enum import Enum
from typing import List, Literal, Optional, Union
from pydantic import BaseModel, Field


class VoiceCombineRequest(BaseModel):
    """Payload for the voice-combination endpoint.

    The voices to merge may be given either as a single '+'-delimited
    string (e.g. 'voice1+voice2') or as an explicit list of voice names.
    """

    # Required; both input shapes are normalized downstream.
    voices: Union[str, List[str]] = Field(
        ...,
        description="Either a string with voices separated by + (e.g. 'voice1+voice2') or a list of voice names to combine",
    )


class TTSStatus(str, Enum):
    """Lifecycle states of a TTS generation job.

    Inherits from ``str`` so members compare equal to their raw string
    values and serialize naturally in JSON responses.
    """

    PENDING = "pending"
    PROCESSING = "processing"
    COMPLETED = "completed"
    FAILED = "failed"
    DELETED = "deleted"  # For files removed by cleanup


# OpenAI-compatible schemas
class WordTimestamp(BaseModel):
    """Word-level timestamp information.

    Times are offsets in seconds from the beginning of the generated audio.
    """

    word: str = Field(..., description="The word or token")
    start_time: float = Field(..., description="Start time in seconds")
    end_time: float = Field(..., description="End time in seconds")


class CaptionedSpeechResponse(BaseModel):
    """Response body for the captioned-speech endpoint."""

    # Audio payload is base64-encoded so it can travel inside JSON.
    audio: str = Field(..., description="The generated audio data encoded in base 64")
    audio_format: str = Field(..., description="The format of the output audio")
    # Required field, but its value may be None when timestamps were not requested.
    timestamps: Optional[List[WordTimestamp]] = Field(..., description="Word-level timestamps")

class NormalizationOptions(BaseModel):
    """Options for the text-normalization system applied before synthesis.

    Each flag toggles one normalization pass; all descriptions surface in
    the generated OpenAPI schema.
    """

    # Master switch — when False the input text reaches the model unchanged.
    normalize: bool = Field(default=True, description="Normalizes input text to make it easier for the model to say")
    unit_normalization: bool = Field(default=False, description="Transforms units like 10KB to 10 kilobytes")
    url_normalization: bool = Field(default=True, description="Changes urls so they can be properly pronounced by kokoro")
    email_normalization: bool = Field(default=True, description="Changes emails so they can be properly pronounced by kokoro")
    optional_pluralization_normalization: bool = Field(default=True, description="Replaces (s) with s so some words get pronounced correctly")
    phone_normalization: bool = Field(default=True, description="Changes phone numbers so they can be properly pronounced by kokoro")

class OpenAISpeechRequest(BaseModel):
    """Request schema for the OpenAI-compatible speech endpoint."""

    model: str = Field(
        default="kokoro",
        description="The model to use for generation. Supported models: tts-1, tts-1-hd, kokoro",
    )
    input: str = Field(..., description="The text to generate audio for")
    voice: str = Field(
        default="af_heart",
        description="The voice to use for generation. Can be a base voice or a combined voice name.",
    )
    response_format: Literal["mp3", "opus", "aac", "flac", "wav", "pcm"] = Field(
        default="mp3",
        description="The format to return audio in. Supported formats: mp3, opus, flac, wav, pcm. PCM format returns raw 16-bit samples without headers. AAC is not currently supported.",
    )
    download_format: Optional[Literal["mp3", "opus", "aac", "flac", "wav", "pcm"]] = Field(
        default=None,
        description="Optional different format for the final download. If not provided, uses response_format.",
    )
    speed: float = Field(
        default=1.0,
        ge=0.25,
        le=4.0,
        description="The speed of the generated audio. Select a value from 0.25 to 4.0.",
    )
    stream: bool = Field(
        default=True,  # Default to streaming for OpenAI compatibility
        description="If true (default), audio will be streamed as it's generated. Each chunk will be a complete sentence.",
    )
    return_download_link: bool = Field(
        default=False,
        description="If true, returns a download link in X-Download-Path header after streaming completes",
    )
    lang_code: Optional[str] = Field(
        default=None,
        description="Optional language code to use for text processing. If not provided, will use first letter of voice name.",
    )
    # default_factory so every request gets a fresh NormalizationOptions
    # instead of all requests sharing one class-level default instance.
    normalization_options: Optional[NormalizationOptions] = Field(
        default_factory=NormalizationOptions,
        description="Options for the normalization system",
    )


class CaptionedSpeechRequest(BaseModel):
    """Request schema for the captioned speech endpoint."""

    model: str = Field(
        default="kokoro",
        description="The model to use for generation. Supported models: tts-1, tts-1-hd, kokoro",
    )
    input: str = Field(..., description="The text to generate audio for")
    voice: str = Field(
        default="af_heart",
        description="The voice to use for generation. Can be a base voice or a combined voice name.",
    )
    response_format: Literal["mp3", "opus", "aac", "flac", "wav", "pcm"] = Field(
        default="mp3",
        description="The format to return audio in. Supported formats: mp3, opus, flac, wav, pcm. PCM format returns raw 16-bit samples without headers. AAC is not currently supported.",
    )
    speed: float = Field(
        default=1.0,
        ge=0.25,
        le=4.0,
        description="The speed of the generated audio. Select a value from 0.25 to 4.0.",
    )
    stream: bool = Field(
        default=True,  # Default to streaming for OpenAI compatibility
        description="If true (default), audio will be streamed as it's generated. Each chunk will be a complete sentence.",
    )
    return_timestamps: bool = Field(
        default=True,
        description="If true (default), returns word-level timestamps in the response",
    )
    return_download_link: bool = Field(
        default=False,
        description="If true, returns a download link in X-Download-Path header after streaming completes",
    )
    lang_code: Optional[str] = Field(
        default=None,
        description="Optional language code to use for text processing. If not provided, will use first letter of voice name.",
    )
    # default_factory so every request gets a fresh NormalizationOptions
    # instead of all requests sharing one class-level default instance.
    normalization_options: Optional[NormalizationOptions] = Field(
        default_factory=NormalizationOptions,
        description="Options for the normalization system",
    )