# Kokoro-FastAPI/api/src/structures/schemas.py

from enum import Enum
from typing import List, Literal, Optional, Union

from pydantic import BaseModel, Field


class VoiceCombineRequest(BaseModel):
    """Request schema for voice combination endpoint that accepts either a string with + or a list"""

    # Required field (`...` means "no default"). Both accepted shapes are stored
    # as given: this schema declares no validator, so splitting the
    # "+"-separated form is left to whoever consumes the request.
    voices: Union[str, List[str]] = Field(
        ...,
        description="Either a string with voices separated by + (e.g. 'voice1+voice2') or a list of voice names to combine",
    )


class TTSStatus(str, Enum):
    """String-valued status codes.

    Because the enum also derives from ``str``, each member compares equal to
    its plain lowercase string value (e.g. ``TTSStatus.PENDING == "pending"``).
    """

    PENDING = "pending"
    PROCESSING = "processing"
    COMPLETED = "completed"
    FAILED = "failed"
    # For files removed by cleanup
    DELETED = "deleted"


# OpenAI-compatible schemas
class WordTimestamp(BaseModel):
    """Word-level timestamp information"""

    # All three fields are required (`...` = no default). Nothing here enforces
    # that end_time is not earlier than start_time; producers must ensure it.
    word: str = Field(
        ...,
        description="The word or token",
    )
    start_time: float = Field(
        ...,
        description="Start time in seconds",
    )
    end_time: float = Field(
        ...,
        description="End time in seconds",
    )


class CaptionedSpeechResponse(BaseModel):
    """Response schema for captioned speech endpoint"""

    # Every field is required (`...` = no default).
    # `audio` carries the audio as base64 text rather than raw bytes.
    audio: str = Field(
        ...,
        description="The generated audio data encoded in base 64",
    )
    # Free-form string here; it is not restricted to the request's Literal set.
    audio_format: str = Field(
        ...,
        description="The format of the output audio",
    )
    words: List[WordTimestamp] = Field(
        ...,
        description="Word-level timestamps",
    )

class NormalizationOptions(BaseModel):
    """Options for the normalization system"""

    # Every option is a boolean with a default, so an empty object is a valid
    # NormalizationOptions. `unit_normalization` is the only one off by default.
    # NOTE(review): whether `normalize=False` also disables the specific
    # options below is decided by the code consuming this model — confirm there.
    normalize: bool = Field(
        default=True,
        description="Normalizes input text to make it easier for the model to say",
    )
    unit_normalization: bool = Field(
        default=False,
        description="Transforms units like 10KB to 10 kilobytes",
    )
    url_normalization: bool = Field(
        default=True,
        description="Changes urls so they can be properly pronounced by kokoro",
    )
    email_normalization: bool = Field(
        default=True,
        description="Changes emails so they can be properly pronounced by kokoro",
    )
    optional_pluralization_normalization: bool = Field(
        default=True,
        description="Replaces (s) with s so some words get pronounced correctly",
    )
    
class OpenAISpeechRequest(BaseModel):
    """Request schema for OpenAI-compatible speech endpoint"""

    # Plain string: the model name is not validated against the list in the
    # description at this layer.
    model: str = Field(
        default="kokoro",
        description="The model to use for generation. Supported models: tts-1, tts-1-hd, kokoro",
    )
    # The only required field of the request.
    input: str = Field(..., description="The text to generate audio for")
    voice: str = Field(
        default="af",
        description="The voice to use for generation. Can be a base voice or a combined voice name.",
    )
    # "aac" passes validation even though the description says it is not
    # currently supported.
    # NOTE(review): confirm where an "aac" request is rejected downstream.
    response_format: Literal["mp3", "opus", "aac", "flac", "wav", "pcm"] = Field(
        default="mp3",
        description="The format to return audio in. Supported formats: mp3, opus, flac, wav, pcm. PCM format returns raw 16-bit samples without headers. AAC is not currently supported.",
    )
    # Bounds are inclusive (ge / le), so 0.25 and 4.0 are both accepted.
    speed: float = Field(
        default=1.0,
        ge=0.25,
        le=4.0,
        description="The speed of the generated audio. Select a value from 0.25 to 4.0.",
    )
    # Default to streaming for OpenAI compatibility
    stream: bool = Field(
        default=True,
        description="If true (default), audio will be streamed as it's generated. Each chunk will be a complete sentence.",
    )
    return_download_link: bool = Field(
        default=False,
        description="If true, returns a download link in X-Download-Path header after streaming completes",
    )
    lang_code: Optional[str] = Field(
        default=None,
        description="Optional language code to use for text processing. If not provided, will use first letter of voice name.",
    )
    # Omitting the field yields an all-defaults NormalizationOptions. The
    # Optional annotation also lets an explicit null through as None, so
    # consumers should not assume an instance is always present.
    normalization_options: Optional[NormalizationOptions] = Field(
        default=NormalizationOptions(),
        description="Options for the normalization system",
    )


class CaptionedSpeechRequest(BaseModel):
    """Request schema for captioned speech endpoint"""

    # Field set mirrors OpenAISpeechRequest, with `return_timestamps` added.
    # Plain string: the model name is not validated against the list in the
    # description at this layer.
    model: str = Field(
        default="kokoro",
        description="The model to use for generation. Supported models: tts-1, tts-1-hd, kokoro",
    )
    # The only required field of the request.
    input: str = Field(..., description="The text to generate audio for")
    voice: str = Field(
        default="af",
        description="The voice to use for generation. Can be a base voice or a combined voice name.",
    )
    # "aac" passes validation even though the description says it is not
    # currently supported.
    # NOTE(review): confirm where an "aac" request is rejected downstream.
    response_format: Literal["mp3", "opus", "aac", "flac", "wav", "pcm"] = Field(
        default="mp3",
        description="The format to return audio in. Supported formats: mp3, opus, flac, wav, pcm. PCM format returns raw 16-bit samples without headers. AAC is not currently supported.",
    )
    # Bounds are inclusive (ge / le), so 0.25 and 4.0 are both accepted.
    speed: float = Field(
        default=1.0,
        ge=0.25,
        le=4.0,
        description="The speed of the generated audio. Select a value from 0.25 to 4.0.",
    )
    # Default to streaming for OpenAI compatibility
    stream: bool = Field(
        default=True,
        description="If true (default), audio will be streamed as it's generated. Each chunk will be a complete sentence.",
    )
    # Specific to the captioned endpoint; on unless the caller opts out.
    return_timestamps: bool = Field(
        default=True,
        description="If true (default), returns word-level timestamps in the response",
    )
    return_download_link: bool = Field(
        default=False,
        description="If true, returns a download link in X-Download-Path header after streaming completes",
    )
    lang_code: Optional[str] = Field(
        default=None,
        description="Optional language code to use for text processing. If not provided, will use first letter of voice name.",
    )
    # Omitting the field yields an all-defaults NormalizationOptions. The
    # Optional annotation also lets an explicit null through as None, so
    # consumers should not assume an instance is always present.
    normalization_options: Optional[NormalizationOptions] = Field(
        default=NormalizationOptions(),
        description="Options for the normalization system",
    )
- Complete TTS endpoint replacement with OpenAI compatible -Removed output directory, and update configuration settings - Added benchmarking for entire novel 2024-12-31 01:52:16 -07:00			`from enum import Enum`
Ruff check + formatting 2025-02-09 18:32:17 -07:00			`from typing import List, Literal, Optional, Union`
Refactor TTS API and enhance testing setup with coverage and logging improvements 2024-12-31 02:55:51 -07:00
Ruff checks, ci fix 2025-01-13 20:15:46 -07:00			`from pydantic import BaseModel, Field`
- Complete TTS endpoint replacement with OpenAI compatible -Removed output directory, and update configuration settings - Added benchmarking for entire novel 2024-12-31 01:52:16 -07:00

- Added support for combining voices via any endpoint - Updated the `process_voices` function to handle both string and list formats for voice input. 2025-01-07 03:50:08 -07:00			`class VoiceCombineRequest(BaseModel):`
			`"""Request schema for voice combination endpoint that accepts either a string with + or a list"""`
Ruff format + fix 2025-01-09 18:41:44 -07:00
- Added support for combining voices via any endpoint - Updated the `process_voices` function to handle both string and list formats for voice input. 2025-01-07 03:50:08 -07:00			`voices: Union[str, List[str]] = Field(`
			`...,`
Ruff format + fix 2025-01-09 18:41:44 -07:00			`description="Either a string with voices separated by + (e.g. 'voice1+voice2') or a list of voice names to combine",`
- Added support for combining voices via any endpoint - Updated the `process_voices` function to handle both string and list formats for voice input. 2025-01-07 03:50:08 -07:00			`)`


- Complete TTS endpoint replacement with OpenAI compatible -Removed output directory, and update configuration settings - Added benchmarking for entire novel 2024-12-31 01:52:16 -07:00			`class TTSStatus(str, Enum):`
			`PENDING = "pending"`
			`PROCESSING = "processing"`
			`COMPLETED = "completed"`
			`FAILED = "failed"`
			`DELETED = "deleted" # For files removed by cleanup`


			`# OpenAI-compatible schemas`
Update dependencies, enhance voice management, and add captioned speech support 2025-02-04 19:41:41 -07:00			`class WordTimestamp(BaseModel):`
			`"""Word-level timestamp information"""`
Ruff check + formatting 2025-02-09 18:32:17 -07:00
Update dependencies, enhance voice management, and add captioned speech support 2025-02-04 19:41:41 -07:00			`word: str = Field(..., description="The word or token")`
			`start_time: float = Field(..., description="Start time in seconds")`
			`end_time: float = Field(..., description="End time in seconds")`

Ruff check + formatting 2025-02-09 18:32:17 -07:00
Update dependencies, enhance voice management, and add captioned speech support 2025-02-04 19:41:41 -07:00			`class CaptionedSpeechResponse(BaseModel):`
			`"""Response schema for captioned speech endpoint"""`
Ruff check + formatting 2025-02-09 18:32:17 -07:00
streaming word level time stamps 2025-02-14 13:37:42 -05:00			`audio: str = Field(..., description="The generated audio data encoded in base 64")`
			`audio_format: str = Field(..., description="The format of the output audio")`
Update dependencies, enhance voice management, and add captioned speech support 2025-02-04 19:41:41 -07:00			`words: List[WordTimestamp] = Field(..., description="Word-level timestamps")`

Added normilization options 2025-02-11 19:09:35 -05:00			`class NormalizationOptions(BaseModel):`
			`"""Options for the normalization system"""`
			`normalize: bool = Field(default=True, description="Normalizes input text to make it easier for the model to say")`
			`unit_normalization: bool = Field(default=False,description="Transforms units like 10KB to 10 kilobytes")`
			`url_normalization: bool = Field(default=True, description="Changes urls so they can be properly pronouced by kokoro")`
			`email_normalization: bool = Field(default=True, description="Changes emails so they can be properly pronouced by kokoro")`
added optional pluralization normalization 2025-02-11 19:24:29 -05:00			`optional_pluralization_normalization: bool = Field(default=True, description="Replaces (s) with s so some words get pronounced correctly")`
Added normilization options 2025-02-11 19:09:35 -05:00
- Complete TTS endpoint replacement with OpenAI compatible -Removed output directory, and update configuration settings - Added benchmarking for entire novel 2024-12-31 01:52:16 -07:00			`class OpenAISpeechRequest(BaseModel):`
Update dependencies, enhance voice management, and add captioned speech support 2025-02-04 19:41:41 -07:00			`"""Request schema for OpenAI-compatible speech endpoint"""`
Ruff check + formatting 2025-02-09 18:32:17 -07:00
Enhance web player information, adjust text chunk size, update audio wave settings, and implement OpenAI model mappings 2025-01-23 04:11:31 -07:00			`model: str = Field(`
			`default="kokoro",`
Ruff check + formatting 2025-02-09 18:32:17 -07:00			`description="The model to use for generation. Supported models: tts-1, tts-1-hd, kokoro",`
Enhance web player information, adjust text chunk size, update audio wave settings, and implement OpenAI model mappings 2025-01-23 04:11:31 -07:00			`)`
- Complete TTS endpoint replacement with OpenAI compatible -Removed output directory, and update configuration settings - Added benchmarking for entire novel 2024-12-31 01:52:16 -07:00			`input: str = Field(..., description="The text to generate audio for")`
- modified voice loading to copy on init - adjustments to the combine voices functionality - error handling and analysis 2024-12-31 18:55:26 -07:00			`voice: str = Field(`
Ruff Check + Format 2025-01-01 21:50:41 -07:00			`default="af",`
			`description="The voice to use for generation. Can be a base voice or a combined voice name.",`
- modified voice loading to copy on init - adjustments to the combine voices functionality - error handling and analysis 2024-12-31 18:55:26 -07:00			`)`
- Complete TTS endpoint replacement with OpenAI compatible -Removed output directory, and update configuration settings - Added benchmarking for entire novel 2024-12-31 01:52:16 -07:00			`response_format: Literal["mp3", "opus", "aac", "flac", "wav", "pcm"] = Field(`
			`default="mp3",`
First streaming attempt 2025-01-04 17:54:54 -07:00			`description="The format to return audio in. Supported formats: mp3, opus, flac, wav, pcm. PCM format returns raw 16-bit samples without headers. AAC is not currently supported.",`
- Complete TTS endpoint replacement with OpenAI compatible -Removed output directory, and update configuration settings - Added benchmarking for entire novel 2024-12-31 01:52:16 -07:00			`)`
			`speed: float = Field(`
			`default=1.0,`
			`ge=0.25,`
			`le=4.0,`
Enhance TTS API with logging, voice pack loading, and schema updates 2024-12-31 01:57:00 -07:00			`description="The speed of the generated audio. Select a value from 0.25 to 4.0.",`
- Complete TTS endpoint replacement with OpenAI compatible -Removed output directory, and update configuration settings - Added benchmarking for entire novel 2024-12-31 01:52:16 -07:00			`)`
First streaming attempt 2025-01-04 17:54:54 -07:00			`stream: bool = Field(`
Swapped generator to preprocessing 2025-01-04 22:23:59 -07:00			`default=True, # Default to streaming for OpenAI compatibility`
			`description="If true (default), audio will be streamed as it's generated. Each chunk will be a complete sentence.",`
First streaming attempt 2025-01-04 17:54:54 -07:00			`)`
Implement temporary file management on openai endpoint, whole file downloads 2025-01-29 04:09:38 -07:00			`return_download_link: bool = Field(`
			`default=False,`
			`description="If true, returns a download link in X-Download-Path header after streaming completes",`
			`)`
-fix voice selection not matching language phonemes -added voice language override parameter 2025-02-08 01:29:15 -07:00			`lang_code: Optional[str] = Field(`
			`default=None,`
			`description="Optional language code to use for text processing. If not provided, will use first letter of voice name.",`
			`)`
Added normilization options 2025-02-11 19:09:35 -05:00			`normalization_options: Optional[NormalizationOptions] = Field(`
			`default= NormalizationOptions(),`
			`description= "Options for the normalization system"`
			`)`
Update dependencies, enhance voice management, and add captioned speech support 2025-02-04 19:41:41 -07:00
Ruff check + formatting 2025-02-09 18:32:17 -07:00
Update dependencies, enhance voice management, and add captioned speech support 2025-02-04 19:41:41 -07:00			`class CaptionedSpeechRequest(BaseModel):`
			`"""Request schema for captioned speech endpoint"""`
Ruff check + formatting 2025-02-09 18:32:17 -07:00
Update dependencies, enhance voice management, and add captioned speech support 2025-02-04 19:41:41 -07:00			`model: str = Field(`
			`default="kokoro",`
Ruff check + formatting 2025-02-09 18:32:17 -07:00			`description="The model to use for generation. Supported models: tts-1, tts-1-hd, kokoro",`
Update dependencies, enhance voice management, and add captioned speech support 2025-02-04 19:41:41 -07:00			`)`
			`input: str = Field(..., description="The text to generate audio for")`
			`voice: str = Field(`
			`default="af",`
			`description="The voice to use for generation. Can be a base voice or a combined voice name.",`
			`)`
			`response_format: Literal["mp3", "opus", "aac", "flac", "wav", "pcm"] = Field(`
			`default="mp3",`
			`description="The format to return audio in. Supported formats: mp3, opus, flac, wav, pcm. PCM format returns raw 16-bit samples without headers. AAC is not currently supported.",`
			`)`
			`speed: float = Field(`
			`default=1.0,`
			`ge=0.25,`
			`le=4.0,`
			`description="The speed of the generated audio. Select a value from 0.25 to 4.0.",`
			`)`
Started work on allowing streaming word level timestamps as well as transitioning the dev code so it uses a lot more from the open ai endpoint 2025-02-13 18:00:03 -05:00			`stream: bool = Field(`
			`default=True, # Default to streaming for OpenAI compatibility`
			`description="If true (default), audio will be streamed as it's generated. Each chunk will be a complete sentence.",`
			`)`
Update dependencies, enhance voice management, and add captioned speech support 2025-02-04 19:41:41 -07:00			`return_timestamps: bool = Field(`
			`default=True,`
			`description="If true (default), returns word-level timestamps in the response",`
			`)`
Started work on allowing streaming word level timestamps as well as transitioning the dev code so it uses a lot more from the open ai endpoint 2025-02-13 18:00:03 -05:00			`return_download_link: bool = Field(`
			`default=False,`
			`description="If true, returns a download link in X-Download-Path header after streaming completes",`
			`)`
-fix voice selection not matching language phonemes -added voice language override parameter 2025-02-08 01:29:15 -07:00			`lang_code: Optional[str] = Field(`
			`default=None,`
			`description="Optional language code to use for text processing. If not provided, will use first letter of voice name.",`
			`)`
Started work on allowing streaming word level timestamps as well as transitioning the dev code so it uses a lot more from the open ai endpoint 2025-02-13 18:00:03 -05:00			`normalization_options: Optional[NormalizationOptions] = Field(`
			`default= NormalizationOptions(),`
			`description= "Options for the normalization system"`
			`)`