2024-12-31 01:52:16 -07:00
from enum import Enum
2025-02-09 18:32:17 -07:00
from typing import List , Literal , Optional , Union
2024-12-31 02:55:51 -07:00
2025-01-13 20:15:46 -07:00
from pydantic import BaseModel , Field
2024-12-31 01:52:16 -07:00
2025-01-07 03:50:08 -07:00
class VoiceCombineRequest(BaseModel):
    """Request schema for the voice combination endpoint.

    The single required field, ``voices``, accepts two shapes:

    * a string in which voice names are joined with ``+``
      (e.g. ``"voice1+voice2"``), or
    * a list of voice names.
    """

    # Required (Ellipsis default): the request is rejected when it is missing.
    voices: Union[str, List[str]] = Field(
        ...,
        description="Either a string with voices separated by + (e.g. 'voice1+voice2') or a list of voice names to combine",
    )
2024-12-31 01:52:16 -07:00
class TTSStatus(str, Enum):
    """Status values for a TTS item.

    The ``str`` mixin makes every member an actual string, so members
    compare equal to their plain-text value (``TTSStatus.PENDING == "pending"``)
    and can be looked up from it (``TTSStatus("pending")``).
    """

    PENDING = "pending"
    PROCESSING = "processing"
    COMPLETED = "completed"
    FAILED = "failed"
    DELETED = "deleted"  # For files removed by cleanup
# OpenAI-compatible schemas
2025-02-04 19:41:41 -07:00
class WordTimestamp(BaseModel):
    """Word-level timestamp information.

    All three fields are required. Times are expressed in seconds, as
    stated in the field descriptions.
    """

    word: str = Field(..., description="The word or token")
    start_time: float = Field(..., description="Start time in seconds")
    end_time: float = Field(..., description="End time in seconds")
2025-02-09 18:32:17 -07:00
2025-02-04 19:41:41 -07:00
class CaptionedSpeechResponse(BaseModel):
    """Response schema for the captioned speech endpoint.

    Carries the generated audio as a string together with the name of its
    format and the list of per-word timestamps. All fields are required.
    """

    audio: str = Field(..., description="The generated audio data encoded in base 64")
    audio_format: str = Field(..., description="The format of the output audio")
    words: List[WordTimestamp] = Field(..., description="Word-level timestamps")
2025-02-11 19:09:35 -05:00
class NormalizationOptions(BaseModel):
    """Options for the normalization system.

    Every option is an independent boolean switch. All of them default to
    ``True`` except ``unit_normalization``, which defaults to ``False``.
    """

    normalize: bool = Field(
        default=True,
        description="Normalizes input text to make it easier for the model to say",
    )
    unit_normalization: bool = Field(
        default=False,
        description="Transforms units like 10KB to 10 kilobytes",
    )
    url_normalization: bool = Field(
        default=True,
        description="Changes urls so they can be properly pronounced by kokoro",
    )
    email_normalization: bool = Field(
        default=True,
        description="Changes emails so they can be properly pronounced by kokoro",
    )
    optional_pluralization_normalization: bool = Field(
        default=True,
        description="Replaces (s) with s so some words get pronounced correctly",
    )
2025-02-11 19:09:35 -05:00
2024-12-31 01:52:16 -07:00
class OpenAISpeechRequest(BaseModel):
    """Request schema for the OpenAI-compatible speech endpoint.

    Only ``input`` is required; every other field has a default.

    Fields:
        model: Model name; defaults to ``"kokoro"``.
        input: The text to generate audio for.
        voice: Voice name (base or combined); defaults to ``"af"``.
        response_format: One of the listed audio format names; defaults to
            ``"mp3"``.
        speed: Playback speed, validated to lie in ``[0.25, 4.0]``;
            defaults to ``1.0``.
        stream: Whether to stream the audio; defaults to ``True``.
        return_download_link: Whether to return a download link header;
            defaults to ``False``.
        lang_code: Optional language code; defaults to ``None``.
        normalization_options: Text normalization switches; defaults to a
            ``NormalizationOptions`` built with its own defaults.
    """

    model: str = Field(
        default="kokoro",
        description="The model to use for generation. Supported models: tts-1, tts-1-hd, kokoro",
    )
    input: str = Field(..., description="The text to generate audio for")
    voice: str = Field(
        default="af",
        description="The voice to use for generation. Can be a base voice or a combined voice name.",
    )
    # "aac" passes validation here even though the description below states
    # that AAC is not currently supported.
    response_format: Literal["mp3", "opus", "aac", "flac", "wav", "pcm"] = Field(
        default="mp3",
        description="The format to return audio in. Supported formats: mp3, opus, flac, wav, pcm. PCM format returns raw 16-bit samples without headers. AAC is not currently supported.",
    )
    speed: float = Field(
        default=1.0,
        ge=0.25,
        le=4.0,
        description="The speed of the generated audio. Select a value from 0.25 to 4.0.",
    )
    stream: bool = Field(
        default=True,  # Default to streaming for OpenAI compatibility
        description="If true (default), audio will be streamed as it's generated. Each chunk will be a complete sentence.",
    )
    return_download_link: bool = Field(
        default=False,
        description="If true, returns a download link in X-Download-Path header after streaming completes",
    )
    lang_code: Optional[str] = Field(
        default=None,
        description="Optional language code to use for text processing. If not provided, will use first letter of voice name.",
    )
    # NOTE(review): the default is an instance created once, when the class is
    # defined. Pydantic is expected to copy such defaults per model instance;
    # confirm for the pydantic version in use before mutating it in place.
    normalization_options: Optional[NormalizationOptions] = Field(
        default=NormalizationOptions(),
        description="Options for the normalization system",
    )
2025-02-04 19:41:41 -07:00
2025-02-09 18:32:17 -07:00
2025-02-04 19:41:41 -07:00
class CaptionedSpeechRequest(BaseModel):
    """Request schema for the captioned speech endpoint.

    Only ``input`` is required; every other field has a default.

    Fields:
        model: Model name; defaults to ``"kokoro"``.
        input: The text to generate audio for.
        voice: Voice name (base or combined); defaults to ``"af"``.
        response_format: One of the listed audio format names; defaults to
            ``"mp3"``.
        speed: Playback speed, validated to lie in ``[0.25, 4.0]``;
            defaults to ``1.0``.
        stream: Whether to stream the audio; defaults to ``True``.
        return_timestamps: Whether to include word-level timestamps in the
            response; defaults to ``True``.
        return_download_link: Whether to return a download link header;
            defaults to ``False``.
        lang_code: Optional language code; defaults to ``None``.
        normalization_options: Text normalization switches; defaults to a
            ``NormalizationOptions`` built with its own defaults.
    """

    model: str = Field(
        default="kokoro",
        description="The model to use for generation. Supported models: tts-1, tts-1-hd, kokoro",
    )
    input: str = Field(..., description="The text to generate audio for")
    voice: str = Field(
        default="af",
        description="The voice to use for generation. Can be a base voice or a combined voice name.",
    )
    # "aac" passes validation here even though the description below states
    # that AAC is not currently supported.
    response_format: Literal["mp3", "opus", "aac", "flac", "wav", "pcm"] = Field(
        default="mp3",
        description="The format to return audio in. Supported formats: mp3, opus, flac, wav, pcm. PCM format returns raw 16-bit samples without headers. AAC is not currently supported.",
    )
    speed: float = Field(
        default=1.0,
        ge=0.25,
        le=4.0,
        description="The speed of the generated audio. Select a value from 0.25 to 4.0.",
    )
    stream: bool = Field(
        default=True,  # Default to streaming for OpenAI compatibility
        description="If true (default), audio will be streamed as it's generated. Each chunk will be a complete sentence.",
    )
    return_timestamps: bool = Field(
        default=True,
        description="If true (default), returns word-level timestamps in the response",
    )
    return_download_link: bool = Field(
        default=False,
        description="If true, returns a download link in X-Download-Path header after streaming completes",
    )
    lang_code: Optional[str] = Field(
        default=None,
        description="Optional language code to use for text processing. If not provided, will use first letter of voice name.",
    )
    # NOTE(review): the default is an instance created once, when the class is
    # defined. Pydantic is expected to copy such defaults per model instance;
    # confirm for the pydantic version in use before mutating it in place.
    normalization_options: Optional[NormalizationOptions] = Field(
        default=NormalizationOptions(),
        description="Options for the normalization system",
    )