mirror of
https://github.com/remsky/Kokoro-FastAPI.git
synced 2025-08-05 16:48:53 +00:00
Merge pull request #316 from JCallicoat/volume_setting
Add a volume multiplier setting
This commit is contained in:
commit
223d3077c2
6 changed files with 25 additions and 1 deletions
|
@@ -31,6 +31,7 @@ class Settings(BaseSettings):
|
||||||
|
|
||||||
# Audio Settings
|
# Audio Settings
|
||||||
sample_rate: int = 24000
|
sample_rate: int = 24000
|
||||||
|
default_volume_multiplier: float = 1.0
|
||||||
# Text Processing Settings
|
# Text Processing Settings
|
||||||
target_min_tokens: int = 175 # Target minimum tokens per chunk
|
target_min_tokens: int = 175 # Target minimum tokens per chunk
|
||||||
target_max_tokens: int = 250 # Target maximum tokens per chunk
|
target_max_tokens: int = 250 # Target maximum tokens per chunk
|
||||||
|
|
|
@@ -141,6 +141,8 @@ Model files not found! You need to download the Kokoro V1 model:
|
||||||
|
|
||||||
try:
|
try:
|
||||||
async for chunk in self._backend.generate(*args, **kwargs):
|
async for chunk in self._backend.generate(*args, **kwargs):
|
||||||
|
if settings.default_volume_multiplier != 1.0:
|
||||||
|
chunk.audio *= settings.default_volume_multiplier
|
||||||
yield chunk
|
yield chunk
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
raise RuntimeError(f"Generation failed: {e}")
|
raise RuntimeError(f"Generation failed: {e}")
|
||||||
|
|
|
@@ -319,6 +319,7 @@ async def create_captioned_speech(
|
||||||
writer=writer,
|
writer=writer,
|
||||||
speed=request.speed,
|
speed=request.speed,
|
||||||
return_timestamps=request.return_timestamps,
|
return_timestamps=request.return_timestamps,
|
||||||
|
volume_multiplier=request.volume_multiplier,
|
||||||
normalization_options=request.normalization_options,
|
normalization_options=request.normalization_options,
|
||||||
lang_code=request.lang_code,
|
lang_code=request.lang_code,
|
||||||
)
|
)
|
||||||
|
|
|
@@ -152,6 +152,7 @@ async def stream_audio_chunks(
|
||||||
speed=request.speed,
|
speed=request.speed,
|
||||||
output_format=request.response_format,
|
output_format=request.response_format,
|
||||||
lang_code=request.lang_code,
|
lang_code=request.lang_code,
|
||||||
|
volume_multiplier=request.volume_multiplier,
|
||||||
normalization_options=request.normalization_options,
|
normalization_options=request.normalization_options,
|
||||||
return_timestamps=unique_properties["return_timestamps"],
|
return_timestamps=unique_properties["return_timestamps"],
|
||||||
):
|
):
|
||||||
|
@@ -300,6 +301,7 @@ async def create_speech(
|
||||||
voice=voice_name,
|
voice=voice_name,
|
||||||
writer=writer,
|
writer=writer,
|
||||||
speed=request.speed,
|
speed=request.speed,
|
||||||
|
volume_multiplier=request.volume_multiplier,
|
||||||
normalization_options=request.normalization_options,
|
normalization_options=request.normalization_options,
|
||||||
lang_code=request.lang_code,
|
lang_code=request.lang_code,
|
||||||
)
|
)
|
||||||
|
|
|
@@ -55,6 +55,7 @@ class TTSService:
|
||||||
output_format: Optional[str] = None,
|
output_format: Optional[str] = None,
|
||||||
is_first: bool = False,
|
is_first: bool = False,
|
||||||
is_last: bool = False,
|
is_last: bool = False,
|
||||||
|
volume_multiplier: Optional[float] = 1.0,
|
||||||
normalizer: Optional[AudioNormalizer] = None,
|
normalizer: Optional[AudioNormalizer] = None,
|
||||||
lang_code: Optional[str] = None,
|
lang_code: Optional[str] = None,
|
||||||
return_timestamps: Optional[bool] = False,
|
return_timestamps: Optional[bool] = False,
|
||||||
|
@@ -100,6 +101,7 @@ class TTSService:
|
||||||
lang_code=lang_code,
|
lang_code=lang_code,
|
||||||
return_timestamps=return_timestamps,
|
return_timestamps=return_timestamps,
|
||||||
):
|
):
|
||||||
|
chunk_data.audio*=volume_multiplier
|
||||||
# For streaming, convert to bytes
|
# For streaming, convert to bytes
|
||||||
if output_format:
|
if output_format:
|
||||||
try:
|
try:
|
||||||
|
@@ -132,7 +134,7 @@ class TTSService:
|
||||||
speed=speed,
|
speed=speed,
|
||||||
return_timestamps=return_timestamps,
|
return_timestamps=return_timestamps,
|
||||||
)
|
)
|
||||||
|
|
||||||
if chunk_data.audio is None:
|
if chunk_data.audio is None:
|
||||||
logger.error("Model generated None for audio chunk")
|
logger.error("Model generated None for audio chunk")
|
||||||
return
|
return
|
||||||
|
@@ -141,6 +143,8 @@ class TTSService:
|
||||||
logger.error("Model generated empty audio chunk")
|
logger.error("Model generated empty audio chunk")
|
||||||
return
|
return
|
||||||
|
|
||||||
|
chunk_data.audio*=volume_multiplier
|
||||||
|
|
||||||
# For streaming, convert to bytes
|
# For streaming, convert to bytes
|
||||||
if output_format:
|
if output_format:
|
||||||
try:
|
try:
|
||||||
|
@@ -259,6 +263,7 @@ class TTSService:
|
||||||
speed: float = 1.0,
|
speed: float = 1.0,
|
||||||
output_format: str = "wav",
|
output_format: str = "wav",
|
||||||
lang_code: Optional[str] = None,
|
lang_code: Optional[str] = None,
|
||||||
|
volume_multiplier: Optional[float] = 1.0,
|
||||||
normalization_options: Optional[NormalizationOptions] = NormalizationOptions(),
|
normalization_options: Optional[NormalizationOptions] = NormalizationOptions(),
|
||||||
return_timestamps: Optional[bool] = False,
|
return_timestamps: Optional[bool] = False,
|
||||||
) -> AsyncGenerator[AudioChunk, None]:
|
) -> AsyncGenerator[AudioChunk, None]:
|
||||||
|
@@ -300,6 +305,7 @@ class TTSService:
|
||||||
formatted_pause_chunk = await AudioService.convert_audio(
|
formatted_pause_chunk = await AudioService.convert_audio(
|
||||||
pause_chunk, output_format, writer, speed=speed, chunk_text="",
|
pause_chunk, output_format, writer, speed=speed, chunk_text="",
|
||||||
is_last_chunk=False, trim_audio=False, normalizer=stream_normalizer,
|
is_last_chunk=False, trim_audio=False, normalizer=stream_normalizer,
|
||||||
|
|
||||||
)
|
)
|
||||||
if formatted_pause_chunk.output:
|
if formatted_pause_chunk.output:
|
||||||
yield formatted_pause_chunk
|
yield formatted_pause_chunk
|
||||||
|
@@ -330,6 +336,7 @@ class TTSService:
|
||||||
writer,
|
writer,
|
||||||
output_format,
|
output_format,
|
||||||
is_first=(chunk_index == 0),
|
is_first=(chunk_index == 0),
|
||||||
|
volume_multiplier=volume_multiplier,
|
||||||
is_last=False, # We'll update the last chunk later
|
is_last=False, # We'll update the last chunk later
|
||||||
normalizer=stream_normalizer,
|
normalizer=stream_normalizer,
|
||||||
lang_code=pipeline_lang_code, # Pass lang_code
|
lang_code=pipeline_lang_code, # Pass lang_code
|
||||||
|
@@ -377,6 +384,7 @@ class TTSService:
|
||||||
output_format,
|
output_format,
|
||||||
is_first=False,
|
is_first=False,
|
||||||
is_last=True, # Signal this is the last chunk
|
is_last=True, # Signal this is the last chunk
|
||||||
|
volume_multiplier=volume_multiplier,
|
||||||
normalizer=stream_normalizer,
|
normalizer=stream_normalizer,
|
||||||
lang_code=pipeline_lang_code, # Pass lang_code
|
lang_code=pipeline_lang_code, # Pass lang_code
|
||||||
):
|
):
|
||||||
|
@@ -396,6 +404,7 @@ class TTSService:
|
||||||
writer: StreamingAudioWriter,
|
writer: StreamingAudioWriter,
|
||||||
speed: float = 1.0,
|
speed: float = 1.0,
|
||||||
return_timestamps: bool = False,
|
return_timestamps: bool = False,
|
||||||
|
volume_multiplier: Optional[float] = 1.0,
|
||||||
normalization_options: Optional[NormalizationOptions] = NormalizationOptions(),
|
normalization_options: Optional[NormalizationOptions] = NormalizationOptions(),
|
||||||
lang_code: Optional[str] = None,
|
lang_code: Optional[str] = None,
|
||||||
) -> AudioChunk:
|
) -> AudioChunk:
|
||||||
|
@@ -408,6 +417,7 @@ class TTSService:
|
||||||
voice,
|
voice,
|
||||||
writer,
|
writer,
|
||||||
speed=speed,
|
speed=speed,
|
||||||
|
volume_multiplier=volume_multiplier,
|
||||||
normalization_options=normalization_options,
|
normalization_options=normalization_options,
|
||||||
return_timestamps=return_timestamps,
|
return_timestamps=return_timestamps,
|
||||||
lang_code=lang_code,
|
lang_code=lang_code,
|
||||||
|
|
|
@@ -113,6 +113,10 @@ class OpenAISpeechRequest(BaseModel):
|
||||||
default=None,
|
default=None,
|
||||||
description="Optional language code to use for text processing. If not provided, will use first letter of voice name.",
|
description="Optional language code to use for text processing. If not provided, will use first letter of voice name.",
|
||||||
)
|
)
|
||||||
|
volume_multiplier: Optional[float] = Field(
|
||||||
|
default = 1.0,
|
||||||
|
description="A volume multiplier to multiply the output audio by."
|
||||||
|
)
|
||||||
normalization_options: Optional[NormalizationOptions] = Field(
|
normalization_options: Optional[NormalizationOptions] = Field(
|
||||||
default=NormalizationOptions(),
|
default=NormalizationOptions(),
|
||||||
description="Options for the normalization system",
|
description="Options for the normalization system",
|
||||||
|
@@ -157,6 +161,10 @@ class CaptionedSpeechRequest(BaseModel):
|
||||||
default=None,
|
default=None,
|
||||||
description="Optional language code to use for text processing. If not provided, will use first letter of voice name.",
|
description="Optional language code to use for text processing. If not provided, will use first letter of voice name.",
|
||||||
)
|
)
|
||||||
|
volume_multiplier: Optional[float] = Field(
|
||||||
|
default = 1.0,
|
||||||
|
description="A volume multiplier to multiply the output audio by."
|
||||||
|
)
|
||||||
normalization_options: Optional[NormalizationOptions] = Field(
|
normalization_options: Optional[NormalizationOptions] = Field(
|
||||||
default=NormalizationOptions(),
|
default=NormalizationOptions(),
|
||||||
description="Options for the normalization system",
|
description="Options for the normalization system",
|
||||||
|
|
Loading…
Add table
Reference in a new issue