Added a volume multiplier as a request parameter

Fireblade2534 2025-06-16 16:39:30 +00:00
parent 75963c4aeb
commit cd82dd0735
6 changed files with 24 additions and 4 deletions
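For a sense of how the new parameter is meant to be used once this change is deployed, here is a minimal sketch of a request against the OpenAI-compatible speech endpoint. The host, port, endpoint path, model, and voice names are assumptions for illustration; only the volume_multiplier field itself comes from this commit.

# Sketch only: exercise the new per-request volume_multiplier field.
# URL, model, and voice are assumed values, not part of this diff.
import requests

response = requests.post(
    "http://localhost:8880/v1/audio/speech",
    json={
        "model": "kokoro",
        "input": "Hello world.",
        "voice": "af_heart",
        "response_format": "wav",
        "volume_multiplier": 1.5,  # new field; 1.0 leaves the audio unchanged
    },
)
with open("output.wav", "wb") as f:
    f.write(response.content)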

@@ -31,7 +31,7 @@ class Settings(BaseSettings):
# Audio Settings
sample_rate: int = 24000
volume_multiplier: float = 1.0
default_volume_multiplier: float = 1.0
# Text Processing Settings
target_min_tokens: int = 175 # Target minimum tokens per chunk
target_max_tokens: int = 250 # Target maximum tokens per chunk
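Note that the existing server-wide setting is renamed from volume_multiplier to default_volume_multiplier, so deployments that pin it via the environment would need to rename the variable. A rough standalone sketch of the renamed setting, assuming the Settings class is a standard pydantic BaseSettings that reads environment variables case-insensitively (the real class lives in the project's config module):

# Hypothetical reproduction of the renamed setting, for illustration only.
from pydantic_settings import BaseSettings  # assumption: pydantic v2 settings package

class Settings(BaseSettings):
    sample_rate: int = 24000
    default_volume_multiplier: float = 1.0  # was: volume_multiplier

# e.g. DEFAULT_VOLUME_MULTIPLIER=1.5 in the environment overrides the default
settings = Settings()
print(settings.default_volume_multiplier)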

@@ -141,8 +141,8 @@ Model files not found! You need to download the Kokoro V1 model:
try:
async for chunk in self._backend.generate(*args, **kwargs):
if settings.volume_multiplier != 1.0:
chunk.audio *= settings.volume_multiplier
if settings.default_volume_multiplier != 1.0:
chunk.audio *= settings.default_volume_multiplier
yield chunk
except Exception as e:
raise RuntimeError(f"Generation failed: {e}")
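The backend keeps applying the server-wide default inside its generate loop; only the settings attribute is renamed. Since the per-request multiplier is applied separately in TTSService (see the hunks further down), the two values appear to compose multiplicatively. A small sketch of that composition, assuming chunk audio is a float numpy buffer as the in-place *= suggests; the concrete values are made up:

# Sketch: the server-wide default and the per-request multiplier both scale
# the audio, so the effective gain is their product.
import numpy as np

audio = np.full(3, 0.5, dtype=np.float32)   # stand-in for chunk.audio
default_volume_multiplier = 0.8             # settings.default_volume_multiplier
request_volume_multiplier = 1.5             # request.volume_multiplier

if default_volume_multiplier != 1.0:
    audio *= default_volume_multiplier      # applied in the Kokoro backend
audio *= request_volume_multiplier          # applied again in TTSService
print(audio)                                # 0.5 * 0.8 * 1.5 = 0.6 per sample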

@@ -319,6 +319,7 @@ async def create_captioned_speech(
writer=writer,
speed=request.speed,
return_timestamps=request.return_timestamps,
volume_multiplier=request.volume_multiplier,
normalization_options=request.normalization_options,
lang_code=request.lang_code,
)

@@ -152,6 +152,7 @@ async def stream_audio_chunks(
speed=request.speed,
output_format=request.response_format,
lang_code=request.lang_code,
volume_multiplier=request.volume_multiplier,
normalization_options=request.normalization_options,
return_timestamps=unique_properties["return_timestamps"],
):
@@ -300,6 +301,7 @@ async def create_speech(
voice=voice_name,
writer=writer,
speed=request.speed,
volume_multiplier=request.volume_multiplier,
normalization_options=request.normalization_options,
lang_code=request.lang_code,
)

@@ -55,6 +55,7 @@ class TTSService:
output_format: Optional[str] = None,
is_first: bool = False,
is_last: bool = False,
volume_multiplier: Optional[float] = 1.0,
normalizer: Optional[AudioNormalizer] = None,
lang_code: Optional[str] = None,
return_timestamps: Optional[bool] = False,
@@ -100,6 +101,7 @@ class TTSService:
lang_code=lang_code,
return_timestamps=return_timestamps,
):
chunk_data.audio *= volume_multiplier
# For streaming, convert to bytes
if output_format:
try:
@@ -132,7 +134,7 @@ class TTSService:
speed=speed,
return_timestamps=return_timestamps,
)
if chunk_data.audio is None:
logger.error("Model generated None for audio chunk")
return
@@ -141,6 +143,8 @@ class TTSService:
logger.error("Model generated empty audio chunk")
return
chunk_data.audio *= volume_multiplier
# For streaming, convert to bytes
if output_format:
try:
@@ -259,6 +263,7 @@ class TTSService:
speed: float = 1.0,
output_format: str = "wav",
lang_code: Optional[str] = None,
volume_multiplier: Optional[float] = 1.0,
normalization_options: Optional[NormalizationOptions] = NormalizationOptions(),
return_timestamps: Optional[bool] = False,
) -> AsyncGenerator[AudioChunk, None]:
@@ -298,6 +303,7 @@ class TTSService:
output_format,
is_first=(chunk_index == 0),
is_last=False, # We'll update the last chunk later
volume_multiplier=volume_multiplier,
normalizer=stream_normalizer,
lang_code=pipeline_lang_code, # Pass lang_code
return_timestamps=return_timestamps,
@@ -337,6 +343,7 @@ class TTSService:
output_format,
is_first=False,
is_last=True, # Signal this is the last chunk
volume_multiplier=volume_multiplier,
normalizer=stream_normalizer,
lang_code=pipeline_lang_code, # Pass lang_code
):
@@ -356,6 +363,7 @@ class TTSService:
writer: StreamingAudioWriter,
speed: float = 1.0,
return_timestamps: bool = False,
volume_multiplier: Optional[float] = 1.0,
normalization_options: Optional[NormalizationOptions] = NormalizationOptions(),
lang_code: Optional[str] = None,
) -> AudioChunk:
@@ -368,6 +376,7 @@ class TTSService:
voice,
writer,
speed=speed,
volume_multiplier=volume_multiplier,
normalization_options=normalization_options,
return_timestamps=return_timestamps,
lang_code=lang_code,
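Within the service itself, the per-request multiplier is applied to each decoded chunk before it is converted for streaming. A cut-down sketch of that step follows; the Chunk class and helper are invented for illustration, and only the in-place scaling mirrors the change above:

# Hypothetical miniature of the per-chunk scaling added in TTSService.
from dataclasses import dataclass
import numpy as np

@dataclass
class Chunk:
    audio: np.ndarray

def apply_volume(chunks, volume_multiplier: float = 1.0):
    for chunk in chunks:
        if chunk.audio is None or len(chunk.audio) == 0:
            continue  # the real code logs and bails out on empty chunks
        chunk.audio *= volume_multiplier  # mirrors chunk_data.audio *= volume_multiplier
        yield chunk

chunks = [Chunk(np.ones(4, dtype=np.float32))]
for c in apply_volume(chunks, volume_multiplier=2.0):
    print(c.audio)  # [2. 2. 2. 2.]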

@@ -108,6 +108,10 @@ class OpenAISpeechRequest(BaseModel):
default=None,
description="Optional language code to use for text processing. If not provided, will use first letter of voice name.",
)
volume_multiplier: Optional[float] = Field(
default=1.0,
description="A multiplier applied to the output audio volume.",
)
normalization_options: Optional[NormalizationOptions] = Field(
default=NormalizationOptions(),
description="Options for the normalization system",
@@ -152,6 +156,10 @@ class CaptionedSpeechRequest(BaseModel):
default=None,
description="Optional language code to use for text processing. If not provided, will use first letter of voice name.",
)
volume_multiplier: Optional[float] = Field(
default=1.0,
description="A multiplier applied to the output audio volume.",
)
normalization_options: Optional[NormalizationOptions] = Field(
default=NormalizationOptions(),
description="Options for the normalization system",