From 75963c4aebea9c570b3ee015521869f4ea10a3e0 Mon Sep 17 00:00:00 2001
From: JCallicoat
Date: Thu, 22 May 2025 06:49:37 -0500
Subject: [PATCH 1/2] Add a volume multiplier setting

Allow configuring the output volume via a multiplier applied to the
NumPy array of each audio chunk. Defaults to 1.0, which is a no-op.

Fixes #110
---
 api/src/core/config.py             | 1 +
 api/src/inference/model_manager.py | 2 ++
 2 files changed, 3 insertions(+)

diff --git a/api/src/core/config.py b/api/src/core/config.py
index 1d4657c..3bc825c 100644
--- a/api/src/core/config.py
+++ b/api/src/core/config.py
@@ -31,6 +31,7 @@ class Settings(BaseSettings):
     # Audio Settings
     sample_rate: int = 24000
+    volume_multiplier: float = 1.0

     # Text Processing Settings
     target_min_tokens: int = 175  # Target minimum tokens per chunk
     target_max_tokens: int = 250  # Target maximum tokens per chunk
diff --git a/api/src/inference/model_manager.py b/api/src/inference/model_manager.py
index 9cef95f..0b4cd81 100644
--- a/api/src/inference/model_manager.py
+++ b/api/src/inference/model_manager.py
@@ -141,6 +141,8 @@ Model files not found! You need to download the Kokoro V1 model:

         try:
             async for chunk in self._backend.generate(*args, **kwargs):
+                if settings.volume_multiplier != 1.0:
+                    chunk.audio *= settings.volume_multiplier
                 yield chunk
         except Exception as e:
             raise RuntimeError(f"Generation failed: {e}")
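
Patch 1 amounts to one elementwise NumPy multiply per generated chunk. A
minimal standalone sketch of the same idea (the function name, the
float-sample assumption, and the final clamp are illustrative additions,
not part of the patch):

    import numpy as np

    def apply_volume(audio_chunk: np.ndarray, volume_multiplier: float = 1.0) -> np.ndarray:
        """Scale one chunk of audio samples; a multiplier of 1.0 is a no-op."""
        if volume_multiplier == 1.0:
            return audio_chunk
        scaled = audio_chunk * volume_multiplier
        # Assuming float samples in [-1.0, 1.0]: clamp so large multipliers
        # cannot push samples past full scale (the patch itself does not clamp).
        return np.clip(scaled, -1.0, 1.0)

Large multipliers can drive samples past full scale and distort the
output, so clamping (or at least a documented upper bound) may be worth
considering on top of this change.
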
From cd82dd07355fca75a9cb37e75da624a0d3b3d0ae Mon Sep 17 00:00:00 2001
From: Fireblade2534
Date: Mon, 16 Jun 2025 16:39:30 +0000
Subject: [PATCH 2/2] Add a volume multiplier as a request parameter

---
 api/src/core/config.py               |  2 +-
 api/src/inference/model_manager.py   |  4 ++--
 api/src/routers/development.py       |  1 +
 api/src/routers/openai_compatible.py |  2 ++
 api/src/services/tts_service.py      | 11 ++++++++++-
 api/src/structures/schemas.py        |  8 ++++++++
 6 files changed, 24 insertions(+), 4 deletions(-)

diff --git a/api/src/core/config.py b/api/src/core/config.py
index 3bc825c..87edce0 100644
--- a/api/src/core/config.py
+++ b/api/src/core/config.py
@@ -31,7 +31,7 @@ class Settings(BaseSettings):
     # Audio Settings
     sample_rate: int = 24000
-    volume_multiplier: float = 1.0
+    default_volume_multiplier: float = 1.0

     # Text Processing Settings
     target_min_tokens: int = 175  # Target minimum tokens per chunk
     target_max_tokens: int = 250  # Target maximum tokens per chunk
diff --git a/api/src/inference/model_manager.py b/api/src/inference/model_manager.py
index 0b4cd81..eb817ec 100644
--- a/api/src/inference/model_manager.py
+++ b/api/src/inference/model_manager.py
@@ -141,8 +141,8 @@ Model files not found! You need to download the Kokoro V1 model:

         try:
             async for chunk in self._backend.generate(*args, **kwargs):
-                if settings.volume_multiplier != 1.0:
-                    chunk.audio *= settings.volume_multiplier
+                if settings.default_volume_multiplier != 1.0:
+                    chunk.audio *= settings.default_volume_multiplier
                 yield chunk
         except Exception as e:
             raise RuntimeError(f"Generation failed: {e}")
diff --git a/api/src/routers/development.py b/api/src/routers/development.py
index d78aa3c..8c8ed7e 100644
--- a/api/src/routers/development.py
+++ b/api/src/routers/development.py
@@ -319,6 +319,7 @@ async def create_captioned_speech(
             writer=writer,
             speed=request.speed,
             return_timestamps=request.return_timestamps,
+            volume_multiplier=request.volume_multiplier,
             normalization_options=request.normalization_options,
             lang_code=request.lang_code,
         )
diff --git a/api/src/routers/openai_compatible.py b/api/src/routers/openai_compatible.py
index 4819bc5..c325221 100644
--- a/api/src/routers/openai_compatible.py
+++ b/api/src/routers/openai_compatible.py
@@ -152,6 +152,7 @@ async def stream_audio_chunks(
         speed=request.speed,
         output_format=request.response_format,
         lang_code=request.lang_code,
+        volume_multiplier=request.volume_multiplier,
         normalization_options=request.normalization_options,
         return_timestamps=unique_properties["return_timestamps"],
     ):
@@ -300,6 +301,7 @@ async def create_speech(
             voice=voice_name,
             writer=writer,
             speed=request.speed,
+            volume_multiplier=request.volume_multiplier,
             normalization_options=request.normalization_options,
             lang_code=request.lang_code,
         )
diff --git a/api/src/services/tts_service.py b/api/src/services/tts_service.py
index 0a69b85..dca0d02 100644
--- a/api/src/services/tts_service.py
+++ b/api/src/services/tts_service.py
@@ -55,6 +55,7 @@ class TTSService:
         output_format: Optional[str] = None,
         is_first: bool = False,
         is_last: bool = False,
+        volume_multiplier: Optional[float] = 1.0,
         normalizer: Optional[AudioNormalizer] = None,
         lang_code: Optional[str] = None,
         return_timestamps: Optional[bool] = False,
@@ -100,6 +101,7 @@ class TTSService:
                 lang_code=lang_code,
                 return_timestamps=return_timestamps,
             ):
+                chunk_data.audio *= volume_multiplier
                 # For streaming, convert to bytes
                 if output_format:
                     try:
@@ -132,7 +134,7 @@ class TTSService:
                 speed=speed,
                 return_timestamps=return_timestamps,
             )
-
+
             if chunk_data.audio is None:
                 logger.error("Model generated None for audio chunk")
                 return
@@ -141,6 +143,8 @@ class TTSService:
                 logger.error("Model generated empty audio chunk")
                 return

+            chunk_data.audio *= volume_multiplier
+
             # For streaming, convert to bytes
             if output_format:
                 try:
@@ -259,6 +263,7 @@ class TTSService:
         speed: float = 1.0,
         output_format: str = "wav",
         lang_code: Optional[str] = None,
+        volume_multiplier: Optional[float] = 1.0,
         normalization_options: Optional[NormalizationOptions] = NormalizationOptions(),
         return_timestamps: Optional[bool] = False,
     ) -> AsyncGenerator[AudioChunk, None]:
@@ -298,6 +303,7 @@ class TTSService:
                     output_format,
                     is_first=(chunk_index == 0),
                     is_last=False,  # We'll update the last chunk later
+                    volume_multiplier=volume_multiplier,
                     normalizer=stream_normalizer,
                     lang_code=pipeline_lang_code,  # Pass lang_code
                     return_timestamps=return_timestamps,
@@ -337,6 +343,7 @@ class TTSService:
                     output_format,
                     is_first=False,
                     is_last=True,  # Signal this is the last chunk
+                    volume_multiplier=volume_multiplier,
                     normalizer=stream_normalizer,
                     lang_code=pipeline_lang_code,  # Pass lang_code
                 ):
@@ -356,6 +363,7 @@ class TTSService:
         writer: StreamingAudioWriter,
         speed: float = 1.0,
         return_timestamps: bool = False,
+        volume_multiplier: Optional[float] = 1.0,
         normalization_options: Optional[NormalizationOptions] = NormalizationOptions(),
         lang_code: Optional[str] = None,
     ) -> AudioChunk:
@@ -368,6 +376,7 @@ class TTSService:
             voice,
             writer,
             speed=speed,
+            volume_multiplier=volume_multiplier,
             normalization_options=normalization_options,
             return_timestamps=return_timestamps,
             lang_code=lang_code,
diff --git a/api/src/structures/schemas.py b/api/src/structures/schemas.py
index 260224c..f8273cc 100644
--- a/api/src/structures/schemas.py
+++ b/api/src/structures/schemas.py
@@ -108,6 +108,10 @@ class OpenAISpeechRequest(BaseModel):
         default=None,
         description="Optional language code to use for text processing. If not provided, will use first letter of voice name.",
     )
+    volume_multiplier: Optional[float] = Field(
+        default=1.0,
+        description="A multiplier applied to the output audio volume.",
+    )
     normalization_options: Optional[NormalizationOptions] = Field(
         default=NormalizationOptions(),
         description="Options for the normalization system",
@@ -152,6 +156,10 @@ class CaptionedSpeechRequest(BaseModel):
         default=None,
         description="Optional language code to use for text processing. If not provided, will use first letter of voice name.",
     )
+    volume_multiplier: Optional[float] = Field(
+        default=1.0,
+        description="A multiplier applied to the output audio volume.",
+    )
     normalization_options: Optional[NormalizationOptions] = Field(
         default=NormalizationOptions(),
         description="Options for the normalization system",
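
Taken together, the two patches leave default_volume_multiplier as a
server-wide setting while volume_multiplier becomes a per-request field
on both OpenAISpeechRequest and CaptionedSpeechRequest. Note that the
two factors compound: the request value is applied in tts_service.py on
top of the default applied in model_manager.py, so a non-1.0 default
scales the request value rather than being replaced by it. A usage
sketch against the OpenAI-compatible route (the host, port, model, and
voice values below are assumptions for illustration):

    import requests

    # Assumed local deployment; adjust host/port, model, and voice as needed.
    response = requests.post(
        "http://localhost:8880/v1/audio/speech",
        json={
            "model": "kokoro",
            "input": "Hello, world!",
            "voice": "af_heart",
            "response_format": "mp3",
            "volume_multiplier": 1.5,  # per-request override added by patch 2
        },
    )
    response.raise_for_status()
    with open("output.mp3", "wb") as f:
        f.write(response.content)
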