From 75963c4aebea9c570b3ee015521869f4ea10a3e0 Mon Sep 17 00:00:00 2001
From: JCallicoat
Date: Thu, 22 May 2025 06:49:37 -0500
Subject: [PATCH 1/2] Add a volume multiplier setting

Allow configuring the output volume via a multiplier applied to the
NumPy array of each audio chunk. Defaults to 1.0, which is a no-op.

Fixes #110
---
 api/src/core/config.py             | 1 +
 api/src/inference/model_manager.py | 2 ++
 2 files changed, 3 insertions(+)

diff --git a/api/src/core/config.py b/api/src/core/config.py
index 1d4657c..3bc825c 100644
--- a/api/src/core/config.py
+++ b/api/src/core/config.py
@@ -31,6 +31,7 @@ class Settings(BaseSettings):
     # Audio Settings
     sample_rate: int = 24000
+    volume_multiplier: float = 1.0

     # Text Processing Settings
     target_min_tokens: int = 175  # Target minimum tokens per chunk
     target_max_tokens: int = 250  # Target maximum tokens per chunk
diff --git a/api/src/inference/model_manager.py b/api/src/inference/model_manager.py
index 9cef95f..0b4cd81 100644
--- a/api/src/inference/model_manager.py
+++ b/api/src/inference/model_manager.py
@@ -141,6 +141,8 @@ Model files not found! You need to download the Kokoro V1 model:

         try:
             async for chunk in self._backend.generate(*args, **kwargs):
+                if settings.volume_multiplier != 1.0:
+                    chunk.audio *= settings.volume_multiplier
                 yield chunk
         except Exception as e:
             raise RuntimeError(f"Generation failed: {e}")
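
Patch 1 amounts to one elementwise NumPy multiply per generated chunk. A
minimal standalone sketch of the same idea (the function name, the
float-sample assumption, and the final clamp are illustrative additions,
not part of the patch):

    import numpy as np

    def apply_volume(audio_chunk: np.ndarray, volume_multiplier: float = 1.0) -> np.ndarray:
        """Scale one chunk of audio samples; a multiplier of 1.0 is a no-op."""
        if volume_multiplier == 1.0:
            return audio_chunk
        scaled = audio_chunk * volume_multiplier
        # Assuming float samples in [-1.0, 1.0]: clamp so large multipliers
        # cannot push samples past full scale (the patch itself does not clamp).
        return np.clip(scaled, -1.0, 1.0)

Large multipliers can drive samples past full scale and distort the
output, so clamping (or at least a documented upper bound) may be worth
considering on top of this change.
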
From cd82dd07355fca75a9cb37e75da624a0d3b3d0ae Mon Sep 17 00:00:00 2001
From: Fireblade2534
Date: Mon, 16 Jun 2025 16:39:30 +0000
Subject: [PATCH 2/2] Add a volume multiplier as a request parameter

---
 api/src/core/config.py               |  2 +-
 api/src/inference/model_manager.py   |  4 ++--
 api/src/routers/development.py       |  1 +
 api/src/routers/openai_compatible.py |  2 ++
 api/src/services/tts_service.py      | 11 ++++++++++-
 api/src/structures/schemas.py        |  8 ++++++++
 6 files changed, 24 insertions(+), 4 deletions(-)

diff --git a/api/src/core/config.py b/api/src/core/config.py
index 3bc825c..87edce0 100644
--- a/api/src/core/config.py
+++ b/api/src/core/config.py
@@ -31,7 +31,7 @@ class Settings(BaseSettings):
     # Audio Settings
     sample_rate: int = 24000
-    volume_multiplier: float = 1.0
+    default_volume_multiplier: float = 1.0

     # Text Processing Settings
     target_min_tokens: int = 175  # Target minimum tokens per chunk
     target_max_tokens: int = 250  # Target maximum tokens per chunk
diff --git a/api/src/inference/model_manager.py b/api/src/inference/model_manager.py
index 0b4cd81..eb817ec 100644
--- a/api/src/inference/model_manager.py
+++ b/api/src/inference/model_manager.py
@@ -141,8 +141,8 @@ Model files not found! You need to download the Kokoro V1 model:

         try:
             async for chunk in self._backend.generate(*args, **kwargs):
-                if settings.volume_multiplier != 1.0:
-                    chunk.audio *= settings.volume_multiplier
+                if settings.default_volume_multiplier != 1.0:
+                    chunk.audio *= settings.default_volume_multiplier
                 yield chunk
         except Exception as e:
             raise RuntimeError(f"Generation failed: {e}")
diff --git a/api/src/routers/development.py b/api/src/routers/development.py
index d78aa3c..8c8ed7e 100644
--- a/api/src/routers/development.py
+++ b/api/src/routers/development.py
@@ -319,6 +319,7 @@ async def create_captioned_speech(
             writer=writer,
             speed=request.speed,
             return_timestamps=request.return_timestamps,
+            volume_multiplier=request.volume_multiplier,
             normalization_options=request.normalization_options,
             lang_code=request.lang_code,
         )
diff --git a/api/src/routers/openai_compatible.py b/api/src/routers/openai_compatible.py
index 4819bc5..c325221 100644
--- a/api/src/routers/openai_compatible.py
+++ b/api/src/routers/openai_compatible.py
@@ -152,6 +152,7 @@ async def stream_audio_chunks(
         speed=request.speed,
         output_format=request.response_format,
         lang_code=request.lang_code,
+        volume_multiplier=request.volume_multiplier,
         normalization_options=request.normalization_options,
         return_timestamps=unique_properties["return_timestamps"],
     ):
@@ -300,6 +301,7 @@ async def create_speech(
             voice=voice_name,
             writer=writer,
             speed=request.speed,
+            volume_multiplier=request.volume_multiplier,
             normalization_options=request.normalization_options,
             lang_code=request.lang_code,
         )
diff --git a/api/src/services/tts_service.py b/api/src/services/tts_service.py
index 0a69b85..dca0d02 100644
--- a/api/src/services/tts_service.py
+++ b/api/src/services/tts_service.py
@@ -55,6 +55,7 @@ class TTSService:
         output_format: Optional[str] = None,
         is_first: bool = False,
         is_last: bool = False,
+        volume_multiplier: Optional[float] = 1.0,
         normalizer: Optional[AudioNormalizer] = None,
         lang_code: Optional[str] = None,
         return_timestamps: Optional[bool] = False,
@@ -100,6 +101,7 @@ class TTSService:
                 lang_code=lang_code,
                 return_timestamps=return_timestamps,
             ):
+                chunk_data.audio *= volume_multiplier
                 # For streaming, convert to bytes
                 if output_format:
                     try:
@@ -132,7 +134,7 @@ class TTSService:
                 speed=speed,
                 return_timestamps=return_timestamps,
             )
-
+
             if chunk_data.audio is None:
                 logger.error("Model generated None for audio chunk")
                 return
@@ -141,6 +143,8 @@ class TTSService:
                 logger.error("Model generated empty audio chunk")
                 return

+            chunk_data.audio *= volume_multiplier
+
             # For streaming, convert to bytes
             if output_format:
                 try:
@@ -259,6 +263,7 @@ class TTSService:
         speed: float = 1.0,
         output_format: str = "wav",
         lang_code: Optional[str] = None,
+        volume_multiplier: Optional[float] = 1.0,
         normalization_options: Optional[NormalizationOptions] = NormalizationOptions(),
         return_timestamps: Optional[bool] = False,
     ) -> AsyncGenerator[AudioChunk, None]:
@@ -298,6 +303,7 @@ class TTSService:
                     output_format,
                     is_first=(chunk_index == 0),
                     is_last=False,  # We'll update the last chunk later
+                    volume_multiplier=volume_multiplier,
                     normalizer=stream_normalizer,
                     lang_code=pipeline_lang_code,  # Pass lang_code
                     return_timestamps=return_timestamps,
@@ -337,6 +343,7 @@ class TTSService:
                     output_format,
                     is_first=False,
                     is_last=True,  # Signal this is the last chunk
+                    volume_multiplier=volume_multiplier,
                     normalizer=stream_normalizer,
                     lang_code=pipeline_lang_code,  # Pass lang_code
                 ):
@@ -356,6 +363,7 @@ class TTSService:
         writer: StreamingAudioWriter,
         speed: float = 1.0,
         return_timestamps: bool = False,
+        volume_multiplier: Optional[float] = 1.0,
         normalization_options: Optional[NormalizationOptions] = NormalizationOptions(),
         lang_code: Optional[str] = None,
     ) -> AudioChunk:
@@ -368,6 +376,7 @@ class TTSService:
             voice,
             writer,
             speed=speed,
+            volume_multiplier=volume_multiplier,
             normalization_options=normalization_options,
             return_timestamps=return_timestamps,
             lang_code=lang_code,
diff --git a/api/src/structures/schemas.py b/api/src/structures/schemas.py
index 260224c..f8273cc 100644
--- a/api/src/structures/schemas.py
+++ b/api/src/structures/schemas.py
@@ -108,6 +108,10 @@ class OpenAISpeechRequest(BaseModel):
         default=None,
         description="Optional language code to use for text processing. If not provided, will use first letter of voice name.",
     )
+    volume_multiplier: Optional[float] = Field(
+        default=1.0,
+        description="A multiplier applied to the output audio volume.",
+    )
     normalization_options: Optional[NormalizationOptions] = Field(
         default=NormalizationOptions(),
         description="Options for the normalization system",
@@ -152,6 +156,10 @@ class CaptionedSpeechRequest(BaseModel):
         default=None,
         description="Optional language code to use for text processing. If not provided, will use first letter of voice name.",
     )
+    volume_multiplier: Optional[float] = Field(
+        default=1.0,
+        description="A multiplier applied to the output audio volume.",
+    )
     normalization_options: Optional[NormalizationOptions] = Field(
         default=NormalizationOptions(),
         description="Options for the normalization system",
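
Taken together, the two patches leave default_volume_multiplier as a
server-wide setting while volume_multiplier becomes a per-request field
on both OpenAISpeechRequest and CaptionedSpeechRequest. Note that the
two factors compound: the request value is applied in tts_service.py on
top of the default applied in model_manager.py, so a non-1.0 default
scales the request value rather than being replaced by it. A usage
sketch against the OpenAI-compatible route (the host, port, model, and
voice values below are assumptions for illustration):

    import requests

    # Assumed local deployment; adjust host/port, model, and voice as needed.
    response = requests.post(
        "http://localhost:8880/v1/audio/speech",
        json={
            "model": "kokoro",
            "input": "Hello, world!",
            "voice": "af_heart",
            "response_format": "mp3",
            "volume_multiplier": 1.5,  # per-request override added by patch 2
        },
    )
    response.raise_for_status()
    with open("output.mp3", "wb") as f:
        f.write(response.content)
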