diff --git a/api/src/core/config.py b/api/src/core/config.py
index 1d4657c..87edce0 100644
--- a/api/src/core/config.py
+++ b/api/src/core/config.py
@@ -31,6 +31,7 @@ class Settings(BaseSettings):
     # Audio Settings
     sample_rate: int = 24000
+    default_volume_multiplier: float = 1.0
 
     # Text Processing Settings
     target_min_tokens: int = 175  # Target minimum tokens per chunk
     target_max_tokens: int = 250  # Target maximum tokens per chunk
diff --git a/api/src/inference/model_manager.py b/api/src/inference/model_manager.py
index 9cef95f..eb817ec 100644
--- a/api/src/inference/model_manager.py
+++ b/api/src/inference/model_manager.py
@@ -141,6 +141,8 @@ Model files not found! You need to download the Kokoro V1 model:
 
         try:
             async for chunk in self._backend.generate(*args, **kwargs):
+                if settings.default_volume_multiplier != 1.0:
+                    chunk.audio *= settings.default_volume_multiplier
                 yield chunk
         except Exception as e:
             raise RuntimeError(f"Generation failed: {e}")
diff --git a/api/src/routers/development.py b/api/src/routers/development.py
index d78aa3c..8c8ed7e 100644
--- a/api/src/routers/development.py
+++ b/api/src/routers/development.py
@@ -319,6 +319,7 @@ async def create_captioned_speech(
                 writer=writer,
                 speed=request.speed,
                 return_timestamps=request.return_timestamps,
+                volume_multiplier=request.volume_multiplier,
                 normalization_options=request.normalization_options,
                 lang_code=request.lang_code,
             )
diff --git a/api/src/routers/openai_compatible.py b/api/src/routers/openai_compatible.py
index 4819bc5..c325221 100644
--- a/api/src/routers/openai_compatible.py
+++ b/api/src/routers/openai_compatible.py
@@ -152,6 +152,7 @@ async def stream_audio_chunks(
             speed=request.speed,
             output_format=request.response_format,
             lang_code=request.lang_code,
+            volume_multiplier=request.volume_multiplier,
             normalization_options=request.normalization_options,
             return_timestamps=unique_properties["return_timestamps"],
         ):
@@ -300,6 +301,7 @@ async def create_speech(
             voice=voice_name,
             writer=writer,
             speed=request.speed,
+            volume_multiplier=request.volume_multiplier,
             normalization_options=request.normalization_options,
             lang_code=request.lang_code,
         )
diff --git a/api/src/services/tts_service.py b/api/src/services/tts_service.py
index 399600e..46c2fb4 100644
--- a/api/src/services/tts_service.py
+++ b/api/src/services/tts_service.py
@@ -55,6 +55,7 @@ class TTSService:
         output_format: Optional[str] = None,
         is_first: bool = False,
         is_last: bool = False,
+        volume_multiplier: Optional[float] = 1.0,
         normalizer: Optional[AudioNormalizer] = None,
         lang_code: Optional[str] = None,
         return_timestamps: Optional[bool] = False,
@@ -100,6 +101,7 @@ class TTSService:
                 lang_code=lang_code,
                 return_timestamps=return_timestamps,
             ):
+                chunk_data.audio *= volume_multiplier
                 # For streaming, convert to bytes
                 if output_format:
                     try:
@@ -132,7 +134,7 @@ class TTSService:
                 speed=speed,
                 return_timestamps=return_timestamps,
             )
-
+
             if chunk_data.audio is None:
                 logger.error("Model generated None for audio chunk")
                 return
@@ -141,6 +143,8 @@ class TTSService:
                 logger.error("Model generated empty audio chunk")
                 return
 
+            chunk_data.audio *= volume_multiplier
+
             # For streaming, convert to bytes
             if output_format:
                 try:
@@ -259,6 +263,7 @@ class TTSService:
         speed: float = 1.0,
         output_format: str = "wav",
         lang_code: Optional[str] = None,
+        volume_multiplier: Optional[float] = 1.0,
         normalization_options: Optional[NormalizationOptions] = NormalizationOptions(),
         return_timestamps: Optional[bool] = False,
     ) -> AsyncGenerator[AudioChunk, None]:
@@ -300,6 +305,7 @@ class TTSService:
                             formatted_pause_chunk = await AudioService.convert_audio(
                                 pause_chunk, output_format, writer, speed=speed,
                                 chunk_text="", is_last_chunk=False, trim_audio=False,
                                 normalizer=stream_normalizer,
+                            )
                             if formatted_pause_chunk.output:
                                 yield formatted_pause_chunk
@@ -330,6 +336,7 @@ class TTSService:
                             writer,
                             output_format,
                             is_first=(chunk_index == 0),
+                            volume_multiplier=volume_multiplier,
                             is_last=False,  # We'll update the last chunk later
                             normalizer=stream_normalizer,
                             lang_code=pipeline_lang_code,  # Pass lang_code
@@ -377,6 +384,7 @@ class TTSService:
                         output_format,
                         is_first=False,
                         is_last=True,  # Signal this is the last chunk
+                        volume_multiplier=volume_multiplier,
                         normalizer=stream_normalizer,
                         lang_code=pipeline_lang_code,  # Pass lang_code
                     ):
@@ -396,6 +404,7 @@ class TTSService:
         writer: StreamingAudioWriter,
         speed: float = 1.0,
         return_timestamps: bool = False,
+        volume_multiplier: Optional[float] = 1.0,
         normalization_options: Optional[NormalizationOptions] = NormalizationOptions(),
         lang_code: Optional[str] = None,
     ) -> AudioChunk:
@@ -408,6 +417,7 @@ class TTSService:
                 voice,
                 writer,
                 speed=speed,
+                volume_multiplier=volume_multiplier,
                 normalization_options=normalization_options,
                 return_timestamps=return_timestamps,
                 lang_code=lang_code,
diff --git a/api/src/structures/schemas.py b/api/src/structures/schemas.py
index 83b1c0b..0aeab7b 100644
--- a/api/src/structures/schemas.py
+++ b/api/src/structures/schemas.py
@@ -113,6 +113,10 @@ class OpenAISpeechRequest(BaseModel):
         default=None,
         description="Optional language code to use for text processing. If not provided, will use first letter of voice name.",
     )
+    volume_multiplier: Optional[float] = Field(
+        default=1.0,
+        description="Volume multiplier applied to the output audio (1.0 leaves it unchanged).",
+    )
     normalization_options: Optional[NormalizationOptions] = Field(
         default=NormalizationOptions(),
         description="Options for the normalization system",
@@ -157,6 +161,10 @@ class CaptionedSpeechRequest(BaseModel):
         default=None,
         description="Optional language code to use for text processing. If not provided, will use first letter of voice name.",
     )
+    volume_multiplier: Optional[float] = Field(
+        default=1.0,
+        description="Volume multiplier applied to the output audio (1.0 leaves it unchanged).",
+    )
     normalization_options: Optional[NormalizationOptions] = Field(
         default=NormalizationOptions(),
         description="Options for the normalization system",
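
For reference, a minimal sketch of exercising the new request field end to end. Everything outside the diff is an assumption: a local instance on the default port 8880, the `requests` library, and illustrative input/voice/output values. Since `Settings` subclasses pydantic's `BaseSettings`, the server-side default should likewise be settable via a `DEFAULT_VOLUME_MULTIPLIER` environment variable.

```python
# Hypothetical usage sketch: host, port, voice, and file name are assumptions,
# not part of the diff above.
import requests

response = requests.post(
    "http://localhost:8880/v1/audio/speech",
    json={
        "model": "kokoro",
        "input": "Hello there!",
        "voice": "af_heart",
        "response_format": "mp3",
        "volume_multiplier": 0.5,  # scales output amplitude; 1.0 leaves it unchanged
    },
)
response.raise_for_status()

# The non-streaming create_speech path returns the encoded audio bytes directly.
with open("output.mp3", "wb") as f:
    f.write(response.content)
```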