From 20658f975967a7ce3d539093394fdacc15d27076 Mon Sep 17 00:00:00 2001 From: remsky Date: Fri, 24 Jan 2025 05:01:38 -0700 Subject: [PATCH] Performance: Adjust session timeout and GPU memory limit; minimize voice pre-caching and improve singleton instance management --- api/src/core/model_config.py | 4 ++-- api/src/inference/model_manager.py | 10 ++-------- api/src/inference/voice_manager.py | 17 +++++++++++++---- web/app.js | 2 +- 4 files changed, 18 insertions(+), 15 deletions(-) diff --git a/api/src/core/model_config.py b/api/src/core/model_config.py index ac21935..7da360e 100644 --- a/api/src/core/model_config.py +++ b/api/src/core/model_config.py @@ -8,7 +8,7 @@ class ONNXCPUConfig(BaseModel): # Session pooling max_instances: int = Field(4, description="Maximum concurrent model instances") - instance_timeout: int = Field(300, description="Session timeout in seconds") + instance_timeout: int = Field(60, description="Session timeout in seconds") # Runtime settings num_threads: int = Field(8, description="Number of threads for parallel operations") @@ -27,7 +27,7 @@ class ONNXGPUConfig(ONNXCPUConfig): # CUDA settings device_id: int = Field(0, description="CUDA device ID") - gpu_mem_limit: float = Field(0.7, description="Fraction of GPU memory to use") + gpu_mem_limit: float = Field(0.5, description="Fraction of GPU memory to use") cudnn_conv_algo_search: str = Field("EXHAUSTIVE", description="CuDNN convolution algorithm search") # Stream management diff --git a/api/src/inference/model_manager.py b/api/src/inference/model_manager.py index 23f7eb1..a89fc64 100644 --- a/api/src/inference/model_manager.py +++ b/api/src/inference/model_manager.py @@ -118,14 +118,8 @@ class ModelManager: # Initialize model with warmup voice await self.load_model(model_path, warmup_voice, backend_type) - # Pre-cache common voices in background - common_voices = ['af', 'af_bella', 'af_sky', 'af_nicole'] - for voice_name in common_voices: - try: - await voice_manager.load_voice(voice_name, 
device=backend.device) - logger.debug(f"Pre-cached voice {voice_name}") - except Exception as e: - logger.warning(f"Failed to pre-cache voice {voice_name}: {e}") + # Only pre-cache default voice to avoid memory bloat + logger.info(f"Using {settings.default_voice} as warmup voice") # Get available voices count voices = await voice_manager.list_voices() diff --git a/api/src/inference/voice_manager.py b/api/src/inference/voice_manager.py index de9765c..fb73e35 100644 --- a/api/src/inference/voice_manager.py +++ b/api/src/inference/voice_manager.py @@ -195,8 +195,9 @@ class VoiceManager: } -# Global singleton instance +# Global singleton instance and lock _manager_instance = None +_manager_lock = asyncio.Lock() async def get_manager(config: Optional[VoiceConfig] = None) -> VoiceManager: @@ -209,6 +210,14 @@ async def get_manager(config: Optional[VoiceConfig] = None) -> VoiceManager: VoiceManager instance """ global _manager_instance - if _manager_instance is None: - _manager_instance = VoiceManager(config) - return _manager_instance \ No newline at end of file + + # Fast path - return existing instance + if _manager_instance is not None: + return _manager_instance + + # Slow path - create new instance with lock + async with _manager_lock: + # Double-check pattern + if _manager_instance is None: + _manager_instance = VoiceManager(config) + return _manager_instance \ No newline at end of file diff --git a/web/app.js b/web/app.js index 82ca420..9f699d1 100644 --- a/web/app.js +++ b/web/app.js @@ -41,7 +41,7 @@ class KokoroPlayer { container: this.elements.waveContainer, width: this.elements.waveContainer.clientWidth, height: 80, - style: '"ios9"', + style: 'ios9', // color: '#6366f1', speed: 0.02, amplitude: 0.7,