diff --git a/api/src/builds/models.py b/api/src/builds/models.py index 929dc47..bfd52ad 100644 --- a/api/src/builds/models.py +++ b/api/src/builds/models.py @@ -337,11 +337,13 @@ def recursive_munch(d): else: return d -def build_model(path, device): +async def build_model(path, device): + from ..core.paths import load_json, load_model_weights + config = Path(__file__).parent / 'config.json' assert config.exists(), f'Config path incorrect: config.json not found at {config}' - with open(config, 'r') as r: - args = recursive_munch(json.load(r)) + + args = recursive_munch(await load_json(config)) assert args.decoder.type == 'istftnet', f'Unknown decoder type: {args.decoder.type}' decoder = Decoder(dim_in=args.hidden_dim, style_dim=args.style_dim, dim_out=args.n_mels, resblock_kernel_sizes = args.decoder.resblock_kernel_sizes, @@ -365,7 +367,8 @@ def build_model(path, device): decoder=decoder.to(device).eval(), text_encoder=text_encoder.to(device).eval(), ) - for key, state_dict in torch.load(path, map_location='cpu', weights_only=True)['net'].items(): + weights = await load_model_weights(path, device='cpu') + for key, state_dict in weights['net'].items(): assert key in model, key try: model[key].load_state_dict(state_dict) diff --git a/api/src/core/config.py b/api/src/core/config.py index 3f44350..91aec63 100644 --- a/api/src/core/config.py +++ b/api/src/core/config.py @@ -13,10 +13,15 @@ class Settings(BaseSettings): output_dir: str = "output" output_dir_size_limit_mb: float = 500.0 # Maximum size of output directory in MB default_voice: str = "af" - model_dir: str = "/app/models" # Base directory for model files - pytorch_model_path: str = "kokoro-v0_19.pth" - onnx_model_path: str = "kokoro-v0_19.onnx" - voices_dir: str = "voices" + use_gpu: bool = False # Whether to use GPU acceleration if available + use_onnx: bool = True # Whether to use ONNX runtime + # Paths relative to api directory + model_dir: str = "src/models" # Model directory relative to api/ + voices_dir: str = "src/voices" # Voices directory relative to api/ + + # Model filenames + pytorch_model_file: str = "kokoro-v0_19.pth" + onnx_model_file: str = "kokoro-v0_19.onnx" sample_rate: int = 24000 max_chunk_size: int = 300 # Maximum size of text chunks for processing gap_trim_ms: int = 250 # Amount to trim from streaming chunk ends in milliseconds @@ -28,6 +33,12 @@ class Settings(BaseSettings): onnx_optimization_level: str = "all" # all, basic, or disabled onnx_memory_pattern: bool = True # Enable memory pattern optimization onnx_arena_extend_strategy: str = "kNextPowerOfTwo" # Memory allocation strategy + + # ONNX GPU Settings + onnx_device_id: int = 0 # GPU device ID to use + onnx_gpu_mem_limit: float = 0.7 # Limit GPU memory usage to 70% + onnx_cudnn_conv_algo_search: str = "EXHAUSTIVE" # CUDNN convolution algorithm search + onnx_do_copy_in_default_stream: bool = True # Copy in default CUDA stream class Config: env_file = ".env" diff --git a/api/src/core/model_config.py b/api/src/core/model_config.py new file mode 100644 index 0000000..dc82aee --- /dev/null +++ b/api/src/core/model_config.py @@ -0,0 +1,109 @@ +"""Model configuration schemas.""" + +from pydantic import BaseModel, Field + + +class ONNXCPUConfig(BaseModel): + """ONNX CPU runtime configuration.""" + + num_threads: int = Field(8, description="Number of threads for parallel operations") + inter_op_threads: int = Field(4, description="Number of threads for operator parallelism") + execution_mode: str = Field("parallel", description="ONNX execution mode") + 
optimization_level: str = Field("all", description="ONNX optimization level") + memory_pattern: bool = Field(True, description="Enable memory pattern optimization") + arena_extend_strategy: str = Field("kNextPowerOfTwo", description="Memory arena strategy") + + class Config: + frozen = True + + +class ONNXGPUConfig(ONNXCPUConfig): + """ONNX GPU-specific configuration.""" + + device_id: int = Field(0, description="CUDA device ID") + gpu_mem_limit: float = Field(0.7, description="Fraction of GPU memory to use") + cudnn_conv_algo_search: str = Field("EXHAUSTIVE", description="CuDNN convolution algorithm search") + do_copy_in_default_stream: bool = Field(True, description="Copy in default CUDA stream") + + class Config: + frozen = True + + +class PyTorchCPUConfig(BaseModel): + """PyTorch CPU backend configuration.""" + + max_batch_size: int = Field(32, description="Maximum batch size for batched inference") + stream_buffer_size: int = Field(8, description="Size of stream buffer") + memory_threshold: float = Field(0.8, description="Memory threshold for cleanup") + retry_on_oom: bool = Field(True, description="Whether to retry on OOM errors") + num_threads: int = Field(8, description="Number of threads for parallel operations") + pin_memory: bool = Field(True, description="Whether to pin memory for faster CPU-GPU transfer") + + class Config: + frozen = True + + +class PyTorchGPUConfig(BaseModel): + """PyTorch GPU backend configuration.""" + + device_id: int = Field(0, description="CUDA device ID") + use_fp16: bool = Field(True, description="Whether to use FP16 precision") + use_triton: bool = Field(True, description="Whether to use Triton for CUDA kernels") + max_batch_size: int = Field(32, description="Maximum batch size for batched inference") + stream_buffer_size: int = Field(8, description="Size of CUDA stream buffer") + memory_threshold: float = Field(0.8, description="Memory threshold for cleanup") + retry_on_oom: bool = Field(True, description="Whether to retry on OOM errors") + sync_cuda: bool = Field(True, description="Whether to synchronize CUDA operations") + + class Config: + frozen = True + """PyTorch CPU-specific configuration.""" + + num_threads: int = Field(8, description="Number of threads for parallel operations") + pin_memory: bool = Field(True, description="Whether to pin memory for faster CPU-GPU transfer") + + class Config: + frozen = True + + +class ModelConfig(BaseModel): + """Model configuration.""" + + # General settings + model_type: str = Field("pytorch", description="Model type ('pytorch' or 'onnx')") + device_type: str = Field("auto", description="Device type ('cpu', 'gpu', or 'auto')") + cache_models: bool = Field(True, description="Whether to cache loaded models") + cache_voices: bool = Field(True, description="Whether to cache voice tensors") + voice_cache_size: int = Field(10, description="Maximum number of cached voices") + + # Backend-specific configs + onnx_cpu: ONNXCPUConfig = Field(default_factory=ONNXCPUConfig) + onnx_gpu: ONNXGPUConfig = Field(default_factory=ONNXGPUConfig) + pytorch_cpu: PyTorchCPUConfig = Field(default_factory=PyTorchCPUConfig) + pytorch_gpu: PyTorchGPUConfig = Field(default_factory=PyTorchGPUConfig) + + class Config: + frozen = True + + def get_backend_config(self, backend_type: str): + """Get configuration for specific backend. 
+ + Args: + backend_type: Backend type ('pytorch_cpu', 'pytorch_gpu', 'onnx_cpu', 'onnx_gpu') + + Returns: + Backend-specific configuration + + Raises: + ValueError: If backend type is invalid + """ + if backend_type not in { + 'pytorch_cpu', 'pytorch_gpu', 'onnx_cpu', 'onnx_gpu' + }: + raise ValueError(f"Invalid backend type: {backend_type}") + + return getattr(self, backend_type) + + +# Global instance +model_config = ModelConfig() \ No newline at end of file diff --git a/api/src/core/paths.py b/api/src/core/paths.py index d9096a7..3124ece 100644 --- a/api/src/core/paths.py +++ b/api/src/core/paths.py @@ -1,9 +1,10 @@ """Async file and path operations.""" import io +import json import os from pathlib import Path -from typing import List, Optional, AsyncIterator, Callable, Set +from typing import List, Optional, AsyncIterator, Callable, Set, Dict, Any import aiofiles import aiofiles.os @@ -87,10 +88,18 @@ async def get_model_path(model_name: str) -> str: Raises: RuntimeError: If model not found """ - search_paths = [ - settings.model_dir, - os.path.join(os.path.dirname(__file__), "..", "..", "..", "models") - ] + # Get api directory path (two levels up from core) + api_dir = os.path.dirname(os.path.dirname(os.path.dirname(__file__))) + + # Construct model directory path relative to api directory + model_dir = os.path.join(api_dir, settings.model_dir) + + # Ensure model directory exists + os.makedirs(model_dir, exist_ok=True) + + # Search in model directory + search_paths = [model_dir] + logger.debug(f"Searching for model in path: {model_dir}") return await _find_file(model_name, search_paths) @@ -107,12 +116,20 @@ async def get_voice_path(voice_name: str) -> str: Raises: RuntimeError: If voice not found """ + # Get api directory path + api_dir = os.path.dirname(os.path.dirname(os.path.dirname(__file__))) + + # Construct voice directory path relative to api directory + voice_dir = os.path.join(api_dir, settings.voices_dir) + + # Ensure voice directory exists + os.makedirs(voice_dir, exist_ok=True) + voice_file = f"{voice_name}.pt" - search_paths = [ - os.path.join(settings.model_dir, "..", settings.voices_dir), - os.path.join(os.path.dirname(__file__), "..", settings.voices_dir) - ] + # Search in voice directory + search_paths = [voice_dir] + logger.debug(f"Searching for voice in path: {voice_dir}") return await _find_file(voice_file, search_paths) @@ -123,10 +140,18 @@ async def list_voices() -> List[str]: Returns: List of voice names (without .pt extension) """ - search_paths = [ - os.path.join(settings.model_dir, "..", settings.voices_dir), - os.path.join(os.path.dirname(__file__), "..", settings.voices_dir) - ] + # Get api directory path + api_dir = os.path.dirname(os.path.dirname(os.path.dirname(__file__))) + + # Construct voice directory path relative to api directory + voice_dir = os.path.join(api_dir, settings.voices_dir) + + # Ensure voice directory exists + os.makedirs(voice_dir, exist_ok=True) + + # Search in voice directory + search_paths = [voice_dir] + logger.debug(f"Scanning for voices in path: {voice_dir}") def filter_voice_files(name: str) -> bool: return name.endswith('.pt') @@ -179,6 +204,51 @@ async def save_voice_tensor(tensor: torch.Tensor, voice_path: str) -> None: raise RuntimeError(f"Failed to save voice tensor to {voice_path}: {e}") +async def load_json(path: str) -> dict: + """Load JSON file asynchronously. 
+ + Args: + path: Path to JSON file + + Returns: + Parsed JSON data + + Raises: + RuntimeError: If file cannot be read or parsed + """ + try: + async with aiofiles.open(path, 'r', encoding='utf-8') as f: + content = await f.read() + return json.loads(content) + except Exception as e: + raise RuntimeError(f"Failed to load JSON file {path}: {e}") + + +async def load_model_weights(path: str, device: str = "cpu") -> dict: + """Load model weights asynchronously. + + Args: + path: Path to model file (.pth or .onnx) + device: Device to load model to + + Returns: + Model weights + + Raises: + RuntimeError: If file cannot be read + """ + try: + async with aiofiles.open(path, 'rb') as f: + data = await f.read() + return torch.load( + io.BytesIO(data), + map_location=device, + weights_only=True + ) + except Exception as e: + raise RuntimeError(f"Failed to load model weights from {path}: {e}") + + async def read_file(path: str) -> str: """Read text file asynchronously. diff --git a/api/src/inference/__init__.py b/api/src/inference/__init__.py index c98b884..98dc42c 100644 --- a/api/src/inference/__init__.py +++ b/api/src/inference/__init__.py @@ -1,4 +1,4 @@ -"""Inference backends and model management.""" +"""Model inference package.""" from .base import BaseModelBackend from .model_manager import ModelManager, get_manager @@ -6,15 +6,13 @@ from .onnx_cpu import ONNXCPUBackend from .onnx_gpu import ONNXGPUBackend from .pytorch_cpu import PyTorchCPUBackend from .pytorch_gpu import PyTorchGPUBackend -from ..structures.model_schemas import ModelConfig __all__ = [ 'BaseModelBackend', 'ModelManager', 'get_manager', - 'ModelConfig', 'ONNXCPUBackend', - 'ONNXGPUBackend', + 'ONNXGPUBackend', 'PyTorchCPUBackend', - 'PyTorchGPUBackend' + 'PyTorchGPUBackend', ] \ No newline at end of file diff --git a/api/src/inference/model_manager.py b/api/src/inference/model_manager.py index c3b76c7..26621c2 100644 --- a/api/src/inference/model_manager.py +++ b/api/src/inference/model_manager.py @@ -1,21 +1,18 @@ """Model management and caching.""" -import os -from typing import Dict, List, Optional, Union +from typing import Dict, Optional import torch from loguru import logger -from pydantic import BaseModel +from ..core import paths +from ..core.config import settings +from ..core.model_config import ModelConfig, model_config from .base import BaseModelBackend -from .voice_manager import get_manager as get_voice_manager from .onnx_cpu import ONNXCPUBackend from .onnx_gpu import ONNXGPUBackend from .pytorch_cpu import PyTorchCPUBackend from .pytorch_gpu import PyTorchGPUBackend -from ..core import paths -from ..core.config import settings -from ..structures.model_schemas import ModelConfig class ModelManager: @@ -27,44 +24,63 @@ class ModelManager: Args: config: Optional configuration """ - self._config = config or ModelConfig() + self._config = config or model_config self._backends: Dict[str, BaseModelBackend] = {} self._current_backend: Optional[str] = None - self._voice_manager = get_voice_manager() self._initialize_backends() def _initialize_backends(self) -> None: - """Initialize available backends.""" - """Initialize available backends.""" - # Initialize GPU backends if available - if settings.use_gpu and torch.cuda.is_available(): - try: - # PyTorch GPU - self._backends['pytorch_gpu'] = PyTorchGPUBackend() - self._current_backend = 'pytorch_gpu' - logger.info("Initialized PyTorch GPU backend") - - # ONNX GPU - self._backends['onnx_gpu'] = ONNXGPUBackend() - logger.info("Initialized ONNX GPU backend") - except 
Exception as e: - logger.error(f"Failed to initialize GPU backends: {e}") - # Fallback to CPU if GPU fails + """Initialize available backends based on settings.""" + has_gpu = settings.use_gpu and torch.cuda.is_available() + + try: + if has_gpu: + if settings.use_onnx: + # ONNX GPU primary + self._backends['onnx_gpu'] = ONNXGPUBackend() + self._current_backend = 'onnx_gpu' + logger.info("Initialized ONNX GPU backend") + + # PyTorch GPU fallback + self._backends['pytorch_gpu'] = PyTorchGPUBackend() + logger.info("Initialized PyTorch GPU backend") + else: + # PyTorch GPU primary + self._backends['pytorch_gpu'] = PyTorchGPUBackend() + self._current_backend = 'pytorch_gpu' + logger.info("Initialized PyTorch GPU backend") + + # ONNX GPU fallback + self._backends['onnx_gpu'] = ONNXGPUBackend() + logger.info("Initialized ONNX GPU backend") + else: self._initialize_cpu_backends() - else: + except Exception as e: + logger.error(f"Failed to initialize GPU backends: {e}") + # Fallback to CPU if GPU fails self._initialize_cpu_backends() def _initialize_cpu_backends(self) -> None: - """Initialize CPU backends.""" + """Initialize CPU backends based on settings.""" try: - # PyTorch CPU - self._backends['pytorch_cpu'] = PyTorchCPUBackend() - self._current_backend = 'pytorch_cpu' - logger.info("Initialized PyTorch CPU backend") - - # ONNX CPU - self._backends['onnx_cpu'] = ONNXCPUBackend() - logger.info("Initialized ONNX CPU backend") + if settings.use_onnx: + # ONNX CPU primary + self._backends['onnx_cpu'] = ONNXCPUBackend() + self._current_backend = 'onnx_cpu' + logger.info("Initialized ONNX CPU backend") + + # PyTorch CPU fallback + self._backends['pytorch_cpu'] = PyTorchCPUBackend() + logger.info("Initialized PyTorch CPU backend") + else: + # PyTorch CPU primary + self._backends['pytorch_cpu'] = PyTorchCPUBackend() + self._current_backend = 'pytorch_cpu' + logger.info("Initialized PyTorch CPU backend") + + # ONNX CPU fallback + self._backends['onnx_cpu'] = ONNXCPUBackend() + logger.info("Initialized ONNX CPU backend") except Exception as e: logger.error(f"Failed to initialize CPU backends: {e}") raise RuntimeError("No backends available") @@ -98,7 +114,7 @@ class ModelManager: return self._backends[backend_type] def _determine_backend(self, model_path: str) -> str: - """Determine appropriate backend based on model file. + """Determine appropriate backend based on model file and settings. Args: model_path: Path to model file @@ -106,10 +122,10 @@ class ModelManager: Returns: Backend type to use """ - is_onnx = model_path.lower().endswith('.onnx') has_gpu = settings.use_gpu and torch.cuda.is_available() - if is_onnx: + # If ONNX is preferred or model is ONNX format + if settings.use_onnx or model_path.lower().endswith('.onnx'): return 'onnx_gpu' if has_gpu else 'onnx_cpu' else: return 'pytorch_gpu' if has_gpu else 'pytorch_cpu' @@ -117,12 +133,14 @@ class ModelManager: async def load_model( self, model_path: str, + warmup_voice: Optional[torch.Tensor] = None, backend_type: Optional[str] = None ) -> None: """Load model on specified backend. 
Args: model_path: Path to model file + warmup_voice: Optional voice tensor for warmup, skips warmup if None backend_type: Backend to load on, uses default if None Raises: @@ -138,35 +156,39 @@ class ModelManager: backend = self.get_backend(backend_type) - # Load model and run warmup + # Load model await backend.load_model(abs_path) logger.info(f"Loaded model on {backend_type} backend") - await self._warmup_inference(backend) + + # Run warmup if voice provided + if warmup_voice is not None: + await self._warmup_inference(backend, warmup_voice) except Exception as e: raise RuntimeError(f"Failed to load model: {e}") - async def _warmup_inference(self, backend: BaseModelBackend) -> None: - """Run warmup inference to initialize model.""" + async def _warmup_inference(self, backend: BaseModelBackend, voice: torch.Tensor) -> None: + """Run warmup inference to initialize model. + + Args: + backend: Model backend to warm up + voice: Voice tensor already loaded on correct device + """ try: # Import here to avoid circular imports - from ..text_processing import process_text - - # Load default voice for warmup - voice = await self._voice_manager.load_voice(settings.default_voice, device=backend.device) - logger.info(f"Loaded voice {settings.default_voice} for warmup") + from ..services.text_processing import process_text # Use real text text = "Testing text to speech synthesis." - logger.info(f"Running warmup inference with voice: af") # Process through pipeline - sequences = process_text(text) - if not sequences: + tokens = process_text(text) + if not tokens: raise ValueError("Text processing failed") # Run inference - backend.generate(sequences[0], voice, speed=1.0) + backend.generate(tokens, voice, speed=1.0) + logger.info("Completed warmup inference") except Exception as e: logger.warning(f"Warmup inference failed: {e}") @@ -175,7 +197,7 @@ class ModelManager: async def generate( self, tokens: list[int], - voice_name: str, + voice: torch.Tensor, speed: float = 1.0, backend_type: Optional[str] = None ) -> torch.Tensor: @@ -183,7 +205,7 @@ class ModelManager: Args: tokens: Input token IDs - voice_name: Name of voice to use + voice: Voice tensor already loaded on correct device speed: Speed multiplier backend_type: Backend to use, uses default if None @@ -198,10 +220,7 @@ class ModelManager: raise RuntimeError("Model not loaded") try: - # Load voice using voice manager - voice = await self._voice_manager.load_voice(voice_name, device=backend.device) - - # Generate audio + # Generate audio using provided voice tensor return backend.generate(tokens, voice, speed) except Exception as e: diff --git a/api/src/inference/onnx_cpu.py b/api/src/inference/onnx_cpu.py index 600157f..34d68ac 100644 --- a/api/src/inference/onnx_cpu.py +++ b/api/src/inference/onnx_cpu.py @@ -13,8 +13,7 @@ from onnxruntime import ( ) from ..core import paths -from ..core.config import settings -from ..structures.model_schemas import ONNXConfig +from ..core.model_config import model_config from .base import BaseModelBackend @@ -26,14 +25,11 @@ class ONNXCPUBackend(BaseModelBackend): super().__init__() self._device = "cpu" self._session: Optional[InferenceSession] = None - self._config = ONNXConfig( - optimization_level=settings.onnx_optimization_level, - num_threads=settings.onnx_num_threads, - inter_op_threads=settings.onnx_inter_op_threads, - execution_mode=settings.onnx_execution_mode, - memory_pattern=settings.onnx_memory_pattern, - arena_extend_strategy=settings.onnx_arena_extend_strategy - ) + + @property + def is_loaded(self) 
-> bool: + """Check if model is loaded.""" + return self._session is not None async def load_model(self, path: str) -> None: """Load ONNX model. @@ -115,28 +111,29 @@ class ONNXCPUBackend(BaseModelBackend): Configured session options """ options = SessionOptions() + config = model_config.onnx_cpu # Set optimization level - if self._config.optimization_level == "all": + if config.optimization_level == "all": options.graph_optimization_level = GraphOptimizationLevel.ORT_ENABLE_ALL - elif self._config.optimization_level == "basic": + elif config.optimization_level == "basic": options.graph_optimization_level = GraphOptimizationLevel.ORT_ENABLE_BASIC else: options.graph_optimization_level = GraphOptimizationLevel.ORT_DISABLE_ALL # Configure threading - options.intra_op_num_threads = self._config.num_threads - options.inter_op_num_threads = self._config.inter_op_threads + options.intra_op_num_threads = config.num_threads + options.inter_op_num_threads = config.inter_op_threads # Set execution mode options.execution_mode = ( ExecutionMode.ORT_PARALLEL - if self._config.execution_mode == "parallel" + if config.execution_mode == "parallel" else ExecutionMode.ORT_SEQUENTIAL ) # Configure memory optimization - options.enable_mem_pattern = self._config.memory_pattern + options.enable_mem_pattern = config.memory_pattern return options @@ -148,7 +145,15 @@ class ONNXCPUBackend(BaseModelBackend): """ return { "CPUExecutionProvider": { - "arena_extend_strategy": self._config.arena_extend_strategy, + "arena_extend_strategy": model_config.onnx_cpu.arena_extend_strategy, "cpu_memory_arena_cfg": "cpu:0" } - } \ No newline at end of file + } + + def unload(self) -> None: + """Unload model and free resources.""" + if self._session is not None: + del self._session + self._session = None + if torch.cuda.is_available(): + torch.cuda.empty_cache() \ No newline at end of file diff --git a/api/src/inference/onnx_gpu.py b/api/src/inference/onnx_gpu.py index d003a86..2df32e2 100644 --- a/api/src/inference/onnx_gpu.py +++ b/api/src/inference/onnx_gpu.py @@ -13,8 +13,7 @@ from onnxruntime import ( ) from ..core import paths -from ..core.config import settings -from ..structures.model_schemas import ONNXGPUConfig +from ..core.model_config import model_config from .base import BaseModelBackend @@ -28,18 +27,11 @@ class ONNXGPUBackend(BaseModelBackend): raise RuntimeError("CUDA not available") self._device = "cuda" self._session: Optional[InferenceSession] = None - self._config = ONNXGPUConfig( - optimization_level=settings.onnx_optimization_level, - num_threads=settings.onnx_num_threads, - inter_op_threads=settings.onnx_inter_op_threads, - execution_mode=settings.onnx_execution_mode, - memory_pattern=settings.onnx_memory_pattern, - arena_extend_strategy=settings.onnx_arena_extend_strategy, - device_id=0, - gpu_mem_limit=0.7, - cudnn_conv_algo_search="EXHAUSTIVE", - do_copy_in_default_stream=True - ) + + @property + def is_loaded(self) -> bool: + """Check if model is loaded.""" + return self._session is not None async def load_model(self, path: str) -> None: """Load ONNX model. 
@@ -121,28 +113,29 @@ class ONNXGPUBackend(BaseModelBackend): Configured session options """ options = SessionOptions() + config = model_config.onnx_gpu # Set optimization level - if self._config.optimization_level == "all": + if config.optimization_level == "all": options.graph_optimization_level = GraphOptimizationLevel.ORT_ENABLE_ALL - elif self._config.optimization_level == "basic": + elif config.optimization_level == "basic": options.graph_optimization_level = GraphOptimizationLevel.ORT_ENABLE_BASIC else: options.graph_optimization_level = GraphOptimizationLevel.ORT_DISABLE_ALL # Configure threading - options.intra_op_num_threads = self._config.num_threads - options.inter_op_num_threads = self._config.inter_op_threads + options.intra_op_num_threads = config.num_threads + options.inter_op_num_threads = config.inter_op_threads # Set execution mode options.execution_mode = ( ExecutionMode.ORT_PARALLEL - if self._config.execution_mode == "parallel" + if config.execution_mode == "parallel" else ExecutionMode.ORT_SEQUENTIAL ) # Configure memory optimization - options.enable_mem_pattern = self._config.memory_pattern + options.enable_mem_pattern = config.memory_pattern return options @@ -152,12 +145,21 @@ class ONNXGPUBackend(BaseModelBackend): Returns: Provider configuration """ + config = model_config.onnx_gpu return { "CUDAExecutionProvider": { - "device_id": self._config.device_id, - "arena_extend_strategy": self._config.arena_extend_strategy, - "gpu_mem_limit": int(self._config.gpu_mem_limit * torch.cuda.get_device_properties(0).total_memory), - "cudnn_conv_algo_search": self._config.cudnn_conv_algo_search, - "do_copy_in_default_stream": self._config.do_copy_in_default_stream + "device_id": config.device_id, + "arena_extend_strategy": config.arena_extend_strategy, + "gpu_mem_limit": int(config.gpu_mem_limit * torch.cuda.get_device_properties(0).total_memory), + "cudnn_conv_algo_search": config.cudnn_conv_algo_search, + "do_copy_in_default_stream": config.do_copy_in_default_stream } - } \ No newline at end of file + } + + def unload(self) -> None: + """Unload model and free resources.""" + if self._session is not None: + del self._session + self._session = None + if torch.cuda.is_available(): + torch.cuda.empty_cache() \ No newline at end of file diff --git a/api/src/inference/pytorch_cpu.py b/api/src/inference/pytorch_cpu.py index e3fe5fa..b1d4328 100644 --- a/api/src/inference/pytorch_cpu.py +++ b/api/src/inference/pytorch_cpu.py @@ -9,7 +9,7 @@ from loguru import logger from ..builds.models import build_model from ..core import paths -from ..structures.model_schemas import PyTorchCPUConfig +from ..core.model_config import model_config from .base import BaseModelBackend @@ -118,12 +118,12 @@ class PyTorchCPUBackend(BaseModelBackend): super().__init__() self._device = "cpu" self._model: Optional[torch.nn.Module] = None - self._config = PyTorchCPUConfig() # Configure PyTorch CPU settings - if self._config.num_threads > 0: - torch.set_num_threads(self._config.num_threads) - if self._config.pin_memory: + config = model_config.pytorch_cpu + if config.num_threads > 0: + torch.set_num_threads(config.num_threads) + if config.pin_memory: torch.set_default_tensor_type(torch.FloatTensor) async def load_model(self, path: str) -> None: diff --git a/api/src/inference/pytorch_gpu.py b/api/src/inference/pytorch_gpu.py index 3b41f9f..2995818 100644 --- a/api/src/inference/pytorch_gpu.py +++ b/api/src/inference/pytorch_gpu.py @@ -9,7 +9,7 @@ from loguru import logger from ..builds.models import 
build_model from ..core import paths -from ..structures.model_schemas import PyTorchConfig +from ..core.model_config import model_config from .base import BaseModelBackend @@ -96,7 +96,12 @@ class PyTorchGPUBackend(BaseModelBackend): raise RuntimeError("CUDA not available") self._device = "cuda" self._model: Optional[torch.nn.Module] = None - self._config = PyTorchConfig() + + # Configure GPU settings + config = model_config.pytorch_gpu + if config.sync_cuda: + torch.cuda.synchronize() + torch.cuda.set_device(config.device_id) async def load_model(self, path: str) -> None: """Load PyTorch model. @@ -154,13 +159,19 @@ class PyTorchGPUBackend(BaseModelBackend): except Exception as e: logger.error(f"Generation failed: {e}") + if model_config.pytorch_gpu.retry_on_oom and "out of memory" in str(e).lower(): + self._clear_memory() + return self.generate(tokens, voice, speed) # Retry once raise + finally: + if model_config.pytorch_gpu.sync_cuda: + torch.cuda.synchronize() def _check_memory(self) -> bool: """Check if memory usage is above threshold.""" if torch.cuda.is_available(): memory_gb = torch.cuda.memory_allocated() / 1e9 - return memory_gb > self._config.memory_threshold + return memory_gb > model_config.pytorch_gpu.memory_threshold return False def _clear_memory(self) -> None: diff --git a/api/src/inference/voice_manager.py b/api/src/inference/voice_manager.py index 295040c..15932c7 100644 --- a/api/src/inference/voice_manager.py +++ b/api/src/inference/voice_manager.py @@ -33,7 +33,15 @@ class VoiceManager: Returns: Path to voice file if exists, None otherwise """ - voice_path = os.path.join(settings.voices_dir, f"{voice_name}.pt") + # Get api directory path (two levels up from inference) + api_dir = os.path.dirname(os.path.dirname(os.path.dirname(__file__))) + + # Construct voice path relative to api directory + voice_path = os.path.join(api_dir, settings.voices_dir, f"{voice_name}.pt") + + # Ensure voices directory exists + os.makedirs(os.path.dirname(voice_path), exist_ok=True) + return voice_path if os.path.exists(voice_path) else None async def load_voice(self, voice_name: str, device: str = "cpu") -> torch.Tensor: @@ -112,8 +120,15 @@ class VoiceManager: combined_name = "_".join(voices) combined_tensor = torch.mean(torch.stack(voice_tensors), dim=0) + # Get api directory path + api_dir = os.path.dirname(os.path.dirname(os.path.dirname(__file__))) + voices_dir = os.path.join(api_dir, settings.voices_dir) + + # Ensure voices directory exists + os.makedirs(voices_dir, exist_ok=True) + # Save combined voice - combined_path = os.path.join(settings.voices_dir, f"{combined_name}.pt") + combined_path = os.path.join(voices_dir, f"{combined_name}.pt") try: torch.save(combined_tensor, combined_path) except Exception as e: @@ -132,7 +147,15 @@ class VoiceManager: """ voices = [] try: - for entry in os.listdir(settings.voices_dir): + # Get api directory path + api_dir = os.path.dirname(os.path.dirname(os.path.dirname(__file__))) + voices_dir = os.path.join(api_dir, settings.voices_dir) + + # Ensure voices directory exists + os.makedirs(voices_dir, exist_ok=True) + + # List voice files + for entry in os.listdir(voices_dir): if entry.endswith(".pt"): voices.append(entry[:-3]) # Remove .pt extension except Exception as e: diff --git a/api/src/main.py b/api/src/main.py index 4effda2..ec1b6c8 100644 --- a/api/src/main.py +++ b/api/src/main.py @@ -13,7 +13,6 @@ from loguru import logger from .core.config import settings from .routers.development import router as dev_router from 
.routers.openai_compatible import router as openai_router -from .services.tts_model import TTSModel from .services.tts_service import TTSService @@ -44,25 +43,32 @@ async def lifespan(app: FastAPI): """Lifespan context manager for model initialization""" logger.info("Loading TTS model and voice packs...") - # Initialize the main model with warm-up - voicepack_count = await TTSModel.setup() - # boundary = "█████╗"*9 + # Initialize service + service = TTSService() + await service.ensure_initialized() + + # Get available voices + voices = await service.list_voices() + voicepack_count = len(voices) + + # Get device info from model manager + device = "GPU" if settings.use_gpu else "CPU" + model = "ONNX" if settings.use_onnx else "PyTorch" boundary = "░" * 2*12 startup_msg = f""" {boundary} ╔═╗┌─┐┌─┐┌┬┐ - ╠╣ ├─┤└─┐ │ - ╚ ┴ ┴└─┘ ┴ + ╠╣ ├─┤└─┐ │ + ╚ ┴ ┴└─┘ ┴ ╦╔═┌─┐┬┌─┌─┐ ╠╩╗│ │├┴┐│ │ ╩ ╩└─┘┴ ┴└─┘ {boundary} """ - # TODO: Improve CPU warmup, threads, memory, etc - startup_msg += f"\nModel warmed up on {TTSModel.get_device()}" + startup_msg += f"\nModel warmed up on {device}: {model}" startup_msg += f"\n{voicepack_count} voice packs loaded\n" startup_msg += f"\n{boundary}\n" logger.info(startup_msg) diff --git a/api/src/routers/development.py b/api/src/routers/development.py index bc43eb0..4713440 100644 --- a/api/src/routers/development.py +++ b/api/src/routers/development.py @@ -6,7 +6,6 @@ from loguru import logger from ..services.audio import AudioService from ..services.text_processing import phonemize, tokenize -from ..services.tts_model import TTSModel from ..services.tts_service import TTSService from ..structures.text_schemas import ( GenerateFromPhonemesRequest, @@ -82,27 +81,34 @@ async def generate_from_phonemes( detail={"error": "Invalid request", "message": "Phonemes cannot be empty"}, ) - # Validate voice exists - voice_path = tts_service._get_voice_path(request.voice) - if not voice_path: - raise HTTPException( - status_code=400, - detail={ - "error": "Invalid request", - "message": f"Voice not found: {request.voice}", - }, - ) - try: - # Load voice - voicepack = tts_service._load_voice(voice_path) + # Ensure service is initialized + await tts_service.ensure_initialized() + + # Validate voice exists + available_voices = await tts_service.list_voices() + if request.voice not in available_voices: + raise HTTPException( + status_code=400, + detail={ + "error": "Invalid request", + "message": f"Voice not found: {request.voice}", + }, + ) # Convert phonemes to tokens tokens = tokenize(request.phonemes) tokens = [0] + tokens + [0] # Add start/end tokens # Generate audio directly from tokens - audio = TTSModel.generate_from_tokens(tokens, voicepack, request.speed) + audio = await tts_service.model_manager.generate( + tokens, + request.voice, + speed=request.speed + ) + + if audio is None: + raise ValueError("Failed to generate audio") # Convert to WAV bytes wav_bytes = AudioService.convert_audio( diff --git a/api/src/services/text_processing/__init__.py b/api/src/services/text_processing/__init__.py index 8427232..c9b3eb4 100644 --- a/api/src/services/text_processing/__init__.py +++ b/api/src/services/text_processing/__init__.py @@ -1,13 +1,28 @@ -from .normalizer import normalize_text -from .phonemizer import EspeakBackend, PhonemizerBackend, phonemize -from .vocabulary import VOCAB, decode_tokens, tokenize +"""Text processing pipeline.""" -__all__ = [ - "normalize_text", - "phonemize", - "tokenize", - "decode_tokens", - "VOCAB", - "PhonemizerBackend", - "EspeakBackend", -] +from .chunker 
import split_text +from .normalizer import normalize_text +from .phonemizer import phonemize +from .vocabulary import tokenize + + +def process_text(text: str, language: str = "a") -> list[int]: + """Process text through the full pipeline. + + Args: + text: Input text + language: Language code ('a' for US English, 'b' for British English) + + Returns: + List of token IDs + + Note: + The pipeline: + 1. Converts text to phonemes using phonemizer + 2. Converts phonemes to token IDs using vocabulary + """ + # Convert text to phonemes + phonemes = phonemize(text, language=language) + + # Convert phonemes to token IDs + return tokenize(phonemes) diff --git a/api/src/services/tts_service.py b/api/src/services/tts_service.py index 20e9b69..9c51b19 100644 --- a/api/src/services/tts_service.py +++ b/api/src/services/tts_service.py @@ -5,17 +5,16 @@ import os import time from typing import List, Tuple -import torch - import numpy as np import scipy.io.wavfile as wavfile +import torch from loguru import logger from ..core.config import settings from ..inference.model_manager import get_manager as get_model_manager from ..inference.voice_manager import get_manager as get_voice_manager from .audio import AudioNormalizer, AudioService -from .text_processing import chunker, normalize_text +from .text_processing import chunker, normalize_text, process_text class TTSService: @@ -41,16 +40,33 @@ class TTSService: raise self._initialization_error try: - # Determine model path based on hardware + # Get api directory path (one level up from src) + api_dir = os.path.dirname(os.path.dirname(os.path.dirname(__file__))) + + # Determine model file and backend based on hardware if settings.use_gpu and torch.cuda.is_available(): - model_path = os.path.join(settings.model_dir, settings.pytorch_model_path) + model_file = settings.pytorch_model_file backend_type = 'pytorch_gpu' else: - model_path = os.path.join(settings.model_dir, settings.onnx_model_path) + model_file = settings.onnx_model_file backend_type = 'onnx_cpu' - # Initialize model - await self.model_manager.load_model(model_path, backend_type) + # Construct model path relative to api directory + model_path = os.path.join(api_dir, settings.model_dir, model_file) + + # Ensure model directory exists + os.makedirs(os.path.dirname(model_path), exist_ok=True) + + if not os.path.exists(model_path): + raise RuntimeError(f"Model file not found: {model_path}") + + # Load default voice for warmup + backend = self.model_manager.get_backend(backend_type) + warmup_voice = await self.voice_manager.load_voice(settings.default_voice, device=backend.device) + logger.info(f"Loaded voice {settings.default_voice} for warmup") + + # Initialize model with warmup voice + await self.model_manager.load_model(model_path, warmup_voice, backend_type) logger.info(f"Initialized model on {backend_type} backend") self._initialized = True @@ -86,16 +102,19 @@ class TTSService: audio_chunks = [] for chunk in chunker.split_text(text): try: - # Process text - - sequences = process_text(chunk) - if not sequences: + # Convert chunk to token IDs + tokens = process_text(chunk) + if not tokens: continue + # Get backend and load voice + backend = self.model_manager.get_backend() + voice_tensor = await self.voice_manager.load_voice(voice, device=backend.device) + # Generate audio chunk_audio = await self.model_manager.generate( - sequences[0], - voice, + tokens, + voice_tensor, speed=speed ) if chunk_audio is not None: @@ -154,14 +173,17 @@ class TTSService: while current_chunk is not None: next_chunk = 
next(chunk_gen, None) try: - # Process text - from ..text_processing import process_text - sequences = process_text(current_chunk) - if sequences: + # Convert chunk to token IDs + tokens = process_text(current_chunk) + if tokens: + # Get backend and load voice + backend = self.model_manager.get_backend() + voice_tensor = await self.voice_manager.load_voice(voice, device=backend.device) + # Generate audio chunk_audio = await self.model_manager.generate( - sequences[0], - voice, + tokens, + voice_tensor, speed=speed ) diff --git a/api/src/structures/model_schemas.py b/api/src/structures/model_schemas.py index 51ef910..b6ed61d 100644 --- a/api/src/structures/model_schemas.py +++ b/api/src/structures/model_schemas.py @@ -1,26 +1,13 @@ -"""Model and voice configuration schemas.""" +"""Voice configuration schemas.""" -from pydantic import BaseModel - - -class ModelConfig(BaseModel): - """Model configuration.""" - optimization_level: str = "all" # all, basic, none - num_threads: int = 4 - inter_op_threads: int = 4 - execution_mode: str = "parallel" # parallel, sequential - memory_pattern: bool = True - arena_extend_strategy: str = "kNextPowerOfTwo" - - class Config: - frozen = True # Make config immutable +from pydantic import BaseModel, Field class VoiceConfig(BaseModel): """Voice configuration.""" - use_cache: bool = True - cache_size: int = 3 # Number of voices to cache - validate_on_load: bool = True # Whether to validate voices when loading + use_cache: bool = Field(True, description="Whether to cache loaded voices") + cache_size: int = Field(3, description="Number of voices to cache") + validate_on_load: bool = Field(True, description="Whether to validate voices when loading") class Config: frozen = True # Make config immutable \ No newline at end of file diff --git a/docker/cpu/Dockerfile b/docker/cpu/Dockerfile index e4bd32c..0613022 100644 --- a/docker/cpu/Dockerfile +++ b/docker/cpu/Dockerfile @@ -17,29 +17,10 @@ COPY --from=ghcr.io/astral-sh/uv:latest /uv /uvx /bin/ RUN useradd -m -u 1000 appuser # Create directories and set ownership -RUN mkdir -p /app/models && \ - mkdir -p /app/api/src/voices && \ +RUN mkdir -p /app/api/src/voices && \ chown -R appuser:appuser /app USER appuser - -# Download and extract models -WORKDIR /app/models -RUN set -x && \ - curl -L -o model.tar.gz https://github.com/remsky/Kokoro-FastAPI/releases/download/v0.0.1/kokoro-82m-onnx.tar.gz && \ - echo "Downloaded model.tar.gz:" && ls -lh model.tar.gz && \ - tar xzf model.tar.gz && \ - echo "Contents after extraction:" && ls -lhR && \ - rm model.tar.gz && \ - echo "Final contents:" && ls -lhR - -# Download and extract voice models -WORKDIR /app/api/src/voices -RUN curl -L -o voices.tar.gz https://github.com/remsky/Kokoro-FastAPI/releases/download/v0.0.1/voice-models.tar.gz && \ - tar xzf voices.tar.gz && \ - rm voices.tar.gz - -# Switch back to app directory WORKDIR /app # Copy dependency files @@ -59,9 +40,10 @@ RUN --mount=type=cache,target=/root/.cache/uv \ # Set environment variables ENV PYTHONUNBUFFERED=1 -ENV PYTHONPATH=/app:/app/models +ENV PYTHONPATH=/app ENV PATH="/app/.venv/bin:$PATH" ENV UV_LINK_MODE=copy +ENV USE_GPU=false # Run FastAPI server CMD ["uv", "run", "python", "-m", "uvicorn", "api.src.main:app", "--host", "0.0.0.0", "--port", "8880", "--log-level", "debug"] diff --git a/docker/cpu/docker-compose.yml b/docker/cpu/docker-compose.yml index 0f9003a..2138594 100644 --- a/docker/cpu/docker-compose.yml +++ b/docker/cpu/docker-compose.yml @@ -6,12 +6,11 @@ services: context: ../.. 
dockerfile: docker/cpu/Dockerfile volumes: - - ../../api/src:/app/api/src - - ../../api/src/voices:/app/api/src/voices + - ../../api:/app/api ports: - "8880:8880" environment: - - PYTHONPATH=/app:/app/models + - PYTHONPATH=/app:/app/api # ONNX Optimization Settings for vectorized operations - ONNX_NUM_THREADS=8 # Maximize core usage for vectorized ops - ONNX_INTER_OP_THREADS=4 # Higher inter-op for parallel matrix operations @@ -20,20 +19,20 @@ services: - ONNX_MEMORY_PATTERN=true - ONNX_ARENA_EXTEND_STRATEGY=kNextPowerOfTwo - # Gradio UI service [Comment out everything below if you don't need it] - gradio-ui: - image: ghcr.io/remsky/kokoro-fastapi-ui:v0.1.0 - # Uncomment below (and comment out above) to build from source instead of using the released image - build: - context: ../../ui - ports: - - "7860:7860" - volumes: - - ../../ui/data:/app/ui/data - - ../../ui/app.py:/app/app.py # Mount app.py for hot reload - environment: - - GRADIO_WATCH=True # Enable hot reloading - - PYTHONUNBUFFERED=1 # Ensure Python output is not buffered - - DISABLE_LOCAL_SAVING=false # Set to 'true' to disable local saving and hide file view - - API_HOST=kokoro-tts # Set TTS service URL - - API_PORT=8880 # Set TTS service PORT + # # Gradio UI service [Comment out everything below if you don't need it] + # gradio-ui: + # image: ghcr.io/remsky/kokoro-fastapi-ui:v0.1.0 + # # Uncomment below (and comment out above) to build from source instead of using the released image + # build: + # context: ../../ui + # ports: + # - "7860:7860" + # volumes: + # - ../../ui/data:/app/ui/data + # - ../../ui/app.py:/app/app.py # Mount app.py for hot reload + # environment: + # - GRADIO_WATCH=True # Enable hot reloading + # - PYTHONUNBUFFERED=1 # Ensure Python output is not buffered + # - DISABLE_LOCAL_SAVING=false # Set to 'true' to disable local saving and hide file view + # - API_HOST=kokoro-tts # Set TTS service URL + # - API_PORT=8880 # Set TTS service PORT diff --git a/docker/cpu/download_onnx.py b/docker/cpu/download_onnx.py new file mode 100755 index 0000000..a04718c --- /dev/null +++ b/docker/cpu/download_onnx.py @@ -0,0 +1,53 @@ +#!/usr/bin/env python3 +import os +import sys +import requests +from pathlib import Path +from typing import List + +def download_file(url: str, output_dir: Path) -> None: + """Download a file from URL to the specified directory.""" + filename = os.path.basename(url) + output_path = output_dir / filename + + print(f"Downloading {filename}...") + response = requests.get(url, stream=True) + response.raise_for_status() + + with open(output_path, 'wb') as f: + for chunk in response.iter_content(chunk_size=8192): + f.write(chunk) + +def find_project_root() -> Path: + """Find project root by looking for api directory.""" + max_steps = 5 + current = Path(__file__).resolve() + for _ in range(max_steps): + if (current / 'api').is_dir(): + return current + current = current.parent + raise RuntimeError("Could not find project root (no api directory found)") + +def main(custom_models: List[str] = None): + # Always use top-level models directory relative to project root + project_root = find_project_root() + models_dir = project_root / 'api' / 'src' / 'models' + models_dir.mkdir(exist_ok=True, parents=True) + + # Default ONNX model if no arguments provided + default_models = [ + "https://github.com/remsky/Kokoro-FastAPI/releases/download/v0.1.0/kokoro-v0_19.onnx", + "https://github.com/remsky/Kokoro-FastAPI/releases/download/v0.1.0/kokoro-v0_19_fp16.onnx" + ] + + # Use provided models or default + 
models_to_download = custom_models if custom_models else default_models + + for model_url in models_to_download: + try: + download_file(model_url, models_dir) + except Exception as e: + print(f"Error downloading {model_url}: {e}") + +if __name__ == "__main__": + main(sys.argv[1:] if len(sys.argv) > 1 else None) \ No newline at end of file diff --git a/docker/cpu/download_onnx.sh b/docker/cpu/download_onnx.sh new file mode 100755 index 0000000..c0a250b --- /dev/null +++ b/docker/cpu/download_onnx.sh @@ -0,0 +1,32 @@ +#!/bin/bash + +# Ensure models directory exists +mkdir -p api/src/models + +# Function to download a file +download_file() { + local url="$1" + local filename=$(basename "$url") + echo "Downloading $filename..." + curl -L "$url" -o "api/src/models/$filename" +} + +# Default ONNX model if no arguments provided +DEFAULT_MODELS=( + "https://github.com/remsky/Kokoro-FastAPI/releases/download/v0.1.0/kokoro-v0_19.onnx" + "https://github.com/remsky/Kokoro-FastAPI/releases/download/v0.1.0/kokoro-v0_19_fp16.onnx" +) + +# Use provided models or default +if [ $# -gt 0 ]; then + MODELS=("$@") +else + MODELS=("${DEFAULT_MODELS[@]}") +fi + +# Download all models +for model in "${MODELS[@]}"; do + download_file "$model" +done + +echo "ONNX model download complete!" \ No newline at end of file diff --git a/docker/gpu/Dockerfile b/docker/gpu/Dockerfile index ed4676d..e6bf9ff 100644 --- a/docker/gpu/Dockerfile +++ b/docker/gpu/Dockerfile @@ -1,9 +1,8 @@ -FROM nvidia/cuda:12.1.0-base-ubuntu22.04 - -# Install Python and other dependencies +FROM pytorch/pytorch:2.1.0-cuda11.8-cudnn8-runtime +# Set non-interactive frontend +ENV DEBIAN_FRONTEND=noninteractive +# Install dependencies RUN apt-get update && apt-get install -y --no-install-recommends \ - python3.10 \ - python3.10-venv \ espeak-ng \ git \ libsndfile1 \ @@ -19,25 +18,10 @@ COPY --from=ghcr.io/astral-sh/uv:latest /uv /uvx /bin/ RUN useradd -m -u 1000 appuser # Create directories and set ownership -RUN mkdir -p /app/models && \ - mkdir -p /app/api/src/voices && \ +RUN mkdir -p /app/api/src/voices && \ chown -R appuser:appuser /app USER appuser - -# Download and extract models -WORKDIR /app/models -RUN curl -L -o model.tar.gz https://github.com/remsky/Kokoro-FastAPI/releases/download/v0.0.1/kokoro-82m-pytorch.tar.gz && \ - tar xzf model.tar.gz && \ - rm model.tar.gz - -# Download and extract voice models -WORKDIR /app/api/src/voices -RUN curl -L -o voices.tar.gz https://github.com/remsky/Kokoro-FastAPI/releases/download/v0.0.1/voice-models.tar.gz && \ - tar xzf voices.tar.gz && \ - rm voices.tar.gz - -# Switch back to app directory WORKDIR /app # Copy dependency files @@ -57,9 +41,10 @@ RUN --mount=type=cache,target=/root/.cache/uv \ # Set environment variables ENV PYTHONUNBUFFERED=1 -ENV PYTHONPATH=/app:/app/models +ENV PYTHONPATH=/app ENV PATH="/app/.venv/bin:$PATH" ENV UV_LINK_MODE=copy +ENV USE_GPU=true # Run FastAPI server CMD ["uv", "run", "python", "-m", "uvicorn", "api.src.main:app", "--host", "0.0.0.0", "--port", "8880", "--log-level", "debug"] diff --git a/docker/gpu/docker-compose.yml b/docker/gpu/docker-compose.yml index 9433dc6..458eff9 100644 --- a/docker/gpu/docker-compose.yml +++ b/docker/gpu/docker-compose.yml @@ -6,12 +6,14 @@ services: context: ../.. 
dockerfile: docker/gpu/Dockerfile volumes: - - ../../api/src:/app/api/src # Mount src for development - - ../../api/src/voices:/app/api/src/voices # Mount voices for persistence + - ../../api:/app/api ports: - "8880:8880" environment: - - PYTHONPATH=/app:/app/models + - PYTHONPATH=/app + - USE_GPU=true + - USE_ONNX=false + - PYTHONUNBUFFERED=1 deploy: resources: reservations: @@ -20,20 +22,20 @@ services: count: 1 capabilities: [gpu] - # Gradio UI service - gradio-ui: - image: ghcr.io/remsky/kokoro-fastapi-ui:v0.1.0 - # Uncomment below to build from source instead of using the released image - # build: - # context: ../../ui - ports: - - "7860:7860" - volumes: - - ../../ui/data:/app/ui/data - - ../../ui/app.py:/app/app.py # Mount app.py for hot reload - environment: - - GRADIO_WATCH=1 # Enable hot reloading - - PYTHONUNBUFFERED=1 # Ensure Python output is not buffered - - DISABLE_LOCAL_SAVING=false # Set to 'true' to disable local saving and hide file view - - API_HOST=kokoro-tts # Set TTS service URL - - API_PORT=8880 # Set TTS service PORT + # # Gradio UI service + # gradio-ui: + # image: ghcr.io/remsky/kokoro-fastapi-ui:v0.1.0 + # # Uncomment below to build from source instead of using the released image + # # build: + # # context: ../../ui + # ports: + # - "7860:7860" + # volumes: + # - ../../ui/data:/app/ui/data + # - ../../ui/app.py:/app/app.py # Mount app.py for hot reload + # environment: + # - GRADIO_WATCH=1 # Enable hot reloading + # - PYTHONUNBUFFERED=1 # Ensure Python output is not buffered + # - DISABLE_LOCAL_SAVING=false # Set to 'true' to disable local saving and hide file view + # - API_HOST=kokoro-tts # Set TTS service URL + # - API_PORT=8880 # Set TTS service PORT diff --git a/docker/gpu/download_pth.py b/docker/gpu/download_pth.py new file mode 100755 index 0000000..f58f29d --- /dev/null +++ b/docker/gpu/download_pth.py @@ -0,0 +1,57 @@ +#!/usr/bin/env python3 +import os +import sys +import requests +from pathlib import Path +from typing import List + +def download_file(url: str, output_dir: Path) -> None: + """Download a file from URL to the specified directory.""" + filename = os.path.basename(url) + if not filename.endswith('.pth'): + print(f"Warning: {filename} is not a .pth file") + return + + output_path = output_dir / filename + + print(f"Downloading {filename}...") + response = requests.get(url, stream=True) + response.raise_for_status() + + with open(output_path, 'wb') as f: + for chunk in response.iter_content(chunk_size=8192): + f.write(chunk) + +def find_project_root() -> Path: + """Find project root by looking for api directory.""" + max_steps = 5 + current = Path(__file__).resolve() + for _ in range(max_steps): + if (current / 'api').is_dir(): + return current + current = current.parent + raise RuntimeError("Could not find project root (no api directory found)") + +def main(custom_models: List[str] = None): + # Find project root and ensure models directory exists + project_root = find_project_root() + models_dir = project_root / 'api' / 'src' / 'models' + print(f"Downloading models to {models_dir}") + models_dir.mkdir(exist_ok=True) + + # Default PTH model if no arguments provided + default_models = [ + "https://github.com/remsky/Kokoro-FastAPI/releases/download/v0.1.0/kokoro-v0_19.pth" + ] + + # Use provided models or default + models_to_download = custom_models if custom_models else default_models + + for model_url in models_to_download: + try: + download_file(model_url, models_dir) + except Exception as e: + print(f"Error downloading {model_url}: {e}") + 
+if __name__ == "__main__": + main(sys.argv[1:] if len(sys.argv) > 1 else None) \ No newline at end of file diff --git a/docker/gpu/download_pth.sh b/docker/gpu/download_pth.sh new file mode 100755 index 0000000..c8bda83 --- /dev/null +++ b/docker/gpu/download_pth.sh @@ -0,0 +1,31 @@ +#!/bin/bash + +# Ensure models directory exists +mkdir -p api/src/models + +# Function to download a file +download_file() { + local url="$1" + local filename=$(basename "$url") + echo "Downloading $filename..." + curl -L "$url" -o "api/src/models/$filename" +} + +# Default PTH model if no arguments provided +DEFAULT_MODELS=( + "https://github.com/remsky/Kokoro-FastAPI/releases/download/v0.1.0/kokoro-v0_19.pth" +) + +# Use provided models or default +if [ $# -gt 0 ]; then + MODELS=("$@") +else + MODELS=("${DEFAULT_MODELS[@]}") +fi + +# Download all models +for model in "${MODELS[@]}"; do + download_file "$model" +done + +echo "PyTorch model download complete!" \ No newline at end of file diff --git a/uv.lock b/uv.lock index 52e4de2..ab36884 100644 --- a/uv.lock +++ b/uv.lock @@ -2,9 +2,17 @@ version = 1 requires-python = ">=3.10" resolution-markers = [ "python_full_version == '3.11.*'", + "(python_full_version == '3.11.*' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version == '3.11.*' and sys_platform != 'darwin' and sys_platform != 'linux')", + "(python_full_version == '3.11.*' and platform_machine == 'aarch64' and sys_platform == 'linux') or (python_full_version == '3.11.*' and sys_platform == 'darwin')", "python_full_version < '3.11'", + "(python_full_version < '3.11' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version < '3.11' and sys_platform != 'darwin' and sys_platform != 'linux')", + "(python_full_version < '3.11' and platform_machine == 'aarch64' and sys_platform == 'linux') or (python_full_version < '3.11' and sys_platform == 'darwin')", "python_full_version >= '3.13'", + "(python_full_version >= '3.13' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version >= '3.13' and sys_platform != 'darwin' and sys_platform != 'linux')", + "(python_full_version >= '3.13' and platform_machine == 'aarch64' and sys_platform == 'linux') or (python_full_version >= '3.13' and sys_platform == 'darwin')", "python_full_version == '3.12.*'", + "(python_full_version == '3.12.*' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version == '3.12.*' and sys_platform != 'darwin' and sys_platform != 'linux')", + "(python_full_version == '3.12.*' and platform_machine == 'aarch64' and sys_platform == 'linux') or (python_full_version == '3.12.*' and sys_platform == 'darwin')", ] conflicts = [[ { package = "kokoro-fastapi", extra = "cpu" }, @@ -798,6 +806,7 @@ dependencies = [ { name = "phonemizer" }, { name = "pydantic" }, { name = "pydantic-settings" }, + { name = "pydub" }, { name = "python-dotenv" }, { name = "regex" }, { name = "requests" }, @@ -812,7 +821,8 @@ dependencies = [ [package.optional-dependencies] cpu = [ - { name = "torch", version = "2.5.1+cpu", source = { registry = "https://download.pytorch.org/whl/cpu" } }, + { name = "torch", version = "2.5.1", source = { registry = "https://download.pytorch.org/whl/cpu" }, marker = "(platform_machine == 'aarch64' and sys_platform == 'linux') or sys_platform == 'darwin'" }, + { name = "torch", version = "2.5.1+cpu", source = { registry = "https://download.pytorch.org/whl/cpu" }, marker = "(platform_machine != 'aarch64' and sys_platform == 
'linux') or (sys_platform != 'darwin' and sys_platform != 'linux')" }, ] gpu = [ { name = "torch", version = "2.5.1+cu121", source = { registry = "https://download.pytorch.org/whl/cu121" } }, @@ -824,7 +834,6 @@ test = [ { name = "pytest" }, { name = "pytest-asyncio" }, { name = "pytest-cov" }, - { name = "ruff" }, ] [package.metadata] @@ -845,18 +854,18 @@ requires-dist = [ { name = "phonemizer", specifier = "==3.3.0" }, { name = "pydantic", specifier = "==2.10.4" }, { name = "pydantic-settings", specifier = "==2.7.0" }, + { name = "pydub", specifier = ">=0.25.1" }, { name = "pytest", marker = "extra == 'test'", specifier = "==8.0.0" }, { name = "pytest-asyncio", marker = "extra == 'test'", specifier = "==0.23.5" }, { name = "pytest-cov", marker = "extra == 'test'", specifier = "==4.1.0" }, { name = "python-dotenv", specifier = "==1.0.1" }, { name = "regex", specifier = "==2024.11.6" }, { name = "requests", specifier = "==2.32.3" }, - { name = "ruff", marker = "extra == 'test'", specifier = ">=0.2.2" }, { name = "scipy", specifier = "==1.14.1" }, { name = "soundfile", specifier = "==0.13.0" }, { name = "sqlalchemy", specifier = "==2.0.27" }, { name = "tiktoken", specifier = "==0.8.0" }, - { name = "torch", marker = "extra == 'cpu'", specifier = "==2.5.1+cpu", index = "https://download.pytorch.org/whl/cpu", conflict = { package = "kokoro-fastapi", extra = "cpu" } }, + { name = "torch", marker = "extra == 'cpu'", specifier = "==2.5.1", index = "https://download.pytorch.org/whl/cpu", conflict = { package = "kokoro-fastapi", extra = "cpu" } }, { name = "torch", marker = "extra == 'gpu'", specifier = "==2.5.1+cu121", index = "https://download.pytorch.org/whl/cu121", conflict = { package = "kokoro-fastapi", extra = "gpu" } }, { name = "tqdm", specifier = "==4.67.1" }, { name = "transformers", specifier = "==4.47.1" }, @@ -2334,24 +2343,52 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/68/4f/12207897848a653d03ebbf6775a29d949408ded5f99b2d87198bc5c93508/tomlkit-0.12.0-py3-none-any.whl", hash = "sha256:926f1f37a1587c7a4f6c7484dae538f1345d96d793d9adab5d3675957b1d0766", size = 37334 }, ] +[[package]] +name = "torch" +version = "2.5.1" +source = { registry = "https://download.pytorch.org/whl/cpu" } +resolution-markers = [ + "(python_full_version == '3.11.*' and platform_machine == 'aarch64' and sys_platform == 'linux') or (python_full_version == '3.11.*' and sys_platform == 'darwin')", + "(python_full_version < '3.11' and platform_machine == 'aarch64' and sys_platform == 'linux') or (python_full_version < '3.11' and sys_platform == 'darwin')", + "(python_full_version >= '3.13' and platform_machine == 'aarch64' and sys_platform == 'linux') or (python_full_version >= '3.13' and sys_platform == 'darwin')", + "(python_full_version == '3.12.*' and platform_machine == 'aarch64' and sys_platform == 'linux') or (python_full_version == '3.12.*' and sys_platform == 'darwin')", +] +dependencies = [ + { name = "filelock", marker = "(platform_machine == 'aarch64' and sys_platform == 'linux') or sys_platform == 'darwin'" }, + { name = "fsspec", marker = "(platform_machine == 'aarch64' and sys_platform == 'linux') or sys_platform == 'darwin'" }, + { name = "jinja2", marker = "(platform_machine == 'aarch64' and sys_platform == 'linux') or sys_platform == 'darwin'" }, + { name = "networkx", marker = "(platform_machine == 'aarch64' and sys_platform == 'linux') or sys_platform == 'darwin'" }, + { name = "setuptools", marker = "(python_full_version >= '3.12' and platform_machine == 'aarch64' and 
sys_platform == 'linux') or (python_full_version >= '3.12' and sys_platform == 'darwin')" }, + { name = "sympy", marker = "(platform_machine == 'aarch64' and sys_platform == 'linux') or sys_platform == 'darwin'" }, + { name = "typing-extensions", marker = "(platform_machine == 'aarch64' and sys_platform == 'linux') or sys_platform == 'darwin'" }, +] +wheels = [ + { url = "https://download.pytorch.org/whl/cpu/torch-2.5.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:269b10c34430aa8e9643dbe035dc525c4a9b1d671cd3dbc8ecbcaed280ae322d" }, + { url = "https://download.pytorch.org/whl/cpu/torch-2.5.1-cp310-none-macosx_11_0_arm64.whl", hash = "sha256:23d062bf70776a3d04dbe74db950db2a5245e1ba4f27208a87f0d743b0d06e86" }, + { url = "https://download.pytorch.org/whl/cpu/torch-2.5.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d5b3203f191bc40783c99488d2e776dcf93ac431a59491d627a1ca5b3ae20b22" }, + { url = "https://download.pytorch.org/whl/cpu/torch-2.5.1-cp311-none-macosx_11_0_arm64.whl", hash = "sha256:31f8c39660962f9ae4eeec995e3049b5492eb7360dd4f07377658ef4d728fa4c" }, + { url = "https://download.pytorch.org/whl/cpu/torch-2.5.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:36d1be99281b6f602d9639bd0af3ee0006e7aab16f6718d86f709d395b6f262c" }, + { url = "https://download.pytorch.org/whl/cpu/torch-2.5.1-cp312-none-macosx_11_0_arm64.whl", hash = "sha256:8c712df61101964eb11910a846514011f0b6f5920c55dbf567bff8a34163d5b1" }, +] + [[package]] name = "torch" version = "2.5.1+cpu" source = { registry = "https://download.pytorch.org/whl/cpu" } resolution-markers = [ - "python_full_version == '3.11.*'", - "python_full_version < '3.11'", - "python_full_version >= '3.13'", - "python_full_version == '3.12.*'", + "(python_full_version == '3.11.*' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version == '3.11.*' and sys_platform != 'darwin' and sys_platform != 'linux')", + "(python_full_version < '3.11' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version < '3.11' and sys_platform != 'darwin' and sys_platform != 'linux')", + "(python_full_version >= '3.13' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version >= '3.13' and sys_platform != 'darwin' and sys_platform != 'linux')", + "(python_full_version == '3.12.*' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version == '3.12.*' and sys_platform != 'darwin' and sys_platform != 'linux')", ] dependencies = [ - { name = "filelock" }, - { name = "fsspec" }, - { name = "jinja2" }, - { name = "networkx" }, - { name = "setuptools", marker = "python_full_version >= '3.12'" }, - { name = "sympy" }, - { name = "typing-extensions" }, + { name = "filelock", marker = "(platform_machine != 'aarch64' and sys_platform == 'linux') or (sys_platform != 'darwin' and sys_platform != 'linux')" }, + { name = "fsspec", marker = "(platform_machine != 'aarch64' and sys_platform == 'linux') or (sys_platform != 'darwin' and sys_platform != 'linux')" }, + { name = "jinja2", marker = "(platform_machine != 'aarch64' and sys_platform == 'linux') or (sys_platform != 'darwin' and sys_platform != 'linux')" }, + { name = "networkx", marker = "(platform_machine != 'aarch64' and sys_platform == 'linux') or (sys_platform != 'darwin' and sys_platform != 'linux')" }, + { name = "setuptools", marker = "(python_full_version >= '3.12' and platform_machine != 'aarch64' and sys_platform == 
'linux') or (python_full_version >= '3.12' and sys_platform != 'darwin' and sys_platform != 'linux')" }, + { name = "sympy", marker = "(platform_machine != 'aarch64' and sys_platform == 'linux') or (sys_platform != 'darwin' and sys_platform != 'linux')" }, + { name = "typing-extensions", marker = "(platform_machine != 'aarch64' and sys_platform == 'linux') or (sys_platform != 'darwin' and sys_platform != 'linux')" }, ] wheels = [ { url = "https://download.pytorch.org/whl/cpu/torch-2.5.1%2Bcpu-cp310-cp310-linux_x86_64.whl", hash = "sha256:7f91a2200e352745d70e22396bd501448e28350fbdbd8d8b1c83037e25451150" },