Mirror of https://github.com/remsky/Kokoro-FastAPI.git (synced 2025-08-05 16:48:53 +00:00)
v1_0 multilanguage initial support

Note: all languages are currently installed; the language is selected by the prefix of the first voice chosen in the call.

This commit is contained in:
parent 240d991405
commit 903bf91c81

50 changed files with 997 additions and 167 deletions
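For reference, a minimal sketch of the selection rule the note above describes, mirroring the logic added in `api/src/inference/kokoro_v1.py` below: the first letter of the voice filename becomes the `KPipeline` `lang_code`, with `'a'` (American English) as the fallback. The mapping of other prefix letters to specific languages is an assumption inferred from the bundled voice names, not something this commit spells out.

```python
import os

def lang_code_from_voice(voice_path: str) -> str:
    """Derive the KPipeline lang_code from a voice file name (same rule as KokoroV1.generate)."""
    name = os.path.basename(voice_path)
    if name.endswith(".pt"):
        name = name[:-3]
    return name[0] if name else "a"  # default to American English

# "af_heart.pt" -> 'a' (American English); "jf_alpha.pt" -> 'j' (assumed Japanese)
print(lang_code_from_voice("api/src/voices/v1_0/af_heart.pt"))
```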
172	api/src/builds/v1_0/config.json	Normal file

@@ -0,0 +1,172 @@
{
  "istftnet": {
    "upsample_kernel_sizes": [
      20,
      12
    ],
    "upsample_rates": [
      10,
      6
    ],
    "gen_istft_hop_size": 5,
    "gen_istft_n_fft": 20,
    "resblock_dilation_sizes": [
      [
        1,
        3,
        5
      ],
      [
        1,
        3,
        5
      ],
      [
        1,
        3,
        5
      ]
    ],
    "resblock_kernel_sizes": [
      3,
      7,
      11
    ],
    "upsample_initial_channel": 512
  },
  "dim_in": 64,
  "dropout": 0.2,
  "hidden_dim": 512,
  "max_conv_dim": 512,
  "max_dur": 50,
  "multispeaker": true,
  "n_layer": 3,
  "n_mels": 80,
  "n_token": 178,
  "style_dim": 128,
  "text_encoder_kernel_size": 5,
  "plbert": {
    "hidden_size": 768,
    "num_attention_heads": 12,
    "intermediate_size": 2048,
    "max_position_embeddings": 512,
    "num_hidden_layers": 12,
    "dropout": 0.1
  },
  "vocab": {
    ";": 1,
    ":": 2,
    ",": 3,
    ".": 4,
    "!": 5,
    "?": 6,
    "—": 9,
    "…": 10,
    "\"": 11,
    "(": 12,
    ")": 13,
    "“": 14,
    "”": 15,
    " ": 16,
    "̃": 17,
    "ʣ": 18,
    "ʥ": 19,
    "ʦ": 20,
    "ʨ": 21,
    "ᵝ": 22,
    "ꭧ": 23,
    "A": 24,
    "I": 25,
    "O": 31,
    "Q": 33,
    "S": 35,
    "T": 36,
    "W": 39,
    "Y": 41,
    "ᵊ": 42,
    "a": 43,
    "b": 44,
    "c": 45,
    "d": 46,
    "e": 47,
    "f": 48,
    "h": 50,
    "i": 51,
    "j": 52,
    "k": 53,
    "l": 54,
    "m": 55,
    "n": 56,
    "o": 57,
    "p": 58,
    "q": 59,
    "r": 60,
    "s": 61,
    "t": 62,
    "u": 63,
    "v": 64,
    "w": 65,
    "x": 66,
    "y": 67,
    "z": 68,
    "ɑ": 69,
    "ɐ": 70,
    "ɒ": 71,
    "æ": 72,
    "β": 75,
    "ɔ": 76,
    "ɕ": 77,
    "ç": 78,
    "ɖ": 80,
    "ð": 81,
    "ʤ": 82,
    "ə": 83,
    "ɚ": 85,
    "ɛ": 86,
    "ɜ": 87,
    "ɟ": 90,
    "ɡ": 92,
    "ɥ": 99,
    "ɨ": 101,
    "ɪ": 102,
    "ʝ": 103,
    "ɯ": 110,
    "ɰ": 111,
    "ŋ": 112,
    "ɳ": 113,
    "ɲ": 114,
    "ɴ": 115,
    "ø": 116,
    "ɸ": 118,
    "θ": 119,
    "œ": 120,
    "ɹ": 123,
    "ɾ": 125,
    "ɻ": 126,
    "ʁ": 128,
    "ɽ": 129,
    "ʂ": 130,
    "ʃ": 131,
    "ʈ": 132,
    "ʧ": 133,
    "ʊ": 135,
    "ʋ": 136,
    "ʌ": 138,
    "ɣ": 139,
    "ɤ": 140,
    "χ": 142,
    "ʎ": 143,
    "ʒ": 147,
    "ʔ": 148,
    "ˈ": 156,
    "ˌ": 157,
    "ː": 158,
    "ʰ": 162,
    "ʲ": 164,
    "↓": 169,
    "→": 171,
    "↗": 172,
    "↘": 173,
    "ᵻ": 177
  }
}
@@ -2,6 +2,9 @@
 from pydantic import BaseModel, Field
 
 
+class KokoroV1Config(BaseModel):
+    languages: list[str] = ["en"]
+
 
 class ONNXCPUConfig(BaseModel):
     """ONNX CPU runtime configuration."""
@@ -77,6 +80,7 @@ class ModelConfig(BaseModel):
     voice_cache_size: int = Field(2, description="Maximum number of cached voices")
 
     # Model filenames
+    pytorch_kokoro_v1_file: str = Field("v1_0/kokoro-v1_0.pth", description="PyTorch Kokoro V1 model filename")
     pytorch_model_file: str = Field("kokoro-v0_19-half.pth", description="PyTorch model filename")
     onnx_model_file: str = Field("kokoro-v0_19.onnx", description="ONNX model filename")
 
@@ -93,7 +97,7 @@ class ModelConfig(BaseModel):
         """Get configuration for specific backend.
 
         Args:
-            backend_type: Backend type ('pytorch_cpu', 'pytorch_gpu', 'onnx_cpu', 'onnx_gpu')
+            backend_type: Backend type ('pytorch_cpu', 'pytorch_gpu', 'onnx_cpu', 'onnx_gpu', 'kokoro_v1')
 
         Returns:
             Backend-specific configuration
@@ -102,7 +106,7 @@ class ModelConfig(BaseModel):
             ValueError: If backend type is invalid
         """
         if backend_type not in {
-            'pytorch_cpu', 'pytorch_gpu', 'onnx_cpu', 'onnx_gpu'
+            'pytorch_cpu', 'pytorch_gpu', 'onnx_cpu', 'onnx_gpu', 'kokoro_v1'
         }:
             raise ValueError(f"Invalid backend type: {backend_type}")
186	api/src/inference/kokoro_v1.py	Normal file

@@ -0,0 +1,186 @@
"""PyTorch inference backend with environment-based configuration."""

import gc
import os
from typing import AsyncGenerator, Optional, List, Union, Tuple
from contextlib import nullcontext

import numpy as np
import torch
from loguru import logger

from ..core import paths
from ..core.model_config import model_config
from ..core.config import settings
from .base import BaseModelBackend
from kokoro import KModel, KPipeline


class KokoroV1(BaseModelBackend):
    """Kokoro package based inference backend with environment-based configuration."""

    def __init__(self):
        """Initialize backend based on environment configuration."""
        super().__init__()

        # Configure device based on settings
        self._device = (
            "cuda" if settings.use_gpu and torch.cuda.is_available() else "cpu"
        )
        self._model: Optional[KModel] = None
        self._pipeline: Optional[KPipeline] = None

    async def load_model(self, path: str) -> None:
        """Load Kokoro model.

        Args:
            path: Path to model file

        Raises:
            RuntimeError: If model loading fails
        """
        try:
            # Get verified model path
            model_path = await paths.get_model_path(path)

            # Get config.json path from the same directory
            config_path = os.path.join(os.path.dirname(model_path), 'config.json')

            if not os.path.exists(config_path):
                raise RuntimeError(f"Config file not found: {config_path}")

            logger.info(f"Loading Kokoro model on {self._device}")
            logger.info(f"Config path: {config_path}")
            logger.info(f"Model path: {model_path}")

            # Initialize model with config and weights
            self._model = KModel(config=config_path, model=model_path).to(self._device).eval()
            # Initialize pipeline with American English by default
            self._pipeline = KPipeline(lang_code='a', model=self._model, device=self._device)

        except Exception as e:
            raise RuntimeError(f"Failed to load Kokoro model: {e}")

    async def generate(
        self, text: str, voice: Union[str, Tuple[str, Union[torch.Tensor, str]]], speed: float = 1.0
    ) -> AsyncGenerator[np.ndarray, None]:
        """Generate audio using model.

        Args:
            text: Input text to synthesize
            voice: Either a voice path string or a tuple of (voice_name, voice_tensor_or_path)
            speed: Speed multiplier

        Yields:
            Generated audio chunks

        Raises:
            RuntimeError: If generation fails
        """
        if not self.is_loaded:
            raise RuntimeError("Model not loaded")

        try:
            # Memory management for GPU
            if self._device == "cuda":
                if self._check_memory():
                    self._clear_memory()

            # Handle voice input
            if isinstance(voice, str):
                voice_path = voice  # Voice path provided directly
                logger.debug(f"Using voice path directly: {voice_path}")
                # Get language code from first letter of voice name
                try:
                    name = os.path.basename(voice_path)
                    logger.debug(f"Voice basename: {name}")
                    if name.endswith('.pt'):
                        name = name[:-3]
                    lang_code = name[0]
                    logger.debug(f"Extracted language code: {lang_code}")
                except Exception as e:
                    # Default to American English if we can't get language code
                    logger.warning(f"Failed to extract language code: {e}, defaulting to 'a'")
                    lang_code = 'a'
            else:
                # Unpack voice name and tensor/path
                voice_name, voice_data = voice
                # If voice_data is a path, use it directly
                if isinstance(voice_data, str):
                    voice_path = voice_data
                    logger.debug(f"Using provided voice path: {voice_path}")
                else:
                    # Save tensor to temporary file
                    import tempfile
                    temp_dir = tempfile.gettempdir()
                    voice_path = os.path.join(temp_dir, f"{voice_name}.pt")
                    logger.debug(f"Saving voice tensor to: {voice_path}")
                    torch.save(voice_data, voice_path)
                # Get language code from voice name
                lang_code = voice_name[0]
                logger.debug(f"Using language code '{lang_code}' from voice name {voice_name}")

            # Update pipeline's language code if needed
            if self._pipeline.lang_code != lang_code:
                logger.debug(f"Creating pipeline with lang_code='{lang_code}'")
                self._pipeline = KPipeline(lang_code=lang_code, model=self._model, device=self._device)

            # Generate audio using pipeline
            logger.debug(f"Generating audio for text: '{text[:100]}...'")
            for i, result in enumerate(self._pipeline(text, voice=voice_path, speed=speed)):
                logger.debug(f"Processing chunk {i+1}")
                if result.audio is not None:
                    logger.debug(f"Got audio chunk {i+1} with shape: {result.audio.shape}")
                    yield result.audio.numpy()
                else:
                    logger.warning(f"No audio in chunk {i+1}")

        except Exception as e:
            logger.error(f"Generation failed: {e}")
            if (
                self._device == "cuda"
                and model_config.pytorch_gpu.retry_on_oom
                and "out of memory" in str(e).lower()
            ):
                self._clear_memory()
                async for chunk in self.generate(text, voice, speed):
                    yield chunk
            raise
        finally:
            if self._device == "cuda" and model_config.pytorch_gpu.sync_cuda:
                torch.cuda.synchronize()

    def _check_memory(self) -> bool:
        """Check if memory usage is above threshold."""
        if self._device == "cuda":
            memory_gb = torch.cuda.memory_allocated() / 1e9
            return memory_gb > model_config.pytorch_gpu.memory_threshold
        return False

    def _clear_memory(self) -> None:
        """Clear device memory."""
        if self._device == "cuda":
            torch.cuda.empty_cache()
            gc.collect()

    def unload(self) -> None:
        """Unload model and free resources."""
        if self._model is not None:
            del self._model
            self._model = None
        if self._pipeline is not None:
            del self._pipeline
            self._pipeline = None
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
        gc.collect()

    @property
    def is_loaded(self) -> bool:
        """Check if model is loaded."""
        return self._model is not None and self._pipeline is not None

    @property
    def device(self) -> str:
        """Get device model is running on."""
        return self._device
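For orientation, a minimal sketch of how this backend could be driven directly, outside the service layer. The sample text, voice path, output file, and the `soundfile` writer are illustrative assumptions; the `load_model`/`generate` calls match the file above, and the 24 kHz rate comes from the service layer.

```python
import asyncio
import numpy as np
import soundfile as sf  # assumed writer; any float32-capable WAV writer works

async def demo() -> None:
    backend = KokoroV1()
    await backend.load_model("v1_0/kokoro-v1_0.pth")  # resolved via paths.get_model_path()
    chunks = []
    # voice may be a path string or a (voice_name, tensor_or_path) tuple
    async for chunk in backend.generate(
        "Hello from Kokoro V1.", voice="api/src/voices/v1_0/af_heart.pt", speed=1.0
    ):
        chunks.append(chunk)
    sf.write("out.wav", np.concatenate(chunks), 24000)

asyncio.run(demo())
```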
@@ -1,9 +1,10 @@
 """Model management and caching."""
 
 import asyncio
-from typing import Dict, Optional, Tuple
+from typing import Dict, Optional, Tuple, Union, AsyncGenerator
 
 import torch
+import numpy as np
 from loguru import logger
 
 from ..core import paths
@@ -13,6 +14,7 @@ from .base import BaseModelBackend
 from .onnx_cpu import ONNXCPUBackend
 from .onnx_gpu import ONNXGPUBackend
 from .pytorch_backend import PyTorchBackend
+from .kokoro_v1 import KokoroV1
 from .session_pool import CPUSessionPool, StreamingSessionPool
 
 
@@ -56,7 +58,13 @@ class ModelManager:
         device = self._determine_device()
 
         try:
-            if device == "cuda":
+            # First check if we should use Kokoro V1
+            if model_config.pytorch_kokoro_v1_file:
+                self._backends['kokoro_v1'] = KokoroV1()
+                self._current_backend = 'kokoro_v1'
+                logger.info(f"Initialized new Kokoro V1 backend on {device}")
+            # Otherwise use legacy backends
+            elif device == "cuda":
                 if settings.use_onnx:
                     self._backends['onnx_gpu'] = ONNXGPUBackend()
                     self._current_backend = 'onnx_gpu'
@@ -93,8 +101,11 @@ class ModelManager:
             RuntimeError: If initialization fails
         """
         try:
-            # Determine backend type based on settings
-            if settings.use_onnx:
+            # First check if we should use Kokoro V1
+            if model_config.pytorch_kokoro_v1_file:
+                backend_type = 'kokoro_v1'
+            # Otherwise determine legacy backend type
+            elif settings.use_onnx:
                 backend_type = 'onnx_gpu' if settings.use_gpu and torch.cuda.is_available() else 'onnx_cpu'
             else:
                 backend_type = 'pytorch'
@@ -103,17 +114,26 @@ class ModelManager:
             backend = self.get_backend(backend_type)
 
             # Get and verify model path
-            model_file = model_config.pytorch_model_file if not settings.use_onnx else model_config.onnx_model_file
+            if backend_type == 'kokoro_v1':
+                model_file = model_config.pytorch_kokoro_v1_file
+            else:
+                model_file = model_config.pytorch_model_file if not settings.use_onnx else model_config.onnx_model_file
             model_path = await paths.get_model_path(model_file)
 
             if not await paths.verify_model_path(model_path):
                 raise RuntimeError(f"Model file not found: {model_path}")
 
             # Pre-cache default voice and use for warmup
-            warmup_voice = await voice_manager.load_voice(
+            warmup_voice_tensor = await voice_manager.load_voice(
                 settings.default_voice, device=backend.device)
             logger.info(f"Pre-cached voice {settings.default_voice} for warmup")
 
+            # For Kokoro V1, wrap voice in tuple with name
+            if isinstance(backend, KokoroV1):
+                warmup_voice = (settings.default_voice, warmup_voice_tensor)
+            else:
+                warmup_voice = warmup_voice_tensor
+
             # Initialize model with warmup voice
             await self.load_model(model_path, warmup_voice, backend_type)
 
@@ -126,7 +146,7 @@ class ModelManager:
 
             # Get device info for return
             device = "GPU" if settings.use_gpu else "CPU"
-            model = "ONNX" if settings.use_onnx else "PyTorch"
+            model = "Kokoro V1" if backend_type == 'kokoro_v1' else ("ONNX" if settings.use_onnx else "PyTorch")
 
             return device, model, voicepack_count
 
@@ -137,7 +157,7 @@ class ModelManager:
     def get_backend(self, backend_type: Optional[str] = None) -> BaseModelBackend:
         """Get specified backend.
 
         Args:
-            backend_type: Backend type ('pytorch_cpu', 'pytorch_gpu', 'onnx_cpu', 'onnx_gpu'),
+            backend_type: Backend type ('pytorch_cpu', 'pytorch_gpu', 'onnx_cpu', 'onnx_gpu', 'kokoro_v1'),
                 uses default if None
 
         Returns:
             Model backend instance
@@ -166,15 +186,18 @@ class ModelManager:
         Returns:
             Backend type to use
         """
-        # If ONNX is preferred or model is ONNX format
-        if settings.use_onnx or model_path.lower().endswith('.onnx'):
+        # Check if it's a Kokoro V1 model
+        if model_path.endswith(model_config.pytorch_kokoro_v1_file):
+            return 'kokoro_v1'
+        # Otherwise use legacy backend determination
+        elif settings.use_onnx or model_path.lower().endswith('.onnx'):
             return 'onnx_gpu' if settings.use_gpu and torch.cuda.is_available() else 'onnx_cpu'
         return 'pytorch'
 
     async def load_model(
         self,
         model_path: str,
-        warmup_voice: Optional[torch.Tensor] = None,
+        warmup_voice: Optional[Union[str, Tuple[str, torch.Tensor]]] = None,
         backend_type: Optional[str] = None
     ) -> None:
         """Load model on specified backend.
 
@@ -206,7 +229,7 @@ class ModelManager:
             self._loaded_models[backend_type] = abs_path
             logger.info(f"Fetched model instance from {backend_type} pool")
 
-        # For PyTorch backends, load normally
+        # For PyTorch and Kokoro backends, load normally
         else:
             # Check if model is already loaded
             if (backend_type in self._loaded_models and
@@ -229,27 +252,34 @@ class ModelManager:
             self._loaded_models.pop(backend_type, None)
             raise RuntimeError(f"Failed to load model: {e}")
 
-    async def _warmup_inference(self, backend: BaseModelBackend, voice: torch.Tensor) -> None:
+    async def _warmup_inference(
+        self,
+        backend: BaseModelBackend,
+        voice: Union[str, Tuple[str, torch.Tensor]]
+    ) -> None:
         """Run warmup inference to initialize model.
 
         Args:
             backend: Model backend to warm up
-            voice: Voice tensor already loaded on correct device
+            voice: Voice path or (name, tensor) tuple
         """
         try:
-            # Import here to avoid circular imports
-            from ..services.text_processing import process_text
-
-            # Use real text
+            # Use real text for warmup
             text = "Testing text to speech synthesis."
 
-            # Process through pipeline
-            tokens = process_text(text)
-            if not tokens:
-                raise ValueError("Text processing failed")
-
             # Run inference
-            backend.generate(tokens, voice, speed=1.0)
+            if isinstance(backend, KokoroV1):
+                async for _ in backend.generate(text, voice, speed=1.0):
+                    pass  # Just run through the chunks
+            else:
+                # Import here to avoid circular imports
+                from ..services.text_processing import process_text
+                tokens = process_text(text)
+                if not tokens:
+                    raise ValueError("Text processing failed")
+                # For legacy backends, extract tensor if needed
+                voice_tensor = voice[1] if isinstance(voice, tuple) else voice
+                backend.generate(tokens, voice_tensor, speed=1.0)
             logger.debug("Completed warmup inference")
 
         except Exception as e:
@@ -258,21 +288,21 @@ class ModelManager:
 
     async def generate(
         self,
-        tokens: list[int],
-        voice: torch.Tensor,
+        input_text: str,
+        voice: Union[str, Tuple[str, torch.Tensor]],
         speed: float = 1.0,
         backend_type: Optional[str] = None
-    ) -> torch.Tensor:
+    ) -> AsyncGenerator[np.ndarray, None]:
         """Generate audio using specified backend.
 
         Args:
-            tokens: Input token IDs
-            voice: Voice tensor already loaded on correct device
+            input_text: Input text to synthesize
+            voice: Voice path or (name, tensor) tuple
             speed: Speed multiplier
             backend_type: Backend to use, uses default if None
 
-        Returns:
-            Generated audio tensor
+        Yields:
+            Generated audio chunks
 
         Raises:
             RuntimeError: If generation fails
@@ -282,9 +312,20 @@ class ModelManager:
             raise RuntimeError("Model not loaded")
 
         try:
-            # Generate audio using provided voice tensor
+            # Generate audio using provided voice
             # No lock needed here since inference is thread-safe
-            return backend.generate(tokens, voice, speed)
+            if isinstance(backend, KokoroV1):
+                async for chunk in backend.generate(input_text, voice, speed):
+                    yield chunk
+            else:
+                # Import here to avoid circular imports
+                from ..services.text_processing import process_text
+                tokens = process_text(input_text)
+                if not tokens:
+                    raise ValueError("Text processing failed")
+                # For legacy backends, extract tensor if needed
+                voice_tensor = voice[1] if isinstance(voice, tuple) else voice
+                yield backend.generate(tokens, voice_tensor, speed)
         except Exception as e:
             raise RuntimeError(f"Generation failed: {e}")
 
@@ -294,7 +335,7 @@ class ModelManager:
         for pool in self._session_pools.values():
             pool.cleanup()
 
-        # Unload PyTorch backends
+        # Unload all backends
         for backend in self._backends.values():
             backend.unload()
 
@@ -303,14 +344,12 @@ class ModelManager:
 
     @property
     def available_backends(self) -> list[str]:
-        """Get list of available backends.
-        """
+        """Get list of available backends."""
         return list(self._backends.keys())
 
     @property
     def current_backend(self) -> str:
-        """Get current default backend.
-        """
+        """Get current default backend."""
        return self._current_backend
 
 
@@ -336,4 +375,3 @@ async def get_manager(config: Optional[ModelConfig] = None) -> ModelManager:
         _manager_instance = ModelManager(config)
         await _manager_instance.initialize()
     return _manager_instance
-
@@ -8,6 +8,7 @@ from loguru import logger
 
 from ..core import paths
 from ..core.config import settings
+from ..core.model_config import model_config
 from ..structures.model_schemas import VoiceConfig
 
 
@@ -33,8 +34,28 @@ class VoiceManager:
             Path to voice file if exists, None otherwise
         """
         api_dir = os.path.dirname(os.path.dirname(os.path.dirname(__file__)))
-        voice_path = os.path.join(api_dir, settings.voices_dir, f"{voice_name}.pt")
-        return voice_path if os.path.exists(voice_path) else None
+        voices_dir = os.path.join(api_dir, settings.voices_dir)
+
+        logger.debug(f"Looking for voice: {voice_name}")
+        logger.debug(f"Base voices directory: {voices_dir}")
+
+        # Check v1_0 subdirectory first if using Kokoro V1
+        if model_config.pytorch_kokoro_v1_file:
+            v1_path = os.path.join(voices_dir, 'v1_0', f"{voice_name}.pt")
+            logger.debug(f"Checking v1_0 path: {v1_path}")
+            if os.path.exists(v1_path):
+                logger.debug(f"Found voice in v1_0: {v1_path}")
+                return v1_path
+
+        # Fall back to main voices directory
+        voice_path = os.path.join(voices_dir, f"{voice_name}.pt")
+        logger.debug(f"Checking main path: {voice_path}")
+        if os.path.exists(voice_path):
+            logger.debug(f"Found voice in main dir: {voice_path}")
+            return voice_path
+
+        logger.debug(f"Voice not found: {voice_name}")
+        return None
 
     async def load_voice(self, voice_name: str, device: str = "cpu") -> torch.Tensor:
         """Load voice tensor.
@@ -74,10 +95,12 @@ class VoiceManager:
         # Check cache
         cache_key = f"{voice_path}_{device}"
         if self._config.use_cache and cache_key in self._voice_cache:
+            logger.debug(f"Using cached voice: {voice_name} from {voice_path}")
             return self._voice_cache[cache_key]
 
         # Load voice tensor
         try:
+            logger.debug(f"Loading voice tensor from: {voice_path}")
             voice = await paths.load_voice_tensor(voice_path, device=device)
         except Exception as e:
             raise RuntimeError(f"Failed to load voice {voice_name}: {e}")
@@ -86,7 +109,7 @@ class VoiceManager:
         if self._config.use_cache:
             self._manage_cache()
             self._voice_cache[cache_key] = voice
-            logger.debug(f"Cached voice: {voice_name} on {device}")
+            logger.debug(f"Cached voice: {voice_name} on {device} from {voice_path}")
 
         return voice
 
@@ -128,6 +151,11 @@ class VoiceManager:
         # Save to disk
         api_dir = os.path.dirname(os.path.dirname(os.path.dirname(__file__)))
         voices_dir = os.path.join(api_dir, settings.voices_dir)
+
+        # Save in v1_0 directory if using Kokoro V1
+        if model_config.pytorch_kokoro_v1_file:
+            voices_dir = os.path.join(voices_dir, 'v1_0')
+
         os.makedirs(voices_dir, exist_ok=True)
 
         combined_path = os.path.join(voices_dir, f"{combined_name}.pt")
@@ -157,9 +185,21 @@ class VoiceManager:
             voices_dir = os.path.join(api_dir, settings.voices_dir)
             os.makedirs(voices_dir, exist_ok=True)
 
-            for entry in os.listdir(voices_dir):
-                if entry.endswith(".pt"):
-                    voices.add(entry[:-3])
+            # Check v1_0 subdirectory if using Kokoro V1
+            if model_config.pytorch_kokoro_v1_file:
+                v1_dir = os.path.join(voices_dir, 'v1_0')
+                logger.debug(f"Checking v1_0 directory: {v1_dir}")
+                if os.path.exists(v1_dir):
+                    for entry in os.listdir(v1_dir):
+                        if entry.endswith(".pt"):
+                            voices.add(entry[:-3])
+                            logger.debug(f"Found v1_0 voice: {entry[:-3]}")
+            else:
+                # Check main voices directory
+                for entry in os.listdir(voices_dir):
+                    if entry.endswith(".pt"):
+                        voices.add(entry[:-3])
+                        logger.debug(f"Found main voice: {entry[:-3]}")
 
         except Exception as e:
             logger.error(f"Error listing voices: {e}")
@@ -177,7 +217,7 @@ class VoiceManager:
         try:
             if not os.path.exists(voice_path):
                 return False
-            voice = torch.load(voice_path, map_location="cpu")
+            voice = torch.load(voice_path, map_location="cpu", weights_only=False)
             return isinstance(voice, torch.Tensor)
         except Exception:
             return False
@@ -30,7 +30,7 @@ def setup_logger():
                 "{level: <8} | "
                 "{message}",
                 "colorize": True,
-                "level": "INFO",
+                "level": "DEBUG",
             },
         ],
     }
172	api/src/models/v1_0/config.json	Normal file

@@ -0,0 +1,172 @@
{
  "istftnet": {
    "upsample_kernel_sizes": [
      20,
      12
    ],
    "upsample_rates": [
      10,
      6
    ],
    "gen_istft_hop_size": 5,
    "gen_istft_n_fft": 20,
    "resblock_dilation_sizes": [
      [
        1,
        3,
        5
      ],
      [
        1,
        3,
        5
      ],
      [
        1,
        3,
        5
      ]
    ],
    "resblock_kernel_sizes": [
      3,
      7,
      11
    ],
    "upsample_initial_channel": 512
  },
  "dim_in": 64,
  "dropout": 0.2,
  "hidden_dim": 512,
  "max_conv_dim": 512,
  "max_dur": 50,
  "multispeaker": true,
  "n_layer": 3,
  "n_mels": 80,
  "n_token": 178,
  "style_dim": 128,
  "text_encoder_kernel_size": 5,
  "plbert": {
    "hidden_size": 768,
    "num_attention_heads": 12,
    "intermediate_size": 2048,
    "max_position_embeddings": 512,
    "num_hidden_layers": 12,
    "dropout": 0.1
  },
  "vocab": {
    ";": 1,
    ":": 2,
    ",": 3,
    ".": 4,
    "!": 5,
    "?": 6,
    "—": 9,
    "…": 10,
    "\"": 11,
    "(": 12,
    ")": 13,
    "“": 14,
    "”": 15,
    " ": 16,
    "̃": 17,
    "ʣ": 18,
    "ʥ": 19,
    "ʦ": 20,
    "ʨ": 21,
    "ᵝ": 22,
    "ꭧ": 23,
    "A": 24,
    "I": 25,
    "O": 31,
    "Q": 33,
    "S": 35,
    "T": 36,
    "W": 39,
    "Y": 41,
    "ᵊ": 42,
    "a": 43,
    "b": 44,
    "c": 45,
    "d": 46,
    "e": 47,
    "f": 48,
    "h": 50,
    "i": 51,
    "j": 52,
    "k": 53,
    "l": 54,
    "m": 55,
    "n": 56,
    "o": 57,
    "p": 58,
    "q": 59,
    "r": 60,
    "s": 61,
    "t": 62,
    "u": 63,
    "v": 64,
    "w": 65,
    "x": 66,
    "y": 67,
    "z": 68,
    "ɑ": 69,
    "ɐ": 70,
    "ɒ": 71,
    "æ": 72,
    "β": 75,
    "ɔ": 76,
    "ɕ": 77,
    "ç": 78,
    "ɖ": 80,
    "ð": 81,
    "ʤ": 82,
    "ə": 83,
    "ɚ": 85,
    "ɛ": 86,
    "ɜ": 87,
    "ɟ": 90,
    "ɡ": 92,
    "ɥ": 99,
    "ɨ": 101,
    "ɪ": 102,
    "ʝ": 103,
    "ɯ": 110,
    "ɰ": 111,
    "ŋ": 112,
    "ɳ": 113,
    "ɲ": 114,
    "ɴ": 115,
    "ø": 116,
    "ɸ": 118,
    "θ": 119,
    "œ": 120,
    "ɹ": 123,
    "ɾ": 125,
    "ɻ": 126,
    "ʁ": 128,
    "ɽ": 129,
    "ʂ": 130,
    "ʃ": 131,
    "ʈ": 132,
    "ʧ": 133,
    "ʊ": 135,
    "ʋ": 136,
    "ʌ": 138,
    "ɣ": 139,
    "ɤ": 140,
    "χ": 142,
    "ʎ": 143,
    "ʒ": 147,
    "ʔ": 148,
    "ˈ": 156,
    "ˌ": 157,
    "ː": 158,
    "ʰ": 162,
    "ʲ": 164,
    "↓": 169,
    "→": 171,
    "↗": 172,
    "↘": 173,
    "ᵻ": 177
  }
}
@@ -1,9 +1,11 @@
 """TTS service using model and voice managers."""
 
+import os
 import time
+import tempfile
 from typing import List, Tuple, Optional, AsyncGenerator, Union
-import asyncio
 
+import asyncio
 import numpy as np
 import torch
 from loguru import logger
@@ -14,6 +16,8 @@ from ..inference.voice_manager import get_manager as get_voice_manager
 from .audio import AudioNormalizer, AudioService
 from .text_processing.text_processor import process_text_chunk, smart_split
 from .text_processing import tokenize
+from ..inference.kokoro_v1 import KokoroV1
 
 
 class TTSService:
     """Text-to-speech service."""
@@ -37,14 +41,16 @@ class TTSService:
 
     async def _process_chunk(
         self,
+        chunk_text: str,
         tokens: List[int],
-        voice_tensor: torch.Tensor,
+        voice_name: str,
+        voice_path: str,
         speed: float,
         output_format: Optional[str] = None,
         is_first: bool = False,
         is_last: bool = False,
         normalizer: Optional[AudioNormalizer] = None,
-    ) -> Optional[Union[np.ndarray, bytes]]:
+    ) -> AsyncGenerator[Union[np.ndarray, bytes], None]:
         """Process tokens into audio."""
         async with self._chunk_semaphore:
             try:
@@ -52,9 +58,10 @@ class TTSService:
                 if is_last:
                     # Skip format conversion for raw audio mode
                     if not output_format:
-                        return np.array([], dtype=np.float32)
+                        yield np.array([], dtype=np.float32)
+                        return
 
-                    return await AudioService.convert_audio(
+                    result = await AudioService.convert_audio(
                         np.array([0], dtype=np.float32),  # Dummy data for type checking
                         24000,
                         output_format,
@@ -62,45 +69,126 @@ class TTSService:
                         normalizer=normalizer,
                         is_last_chunk=True
                     )
+                    yield result
+                    return
 
                 # Skip empty chunks
-                if not tokens:
-                    return None
+                if not tokens and not chunk_text:
+                    return
 
+                # Get backend
+                backend = self.model_manager.get_backend()
+
                 # Generate audio using pre-warmed model
-                chunk_audio = await self.model_manager.generate(
-                    tokens,
-                    voice_tensor,
-                    speed=speed
-                )
+                if isinstance(backend, KokoroV1):
+                    # For Kokoro V1, pass text and voice info
+                    async for chunk_audio in self.model_manager.generate(
+                        chunk_text,
+                        (voice_name, voice_path),
+                        speed=speed
+                    ):
+                        # For streaming, convert to bytes
+                        if output_format:
+                            try:
+                                converted = await AudioService.convert_audio(
+                                    chunk_audio,
+                                    24000,
+                                    output_format,
+                                    is_first_chunk=is_first,
+                                    normalizer=normalizer,
+                                    is_last_chunk=is_last
+                                )
+                                yield converted
+                            except Exception as e:
+                                logger.error(f"Failed to convert audio: {str(e)}")
+                        else:
+                            yield chunk_audio
+                else:
+                    # For legacy backends, load voice tensor
+                    voice_tensor = await self._voice_manager.load_voice(voice_name, device=backend.device)
+                    chunk_audio = await self.model_manager.generate(
+                        tokens,
+                        voice_tensor,
+                        speed=speed
+                    )
 
                 if chunk_audio is None:
                     logger.error("Model generated None for audio chunk")
-                    return None
+                    return
 
                 if len(chunk_audio) == 0:
                     logger.error("Model generated empty audio chunk")
-                    return None
+                    return
 
                 # For streaming, convert to bytes
                 if output_format:
                     try:
-                        return await AudioService.convert_audio(
+                        converted = await AudioService.convert_audio(
                             chunk_audio,
                             24000,
                             output_format,
                             is_first_chunk=is_first,
                             normalizer=normalizer,
                             is_last_chunk=is_last
                         )
+                        yield converted
                     except Exception as e:
                         logger.error(f"Failed to convert audio: {str(e)}")
-                        return None
-                return chunk_audio
+                else:
+                    yield chunk_audio
             except Exception as e:
                 logger.error(f"Failed to process tokens: {str(e)}")
-                return None
+
+    async def _get_voice_path(self, voice: str) -> Tuple[str, str]:
+        """Get voice path, handling combined voices.
+
+        Args:
+            voice: Voice name or combined voice names (e.g., 'af_jadzia+af_jessica')
+
+        Returns:
+            Tuple of (voice name to use, voice path to use)
+
+        Raises:
+            RuntimeError: If voice not found
+        """
+        try:
+            # Check if it's a combined voice
+            if "+" in voice:
+                voices = [v.strip() for v in voice.split("+") if v.strip()]
+                if len(voices) < 2:
+                    raise RuntimeError(f"Invalid combined voice name: {voice}")
+
+                # Load and combine voices
+                voice_tensors = []
+                for v in voices:
+                    path = self._voice_manager.get_voice_path(v)
+                    if not path:
+                        raise RuntimeError(f"Voice not found: {v}")
+                    logger.debug(f"Loading voice tensor from: {path}")
+                    voice_tensor = torch.load(path, map_location="cpu")
+                    voice_tensors.append(voice_tensor)
+
+                # Average the voice tensors
+                logger.debug(f"Combining {len(voice_tensors)} voice tensors")
+                combined = torch.mean(torch.stack(voice_tensors), dim=0)
+
+                # Save combined tensor
+                temp_dir = tempfile.gettempdir()
+                combined_path = os.path.join(temp_dir, f"{voice}.pt")
+                logger.debug(f"Saving combined voice to: {combined_path}")
+                torch.save(combined, combined_path)
+
+                return voice, combined_path
+            else:
+                # Single voice
+                path = self._voice_manager.get_voice_path(voice)
+                if not path:
+                    raise RuntimeError(f"Voice not found: {voice}")
+                logger.debug(f"Using single voice path: {path}")
+                return voice, path
+        except Exception as e:
+            logger.error(f"Failed to get voice path: {e}")
+            raise
 
     async def generate_audio_stream(
         self,
@@ -111,33 +199,36 @@ class TTSService:
     ) -> AsyncGenerator[bytes, None]:
         """Generate and stream audio chunks."""
         stream_normalizer = AudioNormalizer()
-        voice_tensor = None
         chunk_index = 0
 
         try:
-            # Get backend and load voice (should be fast if cached)
+            # Get backend
             backend = self.model_manager.get_backend()
-            voice_tensor = await self._voice_manager.load_voice(voice, device=backend.device)
+
+            # Get voice path, handling combined voices
+            voice_name, voice_path = await self._get_voice_path(voice)
+            logger.debug(f"Using voice path: {voice_path}")
 
             # Process text in chunks with smart splitting
             async for chunk_text, tokens in smart_split(text):
                 try:
                     # Process audio for chunk
-                    result = await self._process_chunk(
-                        tokens,  # Now always a flat List[int]
-                        voice_tensor,
+                    async for result in self._process_chunk(
+                        chunk_text,  # Pass text for Kokoro V1
+                        tokens,  # Pass tokens for legacy backends
+                        voice_name,  # Pass voice name
+                        voice_path,  # Pass voice path
                         speed,
                         output_format,
                         is_first=(chunk_index == 0),
                         is_last=False,  # We'll update the last chunk later
                         normalizer=stream_normalizer
-                    )
-                    if result is not None:
-                        yield result
-                        chunk_index += 1
-                    else:
-                        logger.warning(f"No audio generated for chunk: '{chunk_text[:100]}...'")
+                    ):
+                        if result is not None:
+                            yield result
+                            chunk_index += 1
+                        else:
+                            logger.warning(f"No audio generated for chunk: '{chunk_text[:100]}...'")
 
                 except Exception as e:
                     logger.error(f"Failed to process audio for chunk: '{chunk_text[:100]}...'. Error: {str(e)}")
@@ -147,81 +238,25 @@ class TTSService:
             if chunk_index > 0:
                 try:
                     # Empty tokens list to finalize audio
-                    final_result = await self._process_chunk(
-                        [],  # Empty tokens list
-                        voice_tensor,
+                    async for result in self._process_chunk(
+                        "",  # Empty text
+                        [],  # Empty tokens
+                        voice_name,
+                        voice_path,
                         speed,
                         output_format,
                         is_first=False,
-                        is_last=True,
+                        is_last=True,  # Signal this is the last chunk
                         normalizer=stream_normalizer
-                    )
-                    if final_result is not None:
-                        logger.debug("Yielding final chunk to finalize audio")
-                        yield final_result
-                    else:
-                        logger.warning("Final chunk processing returned None")
+                    ):
+                        if result is not None:
+                            yield result
                 except Exception as e:
-                    logger.error(f"Failed to process final chunk: {str(e)}")
-            else:
-                logger.warning("No audio chunks were successfully processed")
-
-        except Exception as e:
-            logger.error(f"Error in audio generation stream: {str(e)}")
-            raise
-        finally:
-            if voice_tensor is not None:
-                del voice_tensor
-                torch.cuda.empty_cache()
-
-    async def generate_from_phonemes(
-        self, phonemes: str, voice: str, speed: float = 1.0
-    ) -> Tuple[np.ndarray, float]:
-        """Generate audio from phonemes.
-
-        Args:
-            phonemes: Phoneme string to synthesize
-            voice: Voice ID to use
-            speed: Speed multiplier
-
-        Returns:
-            Tuple of (audio array, processing time)
-        """
-        start_time = time.time()
-        voice_tensor = None
-
-        try:
-            # Get backend and load voice
-            backend = self.model_manager.get_backend()
-            voice_tensor = await self._voice_manager.load_voice(voice, device=backend.device)
-
-            # Convert phonemes to tokens
-            tokens = tokenize(phonemes)
-            if len(tokens) > 500:  # Model context limit
-                raise ValueError(f"Phoneme sequence too long ({len(tokens)} tokens, max 500)")
-
-            tokens = [0] + tokens + [0]  # Add start/end tokens
-
-            # Generate audio
-            audio = await self.model_manager.generate(
-                tokens,
-                voice_tensor,
-                speed=speed
-            )
-
-            if audio is None:
-                raise ValueError("Failed to generate audio")
-
-            processing_time = time.time() - start_time
-            return audio, processing_time
-
+                    logger.error(f"Failed to finalize audio stream: {str(e)}")
         except Exception as e:
             logger.error(f"Error in phoneme audio generation: {str(e)}")
             raise
-        finally:
-            if voice_tensor is not None:
-                del voice_tensor
-                torch.cuda.empty_cache()
 
     async def generate_audio(
         self, text: str, voice: str, speed: float = 1.0
BIN	api/src/voices/v1_0/af_bella.pt	Normal file	(Binary file not shown)
BIN	api/src/voices/v1_0/af_heart.pt	Normal file	(Binary file not shown)
BIN	api/src/voices/v1_0/af_jadzia.pt	Normal file	(Binary file not shown)
BIN	api/src/voices/v1_0/af_nicole.pt	Normal file	(Binary file not shown)
BIN	api/src/voices/v1_0/af_sarah.pt	Normal file	(Binary file not shown)
BIN	api/src/voices/v1_0/af_sky.pt	Normal file	(Binary file not shown)
BIN	api/src/voices/v1_0/am_adam.pt	Normal file	(Binary file not shown)
BIN	api/src/voices/v1_0/am_michael.pt	Normal file	(Binary file not shown)
BIN	api/src/voices/v1_0/am_santa.pt	Normal file	(Binary file not shown)
BIN	api/src/voices/v1_0/bf_emma.pt	Normal file	(Binary file not shown)
BIN	api/src/voices/v1_0/bf_isabella.pt	Normal file	(Binary file not shown)
BIN	api/src/voices/v1_0/bm_george.pt	Normal file	(Binary file not shown)
BIN	api/src/voices/v1_0/bm_lewis.pt	Normal file	(Binary file not shown)
BIN	api/src/voices/v1_0/ef_dora.pt	Normal file	(Binary file not shown)
BIN	api/src/voices/v1_0/em_alex.pt	Normal file	(Binary file not shown)
BIN	api/src/voices/v1_0/em_santa.pt	Normal file	(Binary file not shown)
BIN	api/src/voices/v1_0/ff_siwis.pt	Normal file	(Binary file not shown)
BIN	api/src/voices/v1_0/hf_alpha.pt	Normal file	(Binary file not shown)
BIN	api/src/voices/v1_0/hf_beta.pt	Normal file	(Binary file not shown)
BIN	api/src/voices/v1_0/hm_omega.pt	Normal file	(Binary file not shown)
BIN	api/src/voices/v1_0/hm_psi.pt	Normal file	(Binary file not shown)
BIN	api/src/voices/v1_0/if_sara.pt	Normal file	(Binary file not shown)
BIN	api/src/voices/v1_0/im_nicola.pt	Normal file	(Binary file not shown)
BIN	api/src/voices/v1_0/jf_alpha.pt	Normal file	(Binary file not shown)
BIN	api/src/voices/v1_0/jf_gongitsune.pt	Normal file	(Binary file not shown)
BIN	api/src/voices/v1_0/jf_nezumi.pt	Normal file	(Binary file not shown)
BIN	api/src/voices/v1_0/jf_tebukuro.pt	Normal file	(Binary file not shown)
BIN	api/src/voices/v1_0/jm_kumo.pt	Normal file	(Binary file not shown)
BIN	api/src/voices/v1_0/pf_dora.pt	Normal file	(Binary file not shown)
BIN	api/src/voices/v1_0/pm_alex.pt	Normal file	(Binary file not shown)
BIN	api/src/voices/v1_0/pm_santa.pt	Normal file	(Binary file not shown)
BIN	api/src/voices/v1_0/zf_xiaobei.pt	Normal file	(Binary file not shown)
BIN	api/src/voices/v1_0/zf_xiaoni.pt	Normal file	(Binary file not shown)
BIN	api/src/voices/v1_0/zf_xiaoxiao.pt	Normal file	(Binary file not shown)
BIN	api/src/voices/v1_0/zf_xiaoyi.pt	Normal file	(Binary file not shown)
BIN	api/src/voices/v1_0/zm_yunjian.pt	Normal file	(Binary file not shown)
BIN	api/src/voices/v1_0/zm_yunxi.pt	Normal file	(Binary file not shown)
BIN	api/src/voices/v1_0/zm_yunxia.pt	Normal file	(Binary file not shown)
BIN	api/src/voices/v1_0/zm_yunyang.pt	Normal file	(Binary file not shown)
113	docs/architecture/kokoro_v1_integration.md	Normal file

@@ -0,0 +1,113 @@
# Kokoro V1 Integration Architecture

## Overview

This document outlines the architectural approach for integrating the new Kokoro V1 library into our existing inference system. The goal is to bypass most of the legacy model machinery while maintaining compatibility with our existing interfaces, particularly the OpenAI-compatible streaming endpoint.

## Current System

The current system uses a `ModelBackend` interface with multiple implementations (ONNX CPU/GPU, PyTorch CPU/GPU). This interface requires (a rough sketch follows the list):

- Async model loading
- Audio generation from tokens and voice tensors
- Resource cleanup
- Device management
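For orientation, a sketch of what that backend contract looks like. The actual `BaseModelBackend` in the repository is not shown in this commit, so the exact method set below is an assumption inferred from how the manager and backends use it.

```python
from abc import ABC, abstractmethod

import numpy as np
import torch


class ModelBackend(ABC):
    """Assumed shape of the backend contract (illustrative, not the real base class)."""

    @abstractmethod
    async def load_model(self, path: str) -> None:
        """Asynchronously load model weights from a resolved path."""

    @abstractmethod
    def generate(self, tokens: list[int], voice: torch.Tensor, speed: float = 1.0) -> np.ndarray:
        """Produce audio samples from token IDs and a voice embedding."""

    @abstractmethod
    def unload(self) -> None:
        """Release model resources (GPU memory, caches)."""

    @property
    @abstractmethod
    def device(self) -> str:
        """Device the backend runs on ('cpu' or 'cuda')."""
```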
|
||||||
|
|
||||||
|
## Integration Approach
|
||||||
|
|
||||||
|
### 1. KokoroV1 Backend Implementation
|
||||||
|
|
||||||
|
We'll create a `KokoroV1` class implementing the `ModelBackend` interface that wraps the new Kokoro library:
|
||||||
|
|
||||||
|
```python
|
||||||
|
class KokoroV1(BaseModelBackend):
|
||||||
|
def __init__(self):
|
||||||
|
super().__init__()
|
||||||
|
self._model = None
|
||||||
|
self._pipeline = None
|
||||||
|
self._device = "cuda" if settings.use_gpu and torch.cuda.is_available() else "cpu"
|
||||||
|
```
|
||||||
|
|
||||||
|
### 2. Model Loading
|
||||||
|
|
||||||
|
The load_model method will initialize both KModel and KPipeline:
|
||||||
|
|
||||||
|
```python
|
||||||
|
async def load_model(self, path: str) -> None:
|
||||||
|
model_path = await paths.get_model_path(path)
|
||||||
|
self._model = KModel(model_path).to(self._device).eval()
|
||||||
|
self._pipeline = KPipeline(model=self._model, device=self._device)
|
||||||
|
```
|
||||||
|
|
||||||
|
### 3. Audio Generation

The `generate` method will adapt our token/voice tensor format to work with `KPipeline`; the placeholder below marks the adapter, and a more concrete sketch follows it:

```python
def generate(self, tokens: list[int], voice: torch.Tensor, speed: float = 1.0) -> np.ndarray:
    # Convert tokens to text using pipeline's tokenizer
    # Use voice tensor as voice embedding
    # Return generated audio
```
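As a rough illustration, the adapter could look like the following. This is a minimal sketch, not the backend's actual implementation: the `_reverse_vocab` mapping, the length-indexed slicing of the packed voice tensor, and the `KModel` call signature are all assumptions made for illustration.

```python
import numpy as np
import torch

def generate(self, tokens: list[int], voice: torch.Tensor, speed: float = 1.0) -> np.ndarray:
    """Hypothetical adapter from legacy tokens/voice tensors to Kokoro V1 output."""
    if self._model is None:
        raise RuntimeError("Model not loaded; call load_model() first")
    # Map legacy token ids back to a phoneme string (assumes a reverse vocab
    # built from the model config when the backend was loaded).
    phonemes = "".join(self._reverse_vocab[t] for t in tokens)
    # Packed voice tensors are assumed to index the reference style by utterance
    # length; fall back to the raw tensor if it is already a single style vector.
    ref_s = voice[len(tokens) - 1] if voice.dim() > 1 else voice
    with torch.no_grad():
        # Assumed call signature: the pipeline hands the model a phoneme string,
        # a reference style tensor, and a speed factor.
        audio = self._model(phonemes, ref_s.to(self._device), speed)
    return audio.cpu().numpy()
```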
### 4. Streaming Support

The Kokoro V1 backend must maintain compatibility with our OpenAI-compatible streaming endpoint. Key requirements:

1. **Chunked Generation**: The pipeline's output should be compatible with our streaming infrastructure:

```python
async def generate_stream(self, text: str, voice_path: str) -> AsyncGenerator[bytes, None]:
    results = self._pipeline(text, voice=voice_path)
    for result in results:
        yield result.audio.numpy()
```

2. **Format Conversion**: Support for various output formats (one possible conversion path is sketched after this list):
   - MP3
   - Opus
   - AAC
   - FLAC
   - WAV
   - PCM
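One way to cover most of these formats is to wrap each raw PCM chunk with pydub (already a project dependency) and let ffmpeg handle the encoding. This is only a sketch: the 24 kHz sample rate and mono channel count are assumptions about Kokoro's output, and the surrounding service would choose the target format per request.

```python
import io

import numpy as np
from pydub import AudioSegment

def encode_chunk(audio: np.ndarray, output_format: str = "mp3", sample_rate: int = 24000) -> bytes:
    """Encode a float32 audio chunk in [-1, 1] to the requested container/codec (sketch)."""
    # Scale float samples to 16-bit PCM, which pydub expects as raw bytes.
    pcm16 = (np.clip(audio, -1.0, 1.0) * 32767).astype(np.int16)
    if output_format == "pcm":
        return pcm16.tobytes()
    segment = AudioSegment(
        data=pcm16.tobytes(),
        sample_width=2,        # 16-bit samples
        frame_rate=sample_rate,
        channels=1,            # assumed mono output
    )
    buffer = io.BytesIO()
    # pydub delegates encoding to ffmpeg; "mp3", "flac", and "wav" map directly,
    # while Opus and AAC may need explicit format/codec arguments depending on the build.
    segment.export(buffer, format=output_format)
    return buffer.getvalue()
```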
3. **Voice Management**:
   - Support for voice combination (mean of multiple voice embeddings; see the sketch below)
   - Dynamic voice loading and caching
   - Voice listing and validation
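Voice combination can stay as simple as averaging the loaded voice tensors before handing the result to the backend. A minimal sketch, assuming each `.pt` file holds a single voice tensor and all tensors share the same shape:

```python
import torch

def combine_voices(voice_paths: list[str], device: str = "cpu") -> torch.Tensor:
    """Average several voice embeddings into one combined voice (sketch)."""
    if not voice_paths:
        raise ValueError("At least one voice is required")
    # weights_only=True keeps torch.load from executing arbitrary pickled code
    # (requires a recent PyTorch).
    voices = [torch.load(path, map_location=device, weights_only=True) for path in voice_paths]
    shapes = {tuple(v.shape) for v in voices}
    if len(shapes) != 1:
        raise ValueError(f"Voice tensors have mismatched shapes: {shapes}")
    return torch.stack(voices).mean(dim=0)
```

The combined tensor can then be cached under a derived name so repeated requests skip the re-averaging.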
4. **Error Handling**:
   - Proper error propagation for client disconnects
   - Format conversion errors
   - Resource cleanup on failures
### 5. Configuration Integration

We'll use the existing configuration system:

```python
config = model_config.pytorch_kokoro_v1_file  # Model file path
```
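For context, a minimal sketch of what the corresponding settings entry could look like, assuming the existing configuration objects are pydantic models; the field default shown here is illustrative only, not the shipped filename:

```python
from pydantic import BaseModel

class ModelConfig(BaseModel):
    """Subset of model settings relevant to the V1 backend (sketch)."""
    # Hypothetical default; the real value is whatever the packaged v1_0 build ships.
    pytorch_kokoro_v1_file: str = "v1_0/kokoro-v1_0.pth"

model_config = ModelConfig()
config = model_config.pytorch_kokoro_v1_file  # later resolved via paths.get_model_path()
```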
## Benefits

1. **Simplified Pipeline**: Direct use of the Kokoro library's built-in pipeline
2. **Better Language Support**: Access to Kokoro's wider language capabilities
3. **Automatic Chunking**: Built-in text chunking and processing
4. **Phoneme Generation**: Access to phoneme output for better analysis
5. **Streaming Compatibility**: Maintains existing streaming functionality

## Migration Strategy

1. Implement the KokoroV1 backend with streaming support
2. Add it to the model manager's available backends
3. Make it the default for new requests
4. Keep legacy backends available for backward compatibility
5. Update voice management to handle both legacy and new voice formats

## Next Steps

1. Switch to Code mode to implement the KokoroV1 backend
2. Ensure streaming compatibility with the OpenAI endpoint
3. Add tests to verify both streaming and non-streaming functionality
4. Update documentation for the new capabilities
5. Add monitoring for streaming performance
66 docs/architecture/nlp_dependencies.md (new file)

@@ -0,0 +1,66 @@

# NLP Dependencies Management

## Overview

This document outlines our approach to managing NLP dependencies, particularly the spaCy models required by our own dependencies (such as misaki). The goal is to ensure reliable model availability while preventing runtime download attempts that could cause failures.

## Challenge

One of our dependencies, misaki, attempts to download the spaCy model `en_core_web_sm` at runtime. This can lead to failures if:

- The download fails due to network issues
- The environment lacks the necessary permissions
- The system is running in a restricted environment

## Solution
### Model Management with UV

We use uv as our package manager. For spaCy model management, we have two approaches:

1. **Development Environment Setup**

   ```bash
   uv run --with spacy -- spacy download en_core_web_sm
   ```

   This command:
   - Temporarily installs spaCy if it is not already present
   - Downloads the required model
   - Places it in the appropriate location

2. **Project Environment**
   - Add spaCy as a project dependency in pyproject.toml
   - Run `uv run -- spacy download en_core_web_sm` in the project directory
   - This installs the model into the project's virtual environment
### Docker Environment

For containerized deployments:

1. Add the model download step in the Dockerfile
2. Ensure the model is available before application startup
3. Configure misaki to use the pre-downloaded model

## Benefits

1. **Reliability**: Prevents runtime download attempts
2. **Reproducibility**: Model version is consistent across environments
3. **Performance**: No startup delay from download attempts
4. **Security**: Better control over external downloads
## Implementation Notes

1. Development environments should use the `uv run --with spacy` approach for flexibility
2. CI/CD pipelines should include the model download in their setup phase
3. Docker builds should pre-download models during image creation
4. Application code should verify model availability at startup (see the sketch below)
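A startup check along these lines is enough to fail fast with a clear message instead of letting misaki trigger a download mid-request. This is a sketch only; where it is wired in (lifespan hook, health check) is left open.

```python
import spacy
from loguru import logger

REQUIRED_SPACY_MODEL = "en_core_web_sm"

def verify_spacy_model(model_name: str = REQUIRED_SPACY_MODEL) -> None:
    """Fail fast at startup if the pre-installed spaCy model is missing."""
    try:
        spacy.load(model_name)
    except OSError as exc:
        # spaCy raises OSError when a model package is not installed.
        raise RuntimeError(
            f"spaCy model '{model_name}' is not installed; "
            "run `uv run -- spacy download en_core_web_sm` or install the pinned wheel."
        ) from exc
    logger.info(f"spaCy model '{model_name}' is available")
```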
## Future Considerations

1. Consider caching models in a shared location for multiple services
2. Implement version pinning for NLP models
3. Add health checks to verify model availability
4. Monitor model usage and performance

## Related Documentation

- [Kokoro V1 Integration](kokoro_v1_integration.md)
- UV Package Manager Documentation
- spaCy Model Management Guide
@@ -30,13 +30,17 @@ dependencies = [
     "loguru==0.7.3",
     "transformers==4.47.1",
     "openai>=1.59.6",
-    "ebooklib>=0.18",
-    "html2text>=2024.2.26",
+    # "ebooklib>=0.18",
+    # "html2text>=2024.2.26",
     "pydub>=0.25.1",
     "matplotlib>=3.10.0",
     "semchunk>=3.0.1",
     "mutagen>=1.47.0",
     "psutil>=6.1.1",
+    "kokoro==0.3.5",
+    'misaki[en,ja,ko,zh,vi]==0.6.7',
+    "spacy>=3.7.6",
+    "en-core-web-sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl"
 ]

 [project.optional-dependencies]