WIP: CPU/GPU paths functional; a few straggling tests still need to be fixed and checked.

2025-08-05 16:48:53 +00:00 · 2025-01-03 03:16:42 -07:00 · 2025-01-03 03:16:42 -07:00 · 9496a3a63f
commit 9496a3a63f
parent e4d8e74738
11 changed files with 366 additions and 233 deletions
--- a/api/src/main.py
+++ b/api/src/main.py
@ -21,8 +21,8 @@ async def lifespan(app: FastAPI):
    logger.info("Loading TTS model and voice packs...")
    # Initialize the main model with warm-up
-    voicepack_count = TTSModel.initialize()
+    voicepack_count = TTSModel.setup()
-    logger.info(f"Model loaded and warmed up on {TTSModel._device}")
+    logger.info(f"Model loaded and warmed up on {TTSModel.get_device()}")
    logger.info(f"{voicepack_count} voice packs loaded successfully")
    yield
--- a/api/src/services/init.py
+++ b/api/src/services/init.py
@ -1,4 +1,3 @@
 from .tts_model import TTSModel
 from .tts_service import TTSService
-__all__ = ["TTSService", "TTSModel"]
+__all__ = ["TTSService"]
--- a/api/src/services/tts_base.py
+++ b/api/src/services/tts_base.py
@ -0,0 +1,110 @@
 import os
 import threading
 from abc import ABC, abstractmethod
 import torch
 import numpy as np
 from loguru import logger
 from kokoro import tokenize, phonemize
 from typing import Union, List
 from ..core.config import settings
 class TTSBaseModel(ABC):
    """Abstract base class shared by the device-specific TTS model backends.
    Subclasses supply ``initialize`` (load the model) and ``generate``
    (produce audio). ``setup`` drives the common start-up sequence: pick a
    device, initialize the model, copy the base voices into ``VOICES_DIR``,
    run a warm-up generation and report how many voices are available.
    """
    # Not used by the base class itself; presumably holds the loaded model in
    # subclasses -- confirm against the concrete backends.
    _instance = None
    # Held for the whole of setup() so concurrent callers do not interleave.
    _lock = threading.Lock()
    # "cuda" or "cpu" once setup() has run; None before that.
    _device = None
    VOICES_DIR = os.path.join(os.path.dirname(os.path.dirname(__file__)), "voices")
    @classmethod
    def setup(cls):
        """Initialize the model, prepare the voices directory and warm up.
        Returns:
            int: Number of ``.pt`` voice files found in ``VOICES_DIR``.
        Raises:
            RuntimeError: If ``initialize`` returns a falsy value.
        """
        with cls._lock:
            cls._device = cls._detect_device()
            logger.info(f"Initializing model on {cls._device}")
            # Initialize model
            if not cls.initialize(settings.model_dir, settings.model_path):
                raise RuntimeError(f"Failed to initialize {cls._device.upper()} model")
            # Setup voices directory
            os.makedirs(cls.VOICES_DIR, exist_ok=True)
            cls._copy_base_voices(os.path.join(settings.model_dir, settings.voices_dir))
            cls._warm_up()
            # Count voices in directory
            return sum(1 for name in os.listdir(cls.VOICES_DIR) if name.endswith(".pt"))
    @staticmethod
    def _detect_device():
        """Return "cuda" if CUDA is reported and a test allocation works, else "cpu"."""
        cuda_available = torch.cuda.is_available()
        logger.info(f"CUDA available: {cuda_available}")
        if not cuda_available:
            return "cpu"
        try:
            # Allocate a tiny tensor on the GPU; the result itself is not needed,
            # only the fact that the call did not raise.
            torch.zeros(1).cuda()
            logger.info("CUDA test successful")
            return "cuda"
        except Exception as e:
            # Best-effort probe: any failure falls back to CPU.
            logger.error(f"CUDA test failed: {e}")
            return "cpu"
    @classmethod
    def _copy_base_voices(cls, base_voices_dir):
        """Copy every ``.pt`` voice missing from ``VOICES_DIR`` out of *base_voices_dir*."""
        if not os.path.exists(base_voices_dir):
            return
        for filename in os.listdir(base_voices_dir):
            if not filename.endswith(".pt"):
                continue
            voice_name = filename[:-3]
            voice_path = os.path.join(cls.VOICES_DIR, filename)
            if os.path.exists(voice_path):
                # Never overwrite a voice that is already present locally.
                continue
            try:
                logger.info(f"Copying base voice {voice_name} to voices directory")
                base_path = os.path.join(base_voices_dir, filename)
                # Load and re-save rather than copying the file, so the stored
                # tensor has been mapped to the selected device.
                voicepack = torch.load(base_path, map_location=cls._device, weights_only=True)
                torch.save(voicepack, voice_path)
            except Exception as e:
                # One bad voice file must not abort start-up.
                logger.error(f"Error copying voice {voice_name}: {e}")
    @classmethod
    def _warm_up(cls):
        """Run one dummy generation with the default voice; failures are only logged."""
        try:
            dummy_text = "Hello"
            voice_path = os.path.join(cls.VOICES_DIR, "af.pt")
            dummy_voicepack = torch.load(voice_path, map_location=cls._device, weights_only=True)
            if cls._device == "cuda":
                # GPU path: generate(text, voicepack, lang, speed)
                cls.generate(dummy_text, dummy_voicepack, "a", 1.0)
            else:
                # CPU path: generate(tokens, voicepack, speed), tokens padded with 0
                ps = phonemize(dummy_text, "a")
                tokens = tokenize(ps)
                tokens = [0] + tokens + [0]
                cls.generate(tokens, dummy_voicepack, 1.0)
            logger.info("Model warm-up complete")
        except Exception as e:
            logger.warning(f"Model warm-up failed: {e}")
    @classmethod
    @abstractmethod
    def initialize(cls, model_dir: str, model_path: Union[str, None] = None):
        """Initialize the model.
        Args:
            model_dir: Directory passed by ``setup`` from ``settings.model_dir``
            model_path: Optional value passed from ``settings.model_path``
        Returns:
            A truthy value on success; ``setup`` treats a falsy result as failure.
        """
        pass
    @classmethod
    @abstractmethod
    def generate(cls, input_data: Union[str, List[int]], voicepack: torch.Tensor, *args) -> np.ndarray:
        """Generate audio from input
        Args:
            input_data: Either text string (GPU) or tokenized input (CPU)
            voicepack: Voice tensor
            *args: Additional args (lang+speed for GPU, speed for CPU)
        Returns:
            np.ndarray: Generated audio samples
        """
        pass
    @classmethod
    def get_device(cls):
        """Get the current device.
        Returns:
            str: The device selected by ``setup`` ("cuda" or "cpu").
        Raises:
            RuntimeError: If ``setup`` has not set a device yet.
        """
        if cls._device is None:
            raise RuntimeError("Model not initialized. Call setup() first.")
        return cls._device
--- a/api/src/services/tts_cpu.py
+++ b/api/src/services/tts_cpu.py
@ -4,17 +4,35 @@ import torch
 from onnxruntime import InferenceSession, SessionOptions, GraphOptimizationLevel, ExecutionMode
 from loguru import logger
-class TTSCPUModel:
+from .tts_base import TTSBaseModel
 class TTSCPUModel(TTSBaseModel):
    _instance = None
    _onnx_session = None
    @classmethod
-    def initialize(cls, model_dir: str):
+    def initialize(cls, model_dir: str, model_path: str = None):
        """Initialize ONNX model for CPU inference"""
        if cls._onnx_session is None:
            # Try loading ONNX model
-            onnx_path = os.path.join(model_dir, "kokoro-v0_19.onnx")
+            # First try the specified path if provided
-            if not os.path.exists(onnx_path):
+            if model_path and model_path.endswith('.onnx'):
                onnx_path = os.path.join(model_dir, model_path)
                if os.path.exists(onnx_path):
                    logger.info(f"Loading specified ONNX model from {onnx_path}")
                else:
                    onnx_path = None
            else:
                # Look for any .onnx file in the directory as fallback
                onnx_files = [f for f in os.listdir(model_dir) if f.endswith('.onnx')]
                if onnx_files:
                    onnx_path = os.path.join(model_dir, onnx_files[0])
                    logger.info(f"Found ONNX model: {onnx_path}")
                else:
                    logger.error(f"No ONNX model found in {model_dir}")
                    return None
            if not onnx_path:
                return None
            logger.info(f"Loading ONNX model from {onnx_path}")
@ -44,22 +62,33 @@ class TTSCPUModel:
        return cls._onnx_session
    @classmethod
-    def generate(cls, tokens: list, voicepack: torch.Tensor, speed: float) -> np.ndarray:
+    def generate(cls, input_data: list[int], voicepack: torch.Tensor, *args) -> np.ndarray:
-        """Generate audio using ONNX model"""
+        """Generate audio using ONNX model
        Args:
            input_data: list of token IDs
            voicepack: Voice tensor
            *args: (speed,) tuple
        Returns:
            np.ndarray: Generated audio samples
        """
        if cls._onnx_session is None:
            raise RuntimeError("ONNX model not initialized")
        speed = args[0]
        # Pre-allocate and prepare inputs
-        tokens_input = np.array([tokens], dtype=np.int64)
+        tokens_input = np.array([input_data], dtype=np.int64)
-        style_input = voicepack[len(tokens)-2].numpy()  # Already has correct dimensions
+        style_input = voicepack[len(input_data)-2].numpy()  # Already has correct dimensions
        speed_input = np.full(1, speed, dtype=np.float32)  # More efficient than ones * speed
        # Run inference with optimized inputs
-        return cls._onnx_session.run(
+        result = cls._onnx_session.run(
            None,
            {
                'tokens': tokens_input,
                'style': style_input,
                'speed': speed_input
            }
-        )[0]
+        )
        return result[0]
--- a/api/src/services/tts_gpu.py
+++ b/api/src/services/tts_gpu.py
@ -1,10 +1,13 @@
 import os
 import numpy as np
 import torch
 from loguru import logger
 from models import build_model
 from kokoro import generate
-class TTSGPUModel:
+from .tts_base import TTSBaseModel
 class TTSGPUModel(TTSBaseModel):
    _instance = None
    _device = "cuda"
@ -24,9 +27,26 @@ class TTSGPUModel:
        return cls._instance
    @classmethod
-    def generate(cls, text: str, voicepack: torch.Tensor, lang: str, speed: float) -> tuple[torch.Tensor, dict]:
+    def generate(cls, input_data: str, voicepack: torch.Tensor, *args) -> np.ndarray:
-        """Generate audio using PyTorch model on GPU"""
+        """Generate audio using PyTorch model on GPU
        Args:
            input_data: Text string to generate audio from
            voicepack: Voice tensor
            *args: (lang, speed) tuple
        Returns:
            np.ndarray: Generated audio samples
        """
        if cls._instance is None:
            raise RuntimeError("GPU model not initialized")
-        return generate(cls._instance, text, voicepack, lang=lang, speed=speed)
+        lang, speed = args
        result = generate(cls._instance, input_data, voicepack, lang=lang, speed=speed)
        # kokoro.generate returns (audio, metadata, info), we only want audio
        audio = result[0]
        # Convert to numpy array if needed
        if isinstance(audio, torch.Tensor):
            audio = audio.cpu().numpy()
        return audio
--- a/api/src/services/tts_model.py
+++ b/api/src/services/tts_model.py
@ -1,94 +1,8 @@
 import os
 import threading
 import torch
 from loguru import logger
 from kokoro import tokenize, phonemize
-from ..core.config import settings
+if torch.cuda.is_available():
-from .tts_cpu import TTSCPUModel
+    from .tts_gpu import TTSGPUModel as TTSModel
 from .tts_gpu import TTSGPUModel
 class TTSModel:
    _device = None
    _lock = threading.Lock()
    VOICES_DIR = os.path.join(os.path.dirname(os.path.dirname(__file__)), "voices")
    @classmethod
    def initialize(cls):
        """Initialize and warm up the model"""
        with cls._lock:
            # Set device and initialize model
            cls._device = "cuda" if torch.cuda.is_available() else "cpu"
            logger.info(f"Initializing model on {cls._device}")
            # Initialize appropriate model based on device
            if cls._device == "cuda":
                if not TTSGPUModel.initialize(settings.model_dir, settings.model_path):
                    raise RuntimeError("Failed to initialize GPU model")
 else:
-                # Try CPU ONNX first, fallback to CPU PyTorch if needed
+    from .tts_cpu import TTSCPUModel as TTSModel
                if not TTSCPUModel.initialize(settings.model_dir):
                    logger.warning("ONNX initialization failed, falling back to PyTorch CPU")
                    if not TTSGPUModel.initialize(settings.model_dir, settings.model_path):
                        raise RuntimeError("Failed to initialize CPU model")
-            # Setup voices directory
+__all__ = ["TTSModel"]
            os.makedirs(cls.VOICES_DIR, exist_ok=True)
            # Copy base voices to local directory
            base_voices_dir = os.path.join(settings.model_dir, settings.voices_dir)
            if os.path.exists(base_voices_dir):
                for file in os.listdir(base_voices_dir):
                    if file.endswith(".pt"):
                        voice_name = file[:-3]
                        voice_path = os.path.join(cls.VOICES_DIR, file)
                        if not os.path.exists(voice_path):
                            try:
                                logger.info(
                                    f"Copying base voice {voice_name} to voices directory"
                                )
                                base_path = os.path.join(base_voices_dir, file)
                                voicepack = torch.load(
                                    base_path,
                                    map_location=cls._device,
                                    weights_only=True,
                                )
                                torch.save(voicepack, voice_path)
                            except Exception as e:
                                logger.error(
                                    f"Error copying voice {voice_name}: {str(e)}"
                                )
            # Warm up with default voice
            try:
                dummy_text = "Hello"
                voice_path = os.path.join(cls.VOICES_DIR, "af.pt")
                dummy_voicepack = torch.load(
                    voice_path, map_location=cls._device, weights_only=True
                )
                if cls._device == "cuda":
                    TTSGPUModel.generate(dummy_text, dummy_voicepack, "a", 1.0)
                else:
                    ps = phonemize(dummy_text, "a")
                    tokens = tokenize(ps)
                    tokens = [0] + tokens + [0]
                    TTSCPUModel.generate(tokens, dummy_voicepack, 1.0)
                logger.info("Model warm-up complete")
            except Exception as e:
                logger.warning(f"Model warm-up failed: {e}")
            # Count voices in directory
            voice_count = len(
                [f for f in os.listdir(cls.VOICES_DIR) if f.endswith(".pt")]
            )
            return voice_count
    @classmethod
    def get_device(cls):
        """Get the current device or raise an error"""
        if cls._device is None:
            raise RuntimeError("Model not initialized. Call initialize() first.")
        return cls._device
--- a/api/src/services/tts_service.py
+++ b/api/src/services/tts_service.py
@ -12,8 +12,6 @@ from loguru import logger
 from ..core.config import settings
 from .tts_model import TTSModel
 from .tts_cpu import TTSCPUModel
 from .tts_gpu import TTSGPUModel
 class TTSService:
@ -22,6 +20,8 @@ class TTSService:
    def _split_text(self, text: str) -> List[str]:
        """Split text into sentences"""
        if not isinstance(text, str):
            text = str(text) if text is not None else ""
        return [s.strip() for s in re.split(r"(?<=[.!?])\s+", text) if s.strip()]
    def _get_voice_path(self, voice_name: str) -> Optional[str]:
@ -37,9 +37,12 @@ class TTSService:
        try:
            # Normalize text once at the start
            text = normalize_text(text)
            if not text:
                raise ValueError("Text is empty after preprocessing")
            normalized = normalize_text(text)
            if not normalized:
                raise ValueError("Text is empty after preprocessing")
            text = str(normalized)
            # Check voice exists
            voice_path = self._get_voice_path(voice)
@ -61,12 +64,18 @@ class TTSService:
                    try:
                        # Process chunk
                        if TTSModel.get_device() == "cuda":
-                            chunk_audio, _ = TTSGPUModel.generate(chunk, voicepack, voice[0], speed)
+                            # GPU takes (text, voicepack, lang, speed)
                            try:
                                chunk_audio = TTSModel.generate(chunk, voicepack, voice[0], speed)
                            except RuntimeError as e:
                                logger.error(f"Failed to generate audio: {str(e)}")
                                chunk_audio = None
                        else:
                            # CPU takes (tokens, voicepack, speed)
                            ps = phonemize(chunk, voice[0])
                            tokens = tokenize(ps)
-                            tokens = [0] + tokens + [0]  # Add padding
+                            tokens = [0] + list(tokens) + [0]  # Add padding
-                            chunk_audio = TTSCPUModel.generate(tokens, voicepack, speed)
+                            chunk_audio = TTSModel.generate(tokens, voicepack, speed)
                        if chunk_audio is not None:
                            audio_chunks.append(chunk_audio)
@ -90,12 +99,18 @@ class TTSService:
            else:
                # Process single chunk
                if TTSModel.get_device() == "cuda":
-                    audio, _ = TTSGPUModel.generate(text, voicepack, voice[0], speed)
+                    # GPU takes (text, voicepack, lang, speed)
                    try:
                        audio = TTSModel.generate(text, voicepack, voice[0], speed)
                    except RuntimeError as e:
                        logger.error(f"Failed to generate audio: {str(e)}")
                        raise ValueError("No audio chunks were generated successfully")
                else:
                    # CPU takes (tokens, voicepack, speed)
                    ps = phonemize(text, voice[0])
                    tokens = tokenize(ps)
-                    tokens = [0] + tokens + [0]  # Add padding
+                    tokens = [0] + list(tokens) + [0]  # Add padding
-                    audio = TTSCPUModel.generate(tokens, voicepack, speed)
+                    audio = TTSModel.generate(tokens, voicepack, speed)
            processing_time = time.time() - start_time
            return audio, processing_time
--- a/api/tests/conftest.py
+++ b/api/tests/conftest.py
@ -36,7 +36,7 @@ sys.modules["kokoro.tokenize"] = Mock()
@pytest.fixture(autouse=True)
 def mock_tts_model():
    """Mock TTSModel to avoid loading real models during tests"""
-    with patch("api.src.services.tts.TTSModel") as mock:
+    with patch("api.src.services.tts_model.TTSModel") as mock:
        model_instance = Mock()
        model_instance.get_instance.return_value = model_instance
        model_instance.get_voicepack.return_value = None
--- a/api/tests/test_main.py
+++ b/api/tests/test_main.py
@ -26,13 +26,11 @@ def test_health_check(test_client):
@patch("api.src.main.logger")
 async def test_lifespan_successful_warmup(mock_logger, mock_tts_model):
    """Test successful model warmup in lifespan"""
    # Mock the model initialization with model info and voicepack count
    mock_model = MagicMock()
    # Mock file system for voice counting
    mock_tts_model.VOICES_DIR = "/mock/voices"
    with patch("os.listdir", return_value=["voice1.pt", "voice2.pt", "voice3.pt"]):
-        mock_tts_model.initialize.return_value = (mock_model, 3)  # 3 voice files
+        mock_tts_model.setup.return_value = 3  # 3 voice files
-        mock_tts_model._device = "cuda"  # Set device class variable
+        mock_tts_model.get_device.return_value = "cuda"
    # Create an async generator from the lifespan context manager
    async_gen = lifespan(MagicMock())
@ -44,8 +42,8 @@ async def test_lifespan_successful_warmup(mock_logger, mock_tts_model):
    mock_logger.info.assert_any_call("Model loaded and warmed up on cuda")
    mock_logger.info.assert_any_call("3 voice packs loaded successfully")
-    # Verify model initialization was called
+    # Verify model setup was called
-    mock_tts_model.initialize.assert_called_once()
+    mock_tts_model.setup.assert_called_once()
    # Clean up
    await async_gen.__aexit__(None, None, None)
@ -56,14 +54,14 @@ async def test_lifespan_successful_warmup(mock_logger, mock_tts_model):
@patch("api.src.main.logger")
 async def test_lifespan_failed_warmup(mock_logger, mock_tts_model):
    """Test failed model warmup in lifespan"""
-    # Mock the model initialization to fail
+    # Mock the model setup to fail
-    mock_tts_model.initialize.side_effect = Exception("Failed to initialize model")
+    mock_tts_model.setup.side_effect = RuntimeError("Failed to initialize model")
    # Create an async generator from the lifespan context manager
    async_gen = lifespan(MagicMock())
    # Verify the exception is raised
-    with pytest.raises(Exception, match="Failed to initialize model"):
+    with pytest.raises(RuntimeError, match="Failed to initialize model"):
        await async_gen.__aenter__()
    # Verify the expected logging sequence
@ -77,20 +75,18 @@ async def test_lifespan_failed_warmup(mock_logger, mock_tts_model):
@patch("api.src.main.TTSModel")
 async def test_lifespan_cuda_warmup(mock_tts_model):
    """Test model warmup specifically on CUDA"""
    # Mock the model initialization with CUDA and voicepacks
    mock_model = MagicMock()
    # Mock file system for voice counting
    mock_tts_model.VOICES_DIR = "/mock/voices"
    with patch("os.listdir", return_value=["voice1.pt", "voice2.pt"]):
-        mock_tts_model.initialize.return_value = (mock_model, 2)  # 2 voice files
+        mock_tts_model.setup.return_value = 2  # 2 voice files
-        mock_tts_model._device = "cuda"  # Set device class variable
+        mock_tts_model.get_device.return_value = "cuda"
    # Create an async generator from the lifespan context manager
    async_gen = lifespan(MagicMock())
    await async_gen.__aenter__()
-    # Verify model was initialized
+    # Verify model setup was called
-    mock_tts_model.initialize.assert_called_once()
+    mock_tts_model.setup.assert_called_once()
    # Clean up
    await async_gen.__aexit__(None, None, None)
@ -100,22 +96,20 @@ async def test_lifespan_cuda_warmup(mock_tts_model):
@patch("api.src.main.TTSModel")
 async def test_lifespan_cpu_fallback(mock_tts_model):
    """Test model warmup falling back to CPU"""
    # Mock the model initialization with CPU and voicepacks
    mock_model = MagicMock()
    # Mock file system for voice counting
    mock_tts_model.VOICES_DIR = "/mock/voices"
    with patch(
        "os.listdir", return_value=["voice1.pt", "voice2.pt", "voice3.pt", "voice4.pt"]
    ):
-        mock_tts_model.initialize.return_value = (mock_model, 4)  # 4 voice files
+        mock_tts_model.setup.return_value = 4  # 4 voice files
-        mock_tts_model._device = "cpu"  # Set device class variable
+        mock_tts_model.get_device.return_value = "cpu"
    # Create an async generator from the lifespan context manager
    async_gen = lifespan(MagicMock())
    await async_gen.__aenter__()
-    # Verify model was initialized
+    # Verify model setup was called
-    mock_tts_model.initialize.assert_called_once()
+    mock_tts_model.setup.assert_called_once()
    # Clean up
    await async_gen.__aexit__(None, None, None)
--- a/api/tests/test_tts_service.py
+++ b/api/tests/test_tts_service.py
@ -7,6 +7,7 @@ import numpy as np
 import torch
 import pytest
 from api.src.core.config import settings
 from api.src.services.tts_model import TTSModel
 from api.src.services.tts_service import TTSService
@ -14,7 +15,7 @@ from api.src.services.tts_service import TTSService
@pytest.fixture
 def tts_service():
    """Create a TTSService instance for testing"""
-    return TTSService(start_worker=False)
+    return TTSService()
@pytest.fixture
@ -86,6 +87,7 @@ def test_generate_audio_empty_text(
 ):
    """Test generating audio with empty text"""
    mock_normalize.return_value = ""
    mock_instance.return_value = (MagicMock(), "cpu")
    with pytest.raises(ValueError, match="Text is empty after preprocessing"):
        tts_service._generate_audio("", "af", 1.0)
@ -111,7 +113,7 @@ def test_generate_audio_no_chunks(
    """Test generating audio with no successful chunks"""
    mock_normalize.return_value = "Test text"
    mock_phonemize.return_value = "Test text"
-    mock_tokenize.return_value = ["test", "text"]
+    mock_tokenize.return_value = [1, 2]  # Return integers instead of strings
    mock_generate.return_value = (None, None)
    mock_instance.return_value = (MagicMock(), "cpu")
    mock_exists.return_value = True
@ -156,57 +158,23 @@ def test_combine_voices_invalid_input(tts_service):
        tts_service.combine_voices(["voice1"])
-@patch("os.makedirs")
+
@patch("api.src.services.tts_model.TTSModel.get_instance")
@patch("api.src.services.tts_model.TTSModel.get_device")
@patch("api.src.services.tts_model.TTSModel.generate")
@patch("os.path.exists")
-@patch("os.listdir")
+@patch("kokoro.normalize_text")
-@patch("torch.load")
+@patch("kokoro.phonemize")
-@patch("torch.save")
+@patch("kokoro.tokenize")
@patch("os.path.join")
 def test_ensure_voices(
    mock_join,
    mock_save,
    mock_load,
    mock_listdir,
    mock_exists,
    mock_makedirs,
    tts_service,
 ):
    """Test voice directory initialization"""
    # Setup mocks
    mock_exists.side_effect = [
        True,
        False,
        False,
    ]  # base_dir exists, voice files don't exist
    mock_listdir.return_value = ["voice1.pt", "voice2.pt"]
    mock_load.return_value = MagicMock()
    mock_join.return_value = "/fake/path"
    # Test voice directory initialization
    tts_service._ensure_voices()
    # Verify directory was created
    mock_makedirs.assert_called_once()
    # Verify voices were loaded and saved
    assert mock_load.call_count == len(mock_listdir.return_value)
    assert mock_save.call_count == len(mock_listdir.return_value)
@patch("api.src.services.tts.TTSModel.get_instance")
@patch("os.path.exists")
@patch("api.src.services.tts.normalize_text")
@patch("api.src.services.tts.phonemize")
@patch("api.src.services.tts.tokenize")
@patch("api.src.services.tts.generate")
@patch("torch.load")
 def test_generate_audio_success(
    mock_torch_load,
    mock_generate,
    mock_tokenize,
    mock_phonemize,
    mock_normalize,
    mock_exists,
    mock_model_generate,
    mock_get_device,
    mock_instance,
    tts_service,
    sample_audio,
@ -214,12 +182,17 @@ def test_generate_audio_success(
    """Test successful audio generation"""
    mock_normalize.return_value = "Test text"
    mock_phonemize.return_value = "Test text"
-    mock_tokenize.return_value = ["test", "text"]
+    mock_tokenize.return_value = [1, 2]  # Return integers instead of strings
-    mock_generate.return_value = (sample_audio, None)
+    mock_model_generate.return_value = sample_audio
    mock_instance.return_value = (MagicMock(), "cpu")
    mock_get_device.return_value = "cpu"
    mock_exists.return_value = True
    mock_torch_load.return_value = MagicMock()
    # Initialize model
    TTSModel._instance = None
    TTSModel._device = "cpu"
    audio, processing_time = tts_service._generate_audio("Test text", "af", 1.0)
    assert isinstance(audio, np.ndarray)
    assert isinstance(processing_time, float)
@ -227,35 +200,94 @@ def test_generate_audio_success(
@patch("torch.cuda.is_available")
-@patch("models.build_model")
+@patch("api.src.services.tts_gpu.TTSGPUModel.initialize")
-def test_model_initialization_cuda(mock_build_model, mock_cuda_available):
+@patch("os.makedirs")
@patch("os.path.exists")
@patch("os.listdir")
@patch("torch.load")
@patch("torch.save")
@patch("api.src.core.config.settings")
@patch("torch.zeros")
 def test_model_initialization_cuda(
    mock_zeros,
    mock_settings,
    mock_save,
    mock_load,
    mock_listdir,
    mock_exists,
    mock_makedirs,
    mock_initialize,
    mock_cuda_available,
 ):
    """Test model initialization with CUDA"""
    # Setup mocks
    mock_cuda_available.return_value = True
-    mock_model = MagicMock()
+    mock_initialize.return_value = True
-    mock_build_model.return_value = mock_model
+    mock_exists.return_value = True
    mock_listdir.return_value = ["voice1.pt", "voice2.pt"]
    mock_load.return_value = torch.zeros(1)
    mock_settings.model_dir = "test_dir"
    mock_settings.model_path = "test_path"
    mock_settings.voices_dir = "voices"
    mock_zeros.return_value = torch.zeros(1)
-    TTSModel._instance = None  # Reset singleton
+    # Reset singleton and device
-    model, voice_count = TTSModel.initialize()
+    TTSModel._instance = None
    TTSModel._device = None
-    assert TTSModel._device == "cuda"  # Check the class variable instead
+    # Mock settings to prevent actual file operations
-    assert model == mock_model
+    with patch.object(settings, 'model_dir', 'test_dir'), \
-    mock_build_model.assert_called_once()
+         patch.object(settings, 'model_path', 'test_path'):
        voice_count = TTSModel.setup()
        assert TTSModel.get_device() == "cuda"
        assert voice_count == 2
        mock_initialize.assert_called_once_with("test_dir", "test_path")
-@patch("api.src.services.tts.torch.cuda.is_available")
+@patch("torch.cuda.is_available")
-@patch("api.src.services.tts.build_model")
+@patch("api.src.services.tts_base.TTSBaseModel.initialize")
-def test_model_initialization_cpu(mock_build_model, mock_cuda_available):
+@patch("os.makedirs")
@patch("os.path.exists")
@patch("os.listdir")
@patch("torch.load")
@patch("torch.save")
@patch("api.src.core.config.settings")
@patch("torch.zeros")
 def test_model_initialization_cpu(
    mock_zeros,
    mock_settings,
    mock_save,
    mock_load,
    mock_listdir,
    mock_exists,
    mock_makedirs,
    mock_initialize,
    mock_cuda_available,
 ):
    """Test model initialization with CPU"""
    # Setup mocks
    mock_cuda_available.return_value = False
-    mock_model = MagicMock()
+    mock_initialize.return_value = False  # This will trigger the RuntimeError
-    mock_build_model.return_value = mock_model
+    mock_exists.return_value = True
    mock_listdir.return_value = ["voice1.pt", "voice2.pt", "voice3.pt"]
    mock_load.return_value = torch.zeros(1)
    mock_settings.model_dir = "test_dir"
    mock_settings.model_path = "test_path"
    mock_settings.voices_dir = "voices"
    mock_zeros.return_value = torch.zeros(1)
-    TTSModel._instance = None  # Reset singleton
+    # Reset singleton and device
-    model, voice_count = TTSModel.initialize()
+    TTSModel._instance = None
    TTSModel._device = None
-    assert TTSModel._device == "cpu"  # Check the class variable instead
+    # Mock settings to prevent actual file operations
-    assert model == mock_model
+    with patch.object(settings, 'model_dir', 'test_dir'), \
-    mock_build_model.assert_called_once()
+         patch.object(settings, 'model_path', 'test_path'), \
         pytest.raises(RuntimeError, match="Failed to initialize CPU model"):
        TTSModel.setup()
    mock_initialize.assert_called_once_with("test_dir", "test_path")
@patch("api.src.services.tts_service.TTSService._get_voice_path")
@ -267,7 +299,7 @@ def test_voicepack_loading_error(mock_get_instance, mock_get_voice_path):
    TTSModel._voicepacks = {}  # Reset voicepacks
-    service = TTSService(start_worker=False)
+    service = TTSService()
    with pytest.raises(ValueError, match="Voice not found: nonexistent_voice"):
        service._generate_audio("test", "nonexistent_voice", 1.0)
@ -286,23 +318,32 @@ def test_save_audio(mock_tts_model, tts_service, sample_audio, tmp_path):
@patch("api.src.services.tts_model.TTSModel.get_instance")
@patch("api.src.services.tts_model.TTSModel.get_device")
@patch("api.src.services.tts_model.TTSModel.generate")
@patch("os.path.exists")
-@patch("api.src.services.tts.normalize_text")
+@patch("kokoro.normalize_text")
-@patch("api.src.services.tts.generate")
+@patch("kokoro.phonemize")
@patch("kokoro.tokenize")
@patch("torch.load")
 def test_generate_audio_without_stitching(
    mock_torch_load,
-    mock_generate,
+    mock_tokenize,
    mock_phonemize,
    mock_normalize,
    mock_exists,
    mock_model_generate,
    mock_get_device,
    mock_instance,
    tts_service,
    sample_audio,
 ):
    """Test generating audio without text stitching"""
    mock_normalize.return_value = "Test text"
-    mock_generate.return_value = (sample_audio, None)
+    mock_phonemize.return_value = "Test text"
    mock_tokenize.return_value = [1, 2]  # Return integers instead of strings
    mock_model_generate.return_value = sample_audio
    mock_instance.return_value = (MagicMock(), "cpu")
    mock_get_device.return_value = "cpu"
    mock_exists.return_value = True
    mock_torch_load.return_value = MagicMock()
@ -311,7 +352,7 @@ def test_generate_audio_without_stitching(
    )
    assert isinstance(audio, np.ndarray)
    assert len(audio) > 0
-    mock_generate.assert_called_once()
+    mock_model_generate.assert_called_once()
@patch("os.listdir")
@ -323,12 +364,13 @@ def test_list_voices_error(mock_listdir, tts_service):
    assert voices == []
-@patch("api.src.services.tts.TTSModel.get_instance")
+@patch("api.src.services.tts_model.TTSModel.get_instance")
@patch("api.src.services.tts_model.TTSModel.get_device")
@patch("os.path.exists")
-@patch("api.src.services.tts.normalize_text")
+@patch("kokoro.normalize_text")
-@patch("api.src.services.tts.phonemize")
+@patch("kokoro.phonemize")
-@patch("api.src.services.tts.tokenize")
+@patch("kokoro.tokenize")
-@patch("api.src.services.tts.generate")
+@patch("kokoro.generate")
@patch("torch.load")
 def test_generate_audio_phonemize_error(
    mock_torch_load,
@ -337,6 +379,7 @@ def test_generate_audio_phonemize_error(
    mock_phonemize,
    mock_normalize,
    mock_exists,
    mock_get_device,
    mock_instance,
    tts_service,
 ):
@ -344,33 +387,51 @@ def test_generate_audio_phonemize_error(
    mock_normalize.return_value = "Test text"
    mock_phonemize.side_effect = Exception("Phonemization failed")
    mock_instance.return_value = (MagicMock(), "cpu")
    mock_get_device.return_value = "cpu"
    mock_exists.return_value = True
    mock_torch_load.return_value = MagicMock()
    mock_generate.return_value = (None, None)
    # Reset the cached model instance and force the CPU device
    TTSModel._instance = None
    TTSModel._device = "cpu"
    with pytest.raises(ValueError, match="No audio chunks were generated successfully"):
        tts_service._generate_audio("Test text", "af", 1.0)
-@patch("api.src.services.tts.TTSModel.get_instance")
+@patch("api.src.services.tts_model.TTSModel.get_instance")
@patch("api.src.services.tts_model.TTSModel.get_device")
@patch("os.path.exists")
-@patch("api.src.services.tts.normalize_text")
+@patch("kokoro.normalize_text")
-@patch("api.src.services.tts.generate")
+@patch("kokoro.phonemize")
@patch("kokoro.tokenize")
@patch("kokoro.generate")
@patch("torch.load")
 def test_generate_audio_error(
    mock_torch_load,
    mock_generate,
    mock_tokenize,
    mock_phonemize,
    mock_normalize,
    mock_exists,
    mock_get_device,
    mock_instance,
    tts_service,
 ):
    """Test handling generation error"""
    mock_normalize.return_value = "Test text"
    mock_phonemize.return_value = "Test text"
    mock_tokenize.return_value = [1, 2]  # Return integers instead of strings
    mock_generate.side_effect = Exception("Generation failed")
    mock_instance.return_value = (MagicMock(), "cpu")
    mock_get_device.return_value = "cpu"
    mock_exists.return_value = True
    mock_torch_load.return_value = MagicMock()
    # Reset the cached model instance and force the CPU device
    TTSModel._instance = None
    TTSModel._device = "cpu"
    with pytest.raises(ValueError, match="No audio chunks were generated successfully"):
        tts_service._generate_audio("Test text", "af", 1.0)
--- a/examples/benchmarks/benchmark_tts_rtf.py
+++ b/examples/benchmarks/benchmark_tts_rtf.py
@ -69,22 +69,13 @@ def get_gpu_memory():
 def get_system_metrics():
    """Get current system metrics"""
-    # Take multiple CPU measurements over a short period
+    # Get per-CPU percentages and calculate average
-    samples = []
+    cpu_percentages = psutil.cpu_percent(percpu=True)
-    for _ in range(3):  # Take 3 samples
+    avg_cpu = sum(cpu_percentages) / len(cpu_percentages)
        # Get both overall and per-CPU percentages
        overall_cpu = psutil.cpu_percent(interval=0.1)
        per_cpu = psutil.cpu_percent(percpu=True)
        avg_per_cpu = sum(per_cpu) / len(per_cpu)
        # Use the maximum of overall and average per-CPU
        samples.append(max(overall_cpu, avg_per_cpu))
    # Use the maximum CPU usage from all samples
    cpu_usage = round(max(samples), 2)
    metrics = {
        "timestamp": datetime.now().isoformat(),
-        "cpu_percent": cpu_usage,
+        "cpu_percent": round(avg_cpu, 2),
        "ram_percent": psutil.virtual_memory().percent,
        "ram_used_gb": psutil.virtual_memory().used / (1024**3),
    }