From 9496a3a63f4f7d6ee90010f05f9774f3c95bbb42 Mon Sep 17 00:00:00 2001 From: remsky Date: Fri, 3 Jan 2025 03:16:42 -0700 Subject: [PATCH] WIP: CPU/GPU Functional, few straggling tests to fix and check. --- api/src/main.py | 4 +- api/src/services/__init__.py | 3 +- api/src/services/tts_base.py | 110 +++++++++++ api/src/services/tts_cpu.py | 49 ++++- api/src/services/tts_gpu.py | 28 ++- api/src/services/tts_model.py | 96 +--------- api/src/services/tts_service.py | 33 +++- api/tests/conftest.py | 2 +- api/tests/test_main.py | 36 ++-- api/tests/test_tts_service.py | 221 +++++++++++++++-------- examples/benchmarks/benchmark_tts_rtf.py | 17 +- 11 files changed, 366 insertions(+), 233 deletions(-) create mode 100644 api/src/services/tts_base.py diff --git a/api/src/main.py b/api/src/main.py index 6f5c6ac..1521a14 100644 --- a/api/src/main.py +++ b/api/src/main.py @@ -21,8 +21,8 @@ async def lifespan(app: FastAPI): logger.info("Loading TTS model and voice packs...") # Initialize the main model with warm-up - voicepack_count = TTSModel.initialize() - logger.info(f"Model loaded and warmed up on {TTSModel._device}") + voicepack_count = TTSModel.setup() + logger.info(f"Model loaded and warmed up on {TTSModel.get_device()}") logger.info(f"{voicepack_count} voice packs loaded successfully") yield diff --git a/api/src/services/__init__.py b/api/src/services/__init__.py index 0760951..82cf76e 100644 --- a/api/src/services/__init__.py +++ b/api/src/services/__init__.py @@ -1,4 +1,3 @@ -from .tts_model import TTSModel from .tts_service import TTSService -__all__ = ["TTSService", "TTSModel"] +__all__ = ["TTSService"] diff --git a/api/src/services/tts_base.py b/api/src/services/tts_base.py new file mode 100644 index 0000000..1ef913d --- /dev/null +++ b/api/src/services/tts_base.py @@ -0,0 +1,110 @@ +import os +import threading +from abc import ABC, abstractmethod +import torch +import numpy as np +from loguru import logger +from kokoro import tokenize, phonemize +from typing import Union, List + +from ..core.config import settings + + +class TTSBaseModel(ABC): + _instance = None + _lock = threading.Lock() + _device = None + VOICES_DIR = os.path.join(os.path.dirname(os.path.dirname(__file__)), "voices") + + @classmethod + def setup(cls): + """Initialize model and setup voices""" + with cls._lock: + # Set device + cuda_available = torch.cuda.is_available() + logger.info(f"CUDA available: {cuda_available}") + if cuda_available: + try: + # Test CUDA device + test_tensor = torch.zeros(1).cuda() + logger.info("CUDA test successful") + cls._device = "cuda" + except Exception as e: + logger.error(f"CUDA test failed: {e}") + cls._device = "cpu" + else: + cls._device = "cpu" + logger.info(f"Initializing model on {cls._device}") + + # Initialize model + if not cls.initialize(settings.model_dir, settings.model_path): + raise RuntimeError(f"Failed to initialize {cls._device.upper()} model") + + # Setup voices directory + os.makedirs(cls.VOICES_DIR, exist_ok=True) + + # Copy base voices to local directory + base_voices_dir = os.path.join(settings.model_dir, settings.voices_dir) + if os.path.exists(base_voices_dir): + for file in os.listdir(base_voices_dir): + if file.endswith(".pt"): + voice_name = file[:-3] + voice_path = os.path.join(cls.VOICES_DIR, file) + if not os.path.exists(voice_path): + try: + logger.info(f"Copying base voice {voice_name} to voices directory") + base_path = os.path.join(base_voices_dir, file) + voicepack = torch.load(base_path, map_location=cls._device, weights_only=True) + torch.save(voicepack, 
voice_path) + except Exception as e: + logger.error(f"Error copying voice {voice_name}: {str(e)}") + + # Warm up with default voice + try: + dummy_text = "Hello" + voice_path = os.path.join(cls.VOICES_DIR, "af.pt") + dummy_voicepack = torch.load(voice_path, map_location=cls._device, weights_only=True) + + if cls._device == "cuda": + cls.generate(dummy_text, dummy_voicepack, "a", 1.0) + else: + ps = phonemize(dummy_text, "a") + tokens = tokenize(ps) + tokens = [0] + tokens + [0] + cls.generate(tokens, dummy_voicepack, 1.0) + + logger.info("Model warm-up complete") + except Exception as e: + logger.warning(f"Model warm-up failed: {e}") + + # Count voices in directory + voice_count = len([f for f in os.listdir(cls.VOICES_DIR) if f.endswith(".pt")]) + return voice_count + + @classmethod + @abstractmethod + def initialize(cls, model_dir: str, model_path: str = None): + """Initialize the model""" + pass + + @classmethod + @abstractmethod + def generate(cls, input_data: Union[str, List[int]], voicepack: torch.Tensor, *args) -> np.ndarray: + """Generate audio from input + + Args: + input_data: Either text string (GPU) or tokenized input (CPU) + voicepack: Voice tensor + *args: Additional args (lang+speed for GPU, speed for CPU) + + Returns: + np.ndarray: Generated audio samples + """ + pass + + @classmethod + def get_device(cls): + """Get the current device""" + if cls._device is None: + raise RuntimeError("Model not initialized. Call setup() first.") + return cls._device diff --git a/api/src/services/tts_cpu.py b/api/src/services/tts_cpu.py index f93199f..74c1bca 100644 --- a/api/src/services/tts_cpu.py +++ b/api/src/services/tts_cpu.py @@ -4,17 +4,35 @@ import torch from onnxruntime import InferenceSession, SessionOptions, GraphOptimizationLevel, ExecutionMode from loguru import logger -class TTSCPUModel: +from .tts_base import TTSBaseModel + +class TTSCPUModel(TTSBaseModel): _instance = None _onnx_session = None @classmethod - def initialize(cls, model_dir: str): + def initialize(cls, model_dir: str, model_path: str = None): """Initialize ONNX model for CPU inference""" if cls._onnx_session is None: # Try loading ONNX model - onnx_path = os.path.join(model_dir, "kokoro-v0_19.onnx") - if not os.path.exists(onnx_path): + # First try the specified path if provided + if model_path and model_path.endswith('.onnx'): + onnx_path = os.path.join(model_dir, model_path) + if os.path.exists(onnx_path): + logger.info(f"Loading specified ONNX model from {onnx_path}") + else: + onnx_path = None + else: + # Look for any .onnx file in the directory as fallback + onnx_files = [f for f in os.listdir(model_dir) if f.endswith('.onnx')] + if onnx_files: + onnx_path = os.path.join(model_dir, onnx_files[0]) + logger.info(f"Found ONNX model: {onnx_path}") + else: + logger.error(f"No ONNX model found in {model_dir}") + return None + + if not onnx_path: return None logger.info(f"Loading ONNX model from {onnx_path}") @@ -44,22 +62,33 @@ class TTSCPUModel: return cls._onnx_session @classmethod - def generate(cls, tokens: list, voicepack: torch.Tensor, speed: float) -> np.ndarray: - """Generate audio using ONNX model""" + def generate(cls, input_data: list[int], voicepack: torch.Tensor, *args) -> np.ndarray: + """Generate audio using ONNX model + + Args: + input_data: list of token IDs + voicepack: Voice tensor + *args: (speed,) tuple + + Returns: + np.ndarray: Generated audio samples + """ if cls._onnx_session is None: raise RuntimeError("ONNX model not initialized") + speed = args[0] # Pre-allocate and prepare inputs - 
tokens_input = np.array([tokens], dtype=np.int64) - style_input = voicepack[len(tokens)-2].numpy() # Already has correct dimensions + tokens_input = np.array([input_data], dtype=np.int64) + style_input = voicepack[len(input_data)-2].numpy() # Already has correct dimensions speed_input = np.full(1, speed, dtype=np.float32) # More efficient than ones * speed # Run inference with optimized inputs - return cls._onnx_session.run( + result = cls._onnx_session.run( None, { 'tokens': tokens_input, 'style': style_input, 'speed': speed_input } - )[0] + ) + return result[0] diff --git a/api/src/services/tts_gpu.py b/api/src/services/tts_gpu.py index eca5f35..5da5563 100644 --- a/api/src/services/tts_gpu.py +++ b/api/src/services/tts_gpu.py @@ -1,10 +1,13 @@ import os +import numpy as np import torch from loguru import logger from models import build_model from kokoro import generate -class TTSGPUModel: +from .tts_base import TTSBaseModel + +class TTSGPUModel(TTSBaseModel): _instance = None _device = "cuda" @@ -24,9 +27,26 @@ class TTSGPUModel: return cls._instance @classmethod - def generate(cls, text: str, voicepack: torch.Tensor, lang: str, speed: float) -> tuple[torch.Tensor, dict]: - """Generate audio using PyTorch model on GPU""" + def generate(cls, input_data: str, voicepack: torch.Tensor, *args) -> np.ndarray: + """Generate audio using PyTorch model on GPU + + Args: + input_data: Text string to generate audio from + voicepack: Voice tensor + *args: (lang, speed) tuple + + Returns: + np.ndarray: Generated audio samples + """ if cls._instance is None: raise RuntimeError("GPU model not initialized") - return generate(cls._instance, text, voicepack, lang=lang, speed=speed) + lang, speed = args + result = generate(cls._instance, input_data, voicepack, lang=lang, speed=speed) + # kokoro.generate returns (audio, metadata, info), we only want audio + audio = result[0] + + # Convert to numpy array if needed + if isinstance(audio, torch.Tensor): + audio = audio.cpu().numpy() + return audio diff --git a/api/src/services/tts_model.py b/api/src/services/tts_model.py index 30c07d2..1e04939 100644 --- a/api/src/services/tts_model.py +++ b/api/src/services/tts_model.py @@ -1,94 +1,8 @@ -import os -import threading import torch -from loguru import logger -from kokoro import tokenize, phonemize -from ..core.config import settings -from .tts_cpu import TTSCPUModel -from .tts_gpu import TTSGPUModel +if torch.cuda.is_available(): + from .tts_gpu import TTSGPUModel as TTSModel +else: + from .tts_cpu import TTSCPUModel as TTSModel - -class TTSModel: - _device = None - _lock = threading.Lock() - VOICES_DIR = os.path.join(os.path.dirname(os.path.dirname(__file__)), "voices") - - @classmethod - def initialize(cls): - """Initialize and warm up the model""" - with cls._lock: - # Set device and initialize model - cls._device = "cuda" if torch.cuda.is_available() else "cpu" - logger.info(f"Initializing model on {cls._device}") - - # Initialize appropriate model based on device - if cls._device == "cuda": - if not TTSGPUModel.initialize(settings.model_dir, settings.model_path): - raise RuntimeError("Failed to initialize GPU model") - else: - # Try CPU ONNX first, fallback to CPU PyTorch if needed - if not TTSCPUModel.initialize(settings.model_dir): - logger.warning("ONNX initialization failed, falling back to PyTorch CPU") - if not TTSGPUModel.initialize(settings.model_dir, settings.model_path): - raise RuntimeError("Failed to initialize CPU model") - - # Setup voices directory - os.makedirs(cls.VOICES_DIR, exist_ok=True) - 
- # Copy base voices to local directory - base_voices_dir = os.path.join(settings.model_dir, settings.voices_dir) - if os.path.exists(base_voices_dir): - for file in os.listdir(base_voices_dir): - if file.endswith(".pt"): - voice_name = file[:-3] - voice_path = os.path.join(cls.VOICES_DIR, file) - if not os.path.exists(voice_path): - try: - logger.info( - f"Copying base voice {voice_name} to voices directory" - ) - base_path = os.path.join(base_voices_dir, file) - voicepack = torch.load( - base_path, - map_location=cls._device, - weights_only=True, - ) - torch.save(voicepack, voice_path) - except Exception as e: - logger.error( - f"Error copying voice {voice_name}: {str(e)}" - ) - - # Warm up with default voice - try: - dummy_text = "Hello" - voice_path = os.path.join(cls.VOICES_DIR, "af.pt") - dummy_voicepack = torch.load( - voice_path, map_location=cls._device, weights_only=True - ) - - if cls._device == "cuda": - TTSGPUModel.generate(dummy_text, dummy_voicepack, "a", 1.0) - else: - ps = phonemize(dummy_text, "a") - tokens = tokenize(ps) - tokens = [0] + tokens + [0] - TTSCPUModel.generate(tokens, dummy_voicepack, 1.0) - - logger.info("Model warm-up complete") - except Exception as e: - logger.warning(f"Model warm-up failed: {e}") - - # Count voices in directory - voice_count = len( - [f for f in os.listdir(cls.VOICES_DIR) if f.endswith(".pt")] - ) - return voice_count - - @classmethod - def get_device(cls): - """Get the current device or raise an error""" - if cls._device is None: - raise RuntimeError("Model not initialized. Call initialize() first.") - return cls._device +__all__ = ["TTSModel"] diff --git a/api/src/services/tts_service.py b/api/src/services/tts_service.py index 67cd155..cddbc47 100644 --- a/api/src/services/tts_service.py +++ b/api/src/services/tts_service.py @@ -12,8 +12,6 @@ from loguru import logger from ..core.config import settings from .tts_model import TTSModel -from .tts_cpu import TTSCPUModel -from .tts_gpu import TTSGPUModel class TTSService: @@ -22,6 +20,8 @@ class TTSService: def _split_text(self, text: str) -> List[str]: """Split text into sentences""" + if not isinstance(text, str): + text = str(text) if text is not None else "" return [s.strip() for s in re.split(r"(?<=[.!?])\s+", text) if s.strip()] def _get_voice_path(self, voice_name: str) -> Optional[str]: @@ -37,9 +37,12 @@ class TTSService: try: # Normalize text once at the start - text = normalize_text(text) if not text: raise ValueError("Text is empty after preprocessing") + normalized = normalize_text(text) + if not normalized: + raise ValueError("Text is empty after preprocessing") + text = str(normalized) # Check voice exists voice_path = self._get_voice_path(voice) @@ -61,12 +64,18 @@ class TTSService: try: # Process chunk if TTSModel.get_device() == "cuda": - chunk_audio, _ = TTSGPUModel.generate(chunk, voicepack, voice[0], speed) + # GPU takes (text, voicepack, lang, speed) + try: + chunk_audio = TTSModel.generate(chunk, voicepack, voice[0], speed) + except RuntimeError as e: + logger.error(f"Failed to generate audio: {str(e)}") + chunk_audio = None else: + # CPU takes (tokens, voicepack, speed) ps = phonemize(chunk, voice[0]) tokens = tokenize(ps) - tokens = [0] + tokens + [0] # Add padding - chunk_audio = TTSCPUModel.generate(tokens, voicepack, speed) + tokens = [0] + list(tokens) + [0] # Add padding + chunk_audio = TTSModel.generate(tokens, voicepack, speed) if chunk_audio is not None: audio_chunks.append(chunk_audio) @@ -90,12 +99,18 @@ class TTSService: else: # Process single chunk if 
TTSModel.get_device() == "cuda": - audio, _ = TTSGPUModel.generate(text, voicepack, voice[0], speed) + # GPU takes (text, voicepack, lang, speed) + try: + audio = TTSModel.generate(text, voicepack, voice[0], speed) + except RuntimeError as e: + logger.error(f"Failed to generate audio: {str(e)}") + raise ValueError("No audio chunks were generated successfully") else: + # CPU takes (tokens, voicepack, speed) ps = phonemize(text, voice[0]) tokens = tokenize(ps) - tokens = [0] + tokens + [0] # Add padding - audio = TTSCPUModel.generate(tokens, voicepack, speed) + tokens = [0] + list(tokens) + [0] # Add padding + audio = TTSModel.generate(tokens, voicepack, speed) processing_time = time.time() - start_time return audio, processing_time diff --git a/api/tests/conftest.py b/api/tests/conftest.py index c41172f..5803170 100644 --- a/api/tests/conftest.py +++ b/api/tests/conftest.py @@ -36,7 +36,7 @@ sys.modules["kokoro.tokenize"] = Mock() @pytest.fixture(autouse=True) def mock_tts_model(): """Mock TTSModel to avoid loading real models during tests""" - with patch("api.src.services.tts.TTSModel") as mock: + with patch("api.src.services.tts_model.TTSModel") as mock: model_instance = Mock() model_instance.get_instance.return_value = model_instance model_instance.get_voicepack.return_value = None diff --git a/api/tests/test_main.py b/api/tests/test_main.py index 5b23749..c6a972e 100644 --- a/api/tests/test_main.py +++ b/api/tests/test_main.py @@ -26,13 +26,11 @@ def test_health_check(test_client): @patch("api.src.main.logger") async def test_lifespan_successful_warmup(mock_logger, mock_tts_model): """Test successful model warmup in lifespan""" - # Mock the model initialization with model info and voicepack count - mock_model = MagicMock() # Mock file system for voice counting mock_tts_model.VOICES_DIR = "/mock/voices" with patch("os.listdir", return_value=["voice1.pt", "voice2.pt", "voice3.pt"]): - mock_tts_model.initialize.return_value = (mock_model, 3) # 3 voice files - mock_tts_model._device = "cuda" # Set device class variable + mock_tts_model.setup.return_value = 3 # 3 voice files + mock_tts_model.get_device.return_value = "cuda" # Create an async generator from the lifespan context manager async_gen = lifespan(MagicMock()) @@ -44,8 +42,8 @@ async def test_lifespan_successful_warmup(mock_logger, mock_tts_model): mock_logger.info.assert_any_call("Model loaded and warmed up on cuda") mock_logger.info.assert_any_call("3 voice packs loaded successfully") - # Verify model initialization was called - mock_tts_model.initialize.assert_called_once() + # Verify model setup was called + mock_tts_model.setup.assert_called_once() # Clean up await async_gen.__aexit__(None, None, None) @@ -56,14 +54,14 @@ async def test_lifespan_successful_warmup(mock_logger, mock_tts_model): @patch("api.src.main.logger") async def test_lifespan_failed_warmup(mock_logger, mock_tts_model): """Test failed model warmup in lifespan""" - # Mock the model initialization to fail - mock_tts_model.initialize.side_effect = Exception("Failed to initialize model") + # Mock the model setup to fail + mock_tts_model.setup.side_effect = RuntimeError("Failed to initialize model") # Create an async generator from the lifespan context manager async_gen = lifespan(MagicMock()) # Verify the exception is raised - with pytest.raises(Exception, match="Failed to initialize model"): + with pytest.raises(RuntimeError, match="Failed to initialize model"): await async_gen.__aenter__() # Verify the expected logging sequence @@ -77,20 +75,18 @@ async def 
test_lifespan_failed_warmup(mock_logger, mock_tts_model): @patch("api.src.main.TTSModel") async def test_lifespan_cuda_warmup(mock_tts_model): """Test model warmup specifically on CUDA""" - # Mock the model initialization with CUDA and voicepacks - mock_model = MagicMock() # Mock file system for voice counting mock_tts_model.VOICES_DIR = "/mock/voices" with patch("os.listdir", return_value=["voice1.pt", "voice2.pt"]): - mock_tts_model.initialize.return_value = (mock_model, 2) # 2 voice files - mock_tts_model._device = "cuda" # Set device class variable + mock_tts_model.setup.return_value = 2 # 2 voice files + mock_tts_model.get_device.return_value = "cuda" # Create an async generator from the lifespan context manager async_gen = lifespan(MagicMock()) await async_gen.__aenter__() - # Verify model was initialized - mock_tts_model.initialize.assert_called_once() + # Verify model setup was called + mock_tts_model.setup.assert_called_once() # Clean up await async_gen.__aexit__(None, None, None) @@ -100,22 +96,20 @@ async def test_lifespan_cuda_warmup(mock_tts_model): @patch("api.src.main.TTSModel") async def test_lifespan_cpu_fallback(mock_tts_model): """Test model warmup falling back to CPU""" - # Mock the model initialization with CPU and voicepacks - mock_model = MagicMock() # Mock file system for voice counting mock_tts_model.VOICES_DIR = "/mock/voices" with patch( "os.listdir", return_value=["voice1.pt", "voice2.pt", "voice3.pt", "voice4.pt"] ): - mock_tts_model.initialize.return_value = (mock_model, 4) # 4 voice files - mock_tts_model._device = "cpu" # Set device class variable + mock_tts_model.setup.return_value = 4 # 4 voice files + mock_tts_model.get_device.return_value = "cpu" # Create an async generator from the lifespan context manager async_gen = lifespan(MagicMock()) await async_gen.__aenter__() - # Verify model was initialized - mock_tts_model.initialize.assert_called_once() + # Verify model setup was called + mock_tts_model.setup.assert_called_once() # Clean up await async_gen.__aexit__(None, None, None) diff --git a/api/tests/test_tts_service.py b/api/tests/test_tts_service.py index 7ade493..1286e91 100644 --- a/api/tests/test_tts_service.py +++ b/api/tests/test_tts_service.py @@ -7,6 +7,7 @@ import numpy as np import torch import pytest +from api.src.core.config import settings from api.src.services.tts_model import TTSModel from api.src.services.tts_service import TTSService @@ -14,7 +15,7 @@ from api.src.services.tts_service import TTSService @pytest.fixture def tts_service(): """Create a TTSService instance for testing""" - return TTSService(start_worker=False) + return TTSService() @pytest.fixture @@ -86,6 +87,7 @@ def test_generate_audio_empty_text( ): """Test generating audio with empty text""" mock_normalize.return_value = "" + mock_instance.return_value = (MagicMock(), "cpu") with pytest.raises(ValueError, match="Text is empty after preprocessing"): tts_service._generate_audio("", "af", 1.0) @@ -111,7 +113,7 @@ def test_generate_audio_no_chunks( """Test generating audio with no successful chunks""" mock_normalize.return_value = "Test text" mock_phonemize.return_value = "Test text" - mock_tokenize.return_value = ["test", "text"] + mock_tokenize.return_value = [1, 2] # Return integers instead of strings mock_generate.return_value = (None, None) mock_instance.return_value = (MagicMock(), "cpu") mock_exists.return_value = True @@ -156,57 +158,23 @@ def test_combine_voices_invalid_input(tts_service): tts_service.combine_voices(["voice1"]) -@patch("os.makedirs") + 
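
Aside on the rewritten tests below: kokoro.tokenize is now stubbed to return
integer token IDs rather than strings, because the CPU generate() path packs
the tokens straight into an int64 batch for onnxruntime. A minimal sketch of
why string tokens cannot work there (the [1, 2] values are illustrative, not
part of this patch):

    import numpy as np

    # Mirrors what _generate_audio builds before the CPU path runs inference;
    # string "tokens" would fail the int64 cast below with a ValueError.
    tokens = [0] + [1, 2] + [0]                        # padded token IDs
    tokens_input = np.array([tokens], dtype=np.int64)  # shape (1, len(tokens))
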
+@patch("api.src.services.tts_model.TTSModel.get_instance") +@patch("api.src.services.tts_model.TTSModel.get_device") +@patch("api.src.services.tts_model.TTSModel.generate") @patch("os.path.exists") -@patch("os.listdir") -@patch("torch.load") -@patch("torch.save") -@patch("os.path.join") -def test_ensure_voices( - mock_join, - mock_save, - mock_load, - mock_listdir, - mock_exists, - mock_makedirs, - tts_service, -): - """Test voice directory initialization""" - # Setup mocks - mock_exists.side_effect = [ - True, - False, - False, - ] # base_dir exists, voice files don't exist - mock_listdir.return_value = ["voice1.pt", "voice2.pt"] - mock_load.return_value = MagicMock() - mock_join.return_value = "/fake/path" - - # Test voice directory initialization - tts_service._ensure_voices() - - # Verify directory was created - mock_makedirs.assert_called_once() - - # Verify voices were loaded and saved - assert mock_load.call_count == len(mock_listdir.return_value) - assert mock_save.call_count == len(mock_listdir.return_value) - - -@patch("api.src.services.tts.TTSModel.get_instance") -@patch("os.path.exists") -@patch("api.src.services.tts.normalize_text") -@patch("api.src.services.tts.phonemize") -@patch("api.src.services.tts.tokenize") -@patch("api.src.services.tts.generate") +@patch("kokoro.normalize_text") +@patch("kokoro.phonemize") +@patch("kokoro.tokenize") @patch("torch.load") def test_generate_audio_success( mock_torch_load, - mock_generate, mock_tokenize, mock_phonemize, mock_normalize, mock_exists, + mock_model_generate, + mock_get_device, mock_instance, tts_service, sample_audio, @@ -214,12 +182,17 @@ def test_generate_audio_success( """Test successful audio generation""" mock_normalize.return_value = "Test text" mock_phonemize.return_value = "Test text" - mock_tokenize.return_value = ["test", "text"] - mock_generate.return_value = (sample_audio, None) + mock_tokenize.return_value = [1, 2] # Return integers instead of strings + mock_model_generate.return_value = sample_audio mock_instance.return_value = (MagicMock(), "cpu") + mock_get_device.return_value = "cpu" mock_exists.return_value = True mock_torch_load.return_value = MagicMock() + # Initialize model + TTSModel._instance = None + TTSModel._device = "cpu" + audio, processing_time = tts_service._generate_audio("Test text", "af", 1.0) assert isinstance(audio, np.ndarray) assert isinstance(processing_time, float) @@ -227,35 +200,94 @@ def test_generate_audio_success( @patch("torch.cuda.is_available") -@patch("models.build_model") -def test_model_initialization_cuda(mock_build_model, mock_cuda_available): +@patch("api.src.services.tts_gpu.TTSGPUModel.initialize") +@patch("os.makedirs") +@patch("os.path.exists") +@patch("os.listdir") +@patch("torch.load") +@patch("torch.save") +@patch("api.src.core.config.settings") +@patch("torch.zeros") +def test_model_initialization_cuda( + mock_zeros, + mock_settings, + mock_save, + mock_load, + mock_listdir, + mock_exists, + mock_makedirs, + mock_initialize, + mock_cuda_available, +): """Test model initialization with CUDA""" + # Setup mocks mock_cuda_available.return_value = True - mock_model = MagicMock() - mock_build_model.return_value = mock_model + mock_initialize.return_value = True + mock_exists.return_value = True + mock_listdir.return_value = ["voice1.pt", "voice2.pt"] + mock_load.return_value = torch.zeros(1) + mock_settings.model_dir = "test_dir" + mock_settings.model_path = "test_path" + mock_settings.voices_dir = "voices" + mock_zeros.return_value = torch.zeros(1) - TTSModel._instance = 
None # Reset singleton - model, voice_count = TTSModel.initialize() + # Reset singleton and device + TTSModel._instance = None + TTSModel._device = None + + # Mock settings to prevent actual file operations + with patch.object(settings, 'model_dir', 'test_dir'), \ + patch.object(settings, 'model_path', 'test_path'): + voice_count = TTSModel.setup() - assert TTSModel._device == "cuda" # Check the class variable instead - assert model == mock_model - mock_build_model.assert_called_once() + assert TTSModel.get_device() == "cuda" + assert voice_count == 2 + mock_initialize.assert_called_once_with("test_dir", "test_path") -@patch("api.src.services.tts.torch.cuda.is_available") -@patch("api.src.services.tts.build_model") -def test_model_initialization_cpu(mock_build_model, mock_cuda_available): +@patch("torch.cuda.is_available") +@patch("api.src.services.tts_base.TTSBaseModel.initialize") +@patch("os.makedirs") +@patch("os.path.exists") +@patch("os.listdir") +@patch("torch.load") +@patch("torch.save") +@patch("api.src.core.config.settings") +@patch("torch.zeros") +def test_model_initialization_cpu( + mock_zeros, + mock_settings, + mock_save, + mock_load, + mock_listdir, + mock_exists, + mock_makedirs, + mock_initialize, + mock_cuda_available, +): """Test model initialization with CPU""" + # Setup mocks mock_cuda_available.return_value = False - mock_model = MagicMock() - mock_build_model.return_value = mock_model + mock_initialize.return_value = False # This will trigger the RuntimeError + mock_exists.return_value = True + mock_listdir.return_value = ["voice1.pt", "voice2.pt", "voice3.pt"] + mock_load.return_value = torch.zeros(1) + mock_settings.model_dir = "test_dir" + mock_settings.model_path = "test_path" + mock_settings.voices_dir = "voices" + mock_zeros.return_value = torch.zeros(1) - TTSModel._instance = None # Reset singleton - model, voice_count = TTSModel.initialize() + # Reset singleton and device + TTSModel._instance = None + TTSModel._device = None - assert TTSModel._device == "cpu" # Check the class variable instead - assert model == mock_model - mock_build_model.assert_called_once() + # Mock settings to prevent actual file operations + with patch.object(settings, 'model_dir', 'test_dir'), \ + patch.object(settings, 'model_path', 'test_path'), \ + pytest.raises(RuntimeError, match="Failed to initialize CPU model"): + TTSModel.setup() + + mock_initialize.assert_called_once_with("test_dir", "test_path") @patch("api.src.services.tts_service.TTSService._get_voice_path") @@ -267,7 +299,7 @@ def test_voicepack_loading_error(mock_get_instance, mock_get_voice_path): TTSModel._voicepacks = {} # Reset voicepacks - service = TTSService(start_worker=False) + service = TTSService() with pytest.raises(ValueError, match="Voice not found: nonexistent_voice"): service._generate_audio("test", "nonexistent_voice", 1.0) @@ -286,23 +318,32 @@ def test_save_audio(mock_tts_model, tts_service, sample_audio, tmp_path): @patch("api.src.services.tts_model.TTSModel.get_instance") +@patch("api.src.services.tts_model.TTSModel.get_device") +@patch("api.src.services.tts_model.TTSModel.generate") @patch("os.path.exists") -@patch("api.src.services.tts.normalize_text") -@patch("api.src.services.tts.generate") +@patch("kokoro.normalize_text") +@patch("kokoro.phonemize") +@patch("kokoro.tokenize") @patch("torch.load") def test_generate_audio_without_stitching( mock_torch_load, - mock_generate, + mock_tokenize, + mock_phonemize, mock_normalize, mock_exists, + mock_model_generate, + mock_get_device, mock_instance, 
tts_service, sample_audio, ): """Test generating audio without text stitching""" mock_normalize.return_value = "Test text" - mock_generate.return_value = (sample_audio, None) + mock_phonemize.return_value = "Test text" + mock_tokenize.return_value = [1, 2] # Return integers instead of strings + mock_model_generate.return_value = sample_audio mock_instance.return_value = (MagicMock(), "cpu") + mock_get_device.return_value = "cpu" mock_exists.return_value = True mock_torch_load.return_value = MagicMock() @@ -311,7 +352,7 @@ def test_generate_audio_without_stitching( ) assert isinstance(audio, np.ndarray) assert len(audio) > 0 - mock_generate.assert_called_once() + mock_model_generate.assert_called_once() @patch("os.listdir") @@ -323,12 +364,13 @@ def test_list_voices_error(mock_listdir, tts_service): assert voices == [] -@patch("api.src.services.tts.TTSModel.get_instance") +@patch("api.src.services.tts_model.TTSModel.get_instance") +@patch("api.src.services.tts_model.TTSModel.get_device") @patch("os.path.exists") -@patch("api.src.services.tts.normalize_text") -@patch("api.src.services.tts.phonemize") -@patch("api.src.services.tts.tokenize") -@patch("api.src.services.tts.generate") +@patch("kokoro.normalize_text") +@patch("kokoro.phonemize") +@patch("kokoro.tokenize") +@patch("kokoro.generate") @patch("torch.load") def test_generate_audio_phonemize_error( mock_torch_load, @@ -337,6 +379,7 @@ def test_generate_audio_phonemize_error( mock_phonemize, mock_normalize, mock_exists, + mock_get_device, mock_instance, tts_service, ): @@ -344,33 +387,51 @@ def test_generate_audio_phonemize_error( mock_normalize.return_value = "Test text" mock_phonemize.side_effect = Exception("Phonemization failed") mock_instance.return_value = (MagicMock(), "cpu") + mock_get_device.return_value = "cpu" mock_exists.return_value = True mock_torch_load.return_value = MagicMock() mock_generate.return_value = (None, None) + # Initialize model + TTSModel._instance = None + TTSModel._device = "cpu" + with pytest.raises(ValueError, match="No audio chunks were generated successfully"): tts_service._generate_audio("Test text", "af", 1.0) -@patch("api.src.services.tts.TTSModel.get_instance") +@patch("api.src.services.tts_model.TTSModel.get_instance") +@patch("api.src.services.tts_model.TTSModel.get_device") @patch("os.path.exists") -@patch("api.src.services.tts.normalize_text") -@patch("api.src.services.tts.generate") +@patch("kokoro.normalize_text") +@patch("kokoro.phonemize") +@patch("kokoro.tokenize") +@patch("kokoro.generate") @patch("torch.load") def test_generate_audio_error( mock_torch_load, mock_generate, + mock_tokenize, + mock_phonemize, mock_normalize, mock_exists, + mock_get_device, mock_instance, tts_service, ): """Test handling generation error""" mock_normalize.return_value = "Test text" + mock_phonemize.return_value = "Test text" + mock_tokenize.return_value = [1, 2] # Return integers instead of strings mock_generate.side_effect = Exception("Generation failed") mock_instance.return_value = (MagicMock(), "cpu") + mock_get_device.return_value = "cpu" mock_exists.return_value = True mock_torch_load.return_value = MagicMock() + # Initialize model + TTSModel._instance = None + TTSModel._device = "cpu" + with pytest.raises(ValueError, match="No audio chunks were generated successfully"): tts_service._generate_audio("Test text", "af", 1.0) diff --git a/examples/benchmarks/benchmark_tts_rtf.py b/examples/benchmarks/benchmark_tts_rtf.py index 9fb0c64..773f1d9 100644 --- a/examples/benchmarks/benchmark_tts_rtf.py +++ 
b/examples/benchmarks/benchmark_tts_rtf.py @@ -69,22 +69,13 @@ def get_gpu_memory(): def get_system_metrics(): """Get current system metrics""" - # Take multiple CPU measurements over a short period - samples = [] - for _ in range(3): # Take 3 samples - # Get both overall and per-CPU percentages - overall_cpu = psutil.cpu_percent(interval=0.1) - per_cpu = psutil.cpu_percent(percpu=True) - avg_per_cpu = sum(per_cpu) / len(per_cpu) - # Use the maximum of overall and average per-CPU - samples.append(max(overall_cpu, avg_per_cpu)) - - # Use the maximum CPU usage from all samples - cpu_usage = round(max(samples), 2) + # Get per-CPU percentages and calculate average + cpu_percentages = psutil.cpu_percent(percpu=True) + avg_cpu = sum(cpu_percentages) / len(cpu_percentages) metrics = { "timestamp": datetime.now().isoformat(), - "cpu_percent": cpu_usage, + "cpu_percent": round(avg_cpu, 2), "ram_percent": psutil.virtual_memory().percent, "ram_used_gb": psutil.virtual_memory().used / (1024**3), }
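
A note on the simplified get_system_metrics() above: psutil.cpu_percent()
called without an interval is non-blocking and reports usage accumulated
since the previous call, and the very first call returns a meaningless 0.0
for every core. A minimal sketch of priming the counter once at startup so
later samples are meaningful (sample_avg_cpu is an illustrative helper, not
part of this patch):

    import psutil

    psutil.cpu_percent(percpu=True)  # prime; the first reading is all zeros

    def sample_avg_cpu() -> float:
        # Non-blocking: reflects CPU usage since the previous cpu_percent() call
        per_cpu = psutil.cpu_percent(percpu=True)
        return round(sum(per_cpu) / len(per_cpu), 2)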