WIP, Functional for CPU: Updated Dockerfile and TTS Service for ONNX Runtime support

This commit is contained in:
remsky 2025-01-03 00:53:41 -07:00
parent f1131b4836
commit e4d8e74738
15 changed files with 946 additions and 313 deletions

View file

@@ -10,8 +10,9 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
&& apt-get clean \
&& rm -rf /var/lib/apt/lists/*
# Install PyTorch CPU version
RUN pip3 install --no-cache-dir torch==2.5.1 --extra-index-url https://download.pytorch.org/whl/cpu
# Install PyTorch CPU version and ONNX runtime
RUN pip3 install --no-cache-dir torch==2.5.1 --extra-index-url https://download.pytorch.org/whl/cpu && \
pip3 install --no-cache-dir onnxruntime==1.20.1
# Install all other dependencies from requirements.txt
COPY requirements.txt .
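As a quick sanity check of the resulting image (a sketch; run it from Python inside the built container), the pinned packages can be verified:

import onnxruntime
import torch

print(torch.__version__)                      # expect 2.5.1+cpu from the CPU wheel index
print(onnxruntime.__version__)                # expect 1.20.1
print(onnxruntime.get_available_providers())  # should include 'CPUExecutionProvider'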

View file

@@ -10,7 +10,8 @@ from fastapi import FastAPI
from fastapi.middleware.cors import CORSMiddleware
from .core.config import settings
from .services.tts import TTSModel, TTSService
from .services.tts_model import TTSModel
from .services.tts_service import TTSService
from .routers.openai_compatible import router as openai_router
@@ -20,7 +21,7 @@ async def lifespan(app: FastAPI):
logger.info("Loading TTS model and voice packs...")
# Initialize the main model with warm-up
model, voicepack_count = TTSModel.initialize()
voicepack_count = TTSModel.initialize()
logger.info(f"Model loaded and warmed up on {TTSModel._device}")
logger.info(f"{voicepack_count} voice packs loaded successfully")
yield

View file

@@ -3,7 +3,7 @@ from typing import List
from loguru import logger
from fastapi import Depends, Response, APIRouter, HTTPException
from ..services.tts import TTSService
from ..services.tts_service import TTSService
from ..services.audio import AudioService
from ..structures.schemas import OpenAISpeechRequest
@@ -15,9 +15,7 @@
def get_tts_service() -> TTSService:
"""Dependency to get TTSService instance with database session"""
return TTSService(
start_worker=False
) # Don't start worker thread for OpenAI endpoint
return TTSService() # Initialize TTSService with default settings
@router.post("/audio/speech")

View file

@@ -1,3 +1,4 @@
from .tts import TTSModel, TTSService
from .tts_model import TTSModel
from .tts_service import TTSService
__all__ = ["TTSService", "TTSModel"]

View file

@@ -1,286 +0,0 @@
import io
import os
import re
import time
import threading
from typing import List, Tuple, Optional
import numpy as np
import torch
import tiktoken
import scipy.io.wavfile as wavfile
from kokoro import generate, tokenize, phonemize, normalize_text
from loguru import logger
from models import build_model
from ..core.config import settings
enc = tiktoken.get_encoding("cl100k_base")
class TTSModel:
_instance = None
_device = None
_lock = threading.Lock()
# Directory for all voices (copied base voices, and any created combined voices)
VOICES_DIR = os.path.join(os.path.dirname(os.path.dirname(__file__)), "voices")
@classmethod
def initialize(cls):
"""Initialize and warm up the model"""
with cls._lock:
if cls._instance is None:
# Initialize model
cls._device = "cuda" if torch.cuda.is_available() else "cpu"
logger.info(f"Initializing model on {cls._device}")
model_path = os.path.join(settings.model_dir, settings.model_path)
model = build_model(model_path, cls._device)
cls._instance = model
# Ensure voices directory exists
os.makedirs(cls.VOICES_DIR, exist_ok=True)
# Copy base voices to local directory
base_voices_dir = os.path.join(settings.model_dir, settings.voices_dir)
if os.path.exists(base_voices_dir):
for file in os.listdir(base_voices_dir):
if file.endswith(".pt"):
voice_name = file[:-3]
voice_path = os.path.join(cls.VOICES_DIR, file)
if not os.path.exists(voice_path):
try:
logger.info(
f"Copying base voice {voice_name} to voices directory"
)
base_path = os.path.join(base_voices_dir, file)
voicepack = torch.load(
base_path,
map_location=cls._device,
weights_only=True,
)
torch.save(voicepack, voice_path)
except Exception as e:
logger.error(
f"Error copying voice {voice_name}: {str(e)}"
)
# Warm up with default voice
try:
dummy_text = "Hello"
voice_path = os.path.join(cls.VOICES_DIR, "af.pt")
dummy_voicepack = torch.load(
voice_path, map_location=cls._device, weights_only=True
)
generate(model, dummy_text, dummy_voicepack, lang="a", speed=1.0)
logger.info("Model warm-up complete")
except Exception as e:
logger.warning(f"Model warm-up failed: {e}")
# Count voices in directory for validation
voice_count = len(
[f for f in os.listdir(cls.VOICES_DIR) if f.endswith(".pt")]
)
return cls._instance, voice_count
@classmethod
def get_instance(cls):
"""Get the initialized instance or raise an error"""
if cls._instance is None:
raise RuntimeError("Model not initialized. Call initialize() first.")
return cls._instance, cls._device
class TTSService:
def __init__(self, output_dir: str = None, start_worker: bool = False):
self.output_dir = output_dir
self._ensure_voices()
if start_worker:
self.start_worker()
def _ensure_voices(self):
"""Copy base voices to local voices directory during initialization"""
os.makedirs(TTSModel.VOICES_DIR, exist_ok=True)
base_voices_dir = os.path.join(settings.model_dir, settings.voices_dir)
if os.path.exists(base_voices_dir):
for file in os.listdir(base_voices_dir):
if file.endswith(".pt"):
voice_name = file[:-3]
voice_path = os.path.join(TTSModel.VOICES_DIR, file)
if not os.path.exists(voice_path):
try:
logger.info(
f"Copying base voice {voice_name} to voices directory"
)
base_path = os.path.join(base_voices_dir, file)
voicepack = torch.load(
base_path,
map_location=TTSModel._device,
weights_only=True,
)
torch.save(voicepack, voice_path)
except Exception as e:
logger.error(f"Error copying voice {voice_name}: {str(e)}")
def _split_text(self, text: str) -> List[str]:
"""Split text into sentences"""
return [s.strip() for s in re.split(r"(?<=[.!?])\s+", text) if s.strip()]
def _get_voice_path(self, voice_name: str) -> Optional[str]:
"""Get the path to a voice file.
Args:
voice_name: Name of the voice to find
Returns:
Path to the voice file if found, None otherwise
"""
voice_path = os.path.join(TTSModel.VOICES_DIR, f"{voice_name}.pt")
return voice_path if os.path.exists(voice_path) else None
def _generate_audio(
self, text: str, voice: str, speed: float, stitch_long_output: bool = True
) -> Tuple[torch.Tensor, float]:
"""Generate audio and measure processing time"""
start_time = time.time()
try:
# Normalize text once at the start
text = normalize_text(text)
if not text:
raise ValueError("Text is empty after preprocessing")
# Check voice exists
voice_path = self._get_voice_path(voice)
if not voice_path:
raise ValueError(f"Voice not found: {voice}")
# Load model and voice
model = TTSModel._instance
voicepack = torch.load(
voice_path, map_location=TTSModel._device, weights_only=True
)
# Generate audio with or without stitching
if stitch_long_output:
chunks = self._split_text(text)
audio_chunks = []
# Process all chunks with same model/voicepack instance
for i, chunk in enumerate(chunks):
try:
# Validate phonemization first
# ps = phonemize(chunk, voice[0])
# tokens = tokenize(ps)
# logger.debug(
# f"Processing chunk {i + 1}/{len(chunks)}: {len(tokens)} tokens"
# )
# Only proceed if phonemization succeeded
chunk_audio, _ = generate(
model, chunk, voicepack, lang=voice[0], speed=speed
)
if chunk_audio is not None:
audio_chunks.append(chunk_audio)
else:
logger.error(
f"No audio generated for chunk {i + 1}/{len(chunks)}"
)
except Exception as e:
logger.error(
f"Failed to generate audio for chunk {i + 1}/{len(chunks)}: '{chunk}'. Error: {str(e)}"
)
continue
if not audio_chunks:
raise ValueError("No audio chunks were generated successfully")
audio = (
np.concatenate(audio_chunks)
if len(audio_chunks) > 1
else audio_chunks[0]
)
else:
audio, _ = generate(model, text, voicepack, lang=voice[0], speed=speed)
processing_time = time.time() - start_time
return audio, processing_time
except Exception as e:
print(f"Error in audio generation: {str(e)}")
raise
def _save_audio(self, audio: torch.Tensor, filepath: str):
"""Save audio to file"""
os.makedirs(os.path.dirname(filepath), exist_ok=True)
wavfile.write(filepath, 24000, audio)
def _audio_to_bytes(self, audio: torch.Tensor) -> bytes:
"""Convert audio tensor to WAV bytes"""
buffer = io.BytesIO()
wavfile.write(buffer, 24000, audio)
return buffer.getvalue()
def combine_voices(self, voices: List[str]) -> str:
"""Combine multiple voices into a new voice.
Args:
voices: List of voice names to combine
Returns:
Name of the combined voice
Raises:
ValueError: If less than 2 voices provided or voice loading fails
RuntimeError: If voice combination or saving fails
"""
if len(voices) < 2:
raise ValueError("At least 2 voices are required for combination")
# Load voices
t_voices: List[torch.Tensor] = []
v_name: List[str] = []
for voice in voices:
try:
voice_path = os.path.join(TTSModel.VOICES_DIR, f"{voice}.pt")
voicepack = torch.load(
voice_path, map_location=TTSModel._device, weights_only=True
)
t_voices.append(voicepack)
v_name.append(voice)
except Exception as e:
raise ValueError(f"Failed to load voice {voice}: {str(e)}")
# Combine voices
try:
f: str = "_".join(v_name)
v = torch.mean(torch.stack(t_voices), dim=0)
combined_path = os.path.join(TTSModel.VOICES_DIR, f"{f}.pt")
# Save combined voice
try:
torch.save(v, combined_path)
except Exception as e:
raise RuntimeError(
f"Failed to save combined voice to {combined_path}: {str(e)}"
)
return f
except Exception as e:
if not isinstance(e, (ValueError, RuntimeError)):
raise RuntimeError(f"Error combining voices: {str(e)}")
raise
def list_voices(self) -> List[str]:
"""List all available voices"""
voices = []
try:
for file in os.listdir(TTSModel.VOICES_DIR):
if file.endswith(".pt"):
voices.append(file[:-3]) # Remove .pt extension
except Exception as e:
logger.error(f"Error listing voices: {str(e)}")
return sorted(voices)

View file

@@ -0,0 +1,65 @@
import os
import numpy as np
import torch
from onnxruntime import InferenceSession, SessionOptions, GraphOptimizationLevel, ExecutionMode
from loguru import logger
class TTSCPUModel:
_instance = None
_onnx_session = None
@classmethod
def initialize(cls, model_dir: str):
"""Initialize ONNX model for CPU inference"""
if cls._onnx_session is None:
# Try loading ONNX model
onnx_path = os.path.join(model_dir, "kokoro-v0_19.onnx")
if not os.path.exists(onnx_path):
return None
logger.info(f"Loading ONNX model from {onnx_path}")
# Configure ONNX session for optimal performance
session_options = SessionOptions()
session_options.graph_optimization_level = GraphOptimizationLevel.ORT_ENABLE_ALL
session_options.intra_op_num_threads = 4 # Adjust based on CPU cores
session_options.execution_mode = ExecutionMode.ORT_SEQUENTIAL
# Configure CPU provider options: a flat dict of options per provider,
# aligned positionally with the providers list passed below
provider_options = {
'arena_extend_strategy': 'kNextPowerOfTwo',
'cpu_memory_arena_cfg': 'cpu:0'
}
cls._onnx_session = InferenceSession(
onnx_path,
sess_options=session_options,
providers=['CPUExecutionProvider'],
provider_options=[provider_options]
)
return cls._onnx_session
@classmethod
def generate(cls, tokens: list, voicepack: torch.Tensor, speed: float) -> np.ndarray:
"""Generate audio using ONNX model"""
if cls._onnx_session is None:
raise RuntimeError("ONNX model not initialized")
# Pre-allocate and prepare inputs
tokens_input = np.array([tokens], dtype=np.int64)
style_input = voicepack[len(tokens)-2].numpy() # style vector indexed by the unpadded token count
speed_input = np.full(1, speed, dtype=np.float32) # scalar speed as a 1-element float32 array
# Run inference with optimized inputs
return cls._onnx_session.run(
None,
{
'tokens': tokens_input,
'style': style_input,
'speed': speed_input
}
)[0]
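For orientation, a minimal sketch of driving TTSCPUModel directly, mirroring the CPU warm-up path in tts_model.py below; the model directory and voice path are assumptions about the local layout:

import torch
from kokoro import phonemize, tokenize

TTSCPUModel.initialize("models")  # assumed directory containing kokoro-v0_19.onnx
voicepack = torch.load("voices/af.pt", map_location="cpu", weights_only=True)

ps = phonemize("Hello", "a")          # language code "a", as in the warm-up code
tokens = [0] + tokenize(ps) + [0]     # pad with start/end tokens
audio = TTSCPUModel.generate(tokens, voicepack, 1.0)  # numpy array of 24 kHz samples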

View file

@@ -0,0 +1,32 @@
import os
import torch
from loguru import logger
from models import build_model
from kokoro import generate
class TTSGPUModel:
_instance = None
_device = "cuda"
@classmethod
def initialize(cls, model_dir: str, model_path: str):
"""Initialize PyTorch model for GPU inference"""
if cls._instance is None and torch.cuda.is_available():
try:
logger.info("Initializing GPU model")
model_path = os.path.join(model_dir, model_path)
model = build_model(model_path, cls._device)
cls._instance = model
return cls._instance
except Exception as e:
logger.error(f"Failed to initialize GPU model: {e}")
return None
return cls._instance
@classmethod
def generate(cls, text: str, voicepack: torch.Tensor, lang: str, speed: float) -> tuple[torch.Tensor, dict]:
"""Generate audio using PyTorch model on GPU"""
if cls._instance is None:
raise RuntimeError("GPU model not initialized")
return generate(cls._instance, text, voicepack, lang=lang, speed=speed)

View file

@@ -0,0 +1,94 @@
import os
import threading
import torch
from loguru import logger
from kokoro import tokenize, phonemize
from ..core.config import settings
from .tts_cpu import TTSCPUModel
from .tts_gpu import TTSGPUModel
class TTSModel:
_device = None
_lock = threading.Lock()
VOICES_DIR = os.path.join(os.path.dirname(os.path.dirname(__file__)), "voices")
@classmethod
def initialize(cls):
"""Initialize and warm up the model"""
with cls._lock:
# Set device and initialize model
cls._device = "cuda" if torch.cuda.is_available() else "cpu"
logger.info(f"Initializing model on {cls._device}")
# Initialize appropriate model based on device
if cls._device == "cuda":
if not TTSGPUModel.initialize(settings.model_dir, settings.model_path):
raise RuntimeError("Failed to initialize GPU model")
else:
# Try CPU ONNX first, fallback to CPU PyTorch if needed
if not TTSCPUModel.initialize(settings.model_dir):
logger.warning("ONNX initialization failed, falling back to PyTorch CPU")
if not TTSGPUModel.initialize(settings.model_dir, settings.model_path):
raise RuntimeError("Failed to initialize CPU model")
# Setup voices directory
os.makedirs(cls.VOICES_DIR, exist_ok=True)
# Copy base voices to local directory
base_voices_dir = os.path.join(settings.model_dir, settings.voices_dir)
if os.path.exists(base_voices_dir):
for file in os.listdir(base_voices_dir):
if file.endswith(".pt"):
voice_name = file[:-3]
voice_path = os.path.join(cls.VOICES_DIR, file)
if not os.path.exists(voice_path):
try:
logger.info(
f"Copying base voice {voice_name} to voices directory"
)
base_path = os.path.join(base_voices_dir, file)
voicepack = torch.load(
base_path,
map_location=cls._device,
weights_only=True,
)
torch.save(voicepack, voice_path)
except Exception as e:
logger.error(
f"Error copying voice {voice_name}: {str(e)}"
)
# Warm up with default voice
try:
dummy_text = "Hello"
voice_path = os.path.join(cls.VOICES_DIR, "af.pt")
dummy_voicepack = torch.load(
voice_path, map_location=cls._device, weights_only=True
)
if cls._device == "cuda":
TTSGPUModel.generate(dummy_text, dummy_voicepack, "a", 1.0)
else:
ps = phonemize(dummy_text, "a")
tokens = tokenize(ps)
tokens = [0] + tokens + [0]
TTSCPUModel.generate(tokens, dummy_voicepack, 1.0)
logger.info("Model warm-up complete")
except Exception as e:
logger.warning(f"Model warm-up failed: {e}")
# Count voices in directory
voice_count = len(
[f for f in os.listdir(cls.VOICES_DIR) if f.endswith(".pt")]
)
return voice_count
@classmethod
def get_device(cls):
"""Get the current device or raise an error"""
if cls._device is None:
raise RuntimeError("Model not initialized. Call initialize() first.")
return cls._device

View file

@@ -0,0 +1,168 @@
import io
import os
import re
import time
from typing import List, Tuple, Optional
import numpy as np
import torch
import scipy.io.wavfile as wavfile
from kokoro import tokenize, phonemize, normalize_text
from loguru import logger
from ..core.config import settings
from .tts_model import TTSModel
from .tts_cpu import TTSCPUModel
from .tts_gpu import TTSGPUModel
class TTSService:
def __init__(self, output_dir: str = None):
self.output_dir = output_dir
def _split_text(self, text: str) -> List[str]:
"""Split text into sentences"""
return [s.strip() for s in re.split(r"(?<=[.!?])\s+", text) if s.strip()]
def _get_voice_path(self, voice_name: str) -> Optional[str]:
"""Get the path to a voice file"""
voice_path = os.path.join(TTSModel.VOICES_DIR, f"{voice_name}.pt")
return voice_path if os.path.exists(voice_path) else None
def _generate_audio(
self, text: str, voice: str, speed: float, stitch_long_output: bool = True
) -> Tuple[torch.Tensor, float]:
"""Generate audio and measure processing time"""
start_time = time.time()
try:
# Normalize text once at the start
text = normalize_text(text)
if not text:
raise ValueError("Text is empty after preprocessing")
# Check voice exists
voice_path = self._get_voice_path(voice)
if not voice_path:
raise ValueError(f"Voice not found: {voice}")
# Load voice
voicepack = torch.load(
voice_path, map_location=TTSModel.get_device(), weights_only=True
)
# Generate audio with or without stitching
if stitch_long_output:
chunks = self._split_text(text)
audio_chunks = []
# Process all chunks
for i, chunk in enumerate(chunks):
try:
# Process chunk
if TTSModel.get_device() == "cuda":
chunk_audio, _ = TTSGPUModel.generate(chunk, voicepack, voice[0], speed)
else:
ps = phonemize(chunk, voice[0])
tokens = tokenize(ps)
tokens = [0] + tokens + [0] # Add padding
chunk_audio = TTSCPUModel.generate(tokens, voicepack, speed)
if chunk_audio is not None:
audio_chunks.append(chunk_audio)
else:
logger.error(f"No audio generated for chunk {i + 1}/{len(chunks)}")
except Exception as e:
logger.error(
f"Failed to generate audio for chunk {i + 1}/{len(chunks)}: '{chunk}'. Error: {str(e)}"
)
continue
if not audio_chunks:
raise ValueError("No audio chunks were generated successfully")
audio = (
np.concatenate(audio_chunks)
if len(audio_chunks) > 1
else audio_chunks[0]
)
else:
# Process single chunk
if TTSModel.get_device() == "cuda":
audio, _ = TTSGPUModel.generate(text, voicepack, voice[0], speed)
else:
ps = phonemize(text, voice[0])
tokens = tokenize(ps)
tokens = [0] + tokens + [0] # Add padding
audio = TTSCPUModel.generate(tokens, voicepack, speed)
processing_time = time.time() - start_time
return audio, processing_time
except Exception as e:
logger.error(f"Error in audio generation: {str(e)}")
raise
def _save_audio(self, audio: torch.Tensor, filepath: str):
"""Save audio to file"""
os.makedirs(os.path.dirname(filepath), exist_ok=True)
wavfile.write(filepath, 24000, audio)
def _audio_to_bytes(self, audio: torch.Tensor) -> bytes:
"""Convert audio tensor to WAV bytes"""
buffer = io.BytesIO()
wavfile.write(buffer, 24000, audio)
return buffer.getvalue()
def combine_voices(self, voices: List[str]) -> str:
"""Combine multiple voices into a new voice"""
if len(voices) < 2:
raise ValueError("At least 2 voices are required for combination")
# Load voices
t_voices: List[torch.Tensor] = []
v_name: List[str] = []
for voice in voices:
try:
voice_path = os.path.join(TTSModel.VOICES_DIR, f"{voice}.pt")
voicepack = torch.load(
voice_path, map_location=TTSModel.get_device(), weights_only=True
)
t_voices.append(voicepack)
v_name.append(voice)
except Exception as e:
raise ValueError(f"Failed to load voice {voice}: {str(e)}")
# Combine voices
try:
f: str = "_".join(v_name)
v = torch.mean(torch.stack(t_voices), dim=0)
combined_path = os.path.join(TTSModel.VOICES_DIR, f"{f}.pt")
# Save combined voice
try:
torch.save(v, combined_path)
except Exception as e:
raise RuntimeError(
f"Failed to save combined voice to {combined_path}: {str(e)}"
)
return f
except Exception as e:
if not isinstance(e, (ValueError, RuntimeError)):
raise RuntimeError(f"Error combining voices: {str(e)}")
raise
def list_voices(self) -> List[str]:
"""List all available voices"""
voices = []
try:
for file in os.listdir(TTSModel.VOICES_DIR):
if file.endswith(".pt"):
voices.append(file[:-3]) # Remove .pt extension
except Exception as e:
logger.error(f"Error listing voices: {str(e)}")
return sorted(voices)
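A usage sketch for the service layer; the second voice name is hypothetical, as only "af" is confirmed elsewhere in this commit:

service = TTSService()
print(service.list_voices())                     # e.g. ['af', ...]

combined = service.combine_voices(["af", "am"])  # "am" is a hypothetical voice name
assert combined == "af_am"                       # combined name is "_".join of the inputs

audio, processing_time = service._generate_audio("Hello world. How are you?", combined, speed=1.0)
service._save_audio(audio, f"output/{combined}.wav")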

View file

@@ -7,7 +7,8 @@ import numpy as np
import torch
import pytest
from api.src.services.tts import TTSModel, TTSService
from api.src.services.tts_model import TTSModel
from api.src.services.tts_service import TTSService
@pytest.fixture
@@ -68,12 +69,12 @@ def test_list_voices(mock_join, mock_listdir, tts_service):
assert "not_a_voice" not in voices
@patch("api.src.services.tts.TTSModel.get_instance")
@patch("api.src.services.tts.TTSModel.get_voicepack")
@patch("api.src.services.tts.normalize_text")
@patch("api.src.services.tts.phonemize")
@patch("api.src.services.tts.tokenize")
@patch("api.src.services.tts.generate")
@patch("api.src.services.tts_model.TTSModel.get_instance")
@patch("api.src.services.tts_model.TTSModel.get_voicepack")
@patch("kokoro.normalize_text")
@patch("kokoro.phonemize")
@patch("kokoro.tokenize")
@patch("kokoro.generate")
def test_generate_audio_empty_text(
mock_generate,
mock_tokenize,
@@ -90,12 +91,12 @@ def test_generate_audio_empty_text(
tts_service._generate_audio("", "af", 1.0)
@patch("api.src.services.tts.TTSModel.get_instance")
@patch("api.src.services.tts_model.TTSModel.get_instance")
@patch("os.path.exists")
@patch("api.src.services.tts.normalize_text")
@patch("api.src.services.tts.phonemize")
@patch("api.src.services.tts.tokenize")
@patch("api.src.services.tts.generate")
@patch("kokoro.normalize_text")
@patch("kokoro.phonemize")
@patch("kokoro.tokenize")
@patch("kokoro.generate")
@patch("torch.load")
def test_generate_audio_no_chunks(
mock_torch_load,
@@ -225,8 +226,8 @@ def test_generate_audio_success(
assert len(audio) > 0
@patch("api.src.services.tts.torch.cuda.is_available")
@patch("api.src.services.tts.build_model")
@patch("torch.cuda.is_available")
@patch("models.build_model")
def test_model_initialization_cuda(mock_build_model, mock_cuda_available):
"""Test model initialization with CUDA"""
mock_cuda_available.return_value = True
@@ -257,8 +258,8 @@ def test_model_initialization_cpu(mock_build_model, mock_cuda_available):
mock_build_model.assert_called_once()
@patch("api.src.services.tts.TTSService._get_voice_path")
@patch("api.src.services.tts.TTSModel.get_instance")
@patch("api.src.services.tts_service.TTSService._get_voice_path")
@patch("api.src.services.tts_model.TTSModel.get_instance")
def test_voicepack_loading_error(mock_get_instance, mock_get_voice_path):
"""Test voicepack loading error handling"""
mock_get_voice_path.return_value = None
@@ -271,7 +272,7 @@ def test_voicepack_loading_error(mock_get_instance, mock_get_voice_path):
service._generate_audio("test", "nonexistent_voice", 1.0)
@patch("api.src.services.tts.TTSModel")
@patch("api.src.services.tts_model.TTSModel")
def test_save_audio(mock_tts_model, tts_service, sample_audio, tmp_path):
"""Test saving audio to file"""
output_dir = os.path.join(tmp_path, "test_output")
@@ -284,7 +285,7 @@ def test_save_audio(mock_tts_model, tts_service, sample_audio, tmp_path):
assert os.path.getsize(output_path) > 0
@patch("api.src.services.tts.TTSModel.get_instance")
@patch("api.src.services.tts_model.TTSModel.get_instance")
@patch("os.path.exists")
@patch("api.src.services.tts.normalize_text")
@patch("api.src.services.tts.generate")

View file

@@ -0,0 +1,216 @@
{
"results": [
{
"tokens": 100,
"processing_time": 14.349808931350708,
"output_length": 31.15,
"rtf": 0.46,
"elapsed_time": 14.716031074523926
},
{
"tokens": 200,
"processing_time": 28.341803312301636,
"output_length": 62.6,
"rtf": 0.45,
"elapsed_time": 43.44207406044006
},
{
"tokens": 300,
"processing_time": 43.352553606033325,
"output_length": 96.325,
"rtf": 0.45,
"elapsed_time": 87.26906609535217
},
{
"tokens": 400,
"processing_time": 71.02449822425842,
"output_length": 128.575,
"rtf": 0.55,
"elapsed_time": 158.7198133468628
},
{
"tokens": 500,
"processing_time": 70.92521691322327,
"output_length": 158.575,
"rtf": 0.45,
"elapsed_time": 230.01379895210266
},
{
"tokens": 600,
"processing_time": 83.6328592300415,
"output_length": 189.25,
"rtf": 0.44,
"elapsed_time": 314.02610969543457
},
{
"tokens": 700,
"processing_time": 103.0810194015503,
"output_length": 222.075,
"rtf": 0.46,
"elapsed_time": 417.5678551197052
},
{
"tokens": 800,
"processing_time": 127.02162909507751,
"output_length": 253.85,
"rtf": 0.5,
"elapsed_time": 545.0128681659698
},
{
"tokens": 900,
"processing_time": 130.49781227111816,
"output_length": 283.775,
"rtf": 0.46,
"elapsed_time": 675.8943417072296
},
{
"tokens": 1000,
"processing_time": 154.76425909996033,
"output_length": 315.475,
"rtf": 0.49,
"elapsed_time": 831.0677945613861
}
],
"system_metrics": [
{
"timestamp": "2025-01-03T00:23:52.896889",
"cpu_percent": 4.5,
"ram_percent": 39.1,
"ram_used_gb": 24.86032485961914,
"gpu_memory_used": 1281.0
},
{
"timestamp": "2025-01-03T00:24:07.429461",
"cpu_percent": 4.5,
"ram_percent": 39.1,
"ram_used_gb": 24.847564697265625,
"gpu_memory_used": 1285.0
},
{
"timestamp": "2025-01-03T00:24:07.620587",
"cpu_percent": 2.7,
"ram_percent": 39.1,
"ram_used_gb": 24.846607208251953,
"gpu_memory_used": 1275.0
},
{
"timestamp": "2025-01-03T00:24:36.140754",
"cpu_percent": 5.4,
"ram_percent": 39.1,
"ram_used_gb": 24.857810974121094,
"gpu_memory_used": 1267.0
},
{
"timestamp": "2025-01-03T00:24:36.340675",
"cpu_percent": 6.2,
"ram_percent": 39.1,
"ram_used_gb": 24.85773468017578,
"gpu_memory_used": 1267.0
},
{
"timestamp": "2025-01-03T00:25:19.905634",
"cpu_percent": 29.1,
"ram_percent": 39.2,
"ram_used_gb": 24.920318603515625,
"gpu_memory_used": 1256.0
},
{
"timestamp": "2025-01-03T00:25:20.182219",
"cpu_percent": 20.0,
"ram_percent": 39.2,
"ram_used_gb": 24.930198669433594,
"gpu_memory_used": 1256.0
},
{
"timestamp": "2025-01-03T00:26:31.414760",
"cpu_percent": 5.3,
"ram_percent": 39.5,
"ram_used_gb": 25.127891540527344,
"gpu_memory_used": 1259.0
},
{
"timestamp": "2025-01-03T00:26:31.617256",
"cpu_percent": 3.6,
"ram_percent": 39.5,
"ram_used_gb": 25.126346588134766,
"gpu_memory_used": 1252.0
},
{
"timestamp": "2025-01-03T00:27:42.736097",
"cpu_percent": 10.5,
"ram_percent": 39.5,
"ram_used_gb": 25.100231170654297,
"gpu_memory_used": 1249.0
},
{
"timestamp": "2025-01-03T00:27:42.912870",
"cpu_percent": 5.3,
"ram_percent": 39.5,
"ram_used_gb": 25.098285675048828,
"gpu_memory_used": 1249.0
},
{
"timestamp": "2025-01-03T00:29:06.725264",
"cpu_percent": 8.9,
"ram_percent": 39.5,
"ram_used_gb": 25.123123168945312,
"gpu_memory_used": 1239.0
},
{
"timestamp": "2025-01-03T00:29:06.928826",
"cpu_percent": 5.5,
"ram_percent": 39.5,
"ram_used_gb": 25.128646850585938,
"gpu_memory_used": 1239.0
},
{
"timestamp": "2025-01-03T00:30:50.206349",
"cpu_percent": 49.6,
"ram_percent": 39.6,
"ram_used_gb": 25.162948608398438,
"gpu_memory_used": 1245.0
},
{
"timestamp": "2025-01-03T00:30:50.491837",
"cpu_percent": 14.8,
"ram_percent": 39.5,
"ram_used_gb": 25.13379669189453,
"gpu_memory_used": 1245.0
},
{
"timestamp": "2025-01-03T00:32:57.721467",
"cpu_percent": 6.2,
"ram_percent": 39.6,
"ram_used_gb": 25.187721252441406,
"gpu_memory_used": 1384.0
},
{
"timestamp": "2025-01-03T00:32:57.913350",
"cpu_percent": 3.6,
"ram_percent": 39.6,
"ram_used_gb": 25.199390411376953,
"gpu_memory_used": 1384.0
},
{
"timestamp": "2025-01-03T00:35:08.608730",
"cpu_percent": 6.3,
"ram_percent": 39.8,
"ram_used_gb": 25.311710357666016,
"gpu_memory_used": 1330.0
},
{
"timestamp": "2025-01-03T00:35:08.791851",
"cpu_percent": 5.3,
"ram_percent": 39.8,
"ram_used_gb": 25.326683044433594,
"gpu_memory_used": 1333.0
},
{
"timestamp": "2025-01-03T00:37:43.782406",
"cpu_percent": 6.8,
"ram_percent": 40.6,
"ram_used_gb": 25.803058624267578,
"gpu_memory_used": 1409.0
}
]
}

View file

@@ -0,0 +1,19 @@
=== Benchmark Statistics (with correct RTF) ===
Overall Stats:
Total tokens processed: 5500
Total audio generated: 1741.65s
Total test duration: 831.07s
Average processing rate: 6.72 tokens/second
Average RTF: 0.47x
Per-chunk Stats:
Average chunk size: 550.00 tokens
Min chunk size: 100.00 tokens
Max chunk size: 1000.00 tokens
Average processing time: 82.70s
Average output length: 174.17s
Performance Ranges:
Processing rate range: 5.63 - 7.17 tokens/second
RTF range: 0.44x - 0.55x

View file

@@ -0,0 +1,323 @@
import os
import json
import time
import subprocess
from datetime import datetime
import pandas as pd
import psutil
import seaborn as sns
import requests
import tiktoken
import scipy.io.wavfile as wavfile
import matplotlib.pyplot as plt
enc = tiktoken.get_encoding("cl100k_base")
def setup_plot(fig, ax, title):
"""Configure plot styling"""
ax.grid(True, linestyle="--", alpha=0.3, color="#ffffff")
ax.set_title(title, pad=20, fontsize=16, fontweight="bold", color="#ffffff")
ax.set_xlabel(ax.get_xlabel(), fontsize=14, fontweight="medium", color="#ffffff")
ax.set_ylabel(ax.get_ylabel(), fontsize=14, fontweight="medium", color="#ffffff")
ax.tick_params(labelsize=12, colors="#ffffff")
for spine in ax.spines.values():
spine.set_color("#ffffff")
spine.set_alpha(0.3)
spine.set_linewidth(0.5)
ax.set_facecolor("#1a1a2e")
fig.patch.set_facecolor("#1a1a2e")
return fig, ax
def get_text_for_tokens(text: str, num_tokens: int) -> str:
"""Get a slice of text that contains exactly num_tokens tokens"""
tokens = enc.encode(text)
if num_tokens > len(tokens):
return text
return enc.decode(tokens[:num_tokens])
def get_audio_length(audio_data: bytes) -> float:
"""Get audio length in seconds from bytes data"""
temp_path = "examples/benchmarks/output/temp.wav"
os.makedirs(os.path.dirname(temp_path), exist_ok=True)
with open(temp_path, "wb") as f:
f.write(audio_data)
try:
rate, data = wavfile.read(temp_path)
return len(data) / rate
finally:
if os.path.exists(temp_path):
os.remove(temp_path)
def get_gpu_memory():
"""Get GPU memory usage using nvidia-smi"""
try:
result = subprocess.check_output(
["nvidia-smi", "--query-gpu=memory.used", "--format=csv,nounits,noheader"]
)
return float(result.decode("utf-8").strip())
except (subprocess.CalledProcessError, FileNotFoundError):
return None
def get_system_metrics():
"""Get current system metrics"""
# Take multiple CPU measurements over a short period
samples = []
for _ in range(3): # Take 3 samples
# Get both overall and per-CPU percentages
overall_cpu = psutil.cpu_percent(interval=0.1)
per_cpu = psutil.cpu_percent(percpu=True)
avg_per_cpu = sum(per_cpu) / len(per_cpu)
# Use the maximum of overall and average per-CPU
samples.append(max(overall_cpu, avg_per_cpu))
# Use the maximum CPU usage from all samples
cpu_usage = round(max(samples), 2)
metrics = {
"timestamp": datetime.now().isoformat(),
"cpu_percent": cpu_usage,
"ram_percent": psutil.virtual_memory().percent,
"ram_used_gb": psutil.virtual_memory().used / (1024**3),
}
gpu_mem = get_gpu_memory()
if gpu_mem is not None:
metrics["gpu_memory_used"] = gpu_mem
return metrics
def real_time_factor(processing_time: float, audio_length: float, decimals: int = 2) -> float:
"""Calculate Real-Time Factor (RTF) as processing-time / length-of-audio"""
rtf = processing_time / audio_length
return round(rtf, decimals)
def make_tts_request(text: str, timeout: int = 1800) -> tuple[float, float]:
"""Make TTS request using OpenAI-compatible endpoint and return processing time and output length"""
try:
start_time = time.time()
response = requests.post(
"http://localhost:8880/v1/audio/speech",
json={
"model": "kokoro",
"input": text,
"voice": "af",
"response_format": "wav",
},
timeout=timeout,
)
response.raise_for_status()
processing_time = round(time.time() - start_time, 2)
audio_length = round(get_audio_length(response.content), 2)
# Save the audio file
token_count = len(enc.encode(text))
output_file = f"examples/benchmarks/output/chunk_{token_count}_tokens.wav"
os.makedirs(os.path.dirname(output_file), exist_ok=True)
with open(output_file, "wb") as f:
f.write(response.content)
print(f"Saved audio to {output_file}")
return processing_time, audio_length
except requests.exceptions.RequestException as e:
print(f"Error making request for text: {text[:50]}... Error: {str(e)}")
return None, None
except Exception as e:
print(f"Error processing text: {text[:50]}... Error: {str(e)}")
return None, None
def plot_system_metrics(metrics_data):
"""Create plots for system metrics over time"""
df = pd.DataFrame(metrics_data)
df["timestamp"] = pd.to_datetime(df["timestamp"])
elapsed_time = (df["timestamp"] - df["timestamp"].iloc[0]).dt.total_seconds()
baseline_cpu = df["cpu_percent"].iloc[0]
baseline_ram = df["ram_used_gb"].iloc[0]
baseline_gpu = df["gpu_memory_used"].iloc[0] / 1024 if "gpu_memory_used" in df.columns else None
if "gpu_memory_used" in df.columns:
df["gpu_memory_gb"] = df["gpu_memory_used"] / 1024
plt.style.use("dark_background")
has_gpu = "gpu_memory_used" in df.columns
num_plots = 3 if has_gpu else 2
fig, axes = plt.subplots(num_plots, 1, figsize=(15, 5 * num_plots))
fig.patch.set_facecolor("#1a1a2e")
window = min(5, len(df) // 2)
# Plot CPU Usage
smoothed_cpu = df["cpu_percent"].rolling(window=window, center=True).mean()
sns.lineplot(x=elapsed_time, y=smoothed_cpu, ax=axes[0], color="#ff2a6d", linewidth=2)
axes[0].axhline(y=baseline_cpu, color="#05d9e8", linestyle="--", alpha=0.5, label="Baseline")
axes[0].set_xlabel("Time (seconds)")
axes[0].set_ylabel("CPU Usage (%)")
axes[0].set_title("CPU Usage Over Time")
axes[0].set_ylim(0, max(df["cpu_percent"]) * 1.1)
axes[0].legend()
# Plot RAM Usage
smoothed_ram = df["ram_used_gb"].rolling(window=window, center=True).mean()
sns.lineplot(x=elapsed_time, y=smoothed_ram, ax=axes[1], color="#05d9e8", linewidth=2)
axes[1].axhline(y=baseline_ram, color="#ff2a6d", linestyle="--", alpha=0.5, label="Baseline")
axes[1].set_xlabel("Time (seconds)")
axes[1].set_ylabel("RAM Usage (GB)")
axes[1].set_title("RAM Usage Over Time")
axes[1].set_ylim(0, max(df["ram_used_gb"]) * 1.1)
axes[1].legend()
# Plot GPU Memory if available
if has_gpu:
smoothed_gpu = df["gpu_memory_gb"].rolling(window=window, center=True).mean()
sns.lineplot(x=elapsed_time, y=smoothed_gpu, ax=axes[2], color="#ff2a6d", linewidth=2)
axes[2].axhline(y=baseline_gpu, color="#05d9e8", linestyle="--", alpha=0.5, label="Baseline")
axes[2].set_xlabel("Time (seconds)")
axes[2].set_ylabel("GPU Memory (GB)")
axes[2].set_title("GPU Memory Usage Over Time")
axes[2].set_ylim(0, max(df["gpu_memory_gb"]) * 1.1)
axes[2].legend()
for ax in axes:
ax.grid(True, linestyle="--", alpha=0.3)
ax.set_facecolor("#1a1a2e")
for spine in ax.spines.values():
spine.set_color("#ffffff")
spine.set_alpha(0.3)
plt.tight_layout()
plt.savefig("examples/benchmarks/system_usage_rtf.png", dpi=300, bbox_inches="tight")
plt.close()
def main():
os.makedirs("examples/benchmarks/output", exist_ok=True)
with open("examples/benchmarks/the_time_machine_hg_wells.txt", "r", encoding="utf-8") as f:
text = f.read()
total_tokens = len(enc.encode(text))
print(f"Total tokens in file: {total_tokens}")
# Token sizes to test: 100 to 1000 tokens in steps of 100
token_sizes = list(range(100, 1001, 100))
print(f"Testing sizes: {token_sizes}")
results = []
system_metrics = []
test_start_time = time.time()
for num_tokens in token_sizes:
chunk = get_text_for_tokens(text, num_tokens)
actual_tokens = len(enc.encode(chunk))
print(f"\nProcessing chunk with {actual_tokens} tokens:")
print(f"Text preview: {chunk[:100]}...")
system_metrics.append(get_system_metrics())
processing_time, audio_length = make_tts_request(chunk)
if processing_time is None or audio_length is None:
print("Breaking loop due to error")
break
system_metrics.append(get_system_metrics())
# Calculate RTF using the correct formula
rtf = real_time_factor(processing_time, audio_length)
results.append({
"tokens": actual_tokens,
"processing_time": processing_time,
"output_length": audio_length,
"rtf": rtf,
"elapsed_time": round(time.time() - test_start_time, 2),
})
with open("examples/benchmarks/benchmark_results_rtf.json", "w") as f:
json.dump({"results": results, "system_metrics": system_metrics}, f, indent=2)
df = pd.DataFrame(results)
if df.empty:
print("No data to plot")
return
df["tokens_per_second"] = df["tokens"] / df["processing_time"]
with open("examples/benchmarks/benchmark_stats_rtf.txt", "w") as f:
f.write("=== Benchmark Statistics (with correct RTF) ===\n\n")
f.write("Overall Stats:\n")
f.write(f"Total tokens processed: {df['tokens'].sum()}\n")
f.write(f"Total audio generated: {df['output_length'].sum():.2f}s\n")
f.write(f"Total test duration: {df['elapsed_time'].max():.2f}s\n")
f.write(f"Average processing rate: {df['tokens_per_second'].mean():.2f} tokens/second\n")
f.write(f"Average RTF: {df['rtf'].mean():.2f}x\n\n")
f.write("Per-chunk Stats:\n")
f.write(f"Average chunk size: {df['tokens'].mean():.2f} tokens\n")
f.write(f"Min chunk size: {df['tokens'].min():.2f} tokens\n")
f.write(f"Max chunk size: {df['tokens'].max():.2f} tokens\n")
f.write(f"Average processing time: {df['processing_time'].mean():.2f}s\n")
f.write(f"Average output length: {df['output_length'].mean():.2f}s\n\n")
f.write("Performance Ranges:\n")
f.write(f"Processing rate range: {df['tokens_per_second'].min():.2f} - {df['tokens_per_second'].max():.2f} tokens/second\n")
f.write(f"RTF range: {df['rtf'].min():.2f}x - {df['rtf'].max():.2f}x\n")
plt.style.use("dark_background")
# Plot Processing Time vs Token Count
fig, ax = plt.subplots(figsize=(12, 8))
sns.scatterplot(data=df, x="tokens", y="processing_time", s=100, alpha=0.6, color="#ff2a6d")
sns.regplot(data=df, x="tokens", y="processing_time", scatter=False, color="#05d9e8", line_kws={"linewidth": 2})
corr = df["tokens"].corr(df["processing_time"])
plt.text(0.05, 0.95, f"Correlation: {corr:.2f}", transform=ax.transAxes, fontsize=10, color="#ffffff",
bbox=dict(facecolor="#1a1a2e", edgecolor="#ffffff", alpha=0.7))
setup_plot(fig, ax, "Processing Time vs Input Size")
ax.set_xlabel("Number of Input Tokens")
ax.set_ylabel("Processing Time (seconds)")
plt.savefig("examples/benchmarks/processing_time_rtf.png", dpi=300, bbox_inches="tight")
plt.close()
# Plot RTF vs Token Count
fig, ax = plt.subplots(figsize=(12, 8))
sns.scatterplot(data=df, x="tokens", y="rtf", s=100, alpha=0.6, color="#ff2a6d")
sns.regplot(data=df, x="tokens", y="rtf", scatter=False, color="#05d9e8", line_kws={"linewidth": 2})
corr = df["tokens"].corr(df["rtf"])
plt.text(0.05, 0.95, f"Correlation: {corr:.2f}", transform=ax.transAxes, fontsize=10, color="#ffffff",
bbox=dict(facecolor="#1a1a2e", edgecolor="#ffffff", alpha=0.7))
setup_plot(fig, ax, "Real-Time Factor vs Input Size")
ax.set_xlabel("Number of Input Tokens")
ax.set_ylabel("Real-Time Factor (processing time / audio length)")
plt.savefig("examples/benchmarks/realtime_factor_rtf.png", dpi=300, bbox_inches="tight")
plt.close()
plot_system_metrics(system_metrics)
print("\nResults saved to:")
print("- examples/benchmarks/benchmark_results_rtf.json")
print("- examples/benchmarks/benchmark_stats_rtf.txt")
print("- examples/benchmarks/processing_time_rtf.png")
print("- examples/benchmarks/realtime_factor_rtf.png")
print("- examples/benchmarks/system_usage_rtf.png")
print("\nAudio files saved in examples/benchmarks/output/")
if __name__ == "__main__":
main()
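The stored RTF values can be reproduced from the other fields in benchmark_results_rtf.json; for example, for the first row:

rtf = real_time_factor(14.349808931350708, 31.15)  # processing_time / output_length
assert rtf == 0.46  # matches the recorded value for the 100-token chunk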

Binary file not shown (added image, 254 KiB).

Binary file not shown (added image, 208 KiB).