Merge pull request #9 from remsky/feat/streaming

- Added streaming support
- Improved model warmup operations
- Minor optimizations to inference model structure
- Chunking configurations added
remsky 2025-01-06 03:53:50 -07:00 committed by GitHub
commit ab8e3c98f6
76 changed files with 6247 additions and 4141 deletions

BIN
.coverage

Binary file not shown.

3
.gitignore vendored
View file

@ -23,4 +23,7 @@ examples/assorted_checks/test_openai/output/*
examples/assorted_checks/test_voices/output/*
examples/assorted_checks/test_formats/output/*
examples/assorted_checks/benchmarks/output_audio_stream/*
ui/RepoScreenshot.png
examples/assorted_checks/benchmarks/output_audio_stream_openai/*

View file

@ -10,12 +10,12 @@
Dockerized FastAPI wrapper for [Kokoro-82M](https://huggingface.co/hexgrad/Kokoro-82M) text-to-speech model
- OpenAI-compatible Speech endpoint, with voice combination functionality
- NVIDIA GPU accelerated inference (or CPU) option
- very fast generation time (~35x real time generation speed via 4060Ti)
- very fast generation time (~30x real time speed via 4060Ti)
- automatic chunking/stitching for long texts
- streaming support w/ variable chunking to control latency
- simple audio generation web ui utility
## Quick Start
The service can be accessed through either the API endpoints or the Gradio web interface.
@ -129,7 +129,7 @@ response = requests.post(
)
```
<p align="center">
<img src="examples/benchmarks/analysis_comparison.png" width="80%" alt="Voice Analysis Comparison" style="border: 2px solid #333; padding: 10px;">
<img src="assets/voice_analysis.png" width="80%" alt="Voice Analysis Comparison" style="border: 2px solid #333; padding: 10px;">
</p>
</details>
@ -144,7 +144,7 @@ response = requests.post(
- pcm
<p align="center">
<img src="examples/benchmarks/format_comparison.png" width="80%" alt="Audio Format Comparison" style="border: 2px solid #333; padding: 10px;">
<img src="assets/format_comparison.png" width="80%" alt="Audio Format Comparison" style="border: 2px solid #333; padding: 10px;">
</p>
</details>
@ -162,6 +162,76 @@ If you only want the API, just comment out everything in the docker-compose.yml
Currently, voices created via the API are accessible here, but voice combination/creation has not yet been added.
</details>
<details>
<summary>Streaming Support</summary>
```python
# OpenAI-compatible streaming
from openai import OpenAI
client = OpenAI(
base_url="http://localhost:8880/v1", api_key="not-needed")
# Stream to file
with client.audio.speech.with_streaming_response.create(
model="kokoro",
voice="af_bella",
input="Hello world!"
) as response:
response.stream_to_file("output.mp3")
# Stream to speakers (requires PyAudio)
import pyaudio
player = pyaudio.PyAudio().open(
format=pyaudio.paInt16,
channels=1,
rate=24000,
output=True
)
with client.audio.speech.with_streaming_response.create(
model="kokoro",
voice="af_bella",
response_format="pcm",
input="Hello world!"
) as response:
for chunk in response.iter_bytes(chunk_size=1024):
player.write(chunk)
```
Or via requests:
```python
import requests
response = requests.post(
"http://localhost:8880/v1/audio/speech",
json={
"input": "Hello world!",
"voice": "af_bella",
"response_format": "pcm"
},
stream=True
)
for chunk in response.iter_content(chunk_size=1024):
if chunk:
# Process streaming chunks
pass
```
<p align="center">
<img src="assets/gpu_first_token_timeline_openai.png" width="45%" alt="GPU First Token Timeline" style="border: 2px solid #333; padding: 10px; margin-right: 1%;">
<img src="assets/cpu_first_token_timeline_stream_openai.png" width="45%" alt="CPU First Token Timeline" style="border: 2px solid #333; padding: 10px;">
</p>
Key Streaming Metrics:
- First token latency @ chunk size
- ~300ms (GPU) @ 400
- ~3500ms (CPU) @ 200
- Adjustable chunking settings for real-time playback
*Note: Artifacts in intonation can increase with smaller chunks*
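For reference, time-to-first-chunk can be measured client-side with a short script along these lines (a rough sketch against the same endpoint and payload as above, mirroring the first-token benchmark script included in this change):
```python
import time
import requests

start = time.time()
response = requests.post(
    "http://localhost:8880/v1/audio/speech",
    json={
        "input": "Hello world!",
        "voice": "af_bella",
        "response_format": "pcm",
        "stream": True,
    },
    stream=True,
)
response.raise_for_status()

first_chunk_time = None
for chunk in response.iter_content(chunk_size=1024):
    if chunk and first_chunk_time is None:
        first_chunk_time = time.time() - start
        print(f"Time to first audio chunk: {first_chunk_time * 1000:.0f}ms")
```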
</details>
## Processing Details
<details>
<summary>Performance Benchmarks</summary>
@ -175,8 +245,8 @@ Benchmarking was performed on generation via the local API using text lengths up
- H.G. Wells - The Time Machine (full text)
<p align="center">
<img src="examples/benchmarks/processing_time.png" width="45%" alt="Processing Time" style="border: 2px solid #333; padding: 10px; margin-right: 1%;">
<img src="examples/benchmarks/realtime_factor.png" width="45%" alt="Realtime Factor" style="border: 2px solid #333; padding: 10px;">
<img src="assets/gpu_processing_time.png" width="45%" alt="Processing Time" style="border: 2px solid #333; padding: 10px; margin-right: 1%;">
<img src="assets/gpu_realtime_factor.png" width="45%" alt="Realtime Factor" style="border: 2px solid #333; padding: 10px;">
</p>
Key Performance Metrics:

View file

@ -18,6 +18,8 @@ class Settings(BaseSettings):
onnx_model_path: str = "kokoro-v0_19.onnx"
voices_dir: str = "voices"
sample_rate: int = 24000
max_chunk_size: int = 300 # Maximum size of text chunks for processing
gap_trim_ms: int = 250 # Amount to trim from streaming chunk ends in milliseconds
# ONNX Optimization Settings
onnx_num_threads: int = 4 # Number of threads for intra-op parallelism
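Since `Settings` is a pydantic `BaseSettings` class, the new chunking knobs should be overridable via environment variables. A small sketch, assuming pydantic's default field-name mapping (the `MAX_CHUNK_SIZE`/`GAP_TRIM_MS` names are not spelled out elsewhere in this diff):
```python
import os

# Hypothetical overrides; smaller chunks trade some intonation quality for lower latency.
os.environ["MAX_CHUNK_SIZE"] = "200"
os.environ["GAP_TRIM_MS"] = "100"

from api.src.core.config import Settings  # path per this repo's layout

settings = Settings()
print(settings.max_chunk_size, settings.gap_trim_ms)  # 200 100, if the mapping holds
```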

View file

@ -0,0 +1,9 @@
In a village of La Mancha, the name of which I have no desire to call
to mind, there lived not long since one of those gentlemen that keep a
lance in the lance-rack, an old buckler, a lean hack, and a greyhound
for coursing. An olla of rather more beef than mutton, a salad on most
nights, scraps on Saturdays, lentils on Fridays, and a pigeon or so
extra on Sundays, made away with three-quarters of his income. The rest
of it went in a doublet of fine cloth and velvet breeches and shoes to
match for holidays, while on week-days he made a brave figure in his
best homespun.

View file

@ -22,9 +22,28 @@ async def lifespan(app: FastAPI):
logger.info("Loading TTS model and voice packs...")
# Initialize the main model with warm-up
voicepack_count = TTSModel.setup()
logger.info(f"Model loaded and warmed up on {TTSModel.get_device()}")
logger.info(f"{voicepack_count} voice packs loaded successfully")
voicepack_count = await TTSModel.setup()
# boundary = "█████╗"*9
boundary = "░" * 24
startup_msg = f"""
{boundary}
{boundary}
"""
# TODO: Improve CPU warmup, threads, memory, etc
startup_msg += f"\nModel warmed up on {TTSModel.get_device()}"
startup_msg += f"\n{voicepack_count} voice packs loaded\n"
startup_msg += f"\n{boundary}\n"
logger.info(startup_msg)
yield

View file

@ -2,10 +2,12 @@ from typing import List
from loguru import logger
from fastapi import Depends, Response, APIRouter, HTTPException
from fastapi import Header
from fastapi.responses import StreamingResponse
from ..services.tts_service import TTSService
from ..services.audio import AudioService
from ..structures.schemas import OpenAISpeechRequest
from typing import AsyncGenerator
router = APIRouter(
tags=["OpenAI Compatible TTS"],
@ -18,9 +20,23 @@ def get_tts_service() -> TTSService:
return TTSService() # Initialize TTSService with default settings
async def stream_audio_chunks(tts_service: TTSService, request: OpenAISpeechRequest) -> AsyncGenerator[bytes, None]:
"""Stream audio chunks as they're generated"""
async for chunk in tts_service.generate_audio_stream(
text=request.input,
voice=request.voice,
speed=request.speed,
output_format=request.response_format
):
yield chunk
@router.post("/audio/speech")
async def create_speech(
request: OpenAISpeechRequest, tts_service: TTSService = Depends(get_tts_service)
request: OpenAISpeechRequest,
tts_service: TTSService = Depends(get_tts_service),
x_raw_response: str = Header(None, alias="x-raw-response"),
):
"""OpenAI-compatible endpoint for text-to-speech"""
try:
@ -31,24 +47,53 @@ async def create_speech(
f"Voice '{request.voice}' not found. Available voices: {', '.join(sorted(available_voices))}"
)
# Generate audio directly using TTSService's method
audio, _ = tts_service._generate_audio(
text=request.input,
voice=request.voice,
speed=request.speed,
stitch_long_output=True,
)
# Set content type based on format
content_type = {
"mp3": "audio/mpeg",
"opus": "audio/opus",
"aac": "audio/aac",
"flac": "audio/flac",
"wav": "audio/wav",
"pcm": "audio/pcm",
}.get(request.response_format, f"audio/{request.response_format}")
# Convert to requested format
content = AudioService.convert_audio(audio, 24000, request.response_format)
# Check if streaming is requested (default for OpenAI client)
if request.stream:
# Stream audio chunks as they're generated
return StreamingResponse(
stream_audio_chunks(tts_service, request),
media_type=content_type,
headers={
"Content-Disposition": f"attachment; filename=speech.{request.response_format}",
"X-Accel-Buffering": "no", # Disable proxy buffering
"Cache-Control": "no-cache", # Prevent caching
},
)
else:
# Generate complete audio
audio, _ = tts_service._generate_audio(
text=request.input,
voice=request.voice,
speed=request.speed,
stitch_long_output=True,
)
return Response(
content=content,
media_type=f"audio/{request.response_format}",
headers={
"Content-Disposition": f"attachment; filename=speech.{request.response_format}"
},
)
# Convert to requested format
content = AudioService.convert_audio(
audio,
24000,
request.response_format,
is_first_chunk=True,
stream=False)
return Response(
content=content,
media_type=content_type,
headers={
"Content-Disposition": f"attachment; filename=speech.{request.response_format}",
"Cache-Control": "no-cache", # Prevent caching
},
)
except ValueError as e:
logger.error(f"Invalid request: {str(e)}")

View file

@ -4,15 +4,61 @@ from io import BytesIO
import numpy as np
import soundfile as sf
import scipy.io.wavfile as wavfile
from loguru import logger
from ..core.config import settings
class AudioNormalizer:
"""Handles audio normalization state for a single stream"""
def __init__(self):
self.int16_max = np.iinfo(np.int16).max
self.chunk_trim_ms = settings.gap_trim_ms
self.sample_rate = 24000 # Sample rate of the audio
self.samples_to_trim = int(self.chunk_trim_ms * self.sample_rate / 1000)
def normalize(self, audio_data: np.ndarray, is_last_chunk: bool = False) -> np.ndarray:
"""Normalize audio data to int16 range and trim chunk boundaries"""
# Convert to float32 if not already
audio_float = audio_data.astype(np.float32)
# Normalize to [-1, 1] range first
if np.max(np.abs(audio_float)) > 0:
audio_float = audio_float / np.max(np.abs(audio_float))
# Trim end of non-final chunks to reduce gaps
if not is_last_chunk and len(audio_float) > self.samples_to_trim:
audio_float = audio_float[:-self.samples_to_trim]
# Scale to int16 range
return (audio_float * self.int16_max).astype(np.int16)
class AudioService:
"""Service for audio format conversions"""
# Default audio format settings balanced for speed and compression
DEFAULT_SETTINGS = {
"mp3": {
"bitrate_mode": "CONSTANT", # Faster than variable bitrate
"compression_level": 0.0, # Balanced compression
},
"opus": {
"compression_level": 0.0, # Good balance for speech
},
"flac": {
"compression_level": 0.0, # Light compression, still fast
}
}
@staticmethod
def convert_audio(
audio_data: np.ndarray, sample_rate: int, output_format: str
audio_data: np.ndarray,
sample_rate: int,
output_format: str,
is_first_chunk: bool = True,
is_last_chunk: bool = False,
normalizer: AudioNormalizer = None,
format_settings: dict = None,
stream: bool = True
) -> bytes:
"""Convert audio data to specified format
@ -20,6 +66,20 @@ class AudioService:
audio_data: Numpy array of audio samples
sample_rate: Sample rate of the audio
output_format: Target format (wav, mp3, opus, flac, pcm)
is_first_chunk: Whether this is the first chunk of a stream
normalizer: Optional AudioNormalizer instance for consistent normalization across chunks
format_settings: Optional dict of format-specific settings to override defaults
Example: {
"mp3": {
"bitrate_mode": "VARIABLE",
"compression_level": 0.8
}
}
Default settings balance speed and compression:
optimized for low-latency localhost use at compression level 0.0
- MP3: constant bitrate, no compression (0.0)
- OPUS: no compression (0.0)
- FLAC: no compression (0.0)
Returns:
Bytes of the converted audio
@ -27,34 +87,58 @@ class AudioService:
buffer = BytesIO()
try:
if output_format == "wav":
logger.info("Writing to WAV format...")
# Ensure audio_data is in int16 format for WAV
audio_data_wav = (
audio_data / np.abs(audio_data).max() * np.iinfo(np.int16).max
).astype(np.int16) # Normalize
sf.write(buffer, audio_data_wav, sample_rate, format="WAV")
elif output_format == "mp3":
logger.info("Converting to MP3 format...")
# soundfile can write MP3 if ffmpeg or libsox is installed
sf.write(buffer, audio_data, sample_rate, format="MP3")
elif output_format == "opus":
logger.info("Converting to Opus format...")
sf.write(buffer, audio_data, sample_rate, format="OGG", subtype="OPUS")
elif output_format == "flac":
logger.info("Converting to FLAC format...")
sf.write(buffer, audio_data, sample_rate, format="FLAC")
elif output_format == "pcm":
logger.info("Extracting PCM data...")
# Ensure audio_data is in int16 format for PCM
audio_data_pcm = (
audio_data / np.abs(audio_data).max() * np.iinfo(np.int16).max
).astype(np.int16) # Normalize
buffer.write(audio_data_pcm.tobytes())
# Always normalize audio to ensure proper amplitude scaling
if stream:
if normalizer is None:
normalizer = AudioNormalizer()
normalized_audio = normalizer.normalize(audio_data, is_last_chunk=is_last_chunk)
else:
raise ValueError(
f"Format {output_format} not supported. Supported formats are: wav, mp3, opus, flac, pcm."
)
normalized_audio = audio_data
if output_format == "pcm":
# Raw 16-bit PCM samples, no header
buffer.write(normalized_audio.tobytes())
elif output_format == "wav":
if stream:
# Use soundfile for streaming to ensure proper headers
sf.write(buffer, normalized_audio, sample_rate, format="WAV", subtype='PCM_16')
else:
# Trying scipy.io.wavfile for non-streaming WAV generation
# seems faster than soundfile
# avoids overhead from header generation and PCM encoding
wavfile.write(buffer, sample_rate, normalized_audio)
elif output_format == "mp3":
# Use format settings or defaults
settings = format_settings.get("mp3", {}) if format_settings else {}
settings = {**AudioService.DEFAULT_SETTINGS["mp3"], **settings}
sf.write(
buffer, normalized_audio,
sample_rate, format="MP3",
**settings
)
elif output_format == "opus":
settings = format_settings.get("opus", {}) if format_settings else {}
settings = {**AudioService.DEFAULT_SETTINGS["opus"], **settings}
sf.write(buffer, normalized_audio, sample_rate, format="OGG",
subtype="OPUS", **settings)
elif output_format == "flac":
if is_first_chunk:
logger.info("Starting FLAC stream...")
settings = format_settings.get("flac", {}) if format_settings else {}
settings = {**AudioService.DEFAULT_SETTINGS["flac"], **settings}
sf.write(buffer, normalized_audio, sample_rate, format="FLAC",
subtype='PCM_16', **settings)
else:
if output_format == "aac":
raise ValueError(
"Format aac not supported. Supported formats are: wav, mp3, opus, flac, pcm."
)
else:
raise ValueError(
f"Format {output_format} not supported. Supported formats are: wav, mp3, opus, flac, pcm."
)
buffer.seek(0)
return buffer.getvalue()
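A hedged sketch of the intended calling pattern for streaming: a single `AudioNormalizer` is shared across all chunks of one stream so levels stay consistent and the ends of non-final chunks get trimmed (dummy noise stands in for model output; the import path follows the test layout in this PR):
```python
import numpy as np

from api.src.services.audio import AudioNormalizer, AudioService

# One normalizer per stream, reused for every chunk.
normalizer = AudioNormalizer()
chunks = [np.random.randn(24000).astype(np.float32) for _ in range(3)]  # 3 x 1s of noise

encoded = []
for i, chunk in enumerate(chunks):
    encoded.append(
        AudioService.convert_audio(
            chunk,
            24000,
            "pcm",
            is_first_chunk=(i == 0),
            is_last_chunk=(i == len(chunks) - 1),
            normalizer=normalizer,
        )
    )
# Non-final chunks come back gap_trim_ms shorter; the last chunk keeps its full length.
```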

View file

@ -0,0 +1,52 @@
"""Text chunking service"""
import re
from ...core.config import settings
def split_text(text: str, max_chunk=None):
"""Split text into chunks on natural pause points
Args:
text: Text to split into chunks
max_chunk: Maximum chunk size (defaults to settings.max_chunk_size)
"""
if max_chunk is None:
max_chunk = settings.max_chunk_size
if not isinstance(text, str):
text = str(text) if text is not None else ""
text = text.strip()
if not text:
return
# First split into sentences
sentences = re.split(r"(?<=[.!?])\s+", text)
for sentence in sentences:
sentence = sentence.strip()
if not sentence:
continue
# For medium-length sentences, split on punctuation
if len(sentence) > max_chunk: # Lower threshold for more consistent sizes
# First try splitting on semicolons and colons
parts = re.split(r"(?<=[;:])\s+", sentence)
for part in parts:
part = part.strip()
if not part:
continue
# If part is still long, split on commas
if len(part) > max_chunk:
subparts = re.split(r"(?<=,)\s+", part)
for subpart in subparts:
subpart = subpart.strip()
if subpart:
yield subpart
else:
yield part
else:
yield sentence
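A quick usage sketch of this generator (same import path the new `test_chunker.py` uses; the sample text and `max_chunk` value are only illustrative):
```python
from api.src.services.text_processing import chunker

text = "First sentence. Second sentence! A long clause, with commas, goes here."
for piece in chunker.split_text(text, max_chunk=30):
    print(repr(piece))
# Short sentences are yielded whole; the long one is split again on its commas.
```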

View file

@ -1,4 +1,5 @@
import re
from functools import lru_cache
def split_num(num: re.Match) -> str:
"""Handle number splitting for various formats"""
@ -48,6 +49,7 @@ def handle_decimal(num: re.Match) -> str:
a, b = num.group().split(".")
return " point ".join([a, " ".join(b)])
# @lru_cache(maxsize=1000) # Cache normalized text results
def normalize_text(text: str) -> str:
"""Normalize text for TTS processing

View file

@ -15,7 +15,7 @@ class TTSBaseModel(ABC):
VOICES_DIR = os.path.join(os.path.dirname(os.path.dirname(__file__)), "voices")
@classmethod
def setup(cls):
async def setup(cls):
"""Initialize model and setup voices"""
with cls._lock:
# Set device
@ -59,19 +59,23 @@ class TTSBaseModel(ABC):
except Exception as e:
logger.error(f"Error copying voice {voice_name}: {str(e)}")
# Warm up with default voice
# Load warmup text
try:
dummy_text = "Hello"
voice_path = os.path.join(cls.VOICES_DIR, "af.pt")
dummy_voicepack = torch.load(voice_path, map_location=cls._device, weights_only=True)
# Process text and generate audio
phonemes, tokens = cls.process_text(dummy_text, "a")
cls.generate_from_tokens(tokens, dummy_voicepack, 1.0)
logger.info("Model warm-up complete")
with open(os.path.join(os.path.dirname(os.path.dirname(__file__)), "core", "don_quixote.txt")) as f:
warmup_text = f.read()
except Exception as e:
logger.warning(f"Model warm-up failed: {e}")
logger.warning(f"Failed to load warmup text: {e}")
warmup_text = "This is a warmup text that will be split into chunks for processing."
# Use warmup service
from .warmup import WarmupService
warmup = WarmupService()
# Load and warm up voices
loaded_voices = warmup.load_voices()
await warmup.warmup_voices(warmup_text, loaded_voices)
logger.info("Model warm-up complete")
# Count voices in directory
voice_count = len([f for f in os.listdir(cls.VOICES_DIR) if f.endswith(".pt")])

View file

@ -1,6 +1,7 @@
import os
import numpy as np
import torch
import time
from loguru import logger
from models import build_model
from .text_processing import phonemize, tokenize
@ -8,42 +9,97 @@ from .text_processing import phonemize, tokenize
from .tts_base import TTSBaseModel
from ..core.config import settings
# @torch.no_grad()
# def forward(model, tokens, ref_s, speed):
# """Forward pass through the model"""
# device = ref_s.device
# tokens = torch.LongTensor([[0, *tokens, 0]]).to(device)
# input_lengths = torch.LongTensor([tokens.shape[-1]]).to(device)
# text_mask = length_to_mask(input_lengths).to(device)
# bert_dur = model.bert(tokens, attention_mask=(~text_mask).int())
# d_en = model.bert_encoder(bert_dur).transpose(-1, -2)
# s = ref_s[:, 128:]
# d = model.predictor.text_encoder(d_en, s, input_lengths, text_mask)
# x, _ = model.predictor.lstm(d)
# duration = model.predictor.duration_proj(x)
# duration = torch.sigmoid(duration).sum(axis=-1) / speed
# pred_dur = torch.round(duration).clamp(min=1).long()
# pred_aln_trg = torch.zeros(input_lengths, pred_dur.sum().item())
# c_frame = 0
# for i in range(pred_aln_trg.size(0)):
# pred_aln_trg[i, c_frame : c_frame + pred_dur[0, i].item()] = 1
# c_frame += pred_dur[0, i].item()
# en = d.transpose(-1, -2) @ pred_aln_trg.unsqueeze(0).to(device)
# F0_pred, N_pred = model.predictor.F0Ntrain(en, s)
# t_en = model.text_encoder(tokens, input_lengths, text_mask)
# asr = t_en @ pred_aln_trg.unsqueeze(0).to(device)
# return model.decoder(asr, F0_pred, N_pred, ref_s[:, :128]).squeeze().cpu().numpy()
@torch.no_grad()
def forward(model, tokens, ref_s, speed):
"""Forward pass through the model"""
"""Forward pass through the model with light optimizations that preserve output quality"""
device = ref_s.device
# Keep original token handling but optimize device placement
tokens = torch.LongTensor([[0, *tokens, 0]]).to(device)
input_lengths = torch.LongTensor([tokens.shape[-1]]).to(device)
text_mask = length_to_mask(input_lengths).to(device)
# BERT and encoder pass
bert_dur = model.bert(tokens, attention_mask=(~text_mask).int())
d_en = model.bert_encoder(bert_dur).transpose(-1, -2)
s = ref_s[:, 128:]
d = model.predictor.text_encoder(d_en, s, input_lengths, text_mask)
# Split reference signal once for efficiency
s_content = ref_s[:, 128:]
s_ref = ref_s[:, :128]
# Predictor forward pass
d = model.predictor.text_encoder(d_en, s_content, input_lengths, text_mask)
x, _ = model.predictor.lstm(d)
# Duration prediction - keeping original logic
duration = model.predictor.duration_proj(x)
duration = torch.sigmoid(duration).sum(axis=-1) / speed
pred_dur = torch.round(duration).clamp(min=1).long()
pred_aln_trg = torch.zeros(input_lengths, pred_dur.sum().item())
# Alignment matrix construction - keeping original approach for quality
pred_aln_trg = torch.zeros(input_lengths, pred_dur.sum().item(), device=device)
c_frame = 0
for i in range(pred_aln_trg.size(0)):
pred_aln_trg[i, c_frame : c_frame + pred_dur[0, i].item()] = 1
pred_aln_trg[i, c_frame:c_frame + pred_dur[0, i].item()] = 1
c_frame += pred_dur[0, i].item()
en = d.transpose(-1, -2) @ pred_aln_trg.unsqueeze(0).to(device)
F0_pred, N_pred = model.predictor.F0Ntrain(en, s)
# Matrix multiplications - reuse unsqueezed tensor
pred_aln_trg = pred_aln_trg.unsqueeze(0) # Do unsqueeze once
en = d.transpose(-1, -2) @ pred_aln_trg
F0_pred, N_pred = model.predictor.F0Ntrain(en, s_content)
# Text encoding and final decoding
t_en = model.text_encoder(tokens, input_lengths, text_mask)
asr = t_en @ pred_aln_trg.unsqueeze(0).to(device)
return model.decoder(asr, F0_pred, N_pred, ref_s[:, :128]).squeeze().cpu().numpy()
asr = t_en @ pred_aln_trg
return model.decoder(asr, F0_pred, N_pred, s_ref).squeeze().cpu().numpy()
# def length_to_mask(lengths):
# """Create attention mask from lengths"""
# mask = (
# torch.arange(lengths.max())
# .unsqueeze(0)
# .expand(lengths.shape[0], -1)
# .type_as(lengths)
# )
# mask = torch.gt(mask + 1, lengths.unsqueeze(1))
# return mask
def length_to_mask(lengths):
"""Create attention mask from lengths"""
mask = (
torch.arange(lengths.max())
.unsqueeze(0)
.expand(lengths.shape[0], -1)
.type_as(lengths)
)
mask = torch.gt(mask + 1, lengths.unsqueeze(1))
return mask
"""Create attention mask from lengths - possibly optimized version"""
max_len = lengths.max()
# Create mask directly on the same device as lengths
mask = torch.arange(max_len, device=lengths.device)[None, :].expand(lengths.shape[0], -1)
# Avoid type_as by using the correct dtype from the start
if lengths.dtype != mask.dtype:
mask = mask.to(dtype=lengths.dtype)
# Fuse operations using broadcasting
return mask + 1 > lengths[:, None]
class TTSGPUModel(TTSBaseModel):
_instance = None

View file

@ -3,26 +3,29 @@ import os
import re
import time
from typing import List, Tuple, Optional
from functools import lru_cache
import numpy as np
import torch
import scipy.io.wavfile as wavfile
from .text_processing import normalize_text
from .text_processing import normalize_text, chunker
from loguru import logger
from ..core.config import settings
from .tts_model import TTSModel
from .audio import AudioService, AudioNormalizer
class TTSService:
def __init__(self, output_dir: str = None):
self.output_dir = output_dir
def _split_text(self, text: str) -> List[str]:
"""Split text into sentences"""
if not isinstance(text, str):
text = str(text) if text is not None else ""
return [s.strip() for s in re.split(r"(?<=[.!?])\s+", text) if s.strip()]
@staticmethod
@lru_cache(maxsize=20)  # Cache up to 20 most recently used voices
def _load_voice(voice_path: str) -> torch.Tensor:
"""Load and cache a voice model"""
return torch.load(voice_path, map_location=TTSModel.get_device(), weights_only=True)
def _get_voice_path(self, voice_name: str) -> Optional[str]:
"""Get the path to a voice file"""
@ -31,6 +34,13 @@ class TTSService:
def _generate_audio(
self, text: str, voice: str, speed: float, stitch_long_output: bool = True
) -> Tuple[torch.Tensor, float]:
"""Generate complete audio and return with processing time"""
audio, processing_time = self._generate_audio_internal(text, voice, speed, stitch_long_output)
return audio, processing_time
def _generate_audio_internal(
self, text: str, voice: str, speed: float, stitch_long_output: bool = True
) -> Tuple[torch.Tensor, float]:
"""Generate audio and measure processing time"""
start_time = time.time()
@ -49,42 +59,42 @@ class TTSService:
if not voice_path:
raise ValueError(f"Voice not found: {voice}")
# Load voice
voicepack = torch.load(
voice_path, map_location=TTSModel.get_device(), weights_only=True
)
# Load voice using cached loader
voicepack = self._load_voice(voice_path)
# Generate audio with or without stitching
# For non-streaming, preprocess all chunks first
if stitch_long_output:
chunks = self._split_text(text)
audio_chunks = []
# Process all chunks
for i, chunk in enumerate(chunks):
# Preprocess all chunks to phonemes/tokens
chunks_data = []
for chunk in chunker.split_text(text):
try:
# Process text and generate audio
phonemes, tokens = TTSModel.process_text(chunk, voice[0])
chunks_data.append((chunk, tokens))
except Exception as e:
logger.error(f"Failed to process chunk: '{chunk}'. Error: {str(e)}")
continue
if not chunks_data:
raise ValueError("No chunks were processed successfully")
# Generate audio for all chunks
audio_chunks = []
for chunk, tokens in chunks_data:
try:
chunk_audio = TTSModel.generate_from_tokens(tokens, voicepack, speed)
if chunk_audio is not None:
audio_chunks.append(chunk_audio)
else:
logger.error(f"No audio generated for chunk {i + 1}/{len(chunks)}")
logger.error(f"No audio generated for chunk: '{chunk}'")
except Exception as e:
logger.error(
f"Failed to generate audio for chunk {i + 1}/{len(chunks)}: '{chunk}'. Error: {str(e)}"
)
logger.error(f"Failed to generate audio for chunk: '{chunk}'. Error: {str(e)}")
continue
if not audio_chunks:
raise ValueError("No audio chunks were generated successfully")
audio = (
np.concatenate(audio_chunks)
if len(audio_chunks) > 1
else audio_chunks[0]
)
# Concatenate all chunks
audio = np.concatenate(audio_chunks) if len(audio_chunks) > 1 else audio_chunks[0]
else:
# Process single chunk
phonemes, tokens = TTSModel.process_text(text, voice[0])
@ -97,6 +107,99 @@ class TTSService:
logger.error(f"Error in audio generation: {str(e)}")
raise
async def generate_audio_stream(
self, text: str, voice: str, speed: float, output_format: str = "wav", silent=False
):
"""Generate and yield audio chunks as they're generated for real-time streaming"""
try:
stream_start = time.time()
# Create normalizer for consistent audio levels
stream_normalizer = AudioNormalizer()
# Input validation and preprocessing
if not text:
raise ValueError("Text is empty")
preprocess_start = time.time()
normalized = normalize_text(text)
if not normalized:
raise ValueError("Text is empty after preprocessing")
text = str(normalized)
logger.debug(f"Text preprocessing took: {(time.time() - preprocess_start)*1000:.1f}ms")
# Voice validation and loading
voice_start = time.time()
voice_path = self._get_voice_path(voice)
if not voice_path:
raise ValueError(f"Voice not found: {voice}")
voicepack = self._load_voice(voice_path)
logger.debug(f"Voice loading took: {(time.time() - voice_start)*1000:.1f}ms")
# Process chunks as they're generated
is_first = True
chunks_processed = 0
# last_chunk_end = time.time()
# Process chunks as they come from generator
chunk_gen = chunker.split_text(text)
current_chunk = next(chunk_gen, None)
while current_chunk is not None:
next_chunk = next(chunk_gen, None) # Peek at next chunk
# chunk_start = time.time()
chunks_processed += 1
try:
# Process text and generate audio
# text_process_start = time.time()
phonemes, tokens = TTSModel.process_text(current_chunk, voice[0])
# text_process_time = time.time() - text_process_start
# audio_gen_start = time.time()
chunk_audio = TTSModel.generate_from_tokens(tokens, voicepack, speed)
# audio_gen_time = time.time() - audio_gen_start
if chunk_audio is not None:
# Convert chunk with proper header handling
convert_start = time.time()
chunk_bytes = AudioService.convert_audio(
chunk_audio,
24000,
output_format,
is_first_chunk=is_first,
normalizer=stream_normalizer,
is_last_chunk=(next_chunk is None) # Last if no next chunk
)
# convert_time = time.time() - convert_start
# Calculate gap from last chunk
# gap_time = chunk_start - last_chunk_end
# Log timing details if not silent
# if not silent:
# logger.debug(
# f"\nChunk {chunks_processed} timing:"
# f"\n Gap from last chunk: {gap_time*1000:.1f}ms"
# f"\n Text processing: {text_process_time*1000:.1f}ms"
# f"\n Audio generation: {audio_gen_time*1000:.1f}ms"
# f"\n Audio conversion: {convert_time*1000:.1f}ms"
# f"\n Total chunk time: {(time.time() - chunk_start)*1000:.1f}ms"
# )
yield chunk_bytes
is_first = False
# last_chunk_end = time.time()
else:
logger.error(f"No audio generated for chunk: '{current_chunk}'")
except Exception as e:
logger.error(f"Failed to generate audio for chunk: '{current_chunk}'. Error: {str(e)}")
current_chunk = next_chunk # Move to next chunk
except Exception as e:
logger.error(f"Error in audio generation stream: {str(e)}")
raise
def _save_audio(self, audio: torch.Tensor, filepath: str):
"""Save audio to file"""
os.makedirs(os.path.dirname(filepath), exist_ok=True)
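For illustration, a minimal sketch of driving the streaming generator directly, assuming the model and voice packs have already been initialized via `TTSModel.setup()` and that the `af` voice is available:
```python
import asyncio

from api.src.services.tts_service import TTSService


async def demo():
    service = TTSService()
    with open("output.pcm", "wb") as f:
        # Raw 16-bit PCM chunks, written as they are generated.
        async for chunk in service.generate_audio_stream("Hello there!", "af", 1.0, "pcm"):
            f.write(chunk)


asyncio.run(demo())
```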

View file

@ -0,0 +1,52 @@
import os
from typing import List, Tuple
import torch
from loguru import logger
from .tts_service import TTSService
from .tts_model import TTSModel
class WarmupService:
"""Service for warming up TTS models and voice caches"""
def __init__(self):
self.tts_service = TTSService()
def load_voices(self) -> List[Tuple[str, torch.Tensor]]:
"""Load and cache voices up to LRU limit"""
# Get all voices sorted by filename length (shorter names first, usually base voices)
voice_files = sorted(
[f for f in os.listdir(TTSModel.VOICES_DIR) if f.endswith(".pt")],
key=len
)
# Load up to LRU cache limit (20)
loaded_voices = []
for voice_file in voice_files[:20]:
try:
voice_path = os.path.join(TTSModel.VOICES_DIR, voice_file)
voicepack = torch.load(voice_path, map_location=TTSModel.get_device(), weights_only=True)
loaded_voices.append((voice_file[:-3], voicepack)) # Store name and tensor
# logger.info(f"Loaded voice {voice_file[:-3]} into cache")
except Exception as e:
logger.error(f"Failed to load voice {voice_file}: {e}")
logger.info(f"Pre-loaded {len(loaded_voices)} voices into cache")
return loaded_voices
async def warmup_voices(self, warmup_text: str, loaded_voices: List[Tuple[str, torch.Tensor]]):
"""Warm up voice inference and streaming"""
n_warmups = 1
for voice_name, _ in loaded_voices[:n_warmups]:
try:
logger.info(f"Running warmup inference on voice {voice_name}")
async for _ in self.tts_service.generate_audio_stream(
warmup_text,
voice_name,
1.0,
"pcm"
):
pass # Process all chunks to properly warm up
logger.info(f"Completed warmup for voice {voice_name}")
except Exception as e:
logger.warning(f"Warmup failed for voice {voice_name}: {e}")

View file

@ -22,7 +22,7 @@ class OpenAISpeechRequest(BaseModel):
)
response_format: Literal["mp3", "opus", "aac", "flac", "wav", "pcm"] = Field(
default="mp3",
description="The format to return audio in. Supported formats: mp3, opus, flac, wav. AAC and PCM are not currently supported.",
description="The format to return audio in. Supported formats: mp3, opus, flac, wav, pcm. PCM format returns raw 16-bit samples without headers. AAC is not currently supported.",
)
speed: float = Field(
default=1.0,
@ -30,3 +30,7 @@ class OpenAISpeechRequest(BaseModel):
le=4.0,
description="The speed of the generated audio. Select a value from 0.25 to 4.0.",
)
stream: bool = Field(
default=True, # Default to streaming for OpenAI compatibility
description="If true (default), audio will be streamed as it's generated. Each chunk will be a complete sentence.",
)
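Because streaming is now the default, callers that want a single complete file must opt out explicitly. A minimal sketch against the same endpoint as in the README:
```python
import requests

response = requests.post(
    "http://localhost:8880/v1/audio/speech",
    json={
        "input": "Hello world!",
        "voice": "af_bella",
        "response_format": "mp3",
        "stream": False,  # return the whole file in one response
    },
)
response.raise_for_status()
with open("output.mp3", "wb") as f:
    f.write(response.content)
```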

35
api/tests/test_chunker.py Normal file
View file

@ -0,0 +1,35 @@
"""Tests for text chunking service"""
import pytest
from api.src.services.text_processing import chunker
def test_split_text():
"""Test text splitting into sentences"""
text = "First sentence. Second sentence! Third sentence?"
sentences = list(chunker.split_text(text))
assert len(sentences) == 3
assert sentences[0] == "First sentence."
assert sentences[1] == "Second sentence!"
assert sentences[2] == "Third sentence?"
def test_split_text_empty():
"""Test splitting empty text"""
assert list(chunker.split_text("")) == []
def test_split_text_single_sentence():
"""Test splitting single sentence"""
text = "Just one sentence."
assert list(chunker.split_text(text)) == ["Just one sentence."]
def test_split_text_with_custom_chunk_size():
"""Test splitting with custom max chunk size"""
text = "First part, second part, third part."
chunks = list(chunker.split_text(text, max_chunk=15))
assert len(chunks) == 3
assert chunks[0] == "First part,"
assert chunks[1] == "second part,"
assert chunks[2] == "third part."

View file

@ -1,19 +1,34 @@
from unittest.mock import Mock
from unittest.mock import Mock, AsyncMock
import pytest
import pytest_asyncio
import asyncio
from fastapi.testclient import TestClient
from httpx import AsyncClient
from ..src.main import app
# Create test client
client = TestClient(app)
# Create async client fixture
@pytest_asyncio.fixture
async def async_client():
async with AsyncClient(app=app, base_url="http://test") as ac:
yield ac
# Mock services
@pytest.fixture
def mock_tts_service(monkeypatch):
mock_service = Mock()
mock_service._generate_audio.return_value = (bytes([0, 1, 2, 3]), 1.0)
# Create proper async generator mock
async def mock_stream(*args, **kwargs):
for chunk in [b"chunk1", b"chunk2"]:
yield chunk
mock_service.generate_audio_stream = mock_stream
mock_service.list_voices.return_value = [
"af",
"bm_lewis",
@ -34,12 +49,12 @@ def mock_tts_service(monkeypatch):
@pytest.fixture
def mock_audio_service(monkeypatch):
def mock_convert(*args):
return b"converted mock audio data"
mock_service = Mock()
mock_service.convert_audio.return_value = b"converted mock audio data"
monkeypatch.setattr(
"api.src.routers.openai_compatible.AudioService.convert_audio", mock_convert
"api.src.routers.openai_compatible.AudioService", mock_service
)
return mock_service
def test_health_check():
@ -57,6 +72,7 @@ def test_openai_speech_endpoint(mock_tts_service, mock_audio_service):
"voice": "bm_lewis",
"response_format": "wav",
"speed": 1.0,
"stream": False # Explicitly disable streaming
}
response = client.post("/v1/audio/speech", json=test_request)
assert response.status_code == 200
@ -76,6 +92,7 @@ def test_openai_speech_invalid_voice(mock_tts_service):
"voice": "invalid_voice",
"response_format": "wav",
"speed": 1.0,
"stream": False # Explicitly disable streaming
}
response = client.post("/v1/audio/speech", json=test_request)
assert response.status_code == 400 # Bad request
@ -90,6 +107,7 @@ def test_openai_speech_invalid_speed(mock_tts_service):
"voice": "af",
"response_format": "wav",
"speed": -1.0, # Invalid speed
"stream": False # Explicitly disable streaming
}
response = client.post("/v1/audio/speech", json=test_request)
assert response.status_code == 422 # Validation error
@ -104,6 +122,7 @@ def test_openai_speech_generation_error(mock_tts_service):
"voice": "af",
"response_format": "wav",
"speed": 1.0,
"stream": False # Explicitly disable streaming
}
response = client.post("/v1/audio/speech", json=test_request)
assert response.status_code == 500
@ -153,3 +172,89 @@ def test_combine_voices_error(mock_tts_service):
assert response.status_code == 500
assert "Combination failed" in response.json()["detail"]["message"]
@pytest.mark.asyncio
async def test_openai_speech_pcm_streaming(mock_tts_service, async_client):
"""Test streaming PCM audio for real-time playback"""
test_request = {
"model": "kokoro",
"input": "Hello world",
"voice": "af",
"response_format": "pcm",
"stream": True
}
# Create streaming mock for this test
async def mock_stream(*args, **kwargs):
for chunk in [b"chunk1", b"chunk2"]:
yield chunk
mock_tts_service.generate_audio_stream = mock_stream
# Add streaming header
headers = {"x-raw-response": "stream"}
response = await async_client.post("/v1/audio/speech", json=test_request, headers=headers)
assert response.status_code == 200
assert response.headers["content-type"] == "audio/pcm"
# Just verify status and content type
assert response.status_code == 200
assert response.headers["content-type"] == "audio/pcm"
@pytest.mark.asyncio
async def test_openai_speech_streaming_mp3(mock_tts_service, async_client):
"""Test streaming MP3 audio to file"""
test_request = {
"model": "kokoro",
"input": "Hello world",
"voice": "af",
"response_format": "mp3",
"stream": True
}
# Create streaming mock for this test
async def mock_stream(*args, **kwargs):
for chunk in [b"mp3header", b"mp3data"]:
yield chunk
mock_tts_service.generate_audio_stream = mock_stream
# Add streaming header
headers = {"x-raw-response": "stream"}
response = await async_client.post("/v1/audio/speech", json=test_request, headers=headers)
assert response.status_code == 200
assert response.headers["content-type"] == "audio/mpeg"
assert response.headers["content-disposition"] == "attachment; filename=speech.mp3"
# Just verify status and content type
assert response.status_code == 200
assert response.headers["content-type"] == "audio/mpeg"
assert response.headers["content-disposition"] == "attachment; filename=speech.mp3"
@pytest.mark.asyncio
async def test_openai_speech_streaming_generator(mock_tts_service, async_client):
"""Test streaming with async generator"""
test_request = {
"model": "kokoro",
"input": "Hello world",
"voice": "af",
"response_format": "pcm",
"stream": True
}
# Create streaming mock for this test
async def mock_stream(*args, **kwargs):
for chunk in [b"chunk1", b"chunk2"]:
yield chunk
mock_tts_service.generate_audio_stream = mock_stream
# Add streaming header
headers = {"x-raw-response": "stream"}
response = await async_client.post("/v1/audio/speech", json=test_request, headers=headers)
assert response.status_code == 200
assert response.headers["content-type"] == "audio/pcm"
# Just verify status and content type
assert response.status_code == 200
assert response.headers["content-type"] == "audio/pcm"

View file

@ -1,6 +1,6 @@
"""Tests for FastAPI application"""
from unittest.mock import MagicMock, patch
from unittest.mock import MagicMock, patch, call
import pytest
from fastapi.testclient import TestClient
@ -28,25 +28,34 @@ async def test_lifespan_successful_warmup(mock_logger, mock_tts_model):
"""Test successful model warmup in lifespan"""
# Mock file system for voice counting
mock_tts_model.VOICES_DIR = "/mock/voices"
# Create async mock
async def async_setup():
return 3
mock_tts_model.setup = MagicMock()
mock_tts_model.setup.side_effect = async_setup
mock_tts_model.get_device.return_value = "cuda"
with patch("os.listdir", return_value=["voice1.pt", "voice2.pt", "voice3.pt"]):
mock_tts_model.setup.return_value = 3 # 3 voice files
mock_tts_model.get_device.return_value = "cuda"
# Create an async generator from the lifespan context manager
async_gen = lifespan(MagicMock())
# Start the context manager
await async_gen.__aenter__()
# Create an async generator from the lifespan context manager
async_gen = lifespan(MagicMock())
# Start the context manager
await async_gen.__aenter__()
# Verify the expected logging sequence
mock_logger.info.assert_any_call("Loading TTS model and voice packs...")
# Check for the startup message containing the required info
startup_calls = [call[0][0] for call in mock_logger.info.call_args_list]
startup_msg = next(msg for msg in startup_calls if "Model warmed up on" in msg)
assert "Model warmed up on" in startup_msg
assert "3 voice packs loaded" in startup_msg
# Verify the expected logging sequence
mock_logger.info.assert_any_call("Loading TTS model and voice packs...")
mock_logger.info.assert_any_call("Model loaded and warmed up on cuda")
mock_logger.info.assert_any_call("3 voice packs loaded successfully")
# Verify model setup was called
mock_tts_model.setup.assert_called_once()
# Verify model setup was called
mock_tts_model.setup.assert_called_once()
# Clean up
await async_gen.__aexit__(None, None, None)
# Clean up
await async_gen.__aexit__(None, None, None)
@pytest.mark.asyncio
@ -77,39 +86,21 @@ async def test_lifespan_cuda_warmup(mock_tts_model):
"""Test model warmup specifically on CUDA"""
# Mock file system for voice counting
mock_tts_model.VOICES_DIR = "/mock/voices"
# Create async mock
async def async_setup():
return 2
mock_tts_model.setup = MagicMock()
mock_tts_model.setup.side_effect = async_setup
mock_tts_model.get_device.return_value = "cuda"
with patch("os.listdir", return_value=["voice1.pt", "voice2.pt"]):
mock_tts_model.setup.return_value = 2 # 2 voice files
mock_tts_model.get_device.return_value = "cuda"
# Create an async generator from the lifespan context manager
async_gen = lifespan(MagicMock())
await async_gen.__aenter__()
# Create an async generator from the lifespan context manager
async_gen = lifespan(MagicMock())
await async_gen.__aenter__()
# Verify model setup was called
mock_tts_model.setup.assert_called_once()
# Verify model setup was called
mock_tts_model.setup.assert_called_once()
# Clean up
await async_gen.__aexit__(None, None, None)
@pytest.mark.asyncio
@patch("api.src.main.TTSModel")
async def test_lifespan_cpu_fallback(mock_tts_model):
"""Test model warmup falling back to CPU"""
# Mock file system for voice counting
mock_tts_model.VOICES_DIR = "/mock/voices"
with patch(
"os.listdir", return_value=["voice1.pt", "voice2.pt", "voice3.pt", "voice4.pt"]
):
mock_tts_model.setup.return_value = 4 # 4 voice files
mock_tts_model.get_device.return_value = "cpu"
# Create an async generator from the lifespan context manager
async_gen = lifespan(MagicMock())
await async_gen.__aenter__()
# Verify model setup was called
mock_tts_model.setup.assert_called_once()
# Clean up
await async_gen.__aexit__(None, None, None)
# Clean up
await async_gen.__aexit__(None, None, None)

View file

@ -16,13 +16,14 @@ def test_get_device_error():
with pytest.raises(RuntimeError, match="Model not initialized"):
TTSBaseModel.get_device()
@pytest.mark.asyncio
@patch('torch.cuda.is_available')
@patch('os.path.exists')
@patch('os.path.join')
@patch('os.listdir')
@patch('torch.load')
@patch('torch.save')
def test_setup_cuda_available(mock_save, mock_load, mock_listdir, mock_join, mock_exists, mock_cuda_available):
async def test_setup_cuda_available(mock_save, mock_load, mock_listdir, mock_join, mock_exists, mock_cuda_available):
"""Test setup with CUDA available"""
TTSBaseModel._device = None
mock_cuda_available.return_value = True
@ -36,17 +37,18 @@ def test_setup_cuda_available(mock_save, mock_load, mock_listdir, mock_join, moc
TTSBaseModel.process_text = MagicMock(return_value=("dummy", [1,2,3]))
TTSBaseModel.generate_from_tokens = MagicMock(return_value=np.zeros(1000))
voice_count = TTSBaseModel.setup()
voice_count = await TTSBaseModel.setup()
assert TTSBaseModel._device == "cuda"
assert voice_count == 2
@pytest.mark.asyncio
@patch('torch.cuda.is_available')
@patch('os.path.exists')
@patch('os.path.join')
@patch('os.listdir')
@patch('torch.load')
@patch('torch.save')
def test_setup_cuda_unavailable(mock_save, mock_load, mock_listdir, mock_join, mock_exists, mock_cuda_available):
async def test_setup_cuda_unavailable(mock_save, mock_load, mock_listdir, mock_join, mock_exists, mock_cuda_available):
"""Test setup with CUDA unavailable"""
TTSBaseModel._device = None
mock_cuda_available.return_value = False
@ -60,7 +62,7 @@ def test_setup_cuda_unavailable(mock_save, mock_load, mock_listdir, mock_join, m
TTSBaseModel.process_text = MagicMock(return_value=("dummy", [1,2,3]))
TTSBaseModel.generate_from_tokens = MagicMock(return_value=np.zeros(1000))
voice_count = TTSBaseModel.setup()
voice_count = await TTSBaseModel.setup()
assert TTSBaseModel._device == "cpu"
assert voice_count == 2

View file

@ -31,27 +31,6 @@ def sample_audio():
return np.sin(2 * np.pi * frequency * t).astype(np.float32)
def test_split_text(tts_service):
"""Test text splitting into sentences"""
text = "First sentence. Second sentence! Third sentence?"
sentences = tts_service._split_text(text)
assert len(sentences) == 3
assert sentences[0] == "First sentence."
assert sentences[1] == "Second sentence!"
assert sentences[2] == "Third sentence?"
def test_split_text_empty(tts_service):
"""Test splitting empty text"""
assert tts_service._split_text("") == []
def test_split_text_single_sentence(tts_service):
"""Test splitting single sentence"""
text = "Just one sentence."
assert tts_service._split_text(text) == ["Just one sentence."]
def test_audio_to_bytes(tts_service, sample_audio):
"""Test converting audio tensor to bytes"""
audio_bytes = tts_service._audio_to_bytes(sample_audio)
@ -152,7 +131,7 @@ def test_generate_audio_phonemize_error(
mock_torch_load.return_value = torch.zeros((10, 24000))
mock_generate.return_value = (None, None)
with pytest.raises(ValueError, match="No audio chunks were generated successfully"):
with pytest.raises(ValueError, match="No chunks were processed successfully"):
tts_service._generate_audio("Test text", "af", 1.0)
@ -185,7 +164,7 @@ def test_generate_audio_error(
mock_exists.return_value = True
mock_torch_load.return_value = torch.zeros((10, 24000))
with pytest.raises(ValueError, match="No audio chunks were generated successfully"):
with pytest.raises(ValueError, match="No chunks were processed successfully"):
tts_service._generate_audio("Test text", "af", 1.0)

Binary image assets added (10 files, previews not shown; individual sizes 227-774 KiB)

BIN
assets/voice_analysis.png Normal file (958 KiB, preview not shown)

View file

@ -45,6 +45,7 @@ services:
- ONNX_OPTIMIZATION_LEVEL=all
- ONNX_MEMORY_PATTERN=true
- ONNX_ARENA_EXTEND_STRATEGY=kNextPowerOfTwo
depends_on:
model-fetcher:
condition: service_healthy

View file

@ -1,20 +1,26 @@
services:
model-fetcher:
image: datamachines/git-lfs:latest
environment:
- SKIP_MODEL_FETCH=${SKIP_MODEL_FETCH:-false}
volumes:
- ./Kokoro-82M:/app/Kokoro-82M
working_dir: /app/Kokoro-82M
command: >
sh -c "
rm -f .git/index.lock;
if [ -z \"$(ls -A .)\" ]; then
git clone https://huggingface.co/hexgrad/Kokoro-82M .
touch .cloned;
if [ \"$$SKIP_MODEL_FETCH\" = \"true\" ]; then
echo 'Skipping model fetch...' && touch .cloned;
else
rm -f .git/index.lock && \
git checkout main && \
git pull origin main && \
touch .cloned;
rm -f .git/index.lock;
if [ -z \"$(ls -A .)\" ]; then
git clone https://huggingface.co/hexgrad/Kokoro-82M .
touch .cloned;
else
rm -f .git/index.lock && \
git checkout main && \
git pull origin main && \
touch .cloned;
fi;
fi;
tail -f /dev/null
"
@ -26,10 +32,10 @@ services:
start_period: 1s
kokoro-tts:
image: ghcr.io/remsky/kokoro-fastapi:latest
# image: ghcr.io/remsky/kokoro-fastapi:latest
# Uncomment below to build from source instead of using the released image
# build:
# context: .
build:
context: .
volumes:
- ./api/src:/app/api/src
- ./Kokoro-82M:/app/Kokoro-82M

View file

@ -0,0 +1,172 @@
#!/usr/bin/env python3
import os
import json
import time
import numpy as np
import pandas as pd
import requests
from lib.shared_utils import save_json_results
from lib.shared_plotting import plot_timeline, plot_correlation
from lib.shared_benchmark_utils import enc, get_text_for_tokens
def measure_first_token(
text: str, output_dir: str, tokens: int, run_number: int
) -> dict:
"""Measure time to audio via API calls and save the audio output"""
results = {
"text_length": len(text),
"token_count": len(enc.encode(text)),
"total_time": None,
"time_to_first_chunk": None,
"error": None,
"audio_path": None,
"audio_length": None, # Length of output audio in seconds
}
try:
start_time = time.time()
# Make request without streaming
response = requests.post(
"http://localhost:8880/v1/audio/speech",
json={
"model": "kokoro",
"input": text,
"voice": "af",
"response_format": "wav",
"stream": False,
},
timeout=1800,
)
response.raise_for_status()
# Save complete audio
audio_filename = f"benchmark_tokens{tokens}_run{run_number}.wav"
audio_path = os.path.join(output_dir, audio_filename)
results["audio_path"] = audio_path
content = response.content
with open(audio_path, "wb") as f:
f.write(content)
# Calculate audio length using scipy
import scipy.io.wavfile as wavfile
sample_rate, audio_data = wavfile.read(audio_path)
results["audio_length"] = len(audio_data) / sample_rate # Length in seconds
results["time_to_first_chunk"] = time.time() - start_time
results["total_time"] = time.time() - start_time
return results
except Exception as e:
results["error"] = str(e)
return results
def main():
# Set up paths
script_dir = os.path.dirname(os.path.abspath(__file__))
output_dir = os.path.join(script_dir, "output_audio")
output_data_dir = os.path.join(script_dir, "output_data")
# Create output directories
os.makedirs(output_dir, exist_ok=True)
os.makedirs(output_data_dir, exist_ok=True)
# Load sample text
with open(
os.path.join(script_dir, "the_time_machine_hg_wells.txt"), "r", encoding="utf-8"
) as f:
text = f.read()
# Test specific token counts
token_sizes = [10, 25, 50, 100, 200, 500]
all_results = []
for tokens in token_sizes:
print(f"\nTesting {tokens} tokens")
test_text = get_text_for_tokens(text, tokens)
actual_tokens = len(enc.encode(test_text))
print(f"Text preview: {test_text[:50]}...")
# Run test 5 times for each size to get average
for i in range(5):
print(f"Run {i+1}/5...")
result = measure_first_token(test_text, output_dir, tokens, i + 1)
result["target_tokens"] = tokens
result["actual_tokens"] = actual_tokens
result["run_number"] = i + 1
print(f"Time to Audio: {result.get('time_to_first_chunk', 'N/A'):.3f}s")
print(f"Total time: {result.get('total_time', 'N/A'):.3f}s")
if result["error"]:
print(f"Error: {result['error']}")
all_results.append(result)
# Calculate averages per token size
summary = {}
for tokens in token_sizes:
matching_results = [
r for r in all_results if r["target_tokens"] == tokens and not r["error"]
]
if matching_results:
avg_first_chunk = sum(
r["time_to_first_chunk"] for r in matching_results
) / len(matching_results)
avg_total = sum(r["total_time"] for r in matching_results) / len(
matching_results
)
avg_audio_length = sum(r["audio_length"] for r in matching_results) / len(
matching_results
)
summary[tokens] = {
"avg_time_to_first_chunk": round(avg_first_chunk, 3),
"avg_total_time": round(avg_total, 3),
"avg_audio_length": round(avg_audio_length, 3),
"num_successful_runs": len(matching_results),
}
# Save results
# Save results
results_data = {
"individual_runs": all_results,
"summary": summary,
"timestamp": time.strftime("%Y-%m-%d %H:%M:%S"),
}
save_json_results(
results_data, os.path.join(output_data_dir, "first_token_benchmark.json")
)
# Create plot directory if it doesn't exist
output_plots_dir = os.path.join(script_dir, "output_plots")
os.makedirs(output_plots_dir, exist_ok=True)
# Create DataFrame for plotting
df = pd.DataFrame(all_results)
# Create both plots
plot_correlation(
df,
"target_tokens",
"time_to_first_chunk",
"Time to Audio vs Input Size",
"Number of Input Tokens",
"Time to Audio (seconds)",
os.path.join(output_plots_dir, "first_token_latency.png"),
)
plot_timeline(df, os.path.join(output_plots_dir, "first_token_timeline.png"))
print("\nResults and plots saved to:")
print(f"- {os.path.join(output_data_dir, 'first_token_benchmark.json')}")
print(f"- {os.path.join(output_plots_dir, 'first_token_latency.png')}")
print(f"- {os.path.join(output_plots_dir, 'first_token_timeline.png')}")
if __name__ == "__main__":
main()

View file

@ -0,0 +1,195 @@
#!/usr/bin/env python3
import os
import time
import requests
from openai import OpenAI
from lib.stream_utils import run_benchmark
OPENAI_CLIENT = OpenAI(
base_url="http://localhost:8880/v1", api_key="not-needed-for-local"
)
def measure_first_token_requests(
text: str, output_dir: str, tokens: int, run_number: int
) -> dict:
"""Measure time to audio via direct API calls and save the audio output"""
results = {
"text_length": len(text),
"token_count": None, # Will be set by run_benchmark
"total_time": None,
"time_to_first_chunk": None,
"error": None,
"audio_path": None,
"audio_length": None,
}
try:
start_time = time.time()
# Make request with streaming enabled
response = requests.post(
"http://localhost:8880/v1/audio/speech",
json={
"model": "kokoro",
"input": text,
"voice": "af",
"response_format": "pcm",
"stream": True,
},
stream=True,
timeout=1800,
)
response.raise_for_status()
# Save complete audio
audio_filename = f"benchmark_tokens{tokens}_run{run_number}_stream.wav"
audio_path = os.path.join(output_dir, audio_filename)
results["audio_path"] = audio_path
first_chunk_time = None
chunks = []
for chunk in response.iter_content(chunk_size=1024):
if chunk:
if first_chunk_time is None:
first_chunk_time = time.time()
results["time_to_first_chunk"] = first_chunk_time - start_time
chunks.append(chunk)
# Concatenate all PCM chunks
if not chunks:
raise ValueError("No audio chunks received")
all_audio_data = b"".join(chunks)
# Write as WAV file
import wave
with wave.open(audio_path, "wb") as wav_file:
wav_file.setnchannels(1) # Mono
wav_file.setsampwidth(2) # 2 bytes per sample (16-bit)
wav_file.setframerate(24000) # Known sample rate for Kokoro
wav_file.writeframes(all_audio_data)
# Calculate audio length using scipy
import scipy.io.wavfile as wavfile
sample_rate, audio_data = wavfile.read(audio_path)
results["audio_length"] = len(audio_data) / sample_rate # Length in seconds
results["total_time"] = time.time() - start_time
# Print debug info
print(f"Complete audio size: {len(all_audio_data)} bytes")
print(f"Number of chunks received: {len(chunks)}")
print(f"Audio length: {results['audio_length']:.3f}s")
return results
except Exception as e:
results["error"] = str(e)
return results
def measure_first_token_openai(
text: str, output_dir: str, tokens: int, run_number: int
) -> dict:
"""Measure time to audio via OpenAI API calls and save the audio output"""
results = {
"text_length": len(text),
"token_count": None, # Will be set by run_benchmark
"total_time": None,
"time_to_first_chunk": None,
"error": None,
"audio_path": None,
"audio_length": None,
}
try:
start_time = time.time()
# Initialize OpenAI client
# Save complete audio
audio_filename = f"benchmark_tokens{tokens}_run{run_number}_stream_openai.wav"
audio_path = os.path.join(output_dir, audio_filename)
results["audio_path"] = audio_path
first_chunk_time = None
all_audio_data = bytearray()
chunk_count = 0
# Make streaming request using OpenAI client
with OPENAI_CLIENT.audio.speech.with_streaming_response.create(
model="kokoro",
voice="af",
response_format="pcm",
input=text,
) as response:
for chunk in response.iter_bytes(chunk_size=1024):
if chunk:
chunk_count += 1
if first_chunk_time is None:
first_chunk_time = time.time()
results["time_to_first_chunk"] = first_chunk_time - start_time
all_audio_data.extend(chunk)
# Write as WAV file
import wave
with wave.open(audio_path, "wb") as wav_file:
wav_file.setnchannels(1) # Mono
wav_file.setsampwidth(2) # 2 bytes per sample (16-bit)
wav_file.setframerate(24000) # Known sample rate for Kokoro
wav_file.writeframes(all_audio_data)
# Calculate audio length using scipy
import scipy.io.wavfile as wavfile
sample_rate, audio_data = wavfile.read(audio_path)
results["audio_length"] = len(audio_data) / sample_rate # Length in seconds
results["total_time"] = time.time() - start_time
# Print debug info
print(f"Complete audio size: {len(all_audio_data)} bytes")
print(f"Number of chunks received: {chunk_count}")
print(f"Audio length: {results['audio_length']:.3f}s")
return results
except Exception as e:
results["error"] = str(e)
return results
def main():
script_dir = os.path.dirname(os.path.abspath(__file__))
prefix='cpu'
# Run requests benchmark
print("\n=== Running Direct Requests Benchmark ===")
run_benchmark(
measure_first_token_requests,
output_dir=os.path.join(script_dir, "output_audio_stream"),
output_data_dir=os.path.join(script_dir, "output_data"),
output_plots_dir=os.path.join(script_dir, "output_plots"),
suffix="_stream",
plot_title_suffix="(Streaming)",
prefix=prefix
)
# Run OpenAI benchmark
print("\n=== Running OpenAI Library Benchmark ===")
run_benchmark(
measure_first_token_openai,
output_dir=os.path.join(script_dir, "output_audio_stream_openai"),
output_data_dir=os.path.join(script_dir, "output_data"),
output_plots_dir=os.path.join(script_dir, "output_plots"),
suffix="_stream_openai",
plot_title_suffix="(OpenAI Streaming)",
prefix=prefix
)
if __name__ == "__main__":
main()

View file

@ -1,30 +1,37 @@
#!/usr/bin/env python3
import os
import sys
import json
import time
import threading
import queue
import pandas as pd
import sys
import threading
from datetime import datetime
from lib.shared_plotting import plot_system_metrics, plot_correlation
import pandas as pd
from lib.shared_utils import (
get_system_metrics, save_json_results, write_benchmark_stats,
real_time_factor
real_time_factor,
save_json_results,
get_system_metrics,
write_benchmark_stats,
)
from lib.shared_plotting import plot_correlation, plot_system_metrics
from lib.shared_benchmark_utils import (
get_text_for_tokens, make_tts_request, generate_token_sizes, enc
enc,
make_tts_request,
get_text_for_tokens,
generate_token_sizes,
)
class SystemMonitor:
def __init__(self, interval=1.0):
"""Rough system tracker: Not always accurate"""
self.interval = interval
self.metrics_queue = queue.Queue()
self.stop_event = threading.Event()
self.metrics_timeline = []
self.start_time = None
def _monitor_loop(self):
"""Background thread function to collect system metrics."""
while not self.stop_event.is_set():
@ -32,20 +39,20 @@ class SystemMonitor:
metrics["relative_time"] = time.time() - self.start_time
self.metrics_queue.put(metrics)
time.sleep(self.interval)
def start(self):
"""Start the monitoring thread."""
self.start_time = time.time()
self.monitor_thread = threading.Thread(target=self._monitor_loop)
self.monitor_thread.daemon = True
self.monitor_thread.start()
def stop(self):
"""Stop the monitoring thread and collect final metrics."""
self.stop_event.set()
if hasattr(self, 'monitor_thread'):
if hasattr(self, "monitor_thread"):
self.monitor_thread.join(timeout=2)
# Collect all metrics from queue
while True:
try:
@ -53,23 +60,24 @@ class SystemMonitor:
self.metrics_timeline.append(metrics)
except queue.Empty:
break
return self.metrics_timeline
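# Minimal usage sketch (illustrative only): wrap a workload with the monitor
# to collect a metrics timeline.
#
# monitor = SystemMonitor(interval=1.0)
# monitor.start()
# time.sleep(3)  # stand-in for the benchmark workload
# timeline = monitor.stop()  # list of metric dicts, one per sampling interval
# print(f"Collected {len(timeline)} samples")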
def main():
# Initialize system monitor
monitor = SystemMonitor(interval=1.0) # 1 second interval
# Set prefix for output files (e.g. "gpu", "cpu", "onnx", etc.)
prefix = "gpu"
prefix = "cpu"
# Generate token sizes
if 'gpu' in prefix:
if "gpu" in prefix:
token_sizes = generate_token_sizes(
max_tokens=5000, dense_step=150,
dense_max=1000, sparse_step=1000)
elif 'cpu' in prefix:
max_tokens=1000, dense_step=150, dense_max=1000, sparse_step=1000
)
elif "cpu" in prefix:
token_sizes = generate_token_sizes(
max_tokens=1000, dense_step=300,
dense_max=1000, sparse_step=0)
max_tokens=1000, dense_step=100, dense_max=500, sparse_step=250
)
else:
token_sizes = generate_token_sizes(max_tokens=3000)
@ -78,7 +86,7 @@ def main():
output_dir = os.path.join(script_dir, "output_audio")
output_data_dir = os.path.join(script_dir, "output_data")
output_plots_dir = os.path.join(script_dir, "output_plots")
# Create output directories
os.makedirs(output_dir, exist_ok=True)
os.makedirs(output_data_dir, exist_ok=True)
@ -90,7 +98,9 @@ def main():
filename = f"{prefix}_{filename}"
return os.path.join(path, filename)
with open(os.path.join(script_dir, "the_time_machine_hg_wells.txt"), "r", encoding="utf-8") as f:
with open(
os.path.join(script_dir, "the_time_machine_hg_wells.txt"), "r", encoding="utf-8"
) as f:
text = f.read()
total_tokens = len(enc.encode(text))
@ -100,7 +110,7 @@ def main():
results = []
test_start_time = time.time()
# Start system monitoring
monitor.start()
@ -114,7 +124,8 @@ def main():
processing_time, audio_length = make_tts_request(
chunk,
output_dir=output_dir,
prefix=prefix
prefix=prefix,
stream=False, # Use non-streaming mode for RTF benchmarking
)
if processing_time is None or audio_length is None:
print("Breaking loop due to error")
@ -123,14 +134,16 @@ def main():
# Calculate RTF using the correct formula
rtf = real_time_factor(processing_time, audio_length)
print(f"Real-Time Factor: {rtf:.5f}")
results.append({
"tokens": actual_tokens,
"processing_time": processing_time,
"output_length": audio_length,
"rtf": rtf,
"elapsed_time": round(time.time() - test_start_time, 2),
})
results.append(
{
"tokens": actual_tokens,
"processing_time": processing_time,
"output_length": audio_length,
"rtf": rtf,
"elapsed_time": round(time.time() - test_start_time, 5),
}
)
df = pd.DataFrame(results)
if df.empty:
@ -144,89 +157,101 @@ def main():
{
"title": "Benchmark Statistics (with correct RTF)",
"stats": {
"Total tokens processed": df['tokens'].sum(),
"Total audio generated (s)": df['output_length'].sum(),
"Total test duration (s)": df['elapsed_time'].max(),
"Average processing rate (tokens/s)": df['tokens_per_second'].mean(),
"Average RTF": df['rtf'].mean(),
"Average Real Time Speed": 1/df['rtf'].mean()
}
"Total tokens processed": df["tokens"].sum(),
"Total audio generated (s)": df["output_length"].sum(),
"Total test duration (s)": df["elapsed_time"].max(),
"Average processing rate (tokens/s)": df["tokens_per_second"].mean(),
"Average RTF": df["rtf"].mean(),
"Average Real Time Speed": 1 / df["rtf"].mean(),
},
},
{
"title": "Per-chunk Stats",
"stats": {
"Average chunk size (tokens)": df['tokens'].mean(),
"Min chunk size (tokens)": df['tokens'].min(),
"Max chunk size (tokens)": df['tokens'].max(),
"Average processing time (s)": df['processing_time'].mean(),
"Average output length (s)": df['output_length'].mean()
}
"Average chunk size (tokens)": df["tokens"].mean(),
"Min chunk size (tokens)": df["tokens"].min(),
"Max chunk size (tokens)": df["tokens"].max(),
"Average processing time (s)": df["processing_time"].mean(),
"Average output length (s)": df["output_length"].mean(),
},
},
{
"title": "Performance Ranges",
"stats": {
"Processing rate range (tokens/s)": f"{df['tokens_per_second'].min():.2f} - {df['tokens_per_second'].max():.2f}",
"RTF range": f"{df['rtf'].min():.2f}x - {df['rtf'].max():.2f}x",
"Real Time Speed range": f"{1/df['rtf'].max():.2f}x - {1/df['rtf'].min():.2f}x"
}
}
"Real Time Speed range": f"{1/df['rtf'].max():.2f}x - {1/df['rtf'].min():.2f}x",
},
},
]
write_benchmark_stats(stats, prefix_path(output_data_dir, "benchmark_stats_rtf.txt"))
write_benchmark_stats(
stats, prefix_path(output_data_dir, "benchmark_stats_rtf.txt")
)
# Plot Processing Time vs Token Count
plot_correlation(
df, "tokens", "processing_time",
df,
"tokens",
"processing_time",
"Processing Time vs Input Size",
"Number of Input Tokens",
"Processing Time (seconds)",
prefix_path(output_plots_dir, "processing_time_rtf.png")
prefix_path(output_plots_dir, "processing_time_rtf.png"),
)
# Plot RTF vs Token Count
plot_correlation(
df, "tokens", "rtf",
df,
"tokens",
"rtf",
"Real-Time Factor vs Input Size",
"Number of Input Tokens",
"Real-Time Factor (processing time / audio length)",
prefix_path(output_plots_dir, "realtime_factor_rtf.png")
prefix_path(output_plots_dir, "realtime_factor_rtf.png"),
)
# Stop monitoring and get final metrics
final_metrics = monitor.stop()
# Convert metrics timeline to DataFrame for stats
metrics_df = pd.DataFrame(final_metrics)
# Add system usage stats
if not metrics_df.empty:
stats.append({
"title": "System Usage Statistics",
"stats": {
"Peak CPU Usage (%)": metrics_df['cpu_percent'].max(),
"Avg CPU Usage (%)": metrics_df['cpu_percent'].mean(),
"Peak RAM Usage (%)": metrics_df['ram_percent'].max(),
"Avg RAM Usage (%)": metrics_df['ram_percent'].mean(),
"Peak RAM Used (GB)": metrics_df['ram_used_gb'].max(),
"Avg RAM Used (GB)": metrics_df['ram_used_gb'].mean(),
stats.append(
{
"title": "System Usage Statistics",
"stats": {
"Peak CPU Usage (%)": metrics_df["cpu_percent"].max(),
"Avg CPU Usage (%)": metrics_df["cpu_percent"].mean(),
"Peak RAM Usage (%)": metrics_df["ram_percent"].max(),
"Avg RAM Usage (%)": metrics_df["ram_percent"].mean(),
"Peak RAM Used (GB)": metrics_df["ram_used_gb"].max(),
"Avg RAM Used (GB)": metrics_df["ram_used_gb"].mean(),
},
}
})
if 'gpu_memory_used' in metrics_df:
stats[-1]["stats"].update({
"Peak GPU Memory (MB)": metrics_df['gpu_memory_used'].max(),
"Avg GPU Memory (MB)": metrics_df['gpu_memory_used'].mean(),
})
)
if "gpu_memory_used" in metrics_df:
stats[-1]["stats"].update(
{
"Peak GPU Memory (MB)": metrics_df["gpu_memory_used"].max(),
"Avg GPU Memory (MB)": metrics_df["gpu_memory_used"].mean(),
}
)
# Plot system metrics
plot_system_metrics(final_metrics, prefix_path(output_plots_dir, "system_usage_rtf.png"))
plot_system_metrics(
final_metrics, prefix_path(output_plots_dir, "system_usage_rtf.png")
)
# Save final results
save_json_results(
{
"results": results,
"system_metrics": final_metrics,
"test_duration": time.time() - test_start_time
"test_duration": time.time() - test_start_time,
},
prefix_path(output_data_dir, "benchmark_results_rtf.json")
prefix_path(output_data_dir, "benchmark_results_rtf.json"),
)
print("\nResults saved to:")

View file

@ -1,19 +1,30 @@
import os
import json
import time
import pandas as pd
from examples.assorted_checks.lib.shared_plotting import plot_system_metrics, plot_correlation
from examples.assorted_checks.lib.shared_utils import (
get_system_metrics, save_json_results, write_benchmark_stats
save_json_results,
get_system_metrics,
write_benchmark_stats,
)
from examples.assorted_checks.lib.shared_plotting import (
plot_correlation,
plot_system_metrics,
)
from examples.assorted_checks.lib.shared_benchmark_utils import (
get_text_for_tokens, make_tts_request, generate_token_sizes, enc
enc,
make_tts_request,
get_text_for_tokens,
generate_token_sizes,
)
def main():
# Get optional prefix from first command line argument
import sys
prefix = sys.argv[1] if len(sys.argv) > 1 else ""
# Set up paths relative to this file
@ -21,7 +32,7 @@ def main():
output_dir = os.path.join(script_dir, "output_audio")
output_data_dir = os.path.join(script_dir, "output_data")
output_plots_dir = os.path.join(script_dir, "output_plots")
# Create output directories
os.makedirs(output_dir, exist_ok=True)
os.makedirs(output_data_dir, exist_ok=True)
@ -43,7 +54,6 @@ def main():
total_tokens = len(enc.encode(text))
print(f"Total tokens in file: {total_tokens}")
token_sizes = generate_token_sizes(total_tokens)
print(f"Testing sizes: {token_sizes}")
@ -85,7 +95,7 @@ def main():
# Save intermediate results
save_json_results(
{"results": results, "system_metrics": system_metrics},
prefix_path(output_data_dir, "benchmark_results.json")
prefix_path(output_data_dir, "benchmark_results.json"),
)
# Create DataFrame and calculate stats
@ -102,53 +112,59 @@ def main():
{
"title": "Benchmark Statistics",
"stats": {
"Total tokens processed": df['tokens'].sum(),
"Total audio generated (s)": df['output_length'].sum(),
"Total test duration (s)": df['elapsed_time'].max(),
"Average processing rate (tokens/s)": df['tokens_per_second'].mean(),
"Average realtime factor": df['realtime_factor'].mean()
}
"Total tokens processed": df["tokens"].sum(),
"Total audio generated (s)": df["output_length"].sum(),
"Total test duration (s)": df["elapsed_time"].max(),
"Average processing rate (tokens/s)": df["tokens_per_second"].mean(),
"Average realtime factor": df["realtime_factor"].mean(),
},
},
{
"title": "Per-chunk Stats",
"stats": {
"Average chunk size (tokens)": df['tokens'].mean(),
"Min chunk size (tokens)": df['tokens'].min(),
"Max chunk size (tokens)": df['tokens'].max(),
"Average processing time (s)": df['processing_time'].mean(),
"Average output length (s)": df['output_length'].mean()
}
"Average chunk size (tokens)": df["tokens"].mean(),
"Min chunk size (tokens)": df["tokens"].min(),
"Max chunk size (tokens)": df["tokens"].max(),
"Average processing time (s)": df["processing_time"].mean(),
"Average output length (s)": df["output_length"].mean(),
},
},
{
"title": "Performance Ranges",
"stats": {
"Processing rate range (tokens/s)": f"{df['tokens_per_second'].min():.2f} - {df['tokens_per_second'].max():.2f}",
"Realtime factor range": f"{df['realtime_factor'].min():.2f}x - {df['realtime_factor'].max():.2f}x"
}
}
"Realtime factor range": f"{df['realtime_factor'].min():.2f}x - {df['realtime_factor'].max():.2f}x",
},
},
]
write_benchmark_stats(stats, prefix_path(output_data_dir, "benchmark_stats.txt"))
# Plot Processing Time vs Token Count
plot_correlation(
df, "tokens", "processing_time",
df,
"tokens",
"processing_time",
"Processing Time vs Input Size",
"Number of Input Tokens",
"Processing Time (seconds)",
prefix_path(output_plots_dir, "processing_time.png")
prefix_path(output_plots_dir, "processing_time.png"),
)
# Plot Realtime Factor vs Token Count
plot_correlation(
df, "tokens", "realtime_factor",
df,
"tokens",
"realtime_factor",
"Realtime Factor vs Input Size",
"Number of Input Tokens",
"Realtime Factor (output length / processing time)",
prefix_path(output_plots_dir, "realtime_factor.png")
prefix_path(output_plots_dir, "realtime_factor.png"),
)
# Plot system metrics
plot_system_metrics(system_metrics, prefix_path(output_plots_dir, "system_usage.png"))
plot_system_metrics(
system_metrics, prefix_path(output_plots_dir, "system_usage.png")
)
print("\nResults saved to:")
print(f"- {prefix_path(output_data_dir, 'benchmark_results.json')}")

View file

@ -1,11 +1,12 @@
"""Shared utilities specific to TTS benchmarking."""
import time
from typing import List, Optional, Tuple
from typing import List, Tuple, Optional
import requests
import tiktoken
from .shared_utils import get_audio_length, save_audio_file
from .shared_utils import save_audio_file, get_audio_length
# Global tokenizer instance
enc = tiktoken.get_encoding("cl100k_base")
@ -13,11 +14,11 @@ enc = tiktoken.get_encoding("cl100k_base")
def get_text_for_tokens(text: str, num_tokens: int) -> str:
"""Get a slice of text that contains exactly num_tokens tokens.
Args:
text: Input text to slice
num_tokens: Desired number of tokens
Returns:
str: Text slice containing exactly num_tokens tokens
"""
@ -31,44 +32,69 @@ def make_tts_request(
text: str,
output_dir: str = None,
timeout: int = 1800,
prefix: str = ""
prefix: str = "",
stream: bool = True,
) -> Tuple[Optional[float], Optional[float]]:
"""Make TTS request using OpenAI-compatible endpoint.
Args:
text: Input text to convert to speech
output_dir: Directory to save audio files. If None, audio won't be saved.
timeout: Request timeout in seconds
prefix: Optional prefix for output filenames
stream: If True, collect audio from the streaming endpoint; otherwise download it in a single response
Returns:
tuple: (processing_time, audio_length) in seconds, or (None, None) on error
"""
try:
start_time = time.time()
response = requests.post(
"http://localhost:8880/v1/audio/speech",
json={
"model": "kokoro",
"input": text,
"voice": "af",
"response_format": "wav",
},
timeout=timeout,
)
response.raise_for_status()
if stream:
# For streaming, we need to collect all chunks
audio_chunks = []
response = requests.post(
"http://localhost:8880/v1/audio/speech",
json={
"model": "kokoro",
"input": text,
"voice": "af",
"response_format": "wav",
"stream": True,
},
timeout=timeout,
stream=True,
)
response.raise_for_status()
for chunk in response.iter_content(chunk_size=8192):
if chunk:
audio_chunks.append(chunk)
# Combine all chunks
audio_data = b"".join(audio_chunks)
else:
response = requests.post(
"http://localhost:8880/v1/audio/speech",
json={
"model": "kokoro",
"input": text,
"voice": "af",
"response_format": "wav",
"stream": False,
},
timeout=timeout,
)
response.raise_for_status()
audio_data = response.content
processing_time = round(time.time() - start_time, 2)
# Calculate audio length from response content
audio_length = get_audio_length(response.content)
# Calculate audio length from audio data
audio_length = get_audio_length(audio_data)
# Save the audio file if output_dir is provided
if output_dir:
token_count = len(enc.encode(text))
output_file = save_audio_file(
response.content,
f"chunk_{token_count}_tokens",
output_dir
audio_data, f"chunk_{token_count}_tokens", output_dir
)
print(f"Saved audio to {output_file}")
@ -86,26 +112,26 @@ def generate_token_sizes(
max_tokens: int,
dense_step: int = 100,
dense_max: int = 1000,
sparse_step: int = 1000
sparse_step: int = 1000,
) -> List[int]:
"""Generate token size ranges with dense sampling at start.
Args:
max_tokens: Maximum number of tokens to generate sizes up to
dense_step: Step size for dense sampling range
dense_max: Maximum value for dense sampling
sparse_step: Step size for sparse sampling range
Returns:
list: Sorted list of token sizes
"""
# Dense sampling at start
dense_range = list(range(dense_step, dense_max + 1, dense_step))
if max_tokens <= dense_max or sparse_step < dense_max:
return sorted(dense_range)
# Sparse sampling for larger sizes
sparse_range = list(range(dense_max + sparse_step, max_tokens + 1, sparse_step))
# Combine and deduplicate
return sorted(list(set(dense_range + sparse_range)))
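# Worked example (illustrative):
# generate_token_sizes(max_tokens=3000, dense_step=100, dense_max=1000, sparse_step=1000)
# -> [100, 200, ..., 900, 1000, 2000, 3000]
# i.e. dense sampling every 100 tokens up to 1000, then sparse steps of 1000 up to max_tokens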

View file

@ -1,7 +1,10 @@
"""Shared plotting utilities for benchmarks and tests."""
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.patches as patches
# Common style configurations
STYLE_CONFIG = {
@ -10,66 +13,71 @@ STYLE_CONFIG = {
"secondary_color": "#05d9e8",
"grid_color": "#ffffff",
"text_color": "#ffffff",
"font_sizes": {
"title": 16,
"label": 14,
"tick": 12,
"text": 10
}
"font_sizes": {"title": 16, "label": 14, "tick": 12, "text": 10},
}
def setup_plot(fig, ax, title, xlabel=None, ylabel=None):
"""Configure plot styling with consistent theme.
Args:
fig: matplotlib figure object
ax: matplotlib axis object
title: str, plot title
xlabel: str, optional x-axis label
ylabel: str, optional y-axis label
Returns:
tuple: (fig, ax) with applied styling
"""
# Grid styling
ax.grid(True, linestyle="--", alpha=0.3, color=STYLE_CONFIG["grid_color"])
# Title and labels
ax.set_title(title, pad=20,
fontsize=STYLE_CONFIG["font_sizes"]["title"],
fontweight="bold",
color=STYLE_CONFIG["text_color"])
ax.set_title(
title,
pad=20,
fontsize=STYLE_CONFIG["font_sizes"]["title"],
fontweight="bold",
color=STYLE_CONFIG["text_color"],
)
if xlabel:
ax.set_xlabel(xlabel,
fontsize=STYLE_CONFIG["font_sizes"]["label"],
fontweight="medium",
color=STYLE_CONFIG["text_color"])
ax.set_xlabel(
xlabel,
fontsize=STYLE_CONFIG["font_sizes"]["label"],
fontweight="medium",
color=STYLE_CONFIG["text_color"],
)
if ylabel:
ax.set_ylabel(ylabel,
fontsize=STYLE_CONFIG["font_sizes"]["label"],
fontweight="medium",
color=STYLE_CONFIG["text_color"])
ax.set_ylabel(
ylabel,
fontsize=STYLE_CONFIG["font_sizes"]["label"],
fontweight="medium",
color=STYLE_CONFIG["text_color"],
)
# Tick styling
ax.tick_params(labelsize=STYLE_CONFIG["font_sizes"]["tick"],
colors=STYLE_CONFIG["text_color"])
ax.tick_params(
labelsize=STYLE_CONFIG["font_sizes"]["tick"], colors=STYLE_CONFIG["text_color"]
)
# Spine styling
for spine in ax.spines.values():
spine.set_color(STYLE_CONFIG["text_color"])
spine.set_alpha(0.3)
spine.set_linewidth(0.5)
# Background colors
ax.set_facecolor(STYLE_CONFIG["background_color"])
fig.patch.set_facecolor(STYLE_CONFIG["background_color"])
return fig, ax
def plot_system_metrics(metrics_data, output_path):
"""Create plots for system metrics over time.
Args:
metrics_data: list of dicts containing system metrics
output_path: str, path to save the output plot
@ -77,68 +85,281 @@ def plot_system_metrics(metrics_data, output_path):
df = pd.DataFrame(metrics_data)
df["timestamp"] = pd.to_datetime(df["timestamp"])
elapsed_time = (df["timestamp"] - df["timestamp"].iloc[0]).dt.total_seconds()
# Get baseline values
baseline_cpu = df["cpu_percent"].iloc[0]
baseline_ram = df["ram_used_gb"].iloc[0]
baseline_gpu = df["gpu_memory_used"].iloc[0] / 1024 if "gpu_memory_used" in df.columns else None
baseline_gpu = (
df["gpu_memory_used"].iloc[0] / 1024
if "gpu_memory_used" in df.columns
else None
)
# Convert GPU memory to GB if present
if "gpu_memory_used" in df.columns:
df["gpu_memory_gb"] = df["gpu_memory_used"] / 1024
plt.style.use("dark_background")
# Create subplots based on available metrics
has_gpu = "gpu_memory_used" in df.columns
num_plots = 3 if has_gpu else 2
fig, axes = plt.subplots(num_plots, 1, figsize=(15, 5 * num_plots))
fig.patch.set_facecolor(STYLE_CONFIG["background_color"])
# Smoothing window
window = min(5, len(df) // 2)
# Plot CPU Usage
smoothed_cpu = df["cpu_percent"].rolling(window=window, center=True).mean()
sns.lineplot(x=elapsed_time, y=smoothed_cpu, ax=axes[0],
color=STYLE_CONFIG["primary_color"], linewidth=2)
axes[0].axhline(y=baseline_cpu, color=STYLE_CONFIG["secondary_color"],
linestyle="--", alpha=0.5, label="Baseline")
setup_plot(fig, axes[0], "CPU Usage Over Time",
xlabel="Time (seconds)", ylabel="CPU Usage (%)")
sns.lineplot(
x=elapsed_time,
y=smoothed_cpu,
ax=axes[0],
color=STYLE_CONFIG["primary_color"],
linewidth=2,
)
axes[0].axhline(
y=baseline_cpu,
color=STYLE_CONFIG["secondary_color"],
linestyle="--",
alpha=0.5,
label="Baseline",
)
setup_plot(
fig,
axes[0],
"CPU Usage Over Time",
xlabel="Time (seconds)",
ylabel="CPU Usage (%)",
)
axes[0].set_ylim(0, max(df["cpu_percent"]) * 1.1)
axes[0].legend()
# Plot RAM Usage
smoothed_ram = df["ram_used_gb"].rolling(window=window, center=True).mean()
sns.lineplot(x=elapsed_time, y=smoothed_ram, ax=axes[1],
color=STYLE_CONFIG["secondary_color"], linewidth=2)
axes[1].axhline(y=baseline_ram, color=STYLE_CONFIG["primary_color"],
linestyle="--", alpha=0.5, label="Baseline")
setup_plot(fig, axes[1], "RAM Usage Over Time",
xlabel="Time (seconds)", ylabel="RAM Usage (GB)")
sns.lineplot(
x=elapsed_time,
y=smoothed_ram,
ax=axes[1],
color=STYLE_CONFIG["secondary_color"],
linewidth=2,
)
axes[1].axhline(
y=baseline_ram,
color=STYLE_CONFIG["primary_color"],
linestyle="--",
alpha=0.5,
label="Baseline",
)
setup_plot(
fig,
axes[1],
"RAM Usage Over Time",
xlabel="Time (seconds)",
ylabel="RAM Usage (GB)",
)
axes[1].set_ylim(0, max(df["ram_used_gb"]) * 1.1)
axes[1].legend()
# Plot GPU Memory if available
if has_gpu:
smoothed_gpu = df["gpu_memory_gb"].rolling(window=window, center=True).mean()
sns.lineplot(x=elapsed_time, y=smoothed_gpu, ax=axes[2],
color=STYLE_CONFIG["primary_color"], linewidth=2)
axes[2].axhline(y=baseline_gpu, color=STYLE_CONFIG["secondary_color"],
linestyle="--", alpha=0.5, label="Baseline")
setup_plot(fig, axes[2], "GPU Memory Usage Over Time",
xlabel="Time (seconds)", ylabel="GPU Memory (GB)")
sns.lineplot(
x=elapsed_time,
y=smoothed_gpu,
ax=axes[2],
color=STYLE_CONFIG["primary_color"],
linewidth=2,
)
axes[2].axhline(
y=baseline_gpu,
color=STYLE_CONFIG["secondary_color"],
linestyle="--",
alpha=0.5,
label="Baseline",
)
setup_plot(
fig,
axes[2],
"GPU Memory Usage Over Time",
xlabel="Time (seconds)",
ylabel="GPU Memory (GB)",
)
axes[2].set_ylim(0, max(df["gpu_memory_gb"]) * 1.1)
axes[2].legend()
plt.tight_layout()
plt.savefig(output_path, dpi=300, bbox_inches="tight")
plt.close()
def plot_timeline(df, output_path, suffix="", prefix=""):
"""Create timeline plot showing latency for each run.
Args:
df: pandas DataFrame containing run data with columns:
- target_tokens: number of tokens
- run_number: run iteration
- time_to_first_chunk: latency to first audio chunk
- audio_length: length of the generated audio in seconds
output_path: str, path to save the output plot
suffix: str, optional suffix appended to the plot title
prefix: str, optional prefix (e.g. "cpu", "gpu") prepended to the plot title
"""
plt.style.use("dark_background")
# Sort by tokens and run number
df = df.sort_values(["target_tokens", "run_number"])
# Create figure and axis
fig, ax = plt.subplots(figsize=(12, 6))
# Calculate y positions for each run with tighter grouping
unique_tokens = sorted(df["target_tokens"].unique())
y_positions = {}
current_y = 0
group_spacing = 0.8 # Space between groups
run_spacing = 0.2 # Space between runs in a group
for tokens in unique_tokens:
runs = df[df["target_tokens"] == tokens]
base_y = current_y
for i, (_, run) in enumerate(runs.iterrows()):
y_positions[(tokens, run["run_number"])] = base_y + (i * run_spacing)
current_y = base_y + (len(runs) * run_spacing) + group_spacing
# Plot bars and points with more transparency
bar_height = 0.15
for _, row in df.iterrows():
y = y_positions[(row["target_tokens"], row["run_number"])]
latency = row["time_to_first_chunk"]
# Latency bar
ax.add_patch(
patches.Rectangle(
(0, y - bar_height / 2),
latency,
bar_height,
facecolor=STYLE_CONFIG["primary_color"],
alpha=0.3,
)
)
# End point
ax.plot(
latency,
y,
"o",
color=STYLE_CONFIG["secondary_color"],
markersize=4,
alpha=0.5,
)
# Add mean lines and values for each token group
for tokens in unique_tokens:
token_runs = df[df["target_tokens"] == tokens]
mean_latency = token_runs["time_to_first_chunk"].mean()
y_positions_for_token = [
y_positions[(tokens, run["run_number"])] for _, run in token_runs.iterrows()
]
min_y = min(y_positions_for_token)
max_y = max(y_positions_for_token)
group_center = (min_y + max_y) / 2
# Plot mean line with gradient alpha
gradient = np.linspace(0.2, 0.8, 100)
for i in range(len(gradient) - 1):
y1 = (
min_y
- bar_height
+ (max_y - min_y + 2 * bar_height) * (i / len(gradient))
)
y2 = (
min_y
- bar_height
+ (max_y - min_y + 2 * bar_height) * ((i + 1) / len(gradient))
)
ax.plot(
[mean_latency, mean_latency],
[y1, y2],
"-",
color=STYLE_CONFIG["secondary_color"],
linewidth=3,
alpha=gradient[i],
)
# Add mean value label with background
label_text = f"Mean: {mean_latency:.3f}s"
bbox_props = dict(
facecolor=STYLE_CONFIG["background_color"],
edgecolor=STYLE_CONFIG["secondary_color"],
alpha=0.8,
pad=3,
linewidth=1,
)
ax.text(
mean_latency + 0.02,
group_center,
label_text,
color=STYLE_CONFIG["secondary_color"],
va="center",
fontsize=10,
fontweight="bold",
bbox=bbox_props,
)
# Customize plot
ax.set_ylim(-1, current_y)
ax.set_xlim(0, df["time_to_first_chunk"].max() * 1.3) # Extra space for labels
# Add labels for token groups with tighter spacing
group_positions = {}
for tokens in unique_tokens:
runs = df[df["target_tokens"] == tokens]
y_positions_for_token = [
y_positions[(tokens, run["run_number"])] for _, run in runs.iterrows()
]
group_positions[tokens] = sum(y_positions_for_token) / len(
y_positions_for_token
)
plt.axhline(
y=min(y_positions_for_token) - bar_height,
color="white",
alpha=0.1,
linestyle="-",
)
# Calculate mean audio length for each token group
audio_lengths = {}
for tokens in unique_tokens:
token_runs = df[df["target_tokens"] == tokens]
audio_lengths[tokens] = token_runs["audio_length"].mean()
# Set y-ticks at group centers with token counts and audio lengths
plt.yticks(
list(group_positions.values()),
[
f"{tokens} tokens\n({audio_lengths[tokens]:.1f}s)"
for tokens in group_positions.keys()
],
fontsize=10,
)
# Customize appearance
setup_plot(
fig,
ax,
prefix.upper() + " Time-To-Audio Latency " + suffix,
xlabel="Time (seconds)",
ylabel="Input Size",
)
plt.tight_layout()
plt.savefig(output_path, dpi=300, bbox_inches="tight")
plt.close()
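# Usage sketch (illustrative): the DataFrame only needs the columns listed in the docstring.
# runs = pd.DataFrame({
#     "target_tokens": [10, 10, 50],
#     "run_number": [1, 2, 1],
#     "time_to_first_chunk": [1.61, 1.58, 3.15],
#     "audio_length": [3.45, 3.45, 15.83],
# })
# plot_timeline(runs, "first_token_timeline.png", suffix="(Streaming)", prefix="cpu")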
def plot_correlation(df, x, y, title, xlabel, ylabel, output_path):
"""Create correlation plot with regression line and correlation coefficient.
Args:
df: pandas DataFrame containing the data
x: str, column name for x-axis
@ -149,28 +370,40 @@ def plot_correlation(df, x, y, title, xlabel, ylabel, output_path):
output_path: str, path to save the output plot
"""
plt.style.use("dark_background")
fig, ax = plt.subplots(figsize=(12, 8))
# Scatter plot
sns.scatterplot(data=df, x=x, y=y, s=100, alpha=0.6,
color=STYLE_CONFIG["primary_color"])
sns.scatterplot(
data=df, x=x, y=y, s=100, alpha=0.6, color=STYLE_CONFIG["primary_color"]
)
# Regression line
sns.regplot(data=df, x=x, y=y, scatter=False,
color=STYLE_CONFIG["secondary_color"],
line_kws={"linewidth": 2})
sns.regplot(
data=df,
x=x,
y=y,
scatter=False,
color=STYLE_CONFIG["secondary_color"],
line_kws={"linewidth": 2},
)
# Add correlation coefficient
corr = df[x].corr(df[y])
plt.text(0.05, 0.95, f"Correlation: {corr:.2f}",
transform=ax.transAxes,
fontsize=STYLE_CONFIG["font_sizes"]["text"],
color=STYLE_CONFIG["text_color"],
bbox=dict(facecolor=STYLE_CONFIG["background_color"],
edgecolor=STYLE_CONFIG["text_color"],
alpha=0.7))
plt.text(
0.05,
0.95,
f"Correlation: {corr:.2f}",
transform=ax.transAxes,
fontsize=STYLE_CONFIG["font_sizes"]["text"],
color=STYLE_CONFIG["text_color"],
bbox=dict(
facecolor=STYLE_CONFIG["background_color"],
edgecolor=STYLE_CONFIG["text_color"],
alpha=0.7,
),
)
setup_plot(fig, ax, title, xlabel=xlabel, ylabel=ylabel)
plt.savefig(output_path, dpi=300, bbox_inches="tight")
plt.close()

View file

@ -1,9 +1,10 @@
"""Shared utilities for benchmarks and tests."""
import os
import json
import subprocess
from typing import Any, Dict, List, Union, Optional
from datetime import datetime
from typing import Any, Dict, List, Optional, Union
import psutil
import scipy.io.wavfile as wavfile
@ -12,28 +13,46 @@ import scipy.io.wavfile as wavfile
TORCH_AVAILABLE = False
try:
import torch
TORCH_AVAILABLE = torch.cuda.is_available()
except ImportError:
pass
def check_audio_file_is_silent(audio_path: str, threshold: float = 0.01) -> bool:
"""Check if an audio file is silent by comparing peak amplitude to a threshold.
Args:
audio_path: Path to the audio file
threshold: Peak amplitude threshold for silence
Returns:
bool: True if audio is silent, False otherwise
"""
rate, data = wavfile.read(audio_path)
peak_amplitude = max(abs(data.min()), abs(data.max())) / 32768.0 # 16-bit audio
return peak_amplitude < threshold
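# Quick numeric check (illustrative): a 16-bit file whose loudest sample is +/-200
# has a normalized peak of 200 / 32768 ~ 0.006, below the default 0.01 threshold,
# so it is reported as silent.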
def get_audio_length(audio_data: bytes, temp_dir: str = None) -> float:
"""Get audio length in seconds from bytes data.
Args:
audio_data: Raw audio bytes
temp_dir: Directory for temporary file. If None, uses system temp directory.
Returns:
float: Audio length in seconds
"""
if temp_dir is None:
import tempfile
temp_dir = tempfile.gettempdir()
temp_path = os.path.join(temp_dir, "temp.wav")
os.makedirs(temp_dir, exist_ok=True)
with open(temp_path, "wb") as f:
f.write(audio_data)
@ -47,11 +66,11 @@ def get_audio_length(audio_data: bytes, temp_dir: str = None) -> float:
def get_gpu_memory(average: bool = True) -> Optional[Union[float, List[float]]]:
"""Get GPU memory usage using PyTorch if available, falling back to nvidia-smi.
Args:
average: If True and multiple GPUs present, returns average memory usage.
If False, returns list of memory usage per GPU.
Returns:
float or List[float] or None: GPU memory usage in MB. Returns None if no GPU available.
If average=False and multiple GPUs present, returns list of values.
@ -60,19 +79,23 @@ def get_gpu_memory(average: bool = True) -> Optional[Union[float, List[float]]]:
n_gpus = torch.cuda.device_count()
memory_used = []
for i in range(n_gpus):
memory_used.append(torch.cuda.memory_allocated(i) / 1024**2) # Convert to MB
memory_used.append(
torch.cuda.memory_allocated(i) / 1024**2
) # Convert to MB
if average and len(memory_used) > 0:
return sum(memory_used) / len(memory_used)
return memory_used if len(memory_used) > 1 else memory_used[0]
# Fall back to nvidia-smi
try:
result = subprocess.check_output(
["nvidia-smi", "--query-gpu=memory.used", "--format=csv,nounits,noheader"]
)
memory_values = [float(x.strip()) for x in result.decode("utf-8").split("\n") if x.strip()]
memory_values = [
float(x.strip()) for x in result.decode("utf-8").split("\n") if x.strip()
]
if average and len(memory_values) > 0:
return sum(memory_values) / len(memory_values)
return memory_values if len(memory_values) > 1 else memory_values[0]
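# Usage sketch (illustrative): get_gpu_memory() returns the average used MB across GPUs,
# get_gpu_memory(average=False) returns a per-GPU list (or a single float with one GPU),
# and None when neither torch CUDA nor nvidia-smi is available.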
@ -82,14 +105,14 @@ def get_gpu_memory(average: bool = True) -> Optional[Union[float, List[float]]]:
def get_system_metrics() -> Dict[str, Union[str, float]]:
"""Get current system metrics including CPU, RAM, and GPU if available.
Returns:
dict: System metrics including timestamp, CPU%, RAM%, RAM GB, and GPU MB if available
"""
# Get per-CPU percentages and calculate average
cpu_percentages = psutil.cpu_percent(percpu=True)
avg_cpu = sum(cpu_percentages) / len(cpu_percentages)
metrics = {
"timestamp": datetime.now().isoformat(),
"cpu_percent": round(avg_cpu, 2),
@ -106,40 +129,40 @@ def get_system_metrics() -> Dict[str, Union[str, float]]:
def save_audio_file(audio_data: bytes, identifier: str, output_dir: str) -> str:
"""Save audio data to a file with proper naming and directory creation.
Args:
audio_data: Raw audio bytes
identifier: String to identify this audio file (e.g. token count, test name)
output_dir: Directory to save the file
Returns:
str: Path to the saved audio file
"""
os.makedirs(output_dir, exist_ok=True)
output_file = os.path.join(output_dir, f"{identifier}.wav")
with open(output_file, "wb") as f:
f.write(audio_data)
return output_file
def write_benchmark_stats(stats: List[Dict[str, Any]], output_file: str) -> None:
"""Write benchmark statistics to a file in a clean, organized format.
Args:
stats: List of dictionaries containing stat name/value pairs
output_file: Path to output file
"""
os.makedirs(os.path.dirname(output_file), exist_ok=True)
with open(output_file, "w") as f:
for section in stats:
# Write section header
f.write(f"=== {section['title']} ===\n\n")
# Write stats
for label, value in section['stats'].items():
for label, value in section["stats"].items():
if isinstance(value, float):
f.write(f"{label}: {value:.2f}\n")
else:
@ -149,7 +172,7 @@ def write_benchmark_stats(stats: List[Dict[str, Any]], output_file: str) -> None
def save_json_results(results: Dict[str, Any], output_file: str) -> None:
"""Save benchmark results to a JSON file with proper formatting.
Args:
results: Dictionary of results to save
output_file: Path to output file
@ -159,14 +182,16 @@ def save_json_results(results: Dict[str, Any], output_file: str) -> None:
json.dump(results, f, indent=2)
def real_time_factor(processing_time: float, audio_length: float, decimals: int = 2) -> float:
def real_time_factor(
processing_time: float, audio_length: float, decimals: int = 2
) -> float:
"""Calculate Real-Time Factor (RTF) as processing-time / length-of-audio.
Args:
processing_time: Time taken to process/generate audio
audio_length: Length of the generated audio
decimals: Number of decimal places to round to
Returns:
float: RTF value
"""

View file

@ -0,0 +1,205 @@
#!/usr/bin/env python3
import os
import time
import wave
from typing import Any, Dict, List, Callable, Optional
import pandas as pd
import scipy.io.wavfile as wavfile
from .shared_utils import save_json_results
from .shared_plotting import plot_timeline, plot_correlation
from .shared_benchmark_utils import enc, get_text_for_tokens
def check_audio_silence(audio_path: str) -> bool:
"""Check if audio file contains only silence"""
sample_rate, audio_data = wavfile.read(audio_path)
# Convert to float for RMS calculation
audio_float = audio_data.astype(float)
# Calculate RMS value
rms = (audio_float**2).mean() ** 0.5
# Define silence threshold (adjust if needed)
SILENCE_THRESHOLD = 50.0
return rms < SILENCE_THRESHOLD
def process_benchmark_results(
all_results: List[Dict[str, Any]], token_sizes: List[int]
) -> Dict[str, Any]:
"""Process benchmark results and generate summary"""
summary = {}
for tokens in token_sizes:
matching_results = [
r for r in all_results if r["target_tokens"] == tokens and not r["error"]
]
if matching_results:
avg_first_chunk = sum(
r["time_to_first_chunk"] for r in matching_results
) / len(matching_results)
avg_total = sum(r["total_time"] for r in matching_results) / len(
matching_results
)
avg_audio_length = sum(r["audio_length"] for r in matching_results) / len(
matching_results
)
summary[tokens] = {
"avg_time_to_first_chunk": round(avg_first_chunk, 3),
"avg_total_time": round(avg_total, 3),
"avg_audio_length": round(avg_audio_length, 3),
"num_successful_runs": len(matching_results),
}
return summary
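# Resulting shape (illustrative values):
# {100: {"avg_time_to_first_chunk": 3.555, "avg_total_time": 14.56,
#        "avg_audio_length": 30.35, "num_successful_runs": 5}, ...}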
def save_benchmark_results(
all_results: List[Dict[str, Any]],
summary: Dict[str, Any],
output_data_dir: str,
output_plots_dir: str,
suffix: str,
plot_title_suffix: str,
prefix: str = "",
):
"""Save benchmark results and generate plots"""
# Save results
results_data = {
"individual_runs": all_results,
"summary": summary,
"timestamp": time.strftime("%Y-%m-%d %H:%M:%S"),
}
save_json_results(
results_data,
os.path.join(output_data_dir, f"{prefix}first_token_benchmark{suffix}.json"),
)
# Create DataFrame for plotting
df = pd.DataFrame(all_results)
# Create plots
plot_correlation(
df,
"target_tokens",
"time_to_first_chunk",
f"Time to First Audio vs Input Size {plot_title_suffix}",
"Number of Input Tokens",
"Time to First Audio (seconds)",
os.path.join(output_plots_dir, f"{prefix}first_token_latency{suffix}.png"),
)
plot_correlation(
df,
"target_tokens",
"total_time",
f"Total Time vs Input Size {plot_title_suffix}",
"Number of Input Tokens",
"Total Time (seconds)",
os.path.join(output_plots_dir, f"{prefix}total_time_latency{suffix}.png"),
)
plot_timeline(
df,
os.path.join(output_plots_dir, f"{prefix}first_token_timeline{suffix}.png"),
suffix=plot_title_suffix,
)
def run_benchmark(
measure_func: Callable,
output_dir: str,
output_data_dir: str,
output_plots_dir: str,
suffix: str = "",
plot_title_suffix: str = "",
num_runs: int = 5,
client=None,
prefix="",
):
"""Run benchmark with the given measurement function"""
# Create output directories
os.makedirs(output_dir, exist_ok=True)
os.makedirs(output_data_dir, exist_ok=True)
os.makedirs(output_plots_dir, exist_ok=True)
# Load sample text
script_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
with open(
os.path.join(script_dir, "the_time_machine_hg_wells.txt"), "r", encoding="utf-8"
) as f:
text = f.read()
# Test specific token counts
token_sizes = [10, 50, 100, 250, 500]
all_results = []
silent_files = []
for tokens in token_sizes:
print(
f"\nTesting {tokens} tokens{' ' + plot_title_suffix if plot_title_suffix else ''}"
)
test_text = get_text_for_tokens(text, tokens)
actual_tokens = len(enc.encode(test_text))
print(f"Text preview: {test_text[:50]}...")
for i in range(num_runs):
print(f"Run {i+1}/{num_runs}...")
result = measure_func(test_text, output_dir, tokens, i + 1)
result["target_tokens"] = tokens
result["actual_tokens"] = actual_tokens
result["run_number"] = i + 1
# Handle time to first audio
first_chunk = result.get('time_to_first_chunk')
print(
f"Time to First Audio: {f'{first_chunk:.3f}s' if first_chunk is not None else 'N/A'}"
)
# Handle total time
total_time = result.get('total_time')
print(
f"Time to Save Complete: {f'{total_time:.3f}s' if total_time is not None else 'N/A'}"
)
# Handle audio length
audio_length = result.get('audio_length')
print(
f"Audio length: {f'{audio_length:.3f}s' if audio_length is not None else 'N/A'}"
)
# Calculate streaming overhead only if both values exist
if total_time is not None and first_chunk is not None:
print(f"Streaming overhead: {(total_time - first_chunk):.3f}s")
else:
print("Streaming overhead: N/A")
if result["error"]:
print(f"Error: {result['error']}")
elif result["audio_path"] and check_audio_silence(result["audio_path"]):
silent_files.append(result["audio_path"])
all_results.append(result)
# Process and save results
summary = process_benchmark_results(all_results, token_sizes)
save_benchmark_results(
all_results,
summary,
output_data_dir,
output_plots_dir,
suffix,
plot_title_suffix,
prefix=prefix,
)
# Print paths
print("\nResults and plots saved to:")
print(f"- {os.path.join(output_data_dir, f'{prefix}first_token_benchmark{suffix}.json')}")
print(f"- {os.path.join(output_plots_dir, f'{prefix}first_token_latency{suffix}.png')}")
print(f"- {os.path.join(output_plots_dir, f'{prefix}total_time_latency{suffix}.png')}")
print(f"- {os.path.join(output_plots_dir, f'{prefix}first_token_timeline{suffix}.png')}")
# Print silence check summary
if silent_files:
print("\nWARNING: The following files contain only silence:")
for file in silent_files:
print(f"- {file}")
else:
print("\nAll generated audio files contain valid audio content.")

View file

@ -1,111 +0,0 @@
{
"results": [
{
"tokens": 100,
"processing_time": 18.833295583724976,
"output_length": 31.15,
"realtime_factor": 1.6539856161403135,
"elapsed_time": 19.024322748184204
},
{
"tokens": 200,
"processing_time": 38.95506024360657,
"output_length": 62.6,
"realtime_factor": 1.6069799304257042,
"elapsed_time": 58.21527123451233
},
{
"tokens": 300,
"processing_time": 49.74252939224243,
"output_length": 96.325,
"realtime_factor": 1.9364716908630366,
"elapsed_time": 108.19673728942871
},
{
"tokens": 400,
"processing_time": 61.349056243896484,
"output_length": 128.575,
"realtime_factor": 2.095794261102292,
"elapsed_time": 169.733656167984
},
{
"tokens": 500,
"processing_time": 82.86568236351013,
"output_length": 158.575,
"realtime_factor": 1.9136389815071193,
"elapsed_time": 252.7968451976776
}
],
"system_metrics": [
{
"timestamp": "2025-01-03T00:13:49.865330",
"cpu_percent": 8.0,
"ram_percent": 39.4,
"ram_used_gb": 25.03811264038086,
"gpu_memory_used": 1204.0
},
{
"timestamp": "2025-01-03T00:14:08.781551",
"cpu_percent": 26.8,
"ram_percent": 42.6,
"ram_used_gb": 27.090862274169922,
"gpu_memory_used": 1225.0
},
{
"timestamp": "2025-01-03T00:14:08.916973",
"cpu_percent": 16.1,
"ram_percent": 42.6,
"ram_used_gb": 27.089553833007812,
"gpu_memory_used": 1225.0
},
{
"timestamp": "2025-01-03T00:14:47.979053",
"cpu_percent": 31.5,
"ram_percent": 43.6,
"ram_used_gb": 27.714427947998047,
"gpu_memory_used": 1225.0
},
{
"timestamp": "2025-01-03T00:14:48.098976",
"cpu_percent": 20.0,
"ram_percent": 43.6,
"ram_used_gb": 27.704315185546875,
"gpu_memory_used": 1211.0
},
{
"timestamp": "2025-01-03T00:15:37.944729",
"cpu_percent": 29.7,
"ram_percent": 38.6,
"ram_used_gb": 24.53925323486328,
"gpu_memory_used": 1217.0
},
{
"timestamp": "2025-01-03T00:15:38.071915",
"cpu_percent": 8.6,
"ram_percent": 38.5,
"ram_used_gb": 24.51690673828125,
"gpu_memory_used": 1208.0
},
{
"timestamp": "2025-01-03T00:16:39.525449",
"cpu_percent": 23.4,
"ram_percent": 38.8,
"ram_used_gb": 24.71230697631836,
"gpu_memory_used": 1221.0
},
{
"timestamp": "2025-01-03T00:16:39.612442",
"cpu_percent": 5.5,
"ram_percent": 38.9,
"ram_used_gb": 24.72066879272461,
"gpu_memory_used": 1221.0
},
{
"timestamp": "2025-01-03T00:18:02.569076",
"cpu_percent": 27.4,
"ram_percent": 39.1,
"ram_used_gb": 24.868202209472656,
"gpu_memory_used": 1264.0
}
]
}

View file

@ -1,216 +0,0 @@
{
"results": [
{
"tokens": 100,
"processing_time": 14.349808931350708,
"output_length": 31.15,
"rtf": 0.46,
"elapsed_time": 14.716031074523926
},
{
"tokens": 200,
"processing_time": 28.341803312301636,
"output_length": 62.6,
"rtf": 0.45,
"elapsed_time": 43.44207406044006
},
{
"tokens": 300,
"processing_time": 43.352553606033325,
"output_length": 96.325,
"rtf": 0.45,
"elapsed_time": 87.26906609535217
},
{
"tokens": 400,
"processing_time": 71.02449822425842,
"output_length": 128.575,
"rtf": 0.55,
"elapsed_time": 158.7198133468628
},
{
"tokens": 500,
"processing_time": 70.92521691322327,
"output_length": 158.575,
"rtf": 0.45,
"elapsed_time": 230.01379895210266
},
{
"tokens": 600,
"processing_time": 83.6328592300415,
"output_length": 189.25,
"rtf": 0.44,
"elapsed_time": 314.02610969543457
},
{
"tokens": 700,
"processing_time": 103.0810194015503,
"output_length": 222.075,
"rtf": 0.46,
"elapsed_time": 417.5678551197052
},
{
"tokens": 800,
"processing_time": 127.02162909507751,
"output_length": 253.85,
"rtf": 0.5,
"elapsed_time": 545.0128681659698
},
{
"tokens": 900,
"processing_time": 130.49781227111816,
"output_length": 283.775,
"rtf": 0.46,
"elapsed_time": 675.8943417072296
},
{
"tokens": 1000,
"processing_time": 154.76425909996033,
"output_length": 315.475,
"rtf": 0.49,
"elapsed_time": 831.0677945613861
}
],
"system_metrics": [
{
"timestamp": "2025-01-03T00:23:52.896889",
"cpu_percent": 4.5,
"ram_percent": 39.1,
"ram_used_gb": 24.86032485961914,
"gpu_memory_used": 1281.0
},
{
"timestamp": "2025-01-03T00:24:07.429461",
"cpu_percent": 4.5,
"ram_percent": 39.1,
"ram_used_gb": 24.847564697265625,
"gpu_memory_used": 1285.0
},
{
"timestamp": "2025-01-03T00:24:07.620587",
"cpu_percent": 2.7,
"ram_percent": 39.1,
"ram_used_gb": 24.846607208251953,
"gpu_memory_used": 1275.0
},
{
"timestamp": "2025-01-03T00:24:36.140754",
"cpu_percent": 5.4,
"ram_percent": 39.1,
"ram_used_gb": 24.857810974121094,
"gpu_memory_used": 1267.0
},
{
"timestamp": "2025-01-03T00:24:36.340675",
"cpu_percent": 6.2,
"ram_percent": 39.1,
"ram_used_gb": 24.85773468017578,
"gpu_memory_used": 1267.0
},
{
"timestamp": "2025-01-03T00:25:19.905634",
"cpu_percent": 29.1,
"ram_percent": 39.2,
"ram_used_gb": 24.920318603515625,
"gpu_memory_used": 1256.0
},
{
"timestamp": "2025-01-03T00:25:20.182219",
"cpu_percent": 20.0,
"ram_percent": 39.2,
"ram_used_gb": 24.930198669433594,
"gpu_memory_used": 1256.0
},
{
"timestamp": "2025-01-03T00:26:31.414760",
"cpu_percent": 5.3,
"ram_percent": 39.5,
"ram_used_gb": 25.127891540527344,
"gpu_memory_used": 1259.0
},
{
"timestamp": "2025-01-03T00:26:31.617256",
"cpu_percent": 3.6,
"ram_percent": 39.5,
"ram_used_gb": 25.126346588134766,
"gpu_memory_used": 1252.0
},
{
"timestamp": "2025-01-03T00:27:42.736097",
"cpu_percent": 10.5,
"ram_percent": 39.5,
"ram_used_gb": 25.100231170654297,
"gpu_memory_used": 1249.0
},
{
"timestamp": "2025-01-03T00:27:42.912870",
"cpu_percent": 5.3,
"ram_percent": 39.5,
"ram_used_gb": 25.098285675048828,
"gpu_memory_used": 1249.0
},
{
"timestamp": "2025-01-03T00:29:06.725264",
"cpu_percent": 8.9,
"ram_percent": 39.5,
"ram_used_gb": 25.123123168945312,
"gpu_memory_used": 1239.0
},
{
"timestamp": "2025-01-03T00:29:06.928826",
"cpu_percent": 5.5,
"ram_percent": 39.5,
"ram_used_gb": 25.128646850585938,
"gpu_memory_used": 1239.0
},
{
"timestamp": "2025-01-03T00:30:50.206349",
"cpu_percent": 49.6,
"ram_percent": 39.6,
"ram_used_gb": 25.162948608398438,
"gpu_memory_used": 1245.0
},
{
"timestamp": "2025-01-03T00:30:50.491837",
"cpu_percent": 14.8,
"ram_percent": 39.5,
"ram_used_gb": 25.13379669189453,
"gpu_memory_used": 1245.0
},
{
"timestamp": "2025-01-03T00:32:57.721467",
"cpu_percent": 6.2,
"ram_percent": 39.6,
"ram_used_gb": 25.187721252441406,
"gpu_memory_used": 1384.0
},
{
"timestamp": "2025-01-03T00:32:57.913350",
"cpu_percent": 3.6,
"ram_percent": 39.6,
"ram_used_gb": 25.199390411376953,
"gpu_memory_used": 1384.0
},
{
"timestamp": "2025-01-03T00:35:08.608730",
"cpu_percent": 6.3,
"ram_percent": 39.8,
"ram_used_gb": 25.311710357666016,
"gpu_memory_used": 1330.0
},
{
"timestamp": "2025-01-03T00:35:08.791851",
"cpu_percent": 5.3,
"ram_percent": 39.8,
"ram_used_gb": 25.326683044433594,
"gpu_memory_used": 1333.0
},
{
"timestamp": "2025-01-03T00:37:43.782406",
"cpu_percent": 6.8,
"ram_percent": 40.6,
"ram_used_gb": 25.803058624267578,
"gpu_memory_used": 1409.0
}
]
}

View file

@ -1,300 +0,0 @@
{
"results": [
{
"tokens": 100,
"processing_time": 0.96,
"output_length": 31.1,
"rtf": 0.03,
"elapsed_time": 1.11
},
{
"tokens": 250,
"processing_time": 2.23,
"output_length": 77.17,
"rtf": 0.03,
"elapsed_time": 3.49
},
{
"tokens": 400,
"processing_time": 4.05,
"output_length": 128.05,
"rtf": 0.03,
"elapsed_time": 7.77
},
{
"tokens": 550,
"processing_time": 4.06,
"output_length": 171.45,
"rtf": 0.02,
"elapsed_time": 12.0
},
{
"tokens": 700,
"processing_time": 6.01,
"output_length": 221.6,
"rtf": 0.03,
"elapsed_time": 18.16
},
{
"tokens": 850,
"processing_time": 6.9,
"output_length": 269.1,
"rtf": 0.03,
"elapsed_time": 25.21
},
{
"tokens": 1000,
"processing_time": 7.65,
"output_length": 315.05,
"rtf": 0.02,
"elapsed_time": 33.03
},
{
"tokens": 6000,
"processing_time": 48.7,
"output_length": 1837.1,
"rtf": 0.03,
"elapsed_time": 82.21
},
{
"tokens": 11000,
"processing_time": 92.44,
"output_length": 3388.57,
"rtf": 0.03,
"elapsed_time": 175.46
},
{
"tokens": 16000,
"processing_time": 163.61,
"output_length": 4977.32,
"rtf": 0.03,
"elapsed_time": 340.46
},
{
"tokens": 21000,
"processing_time": 209.72,
"output_length": 6533.3,
"rtf": 0.03,
"elapsed_time": 551.92
},
{
"tokens": 26000,
"processing_time": 329.35,
"output_length": 8068.15,
"rtf": 0.04,
"elapsed_time": 883.37
},
{
"tokens": 31000,
"processing_time": 473.52,
"output_length": 9611.48,
"rtf": 0.05,
"elapsed_time": 1359.28
},
{
"tokens": 36000,
"processing_time": 650.98,
"output_length": 11157.15,
"rtf": 0.06,
"elapsed_time": 2012.9
}
],
"system_metrics": [
{
"timestamp": "2025-01-03T14:41:01.331735",
"cpu_percent": 7.5,
"ram_percent": 50.2,
"ram_used_gb": 31.960269927978516,
"gpu_memory_used": 3191.0
},
{
"timestamp": "2025-01-03T14:41:02.357116",
"cpu_percent": 17.01,
"ram_percent": 50.2,
"ram_used_gb": 31.96163558959961,
"gpu_memory_used": 3426.0
},
{
"timestamp": "2025-01-03T14:41:02.445009",
"cpu_percent": 9.5,
"ram_percent": 50.3,
"ram_used_gb": 31.966781616210938,
"gpu_memory_used": 3426.0
},
{
"timestamp": "2025-01-03T14:41:04.742152",
"cpu_percent": 18.27,
"ram_percent": 50.4,
"ram_used_gb": 32.08788299560547,
"gpu_memory_used": 3642.0
},
{
"timestamp": "2025-01-03T14:41:04.847795",
"cpu_percent": 16.27,
"ram_percent": 50.5,
"ram_used_gb": 32.094364166259766,
"gpu_memory_used": 3640.0
},
{
"timestamp": "2025-01-03T14:41:09.019590",
"cpu_percent": 15.97,
"ram_percent": 50.7,
"ram_used_gb": 32.23244094848633,
"gpu_memory_used": 3640.0
},
{
"timestamp": "2025-01-03T14:41:09.110324",
"cpu_percent": 3.54,
"ram_percent": 50.7,
"ram_used_gb": 32.234458923339844,
"gpu_memory_used": 3640.0
},
{
"timestamp": "2025-01-03T14:41:13.252607",
"cpu_percent": 13.4,
"ram_percent": 50.6,
"ram_used_gb": 32.194271087646484,
"gpu_memory_used": 3935.0
},
{
"timestamp": "2025-01-03T14:41:13.327557",
"cpu_percent": 4.69,
"ram_percent": 50.6,
"ram_used_gb": 32.191776275634766,
"gpu_memory_used": 3935.0
},
{
"timestamp": "2025-01-03T14:41:19.413633",
"cpu_percent": 12.92,
"ram_percent": 50.9,
"ram_used_gb": 32.3467903137207,
"gpu_memory_used": 4250.0
},
{
"timestamp": "2025-01-03T14:41:19.492758",
"cpu_percent": 7.5,
"ram_percent": 50.8,
"ram_used_gb": 32.34375,
"gpu_memory_used": 4250.0
},
{
"timestamp": "2025-01-03T14:41:26.467284",
"cpu_percent": 13.09,
"ram_percent": 51.2,
"ram_used_gb": 32.56281280517578,
"gpu_memory_used": 4249.0
},
{
"timestamp": "2025-01-03T14:41:26.553559",
"cpu_percent": 8.39,
"ram_percent": 51.2,
"ram_used_gb": 32.56183624267578,
"gpu_memory_used": 4249.0
},
{
"timestamp": "2025-01-03T14:41:34.284362",
"cpu_percent": 12.61,
"ram_percent": 51.7,
"ram_used_gb": 32.874778747558594,
"gpu_memory_used": 4250.0
},
{
"timestamp": "2025-01-03T14:41:34.362353",
"cpu_percent": 1.25,
"ram_percent": 51.7,
"ram_used_gb": 32.87461471557617,
"gpu_memory_used": 4250.0
},
{
"timestamp": "2025-01-03T14:42:23.471312",
"cpu_percent": 11.64,
"ram_percent": 54.9,
"ram_used_gb": 34.90264129638672,
"gpu_memory_used": 4647.0
},
{
"timestamp": "2025-01-03T14:42:23.547203",
"cpu_percent": 5.31,
"ram_percent": 54.9,
"ram_used_gb": 34.91563415527344,
"gpu_memory_used": 4647.0
},
{
"timestamp": "2025-01-03T14:43:56.724933",
"cpu_percent": 12.97,
"ram_percent": 59.5,
"ram_used_gb": 37.84241485595703,
"gpu_memory_used": 4655.0
},
{
"timestamp": "2025-01-03T14:43:56.815453",
"cpu_percent": 11.75,
"ram_percent": 59.5,
"ram_used_gb": 37.832679748535156,
"gpu_memory_used": 4655.0
},
{
"timestamp": "2025-01-03T14:46:41.705155",
"cpu_percent": 12.94,
"ram_percent": 66.3,
"ram_used_gb": 42.1534538269043,
"gpu_memory_used": 4729.0
},
{
"timestamp": "2025-01-03T14:46:41.835177",
"cpu_percent": 7.73,
"ram_percent": 66.2,
"ram_used_gb": 42.13554000854492,
"gpu_memory_used": 4729.0
},
{
"timestamp": "2025-01-03T14:50:13.166236",
"cpu_percent": 11.62,
"ram_percent": 73.4,
"ram_used_gb": 46.71288299560547,
"gpu_memory_used": 4676.0
},
{
"timestamp": "2025-01-03T14:50:13.261611",
"cpu_percent": 8.16,
"ram_percent": 73.4,
"ram_used_gb": 46.71356201171875,
"gpu_memory_used": 4676.0
},
{
"timestamp": "2025-01-03T14:55:44.623607",
"cpu_percent": 12.92,
"ram_percent": 82.8,
"ram_used_gb": 52.65533447265625,
"gpu_memory_used": 4636.0
},
{
"timestamp": "2025-01-03T14:55:44.735410",
"cpu_percent": 15.29,
"ram_percent": 82.7,
"ram_used_gb": 52.63290786743164,
"gpu_memory_used": 4636.0
},
{
"timestamp": "2025-01-03T15:03:40.534449",
"cpu_percent": 13.88,
"ram_percent": 85.0,
"ram_used_gb": 54.050071716308594,
"gpu_memory_used": 4771.0
},
{
"timestamp": "2025-01-03T15:03:40.638708",
"cpu_percent": 12.21,
"ram_percent": 85.0,
"ram_used_gb": 54.053733825683594,
"gpu_memory_used": 4771.0
},
{
"timestamp": "2025-01-03T15:14:34.159142",
"cpu_percent": 14.51,
"ram_percent": 78.1,
"ram_used_gb": 49.70396423339844,
"gpu_memory_used": 4739.0
}
]
}

View file

@ -1,19 +0,0 @@
=== Benchmark Statistics (with correct RTF) ===
Overall Stats:
Total tokens processed: 5500
Total audio generated: 1741.65s
Total test duration: 831.07s
Average processing rate: 6.72 tokens/second
Average RTF: 0.47x
Per-chunk Stats:
Average chunk size: 550.00 tokens
Min chunk size: 100.00 tokens
Max chunk size: 1000.00 tokens
Average processing time: 82.70s
Average output length: 174.17s
Performance Ranges:
Processing rate range: 5.63 - 7.17 tokens/second
RTF range: 0.44x - 0.55x

View file

@ -1,9 +0,0 @@
=== Benchmark Statistics (with correct RTF) ===
Overall Stats:
Total tokens processed: 150850
Total audio generated: 46786.59s
Total test duration: 2012.90s
Average processing rate: 104.34 tokens/second
Average RTF: 0.03x

View file

@ -2,22 +2,22 @@
Total tokens processed: 1800
Total audio generated (s): 568.53
Total test duration (s): 244.10
Average processing rate (tokens/s): 7.34
Average RTF: 0.43
Average Real Time Speed: 2.33
Total test duration (s): 306.02
Average processing rate (tokens/s): 5.75
Average RTF: 0.55
Average Real Time Speed: 1.81
=== Per-chunk Stats ===
Average chunk size (tokens): 600.00
Min chunk size (tokens): 300
Max chunk size (tokens): 900
Average processing time (s): 81.30
Average processing time (s): 101.89
Average output length (s): 189.51
=== Performance Ranges ===
Processing rate range (tokens/s): 7.21 - 7.47
RTF range: 0.43x - 0.43x
Real Time Speed range: 2.33x - 2.33x
Processing rate range (tokens/s): 5.30 - 6.26
RTF range: 0.51x - 0.59x
Real Time Speed range: 1.69x - 1.96x

View file

@ -0,0 +1,337 @@
{
"individual_runs": [
{
"text_length": 37,
"token_count": null,
"total_time": 1.818483829498291,
"time_to_first_chunk": 1.8067498207092285,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens10_run1_stream.wav",
"audio_length": 3.45,
"target_tokens": 10,
"actual_tokens": 10,
"run_number": 1
},
{
"text_length": 37,
"token_count": null,
"total_time": 1.6271553039550781,
"time_to_first_chunk": 1.610968828201294,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens10_run2_stream.wav",
"audio_length": 3.45,
"target_tokens": 10,
"actual_tokens": 10,
"run_number": 2
},
{
"text_length": 37,
"token_count": null,
"total_time": 1.5759549140930176,
"time_to_first_chunk": 1.561316967010498,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens10_run3_stream.wav",
"audio_length": 3.45,
"target_tokens": 10,
"actual_tokens": 10,
"run_number": 3
},
{
"text_length": 37,
"token_count": null,
"total_time": 1.615680456161499,
"time_to_first_chunk": 1.6035709381103516,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens10_run4_stream.wav",
"audio_length": 3.45,
"target_tokens": 10,
"actual_tokens": 10,
"run_number": 4
},
{
"text_length": 37,
"token_count": null,
"total_time": 1.6515357494354248,
"time_to_first_chunk": 1.6268820762634277,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens10_run5_stream.wav",
"audio_length": 3.45,
"target_tokens": 10,
"actual_tokens": 10,
"run_number": 5
},
{
"text_length": 212,
"token_count": null,
"total_time": 7.368175268173218,
"time_to_first_chunk": 3.4540352821350098,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens50_run1_stream.wav",
"audio_length": 15.825,
"target_tokens": 50,
"actual_tokens": 50,
"run_number": 1
},
{
"text_length": 212,
"token_count": null,
"total_time": 6.931752443313599,
"time_to_first_chunk": 3.1553661823272705,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens50_run2_stream.wav",
"audio_length": 15.825,
"target_tokens": 50,
"actual_tokens": 50,
"run_number": 2
},
{
"text_length": 212,
"token_count": null,
"total_time": 6.867500066757202,
"time_to_first_chunk": 3.127124309539795,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens50_run3_stream.wav",
"audio_length": 15.825,
"target_tokens": 50,
"actual_tokens": 50,
"run_number": 3
},
{
"text_length": 212,
"token_count": null,
"total_time": 6.933881521224976,
"time_to_first_chunk": 3.1872360706329346,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens50_run4_stream.wav",
"audio_length": 15.825,
"target_tokens": 50,
"actual_tokens": 50,
"run_number": 4
},
{
"text_length": 212,
"token_count": null,
"total_time": 7.605916738510132,
"time_to_first_chunk": 3.6397976875305176,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens50_run5_stream.wav",
"audio_length": 15.825,
"target_tokens": 50,
"actual_tokens": 50,
"run_number": 5
},
{
"text_length": 448,
"token_count": null,
"total_time": 14.777218580245972,
"time_to_first_chunk": 3.625889778137207,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens100_run1_stream.wav",
"audio_length": 30.35,
"target_tokens": 100,
"actual_tokens": 100,
"run_number": 1
},
{
"text_length": 448,
"token_count": null,
"total_time": 13.911701202392578,
"time_to_first_chunk": 3.298157215118408,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens100_run2_stream.wav",
"audio_length": 30.35,
"target_tokens": 100,
"actual_tokens": 100,
"run_number": 2
},
{
"text_length": 448,
"token_count": null,
"total_time": 14.451806783676147,
"time_to_first_chunk": 3.8353848457336426,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens100_run3_stream.wav",
"audio_length": 30.35,
"target_tokens": 100,
"actual_tokens": 100,
"run_number": 3
},
{
"text_length": 448,
"token_count": null,
"total_time": 13.941124200820923,
"time_to_first_chunk": 3.3754897117614746,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens100_run4_stream.wav",
"audio_length": 30.35,
"target_tokens": 100,
"actual_tokens": 100,
"run_number": 4
},
{
"text_length": 448,
"token_count": null,
"total_time": 15.717307329177856,
"time_to_first_chunk": 3.6421003341674805,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens100_run5_stream.wav",
"audio_length": 30.35,
"target_tokens": 100,
"actual_tokens": 100,
"run_number": 5
},
{
"text_length": 1140,
"token_count": null,
"total_time": 41.16162133216858,
"time_to_first_chunk": 3.7044918537139893,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens250_run1_stream.wav",
"audio_length": 78.175,
"target_tokens": 250,
"actual_tokens": 250,
"run_number": 1
},
{
"text_length": 1140,
"token_count": null,
"total_time": 35.43009877204895,
"time_to_first_chunk": 3.1040024757385254,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens250_run2_stream.wav",
"audio_length": 78.175,
"target_tokens": 250,
"actual_tokens": 250,
"run_number": 2
},
{
"text_length": 1140,
"token_count": null,
"total_time": 35.285505294799805,
"time_to_first_chunk": 3.657808780670166,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens250_run3_stream.wav",
"audio_length": 78.175,
"target_tokens": 250,
"actual_tokens": 250,
"run_number": 3
},
{
"text_length": 1140,
"token_count": null,
"total_time": 34.47842836380005,
"time_to_first_chunk": 3.2033851146698,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens250_run4_stream.wav",
"audio_length": 78.175,
"target_tokens": 250,
"actual_tokens": 250,
"run_number": 4
},
{
"text_length": 1140,
"token_count": null,
"total_time": 36.50936222076416,
"time_to_first_chunk": 3.1159815788269043,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens250_run5_stream.wav",
"audio_length": 78.175,
"target_tokens": 250,
"actual_tokens": 250,
"run_number": 5
},
{
"text_length": 2232,
"token_count": null,
"total_time": 86.84899735450745,
"time_to_first_chunk": 5.405678987503052,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens500_run1_stream.wav",
"audio_length": 155.125,
"target_tokens": 500,
"actual_tokens": 500,
"run_number": 1
},
{
"text_length": 2232,
"token_count": null,
"total_time": 74.72578477859497,
"time_to_first_chunk": 3.966891050338745,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens500_run2_stream.wav",
"audio_length": 155.125,
"target_tokens": 500,
"actual_tokens": 500,
"run_number": 2
},
{
"text_length": 2232,
"token_count": null,
"total_time": 68.1974081993103,
"time_to_first_chunk": 3.27712082862854,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens500_run3_stream.wav",
"audio_length": 155.125,
"target_tokens": 500,
"actual_tokens": 500,
"run_number": 3
},
{
"text_length": 2232,
"token_count": null,
"total_time": 72.68819260597229,
"time_to_first_chunk": 3.153608560562134,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens500_run4_stream.wav",
"audio_length": 155.125,
"target_tokens": 500,
"actual_tokens": 500,
"run_number": 4
},
{
"text_length": 2232,
"token_count": null,
"total_time": 67.94887590408325,
"time_to_first_chunk": 3.954728841781616,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens500_run5_stream.wav",
"audio_length": 155.125,
"target_tokens": 500,
"actual_tokens": 500,
"run_number": 5
}
],
"summary": {
"10": {
"avg_time_to_first_chunk": 1.642,
"avg_total_time": 1.658,
"avg_audio_length": 3.45,
"num_successful_runs": 5
},
"50": {
"avg_time_to_first_chunk": 3.313,
"avg_total_time": 7.141,
"avg_audio_length": 15.825,
"num_successful_runs": 5
},
"100": {
"avg_time_to_first_chunk": 3.555,
"avg_total_time": 14.56,
"avg_audio_length": 30.35,
"num_successful_runs": 5
},
"250": {
"avg_time_to_first_chunk": 3.357,
"avg_total_time": 36.573,
"avg_audio_length": 78.175,
"num_successful_runs": 5
},
"500": {
"avg_time_to_first_chunk": 3.952,
"avg_total_time": 74.082,
"avg_audio_length": 155.125,
"num_successful_runs": 5
}
},
"timestamp": "2025-01-06 03:31:37"
}

View file

@ -0,0 +1,337 @@
{
"individual_runs": [
{
"text_length": 37,
"token_count": null,
"total_time": 1.638200044631958,
"time_to_first_chunk": 1.6232295036315918,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream_openai\\benchmark_tokens10_run1_stream_openai.wav",
"audio_length": 3.45,
"target_tokens": 10,
"actual_tokens": 10,
"run_number": 1
},
{
"text_length": 37,
"token_count": null,
"total_time": 1.4960439205169678,
"time_to_first_chunk": 1.4854960441589355,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream_openai\\benchmark_tokens10_run2_stream_openai.wav",
"audio_length": 3.45,
"target_tokens": 10,
"actual_tokens": 10,
"run_number": 2
},
{
"text_length": 37,
"token_count": null,
"total_time": 1.5055279731750488,
"time_to_first_chunk": 1.4948456287384033,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream_openai\\benchmark_tokens10_run3_stream_openai.wav",
"audio_length": 3.45,
"target_tokens": 10,
"actual_tokens": 10,
"run_number": 3
},
{
"text_length": 37,
"token_count": null,
"total_time": 1.496837854385376,
"time_to_first_chunk": 1.4835176467895508,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream_openai\\benchmark_tokens10_run4_stream_openai.wav",
"audio_length": 3.45,
"target_tokens": 10,
"actual_tokens": 10,
"run_number": 4
},
{
"text_length": 37,
"token_count": null,
"total_time": 1.7330272197723389,
"time_to_first_chunk": 1.7219843864440918,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream_openai\\benchmark_tokens10_run5_stream_openai.wav",
"audio_length": 3.45,
"target_tokens": 10,
"actual_tokens": 10,
"run_number": 5
},
{
"text_length": 212,
"token_count": null,
"total_time": 6.865253925323486,
"time_to_first_chunk": 3.1809072494506836,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream_openai\\benchmark_tokens50_run1_stream_openai.wav",
"audio_length": 15.825,
"target_tokens": 50,
"actual_tokens": 50,
"run_number": 1
},
{
"text_length": 212,
"token_count": null,
"total_time": 7.975425720214844,
"time_to_first_chunk": 3.2910428047180176,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream_openai\\benchmark_tokens50_run2_stream_openai.wav",
"audio_length": 15.825,
"target_tokens": 50,
"actual_tokens": 50,
"run_number": 2
},
{
"text_length": 212,
"token_count": null,
"total_time": 6.793715715408325,
"time_to_first_chunk": 3.210068464279175,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream_openai\\benchmark_tokens50_run3_stream_openai.wav",
"audio_length": 15.825,
"target_tokens": 50,
"actual_tokens": 50,
"run_number": 3
},
{
"text_length": 212,
"token_count": null,
"total_time": 6.639606237411499,
"time_to_first_chunk": 3.0641400814056396,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream_openai\\benchmark_tokens50_run4_stream_openai.wav",
"audio_length": 15.825,
"target_tokens": 50,
"actual_tokens": 50,
"run_number": 4
},
{
"text_length": 212,
"token_count": null,
"total_time": 8.100529193878174,
"time_to_first_chunk": 3.3910109996795654,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream_openai\\benchmark_tokens50_run5_stream_openai.wav",
"audio_length": 15.825,
"target_tokens": 50,
"actual_tokens": 50,
"run_number": 5
},
{
"text_length": 448,
"token_count": null,
"total_time": 15.246968984603882,
"time_to_first_chunk": 3.1980819702148438,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream_openai\\benchmark_tokens100_run1_stream_openai.wav",
"audio_length": 30.35,
"target_tokens": 100,
"actual_tokens": 100,
"run_number": 1
},
{
"text_length": 448,
"token_count": null,
"total_time": 15.934760332107544,
"time_to_first_chunk": 4.23082709312439,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream_openai\\benchmark_tokens100_run2_stream_openai.wav",
"audio_length": 30.35,
"target_tokens": 100,
"actual_tokens": 100,
"run_number": 2
},
{
"text_length": 448,
"token_count": null,
"total_time": 13.799078226089478,
"time_to_first_chunk": 3.42996883392334,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream_openai\\benchmark_tokens100_run3_stream_openai.wav",
"audio_length": 30.35,
"target_tokens": 100,
"actual_tokens": 100,
"run_number": 3
},
{
"text_length": 448,
"token_count": null,
"total_time": 13.400063037872314,
"time_to_first_chunk": 3.2097883224487305,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream_openai\\benchmark_tokens100_run4_stream_openai.wav",
"audio_length": 30.35,
"target_tokens": 100,
"actual_tokens": 100,
"run_number": 4
},
{
"text_length": 448,
"token_count": null,
"total_time": 14.833694219589233,
"time_to_first_chunk": 3.1589744091033936,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream_openai\\benchmark_tokens100_run5_stream_openai.wav",
"audio_length": 30.35,
"target_tokens": 100,
"actual_tokens": 100,
"run_number": 5
},
{
"text_length": 1140,
"token_count": null,
"total_time": 35.49378156661987,
"time_to_first_chunk": 3.852027177810669,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream_openai\\benchmark_tokens250_run1_stream_openai.wav",
"audio_length": 78.175,
"target_tokens": 250,
"actual_tokens": 250,
"run_number": 1
},
{
"text_length": 1140,
"token_count": null,
"total_time": 33.59433174133301,
"time_to_first_chunk": 3.2059006690979004,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream_openai\\benchmark_tokens250_run2_stream_openai.wav",
"audio_length": 78.175,
"target_tokens": 250,
"actual_tokens": 250,
"run_number": 2
},
{
"text_length": 1140,
"token_count": null,
"total_time": 34.23120045661926,
"time_to_first_chunk": 3.1464977264404297,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream_openai\\benchmark_tokens250_run3_stream_openai.wav",
"audio_length": 78.175,
"target_tokens": 250,
"actual_tokens": 250,
"run_number": 3
},
{
"text_length": 1140,
"token_count": null,
"total_time": 36.18487215042114,
"time_to_first_chunk": 3.188844919204712,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream_openai\\benchmark_tokens250_run4_stream_openai.wav",
"audio_length": 78.175,
"target_tokens": 250,
"actual_tokens": 250,
"run_number": 4
},
{
"text_length": 1140,
"token_count": null,
"total_time": 38.142744302749634,
"time_to_first_chunk": 3.6997063159942627,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream_openai\\benchmark_tokens250_run5_stream_openai.wav",
"audio_length": 78.175,
"target_tokens": 250,
"actual_tokens": 250,
"run_number": 5
},
{
"text_length": 2232,
"token_count": null,
"total_time": 71.48920440673828,
"time_to_first_chunk": 3.148237943649292,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream_openai\\benchmark_tokens500_run1_stream_openai.wav",
"audio_length": 155.125,
"target_tokens": 500,
"actual_tokens": 500,
"run_number": 1
},
{
"text_length": 2232,
"token_count": null,
"total_time": 73.53017520904541,
"time_to_first_chunk": 3.464594841003418,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream_openai\\benchmark_tokens500_run2_stream_openai.wav",
"audio_length": 155.125,
"target_tokens": 500,
"actual_tokens": 500,
"run_number": 2
},
{
"text_length": 2232,
"token_count": null,
"total_time": 75.52278685569763,
"time_to_first_chunk": 3.5506417751312256,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream_openai\\benchmark_tokens500_run3_stream_openai.wav",
"audio_length": 155.125,
"target_tokens": 500,
"actual_tokens": 500,
"run_number": 3
},
{
"text_length": 2232,
"token_count": null,
"total_time": 69.45922994613647,
"time_to_first_chunk": 3.495962619781494,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream_openai\\benchmark_tokens500_run4_stream_openai.wav",
"audio_length": 155.125,
"target_tokens": 500,
"actual_tokens": 500,
"run_number": 4
},
{
"text_length": 2232,
"token_count": null,
"total_time": 66.66928672790527,
"time_to_first_chunk": 3.301323175430298,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream_openai\\benchmark_tokens500_run5_stream_openai.wav",
"audio_length": 155.125,
"target_tokens": 500,
"actual_tokens": 500,
"run_number": 5
}
],
"summary": {
"10": {
"avg_time_to_first_chunk": 1.562,
"avg_total_time": 1.574,
"avg_audio_length": 3.45,
"num_successful_runs": 5
},
"50": {
"avg_time_to_first_chunk": 3.227,
"avg_total_time": 7.275,
"avg_audio_length": 15.825,
"num_successful_runs": 5
},
"100": {
"avg_time_to_first_chunk": 3.446,
"avg_total_time": 14.643,
"avg_audio_length": 30.35,
"num_successful_runs": 5
},
"250": {
"avg_time_to_first_chunk": 3.419,
"avg_total_time": 35.529,
"avg_audio_length": 78.175,
"num_successful_runs": 5
},
"500": {
"avg_time_to_first_chunk": 3.392,
"avg_total_time": 71.334,
"avg_audio_length": 155.125,
"num_successful_runs": 5
}
},
"timestamp": "2025-01-06 03:42:32"
}
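Both streaming benchmark files above share this schema, and each `summary` block is just a per-token-count average over the successful entries in `individual_runs`. A minimal aggregation sketch (illustrative only; the benchmark script's own implementation is not shown in this diff):

```python
import json
from collections import defaultdict


def summarize(path: str) -> dict:
    """Rebuild the 'summary' block from 'individual_runs' in a benchmark JSON file."""
    with open(path) as f:
        runs = json.load(f)["individual_runs"]

    grouped = defaultdict(list)
    for run in runs:
        if run["error"] is None:
            grouped[run["target_tokens"]].append(run)

    return {
        str(tokens): {
            "avg_time_to_first_chunk": round(sum(r["time_to_first_chunk"] for r in group) / len(group), 3),
            "avg_total_time": round(sum(r["total_time"] for r in group) / len(group), 3),
            "avg_audio_length": round(sum(r["audio_length"] for r in group) / len(group), 3),
            "num_successful_runs": len(group),
        }
        for tokens, group in sorted(grouped.items())
    }
```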

View file

@ -1,23 +1,23 @@
=== Benchmark Statistics (with correct RTF) ===
Total tokens processed: 17150
Total audio generated (s): 5296.38
Total test duration (s): 155.23
Average processing rate (tokens/s): 102.86
Average RTF: 0.03
Average Real Time Speed: 31.25
Total tokens processed: 3150
Total audio generated (s): 994.22
Total test duration (s): 73.81
Average processing rate (tokens/s): 49.36
Average RTF: 0.07
Average Real Time Speed: 15.00
=== Per-chunk Stats ===
Average chunk size (tokens): 1715.00
Average chunk size (tokens): 525.00
Min chunk size (tokens): 150
Max chunk size (tokens): 5000
Average processing time (s): 15.39
Average output length (s): 529.64
Max chunk size (tokens): 900
Average processing time (s): 12.12
Average output length (s): 165.70
=== Performance Ranges ===
Processing rate range (tokens/s): 80.65 - 125.10
RTF range: 0.03x - 0.04x
Real Time Speed range: 25.00x - 33.33x
Processing rate range (tokens/s): 30.33 - 63.56
RTF range: 0.05x - 0.10x
Real Time Speed range: 10.00x - 20.00x
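For orientation, the RTF and real-time-speed figures in both versions of these stats are per-chunk ratios of processing time to generated audio length, and the reported averages are taken over chunks (so they need not equal the ratio of the totals). A minimal sketch with illustrative names, not the benchmark script's actual variables:

```python
def chunk_metrics(processing_time_s: float, audio_length_s: float) -> tuple[float, float]:
    """Real-time factor (lower is faster) and real-time speed (higher is faster) for one chunk."""
    rtf = processing_time_s / audio_length_s
    return rtf, 1.0 / rtf


# e.g. a chunk that takes 10 s to generate 150 s of audio:
#   rtf = 10 / 150 ≈ 0.067, real-time speed = 150 / 10 = 15x
```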

Binary file not shown. (Before: 231 KiB → After: 230 KiB)

Binary file not shown. (Before: 181 KiB → After: 206 KiB)

Binary file not shown. (Before: 454 KiB → After: 491 KiB)

Binary file not shown. (After: 238 KiB)

Binary file not shown. (After: 236 KiB)

Binary file not shown. (After: 226 KiB)

Binary file not shown. (After: 236 KiB)

Binary file not shown. (Before: 764 KiB)

Binary file not shown. (Before: 238 KiB → After: 224 KiB)

Binary file not shown. (Before: 250 KiB → After: 221 KiB)

Binary file not shown. (Before: 459 KiB → After: 463 KiB)

Binary file not shown. (Before: 198 KiB)

Binary file not shown. (After: 238 KiB)

Binary file not shown. (After: 260 KiB)

View file

@ -0,0 +1,198 @@
#!/usr/bin/env python3
"""Script to generate all plots needed for the README."""
import os
import sys
import shutil
from pathlib import Path
from validate_wav import validate_tts
# Get absolute paths
script_dir = Path(__file__).parent.resolve()
project_root = script_dir.parent.parent
# Add directories to Python path for imports
sys.path.append(str(script_dir))
sys.path.append(str(script_dir / "benchmarks"))
# Import test scripts
from benchmark_tts_rtf import main as benchmark_rtf
from test_formats.test_audio_formats import main as test_formats
from benchmark_first_token_stream_unified import main as benchmark_stream
from test_combinations.test_analyze_combined_voices import main as test_voice_analysis
# Remove directories from path after imports
sys.path.remove(str(script_dir))
sys.path.remove(str(script_dir / "benchmarks"))
def ensure_assets_dir():
"""Create assets directory if it doesn't exist."""
assets_dir = project_root / "assets"
assets_dir.mkdir(exist_ok=True)
return assets_dir
def copy_plot(src_path: str, dest_name: str, assets_dir: Path):
"""Copy a plot to the assets directory with a new name."""
if os.path.exists(src_path):
shutil.copy2(src_path, assets_dir / dest_name)
print(f"Copied {src_path} to {assets_dir / dest_name}")
else:
print(f"Warning: Source plot not found at {src_path}")
def validate_and_print(wav_path: str, category: str):
"""Validate a WAV file and print results."""
if not os.path.exists(wav_path):
print(f"Warning: WAV file not found at {wav_path}")
return
print(f"\n=== Validating {category} Audio ===")
result = validate_tts(wav_path)
if "error" in result:
print(f"Error: {result['error']}")
else:
print(f"Duration: {result['duration']}")
print(f"Sample Rate: {result['sample_rate']} Hz")
print(f"Peak Amplitude: {result['peak_amplitude']}")
print(f"RMS Level: {result['rms_level']}")
if result["issues"]:
print("\nIssues Found:")
for issue in result["issues"]:
print(f"- {issue}")
else:
print("\nNo issues found")
def main():
"""Generate all plots needed for the README."""
# Ensure assets directory exists
prefix = "gpu"
assets_dir = ensure_assets_dir()
print("\n=== Generating Format Comparison Plot ===")
test_formats()
copy_plot(
str(script_dir / "test_formats/output/test_formats/format_comparison.png"),
"format_comparison.png",
assets_dir,
)
# Validate WAV output from format test
validate_and_print(
str(script_dir / "test_formats/output/test_formats/speech.wav"),
"Format Test WAV",
)
print("\n=== Generating Voice Analysis Plot ===")
test_voice_analysis()
copy_plot(
str(script_dir / "test_combinations/output/analysis_comparison.png"),
"voice_analysis.png",
assets_dir,
)
# Validate combined voice output
validate_and_print(
str(
script_dir
/ "test_combinations/output/analysis_combined_af_bella_af_nicole.wav"
),
"Combined Voice",
)
print("\n=== Generating Performance Benchmark Plots ===")
benchmark_rtf()
copy_plot(
str(script_dir / f"benchmarks/output_plots/{prefix}_processing_time_rtf.png"),
f"{prefix}_processing_time.png",
assets_dir,
)
copy_plot(
str(script_dir / f"benchmarks/output_plots/{prefix}_realtime_factor_rtf.png"),
f"{prefix}_realtime_factor.png",
assets_dir,
)
# Validate RTF benchmark output (~500 tokens)
validate_and_print(
str(script_dir / "benchmarks/output_audio/chunk_450_tokens.wav"),
"RTF Benchmark",
)
print("\n=== Generating Streaming Benchmark Plots ===")
benchmark_stream()
# Copy direct streaming plots
copy_plot(
str(script_dir / "benchmarks/output_plots/first_token_latency_stream.png"),
f"{prefix}_first_token_latency_direct.png",
assets_dir,
)
copy_plot(
str(script_dir / "benchmarks/output_plots/first_token_timeline_stream.png"),
f"{prefix}_first_token_timeline_direct.png",
assets_dir,
)
copy_plot(
str(script_dir / "benchmarks/output_plots/total_time_latency_stream.png"),
f"{prefix}_total_time_latency_direct.png",
assets_dir,
)
# Copy OpenAI streaming plots
copy_plot(
str(
script_dir / "benchmarks/output_plots/first_token_latency_stream_openai.png"
),
f"{prefix}_first_token_latency_openai.png",
assets_dir,
)
copy_plot(
str(
script_dir
/ "benchmarks/output_plots/first_token_timeline_stream_openai.png"
),
f"{prefix}_first_token_timeline_openai.png",
assets_dir,
)
copy_plot(
str(
script_dir / "benchmarks/output_plots/total_time_latency_stream_openai.png"
),
f"{prefix}_total_time_latency_openai.png",
assets_dir,
)
# Wait a moment for files to be generated
import time
time.sleep(2)
# Validate streaming outputs (~500 tokens)
validate_and_print(
str(
script_dir
/ "benchmarks/output_audio_stream/benchmark_tokens500_run1_stream.wav"
),
"Direct Streaming",
)
validate_and_print(
str(
script_dir
/ "benchmarks/output_audio_stream_openai/benchmark_tokens500_run1_stream_openai.wav"
),
"OpenAI Streaming",
)
validate_and_print(
str(script_dir / "test_formats/output/test_formats/test_audio.wav"),
"Format Test WAV",
)
print("\nAll plots have been generated and copied to the assets directory")
if __name__ == "__main__":
main()

View file

@ -73,6 +73,7 @@ def generate_speech(
"voice": voice,
"speed": 1.0,
"response_format": "wav", # Use WAV for analysis
"stream": False,
},
)
@ -193,9 +194,10 @@ def plot_analysis(audio_files: Dict[str, str], output_dir: str):
fig.patch.set_facecolor("#1a1a2e")
num_files = len(audio_files)
# Create subplot grid with proper spacing
# Create subplot grid with proper spacing for waveforms and metrics
total_rows = num_files + 2 # Add one more row for metrics
gs = plt.GridSpec(
num_files + 1, 2, height_ratios=[1.5] * num_files + [1], hspace=0.4, wspace=0.3
total_rows, 2, height_ratios=[1.5] * num_files + [1, 1], hspace=0.4, wspace=0.3
)
# Analyze all files first
@ -216,48 +218,74 @@ def plot_analysis(audio_files: Dict[str, str], output_dir: str):
# Colors for voices
colors = ["#ff2a6d", "#05d9e8", "#d1f7ff"]
# Create two subplots for metrics with similar scales
# Left subplot: Brightness and Volume
ax1 = plt.subplot(gs[num_files, 0])
metrics1 = [
# Create metrics for each subplot
metrics = [
(
"Brightness",
[chars["spectral_centroid"] / 1000 for chars in all_chars.values()],
"kHz",
),
("Volume", [chars["rms"] * 100 for chars in all_chars.values()], "RMS×100"),
]
# Right subplot: Voice Pitch and Texture
ax2 = plt.subplot(gs[num_files, 1])
metrics2 = [
(
"Voice Pitch",
[min(chars["dominant_frequencies"]) for chars in all_chars.values()],
"Hz",
plt.subplot(gs[num_files, 0]),
[
(
"Volume",
[chars["rms"] * 100 for chars in all_chars.values()],
"RMS×100",
)
],
),
(
"Texture",
[chars["zero_crossing_rate"] * 1000 for chars in all_chars.values()],
"ZCR×1000",
plt.subplot(gs[num_files, 1]),
[
(
"Brightness",
[chars["spectral_centroid"] / 1000 for chars in all_chars.values()],
"kHz",
)
],
),
(
plt.subplot(gs[num_files + 1, 0]),
[
(
"Voice Pitch",
[
min(chars["dominant_frequencies"])
for chars in all_chars.values()
],
"Hz",
)
],
),
(
plt.subplot(gs[num_files + 1, 1]),
[
(
"Texture",
[
chars["zero_crossing_rate"] * 1000
for chars in all_chars.values()
],
"ZCR×1000",
)
],
),
]
def plot_grouped_bars(ax, metrics, show_legend=True):
n_groups = len(metrics)
# Plot each metric
for i, (ax, metric_data) in enumerate(metrics):
n_voices = len(audio_files)
bar_width = 0.25
indices = np.array([0])
indices = np.arange(n_groups)
values = metric_data[0][1]
max_val = max(values)
# Get max value for y-axis scaling
max_val = max(max(m[1]) for m in metrics)
for i, (voice, color) in enumerate(zip(audio_files.keys(), colors)):
values = [m[1][i] for m in metrics]
offset = (i - n_voices / 2 + 0.5) * bar_width
for j, (voice, color) in enumerate(zip(audio_files.keys(), colors)):
offset = (j - n_voices / 2 + 0.5) * bar_width
bars = ax.bar(
indices + offset, values, bar_width, label=voice, color=color, alpha=0.8
indices + offset,
[values[j]],
bar_width,
label=voice,
color=color,
alpha=0.8,
)
# Add value labels on top of bars
@ -274,12 +302,12 @@ def plot_analysis(audio_files: Dict[str, str], output_dir: str):
)
ax.set_xticks(indices)
ax.set_xticklabels([f"{m[0]}\n({m[2]})" for m in metrics])
# Set y-axis limits with some padding
ax.set_xticklabels([f"{metric_data[0][0]}\n({metric_data[0][2]})"])
ax.set_ylim(0, max_val * 1.2)
ax.set_ylabel("Value")
if show_legend:
# Only show legend on first metric plot
if i == 0:
ax.legend(
bbox_to_anchor=(1.05, 1),
loc="upper left",
@ -287,22 +315,11 @@ def plot_analysis(audio_files: Dict[str, str], output_dir: str):
edgecolor="#ffffff",
)
# Plot both subplots
plot_grouped_bars(ax1, metrics1, show_legend=True)
plot_grouped_bars(ax2, metrics2, show_legend=False)
# Style the subplot
setup_plot(fig, ax, metric_data[0][0])
# Style both subplots
setup_plot(fig, ax1, "Brightness and Volume")
setup_plot(fig, ax2, "Voice Pitch and Texture")
# Add y-axis labels
ax1.set_ylabel("Value")
ax2.set_ylabel("Value")
# Adjust the figure size to accommodate the legend
fig.set_size_inches(15, 15)
# Add padding around the entire figure
# Adjust the figure size and padding
fig.set_size_inches(15, 20)
plt.subplots_adjust(right=0.85, top=0.95, bottom=0.05, left=0.1)
plt.savefig(os.path.join(output_dir, "analysis_comparison.png"), dpi=300)
print(f"Saved analysis comparison to {output_dir}/analysis_comparison.png")
@ -332,7 +349,7 @@ def main():
)
parser.add_argument("--url", default="http://localhost:8880", help="API base URL")
parser.add_argument(
"--output-dir",
"--output-dir",
default="examples/assorted_checks/test_combinations/output",
help="Output directory for audio files",
)

View file

@ -66,26 +66,27 @@ def plot_format_comparison(stats: list, output_dir: str):
for i, stat in enumerate(stats):
format_name = stat["format"].upper()
try:
# Handle PCM format differently
if stat["format"] == "pcm":
# Read raw PCM data (16-bit mono)
with open(
os.path.join(output_dir, f"test_audio.{stat['format']}"), "rb"
) as f:
raw_data = f.read()
data = np.frombuffer(raw_data, dtype=np.int16)
data = data.astype(np.float32) / 32768.0 # Convert to float [-1, 1]
sr = 24000
else:
# Read other formats with soundfile
data, sr = sf.read(
os.path.join(output_dir, f"test_audio.{stat['format']}")
)
file_path = os.path.join(output_dir, f"test_audio.{stat['format']}")
# Plot waveform
if stat["format"] == "wav":
# Use scipy.io.wavfile for WAV files
sr, data = wavfile.read(file_path)
data = data.astype(np.float32) / 32768.0 # Convert to float [-1, 1]
elif stat["format"] == "pcm":
# Read raw 16-bit signed little-endian PCM data at 24kHz
data = np.frombuffer(
open(file_path, "rb").read(), dtype="<i2"
) # '<i2' means little-endian 16-bit signed int
data = data.astype(np.float32) / 32768.0 # Convert to float [-1, 1]
sr = 24000 # Known sample rate for our endpoint
else:
# Use soundfile for other formats (mp3, opus, flac)
data, sr = sf.read(file_path)
# Plot waveform with consistent normalization
ax = plt.subplot(gs_waves[i])
time = np.arange(len(data)) / sr
plt.plot(time, data / np.max(np.abs(data)), linewidth=0.5, color="#ff2a6d")
plt.plot(time, data, linewidth=0.5, color="#ff2a6d")
ax.set_xlabel("Time (seconds)")
ax.set_ylabel("")
ax.set_ylim(-1.1, 1.1)
@ -200,41 +201,42 @@ def get_audio_stats(file_path: str) -> dict:
"""Get audio file statistics"""
file_size = os.path.getsize(file_path)
file_size_kb = file_size / 1024 # Convert to KB
format_name = Path(file_path).suffix[1:]
try:
# Try reading with soundfile first
if format_name == "wav":
# Use scipy.io.wavfile for WAV files
sample_rate, data = wavfile.read(file_path)
data = data.astype(np.float32) / 32768.0 # Convert to float [-1, 1]
duration = len(data) / sample_rate
channels = 1 if len(data.shape) == 1 else data.shape[1]
elif format_name == "pcm":
# For PCM, read raw 16-bit signed little-endian PCM data at 24kHz
data = np.frombuffer(
open(file_path, "rb").read(), dtype="<i2"
) # '<i2' means little-endian 16-bit signed int
data = data.astype(np.float32) / 32768.0 # Normalize to [-1, 1]
sample_rate = 24000 # Known sample rate for our endpoint
duration = len(data) / sample_rate
channels = 1
else:
# Use soundfile for other formats (mp3, opus, flac)
data, sample_rate = sf.read(file_path)
duration = len(data) / sample_rate
channels = 1 if len(data.shape) == 1 else data.shape[1]
# Calculate audio statistics
stats = {
"format": Path(file_path).suffix[1:],
"file_size_kb": round(file_size_kb, 2),
"duration_seconds": round(duration, 2),
"sample_rate": sample_rate,
"channels": channels,
"min_amplitude": float(np.min(data)),
"max_amplitude": float(np.max(data)),
"mean_amplitude": float(np.mean(np.abs(data))),
"rms_amplitude": float(np.sqrt(np.mean(np.square(data)))),
}
return stats
except:
# For PCM, read raw bytes and estimate duration
with open(file_path, "rb") as f:
data = f.read()
# Assuming 16-bit PCM mono at 24kHz
samples = len(data) // 2 # 2 bytes per sample
duration = samples / 24000
return {
"format": "pcm",
"file_size_kb": round(file_size_kb, 2),
"duration_seconds": round(duration, 2),
"sample_rate": 24000,
"channels": 1,
"note": "PCM stats are estimated from raw bytes",
}
# Calculate audio statistics
stats = {
"format": format_name,
"file_size_kb": round(file_size_kb, 2),
"duration_seconds": round(duration, 2),
"sample_rate": sample_rate,
"channels": channels,
"min_amplitude": float(np.min(data)),
"max_amplitude": float(np.max(data)),
"mean_amplitude": float(np.mean(np.abs(data))),
"rms_amplitude": float(np.sqrt(np.mean(np.square(data)))),
}
return stats
def main():
@ -254,13 +256,49 @@ def main():
# Generate and save
start_time = time.time()
response = client.audio.speech.create(
model="kokoro", voice=voice, input=SAMPLE_TEXT, response_format=fmt
# Use requests with stream=False for consistent data handling
response = requests.post(
"http://localhost:8880/v1/audio/speech",
json={
"model": "kokoro",
"voice": voice,
"input": SAMPLE_TEXT,
"response_format": fmt,
"stream": False, # Explicitly disable streaming to get single complete chunk
},
stream=False,
headers={"Accept": f"audio/{fmt}"}, # Explicitly request audio format
)
generation_time = time.time() - start_time
with open(output_path, "wb") as f:
f.write(response.content)
print(f"\nResponse headers for {fmt}:")
for header, value in response.headers.items():
print(f"{header}: {value}")
print(f"Content length: {len(response.content)} bytes")
print(f"First few bytes: {response.content[:20].hex()}")
# Write the file and verify it was written correctly
try:
with open(output_path, "wb") as f:
f.write(response.content)
# Verify file was written
if not output_path.exists():
raise Exception(f"Failed to write {fmt} file")
# Check file size matches content length
written_size = output_path.stat().st_size
if written_size != len(response.content):
raise Exception(
f"File size mismatch: expected {len(response.content)} bytes, got {written_size}"
)
print(f"Successfully wrote {fmt} file")
except Exception as e:
print(f"Error writing {fmt} file: {e}")
continue
# Get stats
file_stats = get_audio_stats(str(output_path))
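Because raw PCM from this endpoint carries no header, the duration estimate in `get_audio_stats` above reduces to bytes divided by bytes-per-second. A standalone sketch of that arithmetic (assumes 16-bit mono at 24 kHz, as in the script):

```python
def pcm_duration_seconds(num_bytes: int, sample_rate: int = 24000, sample_width: int = 2) -> float:
    """Duration of headerless 16-bit mono PCM audio."""
    return num_bytes / (sample_rate * sample_width)


# e.g. 759,600 bytes -> 759600 / (24000 * 2) = 15.825 s,
# matching the 50-token clips in the streaming benchmarks above.
```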

View file

@ -0,0 +1,308 @@
import re
import time
import random
import string
from typing import List, Tuple
def create_test_cases() -> List[str]:
"""Create a variety of test cases with different characteristics"""
# Helper to create random text with specific patterns
def random_text(length: int) -> str:
return "".join(
random.choice(string.ascii_letters + string.digits + " .,!?")
for _ in range(length)
)
test_cases = []
# Base test cases that hit specific patterns
base_cases = [
"Dr. Smith and Mr. Jones discussed the $1,234.56 million investment.",
"Yeah, they met at 10:30 and reviewed A.B.C. documentation with Mrs. Brown etc.",
'The temperature was 72.5 degrees (quite normal) for "this time" of year.',
"X's and Y's properties cost £50 million in the 1990s",
"こんにちは。今日は!",
]
# Add base cases
test_cases.extend(base_cases)
# Add variations with random content
for length in [100, 1000, 10000]:
# Create 3 variations of each length
for _ in range(3):
text = random_text(length)
# Insert some patterns we're looking for
text = text.replace(text[10:20], "Dr. Smith")
text = text.replace(text[30:40], "$1,234.56")
text = text.replace(text[50:60], "A.B.C. xyz")
test_cases.append(text)
return test_cases
class TextNormalizerInline:
"""Text normalizer using inline patterns"""
def normalize(self, text: str) -> str:
# Replace quotes and brackets
text = text.replace(chr(8216), "'").replace(chr(8217), "'")
text = text.replace("«", chr(8220)).replace("»", chr(8221))
text = text.replace(chr(8220), '"').replace(chr(8221), '"')
text = text.replace("(", "«").replace(")", "»")
# Handle CJK punctuation
for a, b in zip("、。!,:;?", ",.!,:;?"):
text = text.replace(a, b + " ")
text = re.sub(r"[^\S \n]", " ", text)
text = re.sub(r" +", " ", text)
text = re.sub(r"(?<=\n) +(?=\n)", "", text)
text = re.sub(r"\bD[Rr]\.(?= [A-Z])", "Doctor", text)
text = re.sub(r"\b(?:Mr\.|MR\.(?= [A-Z]))", "Mister", text)
text = re.sub(r"\b(?:Ms\.|MS\.(?= [A-Z]))", "Miss", text)
text = re.sub(r"\b(?:Mrs\.|MRS\.(?= [A-Z]))", "Mrs", text)
text = re.sub(r"\betc\.(?! [A-Z])", "etc", text)
text = re.sub(r"(?i)\b(y)eah?\b", r"\1e'a", text)
text = re.sub(
r"\d*\.\d+|\b\d{4}s?\b|(?<!:)\b(?:[1-9]|1[0-2]):[0-5]\d\b(?!:)",
split_num,
text,
)
text = re.sub(r"(?<=\d),(?=\d)", "", text)
text = re.sub(
r"(?i)[$£]\d+(?:\.\d+)?(?: hundred| thousand| (?:[bm]|tr)illion)*\b|[$£]\d+\.\d\d?\b",
handle_money,
text,
)
text = re.sub(r"\d*\.\d+", handle_decimal, text)
text = re.sub(r"(?<=\d)-(?=\d)", " to ", text)
text = re.sub(r"(?<=\d)S", " S", text)
text = re.sub(r"(?<=[BCDFGHJ-NP-TV-Z])'?s\b", "'S", text)
text = re.sub(r"(?<=X')S\b", "s", text)
text = re.sub(
r"(?:[A-Za-z]\.){2,} [a-z]", lambda m: m.group().replace(".", "-"), text
)
text = re.sub(r"(?i)(?<=[A-Z])\.(?=[A-Z])", "-", text)
return text.strip()
class TextNormalizerCompiled:
"""Text normalizer using all compiled patterns"""
def __init__(self):
self.patterns = {
"whitespace": re.compile(r"[^\S \n]"),
"multi_space": re.compile(r" +"),
"newline_space": re.compile(r"(?<=\n) +(?=\n)"),
"doctor": re.compile(r"\bD[Rr]\.(?= [A-Z])"),
"mister": re.compile(r"\b(?:Mr\.|MR\.(?= [A-Z]))"),
"miss": re.compile(r"\b(?:Ms\.|MS\.(?= [A-Z]))"),
"mrs": re.compile(r"\b(?:Mrs\.|MRS\.(?= [A-Z]))"),
"etc": re.compile(r"\betc\.(?! [A-Z])"),
"yeah": re.compile(r"(?i)\b(y)eah?\b"),
"numbers": re.compile(
r"\d*\.\d+|\b\d{4}s?\b|(?<!:)\b(?:[1-9]|1[0-2]):[0-5]\d\b(?!:)"
),
"comma_in_number": re.compile(r"(?<=\d),(?=\d)"),
"money": re.compile(
r"(?i)[$£]\d+(?:\.\d+)?(?: hundred| thousand| (?:[bm]|tr)illion)*\b|[$£]\d+\.\d\d?\b"
),
"decimal": re.compile(r"\d*\.\d+"),
"range": re.compile(r"(?<=\d)-(?=\d)"),
"s_after_number": re.compile(r"(?<=\d)S"),
"possessive_s": re.compile(r"(?<=[BCDFGHJ-NP-TV-Z])'?s\b"),
"x_possessive": re.compile(r"(?<=X')S\b"),
"initials": re.compile(r"(?:[A-Za-z]\.){2,} [a-z]"),
"single_initial": re.compile(r"(?i)(?<=[A-Z])\.(?=[A-Z])"),
}
def normalize(self, text: str) -> str:
# Replace quotes and brackets
text = text.replace(chr(8216), "'").replace(chr(8217), "'")
text = text.replace("«", chr(8220)).replace("»", chr(8221))
text = text.replace(chr(8220), '"').replace(chr(8221), '"')
text = text.replace("(", "«").replace(")", "»")
# Handle CJK punctuation
for a, b in zip("、。!,:;?", ",.!,:;?"):
text = text.replace(a, b + " ")
# Use compiled patterns
text = self.patterns["whitespace"].sub(" ", text)
text = self.patterns["multi_space"].sub(" ", text)
text = self.patterns["newline_space"].sub("", text)
text = self.patterns["doctor"].sub("Doctor", text)
text = self.patterns["mister"].sub("Mister", text)
text = self.patterns["miss"].sub("Miss", text)
text = self.patterns["mrs"].sub("Mrs", text)
text = self.patterns["etc"].sub("etc", text)
text = self.patterns["yeah"].sub(r"\1e'a", text)
text = self.patterns["numbers"].sub(split_num, text)
text = self.patterns["comma_in_number"].sub("", text)
text = self.patterns["money"].sub(handle_money, text)
text = self.patterns["decimal"].sub(handle_decimal, text)
text = self.patterns["range"].sub(" to ", text)
text = self.patterns["s_after_number"].sub(" S", text)
text = self.patterns["possessive_s"].sub("'S", text)
text = self.patterns["x_possessive"].sub("s", text)
text = self.patterns["initials"].sub(
lambda m: m.group().replace(".", "-"), text
)
text = self.patterns["single_initial"].sub("-", text)
return text.strip()
class TextNormalizerHybrid:
"""Text normalizer using hybrid approach - compile only complex/frequent patterns"""
def __init__(self):
# Only compile patterns that are complex or frequently used
self.patterns = {
"whitespace": re.compile(r"[^\S \n]"),
"numbers": re.compile(
r"\d*\.\d+|\b\d{4}s?\b|(?<!:)\b(?:[1-9]|1[0-2]):[0-5]\d\b(?!:)"
),
"money": re.compile(
r"(?i)[$£]\d+(?:\.\d+)?(?: hundred| thousand| (?:[bm]|tr)illion)*\b|[$£]\d+\.\d\d?\b"
),
"initials": re.compile(r"(?:[A-Za-z]\.){2,} [a-z]"),
}
def normalize(self, text: str) -> str:
# Replace quotes and brackets
text = text.replace(chr(8216), "'").replace(chr(8217), "'")
text = text.replace("«", chr(8220)).replace("»", chr(8221))
text = text.replace(chr(8220), '"').replace(chr(8221), '"')
text = text.replace("(", "«").replace(")", "»")
# Handle CJK punctuation
for a, b in zip("、。!,:;?", ",.!,:;?"):
text = text.replace(a, b + " ")
# Use compiled patterns for complex operations
text = self.patterns["whitespace"].sub(" ", text)
text = self.patterns["numbers"].sub(split_num, text)
text = self.patterns["money"].sub(handle_money, text)
text = self.patterns["initials"].sub(
lambda m: m.group().replace(".", "-"), text
)
# Use inline patterns for simpler operations
text = re.sub(r" +", " ", text)
text = re.sub(r"(?<=\n) +(?=\n)", "", text)
text = re.sub(r"\bD[Rr]\.(?= [A-Z])", "Doctor", text)
text = re.sub(r"\b(?:Mr\.|MR\.(?= [A-Z]))", "Mister", text)
text = re.sub(r"\b(?:Ms\.|MS\.(?= [A-Z]))", "Miss", text)
text = re.sub(r"\b(?:Mrs\.|MRS\.(?= [A-Z]))", "Mrs", text)
text = re.sub(r"\betc\.(?! [A-Z])", "etc", text)
text = re.sub(r"(?i)\b(y)eah?\b", r"\1e'a", text)
text = re.sub(r"(?<=\d),(?=\d)", "", text)
text = re.sub(r"\d*\.\d+", handle_decimal, text)
text = re.sub(r"(?<=\d)-(?=\d)", " to ", text)
text = re.sub(r"(?<=\d)S", " S", text)
text = re.sub(r"(?<=[BCDFGHJ-NP-TV-Z])'?s\b", "'S", text)
text = re.sub(r"(?<=X')S\b", "s", text)
text = re.sub(r"(?i)(?<=[A-Z])\.(?=[A-Z])", "-", text)
return text.strip()
def split_num(match: re.Match) -> str:
"""Split numbers for TTS processing"""
num = match.group(0)
if ":" in num:
h, m = num.split(":")
return f"{h} {m}"
if num.endswith("s"):
return f"{num[:-1]} s"
return num
def handle_money(match: re.Match) -> str:
"""Format money strings for TTS"""
text = match.group(0)
return text.replace("$", " dollars ").replace("£", " pounds ")
def handle_decimal(match: re.Match) -> str:
"""Format decimal numbers for TTS"""
num = match.group(0)
return num.replace(".", " point ")
def benchmark_normalizers(
test_cases: List[str], iterations: int = 100
) -> Tuple[float, float, float]:
"""Benchmark all three implementations"""
normalizers = {
"inline": TextNormalizerInline(),
"compiled": TextNormalizerCompiled(),
"hybrid": TextNormalizerHybrid(),
}
results = {}
# Test each normalizer
for name, normalizer in normalizers.items():
start = time.perf_counter()
# Run normalizations
for _ in range(iterations):
for test in test_cases:
normalizer.normalize(test)
results[name] = time.perf_counter() - start
return results
def verify_outputs(test_cases: List[str]) -> bool:
"""Verify that all implementations produce identical output"""
normalizers = {
"inline": TextNormalizerInline(),
"compiled": TextNormalizerCompiled(),
"hybrid": TextNormalizerHybrid(),
}
for test in test_cases:
results = [norm.normalize(test) for norm in normalizers.values()]
if not all(r == results[0] for r in results):
return False
return True
def main():
# Create test cases
print("Generating test cases...")
test_cases = create_test_cases()
total_chars = sum(len(t) for t in test_cases)
print(
f"Created {len(test_cases)} test cases, total size: {total_chars:,} characters"
)
# Verify output consistency
print("\nVerifying output consistency...")
if verify_outputs(test_cases):
print("✓ All implementations produce identical output")
else:
print("✗ Warning: Implementations produce different outputs!")
return
# Run benchmarks
print("\nRunning benchmarks...")
iterations = 100
results = benchmark_normalizers(test_cases, iterations)
# Print results
print(f"\nResults for {iterations} iterations: ")
for name, time_taken in results.items():
print(f"{name.capitalize()}: {time_taken:.3f}s")
main()
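As a quick spot check outside the timing loop, any of the three normalizers above can be run on a single base case; the output is not asserted here, since it depends on the pattern ordering shown:

```python
# Appended to the script above; uses the classes it defines.
sample = "Dr. Smith and Mr. Jones discussed the $1,234.56 million investment."
for name, normalizer in [
    ("inline", TextNormalizerInline()),
    ("compiled", TextNormalizerCompiled()),
    ("hybrid", TextNormalizerHybrid()),
]:
    print(f"{name:>8}: {normalizer.normalize(sample)}")
```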

View file

@ -1,218 +1,262 @@
import argparse
from typing import Any, Dict
from pathlib import Path
import numpy as np
import soundfile as sf
import argparse
from pathlib import Path
from tqdm import tqdm
def validate_tts(wav_path: str) -> dict:
"""
Quick validation checks for TTS-generated audio files to detect common artifacts.
Checks for:
- Unnatural silence gaps
- Audio glitches and artifacts
- Repeated speech segments (stuck/looping)
- Abrupt changes in speech
- Audio quality issues
Args:
wav_path: Path to audio file (wav, mp3, etc)
Returns:
Dictionary with validation results
Validation checks for TTS-generated audio files to detect common artifacts.
"""
try:
# Load audio
# Load and process audio
audio, sr = sf.read(wav_path)
if len(audio.shape) > 1:
audio = audio.mean(axis=1) # Convert to mono
# Basic audio stats
audio = np.mean(audio, axis=1)
duration = len(audio) / sr
rms = np.sqrt(np.mean(audio**2))
peak = np.max(np.abs(audio))
dc_offset = np.mean(audio)
# Calculate clipping stats if we're near peak
clip_count = np.sum(np.abs(audio) >= 0.99)
clip_percent = (clip_count / len(audio)) * 100
if clip_percent > 0:
clip_stats = f" ({clip_percent:.2e} ratio near peak)"
else:
clip_stats = " (no samples near peak)"
# Convert to dB for analysis
eps = np.finfo(float).eps
db = 20 * np.log10(np.abs(audio) + eps)
issues = []
# Check if audio is too short (likely failed generation)
if duration < 0.1: # Less than 100ms
issues.append("WARNING: Audio is suspiciously short - possible failed generation")
# 1. Check for basic audio quality
if peak >= 1.0:
# Calculate percentage of samples that are clipping
clip_count = np.sum(np.abs(audio) >= 0.99)
clip_percent = (clip_count / len(audio)) * 100
if clip_percent > 1.0: # Only warn if more than 1% of samples clip
issues.append(f"WARNING: Significant clipping detected ({clip_percent:.2e}% of samples)")
elif clip_percent > 0.01: # Add info if more than 0.01% but less than 1%
issues.append(f"INFO: Minor peak limiting detected ({clip_percent:.2e}% of samples) - likely intentional normalization")
if rms < 0.01:
# Basic quality checks
abs_audio = np.abs(audio)
stats = {
"rms": float(np.sqrt(np.mean(audio**2))),
"peak": float(np.max(abs_audio)),
"dc_offset": float(np.mean(audio)),
}
clip_count = np.sum(abs_audio >= 0.99)
clip_percent = (clip_count / len(audio)) * 100
if duration < 0.1:
issues.append(
"WARNING: Audio is suspiciously short - possible failed generation"
)
if stats["peak"] >= 1.0:
if clip_percent > 1.0:
issues.append(
f"WARNING: Significant clipping detected ({clip_percent:.2e}% of samples)"
)
elif clip_percent > 0.01:
issues.append(
f"INFO: Minor peak limiting detected ({clip_percent:.2e}% of samples)"
)
if stats["rms"] < 0.01:
issues.append("WARNING: Audio is very quiet - possible failed generation")
if abs(dc_offset) > 0.1: # DC offset is particularly bad for speech
issues.append(f"WARNING: High DC offset ({dc_offset:.3f}) - may cause audio artifacts")
# 2. Check for long silence gaps (potential TTS failures)
if abs(stats["dc_offset"]) > 0.1:
issues.append(f"WARNING: High DC offset ({stats['dc_offset']:.3f})")
# Check for long silence gaps
eps = np.finfo(float).eps
db = 20 * np.log10(abs_audio + eps)
silence_threshold = -45 # dB
min_silence = 2.0 # Only detect silences longer than 2 seconds
min_silence = 2.0 # seconds
window_size = int(min_silence * sr)
silence_count = 0
last_silence = -1
# Skip the first 0.2s for silence detection (avoid false positives at start)
start_idx = int(0.2 * sr)
for i in range(start_idx, len(db) - window_size, window_size):
window = db[i:i+window_size]
start_idx = int(0.2 * sr) # Skip first 0.2s
for i in tqdm(
range(start_idx, len(db) - window_size, window_size),
desc="Checking for silence",
):
window = db[i : i + window_size]
if np.mean(window) < silence_threshold:
# Verify the entire window is mostly silence
silent_ratio = np.mean(window < silence_threshold)
if silent_ratio > 0.9: # 90% of the window should be below threshold
if last_silence == -1 or (i/sr - last_silence) > 2.0: # Only count silences more than 2s apart
if silent_ratio > 0.9:
if last_silence == -1 or (i / sr - last_silence) > 2.0:
silence_count += 1
last_silence = i/sr
issues.append(f"WARNING: Long silence detected at {i/sr:.2f}s (duration: {min_silence:.1f}s)")
if silence_count > 2: # Only warn if there are multiple long silences
issues.append(f"WARNING: Multiple long silences found ({silence_count} total) - possible generation issue")
# 3. Check for extreme audio artifacts (changes too rapid for natural speech)
# Use a longer window to avoid flagging normal phoneme transitions
window_size = int(0.02 * sr) # 20ms window
db_smooth = np.convolve(db, np.ones(window_size)/window_size, 'same')
db_diff = np.abs(np.diff(db_smooth))
# Much higher threshold to only catch truly unnatural changes
artifact_threshold = 40 # dB
min_duration = int(0.01 * sr) # Minimum 10ms duration
# Find regions where the smoothed dB change is extreme
artifact_points = np.where(db_diff > artifact_threshold)[0]
if len(artifact_points) > 0:
# Group artifacts that are very close together
grouped_artifacts = []
current_group = [artifact_points[0]]
for i in range(1, len(artifact_points)):
if (artifact_points[i] - current_group[-1]) < min_duration:
current_group.append(artifact_points[i])
else:
if len(current_group) * (1/sr) >= 0.01: # Only keep groups lasting >= 10ms
grouped_artifacts.append(current_group)
current_group = [artifact_points[i]]
if len(current_group) * (1/sr) >= 0.01:
grouped_artifacts.append(current_group)
# Report only the most severe artifacts
for group in grouped_artifacts[:2]: # Report up to 2 worst artifacts
center_idx = group[len(group)//2]
db_change = db_diff[center_idx]
if db_change > 45: # Only report very extreme changes
issues.append(
f"WARNING: Possible audio artifact at {center_idx/sr:.2f}s "
f"({db_change:.1f}dB change over {len(group)/sr*1000:.0f}ms)"
)
# 4. Check for repeated speech segments (stuck/looping)
# Check both short and long sentence durations at audiobook speed (150-160 wpm)
for chunk_duration in [5.0, 10.0]: # 5s (~12 words) and 10s (~25 words) at ~audiobook speed
last_silence = i / sr
issues.append(
f"WARNING: Long silence detected at {i/sr:.2f}s (duration: {min_silence:.1f}s)"
)
if silence_count > 2:
issues.append(
f"WARNING: Multiple long silences found ({silence_count} total)"
)
# Detect audio artifacts
diff = np.diff(audio)
abs_diff = np.abs(diff)
window_size = min(int(0.005 * sr), 256)
window = np.ones(window_size) / window_size
local_avg_diff = np.convolve(abs_diff, window, mode="same")
spikes = (abs_diff > (10 * local_avg_diff)) & (abs_diff > 0.1)
artifact_indices = np.nonzero(spikes)[0]
artifacts = []
if len(artifact_indices) > 0:
gaps = np.diff(artifact_indices)
min_gap = int(0.005 * sr)
break_points = np.nonzero(gaps > min_gap)[0] + 1
groups = np.split(artifact_indices, break_points)
for group in groups:
if len(group) >= 5:
severity = np.max(abs_diff[group])
if severity > 0.2:
center_idx = group[len(group) // 2]
artifacts.append(
{
"time": float(
center_idx / sr
), # Ensure float for consistent timing
"severity": float(severity),
}
)
issues.append(
f"WARNING: Audio discontinuity at {center_idx/sr:.3f}s "
f"(severity: {severity:.3f})"
)
# Check for repeated speech segments
for chunk_duration in tqdm(
[0.5, 2.5, 5.0, 10.0], desc="Checking for repeated speech"
):
chunk_size = int(chunk_duration * sr)
overlap = int(0.2 * chunk_size) # 20% overlap between chunks
for i in range(0, len(audio) - 2*chunk_size, overlap):
chunk1 = audio[i:i+chunk_size]
chunk2 = audio[i+chunk_size:i+2*chunk_size]
# Ignore chunks that are mostly silence
overlap = int(0.2 * chunk_size)
for i in range(0, len(audio) - 2 * chunk_size, overlap):
chunk1 = audio[i : i + chunk_size]
chunk2 = audio[i + chunk_size : i + 2 * chunk_size]
if np.mean(np.abs(chunk1)) < 0.01 or np.mean(np.abs(chunk2)) < 0.01:
continue
try:
correlation = np.corrcoef(chunk1, chunk2)[0,1]
if not np.isnan(correlation) and correlation > 0.92: # Lower threshold for sentence-length chunks
correlation = np.corrcoef(chunk1, chunk2)[0, 1]
if not np.isnan(correlation) and correlation > 0.92:
issues.append(
f"WARNING: Possible repeated speech at {i/sr:.1f}s "
f"(~{int(chunk_duration*160/60):d} words, correlation: {correlation:.3f})"
)
break # Found repetition at this duration, try next duration
break
except:
continue
# 5. Check for extreme amplitude discontinuities (common in failed TTS)
amplitude_envelope = np.abs(audio)
window_size = sr // 10 # 100ms window for smoother envelope
smooth_env = np.convolve(amplitude_envelope, np.ones(window_size)/float(window_size), 'same')
env_diff = np.abs(np.diff(smooth_env))
# Only detect very extreme amplitude changes
jump_threshold = 0.5 # Much higher threshold
jumps = np.where(env_diff > jump_threshold)[0]
if len(jumps) > 0:
# Group jumps that are close together
grouped_jumps = []
current_group = [jumps[0]]
for i in range(1, len(jumps)):
if (jumps[i] - current_group[-1]) < 0.05 * sr: # Group within 50ms
current_group.append(jumps[i])
else:
if len(current_group) >= 3: # Only keep significant discontinuities
grouped_jumps.append(current_group)
current_group = [jumps[i]]
if len(current_group) >= 3:
grouped_jumps.append(current_group)
# Report only the most severe discontinuities
for group in grouped_jumps[:2]: # Report up to 2 worst cases
center_idx = group[len(group)//2]
jump_size = env_diff[center_idx]
if jump_size > 0.6: # Only report very extreme changes
issues.append(
f"WARNING: Possible audio discontinuity at {center_idx/sr:.2f}s "
f"({jump_size:.2f} amplitude ratio change)"
)
return {
"file": wav_path,
"duration": f"{duration:.2f}s",
"sample_rate": sr,
"peak_amplitude": f"{peak:.3f}{clip_stats}",
"rms_level": f"{rms:.3f}",
"dc_offset": f"{dc_offset:.3f}",
"peak_amplitude": f"{stats['peak']:.3f}",
"rms_level": f"{stats['rms']:.3f}",
"dc_offset": f"{stats['dc_offset']:.3f}",
"artifact_count": len(artifacts),
"artifact_locations": [a["time"] for a in artifacts],
"artifact_severities": [a["severity"] for a in artifacts],
"issues": issues,
"valid": len(issues) == 0
}
except Exception as e:
return {
"file": wav_path,
"error": str(e),
"valid": False
"valid": len(issues) == 0,
}
except Exception as e:
return {"file": wav_path, "error": str(e), "valid": False}
def generate_analysis_plots(
wav_path: str, output_dir: str, validation_result: Dict[str, Any]
):
"""
Generate analysis plots for audio file with time-aligned visualizations.
"""
import matplotlib.pyplot as plt
from scipy.signal import spectrogram
# Load audio
audio, sr = sf.read(wav_path)
if len(audio.shape) > 1:
audio = np.mean(audio, axis=1)
# Create figure with shared x-axis
fig = plt.figure(figsize=(15, 8))
gs = plt.GridSpec(2, 1, height_ratios=[1.2, 0.8], hspace=0.1)
ax1 = fig.add_subplot(gs[0])
ax2 = fig.add_subplot(gs[1], sharex=ax1)
# Calculate spectrogram
nperseg = 2048
noverlap = 1536
f, t, Sxx = spectrogram(
audio, sr, nperseg=nperseg, noverlap=noverlap, window="hann", scaling="spectrum"
)
# Plot spectrogram
im = ax1.pcolormesh(
t,
f,
10 * np.log10(Sxx + 1e-10),
shading="gouraud",
cmap="viridis",
vmin=-100,
vmax=-20,
)
ax1.set_ylabel("Frequency [Hz]", fontsize=10)
cbar = plt.colorbar(im, ax=ax1, label="dB")
ax1.set_title("Spectrogram", pad=10, fontsize=12)
# Plot waveform with exact time alignment
times = np.arange(len(audio)) / sr
ax2.plot(times, audio, color="#2E5596", alpha=0.7, linewidth=0.5, label="Audio")
ax2.set_ylabel("Amplitude", fontsize=10)
ax2.set_xlabel("Time [sec]", fontsize=10)
ax2.grid(True, alpha=0.2)
# Add artifact markers
if (
"artifact_locations" in validation_result
and validation_result["artifact_locations"]
):
for loc in validation_result["artifact_locations"]:
ax1.axvline(x=loc, color="red", alpha=0.7, linewidth=2)
ax2.axvline(
x=loc, color="red", alpha=0.7, linewidth=2, label="Detected Artifacts"
)
# Add legend to both plots
if len(validation_result["artifact_locations"]) > 0:
ax1.plot([], [], color="red", linewidth=2, label="Detected Artifacts")
ax1.legend(loc="upper right", fontsize=8)
# Only add unique labels to legend
handles, labels = ax2.get_legend_handles_labels()
unique_labels = dict(zip(labels, handles))
ax2.legend(
unique_labels.values(),
unique_labels.keys(),
loc="upper right",
fontsize=8,
)
# Set common x limits
xlim = (0, len(audio) / sr)
ax1.set_xlim(xlim)
ax2.set_xlim(xlim)
og_filename = Path(wav_path).name.split(".")[0]
# Save plot
plt.savefig(
Path(output_dir) / f"{og_filename}_audio_analysis.png",
dpi=300,
bbox_inches="tight",
)
plt.close()
if __name__ == "__main__":
parser = argparse.ArgumentParser(description="TTS Output Validator")
parser.add_argument("wav_file", help="Path to audio file to validate")
args = parser.parse_args()
result = validate_tts(args.wav_file)
wav_file = r"C:\Users\jerem\Desktop\Kokoro-FastAPI\examples\assorted_checks\benchmarks\output_audio\chunk_600_tokens.wav"
silent = False
print(f"\n\n Processing:\n\t{wav_file}")
result = validate_tts(wav_file)
if not silent:
wav_root_dir = Path(wav_file).parent
generate_analysis_plots(wav_file, wav_root_dir, result)
print(f"\nValidating: {result['file']}")
if "error" in result:
print(f"Error: {result['error']}")
@ -222,7 +266,8 @@ if __name__ == "__main__":
print(f"Peak Amplitude: {result['peak_amplitude']}")
print(f"RMS Level: {result['rms_level']}")
print(f"DC Offset: {result['dc_offset']}")
print(f"Detected Artifacts: {result['artifact_count']}")
if result["issues"]:
print("\nIssues Found:")
for issue in result["issues"]:

View file

@ -1,7 +1,9 @@
import argparse
from pathlib import Path
from validate_wav import validate_tts
def print_validation_result(result: dict, rel_path: Path):
"""Print full validation details for a single file."""
print(f"\nValidating: {rel_path}")
@ -13,7 +15,7 @@ def print_validation_result(result: dict, rel_path: Path):
print(f"Peak Amplitude: {result['peak_amplitude']}")
print(f"RMS Level: {result['rms_level']}")
print(f"DC Offset: {result['dc_offset']}")
if result["issues"]:
print("\nIssues Found:")
for issue in result["issues"]:
@ -21,25 +23,26 @@ def print_validation_result(result: dict, rel_path: Path):
else:
print("\nNo issues found")
def validate_directory(directory: str):
"""Validate all wav files in a directory with detailed output and summary."""
dir_path = Path(directory)
# Find all wav files (including nested directories)
wav_files = list(dir_path.rglob("*.wav"))
wav_files.extend(dir_path.rglob("*.mp3")) # Also check mp3s
wav_files = sorted(wav_files)
if not wav_files:
print(f"No .wav or .mp3 files found in {directory}")
return
print(f"Found {len(wav_files)} files in {directory}")
print("=" * 80)
# Store results for summary
results = []
# Detailed validation output
for wav_file in wav_files:
result = validate_tts(str(wav_file))
@ -47,7 +50,7 @@ def validate_directory(directory: str):
print_validation_result(result, rel_path)
results.append((rel_path, result))
print("=" * 80)
# Summary with detailed issues
print("\nSUMMARY:")
for rel_path, result in results:
@ -58,15 +61,18 @@ def validate_directory(directory: str):
issues = result["issues"]
first_issue = issues[0].replace("WARNING: ", "")
if len(issues) > 1:
print(f"{rel_path}: FAIL - {first_issue} (+{len(issues)-1} more issues)")
print(
f"{rel_path}: FAIL - {first_issue} (+{len(issues)-1} more issues)"
)
else:
print(f"{rel_path}: FAIL - {first_issue}")
else:
print(f"{rel_path}: PASS")
if __name__ == "__main__":
parser = argparse.ArgumentParser(description="Batch validate TTS wav files")
parser.add_argument("directory", help="Directory containing wav files to validate")
args = parser.parse_args()
validate_directory(args.directory)

BIN
examples/audio_analysis.png Normal file

Binary file not shown.


View file

@ -0,0 +1,49 @@
#!/usr/bin/env rye run python
import time
from pathlib import Path
from openai import OpenAI
# Talks to the local Kokoro-FastAPI server; no real OpenAI API key is required
openai = OpenAI(base_url="http://localhost:8880/v1", api_key="not-needed-for-local")
speech_file_path = Path(__file__).parent / "speech.mp3"
def main() -> None:
stream_to_speakers()
# Create text-to-speech audio file
with openai.audio.speech.with_streaming_response.create(
model="kokoro",
voice="af",
input="the quick brown fox jumped over the lazy dogs",
) as response:
response.stream_to_file(speech_file_path)
def stream_to_speakers() -> None:
import pyaudio
player_stream = pyaudio.PyAudio().open(format=pyaudio.paInt16, channels=1, rate=24000, output=True)
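# 24 kHz, mono, 16-bit matches the raw PCM that Kokoro streams back,
# so each received chunk can be written straight to the output device.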
start_time = time.time()
with openai.audio.speech.with_streaming_response.create(
model="kokoro",
voice="af",
response_format="pcm", # similar to WAV, but without a header chunk at the start.
input="""My dear sir, that is just where you are wrong. That is just where the whole world has gone wrong. We are always getting away from the present moment. Our mental existences, which are immaterial and have no dimensions, are passing along the Time-Dimension with a uniform velocity from the cradle to the grave. Just as we should travel down if we began our existence fifty miles above the earths surface""",
) as response:
print(f"Time to first byte: {int((time.time() - start_time) * 1000)}ms")
for chunk in response.iter_bytes(chunk_size=1024):
player_stream.write(chunk)
print(f"Done in {int((time.time() - start_time) * 1000)}ms.")
if __name__ == "__main__":
main()

View file

@ -0,0 +1,125 @@
#!/usr/bin/env python3
import requests
import numpy as np
import sounddevice as sd
import time
import os
import wave
def play_streaming_tts(text: str, output_file: str = None, voice: str = "af"):
"""Stream TTS audio and play it back in real-time"""
print("\nStarting TTS stream request...")
start_time = time.time()
# Initialize variables
sample_rate = 24000 # Known sample rate for Kokoro
audio_started = False
chunk_count = 0
total_bytes = 0
first_chunk_time = None
all_audio_data = bytearray() # Raw PCM audio data
# Start sounddevice stream with buffer
stream = sd.OutputStream(
samplerate=sample_rate,
channels=1,
dtype=np.int16,
blocksize=1024, # Buffer size in samples
latency='low' # Request low latency
)
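# Rough latency math (assuming 24 kHz mono int16): a 1024-sample block is ~43 ms of audio,
# so playback can begin almost as soon as the first block has been received.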
stream.start()
# Make streaming request to API
try:
response = requests.post(
"http://localhost:8880/v1/audio/speech",
json={
"model": "kokoro",
"input": text,
"voice": voice,
"response_format": "pcm",
"stream": True
},
stream=True,
timeout=1800
)
response.raise_for_status()
print(f"Request started successfully after {time.time() - start_time:.2f}s")
# Process streaming response with smaller chunks for lower latency
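# 512 bytes of 16-bit PCM = 256 samples, i.e. ~10.7 ms of audio per chunk at 24 kHz,
# trading a little HTTP overhead for a quicker time-to-first-sound.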
for chunk in response.iter_content(chunk_size=512): # 512 bytes = 256 samples at 16-bit
if chunk:
chunk_count += 1
total_bytes += len(chunk)
# Handle first chunk
if not audio_started:
first_chunk_time = time.time()
print(f"\nReceived first chunk after {first_chunk_time - start_time:.2f}s")
print(f"First chunk size: {len(chunk)} bytes")
audio_started = True
# Convert bytes to numpy array and play
audio_chunk = np.frombuffer(chunk, dtype=np.int16)
stream.write(audio_chunk)
# Accumulate raw audio data
all_audio_data.extend(chunk)
# Log progress every 10 chunks
if chunk_count % 10 == 0:
elapsed = time.time() - start_time
print(f"Progress: {chunk_count} chunks, {total_bytes/1024:.1f}KB received, {elapsed:.1f}s elapsed")
# Final stats
total_time = time.time() - start_time
print(f"\nStream complete:")
print(f"Total chunks: {chunk_count}")
print(f"Total data: {total_bytes/1024:.1f}KB")
print(f"Total time: {total_time:.2f}s")
print(f"Average speed: {(total_bytes/1024)/total_time:.1f}KB/s")
# Save as WAV file
if output_file:
print(f"\nWriting audio to {output_file}")
with wave.open(output_file, 'wb') as wav_file:
wav_file.setnchannels(1) # Mono
wav_file.setsampwidth(2) # 2 bytes per sample (16-bit)
wav_file.setframerate(sample_rate)
wav_file.writeframes(all_audio_data)
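# 16-bit mono at 24 kHz is 48,000 bytes per second,
# so len(all_audio_data) / 48000 approximates the clip duration in seconds.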
print(f"Saved {len(all_audio_data)} bytes of audio data")
# Clean up
stream.stop()
stream.close()
except requests.exceptions.ConnectionError as e:
print(f"Connection error - Is the server running? Error: {str(e)}")
stream.stop()
stream.close()
except Exception as e:
print(f"Error during streaming: {str(e)}")
stream.stop()
stream.close()
def main():
# Load sample text from HG Wells
script_dir = os.path.dirname(os.path.abspath(__file__))
wells_path = os.path.join(script_dir, "assorted_checks/benchmarks/the_time_machine_hg_wells.txt")
output_path = os.path.join(script_dir, "output.wav")
with open(wells_path, "r", encoding="utf-8") as f:
full_text = f.read()
# Take first few paragraphs
text = " ".join(full_text.split("\n\n")[:2])
print("\nStarting TTS stream playback...")
print(f"Text length: {len(text)} characters")
print("\nFirst 100 characters:")
print(text[:100] + "...")
play_streaming_tts(text, output_file=output_path)
if __name__ == "__main__":
main()

View file

@ -13,7 +13,7 @@ numpy==2.2.1
scipy==1.14.1
# Audio processing
soundfile==0.12.1
soundfile==0.13.0
# Text processing
phonemizer==3.3.0