- Update soundfile version
- Align with streaming standards
- Add audio processing config settings
- More comprehensive model warmup
- Minor model improvements
- Enhanced testing and benchmarking
- Cool ASCII logo
This commit is contained in:
remsky 2025-01-06 03:32:41 -07:00
parent 4c6cd83f85
commit 720c1fb97d
77 changed files with 2945 additions and 5522 deletions

BIN .coverage (binary file not shown)


@@ -129,7 +129,7 @@ response = requests.post(
)
```
<p align="center">
<img src="examples/benchmarks/analysis_comparison.png" width="80%" alt="Voice Analysis Comparison" style="border: 2px solid #333; padding: 10px;">
<img src="assets/voice_analysis.png" width="80%" alt="Voice Analysis Comparison" style="border: 2px solid #333; padding: 10px;">
</p>
</details>
@@ -144,7 +144,7 @@ response = requests.post(
- pcm
<p align="center">
<img src="examples/benchmarks/format_comparison.png" width="80%" alt="Audio Format Comparison" style="border: 2px solid #333; padding: 10px;">
<img src="assets/format_comparison.png" width="80%" alt="Audio Format Comparison" style="border: 2px solid #333; padding: 10px;">
</p>
</details>
@@ -175,8 +175,8 @@ Benchmarking was performed on generation via the local API using text lengths up
- H.G. Wells - The Time Machine (full text)
<p align="center">
<img src="examples/benchmarks/processing_time.png" width="45%" alt="Processing Time" style="border: 2px solid #333; padding: 10px; margin-right: 1%;">
<img src="examples/benchmarks/realtime_factor.png" width="45%" alt="Realtime Factor" style="border: 2px solid #333; padding: 10px;">
<img src="assets/gpu_processing_time.png" width="45%" alt="Processing Time" style="border: 2px solid #333; padding: 10px; margin-right: 1%;">
<img src="assets/gpu_realtime_factor.png" width="45%" alt="Realtime Factor" style="border: 2px solid #333; padding: 10px;">
</p>
Key Performance Metrics:


@@ -18,6 +18,8 @@ class Settings(BaseSettings):
onnx_model_path: str = "kokoro-v0_19.onnx"
voices_dir: str = "voices"
sample_rate: int = 24000
max_chunk_size: int = 300 # Maximum size of text chunks for processing
gap_trim_ms: int = 250 # Amount to trim from streaming chunk ends in milliseconds
# ONNX Optimization Settings
onnx_num_threads: int = 4 # Number of threads for intra-op parallelism
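A minimal sketch of how the new knobs can be tuned per deployment, assuming the standard pydantic `BaseSettings` environment-variable override (field names from the diff; env values hypothetical):

```python
# e.g. MAX_CHUNK_SIZE=200 GAP_TRIM_MS=100 uvicorn api.src.main:app
from api.src.core.config import settings  # assumed absolute path; the diff itself uses relative imports

print(settings.max_chunk_size)  # 300 unless MAX_CHUNK_SIZE is set in the env
print(settings.gap_trim_ms)     # 250 unless GAP_TRIM_MS is set in the env
```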


@@ -0,0 +1,9 @@
In a village of La Mancha, the name of which I have no desire to call
to mind, there lived not long since one of those gentlemen that keep a
lance in the lance-rack, an old buckler, a lean hack, and a greyhound
for coursing. An olla of rather more beef than mutton, a salad on most
nights, scraps on Saturdays, lentils on Fridays, and a pigeon or so
extra on Sundays, made away with three-quarters of his income. The rest
of it went in a doublet of fine cloth and velvet breeches and shoes to
match for holidays, while on week-days he made a brave figure in his
best homespun.


@@ -22,10 +22,11 @@ async def lifespan(app: FastAPI):
logger.info("Loading TTS model and voice packs...")
# Initialize the main model with warm-up
voicepack_count = TTSModel.setup()
voicepack_count = await TTSModel.setup()
# boundary = "█████╗"*9
boundary = "" * 30
boundary = "" * 24
startup_msg = f"""
{boundary}
@@ -37,8 +38,9 @@ async def lifespan(app: FastAPI):
{boundary}
"""
startup_msg += f"\nModel loaded and warmed up on {TTSModel.get_device()}"
startup_msg += f"\n{voicepack_count} voice packs loaded successfully\n"
# TODO: Improve CPU warmup, threads, memory, etc
startup_msg += f"\nModel warmed up on {TTSModel.get_device()}"
startup_msg += f"\n{voicepack_count} voice packs loaded\n"
startup_msg += f"\n{boundary}\n"
logger.info(startup_msg)


@@ -83,8 +83,8 @@ async def create_speech(
audio,
24000,
request.response_format,
is_first_chunk=True
)
is_first_chunk=True,
stream=False)
return Response(
content=content,


@@ -4,22 +4,30 @@ from io import BytesIO
import numpy as np
import soundfile as sf
import scipy.io.wavfile as wavfile
from loguru import logger
from ..core.config import settings
class AudioNormalizer:
"""Handles audio normalization state for a single stream"""
def __init__(self):
self.int16_max = np.iinfo(np.int16).max
self.chunk_trim_ms = settings.gap_trim_ms
self.sample_rate = 24000 # Sample rate of the audio
self.samples_to_trim = int(self.chunk_trim_ms * self.sample_rate / 1000)
def normalize(self, audio_data: np.ndarray) -> np.ndarray:
"""Normalize audio data to int16 range"""
def normalize(self, audio_data: np.ndarray, is_last_chunk: bool = False) -> np.ndarray:
"""Normalize audio data to int16 range and trim chunk boundaries"""
# Convert to float32 if not already
audio_float = audio_data.astype(np.float32)
# Normalize to [-1, 1] range first
if np.max(np.abs(audio_float)) > 0:
audio_float = audio_float / np.max(np.abs(audio_float))
# Trim end of non-final chunks to reduce gaps
if not is_last_chunk and len(audio_float) > self.samples_to_trim:
audio_float = audio_float[:-self.samples_to_trim]
# Scale to int16 range
return (audio_float * self.int16_max).astype(np.int16)
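A quick sanity sketch of the trim arithmetic above, assuming the defaults from this diff (gap_trim_ms=250 at 24 kHz, i.e. 6000 samples off each non-final chunk):

```python
import numpy as np

# AudioNormalizer as defined above; samples_to_trim = 250 * 24000 / 1000 = 6000
normalizer = AudioNormalizer()
chunk = np.random.uniform(-1.0, 1.0, 24000).astype(np.float32)  # 1 s of audio

mid = normalizer.normalize(chunk, is_last_chunk=False)
last = normalizer.normalize(chunk, is_last_chunk=True)
assert mid.dtype == np.int16 and len(mid) == 24000 - 6000  # tail trimmed
assert len(last) == 24000  # final chunk keeps its tail
```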
@@ -27,13 +35,30 @@ class AudioNormalizer:
class AudioService:
"""Service for audio format conversions"""
# Default audio format settings balanced for speed and compression
DEFAULT_SETTINGS = {
"mp3": {
"bitrate_mode": "CONSTANT", # Faster than variable bitrate
"compression_level": 0.0, # Balanced compression
},
"opus": {
"compression_level": 0.0, # Good balance for speech
},
"flac": {
"compression_level": 0.0, # Light compression, still fast
}
}
@staticmethod
def convert_audio(
audio_data: np.ndarray,
sample_rate: int,
output_format: str,
is_first_chunk: bool = True,
normalizer: AudioNormalizer = None
is_last_chunk: bool = False,
normalizer: AudioNormalizer = None,
format_settings: dict = None,
stream: bool = True
) -> bytes:
"""Convert audio data to specified format
@@ -42,6 +67,19 @@ class AudioService:
sample_rate: Sample rate of the audio
output_format: Target format (wav, mp3, opus, flac, pcm)
is_first_chunk: Whether this is the first chunk of a stream
normalizer: Optional AudioNormalizer instance for consistent normalization across chunks
format_settings: Optional dict of format-specific settings to override defaults
Example: {
"mp3": {
"bitrate_mode": "VARIABLE",
"compression_level": 0.8
}
}
Default settings balance speed and compression,
optimized for low-latency localhost use (compression level 0.0):
- MP3: constant bitrate, no compression (0.0)
- OPUS: no compression (0.0)
- FLAC: no compression (0.0)
Returns:
Bytes of the converted audio
@@ -50,31 +88,48 @@ class AudioService:
try:
# Always normalize audio to ensure proper amplitude scaling
if normalizer is None:
normalizer = AudioNormalizer()
normalized_audio = normalizer.normalize(audio_data)
if stream:
if normalizer is None:
normalizer = AudioNormalizer()
normalized_audio = normalizer.normalize(audio_data, is_last_chunk=is_last_chunk)
else:
normalized_audio = audio_data
if output_format == "pcm":
logger.info("Writing PCM data...")
# Raw 16-bit PCM samples, no header
buffer.write(normalized_audio.tobytes())
elif output_format == "wav":
logger.info("Writing to WAV format...")
# Always include WAV header for WAV format
sf.write(buffer, normalized_audio, sample_rate, format="WAV", subtype='PCM_16')
if stream:
# Use soundfile for streaming to ensure proper headers
sf.write(buffer, normalized_audio, sample_rate, format="WAV", subtype='PCM_16')
else:
# Trying scipy.io.wavfile for non-streaming WAV generation
# seems faster than soundfile
# avoids overhead from header generation and PCM encoding
wavfile.write(buffer, sample_rate, normalized_audio)
elif output_format == "mp3":
logger.info("Converting to MP3 format...")
# Use lower bitrate for streaming
sf.write(buffer, normalized_audio, sample_rate, format="MP3")
# Use format settings or defaults
settings = format_settings.get("mp3", {}) if format_settings else {}
settings = {**AudioService.DEFAULT_SETTINGS["mp3"], **settings}
sf.write(
buffer, normalized_audio,
sample_rate, format="MP3",
**settings
)
elif output_format == "opus":
logger.info("Converting to Opus format...")
# Use lower bitrate and smaller frame size for streaming
sf.write(buffer, normalized_audio, sample_rate, format="OGG", subtype="OPUS")
settings = format_settings.get("opus", {}) if format_settings else {}
settings = {**AudioService.DEFAULT_SETTINGS["opus"], **settings}
sf.write(buffer, normalized_audio, sample_rate, format="OGG",
subtype="OPUS", **settings)
elif output_format == "flac":
logger.info("Converting to FLAC format...")
# Use smaller block size for streaming
if is_first_chunk:
logger.info("Starting FLAC stream...")
settings = format_settings.get("flac", {}) if format_settings else {}
settings = {**AudioService.DEFAULT_SETTINGS["flac"], **settings}
sf.write(buffer, normalized_audio, sample_rate, format="FLAC",
subtype='PCM_16')
subtype='PCM_16', **settings)
else:
if output_format == "aac":
raise ValueError(
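Hedged usage sketch of the new convert_audio signature, merging a caller override into the defaults the same way the code above does; assumes a soundfile build with MP3 support (hence the version bump in this commit):

```python
import numpy as np

# AudioService as defined above; dummy 1 s clip at 24 kHz
audio = np.zeros(24000, dtype=np.int16)

mp3_bytes = AudioService.convert_audio(
    audio, 24000, "mp3",
    format_settings={"mp3": {"bitrate_mode": "VARIABLE", "compression_level": 0.8}},
    stream=False,  # one-shot conversion: skips per-chunk normalization/trimming
)
```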


@@ -0,0 +1,52 @@
"""Text chunking service"""
import re
from ...core.config import settings
def split_text(text: str, max_chunk=None):
"""Split text into chunks on natural pause points
Args:
text: Text to split into chunks
max_chunk: Maximum chunk size (defaults to settings.max_chunk_size)
"""
if max_chunk is None:
max_chunk = settings.max_chunk_size
if not isinstance(text, str):
text = str(text) if text is not None else ""
text = text.strip()
if not text:
return
# First split into sentences
sentences = re.split(r"(?<=[.!?])\s+", text)
for sentence in sentences:
sentence = sentence.strip()
if not sentence:
continue
# For sentences longer than max_chunk, split on intermediate punctuation
if len(sentence) > max_chunk:  # keeps chunk sizes more consistent
# First try splitting on semicolons and colons
parts = re.split(r"(?<=[;:])\s+", sentence)
for part in parts:
part = part.strip()
if not part:
continue
# If part is still long, split on commas
if len(part) > max_chunk:
subparts = re.split(r"(?<=,)\s+", part)
for subpart in subparts:
subpart = subpart.strip()
if subpart:
yield subpart
else:
yield part
else:
yield sentence
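For reference, a rough trace of the generator on a small input (split_text from the chunker above; output checked by hand against that logic):

```python
text = "One. Two, with a clause; and three!"
print(list(split_text(text, max_chunk=10)))
# ['One.', 'Two,', 'with a clause;', 'and three!']
# sentences split first, then ;/:, then commas once a piece exceeds max_chunk
```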


@@ -15,7 +15,7 @@ class TTSBaseModel(ABC):
VOICES_DIR = os.path.join(os.path.dirname(os.path.dirname(__file__)), "voices")
@classmethod
def setup(cls):
async def setup(cls):
"""Initialize model and setup voices"""
with cls._lock:
# Set device
@@ -59,19 +59,23 @@ class TTSBaseModel(ABC):
except Exception as e:
logger.error(f"Error copying voice {voice_name}: {str(e)}")
# Warm up with default voice
# Load warmup text
try:
dummy_text = "Hello"
voice_path = os.path.join(cls.VOICES_DIR, "af.pt")
dummy_voicepack = torch.load(voice_path, map_location=cls._device, weights_only=True)
# Process text and generate audio
phonemes, tokens = cls.process_text(dummy_text, "a")
cls.generate_from_tokens(tokens, dummy_voicepack, 1.0)
logger.info("Model warm-up complete")
with open(os.path.join(os.path.dirname(os.path.dirname(__file__)), "core", "don_quixote.txt")) as f:
warmup_text = f.read()
except Exception as e:
logger.warning(f"Model warm-up failed: {e}")
logger.warning(f"Failed to load warmup text: {e}")
warmup_text = "This is a warmup text that will be split into chunks for processing."
# Use warmup service
from .warmup import WarmupService
warmup = WarmupService()
# Load and warm up voices
loaded_voices = warmup.load_voices()
await warmup.warmup_voices(warmup_text, loaded_voices)
logger.info("Model warm-up complete")
# Count voices in directory
voice_count = len([f for f in os.listdir(cls.VOICES_DIR) if f.endswith(".pt")])


@@ -1,6 +1,7 @@
import os
import numpy as np
import torch
import time
from loguru import logger
from models import build_model
from .text_processing import phonemize, tokenize
@@ -8,42 +9,97 @@ from .text_processing import phonemize, tokenize
from .tts_base import TTSBaseModel
from ..core.config import settings
# @torch.no_grad()
# def forward(model, tokens, ref_s, speed):
# """Forward pass through the model"""
# device = ref_s.device
# tokens = torch.LongTensor([[0, *tokens, 0]]).to(device)
# input_lengths = torch.LongTensor([tokens.shape[-1]]).to(device)
# text_mask = length_to_mask(input_lengths).to(device)
# bert_dur = model.bert(tokens, attention_mask=(~text_mask).int())
# d_en = model.bert_encoder(bert_dur).transpose(-1, -2)
# s = ref_s[:, 128:]
# d = model.predictor.text_encoder(d_en, s, input_lengths, text_mask)
# x, _ = model.predictor.lstm(d)
# duration = model.predictor.duration_proj(x)
# duration = torch.sigmoid(duration).sum(axis=-1) / speed
# pred_dur = torch.round(duration).clamp(min=1).long()
# pred_aln_trg = torch.zeros(input_lengths, pred_dur.sum().item())
# c_frame = 0
# for i in range(pred_aln_trg.size(0)):
# pred_aln_trg[i, c_frame : c_frame + pred_dur[0, i].item()] = 1
# c_frame += pred_dur[0, i].item()
# en = d.transpose(-1, -2) @ pred_aln_trg.unsqueeze(0).to(device)
# F0_pred, N_pred = model.predictor.F0Ntrain(en, s)
# t_en = model.text_encoder(tokens, input_lengths, text_mask)
# asr = t_en @ pred_aln_trg.unsqueeze(0).to(device)
# return model.decoder(asr, F0_pred, N_pred, ref_s[:, :128]).squeeze().cpu().numpy()
@torch.no_grad()
def forward(model, tokens, ref_s, speed):
"""Forward pass through the model"""
"""Forward pass through the model with light optimizations that preserve output quality"""
device = ref_s.device
# Keep original token handling but optimize device placement
tokens = torch.LongTensor([[0, *tokens, 0]]).to(device)
input_lengths = torch.LongTensor([tokens.shape[-1]]).to(device)
text_mask = length_to_mask(input_lengths).to(device)
# BERT and encoder pass
bert_dur = model.bert(tokens, attention_mask=(~text_mask).int())
d_en = model.bert_encoder(bert_dur).transpose(-1, -2)
s = ref_s[:, 128:]
d = model.predictor.text_encoder(d_en, s, input_lengths, text_mask)
# Split reference signal once for efficiency
s_content = ref_s[:, 128:]
s_ref = ref_s[:, :128]
# Predictor forward pass
d = model.predictor.text_encoder(d_en, s_content, input_lengths, text_mask)
x, _ = model.predictor.lstm(d)
# Duration prediction - keeping original logic
duration = model.predictor.duration_proj(x)
duration = torch.sigmoid(duration).sum(axis=-1) / speed
pred_dur = torch.round(duration).clamp(min=1).long()
pred_aln_trg = torch.zeros(input_lengths, pred_dur.sum().item())
# Alignment matrix construction - keeping original approach for quality
pred_aln_trg = torch.zeros(input_lengths, pred_dur.sum().item(), device=device)
c_frame = 0
for i in range(pred_aln_trg.size(0)):
pred_aln_trg[i, c_frame : c_frame + pred_dur[0, i].item()] = 1
pred_aln_trg[i, c_frame:c_frame + pred_dur[0, i].item()] = 1
c_frame += pred_dur[0, i].item()
en = d.transpose(-1, -2) @ pred_aln_trg.unsqueeze(0).to(device)
F0_pred, N_pred = model.predictor.F0Ntrain(en, s)
# Matrix multiplications - reuse unsqueezed tensor
pred_aln_trg = pred_aln_trg.unsqueeze(0) # Do unsqueeze once
en = d.transpose(-1, -2) @ pred_aln_trg
F0_pred, N_pred = model.predictor.F0Ntrain(en, s_content)
# Text encoding and final decoding
t_en = model.text_encoder(tokens, input_lengths, text_mask)
asr = t_en @ pred_aln_trg.unsqueeze(0).to(device)
return model.decoder(asr, F0_pred, N_pred, ref_s[:, :128]).squeeze().cpu().numpy()
asr = t_en @ pred_aln_trg
return model.decoder(asr, F0_pred, N_pred, s_ref).squeeze().cpu().numpy()
# def length_to_mask(lengths):
# """Create attention mask from lengths"""
# mask = (
# torch.arange(lengths.max())
# .unsqueeze(0)
# .expand(lengths.shape[0], -1)
# .type_as(lengths)
# )
# mask = torch.gt(mask + 1, lengths.unsqueeze(1))
# return mask
def length_to_mask(lengths):
"""Create attention mask from lengths"""
mask = (
torch.arange(lengths.max())
.unsqueeze(0)
.expand(lengths.shape[0], -1)
.type_as(lengths)
)
mask = torch.gt(mask + 1, lengths.unsqueeze(1))
return mask
"""Create attention mask from lengths - possibly optimized version"""
max_len = lengths.max()
# Create mask directly on the same device as lengths
mask = torch.arange(max_len, device=lengths.device)[None, :].expand(lengths.shape[0], -1)
# Avoid type_as by using the correct dtype from the start
if lengths.dtype != mask.dtype:
mask = mask.to(dtype=lengths.dtype)
# Fuse operations using broadcasting
return mask + 1 > lengths[:, None]
class TTSGPUModel(TTSBaseModel):
_instance = None
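A small equivalence check between the commented-out mask construction and the rewritten one (both mark padding positions True); a sketch only:

```python
import torch

lengths = torch.tensor([3, 5])
max_len = lengths.max()

old = torch.gt(
    torch.arange(max_len).unsqueeze(0).expand(lengths.shape[0], -1).type_as(lengths) + 1,
    lengths.unsqueeze(1),
)
new = torch.arange(max_len, device=lengths.device)[None, :].expand(lengths.shape[0], -1) + 1 > lengths[:, None]

assert torch.equal(old, new)  # padding masks agree
```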


@@ -8,7 +8,7 @@ from functools import lru_cache
import numpy as np
import torch
import scipy.io.wavfile as wavfile
from .text_processing import normalize_text
from .text_processing import normalize_text, chunker
from loguru import logger
from ..core.config import settings
@@ -20,40 +20,6 @@ class TTSService:
def __init__(self, output_dir: str = None):
self.output_dir = output_dir
def _split_text(self, text: str):
"""Generate text chunks one at a time, splitting on natural pause points"""
if not isinstance(text, str):
text = str(text) if text is not None else ""
# First split into sentences
sentences = re.split(r"(?<=[.!?])\s+", text)
for sentence in sentences:
sentence = sentence.strip()
if not sentence:
continue
# For longer sentences, split on commas and semicolons
if len(sentence) > 300: # Only split long sentences
# Split on pause points while preserving the punctuation
chunks = re.split(r"((?<=[,;])\s+)", sentence)
# Reassemble chunks with their trailing punctuation
current_chunk = ""
for i, chunk in enumerate(chunks):
if i % 2 == 0: # Text chunk
current_chunk += chunk
else: # Punctuation/whitespace chunk
current_chunk += chunk
if current_chunk.strip():
yield current_chunk.strip()
current_chunk = ""
# Yield any remaining text
if current_chunk.strip():
yield current_chunk.strip()
else:
yield sentence
@staticmethod
@lru_cache(maxsize=20)  # Cache up to 20 most recently used voices
@@ -96,28 +62,32 @@ class TTSService:
# Load voice using cached loader
voicepack = self._load_voice(voice_path)
# Generate audio with or without stitching
# For non-streaming, preprocess all chunks first
if stitch_long_output:
audio_chunks = []
chunk_count = 0
# Process chunks as they're generated
for chunk in self._split_text(text):
# Preprocess all chunks to phonemes/tokens
chunks_data = []
for chunk in chunker.split_text(text):
try:
# Process text and generate audio
phonemes, tokens = TTSModel.process_text(chunk, voice[0])
chunks_data.append((chunk, tokens))
except Exception as e:
logger.error(f"Failed to process chunk: '{chunk}'. Error: {str(e)}")
continue
if not chunks_data:
raise ValueError("No chunks were processed successfully")
# Generate audio for all chunks
audio_chunks = []
for chunk, tokens in chunks_data:
try:
chunk_audio = TTSModel.generate_from_tokens(tokens, voicepack, speed)
if chunk_audio is not None:
audio_chunks.append(chunk_audio)
chunk_count += 1
else:
logger.error(f"No audio generated for chunk {chunk_count + 1}")
logger.error(f"No audio generated for chunk: '{chunk}'")
except Exception as e:
logger.error(
f"Failed to generate audio for chunk {chunk_count + 1}: '{chunk}'. Error: {str(e)}"
)
logger.error(f"Failed to generate audio for chunk: '{chunk}'. Error: {str(e)}")
continue
if not audio_chunks:
@@ -138,53 +108,93 @@ class TTSService:
raise
async def generate_audio_stream(
self, text: str, voice: str, speed: float, output_format: str = "wav"
self, text: str, voice: str, speed: float, output_format: str = "wav", silent=False
):
"""Generate and yield audio chunks as they're generated for real-time streaming"""
try:
stream_start = time.time()
# Create normalizer for consistent audio levels
stream_normalizer = AudioNormalizer()
# Input validation and preprocessing
if not text:
raise ValueError("Text is empty")
preprocess_start = time.time()
normalized = normalize_text(text)
if not normalized:
raise ValueError("Text is empty after preprocessing")
text = str(normalized)
logger.debug(f"Text preprocessing took: {(time.time() - preprocess_start)*1000:.1f}ms")
# Voice validation and loading
voice_start = time.time()
voice_path = self._get_voice_path(voice)
if not voice_path:
raise ValueError(f"Voice not found: {voice}")
voicepack = self._load_voice(voice_path)
logger.debug(f"Voice loading took: {(time.time() - voice_start)*1000:.1f}ms")
# Process chunks as they're generated
is_first = True
for chunk in self._split_text(text):
chunks_processed = 0
# last_chunk_end = time.time()
# Process chunks as they come from generator
chunk_gen = chunker.split_text(text)
current_chunk = next(chunk_gen, None)
while current_chunk is not None:
next_chunk = next(chunk_gen, None) # Peek at next chunk
# chunk_start = time.time()
chunks_processed += 1
try:
# Process text and generate audio
phonemes, tokens = TTSModel.process_text(chunk, voice[0])
# text_process_start = time.time()
phonemes, tokens = TTSModel.process_text(current_chunk, voice[0])
# text_process_time = time.time() - text_process_start
# audio_gen_start = time.time()
chunk_audio = TTSModel.generate_from_tokens(tokens, voicepack, speed)
# audio_gen_time = time.time() - audio_gen_start
if chunk_audio is not None:
# Convert chunk with proper header handling
convert_start = time.time()
chunk_bytes = AudioService.convert_audio(
chunk_audio,
24000,
output_format,
is_first_chunk=is_first,
normalizer=stream_normalizer
normalizer=stream_normalizer,
is_last_chunk=(next_chunk is None) # Last if no next chunk
)
# convert_time = time.time() - convert_start
# Calculate gap from last chunk
# gap_time = chunk_start - last_chunk_end
# Log timing details if not silent
# if not silent:
# logger.debug(
# f"\nChunk {chunks_processed} timing:"
# f"\n Gap from last chunk: {gap_time*1000:.1f}ms"
# f"\n Text processing: {text_process_time*1000:.1f}ms"
# f"\n Audio generation: {audio_gen_time*1000:.1f}ms"
# f"\n Audio conversion: {convert_time*1000:.1f}ms"
# f"\n Total chunk time: {(time.time() - chunk_start)*1000:.1f}ms"
# )
yield chunk_bytes
is_first = False
# last_chunk_end = time.time()
else:
logger.error(f"No audio generated for chunk: '{chunk}'")
logger.error(f"No audio generated for chunk: '{current_chunk}'")
except Exception as e:
logger.error(f"Failed to generate audio for chunk: '{chunk}'. Error: {str(e)}")
continue
logger.error(f"Failed to generate audio for chunk: '{current_chunk}'. Error: {str(e)}")
current_chunk = next_chunk # Move to next chunk
except Exception as e:
logger.error(f"Error in audio generation stream: {str(e)}")
raise
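The last-chunk detection above is a one-item lookahead over a generator; its minimal form (illustrative only):

```python
def with_last_flag(gen):
    """Yield (item, is_last) pairs by peeking one item ahead."""
    current = next(gen, None)
    while current is not None:
        nxt = next(gen, None)
        yield current, nxt is None
        current = nxt

assert list(with_last_flag(iter("abc"))) == [("a", False), ("b", False), ("c", True)]
```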


@@ -0,0 +1,52 @@
import os
from typing import List, Tuple
import torch
from loguru import logger
from .tts_service import TTSService
from .tts_model import TTSModel
class WarmupService:
"""Service for warming up TTS models and voice caches"""
def __init__(self):
self.tts_service = TTSService()
def load_voices(self) -> List[Tuple[str, torch.Tensor]]:
"""Load and cache voices up to LRU limit"""
# Get all voices sorted by filename length (shorter names first, usually base voices)
voice_files = sorted(
[f for f in os.listdir(TTSModel.VOICES_DIR) if f.endswith(".pt")],
key=len
)
# Load up to LRU cache limit (20)
loaded_voices = []
for voice_file in voice_files[:20]:
try:
voice_path = os.path.join(TTSModel.VOICES_DIR, voice_file)
voicepack = torch.load(voice_path, map_location=TTSModel.get_device(), weights_only=True)
loaded_voices.append((voice_file[:-3], voicepack)) # Store name and tensor
# logger.info(f"Loaded voice {voice_file[:-3]} into cache")
except Exception as e:
logger.error(f"Failed to load voice {voice_file}: {e}")
logger.info(f"Pre-loaded {len(loaded_voices)} voices into cache")
return loaded_voices
async def warmup_voices(self, warmup_text: str, loaded_voices: List[Tuple[str, torch.Tensor]]):
"""Warm up voice inference and streaming"""
n_warmups = 1
for voice_name, _ in loaded_voices[:n_warmups]:
try:
logger.info(f"Running warmup inference on voice {voice_name}")
async for _ in self.tts_service.generate_audio_stream(
warmup_text,
voice_name,
1.0,
"pcm"
):
pass # Process all chunks to properly warm up
logger.info(f"Completed warmup for voice {voice_name}")
except Exception as e:
logger.warning(f"Warmup failed for voice {voice_name}: {e}")

api/tests/test_chunker.py (new file, 35 lines)

@@ -0,0 +1,35 @@
"""Tests for text chunking service"""
import pytest
from api.src.services.text_processing import chunker
def test_split_text():
"""Test text splitting into sentences"""
text = "First sentence. Second sentence! Third sentence?"
sentences = list(chunker.split_text(text))
assert len(sentences) == 3
assert sentences[0] == "First sentence."
assert sentences[1] == "Second sentence!"
assert sentences[2] == "Third sentence?"
def test_split_text_empty():
"""Test splitting empty text"""
assert list(chunker.split_text("")) == []
def test_split_text_single_sentence():
"""Test splitting single sentence"""
text = "Just one sentence."
assert list(chunker.split_text(text)) == ["Just one sentence."]
def test_split_text_with_custom_chunk_size():
"""Test splitting with custom max chunk size"""
text = "First part, second part, third part."
chunks = list(chunker.split_text(text, max_chunk=15))
assert len(chunks) == 3
assert chunks[0] == "First part,"
assert chunks[1] == "second part,"
assert chunks[2] == "third part."


@@ -1,7 +1,8 @@
from unittest.mock import Mock
from unittest.mock import Mock, AsyncMock
import pytest
import pytest_asyncio
import asyncio
from fastapi.testclient import TestClient
from httpx import AsyncClient
@@ -22,6 +23,12 @@ async def async_client():
def mock_tts_service(monkeypatch):
mock_service = Mock()
mock_service._generate_audio.return_value = (bytes([0, 1, 2, 3]), 1.0)
# Create proper async generator mock
async def mock_stream(*args, **kwargs):
for chunk in [b"chunk1", b"chunk2"]:
yield chunk
mock_service.generate_audio_stream = mock_stream
mock_service.list_voices.return_value = [
"af",
"bm_lewis",
@@ -65,6 +72,7 @@ def test_openai_speech_endpoint(mock_tts_service, mock_audio_service):
"voice": "bm_lewis",
"response_format": "wav",
"speed": 1.0,
"stream": False # Explicitly disable streaming
}
response = client.post("/v1/audio/speech", json=test_request)
assert response.status_code == 200
@@ -84,6 +92,7 @@ def test_openai_speech_invalid_voice(mock_tts_service):
"voice": "invalid_voice",
"response_format": "wav",
"speed": 1.0,
"stream": False # Explicitly disable streaming
}
response = client.post("/v1/audio/speech", json=test_request)
assert response.status_code == 400 # Bad request
@@ -98,6 +107,7 @@ def test_openai_speech_invalid_speed(mock_tts_service):
"voice": "af",
"response_format": "wav",
"speed": -1.0, # Invalid speed
"stream": False # Explicitly disable streaming
}
response = client.post("/v1/audio/speech", json=test_request)
assert response.status_code == 422 # Validation error
@@ -112,6 +122,7 @@ def test_openai_speech_generation_error(mock_tts_service):
"voice": "af",
"response_format": "wav",
"speed": 1.0,
"stream": False # Explicitly disable streaming
}
response = client.post("/v1/audio/speech", json=test_request)
assert response.status_code == 500
@@ -171,13 +182,14 @@ async def test_openai_speech_pcm_streaming(mock_tts_service, async_client):
"input": "Hello world",
"voice": "af",
"response_format": "pcm",
"stream": True
}
# Mock streaming response
async def mock_stream():
yield b"chunk1"
yield b"chunk2"
mock_tts_service.generate_audio_stream.return_value = mock_stream()
# Create streaming mock for this test
async def mock_stream(*args, **kwargs):
for chunk in [b"chunk1", b"chunk2"]:
yield chunk
mock_tts_service.generate_audio_stream = mock_stream
# Add streaming header
headers = {"x-raw-response": "stream"}
@@ -198,13 +210,14 @@ async def test_openai_speech_streaming_mp3(mock_tts_service, async_client):
"input": "Hello world",
"voice": "af",
"response_format": "mp3",
"stream": True
}
# Mock streaming response
async def mock_stream():
yield b"mp3header"
yield b"mp3data"
mock_tts_service.generate_audio_stream.return_value = mock_stream()
# Create streaming mock for this test
async def mock_stream(*args, **kwargs):
for chunk in [b"mp3header", b"mp3data"]:
yield chunk
mock_tts_service.generate_audio_stream = mock_stream
# Add streaming header
headers = {"x-raw-response": "stream"}
@@ -227,14 +240,14 @@ async def test_openai_speech_streaming_generator(mock_tts_service, async_client)
"input": "Hello world",
"voice": "af",
"response_format": "pcm",
"stream": True
}
# Mock streaming response
async def mock_stream():
yield b"chunk1"
yield b"chunk2"
mock_tts_service.generate_audio_stream.return_value = mock_stream()
# Create streaming mock for this test
async def mock_stream(*args, **kwargs):
for chunk in [b"chunk1", b"chunk2"]:
yield chunk
mock_tts_service.generate_audio_stream = mock_stream
# Add streaming header
headers = {"x-raw-response": "stream"}


@@ -28,29 +28,34 @@ async def test_lifespan_successful_warmup(mock_logger, mock_tts_model):
"""Test successful model warmup in lifespan"""
# Mock file system for voice counting
mock_tts_model.VOICES_DIR = "/mock/voices"
with patch("os.listdir", return_value=["voice1.pt", "voice2.pt", "voice3.pt"]):
mock_tts_model.setup.return_value = 3 # 3 voice files
mock_tts_model.get_device.return_value = "cuda"
# Create an async generator from the lifespan context manager
async_gen = lifespan(MagicMock())
# Start the context manager
await async_gen.__aenter__()
# Verify the expected logging sequence
mock_logger.info.assert_any_call("Loading TTS model and voice packs...")
# Check for the startup message containing the required info
startup_calls = [call[0][0] for call in mock_logger.info.call_args_list]
startup_msg = next(msg for msg in startup_calls if "Model loaded and warmed up on" in msg)
assert "Model loaded and warmed up on cuda" in startup_msg
assert "3 voice packs loaded successfully" in startup_msg
# Create async mock
async def async_setup():
return 3
mock_tts_model.setup = MagicMock()
mock_tts_model.setup.side_effect = async_setup
mock_tts_model.get_device.return_value = "cuda"
with patch("os.listdir", return_value=["voice1.pt", "voice2.pt", "voice3.pt"]):
# Create an async generator from the lifespan context manager
async_gen = lifespan(MagicMock())
# Start the context manager
await async_gen.__aenter__()
# Verify model setup was called
mock_tts_model.setup.assert_called_once()
# Verify the expected logging sequence
mock_logger.info.assert_any_call("Loading TTS model and voice packs...")
# Check for the startup message containing the required info
startup_calls = [call[0][0] for call in mock_logger.info.call_args_list]
startup_msg = next(msg for msg in startup_calls if "Model warmed up on" in msg)
assert "Model warmed up on" in startup_msg
assert "3 voice packs loaded" in startup_msg
# Clean up
await async_gen.__aexit__(None, None, None)
# Verify model setup was called
mock_tts_model.setup.assert_called_once()
# Clean up
await async_gen.__aexit__(None, None, None)
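Side note on the mocking style: the async setup() is emulated above with a MagicMock whose side_effect is an async function; unittest.mock.AsyncMock (already imported in conftest per this diff) would be an equivalent, shorter sketch:

```python
from unittest.mock import AsyncMock

mock_setup = AsyncMock(return_value=3)  # awaiting it returns 3
# mock_tts_model.setup = mock_setup
# ...run the lifespan under test...
# mock_setup.assert_awaited_once()
```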
@pytest.mark.asyncio
@@ -81,39 +86,21 @@ async def test_lifespan_cuda_warmup(mock_tts_model):
"""Test model warmup specifically on CUDA"""
# Mock file system for voice counting
mock_tts_model.VOICES_DIR = "/mock/voices"
# Create async mock
async def async_setup():
return 2
mock_tts_model.setup = MagicMock()
mock_tts_model.setup.side_effect = async_setup
mock_tts_model.get_device.return_value = "cuda"
with patch("os.listdir", return_value=["voice1.pt", "voice2.pt"]):
mock_tts_model.setup.return_value = 2 # 2 voice files
mock_tts_model.get_device.return_value = "cuda"
# Create an async generator from the lifespan context manager
async_gen = lifespan(MagicMock())
await async_gen.__aenter__()
# Create an async generator from the lifespan context manager
async_gen = lifespan(MagicMock())
await async_gen.__aenter__()
# Verify model setup was called
mock_tts_model.setup.assert_called_once()
# Verify model setup was called
mock_tts_model.setup.assert_called_once()
# Clean up
await async_gen.__aexit__(None, None, None)
@pytest.mark.asyncio
@patch("api.src.main.TTSModel")
async def test_lifespan_cpu_fallback(mock_tts_model):
"""Test model warmup falling back to CPU"""
# Mock file system for voice counting
mock_tts_model.VOICES_DIR = "/mock/voices"
with patch(
"os.listdir", return_value=["voice1.pt", "voice2.pt", "voice3.pt", "voice4.pt"]
):
mock_tts_model.setup.return_value = 4 # 4 voice files
mock_tts_model.get_device.return_value = "cpu"
# Create an async generator from the lifespan context manager
async_gen = lifespan(MagicMock())
await async_gen.__aenter__()
# Verify model setup was called
mock_tts_model.setup.assert_called_once()
# Clean up
await async_gen.__aexit__(None, None, None)
# Clean up
await async_gen.__aexit__(None, None, None)


@@ -16,13 +16,14 @@ def test_get_device_error():
with pytest.raises(RuntimeError, match="Model not initialized"):
TTSBaseModel.get_device()
@pytest.mark.asyncio
@patch('torch.cuda.is_available')
@patch('os.path.exists')
@patch('os.path.join')
@patch('os.listdir')
@patch('torch.load')
@patch('torch.save')
def test_setup_cuda_available(mock_save, mock_load, mock_listdir, mock_join, mock_exists, mock_cuda_available):
async def test_setup_cuda_available(mock_save, mock_load, mock_listdir, mock_join, mock_exists, mock_cuda_available):
"""Test setup with CUDA available"""
TTSBaseModel._device = None
mock_cuda_available.return_value = True
@@ -36,17 +37,18 @@ def test_setup_cuda_available(mock_save, mock_load, mock_listdir, mock_join, moc
TTSBaseModel.process_text = MagicMock(return_value=("dummy", [1,2,3]))
TTSBaseModel.generate_from_tokens = MagicMock(return_value=np.zeros(1000))
voice_count = TTSBaseModel.setup()
voice_count = await TTSBaseModel.setup()
assert TTSBaseModel._device == "cuda"
assert voice_count == 2
@pytest.mark.asyncio
@patch('torch.cuda.is_available')
@patch('os.path.exists')
@patch('os.path.join')
@patch('os.listdir')
@patch('torch.load')
@patch('torch.save')
def test_setup_cuda_unavailable(mock_save, mock_load, mock_listdir, mock_join, mock_exists, mock_cuda_available):
async def test_setup_cuda_unavailable(mock_save, mock_load, mock_listdir, mock_join, mock_exists, mock_cuda_available):
"""Test setup with CUDA unavailable"""
TTSBaseModel._device = None
mock_cuda_available.return_value = False
@@ -60,7 +62,7 @@ def test_setup_cuda_unavailable(mock_save, mock_load, mock_listdir, mock_join, m
TTSBaseModel.process_text = MagicMock(return_value=("dummy", [1,2,3]))
TTSBaseModel.generate_from_tokens = MagicMock(return_value=np.zeros(1000))
voice_count = TTSBaseModel.setup()
voice_count = await TTSBaseModel.setup()
assert TTSBaseModel._device == "cpu"
assert voice_count == 2


@@ -31,27 +31,6 @@ def sample_audio():
return np.sin(2 * np.pi * frequency * t).astype(np.float32)
def test_split_text(tts_service):
"""Test text splitting into sentences"""
text = "First sentence. Second sentence! Third sentence?"
sentences = tts_service._split_text(text)
assert len(sentences) == 3
assert sentences[0] == "First sentence."
assert sentences[1] == "Second sentence!"
assert sentences[2] == "Third sentence?"
def test_split_text_empty(tts_service):
"""Test splitting empty text"""
assert tts_service._split_text("") == []
def test_split_text_single_sentence(tts_service):
"""Test splitting single sentence"""
text = "Just one sentence."
assert tts_service._split_text(text) == ["Just one sentence."]
def test_audio_to_bytes(tts_service, sample_audio):
"""Test converting audio tensor to bytes"""
audio_bytes = tts_service._audio_to_bytes(sample_audio)
@@ -152,7 +131,7 @@ def test_generate_audio_phonemize_error(
mock_torch_load.return_value = torch.zeros((10, 24000))
mock_generate.return_value = (None, None)
with pytest.raises(ValueError, match="No audio chunks were generated successfully"):
with pytest.raises(ValueError, match="No chunks were processed successfully"):
tts_service._generate_audio("Test text", "af", 1.0)
@@ -185,7 +164,7 @@ def test_generate_audio_error(
mock_exists.return_value = True
mock_torch_load.return_value = torch.zeros((10, 24000))
with pytest.raises(ValueError, match="No audio chunks were generated successfully"):
with pytest.raises(ValueError, match="No chunks were processed successfully"):
tts_service._generate_audio("Test text", "af", 1.0)

BIN: ten new image assets added (234 KiB to 958 KiB each), including assets/voice_analysis.png (958 KiB); binary contents not shown.


@@ -43,6 +43,7 @@ services:
- ONNX_OPTIMIZATION_LEVEL=all
- ONNX_MEMORY_PATTERN=true
- ONNX_ARENA_EXTEND_STRATEGY=kNextPowerOfTwo
depends_on:
model-fetcher:
condition: service_healthy


@@ -2,7 +2,7 @@ services:
model-fetcher:
image: datamachines/git-lfs:latest
environment:
- SKIP_MODEL_FETCH=${SKIP_MODEL_FETCH:-true}
- SKIP_MODEL_FETCH=${SKIP_MODEL_FETCH:-false}
volumes:
- ./Kokoro-82M:/app/Kokoro-82M
working_dir: /app/Kokoro-82M
@@ -32,10 +32,10 @@ services:
start_period: 1s
kokoro-tts:
image: ghcr.io/remsky/kokoro-fastapi:latest
# image: ghcr.io/remsky/kokoro-fastapi:latest
# Uncomment below to build from source instead of using the released image
# build:
# context: .
build:
context: .
volumes:
- ./api/src:/app/api/src
- ./Kokoro-82M:/app/Kokoro-82M
@@ -54,14 +54,14 @@ services:
model-fetcher:
condition: service_healthy
# # Gradio UI service [Comment out everything below if you don't need it]
# gradio-ui:
# build:
# context: ./ui
# ports:
# - "7860:7860"
# volumes:
# - ./ui/data:/app/ui/data
# - ./ui/app.py:/app/app.py # Mount app.py for hot reload
# environment:
# - GRADIO_WATCH=True # Enable hot reloading
# Gradio UI service [Comment out everything below if you don't need it]
gradio-ui:
build:
context: ./ui
ports:
- "7860:7860"
volumes:
- ./ui/data:/app/ui/data
- ./ui/app.py:/app/app.py # Mount app.py for hot reload
environment:
- GRADIO_WATCH=True # Enable hot reloading


@@ -1,15 +1,19 @@
#!/usr/bin/env python3
import os
import time
import json
import numpy as np
import requests
import pandas as pd
from lib.shared_benchmark_utils import get_text_for_tokens, enc
from lib.shared_utils import save_json_results
from lib.shared_plotting import plot_correlation, plot_timeline
import time
def measure_first_token(text: str, output_dir: str, tokens: int, run_number: int) -> dict:
import numpy as np
import pandas as pd
import requests
from lib.shared_utils import save_json_results
from lib.shared_plotting import plot_timeline, plot_correlation
from lib.shared_benchmark_utils import enc, get_text_for_tokens
def measure_first_token(
text: str, output_dir: str, tokens: int, run_number: int
) -> dict:
"""Measure time to audio via API calls and save the audio output"""
results = {
"text_length": len(text),
@@ -18,12 +22,12 @@ def measure_first_token(text: str, output_dir: str, tokens: int, run_number: int
"time_to_first_chunk": None,
"error": None,
"audio_path": None,
"audio_length": None # Length of output audio in seconds
"audio_length": None, # Length of output audio in seconds
}
try:
start_time = time.time()
# Make request without streaming
response = requests.post(
"http://localhost:8880/v1/audio/speech",
@@ -32,58 +36,62 @@ def measure_first_token(text: str, output_dir: str, tokens: int, run_number: int
"input": text,
"voice": "af",
"response_format": "wav",
"stream": False
"stream": False,
},
timeout=1800
timeout=1800,
)
response.raise_for_status()
# Save complete audio
audio_filename = f"benchmark_tokens{tokens}_run{run_number}.wav"
audio_path = os.path.join(output_dir, audio_filename)
results["audio_path"] = audio_path
content = response.content
with open(audio_path, 'wb') as f:
with open(audio_path, "wb") as f:
f.write(content)
# Calculate audio length using scipy
import scipy.io.wavfile as wavfile
sample_rate, audio_data = wavfile.read(audio_path)
results["audio_length"] = len(audio_data) / sample_rate # Length in seconds
results["time_to_first_chunk"] = time.time() - start_time
results["total_time"] = time.time() - start_time
return results
except Exception as e:
results["error"] = str(e)
return results
def main():
# Set up paths
script_dir = os.path.dirname(os.path.abspath(__file__))
output_dir = os.path.join(script_dir, "output_audio")
output_data_dir = os.path.join(script_dir, "output_data")
# Create output directories
os.makedirs(output_dir, exist_ok=True)
os.makedirs(output_data_dir, exist_ok=True)
# Load sample text
with open(os.path.join(script_dir, "the_time_machine_hg_wells.txt"), "r", encoding="utf-8") as f:
with open(
os.path.join(script_dir, "the_time_machine_hg_wells.txt"), "r", encoding="utf-8"
) as f:
text = f.read()
# Test specific token counts
token_sizes = [10, 25, 50, 100, 200, 500]
all_results = []
for tokens in token_sizes:
print(f"\nTesting {tokens} tokens")
test_text = get_text_for_tokens(text, tokens)
actual_tokens = len(enc.encode(test_text))
print(f"Text preview: {test_text[:50]}...")
# Run test 5 times for each size to get average
for i in range(5):
print(f"Run {i+1}/5...")
@@ -91,67 +99,74 @@ def main():
result["target_tokens"] = tokens
result["actual_tokens"] = actual_tokens
result["run_number"] = i + 1
print(f"Time to Audio: {result.get('time_to_first_chunk', 'N/A'):.3f}s")
print(f"Total time: {result.get('total_time', 'N/A'):.3f}s")
if result["error"]:
print(f"Error: {result['error']}")
all_results.append(result)
# Calculate averages per token size
summary = {}
for tokens in token_sizes:
matching_results = [r for r in all_results if r["target_tokens"] == tokens and not r["error"]]
matching_results = [
r for r in all_results if r["target_tokens"] == tokens and not r["error"]
]
if matching_results:
avg_first_chunk = sum(r["time_to_first_chunk"] for r in matching_results) / len(matching_results)
avg_total = sum(r["total_time"] for r in matching_results) / len(matching_results)
avg_audio_length = sum(r["audio_length"] for r in matching_results) / len(matching_results)
avg_first_chunk = sum(
r["time_to_first_chunk"] for r in matching_results
) / len(matching_results)
avg_total = sum(r["total_time"] for r in matching_results) / len(
matching_results
)
avg_audio_length = sum(r["audio_length"] for r in matching_results) / len(
matching_results
)
summary[tokens] = {
"avg_time_to_first_chunk": round(avg_first_chunk, 3),
"avg_total_time": round(avg_total, 3),
"avg_audio_length": round(avg_audio_length, 3),
"num_successful_runs": len(matching_results)
"num_successful_runs": len(matching_results),
}
# Save results
# Save results
results_data = {
"individual_runs": all_results,
"summary": summary,
"timestamp": time.strftime("%Y-%m-%d %H:%M:%S")
"timestamp": time.strftime("%Y-%m-%d %H:%M:%S"),
}
save_json_results(
results_data,
os.path.join(output_data_dir, "first_token_benchmark.json")
results_data, os.path.join(output_data_dir, "first_token_benchmark.json")
)
# Create plot directory if it doesn't exist
output_plots_dir = os.path.join(script_dir, "output_plots")
os.makedirs(output_plots_dir, exist_ok=True)
# Create DataFrame for plotting
df = pd.DataFrame(all_results)
# Create both plots
plot_correlation(
df, "target_tokens", "time_to_first_chunk",
df,
"target_tokens",
"time_to_first_chunk",
"Time to Audio vs Input Size",
"Number of Input Tokens",
"Time to Audio (seconds)",
os.path.join(output_plots_dir, "first_token_latency.png")
os.path.join(output_plots_dir, "first_token_latency.png"),
)
plot_timeline(
df,
os.path.join(output_plots_dir, "first_token_timeline.png")
)
plot_timeline(df, os.path.join(output_plots_dir, "first_token_timeline.png"))
print("\nResults and plots saved to:")
print(f"- {os.path.join(output_data_dir, 'first_token_benchmark.json')}")
print(f"- {os.path.join(output_plots_dir, 'first_token_latency.png')}")
print(f"- {os.path.join(output_plots_dir, 'first_token_timeline.png')}")
if __name__ == "__main__":
main()


@@ -1,193 +0,0 @@
#!/usr/bin/env python3
import os
import time
import json
import numpy as np
import requests
import pandas as pd
from lib.shared_benchmark_utils import get_text_for_tokens, enc
from lib.shared_utils import save_json_results
from lib.shared_plotting import plot_correlation, plot_timeline
def measure_first_token(text: str, output_dir: str, tokens: int, run_number: int) -> dict:
"""Measure time to audio via API calls and save the audio output"""
results = {
"text_length": len(text),
"token_count": len(enc.encode(text)),
"total_time": None,
"time_to_first_chunk": None,
"error": None,
"audio_path": None,
"audio_length": None # Length of output audio in seconds
}
try:
start_time = time.time()
# Make request with streaming enabled
response = requests.post(
"http://localhost:8880/v1/audio/speech",
json={
"model": "kokoro",
"input": text,
"voice": "af",
"response_format": "pcm",
"stream": True
},
stream=True,
timeout=1800
)
response.raise_for_status()
# Save complete audio
audio_filename = f"benchmark_tokens{tokens}_run{run_number}_stream.wav"
audio_path = os.path.join(output_dir, audio_filename)
results["audio_path"] = audio_path
first_chunk_time = None
chunks = []
for chunk in response.iter_content(chunk_size=1024):
if chunk:
if first_chunk_time is None:
first_chunk_time = time.time()
results["time_to_first_chunk"] = first_chunk_time - start_time
chunks.append(chunk)
# Concatenate all PCM chunks
if not chunks:
raise ValueError("No audio chunks received")
all_audio_data = b''.join(chunks)
# Write as WAV file
import wave
with wave.open(audio_path, 'wb') as wav_file:
wav_file.setnchannels(1) # Mono
wav_file.setsampwidth(2) # 2 bytes per sample (16-bit)
wav_file.setframerate(24000) # Known sample rate for Kokoro
wav_file.writeframes(all_audio_data)
# Calculate audio length using scipy
import scipy.io.wavfile as wavfile
sample_rate, audio_data = wavfile.read(audio_path)
results["audio_length"] = len(audio_data) / sample_rate # Length in seconds
results["total_time"] = time.time() - start_time
# Print debug info
print(f"Complete audio size: {len(all_audio_data)} bytes")
print(f"Number of chunks received: {len(chunks)}")
print(f"Audio length: {results['audio_length']:.3f}s")
return results
except Exception as e:
results["error"] = str(e)
return results
def main():
# Set up paths with _stream suffix
script_dir = os.path.dirname(os.path.abspath(__file__))
output_dir = os.path.join(script_dir, "output_audio_stream")
output_data_dir = os.path.join(script_dir, "output_data")
# Create output directories
os.makedirs(output_dir, exist_ok=True)
os.makedirs(output_data_dir, exist_ok=True)
# Load sample text
with open(os.path.join(script_dir, "the_time_machine_hg_wells.txt"), "r", encoding="utf-8") as f:
text = f.read()
# Test specific token counts
token_sizes = [50, 100, 200, 500, 1000, 2000, 5000, 10000]
all_results = []
for tokens in token_sizes:
print(f"\nTesting {tokens} tokens (streaming)")
test_text = get_text_for_tokens(text, tokens)
actual_tokens = len(enc.encode(test_text))
print(f"Text preview: {test_text[:50]}...")
# Run test 3 times for each size to get average
for i in range(5):
print(f"Run {i+1}/3...")
result = measure_first_token(test_text, output_dir, tokens, i + 1)
result["target_tokens"] = tokens
result["actual_tokens"] = actual_tokens
result["run_number"] = i + 1
print(f"Time to First Audio: {result.get('time_to_first_chunk', 'N/A'):.3f}s")
print(f"Time to Save Complete: {result.get('total_time', 'N/A'):.3f}s")
print(f"Audio length: {result.get('audio_length', 'N/A'):.3f}s")
print(f"Streaming overhead: {(result.get('total_time', 0) - result.get('time_to_first_chunk', 0)):.3f}s")
if result["error"]:
print(f"Error: {result['error']}")
all_results.append(result)
# Calculate averages per token size
summary = {}
for tokens in token_sizes:
matching_results = [r for r in all_results if r["target_tokens"] == tokens and not r["error"]]
if matching_results:
avg_first_chunk = sum(r["time_to_first_chunk"] for r in matching_results) / len(matching_results)
avg_total = sum(r["total_time"] for r in matching_results) / len(matching_results)
avg_audio_length = sum(r["audio_length"] for r in matching_results) / len(matching_results)
summary[tokens] = {
"avg_time_to_first_chunk": round(avg_first_chunk, 3),
"avg_total_time": round(avg_total, 3),
"avg_audio_length": round(avg_audio_length, 3),
"num_successful_runs": len(matching_results)
}
# Save results with _stream suffix
results_data = {
"individual_runs": all_results,
"summary": summary,
"timestamp": time.strftime("%Y-%m-%d %H:%M:%S")
}
save_json_results(
results_data,
os.path.join(output_data_dir, "first_token_benchmark_stream.json")
)
# Create plot directory if it doesn't exist
output_plots_dir = os.path.join(script_dir, "output_plots")
os.makedirs(output_plots_dir, exist_ok=True)
# Create DataFrame for plotting
df = pd.DataFrame(all_results)
# Create both plots with _stream suffix
# Plot correlation for both metrics
plot_correlation(
df, "target_tokens", "time_to_first_chunk",
"Time to First Audio vs Input Size (Streaming)",
"Number of Input Tokens",
"Time to First Audio (seconds)",
os.path.join(output_plots_dir, "first_token_latency_stream.png")
)
plot_correlation(
df, "target_tokens", "total_time",
"Total Time vs Input Size (Streaming)",
"Number of Input Tokens",
"Total Time (seconds)",
os.path.join(output_plots_dir, "total_time_latency_stream.png")
)
plot_timeline(
df,
os.path.join(output_plots_dir, "first_token_timeline_stream.png", suffix="(Streaming)")
)
print("\nResults and plots saved to:")
print(f"- {os.path.join(output_data_dir, 'first_token_benchmark_stream.json')}")
print(f"- {os.path.join(output_plots_dir, 'first_token_latency_stream.png')}")
print(f"- {os.path.join(output_plots_dir, 'total_time_latency_stream.png')}")
print(f"- {os.path.join(output_plots_dir, 'first_token_timeline_stream.png')}")
if __name__ == "__main__":
main()


@@ -1,184 +0,0 @@
#!/usr/bin/env python3
import os
import time
import json
import numpy as np
import pandas as pd
from openai import OpenAI
from lib.shared_benchmark_utils import get_text_for_tokens, enc
from lib.shared_utils import save_json_results
from lib.shared_plotting import plot_correlation, plot_timeline
def measure_first_token(text: str, output_dir: str, tokens: int, run_number: int) -> dict:
"""Measure time to audio via OpenAI API calls and save the audio output"""
results = {
"text_length": len(text),
"token_count": len(enc.encode(text)),
"total_time": None,
"time_to_first_chunk": None,
"error": None,
"audio_path": None,
"audio_length": None # Length of output audio in seconds
}
try:
start_time = time.time()
# Initialize OpenAI client
openai = OpenAI(base_url="http://localhost:8880/v1", api_key="not-needed-for-local")
# Save complete audio
audio_filename = f"benchmark_tokens{tokens}_run{run_number}_stream_openai.wav"
audio_path = os.path.join(output_dir, audio_filename)
results["audio_path"] = audio_path
first_chunk_time = None
all_audio_data = bytearray()
chunk_count = 0
# Make streaming request using OpenAI client
with openai.audio.speech.with_streaming_response.create(
model="kokoro",
voice="af",
response_format="pcm",
input=text,
) as response:
for chunk in response.iter_bytes(chunk_size=1024):
if chunk:
chunk_count += 1
if first_chunk_time is None:
first_chunk_time = time.time()
results["time_to_first_chunk"] = first_chunk_time - start_time
all_audio_data.extend(chunk)
# Write as WAV file
import wave
with wave.open(audio_path, 'wb') as wav_file:
wav_file.setnchannels(1) # Mono
wav_file.setsampwidth(2) # 2 bytes per sample (16-bit)
wav_file.setframerate(24000) # Known sample rate for Kokoro
wav_file.writeframes(all_audio_data)
# Calculate audio length using scipy
import scipy.io.wavfile as wavfile
sample_rate, audio_data = wavfile.read(audio_path)
results["audio_length"] = len(audio_data) / sample_rate # Length in seconds
results["total_time"] = time.time() - start_time
# Print debug info
print(f"Complete audio size: {len(all_audio_data)} bytes")
print(f"Number of chunks received: {chunk_count}")
print(f"Audio length: {results['audio_length']:.3f}s")
return results
except Exception as e:
results["error"] = str(e)
return results
def main():
# Set up paths with _stream_openai suffix
script_dir = os.path.dirname(os.path.abspath(__file__))
output_dir = os.path.join(script_dir, "output_audio_stream_openai")
output_data_dir = os.path.join(script_dir, "output_data")
# Create output directories
os.makedirs(output_dir, exist_ok=True)
os.makedirs(output_data_dir, exist_ok=True)
# Load sample text
with open(os.path.join(script_dir, "the_time_machine_hg_wells.txt"), "r", encoding="utf-8") as f:
text = f.read()
# Test specific token counts
token_sizes = [50, 100, 200, 500]
all_results = []
for tokens in token_sizes:
print(f"\nTesting {tokens} tokens (streaming)")
test_text = get_text_for_tokens(text, tokens)
actual_tokens = len(enc.encode(test_text))
print(f"Text preview: {test_text[:50]}...")
# Run test 5 times for each size to get average
for i in range(5):
print(f"Run {i+1}/5...")
result = measure_first_token(test_text, output_dir, tokens, i + 1)
result["target_tokens"] = tokens
result["actual_tokens"] = actual_tokens
result["run_number"] = i + 1
print(f"Time to First Audio: {result.get('time_to_first_chunk', 'N/A'):.3f}s")
print(f"Time to Save Complete: {result.get('total_time', 'N/A'):.3f}s")
print(f"Audio length: {result.get('audio_length', 'N/A'):.3f}s")
print(f"Streaming overhead: {(result.get('total_time', 0) - result.get('time_to_first_chunk', 0)):.3f}s")
if result["error"]:
print(f"Error: {result['error']}")
all_results.append(result)
# Calculate averages per token size
summary = {}
for tokens in token_sizes:
matching_results = [r for r in all_results if r["target_tokens"] == tokens and not r["error"]]
if matching_results:
avg_first_chunk = sum(r["time_to_first_chunk"] for r in matching_results) / len(matching_results)
avg_total = sum(r["total_time"] for r in matching_results) / len(matching_results)
avg_audio_length = sum(r["audio_length"] for r in matching_results) / len(matching_results)
summary[tokens] = {
"avg_time_to_first_chunk": round(avg_first_chunk, 3),
"avg_total_time": round(avg_total, 3),
"avg_audio_length": round(avg_audio_length, 3),
"num_successful_runs": len(matching_results)
}
# Save results with _stream_openai suffix
results_data = {
"individual_runs": all_results,
"summary": summary,
"timestamp": time.strftime("%Y-%m-%d %H:%M:%S")
}
save_json_results(
results_data,
os.path.join(output_data_dir, "first_token_benchmark_stream_openai.json")
)
# Create plot directory if it doesn't exist
output_plots_dir = os.path.join(script_dir, "output_plots")
os.makedirs(output_plots_dir, exist_ok=True)
# Create DataFrame for plotting
df = pd.DataFrame(all_results)
# Create plots with _stream_openai suffix
plot_correlation(
df, "target_tokens", "time_to_first_chunk",
"Time to First Audio vs Input Size (OpenAI Streaming)",
"Number of Input Tokens",
"Time to First Audio (seconds)",
os.path.join(output_plots_dir, "first_token_latency_stream_openai.png")
)
plot_correlation(
df, "target_tokens", "total_time",
"Total Time vs Input Size (OpenAI Streaming)",
"Number of Input Tokens",
"Total Time (seconds)",
os.path.join(output_plots_dir, "total_time_latency_stream_openai.png")
)
plot_timeline(
df,
os.path.join(output_plots_dir, "first_token_timeline_stream_openai.png")
)
print("\nResults and plots saved to:")
print(f"- {os.path.join(output_data_dir, 'first_token_benchmark_stream_openai.json')}")
print(f"- {os.path.join(output_plots_dir, 'first_token_latency_stream_openai.png')}")
print(f"- {os.path.join(output_plots_dir, 'total_time_latency_stream_openai.png')}")
print(f"- {os.path.join(output_plots_dir, 'first_token_timeline_stream_openai.png')}")
if __name__ == "__main__":
main()


@@ -0,0 +1,195 @@
#!/usr/bin/env python3
import os
import time
import requests
from openai import OpenAI
from lib.stream_utils import run_benchmark
OPENAI_CLIENT = OpenAI(
base_url="http://localhost:8880/v1", api_key="not-needed-for-local"
)
def measure_first_token_requests(
text: str, output_dir: str, tokens: int, run_number: int
) -> dict:
"""Measure time to audio via direct API calls and save the audio output"""
results = {
"text_length": len(text),
"token_count": None, # Will be set by run_benchmark
"total_time": None,
"time_to_first_chunk": None,
"error": None,
"audio_path": None,
"audio_length": None,
}
try:
start_time = time.time()
# Make request with streaming enabled
response = requests.post(
"http://localhost:8880/v1/audio/speech",
json={
"model": "kokoro",
"input": text,
"voice": "af",
"response_format": "pcm",
"stream": True,
},
stream=True,
timeout=1800,
)
response.raise_for_status()
# Save complete audio
audio_filename = f"benchmark_tokens{tokens}_run{run_number}_stream.wav"
audio_path = os.path.join(output_dir, audio_filename)
results["audio_path"] = audio_path
first_chunk_time = None
chunks = []
for chunk in response.iter_content(chunk_size=1024):
if chunk:
if first_chunk_time is None:
first_chunk_time = time.time()
results["time_to_first_chunk"] = first_chunk_time - start_time
chunks.append(chunk)
# Concatenate all PCM chunks
if not chunks:
raise ValueError("No audio chunks received")
all_audio_data = b"".join(chunks)
# Write as WAV file
with wave.open(audio_path, "wb") as wav_file:
wav_file.setnchannels(1) # Mono
wav_file.setsampwidth(2) # 2 bytes per sample (16-bit)
wav_file.setframerate(24000) # Known sample rate for Kokoro
wav_file.writeframes(all_audio_data)
# Calculate audio length using scipy
sample_rate, audio_data = wavfile.read(audio_path)
results["audio_length"] = len(audio_data) / sample_rate # Length in seconds
results["total_time"] = time.time() - start_time
# Print debug info
print(f"Complete audio size: {len(all_audio_data)} bytes")
print(f"Number of chunks received: {len(chunks)}")
print(f"Audio length: {results['audio_length']:.3f}s")
return results
except Exception as e:
results["error"] = str(e)
return results
def measure_first_token_openai(
text: str, output_dir: str, tokens: int, run_number: int
) -> dict:
"""Measure time to audio via OpenAI API calls and save the audio output"""
results = {
"text_length": len(text),
"token_count": None, # Will be set by run_benchmark
"total_time": None,
"time_to_first_chunk": None,
"error": None,
"audio_path": None,
"audio_length": None,
}
try:
start_time = time.time()
# Save complete audio (streams through the module-level OPENAI_CLIENT)
audio_filename = f"benchmark_tokens{tokens}_run{run_number}_stream_openai.wav"
audio_path = os.path.join(output_dir, audio_filename)
results["audio_path"] = audio_path
first_chunk_time = None
all_audio_data = bytearray()
chunk_count = 0
# Make streaming request using OpenAI client
with OPENAI_CLIENT.audio.speech.with_streaming_response.create(
model="kokoro",
voice="af",
response_format="pcm",
input=text,
) as response:
for chunk in response.iter_bytes(chunk_size=1024):
if chunk:
chunk_count += 1
if first_chunk_time is None:
first_chunk_time = time.time()
results["time_to_first_chunk"] = first_chunk_time - start_time
all_audio_data.extend(chunk)
# Write as WAV file
with wave.open(audio_path, "wb") as wav_file:
wav_file.setnchannels(1) # Mono
wav_file.setsampwidth(2) # 2 bytes per sample (16-bit)
wav_file.setframerate(24000) # Known sample rate for Kokoro
wav_file.writeframes(all_audio_data)
# Calculate audio length using scipy
sample_rate, audio_data = wavfile.read(audio_path)
results["audio_length"] = len(audio_data) / sample_rate # Length in seconds
results["total_time"] = time.time() - start_time
# Print debug info
print(f"Complete audio size: {len(all_audio_data)} bytes")
print(f"Number of chunks received: {chunk_count}")
print(f"Audio length: {results['audio_length']:.3f}s")
return results
except Exception as e:
results["error"] = str(e)
return results
def main():
script_dir = os.path.dirname(os.path.abspath(__file__))
prefix = "cpu"
# Run requests benchmark
print("\n=== Running Direct Requests Benchmark ===")
run_benchmark(
measure_first_token_requests,
output_dir=os.path.join(script_dir, "output_audio_stream"),
output_data_dir=os.path.join(script_dir, "output_data"),
output_plots_dir=os.path.join(script_dir, "output_plots"),
suffix="_stream",
plot_title_suffix="(Streaming)",
prefix=prefix
)
# Run OpenAI benchmark
print("\n=== Running OpenAI Library Benchmark ===")
run_benchmark(
measure_first_token_openai,
output_dir=os.path.join(script_dir, "output_audio_stream_openai"),
output_data_dir=os.path.join(script_dir, "output_data"),
output_plots_dir=os.path.join(script_dir, "output_plots"),
suffix="_stream_openai",
plot_title_suffix="(OpenAI Streaming)",
prefix=prefix
)
if __name__ == "__main__":
main()
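# Usage sketch (assumes the API from this repo is serving at localhost:8880,
# the base URL configured above): running this file directly executes both
# benchmarks back-to-back; each writes its WAVs under the configured
# output_audio* directory plus JSON data and PNG plots via
# lib.stream_utils.run_benchmark.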

View file

@ -1,30 +1,37 @@
#!/usr/bin/env python3
import os
import json
import time
import queue
import sys
import threading
from datetime import datetime
import pandas as pd
from lib.shared_utils import (
    real_time_factor,
    save_json_results,
    get_system_metrics,
    write_benchmark_stats,
)
from lib.shared_plotting import plot_correlation, plot_system_metrics
from lib.shared_benchmark_utils import (
    enc,
    make_tts_request,
    get_text_for_tokens,
    generate_token_sizes,
)
class SystemMonitor:
def __init__(self, interval=1.0):
    """Lightweight background system-metrics tracker; readings are approximate."""
self.interval = interval
self.metrics_queue = queue.Queue()
self.stop_event = threading.Event()
self.metrics_timeline = []
self.start_time = None
def _monitor_loop(self):
"""Background thread function to collect system metrics."""
while not self.stop_event.is_set():
@ -32,20 +39,20 @@ class SystemMonitor:
metrics["relative_time"] = time.time() - self.start_time
self.metrics_queue.put(metrics)
time.sleep(self.interval)
def start(self):
"""Start the monitoring thread."""
self.start_time = time.time()
self.monitor_thread = threading.Thread(target=self._monitor_loop)
self.monitor_thread.daemon = True
self.monitor_thread.start()
def stop(self):
"""Stop the monitoring thread and collect final metrics."""
self.stop_event.set()
if hasattr(self, "monitor_thread"):
self.monitor_thread.join(timeout=2)
# Collect all metrics from queue
while True:
try:
@ -53,23 +60,24 @@ class SystemMonitor:
self.metrics_timeline.append(metrics)
except queue.Empty:
break
return self.metrics_timeline
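# Usage sketch for the monitor (surrounding code is hypothetical): start it
# before the workload and collect the per-interval metric dicts afterwards.
#
#   monitor = SystemMonitor(interval=1.0)
#   monitor.start()
#   ...  # run the benchmark workload
#   timeline = monitor.stop()  # list of metric dicts incl. "relative_time"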
def main():
# Initialize system monitor
monitor = SystemMonitor(interval=1.0) # 1 second interval
# Set prefix for output files (e.g. "gpu", "cpu", "onnx", etc.)
prefix = "gpu"
prefix = "cpu"
# Generate token sizes
if "gpu" in prefix:
    token_sizes = generate_token_sizes(
        max_tokens=1000, dense_step=150, dense_max=1000, sparse_step=1000
    )
elif "cpu" in prefix:
    token_sizes = generate_token_sizes(
        max_tokens=1000, dense_step=100, dense_max=500, sparse_step=250
    )
else:
token_sizes = generate_token_sizes(max_tokens=3000)
@ -78,7 +86,7 @@ def main():
output_dir = os.path.join(script_dir, "output_audio")
output_data_dir = os.path.join(script_dir, "output_data")
output_plots_dir = os.path.join(script_dir, "output_plots")
# Create output directories
os.makedirs(output_dir, exist_ok=True)
os.makedirs(output_data_dir, exist_ok=True)
@ -90,7 +98,9 @@ def main():
filename = f"{prefix}_{filename}"
return os.path.join(path, filename)
with open(
    os.path.join(script_dir, "the_time_machine_hg_wells.txt"), "r", encoding="utf-8"
) as f:
text = f.read()
total_tokens = len(enc.encode(text))
@ -100,7 +110,7 @@ def main():
results = []
test_start_time = time.time()
# Start system monitoring
monitor.start()
@ -114,7 +124,8 @@ def main():
processing_time, audio_length = make_tts_request(
chunk,
output_dir=output_dir,
prefix=prefix,
stream=False, # Use non-streaming mode for RTF benchmarking
)
if processing_time is None or audio_length is None:
print("Breaking loop due to error")
@ -123,14 +134,16 @@ def main():
# Calculate RTF using the correct formula
rtf = real_time_factor(processing_time, audio_length)
print(f"Real-Time Factor: {rtf:.5f}")
results.append(
    {
        "tokens": actual_tokens,
        "processing_time": processing_time,
        "output_length": audio_length,
        "rtf": rtf,
        "elapsed_time": round(time.time() - test_start_time, 5),
    }
)
df = pd.DataFrame(results)
if df.empty:
@ -144,89 +157,101 @@ def main():
{
"title": "Benchmark Statistics (with correct RTF)",
"stats": {
"Total tokens processed": df["tokens"].sum(),
"Total audio generated (s)": df["output_length"].sum(),
"Total test duration (s)": df["elapsed_time"].max(),
"Average processing rate (tokens/s)": df["tokens_per_second"].mean(),
"Average RTF": df["rtf"].mean(),
"Average Real Time Speed": 1 / df["rtf"].mean(),
},
},
{
"title": "Per-chunk Stats",
"stats": {
"Average chunk size (tokens)": df["tokens"].mean(),
"Min chunk size (tokens)": df["tokens"].min(),
"Max chunk size (tokens)": df["tokens"].max(),
"Average processing time (s)": df["processing_time"].mean(),
"Average output length (s)": df["output_length"].mean(),
},
},
{
"title": "Performance Ranges",
"stats": {
"Processing rate range (tokens/s)": f"{df['tokens_per_second'].min():.2f} - {df['tokens_per_second'].max():.2f}",
"RTF range": f"{df['rtf'].min():.2f}x - {df['rtf'].max():.2f}x",
"Real Time Speed range": f"{1/df['rtf'].max():.2f}x - {1/df['rtf'].min():.2f}x",
},
},
]
write_benchmark_stats(
stats, prefix_path(output_data_dir, "benchmark_stats_rtf.txt")
)
# Plot Processing Time vs Token Count
plot_correlation(
df,
"tokens",
"processing_time",
"Processing Time vs Input Size",
"Number of Input Tokens",
"Processing Time (seconds)",
prefix_path(output_plots_dir, "processing_time_rtf.png"),
)
# Plot RTF vs Token Count
plot_correlation(
df,
"tokens",
"rtf",
"Real-Time Factor vs Input Size",
"Number of Input Tokens",
"Real-Time Factor (processing time / audio length)",
prefix_path(output_plots_dir, "realtime_factor_rtf.png"),
)
# Stop monitoring and get final metrics
final_metrics = monitor.stop()
# Convert metrics timeline to DataFrame for stats
metrics_df = pd.DataFrame(final_metrics)
# Add system usage stats
if not metrics_df.empty:
stats.append(
    {
        "title": "System Usage Statistics",
        "stats": {
            "Peak CPU Usage (%)": metrics_df["cpu_percent"].max(),
            "Avg CPU Usage (%)": metrics_df["cpu_percent"].mean(),
            "Peak RAM Usage (%)": metrics_df["ram_percent"].max(),
            "Avg RAM Usage (%)": metrics_df["ram_percent"].mean(),
            "Peak RAM Used (GB)": metrics_df["ram_used_gb"].max(),
            "Avg RAM Used (GB)": metrics_df["ram_used_gb"].mean(),
        },
    }
)
if "gpu_memory_used" in metrics_df:
    stats[-1]["stats"].update(
        {
            "Peak GPU Memory (MB)": metrics_df["gpu_memory_used"].max(),
            "Avg GPU Memory (MB)": metrics_df["gpu_memory_used"].mean(),
        }
    )
# Plot system metrics
plot_system_metrics(
final_metrics, prefix_path(output_plots_dir, "system_usage_rtf.png")
)
# Save final results
save_json_results(
{
"results": results,
"system_metrics": final_metrics,
"test_duration": time.time() - test_start_time,
},
prefix_path(output_data_dir, "benchmark_results_rtf.json"),
)
print("\nResults saved to:")

View file

@ -1,19 +1,30 @@
import os
import sys
import json
import time
import pandas as pd
from examples.assorted_checks.lib.shared_utils import (
    save_json_results,
    get_system_metrics,
    write_benchmark_stats,
)
from examples.assorted_checks.lib.shared_plotting import (
    plot_correlation,
    plot_system_metrics,
)
from examples.assorted_checks.lib.shared_benchmark_utils import (
    enc,
    make_tts_request,
    get_text_for_tokens,
    generate_token_sizes,
)
def main():
# Get optional prefix from first command line argument
prefix = sys.argv[1] if len(sys.argv) > 1 else ""
# Set up paths relative to this file
@ -21,7 +32,7 @@ def main():
output_dir = os.path.join(script_dir, "output_audio")
output_data_dir = os.path.join(script_dir, "output_data")
output_plots_dir = os.path.join(script_dir, "output_plots")
# Create output directories
os.makedirs(output_dir, exist_ok=True)
os.makedirs(output_data_dir, exist_ok=True)
@ -43,7 +54,6 @@ def main():
total_tokens = len(enc.encode(text))
print(f"Total tokens in file: {total_tokens}")
token_sizes = generate_token_sizes(total_tokens)
print(f"Testing sizes: {token_sizes}")
@ -85,7 +95,7 @@ def main():
# Save intermediate results
save_json_results(
{"results": results, "system_metrics": system_metrics},
prefix_path(output_data_dir, "benchmark_results.json"),
)
# Create DataFrame and calculate stats
@ -102,53 +112,59 @@ def main():
{
"title": "Benchmark Statistics",
"stats": {
"Total tokens processed": df["tokens"].sum(),
"Total audio generated (s)": df["output_length"].sum(),
"Total test duration (s)": df["elapsed_time"].max(),
"Average processing rate (tokens/s)": df["tokens_per_second"].mean(),
"Average realtime factor": df["realtime_factor"].mean(),
},
},
{
"title": "Per-chunk Stats",
"stats": {
"Average chunk size (tokens)": df["tokens"].mean(),
"Min chunk size (tokens)": df["tokens"].min(),
"Max chunk size (tokens)": df["tokens"].max(),
"Average processing time (s)": df["processing_time"].mean(),
"Average output length (s)": df["output_length"].mean(),
},
},
{
"title": "Performance Ranges",
"stats": {
"Processing rate range (tokens/s)": f"{df['tokens_per_second'].min():.2f} - {df['tokens_per_second'].max():.2f}",
"Realtime factor range": f"{df['realtime_factor'].min():.2f}x - {df['realtime_factor'].max():.2f}x",
},
},
]
write_benchmark_stats(stats, prefix_path(output_data_dir, "benchmark_stats.txt"))
# Plot Processing Time vs Token Count
plot_correlation(
df,
"tokens",
"processing_time",
"Processing Time vs Input Size",
"Number of Input Tokens",
"Processing Time (seconds)",
prefix_path(output_plots_dir, "processing_time.png"),
)
# Plot Realtime Factor vs Token Count
plot_correlation(
df,
"tokens",
"realtime_factor",
"Realtime Factor vs Input Size",
"Number of Input Tokens",
"Realtime Factor (output length / processing time)",
prefix_path(output_plots_dir, "realtime_factor.png"),
)
# Plot system metrics
plot_system_metrics(
system_metrics, prefix_path(output_plots_dir, "system_usage.png")
)
print("\nResults saved to:")
print(f"- {prefix_path(output_data_dir, 'benchmark_results.json')}")

View file

@ -1,11 +1,12 @@
"""Shared utilities specific to TTS benchmarking."""
import time
from typing import List, Tuple, Optional
import requests
import tiktoken
from .shared_utils import save_audio_file, get_audio_length
# Global tokenizer instance
enc = tiktoken.get_encoding("cl100k_base")
@ -13,11 +14,11 @@ enc = tiktoken.get_encoding("cl100k_base")
def get_text_for_tokens(text: str, num_tokens: int) -> str:
"""Get a slice of text that contains exactly num_tokens tokens.
Args:
text: Input text to slice
num_tokens: Desired number of tokens
Returns:
str: Text slice containing exactly num_tokens tokens
"""
@ -31,44 +32,69 @@ def make_tts_request(
text: str,
output_dir: str = None,
timeout: int = 1800,
prefix: str = "",
stream: bool = True,
) -> Tuple[Optional[float], Optional[float]]:
"""Make TTS request using OpenAI-compatible endpoint.
Args:
text: Input text to convert to speech
output_dir: Directory to save audio files. If None, audio won't be saved.
timeout: Request timeout in seconds
prefix: Optional prefix for output filenames
Returns:
tuple: (processing_time, audio_length) in seconds, or (None, None) on error
"""
try:
start_time = time.time()
if stream:
# For streaming, we need to collect all chunks
audio_chunks = []
response = requests.post(
"http://localhost:8880/v1/audio/speech",
json={
"model": "kokoro",
"input": text,
"voice": "af",
"response_format": "wav",
"stream": True,
},
timeout=timeout,
stream=True,
)
response.raise_for_status()
for chunk in response.iter_content(chunk_size=8192):
if chunk:
audio_chunks.append(chunk)
# Combine all chunks
audio_data = b"".join(audio_chunks)
else:
response = requests.post(
"http://localhost:8880/v1/audio/speech",
json={
"model": "kokoro",
"input": text,
"voice": "af",
"response_format": "wav",
"stream": False,
},
timeout=timeout,
)
response.raise_for_status()
audio_data = response.content
processing_time = round(time.time() - start_time, 2)
# Calculate audio length from audio data
audio_length = get_audio_length(audio_data)
# Save the audio file if output_dir is provided
if output_dir:
token_count = len(enc.encode(text))
output_file = save_audio_file(
audio_data, f"chunk_{token_count}_tokens", output_dir
)
print(f"Saved audio to {output_file}")
@ -86,26 +112,26 @@ def generate_token_sizes(
max_tokens: int,
dense_step: int = 100,
dense_max: int = 1000,
sparse_step: int = 1000,
) -> List[int]:
"""Generate token size ranges with dense sampling at start.
Args:
max_tokens: Maximum number of tokens to generate sizes up to
dense_step: Step size for dense sampling range
dense_max: Maximum value for dense sampling
sparse_step: Step size for sparse sampling range
Returns:
list: Sorted list of token sizes
"""
# Dense sampling at start
dense_range = list(range(dense_step, dense_max + 1, dense_step))
if max_tokens <= dense_max or sparse_step < dense_max:
return sorted(dense_range)
# Sparse sampling for larger sizes
sparse_range = list(range(dense_max + sparse_step, max_tokens + 1, sparse_step))
# Combine and deduplicate
return sorted(list(set(dense_range + sparse_range)))
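# Worked example (hypothetical arguments): generate_token_sizes(
#     max_tokens=3000, dense_step=100, dense_max=1000, sparse_step=1000)
# returns [100, 200, ..., 900, 1000] from the dense range plus [2000, 3000]
# from the sparse range: dense sampling up to 1000 tokens, then every 1000.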

View file

@ -1,7 +1,8 @@
"""Shared plotting utilities for benchmarks and tests."""
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.patches as patches
@ -12,66 +13,71 @@ STYLE_CONFIG = {
"secondary_color": "#05d9e8",
"grid_color": "#ffffff",
"text_color": "#ffffff",
"font_sizes": {"title": 16, "label": 14, "tick": 12, "text": 10},
}
def setup_plot(fig, ax, title, xlabel=None, ylabel=None):
"""Configure plot styling with consistent theme.
Args:
fig: matplotlib figure object
ax: matplotlib axis object
title: str, plot title
xlabel: str, optional x-axis label
ylabel: str, optional y-axis label
Returns:
tuple: (fig, ax) with applied styling
"""
# Grid styling
ax.grid(True, linestyle="--", alpha=0.3, color=STYLE_CONFIG["grid_color"])
# Title and labels
ax.set_title(
title,
pad=20,
fontsize=STYLE_CONFIG["font_sizes"]["title"],
fontweight="bold",
color=STYLE_CONFIG["text_color"],
)
if xlabel:
ax.set_xlabel(
xlabel,
fontsize=STYLE_CONFIG["font_sizes"]["label"],
fontweight="medium",
color=STYLE_CONFIG["text_color"],
)
if ylabel:
ax.set_ylabel(
ylabel,
fontsize=STYLE_CONFIG["font_sizes"]["label"],
fontweight="medium",
color=STYLE_CONFIG["text_color"],
)
# Tick styling
ax.tick_params(
labelsize=STYLE_CONFIG["font_sizes"]["tick"], colors=STYLE_CONFIG["text_color"]
)
# Spine styling
for spine in ax.spines.values():
spine.set_color(STYLE_CONFIG["text_color"])
spine.set_alpha(0.3)
spine.set_linewidth(0.5)
# Background colors
ax.set_facecolor(STYLE_CONFIG["background_color"])
fig.patch.set_facecolor(STYLE_CONFIG["background_color"])
return fig, ax
def plot_system_metrics(metrics_data, output_path):
"""Create plots for system metrics over time.
Args:
metrics_data: list of dicts containing system metrics
output_path: str, path to save the output plot
@ -79,68 +85,118 @@ def plot_system_metrics(metrics_data, output_path):
df = pd.DataFrame(metrics_data)
df["timestamp"] = pd.to_datetime(df["timestamp"])
elapsed_time = (df["timestamp"] - df["timestamp"].iloc[0]).dt.total_seconds()
# Get baseline values
baseline_cpu = df["cpu_percent"].iloc[0]
baseline_ram = df["ram_used_gb"].iloc[0]
baseline_gpu = (
df["gpu_memory_used"].iloc[0] / 1024
if "gpu_memory_used" in df.columns
else None
)
# Convert GPU memory to GB if present
if "gpu_memory_used" in df.columns:
df["gpu_memory_gb"] = df["gpu_memory_used"] / 1024
plt.style.use("dark_background")
# Create subplots based on available metrics
has_gpu = "gpu_memory_used" in df.columns
num_plots = 3 if has_gpu else 2
fig, axes = plt.subplots(num_plots, 1, figsize=(15, 5 * num_plots))
fig.patch.set_facecolor(STYLE_CONFIG["background_color"])
# Smoothing window
window = min(5, len(df) // 2)
# Plot CPU Usage
smoothed_cpu = df["cpu_percent"].rolling(window=window, center=True).mean()
sns.lineplot(
x=elapsed_time,
y=smoothed_cpu,
ax=axes[0],
color=STYLE_CONFIG["primary_color"],
linewidth=2,
)
axes[0].axhline(
y=baseline_cpu,
color=STYLE_CONFIG["secondary_color"],
linestyle="--",
alpha=0.5,
label="Baseline",
)
setup_plot(
fig,
axes[0],
"CPU Usage Over Time",
xlabel="Time (seconds)",
ylabel="CPU Usage (%)",
)
axes[0].set_ylim(0, max(df["cpu_percent"]) * 1.1)
axes[0].legend()
# Plot RAM Usage
smoothed_ram = df["ram_used_gb"].rolling(window=window, center=True).mean()
sns.lineplot(
x=elapsed_time,
y=smoothed_ram,
ax=axes[1],
color=STYLE_CONFIG["secondary_color"],
linewidth=2,
)
axes[1].axhline(
y=baseline_ram,
color=STYLE_CONFIG["primary_color"],
linestyle="--",
alpha=0.5,
label="Baseline",
)
setup_plot(
fig,
axes[1],
"RAM Usage Over Time",
xlabel="Time (seconds)",
ylabel="RAM Usage (GB)",
)
axes[1].set_ylim(0, max(df["ram_used_gb"]) * 1.1)
axes[1].legend()
# Plot GPU Memory if available
if has_gpu:
smoothed_gpu = df["gpu_memory_gb"].rolling(window=window, center=True).mean()
sns.lineplot(
x=elapsed_time,
y=smoothed_gpu,
ax=axes[2],
color=STYLE_CONFIG["primary_color"],
linewidth=2,
)
axes[2].axhline(
y=baseline_gpu,
color=STYLE_CONFIG["secondary_color"],
linestyle="--",
alpha=0.5,
label="Baseline",
)
setup_plot(
fig,
axes[2],
"GPU Memory Usage Over Time",
xlabel="Time (seconds)",
ylabel="GPU Memory (GB)",
)
axes[2].set_ylim(0, max(df["gpu_memory_gb"]) * 1.1)
axes[2].legend()
plt.tight_layout()
plt.savefig(output_path, dpi=300, bbox_inches="tight")
plt.close()
def plot_timeline(df, output_path, suffix="", prefix=""):
"""Create timeline plot showing latency for each run.
Args:
df: pandas DataFrame containing run data with columns:
- target_tokens: number of tokens
@ -149,124 +205,161 @@ def plot_timeline(df, output_path, suffix=""):
output_path: str, path to save the output plot
"""
plt.style.use("dark_background")
# Sort by tokens and run number
df = df.sort_values(["target_tokens", "run_number"])
# Create figure and axis
fig, ax = plt.subplots(figsize=(12, 6))
# Calculate y positions for each run with tighter grouping
unique_tokens = sorted(df["target_tokens"].unique())
y_positions = {}
current_y = 0
group_spacing = 0.8 # Space between groups
run_spacing = 0.2 # Space between runs in a group
for tokens in unique_tokens:
runs = df[df["target_tokens"] == tokens]
base_y = current_y
for i, (_, run) in enumerate(runs.iterrows()):
y_positions[(tokens, run["run_number"])] = base_y + (i * run_spacing)
current_y = base_y + (len(runs) * run_spacing) + group_spacing
# Plot bars and points with more transparency
bar_height = 0.15
for _, row in df.iterrows():
y = y_positions[(row["target_tokens"], row["run_number"])]
latency = row["time_to_first_chunk"]
# Latency bar
ax.add_patch(
patches.Rectangle(
(0, y - bar_height / 2),
latency,
bar_height,
facecolor=STYLE_CONFIG["primary_color"],
alpha=0.3,
)
)
# End point
ax.plot(
latency,
y,
"o",
color=STYLE_CONFIG["secondary_color"],
markersize=4,
alpha=0.5,
)
# Add mean lines and values for each token group
for tokens in unique_tokens:
token_runs = df[df["target_tokens"] == tokens]
mean_latency = token_runs["time_to_first_chunk"].mean()
y_positions_for_token = [
y_positions[(tokens, run["run_number"])] for _, run in token_runs.iterrows()
]
min_y = min(y_positions_for_token)
max_y = max(y_positions_for_token)
group_center = (min_y + max_y) / 2
# Plot mean line with gradient alpha
gradient = np.linspace(0.2, 0.8, 100)
for i in range(len(gradient) - 1):
y1 = (
min_y
- bar_height
+ (max_y - min_y + 2 * bar_height) * (i / len(gradient))
)
y2 = (
min_y
- bar_height
+ (max_y - min_y + 2 * bar_height) * ((i + 1) / len(gradient))
)
ax.plot(
[mean_latency, mean_latency],
[y1, y2],
"-",
color=STYLE_CONFIG["secondary_color"],
linewidth=3,
alpha=gradient[i],
)
# Add mean value label with background
label_text = f"Mean: {mean_latency:.3f}s"
bbox_props = dict(
facecolor=STYLE_CONFIG["background_color"],
edgecolor=STYLE_CONFIG["secondary_color"],
alpha=0.8,
pad=3,
linewidth=1,
)
ax.text(
mean_latency + 0.02,
group_center,
label_text,
color=STYLE_CONFIG["secondary_color"],
va="center",
fontsize=10,
fontweight="bold",
bbox=bbox_props,
)
# Customize plot
ax.set_ylim(-1, current_y)
ax.set_xlim(0, df["time_to_first_chunk"].max() * 1.3) # Extra space for labels
# Add labels for token groups with tighter spacing
group_positions = {}
for tokens in unique_tokens:
runs = df[df["target_tokens"] == tokens]
y_positions_for_token = [
y_positions[(tokens, run["run_number"])] for _, run in runs.iterrows()
]
group_positions[tokens] = sum(y_positions_for_token) / len(
y_positions_for_token
)
plt.axhline(
y=min(y_positions_for_token) - bar_height,
color="white",
alpha=0.1,
linestyle="-",
)
# Calculate mean audio length for each token group
audio_lengths = {}
for tokens in unique_tokens:
token_runs = df[df["target_tokens"] == tokens]
audio_lengths[tokens] = token_runs["audio_length"].mean()
# Set y-ticks at group centers with token counts and audio lengths
plt.yticks(
list(group_positions.values()),
[
f"{tokens} tokens\n({audio_lengths[tokens]:.1f}s)"
for tokens in group_positions.keys()
],
fontsize=10,
)
# Customize appearance
setup_plot(
fig,
ax,
prefix.upper() + " Time-To-Audio Latency " + suffix,
xlabel="Time (seconds)",
ylabel="Input Size",
)
plt.tight_layout()
plt.savefig(output_path, dpi=300, bbox_inches="tight")
plt.close()
def plot_correlation(df, x, y, title, xlabel, ylabel, output_path):
"""Create correlation plot with regression line and correlation coefficient.
Args:
df: pandas DataFrame containing the data
x: str, column name for x-axis
@ -277,28 +370,40 @@ def plot_correlation(df, x, y, title, xlabel, ylabel, output_path):
output_path: str, path to save the output plot
"""
plt.style.use("dark_background")
fig, ax = plt.subplots(figsize=(12, 8))
# Scatter plot
sns.scatterplot(
data=df, x=x, y=y, s=100, alpha=0.6, color=STYLE_CONFIG["primary_color"]
)
# Regression line
sns.regplot(
data=df,
x=x,
y=y,
scatter=False,
color=STYLE_CONFIG["secondary_color"],
line_kws={"linewidth": 2},
)
# Add correlation coefficient
corr = df[x].corr(df[y])
plt.text(
0.05,
0.95,
f"Correlation: {corr:.2f}",
transform=ax.transAxes,
fontsize=STYLE_CONFIG["font_sizes"]["text"],
color=STYLE_CONFIG["text_color"],
bbox=dict(
facecolor=STYLE_CONFIG["background_color"],
edgecolor=STYLE_CONFIG["text_color"],
alpha=0.7,
),
)
setup_plot(fig, ax, title, xlabel=xlabel, ylabel=ylabel)
plt.savefig(output_path, dpi=300, bbox_inches="tight")
plt.close()
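# Usage sketch (columns and path are illustrative): given a DataFrame with
# numeric "tokens" and "processing_time" columns, this draws the themed
# scatter with a regression line and annotates the Pearson correlation:
#
#   plot_correlation(df, "tokens", "processing_time",
#                    "Processing Time vs Input Size", "Tokens", "Seconds",
#                    "output_plots/processing_time.png")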

View file

@ -1,9 +1,10 @@
"""Shared utilities for benchmarks and tests."""
import os
import json
import subprocess
from datetime import datetime
from typing import Any, Dict, List, Optional, Union
import psutil
import scipy.io.wavfile as wavfile
@ -12,28 +13,46 @@ import scipy.io.wavfile as wavfile
TORCH_AVAILABLE = False
try:
import torch
TORCH_AVAILABLE = torch.cuda.is_available()
except ImportError:
pass
def check_audio_file_is_silent(audio_path: str, threshold: float = 0.01) -> bool:
"""Check if an audio file is silent by comparing peak amplitude to a threshold.
Args:
audio_path: Path to the audio file
threshold: Peak amplitude threshold for silence
Returns:
bool: True if audio is silent, False otherwise
"""
rate, data = wavfile.read(audio_path)
peak_amplitude = max(abs(data.min()), abs(data.max())) / 32768.0 # 16-bit audio
return peak_amplitude < threshold
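# Example with hypothetical numbers: an int16 file peaking at 164 counts gives
# peak_amplitude = 164 / 32768 ≈ 0.005, below the default 0.01 threshold, so
# the file is reported as silent.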
def get_audio_length(audio_data: bytes, temp_dir: str = None) -> float:
"""Get audio length in seconds from bytes data.
Args:
audio_data: Raw audio bytes
temp_dir: Directory for temporary file. If None, uses system temp directory.
Returns:
float: Audio length in seconds
"""
if temp_dir is None:
import tempfile
temp_dir = tempfile.gettempdir()
temp_path = os.path.join(temp_dir, "temp.wav")
os.makedirs(temp_dir, exist_ok=True)
with open(temp_path, "wb") as f:
f.write(audio_data)
@ -47,11 +66,11 @@ def get_audio_length(audio_data: bytes, temp_dir: str = None) -> float:
def get_gpu_memory(average: bool = True) -> Optional[Union[float, List[float]]]:
"""Get GPU memory usage using PyTorch if available, falling back to nvidia-smi.
Args:
average: If True and multiple GPUs present, returns average memory usage.
If False, returns list of memory usage per GPU.
Returns:
float or List[float] or None: GPU memory usage in MB. Returns None if no GPU available.
If average=False and multiple GPUs present, returns list of values.
@ -60,19 +79,23 @@ def get_gpu_memory(average: bool = True) -> Optional[Union[float, List[float]]]:
n_gpus = torch.cuda.device_count()
memory_used = []
for i in range(n_gpus):
memory_used.append(
torch.cuda.memory_allocated(i) / 1024**2
) # Convert to MB
if average and len(memory_used) > 0:
return sum(memory_used) / len(memory_used)
return memory_used if len(memory_used) > 1 else memory_used[0]
# Fall back to nvidia-smi
try:
result = subprocess.check_output(
["nvidia-smi", "--query-gpu=memory.used", "--format=csv,nounits,noheader"]
)
memory_values = [
float(x.strip()) for x in result.decode("utf-8").split("\n") if x.strip()
]
if average and len(memory_values) > 0:
return sum(memory_values) / len(memory_values)
return memory_values if len(memory_values) > 1 else memory_values[0]
@ -82,14 +105,14 @@ def get_gpu_memory(average: bool = True) -> Optional[Union[float, List[float]]]:
def get_system_metrics() -> Dict[str, Union[str, float]]:
"""Get current system metrics including CPU, RAM, and GPU if available.
Returns:
dict: System metrics including timestamp, CPU%, RAM%, RAM GB, and GPU MB if available
"""
# Get per-CPU percentages and calculate average
cpu_percentages = psutil.cpu_percent(percpu=True)
avg_cpu = sum(cpu_percentages) / len(cpu_percentages)
metrics = {
"timestamp": datetime.now().isoformat(),
"cpu_percent": round(avg_cpu, 2),
@ -106,40 +129,40 @@ def get_system_metrics() -> Dict[str, Union[str, float]]:
def save_audio_file(audio_data: bytes, identifier: str, output_dir: str) -> str:
"""Save audio data to a file with proper naming and directory creation.
Args:
audio_data: Raw audio bytes
identifier: String to identify this audio file (e.g. token count, test name)
output_dir: Directory to save the file
Returns:
str: Path to the saved audio file
"""
os.makedirs(output_dir, exist_ok=True)
output_file = os.path.join(output_dir, f"{identifier}.wav")
with open(output_file, "wb") as f:
f.write(audio_data)
return output_file
def write_benchmark_stats(stats: List[Dict[str, Any]], output_file: str) -> None:
"""Write benchmark statistics to a file in a clean, organized format.
Args:
stats: List of dictionaries containing stat name/value pairs
output_file: Path to output file
"""
os.makedirs(os.path.dirname(output_file), exist_ok=True)
with open(output_file, "w") as f:
for section in stats:
# Write section header
f.write(f"=== {section['title']} ===\n\n")
# Write stats
for label, value in section["stats"].items():
if isinstance(value, float):
f.write(f"{label}: {value:.2f}\n")
else:
@ -149,7 +172,7 @@ def write_benchmark_stats(stats: List[Dict[str, Any]], output_file: str) -> None
def save_json_results(results: Dict[str, Any], output_file: str) -> None:
"""Save benchmark results to a JSON file with proper formatting.
Args:
results: Dictionary of results to save
output_file: Path to output file
@ -159,14 +182,16 @@ def save_json_results(results: Dict[str, Any], output_file: str) -> None:
json.dump(results, f, indent=2)
def real_time_factor(
processing_time: float, audio_length: float, decimals: int = 2
) -> float:
"""Calculate Real-Time Factor (RTF) as processing-time / length-of-audio.
Args:
processing_time: Time taken to process/generate audio
audio_length: Length of the generated audio
decimals: Number of decimal places to round to
Returns:
float: RTF value
"""

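# Worked example (hypothetical numbers): 15 s of processing time for 60 s of
# generated audio gives RTF = 15 / 60 = 0.25, i.e. about 4x faster than real
# time; an RTF above 1.0 means generation is slower than playback.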
View file

@ -0,0 +1,205 @@
#!/usr/bin/env python3
import os
import time
from typing import Any, Dict, List, Callable
import pandas as pd
import scipy.io.wavfile as wavfile
from .shared_utils import save_json_results
from .shared_plotting import plot_timeline, plot_correlation
from .shared_benchmark_utils import enc, get_text_for_tokens
def check_audio_silence(audio_path: str) -> bool:
"""Check if audio file contains only silence"""
sample_rate, audio_data = wavfile.read(audio_path)
# Convert to float for RMS calculation
audio_float = audio_data.astype(float)
# Calculate RMS value
rms = (audio_float**2).mean() ** 0.5
# Define silence threshold (adjust if needed)
SILENCE_THRESHOLD = 50.0
return rms < SILENCE_THRESHOLD
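# Note: RMS is computed on raw int16 sample values, so the 50.0 threshold is
# roughly 0.15% of full scale (50 / 32768); true digital silence has RMS 0.0.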
def process_benchmark_results(
all_results: List[Dict[str, Any]], token_sizes: List[int]
) -> Dict[str, Any]:
"""Process benchmark results and generate summary"""
summary = {}
for tokens in token_sizes:
matching_results = [
r for r in all_results if r["target_tokens"] == tokens and not r["error"]
]
if matching_results:
avg_first_chunk = sum(
r["time_to_first_chunk"] for r in matching_results
) / len(matching_results)
avg_total = sum(r["total_time"] for r in matching_results) / len(
matching_results
)
avg_audio_length = sum(r["audio_length"] for r in matching_results) / len(
matching_results
)
summary[tokens] = {
"avg_time_to_first_chunk": round(avg_first_chunk, 3),
"avg_total_time": round(avg_total, 3),
"avg_audio_length": round(avg_audio_length, 3),
"num_successful_runs": len(matching_results),
}
return summary
def save_benchmark_results(
all_results: List[Dict[str, Any]],
summary: Dict[str, Any],
output_data_dir: str,
output_plots_dir: str,
suffix: str,
plot_title_suffix: str,
prefix: str = "",
):
"""Save benchmark results and generate plots"""
# Save results
results_data = {
"individual_runs": all_results,
"summary": summary,
"timestamp": time.strftime("%Y-%m-%d %H:%M:%S"),
}
save_json_results(
results_data,
os.path.join(output_data_dir, f"{prefix}first_token_benchmark{suffix}.json"),
)
# Create DataFrame for plotting
df = pd.DataFrame(all_results)
# Create plots
plot_correlation(
df,
"target_tokens",
"time_to_first_chunk",
f"Time to First Audio vs Input Size {plot_title_suffix}",
"Number of Input Tokens",
"Time to First Audio (seconds)",
os.path.join(output_plots_dir, f"{prefix}first_token_latency{suffix}.png"),
)
plot_correlation(
df,
"target_tokens",
"total_time",
f"Total Time vs Input Size {plot_title_suffix}",
"Number of Input Tokens",
"Total Time (seconds)",
os.path.join(output_plots_dir, f"{prefix}total_time_latency{suffix}.png"),
)
plot_timeline(
df,
os.path.join(output_plots_dir, f"{prefix}first_token_timeline{suffix}.png"),
suffix=plot_title_suffix,
)
def run_benchmark(
measure_func: Callable,
output_dir: str,
output_data_dir: str,
output_plots_dir: str,
suffix: str = "",
plot_title_suffix: str = "",
num_runs: int = 5,
client=None,
prefix="",
):
"""Run benchmark with the given measurement function"""
# Create output directories
os.makedirs(output_dir, exist_ok=True)
os.makedirs(output_data_dir, exist_ok=True)
os.makedirs(output_plots_dir, exist_ok=True)
# Load sample text
script_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
with open(
os.path.join(script_dir, "the_time_machine_hg_wells.txt"), "r", encoding="utf-8"
) as f:
text = f.read()
# Test specific token counts
token_sizes = [10, 50, 100, 250, 500]
all_results = []
silent_files = []
for tokens in token_sizes:
print(
f"\nTesting {tokens} tokens{' ' + plot_title_suffix if plot_title_suffix else ''}"
)
test_text = get_text_for_tokens(text, tokens)
actual_tokens = len(enc.encode(test_text))
print(f"Text preview: {test_text[:50]}...")
for i in range(num_runs):
print(f"Run {i+1}/{num_runs}...")
result = measure_func(test_text, output_dir, tokens, i + 1)
result["target_tokens"] = tokens
result["actual_tokens"] = actual_tokens
result["run_number"] = i + 1
# Handle time to first audio
first_chunk = result.get('time_to_first_chunk')
print(
f"Time to First Audio: {f'{first_chunk:.3f}s' if first_chunk is not None else 'N/A'}"
)
# Handle total time
total_time = result.get('total_time')
print(
f"Time to Save Complete: {f'{total_time:.3f}s' if total_time is not None else 'N/A'}"
)
# Handle audio length
audio_length = result.get('audio_length')
print(
f"Audio length: {f'{audio_length:.3f}s' if audio_length is not None else 'N/A'}"
)
# Calculate streaming overhead only if both values exist
if total_time is not None and first_chunk is not None:
print(f"Streaming overhead: {(total_time - first_chunk):.3f}s")
else:
print("Streaming overhead: N/A")
if result["error"]:
print(f"Error: {result['error']}")
elif result["audio_path"] and check_audio_silence(result["audio_path"]):
silent_files.append(result["audio_path"])
all_results.append(result)
# Process and save results
summary = process_benchmark_results(all_results, token_sizes)
save_benchmark_results(
    all_results,
    summary,
    output_data_dir,
    output_plots_dir,
    suffix,
    plot_title_suffix,
    prefix=prefix,  # keep saved filenames in sync with the paths printed below
)
# Print paths
print("\nResults and plots saved to:")
print(f"- {os.path.join(output_data_dir, f'{prefix}first_token_benchmark{suffix}.json')}")
print(f"- {os.path.join(output_plots_dir, f'{prefix}first_token_latency{suffix}.png')}")
print(f"- {os.path.join(output_plots_dir, f'{prefix}total_time_latency{suffix}.png')}")
print(f"- {os.path.join(output_plots_dir, f'{prefix}first_token_timeline{suffix}.png')}")
# Print silence check summary
if silent_files:
print("\nWARNING: The following files contain only silence:")
for file in silent_files:
print(f"- {file}")
else:
print("\nAll generated audio files contain valid audio content.")
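# Wiring sketch (arguments are illustrative): any measurement function with
# the (text, output_dir, tokens, run_number) -> dict signature used above can
# be plugged in, e.g.:
#
#   run_benchmark(
#       measure_first_token_requests,
#       output_dir="output_audio_stream",
#       output_data_dir="output_data",
#       output_plots_dir="output_plots",
#       suffix="_stream",
#       plot_title_suffix="(Streaming)",
#       prefix="cpu",
#   )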

View file

@ -1,111 +0,0 @@
{
"results": [
{
"tokens": 100,
"processing_time": 18.833295583724976,
"output_length": 31.15,
"realtime_factor": 1.6539856161403135,
"elapsed_time": 19.024322748184204
},
{
"tokens": 200,
"processing_time": 38.95506024360657,
"output_length": 62.6,
"realtime_factor": 1.6069799304257042,
"elapsed_time": 58.21527123451233
},
{
"tokens": 300,
"processing_time": 49.74252939224243,
"output_length": 96.325,
"realtime_factor": 1.9364716908630366,
"elapsed_time": 108.19673728942871
},
{
"tokens": 400,
"processing_time": 61.349056243896484,
"output_length": 128.575,
"realtime_factor": 2.095794261102292,
"elapsed_time": 169.733656167984
},
{
"tokens": 500,
"processing_time": 82.86568236351013,
"output_length": 158.575,
"realtime_factor": 1.9136389815071193,
"elapsed_time": 252.7968451976776
}
],
"system_metrics": [
{
"timestamp": "2025-01-03T00:13:49.865330",
"cpu_percent": 8.0,
"ram_percent": 39.4,
"ram_used_gb": 25.03811264038086,
"gpu_memory_used": 1204.0
},
{
"timestamp": "2025-01-03T00:14:08.781551",
"cpu_percent": 26.8,
"ram_percent": 42.6,
"ram_used_gb": 27.090862274169922,
"gpu_memory_used": 1225.0
},
{
"timestamp": "2025-01-03T00:14:08.916973",
"cpu_percent": 16.1,
"ram_percent": 42.6,
"ram_used_gb": 27.089553833007812,
"gpu_memory_used": 1225.0
},
{
"timestamp": "2025-01-03T00:14:47.979053",
"cpu_percent": 31.5,
"ram_percent": 43.6,
"ram_used_gb": 27.714427947998047,
"gpu_memory_used": 1225.0
},
{
"timestamp": "2025-01-03T00:14:48.098976",
"cpu_percent": 20.0,
"ram_percent": 43.6,
"ram_used_gb": 27.704315185546875,
"gpu_memory_used": 1211.0
},
{
"timestamp": "2025-01-03T00:15:37.944729",
"cpu_percent": 29.7,
"ram_percent": 38.6,
"ram_used_gb": 24.53925323486328,
"gpu_memory_used": 1217.0
},
{
"timestamp": "2025-01-03T00:15:38.071915",
"cpu_percent": 8.6,
"ram_percent": 38.5,
"ram_used_gb": 24.51690673828125,
"gpu_memory_used": 1208.0
},
{
"timestamp": "2025-01-03T00:16:39.525449",
"cpu_percent": 23.4,
"ram_percent": 38.8,
"ram_used_gb": 24.71230697631836,
"gpu_memory_used": 1221.0
},
{
"timestamp": "2025-01-03T00:16:39.612442",
"cpu_percent": 5.5,
"ram_percent": 38.9,
"ram_used_gb": 24.72066879272461,
"gpu_memory_used": 1221.0
},
{
"timestamp": "2025-01-03T00:18:02.569076",
"cpu_percent": 27.4,
"ram_percent": 39.1,
"ram_used_gb": 24.868202209472656,
"gpu_memory_used": 1264.0
}
]
}

View file

@ -1,216 +0,0 @@
{
"results": [
{
"tokens": 100,
"processing_time": 14.349808931350708,
"output_length": 31.15,
"rtf": 0.46,
"elapsed_time": 14.716031074523926
},
{
"tokens": 200,
"processing_time": 28.341803312301636,
"output_length": 62.6,
"rtf": 0.45,
"elapsed_time": 43.44207406044006
},
{
"tokens": 300,
"processing_time": 43.352553606033325,
"output_length": 96.325,
"rtf": 0.45,
"elapsed_time": 87.26906609535217
},
{
"tokens": 400,
"processing_time": 71.02449822425842,
"output_length": 128.575,
"rtf": 0.55,
"elapsed_time": 158.7198133468628
},
{
"tokens": 500,
"processing_time": 70.92521691322327,
"output_length": 158.575,
"rtf": 0.45,
"elapsed_time": 230.01379895210266
},
{
"tokens": 600,
"processing_time": 83.6328592300415,
"output_length": 189.25,
"rtf": 0.44,
"elapsed_time": 314.02610969543457
},
{
"tokens": 700,
"processing_time": 103.0810194015503,
"output_length": 222.075,
"rtf": 0.46,
"elapsed_time": 417.5678551197052
},
{
"tokens": 800,
"processing_time": 127.02162909507751,
"output_length": 253.85,
"rtf": 0.5,
"elapsed_time": 545.0128681659698
},
{
"tokens": 900,
"processing_time": 130.49781227111816,
"output_length": 283.775,
"rtf": 0.46,
"elapsed_time": 675.8943417072296
},
{
"tokens": 1000,
"processing_time": 154.76425909996033,
"output_length": 315.475,
"rtf": 0.49,
"elapsed_time": 831.0677945613861
}
],
"system_metrics": [
{
"timestamp": "2025-01-03T00:23:52.896889",
"cpu_percent": 4.5,
"ram_percent": 39.1,
"ram_used_gb": 24.86032485961914,
"gpu_memory_used": 1281.0
},
{
"timestamp": "2025-01-03T00:24:07.429461",
"cpu_percent": 4.5,
"ram_percent": 39.1,
"ram_used_gb": 24.847564697265625,
"gpu_memory_used": 1285.0
},
{
"timestamp": "2025-01-03T00:24:07.620587",
"cpu_percent": 2.7,
"ram_percent": 39.1,
"ram_used_gb": 24.846607208251953,
"gpu_memory_used": 1275.0
},
{
"timestamp": "2025-01-03T00:24:36.140754",
"cpu_percent": 5.4,
"ram_percent": 39.1,
"ram_used_gb": 24.857810974121094,
"gpu_memory_used": 1267.0
},
{
"timestamp": "2025-01-03T00:24:36.340675",
"cpu_percent": 6.2,
"ram_percent": 39.1,
"ram_used_gb": 24.85773468017578,
"gpu_memory_used": 1267.0
},
{
"timestamp": "2025-01-03T00:25:19.905634",
"cpu_percent": 29.1,
"ram_percent": 39.2,
"ram_used_gb": 24.920318603515625,
"gpu_memory_used": 1256.0
},
{
"timestamp": "2025-01-03T00:25:20.182219",
"cpu_percent": 20.0,
"ram_percent": 39.2,
"ram_used_gb": 24.930198669433594,
"gpu_memory_used": 1256.0
},
{
"timestamp": "2025-01-03T00:26:31.414760",
"cpu_percent": 5.3,
"ram_percent": 39.5,
"ram_used_gb": 25.127891540527344,
"gpu_memory_used": 1259.0
},
{
"timestamp": "2025-01-03T00:26:31.617256",
"cpu_percent": 3.6,
"ram_percent": 39.5,
"ram_used_gb": 25.126346588134766,
"gpu_memory_used": 1252.0
},
{
"timestamp": "2025-01-03T00:27:42.736097",
"cpu_percent": 10.5,
"ram_percent": 39.5,
"ram_used_gb": 25.100231170654297,
"gpu_memory_used": 1249.0
},
{
"timestamp": "2025-01-03T00:27:42.912870",
"cpu_percent": 5.3,
"ram_percent": 39.5,
"ram_used_gb": 25.098285675048828,
"gpu_memory_used": 1249.0
},
{
"timestamp": "2025-01-03T00:29:06.725264",
"cpu_percent": 8.9,
"ram_percent": 39.5,
"ram_used_gb": 25.123123168945312,
"gpu_memory_used": 1239.0
},
{
"timestamp": "2025-01-03T00:29:06.928826",
"cpu_percent": 5.5,
"ram_percent": 39.5,
"ram_used_gb": 25.128646850585938,
"gpu_memory_used": 1239.0
},
{
"timestamp": "2025-01-03T00:30:50.206349",
"cpu_percent": 49.6,
"ram_percent": 39.6,
"ram_used_gb": 25.162948608398438,
"gpu_memory_used": 1245.0
},
{
"timestamp": "2025-01-03T00:30:50.491837",
"cpu_percent": 14.8,
"ram_percent": 39.5,
"ram_used_gb": 25.13379669189453,
"gpu_memory_used": 1245.0
},
{
"timestamp": "2025-01-03T00:32:57.721467",
"cpu_percent": 6.2,
"ram_percent": 39.6,
"ram_used_gb": 25.187721252441406,
"gpu_memory_used": 1384.0
},
{
"timestamp": "2025-01-03T00:32:57.913350",
"cpu_percent": 3.6,
"ram_percent": 39.6,
"ram_used_gb": 25.199390411376953,
"gpu_memory_used": 1384.0
},
{
"timestamp": "2025-01-03T00:35:08.608730",
"cpu_percent": 6.3,
"ram_percent": 39.8,
"ram_used_gb": 25.311710357666016,
"gpu_memory_used": 1330.0
},
{
"timestamp": "2025-01-03T00:35:08.791851",
"cpu_percent": 5.3,
"ram_percent": 39.8,
"ram_used_gb": 25.326683044433594,
"gpu_memory_used": 1333.0
},
{
"timestamp": "2025-01-03T00:37:43.782406",
"cpu_percent": 6.8,
"ram_percent": 40.6,
"ram_used_gb": 25.803058624267578,
"gpu_memory_used": 1409.0
}
]
}

View file

@ -1,300 +0,0 @@
{
"results": [
{
"tokens": 100,
"processing_time": 0.96,
"output_length": 31.1,
"rtf": 0.03,
"elapsed_time": 1.11
},
{
"tokens": 250,
"processing_time": 2.23,
"output_length": 77.17,
"rtf": 0.03,
"elapsed_time": 3.49
},
{
"tokens": 400,
"processing_time": 4.05,
"output_length": 128.05,
"rtf": 0.03,
"elapsed_time": 7.77
},
{
"tokens": 550,
"processing_time": 4.06,
"output_length": 171.45,
"rtf": 0.02,
"elapsed_time": 12.0
},
{
"tokens": 700,
"processing_time": 6.01,
"output_length": 221.6,
"rtf": 0.03,
"elapsed_time": 18.16
},
{
"tokens": 850,
"processing_time": 6.9,
"output_length": 269.1,
"rtf": 0.03,
"elapsed_time": 25.21
},
{
"tokens": 1000,
"processing_time": 7.65,
"output_length": 315.05,
"rtf": 0.02,
"elapsed_time": 33.03
},
{
"tokens": 6000,
"processing_time": 48.7,
"output_length": 1837.1,
"rtf": 0.03,
"elapsed_time": 82.21
},
{
"tokens": 11000,
"processing_time": 92.44,
"output_length": 3388.57,
"rtf": 0.03,
"elapsed_time": 175.46
},
{
"tokens": 16000,
"processing_time": 163.61,
"output_length": 4977.32,
"rtf": 0.03,
"elapsed_time": 340.46
},
{
"tokens": 21000,
"processing_time": 209.72,
"output_length": 6533.3,
"rtf": 0.03,
"elapsed_time": 551.92
},
{
"tokens": 26000,
"processing_time": 329.35,
"output_length": 8068.15,
"rtf": 0.04,
"elapsed_time": 883.37
},
{
"tokens": 31000,
"processing_time": 473.52,
"output_length": 9611.48,
"rtf": 0.05,
"elapsed_time": 1359.28
},
{
"tokens": 36000,
"processing_time": 650.98,
"output_length": 11157.15,
"rtf": 0.06,
"elapsed_time": 2012.9
}
],
"system_metrics": [
{
"timestamp": "2025-01-03T14:41:01.331735",
"cpu_percent": 7.5,
"ram_percent": 50.2,
"ram_used_gb": 31.960269927978516,
"gpu_memory_used": 3191.0
},
{
"timestamp": "2025-01-03T14:41:02.357116",
"cpu_percent": 17.01,
"ram_percent": 50.2,
"ram_used_gb": 31.96163558959961,
"gpu_memory_used": 3426.0
},
{
"timestamp": "2025-01-03T14:41:02.445009",
"cpu_percent": 9.5,
"ram_percent": 50.3,
"ram_used_gb": 31.966781616210938,
"gpu_memory_used": 3426.0
},
{
"timestamp": "2025-01-03T14:41:04.742152",
"cpu_percent": 18.27,
"ram_percent": 50.4,
"ram_used_gb": 32.08788299560547,
"gpu_memory_used": 3642.0
},
{
"timestamp": "2025-01-03T14:41:04.847795",
"cpu_percent": 16.27,
"ram_percent": 50.5,
"ram_used_gb": 32.094364166259766,
"gpu_memory_used": 3640.0
},
{
"timestamp": "2025-01-03T14:41:09.019590",
"cpu_percent": 15.97,
"ram_percent": 50.7,
"ram_used_gb": 32.23244094848633,
"gpu_memory_used": 3640.0
},
{
"timestamp": "2025-01-03T14:41:09.110324",
"cpu_percent": 3.54,
"ram_percent": 50.7,
"ram_used_gb": 32.234458923339844,
"gpu_memory_used": 3640.0
},
{
"timestamp": "2025-01-03T14:41:13.252607",
"cpu_percent": 13.4,
"ram_percent": 50.6,
"ram_used_gb": 32.194271087646484,
"gpu_memory_used": 3935.0
},
{
"timestamp": "2025-01-03T14:41:13.327557",
"cpu_percent": 4.69,
"ram_percent": 50.6,
"ram_used_gb": 32.191776275634766,
"gpu_memory_used": 3935.0
},
{
"timestamp": "2025-01-03T14:41:19.413633",
"cpu_percent": 12.92,
"ram_percent": 50.9,
"ram_used_gb": 32.3467903137207,
"gpu_memory_used": 4250.0
},
{
"timestamp": "2025-01-03T14:41:19.492758",
"cpu_percent": 7.5,
"ram_percent": 50.8,
"ram_used_gb": 32.34375,
"gpu_memory_used": 4250.0
},
{
"timestamp": "2025-01-03T14:41:26.467284",
"cpu_percent": 13.09,
"ram_percent": 51.2,
"ram_used_gb": 32.56281280517578,
"gpu_memory_used": 4249.0
},
{
"timestamp": "2025-01-03T14:41:26.553559",
"cpu_percent": 8.39,
"ram_percent": 51.2,
"ram_used_gb": 32.56183624267578,
"gpu_memory_used": 4249.0
},
{
"timestamp": "2025-01-03T14:41:34.284362",
"cpu_percent": 12.61,
"ram_percent": 51.7,
"ram_used_gb": 32.874778747558594,
"gpu_memory_used": 4250.0
},
{
"timestamp": "2025-01-03T14:41:34.362353",
"cpu_percent": 1.25,
"ram_percent": 51.7,
"ram_used_gb": 32.87461471557617,
"gpu_memory_used": 4250.0
},
{
"timestamp": "2025-01-03T14:42:23.471312",
"cpu_percent": 11.64,
"ram_percent": 54.9,
"ram_used_gb": 34.90264129638672,
"gpu_memory_used": 4647.0
},
{
"timestamp": "2025-01-03T14:42:23.547203",
"cpu_percent": 5.31,
"ram_percent": 54.9,
"ram_used_gb": 34.91563415527344,
"gpu_memory_used": 4647.0
},
{
"timestamp": "2025-01-03T14:43:56.724933",
"cpu_percent": 12.97,
"ram_percent": 59.5,
"ram_used_gb": 37.84241485595703,
"gpu_memory_used": 4655.0
},
{
"timestamp": "2025-01-03T14:43:56.815453",
"cpu_percent": 11.75,
"ram_percent": 59.5,
"ram_used_gb": 37.832679748535156,
"gpu_memory_used": 4655.0
},
{
"timestamp": "2025-01-03T14:46:41.705155",
"cpu_percent": 12.94,
"ram_percent": 66.3,
"ram_used_gb": 42.1534538269043,
"gpu_memory_used": 4729.0
},
{
"timestamp": "2025-01-03T14:46:41.835177",
"cpu_percent": 7.73,
"ram_percent": 66.2,
"ram_used_gb": 42.13554000854492,
"gpu_memory_used": 4729.0
},
{
"timestamp": "2025-01-03T14:50:13.166236",
"cpu_percent": 11.62,
"ram_percent": 73.4,
"ram_used_gb": 46.71288299560547,
"gpu_memory_used": 4676.0
},
{
"timestamp": "2025-01-03T14:50:13.261611",
"cpu_percent": 8.16,
"ram_percent": 73.4,
"ram_used_gb": 46.71356201171875,
"gpu_memory_used": 4676.0
},
{
"timestamp": "2025-01-03T14:55:44.623607",
"cpu_percent": 12.92,
"ram_percent": 82.8,
"ram_used_gb": 52.65533447265625,
"gpu_memory_used": 4636.0
},
{
"timestamp": "2025-01-03T14:55:44.735410",
"cpu_percent": 15.29,
"ram_percent": 82.7,
"ram_used_gb": 52.63290786743164,
"gpu_memory_used": 4636.0
},
{
"timestamp": "2025-01-03T15:03:40.534449",
"cpu_percent": 13.88,
"ram_percent": 85.0,
"ram_used_gb": 54.050071716308594,
"gpu_memory_used": 4771.0
},
{
"timestamp": "2025-01-03T15:03:40.638708",
"cpu_percent": 12.21,
"ram_percent": 85.0,
"ram_used_gb": 54.053733825683594,
"gpu_memory_used": 4771.0
},
{
"timestamp": "2025-01-03T15:14:34.159142",
"cpu_percent": 14.51,
"ram_percent": 78.1,
"ram_used_gb": 49.70396423339844,
"gpu_memory_used": 4739.0
}
]
}
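For reference, each record in the trace above pairs a timestamp with CPU, RAM, and GPU readings. Below is a minimal sketch of a sampler that produces records in this shape; it assumes `psutil` for CPU/RAM and `pynvml` for GPU memory, neither of which is confirmed by this commit.

```python
from datetime import datetime

import psutil   # assumed dependency for CPU/RAM sampling
import pynvml   # assumed dependency for GPU memory sampling

pynvml.nvmlInit()
_gpu = pynvml.nvmlDeviceGetHandleByIndex(0)


def sample_system() -> dict:
    """Collect one sample in the same shape as the records above."""
    mem = psutil.virtual_memory()
    gpu_mem = pynvml.nvmlDeviceGetMemoryInfo(_gpu)
    return {
        "timestamp": datetime.now().isoformat(),
        "cpu_percent": psutil.cpu_percent(),
        "ram_percent": mem.percent,
        "ram_used_gb": mem.used / 1024**3,
        "gpu_memory_used": gpu_mem.used / 1024**2,  # MiB, matching the magnitudes above
    }
```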

View file

@@ -1,19 +0,0 @@
=== Benchmark Statistics (with correct RTF) ===
Overall Stats:
Total tokens processed: 5500
Total audio generated: 1741.65s
Total test duration: 831.07s
Average processing rate: 6.72 tokens/second
Average RTF: 0.47x
Per-chunk Stats:
Average chunk size: 550.00 tokens
Min chunk size: 100.00 tokens
Max chunk size: 1000.00 tokens
Average processing time: 82.70s
Average output length: 174.17s
Performance Ranges:
Processing rate range: 5.63 - 7.17 tokens/second
RTF range: 0.44x - 0.55x

View file

@@ -1,9 +0,0 @@
=== Benchmark Statistics (with correct RTF) ===
Overall Stats:
Total tokens processed: 150850
Total audio generated: 46786.59s
Total test duration: 2012.90s
Average processing rate: 104.34 tokens/second
Average RTF: 0.03x

View file

@@ -1,23 +0,0 @@
=== Benchmark Statistics (with correct RTF) ===
Total tokens processed: 1800
Total audio generated (s): 568.53
Total test duration (s): 244.10
Average processing rate (tokens/s): 7.34
Average RTF: 0.43
Average Real Time Speed: 2.33
=== Per-chunk Stats ===
Average chunk size (tokens): 600.00
Min chunk size (tokens): 300
Max chunk size (tokens): 900
Average processing time (s): 81.30
Average output length (s): 189.51
=== Performance Ranges ===
Processing rate range (tokens/s): 7.21 - 7.47
RTF range: 0.43x - 0.43x
Real Time Speed range: 2.33x - 2.33x
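The three headline metrics in these reports are tied together: processing rate is tokens divided by wall-clock time, RTF is wall-clock time divided by audio duration, and Real Time Speed is the reciprocal of RTF. A quick check against the totals above (the reported averages appear to be computed per chunk, so they can differ slightly from these totals-based values):

```python
# Totals taken from the report above
total_tokens = 1800
total_audio_s = 568.53  # seconds of audio generated
total_time_s = 244.10   # seconds of wall-clock processing

processing_rate = total_tokens / total_time_s  # ~7.37 tokens/s (report: 7.34)
rtf = total_time_s / total_audio_s             # ~0.43 (< 1.0 means faster than real time)
real_time_speed = 1 / rtf                      # ~2.33x (report: 2.33)
```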

View file

@@ -1,403 +0,0 @@
{
"individual_runs": [
{
"text_length": 37,
"token_count": 10,
"total_time": 0.16574740409851074,
"time_to_first_chunk": 0.16574740409851074,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio\\benchmark_tokens10_run1.wav",
"audio_length": 3.45,
"target_tokens": 10,
"actual_tokens": 10,
"run_number": 1
},
{
"text_length": 37,
"token_count": 10,
"total_time": 0.18812799453735352,
"time_to_first_chunk": 0.18812799453735352,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio\\benchmark_tokens10_run2.wav",
"audio_length": 3.45,
"target_tokens": 10,
"actual_tokens": 10,
"run_number": 2
},
{
"text_length": 37,
"token_count": 10,
"total_time": 0.18645429611206055,
"time_to_first_chunk": 0.18645429611206055,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio\\benchmark_tokens10_run3.wav",
"audio_length": 3.45,
"target_tokens": 10,
"actual_tokens": 10,
"run_number": 3
},
{
"text_length": 37,
"token_count": 10,
"total_time": 0.17632031440734863,
"time_to_first_chunk": 0.17632031440734863,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio\\benchmark_tokens10_run4.wav",
"audio_length": 3.45,
"target_tokens": 10,
"actual_tokens": 10,
"run_number": 4
},
{
"text_length": 37,
"token_count": 10,
"total_time": 0.13381195068359375,
"time_to_first_chunk": 0.13381195068359375,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio\\benchmark_tokens10_run5.wav",
"audio_length": 3.45,
"target_tokens": 10,
"actual_tokens": 10,
"run_number": 5
},
{
"text_length": 102,
"token_count": 25,
"total_time": 0.2086498737335205,
"time_to_first_chunk": 0.2086498737335205,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio\\benchmark_tokens25_run1.wav",
"audio_length": 7.225,
"target_tokens": 25,
"actual_tokens": 25,
"run_number": 1
},
{
"text_length": 102,
"token_count": 25,
"total_time": 0.2727653980255127,
"time_to_first_chunk": 0.2727653980255127,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio\\benchmark_tokens25_run2.wav",
"audio_length": 7.225,
"target_tokens": 25,
"actual_tokens": 25,
"run_number": 2
},
{
"text_length": 102,
"token_count": 25,
"total_time": 0.2096250057220459,
"time_to_first_chunk": 0.2096250057220459,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio\\benchmark_tokens25_run3.wav",
"audio_length": 7.225,
"target_tokens": 25,
"actual_tokens": 25,
"run_number": 3
},
{
"text_length": 102,
"token_count": 25,
"total_time": 0.2256758213043213,
"time_to_first_chunk": 0.2256758213043213,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio\\benchmark_tokens25_run4.wav",
"audio_length": 7.225,
"target_tokens": 25,
"actual_tokens": 25,
"run_number": 4
},
{
"text_length": 102,
"token_count": 25,
"total_time": 0.1945042610168457,
"time_to_first_chunk": 0.1945042610168457,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio\\benchmark_tokens25_run5.wav",
"audio_length": 7.225,
"target_tokens": 25,
"actual_tokens": 25,
"run_number": 5
},
{
"text_length": 212,
"token_count": 50,
"total_time": 0.4975121021270752,
"time_to_first_chunk": 0.4975121021270752,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio\\benchmark_tokens50_run1.wav",
"audio_length": 16.325,
"target_tokens": 50,
"actual_tokens": 50,
"run_number": 1
},
{
"text_length": 212,
"token_count": 50,
"total_time": 0.4518404006958008,
"time_to_first_chunk": 0.4518404006958008,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio\\benchmark_tokens50_run2.wav",
"audio_length": 16.325,
"target_tokens": 50,
"actual_tokens": 50,
"run_number": 2
},
{
"text_length": 212,
"token_count": 50,
"total_time": 0.5640325546264648,
"time_to_first_chunk": 0.5640325546264648,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio\\benchmark_tokens50_run3.wav",
"audio_length": 16.325,
"target_tokens": 50,
"actual_tokens": 50,
"run_number": 3
},
{
"text_length": 212,
"token_count": 50,
"total_time": 0.5305957794189453,
"time_to_first_chunk": 0.5305957794189453,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio\\benchmark_tokens50_run4.wav",
"audio_length": 16.325,
"target_tokens": 50,
"actual_tokens": 50,
"run_number": 4
},
{
"text_length": 212,
"token_count": 50,
"total_time": 0.5540030002593994,
"time_to_first_chunk": 0.5540030002593994,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio\\benchmark_tokens50_run5.wav",
"audio_length": 16.325,
"target_tokens": 50,
"actual_tokens": 50,
"run_number": 5
},
{
"text_length": 448,
"token_count": 100,
"total_time": 0.7963137626647949,
"time_to_first_chunk": 0.7963137626647949,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio\\benchmark_tokens100_run1.wav",
"audio_length": 31.1,
"target_tokens": 100,
"actual_tokens": 100,
"run_number": 1
},
{
"text_length": 448,
"token_count": 100,
"total_time": 0.9320805072784424,
"time_to_first_chunk": 0.9320805072784424,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio\\benchmark_tokens100_run2.wav",
"audio_length": 31.1,
"target_tokens": 100,
"actual_tokens": 100,
"run_number": 2
},
{
"text_length": 448,
"token_count": 100,
"total_time": 0.824256181716919,
"time_to_first_chunk": 0.824256181716919,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio\\benchmark_tokens100_run3.wav",
"audio_length": 31.1,
"target_tokens": 100,
"actual_tokens": 100,
"run_number": 3
},
{
"text_length": 448,
"token_count": 100,
"total_time": 0.9034836292266846,
"time_to_first_chunk": 0.9034836292266846,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio\\benchmark_tokens100_run4.wav",
"audio_length": 31.1,
"target_tokens": 100,
"actual_tokens": 100,
"run_number": 4
},
{
"text_length": 448,
"token_count": 100,
"total_time": 0.8364357948303223,
"time_to_first_chunk": 0.8364357948303223,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio\\benchmark_tokens100_run5.wav",
"audio_length": 31.1,
"target_tokens": 100,
"actual_tokens": 100,
"run_number": 5
},
{
"text_length": 906,
"token_count": 200,
"total_time": 1.8122682571411133,
"time_to_first_chunk": 1.8122682571411133,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio\\benchmark_tokens200_run1.wav",
"audio_length": 62.625,
"target_tokens": 200,
"actual_tokens": 200,
"run_number": 1
},
{
"text_length": 906,
"token_count": 200,
"total_time": 1.7290427684783936,
"time_to_first_chunk": 1.7290427684783936,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio\\benchmark_tokens200_run2.wav",
"audio_length": 62.625,
"target_tokens": 200,
"actual_tokens": 200,
"run_number": 2
},
{
"text_length": 906,
"token_count": 200,
"total_time": 2.141728401184082,
"time_to_first_chunk": 2.141728401184082,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio\\benchmark_tokens200_run3.wav",
"audio_length": 62.625,
"target_tokens": 200,
"actual_tokens": 200,
"run_number": 3
},
{
"text_length": 906,
"token_count": 200,
"total_time": 2.0155680179595947,
"time_to_first_chunk": 2.0155680179595947,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio\\benchmark_tokens200_run4.wav",
"audio_length": 62.625,
"target_tokens": 200,
"actual_tokens": 200,
"run_number": 4
},
{
"text_length": 906,
"token_count": 200,
"total_time": 1.8707575798034668,
"time_to_first_chunk": 1.8707575798034668,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio\\benchmark_tokens200_run5.wav",
"audio_length": 62.625,
"target_tokens": 200,
"actual_tokens": 200,
"run_number": 5
},
{
"text_length": 2232,
"token_count": 500,
"total_time": 4.822713851928711,
"time_to_first_chunk": 4.822713851928711,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio\\benchmark_tokens500_run1.wav",
"audio_length": 157.875,
"target_tokens": 500,
"actual_tokens": 500,
"run_number": 1
},
{
"text_length": 2232,
"token_count": 500,
"total_time": 4.227782726287842,
"time_to_first_chunk": 4.227782726287842,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio\\benchmark_tokens500_run2.wav",
"audio_length": 157.875,
"target_tokens": 500,
"actual_tokens": 500,
"run_number": 2
},
{
"text_length": 2232,
"token_count": 500,
"total_time": 4.414916276931763,
"time_to_first_chunk": 4.414916276931763,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio\\benchmark_tokens500_run3.wav",
"audio_length": 157.875,
"target_tokens": 500,
"actual_tokens": 500,
"run_number": 3
},
{
"text_length": 2232,
"token_count": 500,
"total_time": 4.579505681991577,
"time_to_first_chunk": 4.579505681991577,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio\\benchmark_tokens500_run4.wav",
"audio_length": 157.875,
"target_tokens": 500,
"actual_tokens": 500,
"run_number": 4
},
{
"text_length": 2232,
"token_count": 500,
"total_time": 4.332529067993164,
"time_to_first_chunk": 4.332529067993164,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio\\benchmark_tokens500_run5.wav",
"audio_length": 157.875,
"target_tokens": 500,
"actual_tokens": 500,
"run_number": 5
}
],
"summary": {
"10": {
"avg_time_to_first_chunk": 0.17,
"avg_total_time": 0.17,
"avg_audio_length": 3.45,
"num_successful_runs": 5
},
"25": {
"avg_time_to_first_chunk": 0.222,
"avg_total_time": 0.222,
"avg_audio_length": 7.225,
"num_successful_runs": 5
},
"50": {
"avg_time_to_first_chunk": 0.52,
"avg_total_time": 0.52,
"avg_audio_length": 16.325,
"num_successful_runs": 5
},
"100": {
"avg_time_to_first_chunk": 0.859,
"avg_total_time": 0.859,
"avg_audio_length": 31.1,
"num_successful_runs": 5
},
"200": {
"avg_time_to_first_chunk": 1.914,
"avg_total_time": 1.914,
"avg_audio_length": 62.625,
"num_successful_runs": 5
},
"500": {
"avg_time_to_first_chunk": 4.475,
"avg_total_time": 4.475,
"avg_audio_length": 157.875,
"num_successful_runs": 5
}
},
"timestamp": "2025-01-04 13:52:28"
}
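The `summary` block is a per-token-count rollup of `individual_runs`. A minimal sketch of that aggregation, with field names taken from the JSON above and a hypothetical file name:

```python
import json
from collections import defaultdict

with open("benchmark_results.json") as f:  # hypothetical file name
    results = json.load(f)

groups = defaultdict(list)
for run in results["individual_runs"]:
    if run["error"] is None:
        groups[run["target_tokens"]].append(run)


def avg(runs, key):
    """Average one numeric field over a group of runs, rounded like the report."""
    return round(sum(r[key] for r in runs) / len(runs), 3)


summary = {
    str(tokens): {
        "avg_time_to_first_chunk": avg(runs, "time_to_first_chunk"),
        "avg_total_time": avg(runs, "total_time"),
        "avg_audio_length": avg(runs, "audio_length"),
        "num_successful_runs": len(runs),
    }
    for tokens, runs in groups.items()
}
```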

View file

@@ -1,271 +1,337 @@
{
"individual_runs": [
{
"text_length": 212,
"token_count": 50,
"total_time": 0.7278211116790771,
"time_to_first_chunk": 0.3613290786743164,
"text_length": 37,
"token_count": null,
"total_time": 0.4376556873321533,
"time_to_first_chunk": 0.4189143180847168,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens50_run1_stream.wav",
"audio_length": 16.325,
"audio_path": "C:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens10_run1_stream.wav",
"audio_length": 3.45,
"target_tokens": 10,
"actual_tokens": 10,
"run_number": 1
},
{
"text_length": 37,
"token_count": null,
"total_time": 0.37163758277893066,
"time_to_first_chunk": 0.34892702102661133,
"error": null,
"audio_path": "C:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens10_run2_stream.wav",
"audio_length": 3.45,
"target_tokens": 10,
"actual_tokens": 10,
"run_number": 2
},
{
"text_length": 37,
"token_count": null,
"total_time": 0.2654602527618408,
"time_to_first_chunk": 0.2409076690673828,
"error": null,
"audio_path": "C:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens10_run3_stream.wav",
"audio_length": 3.45,
"target_tokens": 10,
"actual_tokens": 10,
"run_number": 3
},
{
"text_length": 37,
"token_count": null,
"total_time": 0.24376440048217773,
"time_to_first_chunk": 0.23003816604614258,
"error": null,
"audio_path": "C:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens10_run4_stream.wav",
"audio_length": 3.45,
"target_tokens": 10,
"actual_tokens": 10,
"run_number": 4
},
{
"text_length": 37,
"token_count": null,
"total_time": 0.25968003273010254,
"time_to_first_chunk": 0.24081206321716309,
"error": null,
"audio_path": "C:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens10_run5_stream.wav",
"audio_length": 3.45,
"target_tokens": 10,
"actual_tokens": 10,
"run_number": 5
},
{
"text_length": 212,
"token_count": null,
"total_time": 1.049060344696045,
"time_to_first_chunk": 0.3336215019226074,
"error": null,
"audio_path": "C:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens50_run1_stream.wav",
"audio_length": 15.925,
"target_tokens": 50,
"actual_tokens": 50,
"run_number": 1
},
{
"text_length": 212,
"token_count": 50,
"total_time": 0.4556088447570801,
"time_to_first_chunk": 0.18642044067382812,
"token_count": null,
"total_time": 0.8934676647186279,
"time_to_first_chunk": 0.3011031150817871,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens50_run2_stream.wav",
"audio_length": 16.325,
"audio_path": "C:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens50_run2_stream.wav",
"audio_length": 15.925,
"target_tokens": 50,
"actual_tokens": 50,
"run_number": 2
},
{
"text_length": 212,
"token_count": 50,
"total_time": 0.5538768768310547,
"time_to_first_chunk": 0.2720797061920166,
"token_count": null,
"total_time": 0.9444286823272705,
"time_to_first_chunk": 0.3198091983795166,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens50_run3_stream.wav",
"audio_length": 16.325,
"audio_path": "C:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens50_run3_stream.wav",
"audio_length": 15.925,
"target_tokens": 50,
"actual_tokens": 50,
"run_number": 3
},
{
"text_length": 212,
"token_count": 50,
"total_time": 0.4395604133605957,
"time_to_first_chunk": 0.15613913536071777,
"token_count": null,
"total_time": 0.9735183715820312,
"time_to_first_chunk": 0.369948148727417,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens50_run4_stream.wav",
"audio_length": 16.325,
"audio_path": "C:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens50_run4_stream.wav",
"audio_length": 15.925,
"target_tokens": 50,
"actual_tokens": 50,
"run_number": 4
},
{
"text_length": 212,
"token_count": 50,
"total_time": 0.45748305320739746,
"time_to_first_chunk": 0.18805718421936035,
"token_count": null,
"total_time": 0.8089118003845215,
"time_to_first_chunk": 0.30179858207702637,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens50_run5_stream.wav",
"audio_length": 16.325,
"audio_path": "C:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens50_run5_stream.wav",
"audio_length": 15.925,
"target_tokens": 50,
"actual_tokens": 50,
"run_number": 5
},
{
"text_length": 448,
"token_count": 100,
"total_time": 0.7347762584686279,
"time_to_first_chunk": 0.16963744163513184,
"token_count": null,
"total_time": 1.641003131866455,
"time_to_first_chunk": 0.2979745864868164,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens100_run1_stream.wav",
"audio_length": 31.1,
"audio_path": "C:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens100_run1_stream.wav",
"audio_length": 30.5,
"target_tokens": 100,
"actual_tokens": 100,
"run_number": 1
},
{
"text_length": 448,
"token_count": 100,
"total_time": 0.8288509845733643,
"time_to_first_chunk": 0.20123004913330078,
"token_count": null,
"total_time": 1.3709619045257568,
"time_to_first_chunk": 0.4272146224975586,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens100_run2_stream.wav",
"audio_length": 31.1,
"audio_path": "C:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens100_run2_stream.wav",
"audio_length": 30.5,
"target_tokens": 100,
"actual_tokens": 100,
"run_number": 2
},
{
"text_length": 448,
"token_count": 100,
"total_time": 0.7503848075866699,
"time_to_first_chunk": 0.21662068367004395,
"token_count": null,
"total_time": 1.2554471492767334,
"time_to_first_chunk": 0.29790568351745605,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens100_run3_stream.wav",
"audio_length": 31.1,
"audio_path": "C:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens100_run3_stream.wav",
"audio_length": 30.5,
"target_tokens": 100,
"actual_tokens": 100,
"run_number": 3
},
{
"text_length": 448,
"token_count": 100,
"total_time": 0.694899320602417,
"time_to_first_chunk": 0.1966841220855713,
"token_count": null,
"total_time": 1.3761844635009766,
"time_to_first_chunk": 0.32633328437805176,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens100_run4_stream.wav",
"audio_length": 31.1,
"audio_path": "C:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens100_run4_stream.wav",
"audio_length": 30.5,
"target_tokens": 100,
"actual_tokens": 100,
"run_number": 4
},
{
"text_length": 448,
"token_count": 100,
"total_time": 0.68701171875,
"time_to_first_chunk": 0.19341063499450684,
"token_count": null,
"total_time": 1.56705904006958,
"time_to_first_chunk": 0.32801246643066406,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens100_run5_stream.wav",
"audio_length": 31.1,
"audio_path": "C:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens100_run5_stream.wav",
"audio_length": 30.5,
"target_tokens": 100,
"actual_tokens": 100,
"run_number": 5
},
{
"text_length": 906,
"token_count": 200,
"total_time": 1.6845426559448242,
"time_to_first_chunk": 0.21096158027648926,
"text_length": 1140,
"token_count": null,
"total_time": 5.086699962615967,
"time_to_first_chunk": 0.33925390243530273,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens200_run1_stream.wav",
"audio_length": 62.625,
"target_tokens": 200,
"actual_tokens": 200,
"audio_path": "C:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens250_run1_stream.wav",
"audio_length": 78.775,
"target_tokens": 250,
"actual_tokens": 250,
"run_number": 1
},
{
"text_length": 906,
"token_count": 200,
"total_time": 1.3545098304748535,
"time_to_first_chunk": 0.18648386001586914,
"text_length": 1140,
"token_count": null,
"total_time": 3.827953338623047,
"time_to_first_chunk": 0.39266157150268555,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens200_run2_stream.wav",
"audio_length": 62.625,
"target_tokens": 200,
"actual_tokens": 200,
"audio_path": "C:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens250_run2_stream.wav",
"audio_length": 78.775,
"target_tokens": 250,
"actual_tokens": 250,
"run_number": 2
},
{
"text_length": 906,
"token_count": 200,
"total_time": 1.426060676574707,
"time_to_first_chunk": 0.20081472396850586,
"text_length": 1140,
"token_count": null,
"total_time": 3.9389824867248535,
"time_to_first_chunk": 0.3231511116027832,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens200_run3_stream.wav",
"audio_length": 62.625,
"target_tokens": 200,
"actual_tokens": 200,
"audio_path": "C:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens250_run3_stream.wav",
"audio_length": 78.775,
"target_tokens": 250,
"actual_tokens": 250,
"run_number": 3
},
{
"text_length": 906,
"token_count": 200,
"total_time": 1.4084081649780273,
"time_to_first_chunk": 0.18551135063171387,
"text_length": 1140,
"token_count": null,
"total_time": 3.942399740219116,
"time_to_first_chunk": 0.34731340408325195,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens200_run4_stream.wav",
"audio_length": 62.625,
"target_tokens": 200,
"actual_tokens": 200,
"audio_path": "C:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens250_run4_stream.wav",
"audio_length": 78.775,
"target_tokens": 250,
"actual_tokens": 250,
"run_number": 4
},
{
"text_length": 906,
"token_count": 200,
"total_time": 1.4703152179718018,
"time_to_first_chunk": 0.17750859260559082,
"text_length": 1140,
"token_count": null,
"total_time": 3.7748308181762695,
"time_to_first_chunk": 0.40787601470947266,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens200_run5_stream.wav",
"audio_length": 62.625,
"target_tokens": 200,
"actual_tokens": 200,
"audio_path": "C:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens250_run5_stream.wav",
"audio_length": 78.775,
"target_tokens": 250,
"actual_tokens": 250,
"run_number": 5
},
{
"text_length": 2232,
"token_count": 500,
"total_time": 4.289574384689331,
"time_to_first_chunk": 0.1997976303100586,
"token_count": null,
"total_time": 9.003147840499878,
"time_to_first_chunk": 0.5455703735351562,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens500_run1_stream.wav",
"audio_length": 157.875,
"audio_path": "C:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens500_run1_stream.wav",
"audio_length": 156.475,
"target_tokens": 500,
"actual_tokens": 500,
"run_number": 1
},
{
"text_length": 2232,
"token_count": 500,
"total_time": 3.7089381217956543,
"time_to_first_chunk": 0.25969815254211426,
"token_count": null,
"total_time": 10.081491231918335,
"time_to_first_chunk": 0.4591703414916992,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens500_run2_stream.wav",
"audio_length": 157.875,
"audio_path": "C:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens500_run2_stream.wav",
"audio_length": 156.475,
"target_tokens": 500,
"actual_tokens": 500,
"run_number": 2
},
{
"text_length": 2232,
"token_count": 500,
"total_time": 4.138366222381592,
"time_to_first_chunk": 0.1831505298614502,
"token_count": null,
"total_time": 9.767668962478638,
"time_to_first_chunk": 0.31237053871154785,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens500_run3_stream.wav",
"audio_length": 157.875,
"audio_path": "C:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens500_run3_stream.wav",
"audio_length": 156.475,
"target_tokens": 500,
"actual_tokens": 500,
"run_number": 3
},
{
"text_length": 2232,
"token_count": 500,
"total_time": 3.980635643005371,
"time_to_first_chunk": 0.20493030548095703,
"token_count": null,
"total_time": 9.090342998504639,
"time_to_first_chunk": 0.41753244400024414,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens500_run4_stream.wav",
"audio_length": 157.875,
"audio_path": "C:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens500_run4_stream.wav",
"audio_length": 156.475,
"target_tokens": 500,
"actual_tokens": 500,
"run_number": 4
},
{
"text_length": 2232,
"token_count": 500,
"total_time": 4.1370298862457275,
"time_to_first_chunk": 0.19150757789611816,
"token_count": null,
"total_time": 9.876578330993652,
"time_to_first_chunk": 0.3965120315551758,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens500_run5_stream.wav",
"audio_length": 157.875,
"audio_path": "C:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens500_run5_stream.wav",
"audio_length": 156.475,
"target_tokens": 500,
"actual_tokens": 500,
"run_number": 5
}
],
"summary": {
"10": {
"avg_time_to_first_chunk": 0.296,
"avg_total_time": 0.316,
"avg_audio_length": 3.45,
"num_successful_runs": 5
},
"50": {
"avg_time_to_first_chunk": 0.233,
"avg_total_time": 0.527,
"avg_audio_length": 16.325,
"avg_time_to_first_chunk": 0.325,
"avg_total_time": 0.934,
"avg_audio_length": 15.925,
"num_successful_runs": 5
},
"100": {
"avg_time_to_first_chunk": 0.196,
"avg_total_time": 0.739,
"avg_audio_length": 31.1,
"avg_time_to_first_chunk": 0.335,
"avg_total_time": 1.442,
"avg_audio_length": 30.5,
"num_successful_runs": 5
},
"200": {
"avg_time_to_first_chunk": 0.192,
"avg_total_time": 1.469,
"avg_audio_length": 62.625,
"250": {
"avg_time_to_first_chunk": 0.362,
"avg_total_time": 4.114,
"avg_audio_length": 78.775,
"num_successful_runs": 5
},
"500": {
"avg_time_to_first_chunk": 0.208,
"avg_total_time": 4.051,
"avg_audio_length": 157.875,
"avg_time_to_first_chunk": 0.426,
"avg_total_time": 9.564,
"avg_audio_length": 156.475,
"num_successful_runs": 5
}
},
"timestamp": "2025-01-04 22:16:30"
"timestamp": "2025-01-06 00:00:43"
}
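In the streaming runs above, `time_to_first_chunk` is the wall-clock delay until the first audio bytes arrive, and `total_time` covers the whole response. A minimal way to measure both against the local server could look like the following; the endpoint path and payload fields are assumptions for illustration, not taken verbatim from this commit.

```python
import time

import requests


def measure_stream(text: str, url: str = "http://localhost:8880/v1/audio/speech"):
    """Return (time_to_first_chunk, total_time) for one streaming request."""
    start = time.time()
    first_chunk = None
    payload = {  # hypothetical payload; mirror the server's actual schema
        "input": text,
        "voice": "af",
        "response_format": "wav",
        "stream": True,
    }
    with requests.post(url, json=payload, stream=True) as resp:
        resp.raise_for_status()
        for chunk in resp.iter_content(chunk_size=1024):
            if chunk and first_chunk is None:
                first_chunk = time.time() - start
    return first_chunk, time.time() - start
```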

View file

@@ -1,271 +1,337 @@
{
"individual_runs": [
{
"text_length": 212,
"token_count": 50,
"total_time": 1.149611473083496,
"time_to_first_chunk": 0.8767304420471191,
"text_length": 37,
"token_count": null,
"total_time": 0.7105245590209961,
"time_to_first_chunk": 0.6905441284179688,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream_openai\\benchmark_tokens50_run1_stream_openai.wav",
"audio_length": 16.325,
"audio_path": "C:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream_openai\\benchmark_tokens10_run1_stream_openai.wav",
"audio_length": 3.45,
"target_tokens": 10,
"actual_tokens": 10,
"run_number": 1
},
{
"text_length": 37,
"token_count": null,
"total_time": 0.35063982009887695,
"time_to_first_chunk": 0.32647228240966797,
"error": null,
"audio_path": "C:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream_openai\\benchmark_tokens10_run2_stream_openai.wav",
"audio_length": 3.45,
"target_tokens": 10,
"actual_tokens": 10,
"run_number": 2
},
{
"text_length": 37,
"token_count": null,
"total_time": 0.43519043922424316,
"time_to_first_chunk": 0.41011548042297363,
"error": null,
"audio_path": "C:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream_openai\\benchmark_tokens10_run3_stream_openai.wav",
"audio_length": 3.45,
"target_tokens": 10,
"actual_tokens": 10,
"run_number": 3
},
{
"text_length": 37,
"token_count": null,
"total_time": 0.33886170387268066,
"time_to_first_chunk": 0.32068943977355957,
"error": null,
"audio_path": "C:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream_openai\\benchmark_tokens10_run4_stream_openai.wav",
"audio_length": 3.45,
"target_tokens": 10,
"actual_tokens": 10,
"run_number": 4
},
{
"text_length": 37,
"token_count": null,
"total_time": 0.31725525856018066,
"time_to_first_chunk": 0.29624342918395996,
"error": null,
"audio_path": "C:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream_openai\\benchmark_tokens10_run5_stream_openai.wav",
"audio_length": 3.45,
"target_tokens": 10,
"actual_tokens": 10,
"run_number": 5
},
{
"text_length": 212,
"token_count": null,
"total_time": 1.0215234756469727,
"time_to_first_chunk": 0.38323354721069336,
"error": null,
"audio_path": "C:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream_openai\\benchmark_tokens50_run1_stream_openai.wav",
"audio_length": 15.925,
"target_tokens": 50,
"actual_tokens": 50,
"run_number": 1
},
{
"text_length": 212,
"token_count": 50,
"total_time": 0.9325947761535645,
"time_to_first_chunk": 0.5965914726257324,
"token_count": null,
"total_time": 1.38511061668396,
"time_to_first_chunk": 0.47052764892578125,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream_openai\\benchmark_tokens50_run2_stream_openai.wav",
"audio_length": 16.325,
"audio_path": "C:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream_openai\\benchmark_tokens50_run2_stream_openai.wav",
"audio_length": 15.925,
"target_tokens": 50,
"actual_tokens": 50,
"run_number": 2
},
{
"text_length": 212,
"token_count": 50,
"total_time": 0.9205234050750732,
"time_to_first_chunk": 0.5961906909942627,
"token_count": null,
"total_time": 1.0185234546661377,
"time_to_first_chunk": 0.3535764217376709,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream_openai\\benchmark_tokens50_run3_stream_openai.wav",
"audio_length": 16.325,
"audio_path": "C:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream_openai\\benchmark_tokens50_run3_stream_openai.wav",
"audio_length": 15.925,
"target_tokens": 50,
"actual_tokens": 50,
"run_number": 3
},
{
"text_length": 212,
"token_count": 50,
"total_time": 1.1321916580200195,
"time_to_first_chunk": 0.6946916580200195,
"token_count": null,
"total_time": 0.8875925540924072,
"time_to_first_chunk": 0.3373105525970459,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream_openai\\benchmark_tokens50_run4_stream_openai.wav",
"audio_length": 16.325,
"audio_path": "C:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream_openai\\benchmark_tokens50_run4_stream_openai.wav",
"audio_length": 15.925,
"target_tokens": 50,
"actual_tokens": 50,
"run_number": 4
},
{
"text_length": 212,
"token_count": 50,
"total_time": 1.1146185398101807,
"time_to_first_chunk": 0.6918885707855225,
"token_count": null,
"total_time": 0.9557526111602783,
"time_to_first_chunk": 0.3364882469177246,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream_openai\\benchmark_tokens50_run5_stream_openai.wav",
"audio_length": 16.325,
"audio_path": "C:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream_openai\\benchmark_tokens50_run5_stream_openai.wav",
"audio_length": 15.925,
"target_tokens": 50,
"actual_tokens": 50,
"run_number": 5
},
{
"text_length": 448,
"token_count": 100,
"total_time": 1.3645410537719727,
"time_to_first_chunk": 0.6802399158477783,
"token_count": null,
"total_time": 1.569596767425537,
"time_to_first_chunk": 0.42070746421813965,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream_openai\\benchmark_tokens100_run1_stream_openai.wav",
"audio_length": 31.1,
"audio_path": "C:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream_openai\\benchmark_tokens100_run1_stream_openai.wav",
"audio_length": 30.5,
"target_tokens": 100,
"actual_tokens": 100,
"run_number": 1
},
{
"text_length": 448,
"token_count": 100,
"total_time": 1.4154777526855469,
"time_to_first_chunk": 0.7297353744506836,
"token_count": null,
"total_time": 1.5172030925750732,
"time_to_first_chunk": 0.3982264995574951,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream_openai\\benchmark_tokens100_run2_stream_openai.wav",
"audio_length": 31.1,
"audio_path": "C:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream_openai\\benchmark_tokens100_run2_stream_openai.wav",
"audio_length": 30.5,
"target_tokens": 100,
"actual_tokens": 100,
"run_number": 2
},
{
"text_length": 448,
"token_count": 100,
"total_time": 1.3589520454406738,
"time_to_first_chunk": 0.698603630065918,
"token_count": null,
"total_time": 1.5318474769592285,
"time_to_first_chunk": 0.3533785343170166,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream_openai\\benchmark_tokens100_run3_stream_openai.wav",
"audio_length": 31.1,
"audio_path": "C:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream_openai\\benchmark_tokens100_run3_stream_openai.wav",
"audio_length": 30.5,
"target_tokens": 100,
"actual_tokens": 100,
"run_number": 3
},
{
"text_length": 448,
"token_count": 100,
"total_time": 1.2276430130004883,
"time_to_first_chunk": 0.6705801486968994,
"token_count": null,
"total_time": 1.3858752250671387,
"time_to_first_chunk": 0.3360786437988281,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream_openai\\benchmark_tokens100_run4_stream_openai.wav",
"audio_length": 31.1,
"audio_path": "C:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream_openai\\benchmark_tokens100_run4_stream_openai.wav",
"audio_length": 30.5,
"target_tokens": 100,
"actual_tokens": 100,
"run_number": 4
},
{
"text_length": 448,
"token_count": 100,
"total_time": 1.0949454307556152,
"time_to_first_chunk": 0.5698442459106445,
"token_count": null,
"total_time": 1.7841475009918213,
"time_to_first_chunk": 0.34446048736572266,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream_openai\\benchmark_tokens100_run5_stream_openai.wav",
"audio_length": 31.1,
"audio_path": "C:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream_openai\\benchmark_tokens100_run5_stream_openai.wav",
"audio_length": 30.5,
"target_tokens": 100,
"actual_tokens": 100,
"run_number": 5
},
{
"text_length": 906,
"token_count": 200,
"total_time": 1.8211240768432617,
"time_to_first_chunk": 0.6070489883422852,
"text_length": 1140,
"token_count": null,
"total_time": 4.334965467453003,
"time_to_first_chunk": 0.4336512088775635,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream_openai\\benchmark_tokens200_run1_stream_openai.wav",
"audio_length": 62.625,
"target_tokens": 200,
"actual_tokens": 200,
"audio_path": "C:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream_openai\\benchmark_tokens250_run1_stream_openai.wav",
"audio_length": 78.775,
"target_tokens": 250,
"actual_tokens": 250,
"run_number": 1
},
{
"text_length": 906,
"token_count": 200,
"total_time": 1.8376774787902832,
"time_to_first_chunk": 0.6538689136505127,
"text_length": 1140,
"token_count": null,
"total_time": 5.265941858291626,
"time_to_first_chunk": 0.5461773872375488,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream_openai\\benchmark_tokens200_run2_stream_openai.wav",
"audio_length": 62.625,
"target_tokens": 200,
"actual_tokens": 200,
"audio_path": "C:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream_openai\\benchmark_tokens250_run2_stream_openai.wav",
"audio_length": 78.775,
"target_tokens": 250,
"actual_tokens": 250,
"run_number": 2
},
{
"text_length": 906,
"token_count": 200,
"total_time": 1.6953792572021484,
"time_to_first_chunk": 0.5554308891296387,
"text_length": 1140,
"token_count": null,
"total_time": 5.66066575050354,
"time_to_first_chunk": 0.4757547378540039,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream_openai\\benchmark_tokens200_run3_stream_openai.wav",
"audio_length": 62.625,
"target_tokens": 200,
"actual_tokens": 200,
"audio_path": "C:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream_openai\\benchmark_tokens250_run3_stream_openai.wav",
"audio_length": 78.775,
"target_tokens": 250,
"actual_tokens": 250,
"run_number": 3
},
{
"text_length": 906,
"token_count": 200,
"total_time": 1.887030839920044,
"time_to_first_chunk": 0.5866930484771729,
"text_length": 1140,
"token_count": null,
"total_time": 9.289174318313599,
"time_to_first_chunk": 0.40159058570861816,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream_openai\\benchmark_tokens200_run4_stream_openai.wav",
"audio_length": 62.625,
"target_tokens": 200,
"actual_tokens": 200,
"audio_path": "C:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream_openai\\benchmark_tokens250_run4_stream_openai.wav",
"audio_length": 78.775,
"target_tokens": 250,
"actual_tokens": 250,
"run_number": 4
},
{
"text_length": 906,
"token_count": 200,
"total_time": 1.7908406257629395,
"time_to_first_chunk": 0.5897490978240967,
"text_length": 1140,
"token_count": null,
"total_time": 4.425869703292847,
"time_to_first_chunk": 0.40808558464050293,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream_openai\\benchmark_tokens200_run5_stream_openai.wav",
"audio_length": 62.625,
"target_tokens": 200,
"actual_tokens": 200,
"audio_path": "C:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream_openai\\benchmark_tokens250_run5_stream_openai.wav",
"audio_length": 78.775,
"target_tokens": 250,
"actual_tokens": 250,
"run_number": 5
},
{
"text_length": 2232,
"token_count": 500,
"total_time": 4.228837013244629,
"time_to_first_chunk": 0.5315976142883301,
"token_count": null,
"total_time": 9.600461483001709,
"time_to_first_chunk": 0.3966805934906006,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream_openai\\benchmark_tokens500_run1_stream_openai.wav",
"audio_length": 157.875,
"audio_path": "C:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream_openai\\benchmark_tokens500_run1_stream_openai.wav",
"audio_length": 156.475,
"target_tokens": 500,
"actual_tokens": 500,
"run_number": 1
},
{
"text_length": 2232,
"token_count": 500,
"total_time": 4.489210367202759,
"time_to_first_chunk": 0.5261838436126709,
"token_count": null,
"total_time": 8.82239580154419,
"time_to_first_chunk": 0.3900904655456543,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream_openai\\benchmark_tokens500_run2_stream_openai.wav",
"audio_length": 157.875,
"audio_path": "C:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream_openai\\benchmark_tokens500_run2_stream_openai.wav",
"audio_length": 156.475,
"target_tokens": 500,
"actual_tokens": 500,
"run_number": 2
},
{
"text_length": 2232,
"token_count": 500,
"total_time": 4.5290446281433105,
"time_to_first_chunk": 0.6186764240264893,
"token_count": null,
"total_time": 10.99152159690857,
"time_to_first_chunk": 0.4041757583618164,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream_openai\\benchmark_tokens500_run3_stream_openai.wav",
"audio_length": 157.875,
"audio_path": "C:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream_openai\\benchmark_tokens500_run3_stream_openai.wav",
"audio_length": 156.475,
"target_tokens": 500,
"actual_tokens": 500,
"run_number": 3
},
{
"text_length": 2232,
"token_count": 500,
"total_time": 4.209261178970337,
"time_to_first_chunk": 0.5990591049194336,
"token_count": null,
"total_time": 9.12995958328247,
"time_to_first_chunk": 0.43430614471435547,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream_openai\\benchmark_tokens500_run4_stream_openai.wav",
"audio_length": 157.875,
"audio_path": "C:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream_openai\\benchmark_tokens500_run4_stream_openai.wav",
"audio_length": 156.475,
"target_tokens": 500,
"actual_tokens": 500,
"run_number": 4
},
{
"text_length": 2232,
"token_count": 500,
"total_time": 4.218762636184692,
"time_to_first_chunk": 0.5466251373291016,
"token_count": null,
"total_time": 10.043727159500122,
"time_to_first_chunk": 0.41181445121765137,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream_openai\\benchmark_tokens500_run5_stream_openai.wav",
"audio_length": 157.875,
"audio_path": "C:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream_openai\\benchmark_tokens500_run5_stream_openai.wav",
"audio_length": 156.475,
"target_tokens": 500,
"actual_tokens": 500,
"run_number": 5
}
],
"summary": {
"10": {
"avg_time_to_first_chunk": 0.409,
"avg_total_time": 0.43,
"avg_audio_length": 3.45,
"num_successful_runs": 5
},
"50": {
"avg_time_to_first_chunk": 0.691,
"avg_total_time": 1.05,
"avg_audio_length": 16.325,
"avg_time_to_first_chunk": 0.376,
"avg_total_time": 1.054,
"avg_audio_length": 15.925,
"num_successful_runs": 5
},
"100": {
"avg_time_to_first_chunk": 0.67,
"avg_total_time": 1.292,
"avg_audio_length": 31.1,
"avg_time_to_first_chunk": 0.371,
"avg_total_time": 1.558,
"avg_audio_length": 30.5,
"num_successful_runs": 5
},
"200": {
"avg_time_to_first_chunk": 0.599,
"avg_total_time": 1.806,
"avg_audio_length": 62.625,
"250": {
"avg_time_to_first_chunk": 0.453,
"avg_total_time": 5.795,
"avg_audio_length": 78.775,
"num_successful_runs": 5
},
"500": {
"avg_time_to_first_chunk": 0.564,
"avg_total_time": 4.335,
"avg_audio_length": 157.875,
"avg_time_to_first_chunk": 0.407,
"avg_total_time": 9.718,
"avg_audio_length": 156.475,
"num_successful_runs": 5
}
},
"timestamp": "2025-01-04 22:18:03"
"timestamp": "2025-01-06 00:02:21"
}
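The `_openai` runs exercise the same server through the OpenAI Python client. A sketch of that path, assuming an `openai>=1.x` client pointed at the local server; the model and voice names are placeholders, not confirmed by this commit:

```python
import time

from openai import OpenAI

client = OpenAI(base_url="http://localhost:8880/v1", api_key="not-needed")

start = time.time()
first_chunk = None
# Stream audio and note when the first bytes arrive (mirrors time_to_first_chunk above).
with client.audio.speech.with_streaming_response.create(
    model="kokoro",   # placeholder model name
    voice="af",       # placeholder voice name
    input="Hello world",
    response_format="wav",
) as response:
    for chunk in response.iter_bytes(chunk_size=1024):
        if chunk and first_chunk is None:
            first_chunk = time.time() - start
print(f"time_to_first_chunk: {first_chunk:.3f}s, total: {time.time() - start:.3f}s")
```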

View file

@@ -1,23 +1,23 @@
=== Benchmark Statistics (with correct RTF) ===
Total tokens processed: 17150
Total audio generated (s): 5296.38
Total test duration (s): 155.23
Average processing rate (tokens/s): 102.86
Average RTF: 0.03
Average Real Time Speed: 31.25
Total tokens processed: 3150
Total audio generated (s): 1056.03
Total test duration (s): 70.20
Average processing rate (tokens/s): 46.46
Average RTF: 0.07
Average Real Time Speed: 15.00
=== Per-chunk Stats ===
Average chunk size (tokens): 1715.00
Average chunk size (tokens): 525.00
Min chunk size (tokens): 150
Max chunk size (tokens): 5000
Average processing time (s): 15.39
Average output length (s): 529.64
Max chunk size (tokens): 900
Average processing time (s): 11.57
Average output length (s): 176.00
=== Performance Ranges ===
Processing rate range (tokens/s): 80.65 - 125.10
RTF range: 0.03x - 0.04x
Real Time Speed range: 25.00x - 33.33x
Processing rate range (tokens/s): 40.07 - 53.57
RTF range: 0.06x - 0.08x
Real Time Speed range: 12.50x - 16.67x

Binary file not shown. (Before: 231 KiB, After: 230 KiB)

Binary file not shown. (Before: 181 KiB, After: 206 KiB)

Binary file not shown. (Before: 454 KiB, After: 491 KiB)

Binary file not shown. (Before: 246 KiB)

Binary file not shown. (Before: 210 KiB, After: 238 KiB)

Binary file not shown. (Before: 268 KiB, After: 236 KiB)

Binary file not shown. (Before: 233 KiB)

Binary file not shown. (Before: 193 KiB, After: 226 KiB)

Binary file not shown. (Before: 196 KiB, After: 236 KiB)

Binary file not shown. (Before: 764 KiB)

Binary file not shown. (Before: 238 KiB, After: 224 KiB)

Binary file not shown. (Before: 250 KiB, After: 221 KiB)

Binary file not shown. (Before: 459 KiB, After: 463 KiB)

Binary file not shown. (Before: 198 KiB)

Binary file not shown. (Before: 252 KiB, After: 238 KiB)

Binary file not shown. (Before: 258 KiB, After: 263 KiB)

View file

@@ -0,0 +1,198 @@
#!/usr/bin/env python3
"""Script to generate all plots needed for the README."""
import os
import sys
import shutil
from pathlib import Path

from validate_wav import validate_tts

# Get absolute paths
script_dir = Path(__file__).parent.resolve()
project_root = script_dir.parent.parent

# Add directories to Python path for imports
sys.path.append(str(script_dir))
sys.path.append(str(script_dir / "benchmarks"))

# Import test scripts
from benchmark_tts_rtf import main as benchmark_rtf
from test_formats.test_audio_formats import main as test_formats
from benchmark_first_token_stream_unified import main as benchmark_stream
from test_combinations.test_analyze_combined_voices import main as test_voice_analysis

# Remove directories from path after imports
sys.path.remove(str(script_dir))
sys.path.remove(str(script_dir / "benchmarks"))


def ensure_assets_dir():
    """Create assets directory if it doesn't exist."""
    assets_dir = project_root / "assets"
    assets_dir.mkdir(exist_ok=True)
    return assets_dir


def copy_plot(src_path: str, dest_name: str, assets_dir: Path):
    """Copy a plot to the assets directory with a new name."""
    if os.path.exists(src_path):
        shutil.copy2(src_path, assets_dir / dest_name)
        print(f"Copied {src_path} to {assets_dir / dest_name}")
    else:
        print(f"Warning: Source plot not found at {src_path}")


def validate_and_print(wav_path: str, category: str):
    """Validate a WAV file and print results."""
    if not os.path.exists(wav_path):
        print(f"Warning: WAV file not found at {wav_path}")
        return

    print(f"\n=== Validating {category} Audio ===")
    result = validate_tts(wav_path)
    if "error" in result:
        print(f"Error: {result['error']}")
    else:
        print(f"Duration: {result['duration']}")
        print(f"Sample Rate: {result['sample_rate']} Hz")
        print(f"Peak Amplitude: {result['peak_amplitude']}")
        print(f"RMS Level: {result['rms_level']}")
        if result["issues"]:
            print("\nIssues Found:")
            for issue in result["issues"]:
                print(f"- {issue}")
        else:
            print("\nNo issues found")


def main():
    """Generate all plots needed for the README."""
    # Ensure assets directory exists
    prefix = "gpu"
    assets_dir = ensure_assets_dir()

    print("\n=== Generating Format Comparison Plot ===")
    test_formats()
    copy_plot(
        str(script_dir / "test_formats/output/test_formats/format_comparison.png"),
        "format_comparison.png",
        assets_dir,
    )
    # Validate WAV output from format test
    validate_and_print(
        str(script_dir / "test_formats/output/test_formats/speech.wav"),
        "Format Test WAV",
    )

    print("\n=== Generating Voice Analysis Plot ===")
    test_voice_analysis()
    copy_plot(
        str(script_dir / "test_combinations/output/analysis_comparison.png"),
        "voice_analysis.png",
        assets_dir,
    )
    # Validate combined voice output
    validate_and_print(
        str(
            script_dir
            / "test_combinations/output/analysis_combined_af_bella_af_nicole.wav"
        ),
        "Combined Voice",
    )

    print("\n=== Generating Performance Benchmark Plots ===")
    benchmark_rtf()
    copy_plot(
        str(script_dir / f"benchmarks/output_plots/{prefix}_processing_time_rtf.png"),
        f"{prefix}_processing_time.png",
        assets_dir,
    )
    copy_plot(
        str(script_dir / f"benchmarks/output_plots/{prefix}_realtime_factor_rtf.png"),
        f"{prefix}_realtime_factor.png",
        assets_dir,
    )
    # Validate RTF benchmark output (~500 tokens)
    validate_and_print(
        str(script_dir / "benchmarks/output_audio/chunk_450_tokens.wav"),
        "RTF Benchmark",
    )

    print("\n=== Generating Streaming Benchmark Plots ===")
    benchmark_stream()
    # Copy direct streaming plots
    copy_plot(
        str(script_dir / "benchmarks/output_plots/first_token_latency_stream.png"),
        f"{prefix}_first_token_latency_direct.png",
        assets_dir,
    )
    copy_plot(
        str(script_dir / "benchmarks/output_plots/first_token_timeline_stream.png"),
        f"{prefix}_first_token_timeline_direct.png",
        assets_dir,
    )
    copy_plot(
        str(script_dir / "benchmarks/output_plots/total_time_latency_stream.png"),
        f"{prefix}_total_time_latency_direct.png",
        assets_dir,
    )
    # Copy OpenAI streaming plots
    copy_plot(
        str(
            script_dir / "benchmarks/output_plots/first_token_latency_stream_openai.png"
        ),
        f"{prefix}_first_token_latency_openai.png",
        assets_dir,
    )
    copy_plot(
        str(
            script_dir
            / "benchmarks/output_plots/first_token_timeline_stream_openai.png"
        ),
        f"{prefix}_first_token_timeline_openai.png",
        assets_dir,
    )
    copy_plot(
        str(
            script_dir / "benchmarks/output_plots/total_time_latency_stream_openai.png"
        ),
        f"{prefix}_total_time_latency_openai.png",
        assets_dir,
    )

    # Wait a moment for files to be generated
    import time

    time.sleep(2)

    # Validate streaming outputs (~500 tokens)
    validate_and_print(
        str(
            script_dir
            / "benchmarks/output_audio_stream/benchmark_tokens500_run1_stream.wav"
        ),
        "Direct Streaming",
    )
    validate_and_print(
        str(
            script_dir
            / "benchmarks/output_audio_stream_openai/benchmark_tokens500_run1_stream_openai.wav"
        ),
        "OpenAI Streaming",
    )
    validate_and_print(
        str(script_dir / "test_formats/output/test_formats/test_audio.wav"),
        "Format Test WAV",
    )

    print("\nAll plots have been generated and copied to the assets directory")


if __name__ == "__main__":
    main()
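Judging by the audio paths elsewhere in this commit, the script appears to live under `examples/assorted_checks/`; it would then be run from that directory (with the API already up) as `python generate_readme_plots.py`, regenerating each plot and copying it into `assets/`.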

View file

@@ -73,6 +73,7 @@ def generate_speech(
"voice": voice,
"speed": 1.0,
"response_format": "wav", # Use WAV for analysis
"stream": False,
},
)
@@ -193,9 +194,10 @@ def plot_analysis(audio_files: Dict[str, str], output_dir: str):
fig.patch.set_facecolor("#1a1a2e")
num_files = len(audio_files)
# Create subplot grid with proper spacing
# Create subplot grid with proper spacing for waveforms and metrics
total_rows = num_files + 2 # Add one more row for metrics
gs = plt.GridSpec(
num_files + 1, 2, height_ratios=[1.5] * num_files + [1], hspace=0.4, wspace=0.3
total_rows, 2, height_ratios=[1.5] * num_files + [1, 1], hspace=0.4, wspace=0.3
)
# Analyze all files first
@@ -216,48 +218,74 @@ def plot_analysis(audio_files: Dict[str, str], output_dir: str):
# Colors for voices
colors = ["#ff2a6d", "#05d9e8", "#d1f7ff"]
# Create two subplots for metrics with similar scales
# Left subplot: Brightness and Volume
ax1 = plt.subplot(gs[num_files, 0])
metrics1 = [
# Create metrics for each subplot
metrics = [
(
"Brightness",
[chars["spectral_centroid"] / 1000 for chars in all_chars.values()],
"kHz",
),
("Volume", [chars["rms"] * 100 for chars in all_chars.values()], "RMS×100"),
]
# Right subplot: Voice Pitch and Texture
ax2 = plt.subplot(gs[num_files, 1])
metrics2 = [
(
"Voice Pitch",
[min(chars["dominant_frequencies"]) for chars in all_chars.values()],
"Hz",
plt.subplot(gs[num_files, 0]),
[
(
"Volume",
[chars["rms"] * 100 for chars in all_chars.values()],
"RMS×100",
)
],
),
(
"Texture",
[chars["zero_crossing_rate"] * 1000 for chars in all_chars.values()],
"ZCR×1000",
plt.subplot(gs[num_files, 1]),
[
(
"Brightness",
[chars["spectral_centroid"] / 1000 for chars in all_chars.values()],
"kHz",
)
],
),
(
plt.subplot(gs[num_files + 1, 0]),
[
(
"Voice Pitch",
[
min(chars["dominant_frequencies"])
for chars in all_chars.values()
],
"Hz",
)
],
),
(
plt.subplot(gs[num_files + 1, 1]),
[
(
"Texture",
[
chars["zero_crossing_rate"] * 1000
for chars in all_chars.values()
],
"ZCR×1000",
)
],
),
]
def plot_grouped_bars(ax, metrics, show_legend=True):
n_groups = len(metrics)
# Plot each metric
for i, (ax, metric_data) in enumerate(metrics):
n_voices = len(audio_files)
bar_width = 0.25
indices = np.array([0])
indices = np.arange(n_groups)
values = metric_data[0][1]
max_val = max(values)
# Get max value for y-axis scaling
max_val = max(max(m[1]) for m in metrics)
for i, (voice, color) in enumerate(zip(audio_files.keys(), colors)):
values = [m[1][i] for m in metrics]
offset = (i - n_voices / 2 + 0.5) * bar_width
for j, (voice, color) in enumerate(zip(audio_files.keys(), colors)):
offset = (j - n_voices / 2 + 0.5) * bar_width
bars = ax.bar(
indices + offset, values, bar_width, label=voice, color=color, alpha=0.8
indices + offset,
[values[j]],
bar_width,
label=voice,
color=color,
alpha=0.8,
)
# Add value labels on top of bars
@ -274,12 +302,12 @@ def plot_analysis(audio_files: Dict[str, str], output_dir: str):
)
ax.set_xticks(indices)
ax.set_xticklabels([f"{m[0]}\n({m[2]})" for m in metrics])
# Set y-axis limits with some padding
ax.set_xticklabels([f"{metric_data[0][0]}\n({metric_data[0][2]})"])
ax.set_ylim(0, max_val * 1.2)
ax.set_ylabel("Value")
if show_legend:
# Only show legend on first metric plot
if i == 0:
ax.legend(
bbox_to_anchor=(1.05, 1),
loc="upper left",
@ -287,22 +315,11 @@ def plot_analysis(audio_files: Dict[str, str], output_dir: str):
edgecolor="#ffffff",
)
# Plot both subplots
plot_grouped_bars(ax1, metrics1, show_legend=True)
plot_grouped_bars(ax2, metrics2, show_legend=False)
# Style the subplot
setup_plot(fig, ax, metric_data[0][0])
# Style both subplots
setup_plot(fig, ax1, "Brightness and Volume")
setup_plot(fig, ax2, "Voice Pitch and Texture")
# Add y-axis labels
ax1.set_ylabel("Value")
ax2.set_ylabel("Value")
# Adjust the figure size to accommodate the legend
fig.set_size_inches(15, 15)
# Add padding around the entire figure
# Adjust the figure size and padding
fig.set_size_inches(15, 20)
plt.subplots_adjust(right=0.85, top=0.95, bottom=0.05, left=0.1)
plt.savefig(os.path.join(output_dir, "analysis_comparison.png"), dpi=300)
print(f"Saved analysis comparison to {output_dir}/analysis_comparison.png")
@ -332,7 +349,7 @@ def main():
)
parser.add_argument("--url", default="http://localhost:8880", help="API base URL")
parser.add_argument(
"--output-dir",
"--output-dir",
default="examples/assorted_checks/test_combinations/output",
help="Output directory for audio files",
)

View file

@ -66,26 +66,27 @@ def plot_format_comparison(stats: list, output_dir: str):
for i, stat in enumerate(stats):
format_name = stat["format"].upper()
try:
# Handle PCM format differently
if stat["format"] == "pcm":
# Read raw PCM data (16-bit mono)
with open(
os.path.join(output_dir, f"test_audio.{stat['format']}"), "rb"
) as f:
raw_data = f.read()
data = np.frombuffer(raw_data, dtype=np.int16)
data = data.astype(np.float32) / 32768.0 # Convert to float [-1, 1]
sr = 24000
else:
# Read other formats with soundfile
data, sr = sf.read(
os.path.join(output_dir, f"test_audio.{stat['format']}")
)
file_path = os.path.join(output_dir, f"test_audio.{stat['format']}")
# Plot waveform
if stat["format"] == "wav":
# Use scipy.io.wavfile for WAV files
sr, data = wavfile.read(file_path)
data = data.astype(np.float32) / 32768.0 # Convert to float [-1, 1]
elif stat["format"] == "pcm":
# Read raw 16-bit signed little-endian PCM data at 24kHz
data = np.frombuffer(
open(file_path, "rb").read(), dtype="<i2"
) # '<i2' means little-endian 16-bit signed int
data = data.astype(np.float32) / 32768.0 # Convert to float [-1, 1]
sr = 24000 # Known sample rate for our endpoint
else:
# Use soundfile for other formats (mp3, opus, flac)
data, sr = sf.read(file_path)
# Plot waveform with consistent normalization
ax = plt.subplot(gs_waves[i])
time = np.arange(len(data)) / sr
plt.plot(time, data / np.max(np.abs(data)), linewidth=0.5, color="#ff2a6d")
plt.plot(time, data, linewidth=0.5, color="#ff2a6d")
ax.set_xlabel("Time (seconds)")
ax.set_ylabel("")
ax.set_ylim(-1.1, 1.1)
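The raw-PCM branch above can be exercised on its own; a minimal decode sketch, assuming mono S16LE at the endpoint's fixed 24 kHz rate:

```python
import numpy as np

def read_pcm_s16le(path: str, sample_rate: int = 24000):
    """Decode raw mono 16-bit signed little-endian PCM to float32 in [-1, 1]."""
    with open(path, "rb") as f:
        samples = np.frombuffer(f.read(), dtype="<i2")  # little-endian int16
    return samples.astype(np.float32) / 32768.0, sample_rate
```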
@ -200,41 +201,42 @@ def get_audio_stats(file_path: str) -> dict:
"""Get audio file statistics"""
file_size = os.path.getsize(file_path)
file_size_kb = file_size / 1024 # Convert to KB
format_name = Path(file_path).suffix[1:]
try:
# Try reading with soundfile first
if format_name == "wav":
# Use scipy.io.wavfile for WAV files
sample_rate, data = wavfile.read(file_path)
data = data.astype(np.float32) / 32768.0 # Convert to float [-1, 1]
duration = len(data) / sample_rate
channels = 1 if len(data.shape) == 1 else data.shape[1]
elif format_name == "pcm":
# For PCM, read raw 16-bit signed little-endian PCM data at 24kHz
data = np.frombuffer(
open(file_path, "rb").read(), dtype="<i2"
) # '<i2' means little-endian 16-bit signed int
data = data.astype(np.float32) / 32768.0 # Normalize to [-1, 1]
sample_rate = 24000 # Known sample rate for our endpoint
duration = len(data) / sample_rate
channels = 1
else:
# Use soundfile for other formats (mp3, opus, flac)
data, sample_rate = sf.read(file_path)
duration = len(data) / sample_rate
channels = 1 if len(data.shape) == 1 else data.shape[1]
# Calculate audio statistics
stats = {
"format": Path(file_path).suffix[1:],
"file_size_kb": round(file_size_kb, 2),
"duration_seconds": round(duration, 2),
"sample_rate": sample_rate,
"channels": channels,
"min_amplitude": float(np.min(data)),
"max_amplitude": float(np.max(data)),
"mean_amplitude": float(np.mean(np.abs(data))),
"rms_amplitude": float(np.sqrt(np.mean(np.square(data)))),
}
return stats
except:
# For PCM, read raw bytes and estimate duration
with open(file_path, "rb") as f:
data = f.read()
# Assuming 16-bit PCM mono at 24kHz
samples = len(data) // 2 # 2 bytes per sample
duration = samples / 24000
return {
"format": "pcm",
"file_size_kb": round(file_size_kb, 2),
"duration_seconds": round(duration, 2),
"sample_rate": 24000,
"channels": 1,
"note": "PCM stats are estimated from raw bytes",
}
# Calculate audio statistics
stats = {
"format": format_name,
"file_size_kb": round(file_size_kb, 2),
"duration_seconds": round(duration, 2),
"sample_rate": sample_rate,
"channels": channels,
"min_amplitude": float(np.min(data)),
"max_amplitude": float(np.max(data)),
"mean_amplitude": float(np.mean(np.abs(data))),
"rms_amplitude": float(np.sqrt(np.mean(np.square(data)))),
}
return stats
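A quick usage sketch for `get_audio_stats` (the path is hypothetical; the format is inferred from the file suffix):

```python
stats = get_audio_stats("output/test_audio.wav")
print(stats["duration_seconds"], stats["sample_rate"], stats["rms_amplitude"])
```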
def main():
@ -254,13 +256,49 @@ def main():
# Generate and save
start_time = time.time()
response = client.audio.speech.create(
model="kokoro", voice=voice, input=SAMPLE_TEXT, response_format=fmt
# Use requests with stream=False for consistent data handling
response = requests.post(
"http://localhost:8880/v1/audio/speech",
json={
"model": "kokoro",
"voice": voice,
"input": SAMPLE_TEXT,
"response_format": fmt,
"stream": False, # Explicitly disable streaming to get single complete chunk
},
stream=False,
headers={"Accept": f"audio/{fmt}"}, # Explicitly request audio format
)
generation_time = time.time() - start_time
with open(output_path, "wb") as f:
f.write(response.content)
print(f"\nResponse headers for {fmt}:")
for header, value in response.headers.items():
print(f"{header}: {value}")
print(f"Content length: {len(response.content)} bytes")
print(f"First few bytes: {response.content[:20].hex()}")
# Write the file and verify it was written correctly
try:
with open(output_path, "wb") as f:
f.write(response.content)
# Verify file was written
if not output_path.exists():
raise Exception(f"Failed to write {fmt} file")
# Check file size matches content length
written_size = output_path.stat().st_size
if written_size != len(response.content):
raise Exception(
f"File size mismatch: expected {len(response.content)} bytes, got {written_size}"
)
print(f"Successfully wrote {fmt} file")
except Exception as e:
print(f"Error writing {fmt} file: {e}")
continue
# Get stats
file_stats = get_audio_stats(str(output_path))
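For contrast with the `stream=False` request above, a streaming variant would consume chunks as they arrive. A sketch; `"stream": True` and the voice name `"af"` are assumptions for illustration, while the endpoint URL and payload shape mirror the non-streaming call:

```python
import requests

with requests.post(
    "http://localhost:8880/v1/audio/speech",
    json={
        "model": "kokoro",
        "voice": "af",            # hypothetical voice name
        "input": "Hello world.",
        "response_format": "pcm",
        "stream": True,           # assumed flag, mirroring "stream": False above
    },
    stream=True,
) as response:
    with open("test_audio_stream.pcm", "wb") as f:
        for chunk in response.iter_content(chunk_size=4096):
            f.write(chunk)  # write audio as it arrives
```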

View file

@ -4,15 +4,19 @@ import random
import string
from typing import List, Tuple
def create_test_cases() -> List[str]:
"""Create a variety of test cases with different characteristics"""
# Helper to create random text with specific patterns
def random_text(length: int) -> str:
return ''.join(random.choice(string.ascii_letters + string.digits + " .,!?") for _ in range(length))
return "".join(
random.choice(string.ascii_letters + string.digits + " .,!?")
for _ in range(length)
)
test_cases = []
# Base test cases that hit specific patterns
base_cases = [
"Dr. Smith and Mr. Jones discussed the $1,234.56 million investment.",
@ -21,10 +25,10 @@ def create_test_cases() -> List[str]:
"X's and Y's properties cost £50 million in the 1990s",
"こんにちは。今日は!",
]
# Add base cases
test_cases.extend(base_cases)
# Add variations with random content
for length in [100, 1000, 10000]:
# Create 3 variations of each length
@ -35,23 +39,24 @@ def create_test_cases() -> List[str]:
text = text.replace(text[30:40], "$1,234.56")
text = text.replace(text[50:60], "A.B.C. xyz")
test_cases.append(text)
return test_cases
class TextNormalizerInline:
"""Text normalizer using inline patterns"""
def normalize(self, text: str) -> str:
# Replace quotes and brackets
text = text.replace(chr(8216), "'").replace(chr(8217), "'")
text = text.replace("«", chr(8220)).replace("»", chr(8221))
text = text.replace(chr(8220), '"').replace(chr(8221), '"')
text = text.replace("(", "«").replace(")", "»")
# Handle CJK punctuation
for a, b in zip("、。!,:;?", ",.!,:;?"):
text = text.replace(a, b + " ")
text = re.sub(r"[^\S \n]", " ", text)
text = re.sub(r" +", " ", text)
text = re.sub(r"(?<=\n) +(?=\n)", "", text)
@ -61,108 +66,132 @@ class TextNormalizerInline:
text = re.sub(r"\b(?:Mrs\.|MRS\.(?= [A-Z]))", "Mrs", text)
text = re.sub(r"\betc\.(?! [A-Z])", "etc", text)
text = re.sub(r"(?i)\b(y)eah?\b", r"\1e'a", text)
text = re.sub(r"\d*\.\d+|\b\d{4}s?\b|(?<!:)\b(?:[1-9]|1[0-2]):[0-5]\d\b(?!:)", split_num, text)
text = re.sub(
r"\d*\.\d+|\b\d{4}s?\b|(?<!:)\b(?:[1-9]|1[0-2]):[0-5]\d\b(?!:)",
split_num,
text,
)
text = re.sub(r"(?<=\d),(?=\d)", "", text)
text = re.sub(r"(?i)[$£]\d+(?:\.\d+)?(?: hundred| thousand| (?:[bm]|tr)illion)*\b|[$£]\d+\.\d\d?\b", handle_money, text)
text = re.sub(
r"(?i)[$£]\d+(?:\.\d+)?(?: hundred| thousand| (?:[bm]|tr)illion)*\b|[$£]\d+\.\d\d?\b",
handle_money,
text,
)
text = re.sub(r"\d*\.\d+", handle_decimal, text)
text = re.sub(r"(?<=\d)-(?=\d)", " to ", text)
text = re.sub(r"(?<=\d)S", " S", text)
text = re.sub(r"(?<=[BCDFGHJ-NP-TV-Z])'?s\b", "'S", text)
text = re.sub(r"(?<=X')S\b", "s", text)
text = re.sub(r"(?:[A-Za-z]\.){2,} [a-z]", lambda m: m.group().replace(".", "-"), text)
text = re.sub(
r"(?:[A-Za-z]\.){2,} [a-z]", lambda m: m.group().replace(".", "-"), text
)
text = re.sub(r"(?i)(?<=[A-Z])\.(?=[A-Z])", "-", text)
return text.strip()
class TextNormalizerCompiled:
"""Text normalizer using all compiled patterns"""
def __init__(self):
self.patterns = {
'whitespace': re.compile(r"[^\S \n]"),
'multi_space': re.compile(r" +"),
'newline_space': re.compile(r"(?<=\n) +(?=\n)"),
'doctor': re.compile(r"\bD[Rr]\.(?= [A-Z])"),
'mister': re.compile(r"\b(?:Mr\.|MR\.(?= [A-Z]))"),
'miss': re.compile(r"\b(?:Ms\.|MS\.(?= [A-Z]))"),
'mrs': re.compile(r"\b(?:Mrs\.|MRS\.(?= [A-Z]))"),
'etc': re.compile(r"\betc\.(?! [A-Z])"),
'yeah': re.compile(r"(?i)\b(y)eah?\b"),
'numbers': re.compile(r"\d*\.\d+|\b\d{4}s?\b|(?<!:)\b(?:[1-9]|1[0-2]):[0-5]\d\b(?!:)"),
'comma_in_number': re.compile(r"(?<=\d),(?=\d)"),
'money': re.compile(r"(?i)[$£]\d+(?:\.\d+)?(?: hundred| thousand| (?:[bm]|tr)illion)*\b|[$£]\d+\.\d\d?\b"),
'decimal': re.compile(r"\d*\.\d+"),
'range': re.compile(r"(?<=\d)-(?=\d)"),
's_after_number': re.compile(r"(?<=\d)S"),
'possessive_s': re.compile(r"(?<=[BCDFGHJ-NP-TV-Z])'?s\b"),
'x_possessive': re.compile(r"(?<=X')S\b"),
'initials': re.compile(r"(?:[A-Za-z]\.){2,} [a-z]"),
'single_initial': re.compile(r"(?i)(?<=[A-Z])\.(?=[A-Z])")
"whitespace": re.compile(r"[^\S \n]"),
"multi_space": re.compile(r" +"),
"newline_space": re.compile(r"(?<=\n) +(?=\n)"),
"doctor": re.compile(r"\bD[Rr]\.(?= [A-Z])"),
"mister": re.compile(r"\b(?:Mr\.|MR\.(?= [A-Z]))"),
"miss": re.compile(r"\b(?:Ms\.|MS\.(?= [A-Z]))"),
"mrs": re.compile(r"\b(?:Mrs\.|MRS\.(?= [A-Z]))"),
"etc": re.compile(r"\betc\.(?! [A-Z])"),
"yeah": re.compile(r"(?i)\b(y)eah?\b"),
"numbers": re.compile(
r"\d*\.\d+|\b\d{4}s?\b|(?<!:)\b(?:[1-9]|1[0-2]):[0-5]\d\b(?!:)"
),
"comma_in_number": re.compile(r"(?<=\d),(?=\d)"),
"money": re.compile(
r"(?i)[$£]\d+(?:\.\d+)?(?: hundred| thousand| (?:[bm]|tr)illion)*\b|[$£]\d+\.\d\d?\b"
),
"decimal": re.compile(r"\d*\.\d+"),
"range": re.compile(r"(?<=\d)-(?=\d)"),
"s_after_number": re.compile(r"(?<=\d)S"),
"possessive_s": re.compile(r"(?<=[BCDFGHJ-NP-TV-Z])'?s\b"),
"x_possessive": re.compile(r"(?<=X')S\b"),
"initials": re.compile(r"(?:[A-Za-z]\.){2,} [a-z]"),
"single_initial": re.compile(r"(?i)(?<=[A-Z])\.(?=[A-Z])"),
}
def normalize(self, text: str) -> str:
# Replace quotes and brackets
text = text.replace(chr(8216), "'").replace(chr(8217), "'")
text = text.replace("«", chr(8220)).replace("»", chr(8221))
text = text.replace(chr(8220), '"').replace(chr(8221), '"')
text = text.replace("(", "«").replace(")", "»")
# Handle CJK punctuation
for a, b in zip("、。!,:;?", ",.!,:;?"):
text = text.replace(a, b + " ")
# Use compiled patterns
text = self.patterns['whitespace'].sub(" ", text)
text = self.patterns['multi_space'].sub(" ", text)
text = self.patterns['newline_space'].sub("", text)
text = self.patterns['doctor'].sub("Doctor", text)
text = self.patterns['mister'].sub("Mister", text)
text = self.patterns['miss'].sub("Miss", text)
text = self.patterns['mrs'].sub("Mrs", text)
text = self.patterns['etc'].sub("etc", text)
text = self.patterns['yeah'].sub(r"\1e'a", text)
text = self.patterns['numbers'].sub(split_num, text)
text = self.patterns['comma_in_number'].sub("", text)
text = self.patterns['money'].sub(handle_money, text)
text = self.patterns['decimal'].sub(handle_decimal, text)
text = self.patterns['range'].sub(" to ", text)
text = self.patterns['s_after_number'].sub(" S", text)
text = self.patterns['possessive_s'].sub("'S", text)
text = self.patterns['x_possessive'].sub("s", text)
text = self.patterns['initials'].sub(lambda m: m.group().replace(".", "-"), text)
text = self.patterns['single_initial'].sub("-", text)
text = self.patterns["whitespace"].sub(" ", text)
text = self.patterns["multi_space"].sub(" ", text)
text = self.patterns["newline_space"].sub("", text)
text = self.patterns["doctor"].sub("Doctor", text)
text = self.patterns["mister"].sub("Mister", text)
text = self.patterns["miss"].sub("Miss", text)
text = self.patterns["mrs"].sub("Mrs", text)
text = self.patterns["etc"].sub("etc", text)
text = self.patterns["yeah"].sub(r"\1e'a", text)
text = self.patterns["numbers"].sub(split_num, text)
text = self.patterns["comma_in_number"].sub("", text)
text = self.patterns["money"].sub(handle_money, text)
text = self.patterns["decimal"].sub(handle_decimal, text)
text = self.patterns["range"].sub(" to ", text)
text = self.patterns["s_after_number"].sub(" S", text)
text = self.patterns["possessive_s"].sub("'S", text)
text = self.patterns["x_possessive"].sub("s", text)
text = self.patterns["initials"].sub(
lambda m: m.group().replace(".", "-"), text
)
text = self.patterns["single_initial"].sub("-", text)
return text.strip()
class TextNormalizerHybrid:
"""Text normalizer using hybrid approach - compile only complex/frequent patterns"""
def __init__(self):
# Only compile patterns that are complex or frequently used
self.patterns = {
'whitespace': re.compile(r"[^\S \n]"),
'numbers': re.compile(r"\d*\.\d+|\b\d{4}s?\b|(?<!:)\b(?:[1-9]|1[0-2]):[0-5]\d\b(?!:)"),
'money': re.compile(r"(?i)[$£]\d+(?:\.\d+)?(?: hundred| thousand| (?:[bm]|tr)illion)*\b|[$£]\d+\.\d\d?\b"),
'initials': re.compile(r"(?:[A-Za-z]\.){2,} [a-z]")
"whitespace": re.compile(r"[^\S \n]"),
"numbers": re.compile(
r"\d*\.\d+|\b\d{4}s?\b|(?<!:)\b(?:[1-9]|1[0-2]):[0-5]\d\b(?!:)"
),
"money": re.compile(
r"(?i)[$£]\d+(?:\.\d+)?(?: hundred| thousand| (?:[bm]|tr)illion)*\b|[$£]\d+\.\d\d?\b"
),
"initials": re.compile(r"(?:[A-Za-z]\.){2,} [a-z]"),
}
def normalize(self, text: str) -> str:
# Replace quotes and brackets
text = text.replace(chr(8216), "'").replace(chr(8217), "'")
text = text.replace("«", chr(8220)).replace("»", chr(8221))
text = text.replace(chr(8220), '"').replace(chr(8221), '"')
text = text.replace("(", "«").replace(")", "»")
# Handle CJK punctuation
for a, b in zip("、。!,:;?", ",.!,:;?"):
text = text.replace(a, b + " ")
# Use compiled patterns for complex operations
text = self.patterns['whitespace'].sub(" ", text)
text = self.patterns['numbers'].sub(split_num, text)
text = self.patterns['money'].sub(handle_money, text)
text = self.patterns['initials'].sub(lambda m: m.group().replace(".", "-"), text)
text = self.patterns["whitespace"].sub(" ", text)
text = self.patterns["numbers"].sub(split_num, text)
text = self.patterns["money"].sub(handle_money, text)
text = self.patterns["initials"].sub(
lambda m: m.group().replace(".", "-"), text
)
# Use inline patterns for simpler operations
text = re.sub(r" +", " ", text)
text = re.sub(r"(?<=\n) +(?=\n)", "", text)
@ -179,9 +208,10 @@ class TextNormalizerHybrid:
text = re.sub(r"(?<=[BCDFGHJ-NP-TV-Z])'?s\b", "'S", text)
text = re.sub(r"(?<=X')S\b", "s", text)
text = re.sub(r"(?i)(?<=[A-Z])\.(?=[A-Z])", "-", text)
return text.strip()
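A quick demonstration of any of the three normalizers on one of the base test cases (output depends on the `split_num`/`handle_money`/`handle_decimal` handlers defined below):

```python
norm = TextNormalizerCompiled()
sample = "Dr. Smith and Mr. Jones discussed the $1,234.56 million investment."
print(norm.normalize(sample))  # titles expanded, number/money handlers applied
```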
def split_num(match: re.Match) -> str:
"""Split numbers for TTS processing"""
num = match.group(0)
@ -192,61 +222,70 @@ def split_num(match: re.Match) -> str:
return f"{num[:-1]} s"
return num
def handle_money(match: re.Match) -> str:
"""Format money strings for TTS"""
text = match.group(0)
return text.replace("$", " dollars ").replace("£", " pounds ")
def handle_decimal(match: re.Match) -> str:
"""Format decimal numbers for TTS"""
num = match.group(0)
return num.replace(".", " point ")
def benchmark_normalizers(test_cases: List[str], iterations: int = 100) -> Tuple[float, float, float]:
def benchmark_normalizers(
test_cases: List[str], iterations: int = 100
) -> Tuple[float, float, float]:
"""Benchmark all three implementations"""
normalizers = {
'inline': TextNormalizerInline(),
'compiled': TextNormalizerCompiled(),
'hybrid': TextNormalizerHybrid()
"inline": TextNormalizerInline(),
"compiled": TextNormalizerCompiled(),
"hybrid": TextNormalizerHybrid(),
}
results = {}
# Test each normalizer
for name, normalizer in normalizers.items():
start = time.perf_counter()
# Run normalizations
for _ in range(iterations):
for test in test_cases:
normalizer.normalize(test)
results[name] = time.perf_counter() - start
return results
def verify_outputs(test_cases: List[str]) -> bool:
"""Verify that all implementations produce identical output"""
normalizers = {
'inline': TextNormalizerInline(),
'compiled': TextNormalizerCompiled(),
'hybrid': TextNormalizerHybrid()
"inline": TextNormalizerInline(),
"compiled": TextNormalizerCompiled(),
"hybrid": TextNormalizerHybrid(),
}
for test in test_cases:
results = [norm.normalize(test) for norm in normalizers.values()]
if not all(r == results[0] for r in results):
return False
return True
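The premise being benchmarked can be checked in isolation. Note that `re` caches compiled patterns internally, so gains from precompiling are usually modest, mainly skipping the cache lookup; a micro-check sketch using one of the patterns above:

```python
import re
import timeit

pat = re.compile(r"(?<=\d),(?=\d)")
s = "1,234,567 and 8,910"
t_inline = timeit.timeit(lambda: re.sub(r"(?<=\d),(?=\d)", "", s), number=100_000)
t_compiled = timeit.timeit(lambda: pat.sub("", s), number=100_000)
print(f"inline: {t_inline:.3f}s  compiled: {t_compiled:.3f}s")
```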
def main():
# Create test cases
print("Generating test cases...")
test_cases = create_test_cases()
total_chars = sum(len(t) for t in test_cases)
print(f"Created {len(test_cases)} test cases, total size: {total_chars:,} characters")
print(
f"Created {len(test_cases)} test cases, total size: {total_chars:,} characters"
)
# Verify output consistency
print("\nVerifying output consistency...")
if verify_outputs(test_cases):
@ -254,15 +293,16 @@ def main():
else:
print("✗ Warning: Implementations produce different outputs!")
return
# Run benchmarks
print("\nRunning benchmarks...")
iterations = 100
results = benchmark_normalizers(test_cases, iterations)
# Print results
print(f"\nResults for {iterations} iterations: ")
for name, time_taken in results.items():
print(f"{name.capitalize()}: {time_taken:.3f}s")
main()

View file

@ -1,8 +1,11 @@
import argparse
from typing import Any, Dict
from pathlib import Path
import numpy as np
import soundfile as sf
import argparse
from pathlib import Path
from typing import Dict, Any
from tqdm import tqdm
def validate_tts(wav_path: str) -> dict:
"""
@ -13,34 +16,40 @@ def validate_tts(wav_path: str) -> dict:
audio, sr = sf.read(wav_path)
if len(audio.shape) > 1:
audio = np.mean(audio, axis=1)
duration = len(audio) / sr
issues = []
# Basic quality checks
abs_audio = np.abs(audio)
stats = {
'rms': float(np.sqrt(np.mean(audio**2))),
'peak': float(np.max(abs_audio)),
'dc_offset': float(np.mean(audio))
"rms": float(np.sqrt(np.mean(audio**2))),
"peak": float(np.max(abs_audio)),
"dc_offset": float(np.mean(audio)),
}
clip_count = np.sum(abs_audio >= 0.99)
clip_percent = (clip_count / len(audio)) * 100
if duration < 0.1:
issues.append("WARNING: Audio is suspiciously short - possible failed generation")
if stats['peak'] >= 1.0:
issues.append(
"WARNING: Audio is suspiciously short - possible failed generation"
)
if stats["peak"] >= 1.0:
if clip_percent > 1.0:
issues.append(f"WARNING: Significant clipping detected ({clip_percent:.2e}% of samples)")
issues.append(
f"WARNING: Significant clipping detected ({clip_percent:.2e}% of samples)"
)
elif clip_percent > 0.01:
issues.append(f"INFO: Minor peak limiting detected ({clip_percent:.2e}% of samples)")
if stats['rms'] < 0.01:
issues.append(
f"INFO: Minor peak limiting detected ({clip_percent:.2e}% of samples)"
)
if stats["rms"] < 0.01:
issues.append("WARNING: Audio is very quiet - possible failed generation")
if abs(stats['dc_offset']) > 0.1:
if abs(stats["dc_offset"]) > 0.1:
issues.append(f"WARNING: High DC offset ({stats['dc_offset']:.3f})")
# Check for long silence gaps
@ -51,66 +60,79 @@ def validate_tts(wav_path: str) -> dict:
window_size = int(min_silence * sr)
silence_count = 0
last_silence = -1
start_idx = int(0.2 * sr) # Skip first 0.2s
for i in range(start_idx, len(db) - window_size, window_size):
window = db[i:i+window_size]
for i in tqdm(
range(start_idx, len(db) - window_size, window_size),
desc="Checking for silence",
):
window = db[i : i + window_size]
if np.mean(window) < silence_threshold:
silent_ratio = np.mean(window < silence_threshold)
if silent_ratio > 0.9:
if last_silence == -1 or (i/sr - last_silence) > 2.0:
if last_silence == -1 or (i / sr - last_silence) > 2.0:
silence_count += 1
last_silence = i/sr
issues.append(f"WARNING: Long silence detected at {i/sr:.2f}s (duration: {min_silence:.1f}s)")
last_silence = i / sr
issues.append(
f"WARNING: Long silence detected at {i/sr:.2f}s (duration: {min_silence:.1f}s)"
)
if silence_count > 2:
issues.append(f"WARNING: Multiple long silences found ({silence_count} total)")
issues.append(
f"WARNING: Multiple long silences found ({silence_count} total)"
)
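The `db` array scanned above is computed earlier in the function and not shown in this hunk; presumably a standard log-magnitude envelope along these lines (the threshold and gap values here are hypothetical):

```python
db = 20 * np.log10(np.abs(audio) + 1e-10)  # per-sample log magnitude
silence_threshold = -45.0                   # hypothetical dB floor
min_silence = 1.0                           # hypothetical gap length, seconds
```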
# Detect audio artifacts
diff = np.diff(audio)
abs_diff = np.abs(diff)
window_size = min(int(0.005 * sr), 256)
window = np.ones(window_size)/window_size
local_avg_diff = np.convolve(abs_diff, window, mode='same')
window = np.ones(window_size) / window_size
local_avg_diff = np.convolve(abs_diff, window, mode="same")
spikes = (abs_diff > (10 * local_avg_diff)) & (abs_diff > 0.1)
artifact_indices = np.nonzero(spikes)[0]
artifacts = []
if len(artifact_indices) > 0:
gaps = np.diff(artifact_indices)
min_gap = int(0.005 * sr)
break_points = np.nonzero(gaps > min_gap)[0] + 1
groups = np.split(artifact_indices, break_points)
for group in groups:
if len(group) >= 5:
severity = np.max(abs_diff[group])
if severity > 0.2:
center_idx = group[len(group)//2]
artifacts.append({
'time': float(center_idx/sr), # Ensure float for consistent timing
'severity': float(severity)
})
center_idx = group[len(group) // 2]
artifacts.append(
{
"time": float(
center_idx / sr
), # Ensure float for consistent timing
"severity": float(severity),
}
)
issues.append(
f"WARNING: Audio discontinuity at {center_idx/sr:.3f}s "
f"(severity: {severity:.3f})"
)
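The spike test above (first difference exceeding 10× its local moving average, plus an absolute floor of 0.1) can be sanity-checked on a synthetic click:

```python
import numpy as np

x = np.sin(np.linspace(0, 2 * np.pi, 1000)).astype(np.float32)
x[500] += 0.8                        # inject a discontinuity
d = np.abs(np.diff(x))
w = np.ones(64) / 64
local = np.convolve(d, w, mode="same")
spikes = (d > 10 * local) & (d > 0.1)
print(np.nonzero(spikes)[0])         # -> [499 500], the injected click
```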
# Check for repeated speech segments
for chunk_duration in [5.0, 10.0]:
for chunk_duration in tqdm(
[0.5, 2.5, 5.0, 10.0], desc="Checking for repeated speech"
):
chunk_size = int(chunk_duration * sr)
overlap = int(0.2 * chunk_size)
for i in range(0, len(audio) - 2*chunk_size, overlap):
chunk1 = audio[i:i+chunk_size]
chunk2 = audio[i+chunk_size:i+2*chunk_size]
for i in range(0, len(audio) - 2 * chunk_size, overlap):
chunk1 = audio[i : i + chunk_size]
chunk2 = audio[i + chunk_size : i + 2 * chunk_size]
if np.mean(np.abs(chunk1)) < 0.01 or np.mean(np.abs(chunk2)) < 0.01:
continue
try:
correlation = np.corrcoef(chunk1, chunk2)[0,1]
correlation = np.corrcoef(chunk1, chunk2)[0, 1]
if not np.isnan(correlation) and correlation > 0.92:
issues.append(
f"WARNING: Possible repeated speech at {i/sr:.1f}s "
@ -128,92 +150,113 @@ def validate_tts(wav_path: str) -> dict:
"rms_level": f"{stats['rms']:.3f}",
"dc_offset": f"{stats['dc_offset']:.3f}",
"artifact_count": len(artifacts),
"artifact_locations": [a['time'] for a in artifacts],
"artifact_severities": [a['severity'] for a in artifacts],
"artifact_locations": [a["time"] for a in artifacts],
"artifact_severities": [a["severity"] for a in artifacts],
"issues": issues,
"valid": len(issues) == 0
}
except Exception as e:
return {
"file": wav_path,
"error": str(e),
"valid": False
"valid": len(issues) == 0,
}
def generate_analysis_plots(wav_path: str, output_dir: str, validation_result: Dict[str, Any]):
except Exception as e:
return {"file": wav_path, "error": str(e), "valid": False}
def generate_analysis_plots(
wav_path: str, output_dir: str, validation_result: Dict[str, Any]
):
"""
Generate analysis plots for audio file with time-aligned visualizations.
"""
import matplotlib.pyplot as plt
from scipy.signal import spectrogram
# Load audio
audio, sr = sf.read(wav_path)
if len(audio.shape) > 1:
audio = np.mean(audio, axis=1)
# Create figure with shared x-axis
fig = plt.figure(figsize=(15, 8))
gs = plt.GridSpec(2, 1, height_ratios=[1.2, 0.8], hspace=0.1)
ax1 = fig.add_subplot(gs[0])
ax2 = fig.add_subplot(gs[1], sharex=ax1)
# Calculate spectrogram
nperseg = 2048
noverlap = 1536
f, t, Sxx = spectrogram(audio, sr, nperseg=nperseg, noverlap=noverlap,
window='hann', scaling='spectrum')
f, t, Sxx = spectrogram(
audio, sr, nperseg=nperseg, noverlap=noverlap, window="hann", scaling="spectrum"
)
# Plot spectrogram
im = ax1.pcolormesh(t, f, 10 * np.log10(Sxx + 1e-10),
shading='gouraud', cmap='viridis',
vmin=-100, vmax=-20)
ax1.set_ylabel('Frequency [Hz]', fontsize=10)
cbar = plt.colorbar(im, ax=ax1, label='dB')
ax1.set_title('Spectrogram', pad=10, fontsize=12)
im = ax1.pcolormesh(
t,
f,
10 * np.log10(Sxx + 1e-10),
shading="gouraud",
cmap="viridis",
vmin=-100,
vmax=-20,
)
ax1.set_ylabel("Frequency [Hz]", fontsize=10)
cbar = plt.colorbar(im, ax=ax1, label="dB")
ax1.set_title("Spectrogram", pad=10, fontsize=12)
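Worked numbers for the STFT window above, at the service's 24 kHz output rate:

```python
nperseg, noverlap, sr = 2048, 1536, 24000
hop = nperseg - noverlap      # 512 samples between frames
print(hop / sr)               # ~0.0213 s time resolution
print(sr / nperseg)           # ~11.72 Hz per frequency bin
```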
# Plot waveform with exact time alignment
times = np.arange(len(audio)) / sr
ax2.plot(times, audio, color='#2E5596', alpha=0.7, linewidth=0.5, label='Audio')
ax2.set_ylabel('Amplitude', fontsize=10)
ax2.set_xlabel('Time [sec]', fontsize=10)
ax2.plot(times, audio, color="#2E5596", alpha=0.7, linewidth=0.5, label="Audio")
ax2.set_ylabel("Amplitude", fontsize=10)
ax2.set_xlabel("Time [sec]", fontsize=10)
ax2.grid(True, alpha=0.2)
# Add artifact markers
if 'artifact_locations' in validation_result and validation_result['artifact_locations']:
for loc in validation_result['artifact_locations']:
ax1.axvline(x=loc, color='red', alpha=0.7, linewidth=2)
ax2.axvline(x=loc, color='red', alpha=0.7, linewidth=2, label='Detected Artifacts')
if (
"artifact_locations" in validation_result
and validation_result["artifact_locations"]
):
for loc in validation_result["artifact_locations"]:
ax1.axvline(x=loc, color="red", alpha=0.7, linewidth=2)
ax2.axvline(
x=loc, color="red", alpha=0.7, linewidth=2, label="Detected Artifacts"
)
# Add legend to both plots
if len(validation_result['artifact_locations']) > 0:
ax1.plot([], [], color='red', linewidth=2, label='Detected Artifacts')
ax1.legend(loc='upper right', fontsize=8)
if len(validation_result["artifact_locations"]) > 0:
ax1.plot([], [], color="red", linewidth=2, label="Detected Artifacts")
ax1.legend(loc="upper right", fontsize=8)
# Only add unique labels to legend
handles, labels = ax2.get_legend_handles_labels()
unique_labels = dict(zip(labels, handles))
ax2.legend(unique_labels.values(), unique_labels.keys(),
loc='upper right', fontsize=8)
ax2.legend(
unique_labels.values(),
unique_labels.keys(),
loc="upper right",
fontsize=8,
)
# Set common x limits
xlim = (0, len(audio)/sr)
xlim = (0, len(audio) / sr)
ax1.set_xlim(xlim)
ax2.set_xlim(xlim)
og_filename = Path(wav_path).name.split(".")[0]
# Save plot
plt.savefig(Path(output_dir) / f"{og_filename}_audio_analysis.png", dpi=300, bbox_inches='tight')
plt.savefig(
Path(output_dir) / f"{og_filename}_audio_analysis.png",
dpi=300,
bbox_inches="tight",
)
plt.close()
if __name__ == "__main__":
wav_file = r"C:\Users\jerem\Desktop\Kokoro-FastAPI\examples\output.wav"
silent=False
if __name__ == "__main__":
wav_file = r"C:\Users\jerem\Desktop\Kokoro-FastAPI\examples\assorted_checks\benchmarks\output_audio\chunk_600_tokens.wav"
silent = False
print(f"\n\n Processing:\n\t{wav_file}")
result = validate_tts(wav_file)
if not silent:
wav_root_dir = Path(wav_file).parent
generate_analysis_plots(wav_file, wav_root_dir, result)
print(f"\nValidating: {result['file']}")
if "error" in result:
print(f"Error: {result['error']}")
@ -224,10 +267,10 @@ if __name__ == "__main__":
print(f"RMS Level: {result['rms_level']}")
print(f"DC Offset: {result['dc_offset']}")
print(f"Detected Artifacts: {result['artifact_count']}")
if result["issues"]:
print("\nIssues Found:")
for issue in result["issues"]:
print(f"- {issue}")
else:
print("\nNo issues found")
print("\nNo issues found")

View file

@ -1,7 +1,9 @@
import argparse
from pathlib import Path
from validate_wav import validate_tts
def print_validation_result(result: dict, rel_path: Path):
"""Print full validation details for a single file."""
print(f"\nValidating: {rel_path}")
@ -13,7 +15,7 @@ def print_validation_result(result: dict, rel_path: Path):
print(f"Peak Amplitude: {result['peak_amplitude']}")
print(f"RMS Level: {result['rms_level']}")
print(f"DC Offset: {result['dc_offset']}")
if result["issues"]:
print("\nIssues Found:")
for issue in result["issues"]:
@ -21,25 +23,26 @@ def print_validation_result(result: dict, rel_path: Path):
else:
print("\nNo issues found")
def validate_directory(directory: str):
"""Validate all wav files in a directory with detailed output and summary."""
dir_path = Path(directory)
# Find all wav files (including nested directories)
wav_files = list(dir_path.rglob("*.wav"))
wav_files.extend(dir_path.rglob("*.mp3")) # Also check mp3s
wav_files = sorted(wav_files)
if not wav_files:
print(f"No .wav or .mp3 files found in {directory}")
return
print(f"Found {len(wav_files)} files in {directory}")
print("=" * 80)
# Store results for summary
results = []
# Detailed validation output
for wav_file in wav_files:
result = validate_tts(str(wav_file))
@ -47,7 +50,7 @@ def validate_directory(directory: str):
print_validation_result(result, rel_path)
results.append((rel_path, result))
print("=" * 80)
# Summary with detailed issues
print("\nSUMMARY:")
for rel_path, result in results:
@ -58,15 +61,18 @@ def validate_directory(directory: str):
issues = result["issues"]
first_issue = issues[0].replace("WARNING: ", "")
if len(issues) > 1:
print(f"{rel_path}: FAIL - {first_issue} (+{len(issues)-1} more issues)")
print(
f"{rel_path}: FAIL - {first_issue} (+{len(issues)-1} more issues)"
)
else:
print(f"{rel_path}: FAIL - {first_issue}")
else:
print(f"{rel_path}: PASS")
if __name__ == "__main__":
parser = argparse.ArgumentParser(description="Batch validate TTS wav files")
parser.add_argument("directory", help="Directory containing wav files to validate")
args = parser.parse_args()
validate_directory(args.directory)
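Typical invocation; the filename is hypothetical, and the script takes one positional directory argument:

```python
#   python validate_wavs.py examples/assorted_checks/benchmarks/output_audio
validate_directory("examples/assorted_checks/benchmarks/output_audio")
```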

Binary file not shown.

Binary file not shown.

Binary file not shown.

View file

@ -13,7 +13,7 @@ numpy==2.2.1
scipy==1.14.1
# Audio processing
soundfile==0.12.1
soundfile==0.13.0
# Text processing
phonemizer==3.3.0
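A quick post-upgrade sanity check (`soundfile` exposes `__version__`):

```python
import soundfile as sf
print(sf.__version__)  # expected: 0.13.0 after this bump
```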