-update soundfile version
-align with streaming standards
-add audio processing config settings
-more comprehensive model warmup
-minor model improvements
-enhance testing and benchmarking
-add cool ascii logo
remsky 2025-01-06 03:32:41 -07:00
parent 4c6cd83f85
commit 720c1fb97d
77 changed files with 2945 additions and 5522 deletions

.coverage (binary file, not shown)

View file

@@ -129,7 +129,7 @@ response = requests.post(
)
```
<p align="center">
-  <img src="examples/benchmarks/analysis_comparison.png" width="80%" alt="Voice Analysis Comparison" style="border: 2px solid #333; padding: 10px;">
+  <img src="assets/voice_analysis.png" width="80%" alt="Voice Analysis Comparison" style="border: 2px solid #333; padding: 10px;">
</p>
</details>
@@ -144,7 +144,7 @@ response = requests.post(
- pcm
<p align="center">
-  <img src="examples/benchmarks/format_comparison.png" width="80%" alt="Audio Format Comparison" style="border: 2px solid #333; padding: 10px;">
+  <img src="assets/format_comparison.png" width="80%" alt="Audio Format Comparison" style="border: 2px solid #333; padding: 10px;">
</p>
</details>
@@ -175,8 +175,8 @@ Benchmarking was performed on generation via the local API using text lengths up
- H.G. Wells - The Time Machine (full text)
<p align="center">
-  <img src="examples/benchmarks/processing_time.png" width="45%" alt="Processing Time" style="border: 2px solid #333; padding: 10px; margin-right: 1%;">
+  <img src="assets/gpu_processing_time.png" width="45%" alt="Processing Time" style="border: 2px solid #333; padding: 10px; margin-right: 1%;">
-  <img src="examples/benchmarks/realtime_factor.png" width="45%" alt="Realtime Factor" style="border: 2px solid #333; padding: 10px;">
+  <img src="assets/gpu_realtime_factor.png" width="45%" alt="Realtime Factor" style="border: 2px solid #333; padding: 10px;">
</p>
Key Performance Metrics:

View file

@@ -18,6 +18,8 @@ class Settings(BaseSettings):
    onnx_model_path: str = "kokoro-v0_19.onnx"
    voices_dir: str = "voices"
    sample_rate: int = 24000
+   max_chunk_size: int = 300  # Maximum size of text chunks for processing
+   gap_trim_ms: int = 250  # Amount to trim from streaming chunk ends in milliseconds

    # ONNX Optimization Settings
    onnx_num_threads: int = 4  # Number of threads for intra-op parallelism
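
For orientation, the new `gap_trim_ms` setting works out to a fixed sample count at the configured 24 kHz rate; a quick sketch of the arithmetic that `AudioNormalizer` (further down in this commit) performs:

```python
sample_rate = 24000  # Settings.sample_rate
gap_trim_ms = 250    # Settings.gap_trim_ms default

samples_to_trim = int(gap_trim_ms * sample_rate / 1000)
print(samples_to_trim)  # 6000 samples trimmed from the end of each non-final chunk
```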

View file

@@ -0,0 +1,9 @@
In a village of La Mancha, the name of which I have no desire to call
to mind, there lived not long since one of those gentlemen that keep a
lance in the lance-rack, an old buckler, a lean hack, and a greyhound
for coursing. An olla of rather more beef than mutton, a salad on most
nights, scraps on Saturdays, lentils on Fridays, and a pigeon or so
extra on Sundays, made away with three-quarters of his income. The rest
of it went in a doublet of fine cloth and velvet breeches and shoes to
match for holidays, while on week-days he made a brave figure in his
best homespun.

View file

@@ -22,10 +22,11 @@ async def lifespan(app: FastAPI):
    logger.info("Loading TTS model and voice packs...")

    # Initialize the main model with warm-up
-   voicepack_count = TTSModel.setup()
+   voicepack_count = await TTSModel.setup()
    # boundary = "█████╗"*9
-   boundary = "" * 30
+   boundary = "" * 24
    startup_msg =f"""
{boundary}

@@ -37,8 +38,9 @@ async def lifespan(app: FastAPI):
{boundary}
"""
-   startup_msg += f"\nModel loaded and warmed up on {TTSModel.get_device()}"
-   startup_msg += f"\n{voicepack_count} voice packs loaded successfully\n"
+   # TODO: Improve CPU warmup, threads, memory, etc
+   startup_msg += f"\nModel warmed up on {TTSModel.get_device()}"
+   startup_msg += f"\n{voicepack_count} voice packs loaded\n"
    startup_msg += f"\n{boundary}\n"
    logger.info(startup_msg)

View file

@@ -83,8 +83,8 @@ async def create_speech(
            audio,
            24000,
            request.response_format,
-           is_first_chunk=True
-       )
+           is_first_chunk=True,
+           stream=False)
        return Response(
            content=content,

View file

@@ -4,22 +4,30 @@ from io import BytesIO
import numpy as np
import soundfile as sf
+import scipy.io.wavfile as wavfile
from loguru import logger
+from ..core.config import settings


class AudioNormalizer:
    """Handles audio normalization state for a single stream"""

    def __init__(self):
        self.int16_max = np.iinfo(np.int16).max
+       self.chunk_trim_ms = settings.gap_trim_ms
+       self.sample_rate = 24000  # Sample rate of the audio
+       self.samples_to_trim = int(self.chunk_trim_ms * self.sample_rate / 1000)

-   def normalize(self, audio_data: np.ndarray) -> np.ndarray:
-       """Normalize audio data to int16 range"""
+   def normalize(self, audio_data: np.ndarray, is_last_chunk: bool = False) -> np.ndarray:
+       """Normalize audio data to int16 range and trim chunk boundaries"""
        # Convert to float32 if not already
        audio_float = audio_data.astype(np.float32)
        # Normalize to [-1, 1] range first
        if np.max(np.abs(audio_float)) > 0:
            audio_float = audio_float / np.max(np.abs(audio_float))
+       # Trim end of non-final chunks to reduce gaps
+       if not is_last_chunk and len(audio_float) > self.samples_to_trim:
+           audio_float = audio_float[:-self.samples_to_trim]
        # Scale to int16 range
        return (audio_float * self.int16_max).astype(np.int16)
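
The normalize-and-trim behavior above is easy to sanity-check in isolation; a minimal self-contained sketch, assuming the 24 kHz rate and 250 ms default from the config:

```python
import numpy as np

SAMPLE_RATE = 24000                               # Settings.sample_rate
SAMPLES_TO_TRIM = int(250 * SAMPLE_RATE / 1000)   # default gap_trim_ms

def normalize(audio: np.ndarray, is_last_chunk: bool = False) -> np.ndarray:
    audio = audio.astype(np.float32)
    peak = np.max(np.abs(audio))
    if peak > 0:
        audio = audio / peak                      # scale into [-1, 1]
    if not is_last_chunk and len(audio) > SAMPLES_TO_TRIM:
        audio = audio[:-SAMPLES_TO_TRIM]          # drop the trailing gap
    return (audio * np.iinfo(np.int16).max).astype(np.int16)

chunk = np.sin(np.linspace(0, 100, SAMPLE_RATE))  # 1 s test tone
print(len(normalize(chunk)))                      # 18000 (6000 samples trimmed)
print(len(normalize(chunk, is_last_chunk=True)))  # 24000 (final chunk untouched)
```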
@@ -27,13 +35,30 @@ class AudioNormalizer:
class AudioService:
    """Service for audio format conversions"""

+   # Default audio format settings balanced for speed and compression
+   DEFAULT_SETTINGS = {
+       "mp3": {
+           "bitrate_mode": "CONSTANT",  # Faster than variable bitrate
+           "compression_level": 0.0,  # Balanced compression
+       },
+       "opus": {
+           "compression_level": 0.0,  # Good balance for speech
+       },
+       "flac": {
+           "compression_level": 0.0,  # Light compression, still fast
+       }
+   }

    @staticmethod
    def convert_audio(
        audio_data: np.ndarray,
        sample_rate: int,
        output_format: str,
        is_first_chunk: bool = True,
-       normalizer: AudioNormalizer = None
+       is_last_chunk: bool = False,
+       normalizer: AudioNormalizer = None,
+       format_settings: dict = None,
+       stream: bool = True
    ) -> bytes:
        """Convert audio data to specified format

@@ -42,6 +67,19 @@ class AudioService:
            sample_rate: Sample rate of the audio
            output_format: Target format (wav, mp3, opus, flac, pcm)
            is_first_chunk: Whether this is the first chunk of a stream
+           normalizer: Optional AudioNormalizer instance for consistent normalization across chunks
+           format_settings: Optional dict of format-specific settings to override defaults
+               Example: {
+                   "mp3": {
+                       "bitrate_mode": "VARIABLE",
+                       "compression_level": 0.8
+                   }
+               }
+               Default settings balance speed and compression:
+               optimized for localhost @ 0.0
+               - MP3: constant bitrate, no compression (0.0)
+               - OPUS: no compression (0.0)
+               - FLAC: no compression (0.0)

        Returns:
            Bytes of the converted audio

@@ -50,31 +88,48 @@ class AudioService:
        try:
            # Always normalize audio to ensure proper amplitude scaling
-           if normalizer is None:
-               normalizer = AudioNormalizer()
-           normalized_audio = normalizer.normalize(audio_data)
+           if stream:
+               if normalizer is None:
+                   normalizer = AudioNormalizer()
+               normalized_audio = normalizer.normalize(audio_data, is_last_chunk=is_last_chunk)
+           else:
+               normalized_audio = audio_data

            if output_format == "pcm":
-               logger.info("Writing PCM data...")
                # Raw 16-bit PCM samples, no header
                buffer.write(normalized_audio.tobytes())
            elif output_format == "wav":
-               logger.info("Writing to WAV format...")
-               # Always include WAV header for WAV format
-               sf.write(buffer, normalized_audio, sample_rate, format="WAV", subtype='PCM_16')
+               if stream:
+                   # Use soundfile for streaming to ensure proper headers
+                   sf.write(buffer, normalized_audio, sample_rate, format="WAV", subtype='PCM_16')
+               else:
+                   # Trying scipy.io.wavfile for non-streaming WAV generation
+                   # seems faster than soundfile
+                   # avoids overhead from header generation and PCM encoding
+                   wavfile.write(buffer, sample_rate, normalized_audio)
            elif output_format == "mp3":
-               logger.info("Converting to MP3 format...")
-               # Use lower bitrate for streaming
-               sf.write(buffer, normalized_audio, sample_rate, format="MP3")
+               # Use format settings or defaults
+               settings = format_settings.get("mp3", {}) if format_settings else {}
+               settings = {**AudioService.DEFAULT_SETTINGS["mp3"], **settings}
+               sf.write(
+                   buffer, normalized_audio,
+                   sample_rate, format="MP3",
+                   **settings
+               )
            elif output_format == "opus":
-               logger.info("Converting to Opus format...")
-               # Use lower bitrate and smaller frame size for streaming
-               sf.write(buffer, normalized_audio, sample_rate, format="OGG", subtype="OPUS")
+               settings = format_settings.get("opus", {}) if format_settings else {}
+               settings = {**AudioService.DEFAULT_SETTINGS["opus"], **settings}
+               sf.write(buffer, normalized_audio, sample_rate, format="OGG",
+                        subtype="OPUS", **settings)
            elif output_format == "flac":
-               logger.info("Converting to FLAC format...")
-               # Use smaller block size for streaming
+               if is_first_chunk:
+                   logger.info("Starting FLAC stream...")
+               settings = format_settings.get("flac", {}) if format_settings else {}
+               settings = {**AudioService.DEFAULT_SETTINGS["flac"], **settings}
                sf.write(buffer, normalized_audio, sample_rate, format="FLAC",
-                        subtype='PCM_16')
+                        subtype='PCM_16', **settings)
            else:
                if output_format == "aac":
                    raise ValueError(
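
With the new `format_settings` parameter, callers can override the per-format defaults at conversion time. A usage sketch; the import path is an assumption based on the repo layout, and the override keys mirror the soundfile settings shown above:

```python
import numpy as np
# Hypothetical import path; adjust to wherever AudioService lives in this repo.
from api.src.services.audio import AudioService

audio = np.zeros(24000, dtype=np.float32)  # one second of silence at 24 kHz
mp3_bytes = AudioService.convert_audio(
    audio,
    24000,
    "mp3",
    format_settings={"mp3": {"bitrate_mode": "VARIABLE", "compression_level": 0.8}},
    stream=False,  # one-shot conversion: skip chunk normalization and trimming
)
```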

View file

@@ -0,0 +1,52 @@
"""Text chunking service"""

import re

from ...core.config import settings


def split_text(text: str, max_chunk=None):
    """Split text into chunks on natural pause points

    Args:
        text: Text to split into chunks
        max_chunk: Maximum chunk size (defaults to settings.max_chunk_size)
    """
    if max_chunk is None:
        max_chunk = settings.max_chunk_size

    if not isinstance(text, str):
        text = str(text) if text is not None else ""

    text = text.strip()
    if not text:
        return

    # First split into sentences
    sentences = re.split(r"(?<=[.!?])\s+", text)
    for sentence in sentences:
        sentence = sentence.strip()
        if not sentence:
            continue

        # For medium-length sentences, split on punctuation
        if len(sentence) > max_chunk:  # Lower threshold for more consistent sizes
            # First try splitting on semicolons and colons
            parts = re.split(r"(?<=[;:])\s+", sentence)
            for part in parts:
                part = part.strip()
                if not part:
                    continue

                # If part is still long, split on commas
                if len(part) > max_chunk:
                    subparts = re.split(r"(?<=,)\s+", part)
                    for subpart in subparts:
                        subpart = subpart.strip()
                        if subpart:
                            yield subpart
                else:
                    yield part
        else:
            yield sentence
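
A quick usage sketch of the generator (this mirrors the new test_chunker tests further down; passing `max_chunk` explicitly sidesteps the settings import):

```python
chunks = list(split_text("First part, second part, third part.", max_chunk=15))
print(chunks)  # ['First part,', 'second part,', 'third part.']
```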

View file

@@ -15,7 +15,7 @@ class TTSBaseModel(ABC):
    VOICES_DIR = os.path.join(os.path.dirname(os.path.dirname(__file__)), "voices")

    @classmethod
-   def setup(cls):
+   async def setup(cls):
        """Initialize model and setup voices"""
        with cls._lock:
            # Set device

@@ -59,19 +59,23 @@ class TTSBaseModel(ABC):
            except Exception as e:
                logger.error(f"Error copying voice {voice_name}: {str(e)}")

-           # Warm up with default voice
+           # Load warmup text
            try:
-               dummy_text = "Hello"
-               voice_path = os.path.join(cls.VOICES_DIR, "af.pt")
-               dummy_voicepack = torch.load(voice_path, map_location=cls._device, weights_only=True)
-               # Process text and generate audio
-               phonemes, tokens = cls.process_text(dummy_text, "a")
-               cls.generate_from_tokens(tokens, dummy_voicepack, 1.0)
-               logger.info("Model warm-up complete")
+               with open(os.path.join(os.path.dirname(os.path.dirname(__file__)), "core", "don_quixote.txt")) as f:
+                   warmup_text = f.read()
            except Exception as e:
-               logger.warning(f"Model warm-up failed: {e}")
+               logger.warning(f"Failed to load warmup text: {e}")
+               warmup_text = "This is a warmup text that will be split into chunks for processing."

+           # Use warmup service
+           from .warmup import WarmupService
+           warmup = WarmupService()

+           # Load and warm up voices
+           loaded_voices = warmup.load_voices()
+           await warmup.warmup_voices(warmup_text, loaded_voices)

+           logger.info("Model warm-up complete")

            # Count voices in directory
            voice_count = len([f for f in os.listdir(cls.VOICES_DIR) if f.endswith(".pt")])
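
Since `setup` is now a coroutine, callers such as the FastAPI lifespan above must await it. A minimal sketch of the pattern, with a hypothetical stand-in class rather than the real model:

```python
import asyncio

class FakeTTSModel:
    """Hypothetical stand-in for TTSBaseModel."""

    @classmethod
    async def setup(cls) -> int:
        await asyncio.sleep(0)  # placeholder for async warmup work
        return 3                # voice pack count

print(asyncio.run(FakeTTSModel.setup()))  # 3
```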

View file

@@ -1,6 +1,7 @@
import os
import numpy as np
import torch
+import time
from loguru import logger
from models import build_model
from .text_processing import phonemize, tokenize

@@ -8,42 +9,97 @@ from .text_processing import phonemize, tokenize
from .tts_base import TTSBaseModel
from ..core.config import settings

+# @torch.no_grad()
+# def forward(model, tokens, ref_s, speed):
+#     """Forward pass through the model"""
+#     device = ref_s.device
+#     tokens = torch.LongTensor([[0, *tokens, 0]]).to(device)
+#     input_lengths = torch.LongTensor([tokens.shape[-1]]).to(device)
+#     text_mask = length_to_mask(input_lengths).to(device)
+#     bert_dur = model.bert(tokens, attention_mask=(~text_mask).int())
+#     d_en = model.bert_encoder(bert_dur).transpose(-1, -2)
+#     s = ref_s[:, 128:]
+#     d = model.predictor.text_encoder(d_en, s, input_lengths, text_mask)
+#     x, _ = model.predictor.lstm(d)
+#     duration = model.predictor.duration_proj(x)
+#     duration = torch.sigmoid(duration).sum(axis=-1) / speed
+#     pred_dur = torch.round(duration).clamp(min=1).long()
+#     pred_aln_trg = torch.zeros(input_lengths, pred_dur.sum().item())
+#     c_frame = 0
+#     for i in range(pred_aln_trg.size(0)):
+#         pred_aln_trg[i, c_frame : c_frame + pred_dur[0, i].item()] = 1
+#         c_frame += pred_dur[0, i].item()
+#     en = d.transpose(-1, -2) @ pred_aln_trg.unsqueeze(0).to(device)
+#     F0_pred, N_pred = model.predictor.F0Ntrain(en, s)
+#     t_en = model.text_encoder(tokens, input_lengths, text_mask)
+#     asr = t_en @ pred_aln_trg.unsqueeze(0).to(device)
+#     return model.decoder(asr, F0_pred, N_pred, ref_s[:, :128]).squeeze().cpu().numpy()

@torch.no_grad()
def forward(model, tokens, ref_s, speed):
-   """Forward pass through the model"""
+   """Forward pass through the model with light optimizations that preserve output quality"""
    device = ref_s.device

+   # Keep original token handling but optimize device placement
    tokens = torch.LongTensor([[0, *tokens, 0]]).to(device)
    input_lengths = torch.LongTensor([tokens.shape[-1]]).to(device)
    text_mask = length_to_mask(input_lengths).to(device)

+   # BERT and encoder pass
    bert_dur = model.bert(tokens, attention_mask=(~text_mask).int())
    d_en = model.bert_encoder(bert_dur).transpose(-1, -2)
-   s = ref_s[:, 128:]
-   d = model.predictor.text_encoder(d_en, s, input_lengths, text_mask)
+
+   # Split reference signal once for efficiency
+   s_content = ref_s[:, 128:]
+   s_ref = ref_s[:, :128]
+
+   # Predictor forward pass
+   d = model.predictor.text_encoder(d_en, s_content, input_lengths, text_mask)
    x, _ = model.predictor.lstm(d)
+
+   # Duration prediction - keeping original logic
    duration = model.predictor.duration_proj(x)
    duration = torch.sigmoid(duration).sum(axis=-1) / speed
    pred_dur = torch.round(duration).clamp(min=1).long()
-   pred_aln_trg = torch.zeros(input_lengths, pred_dur.sum().item())
+
+   # Alignment matrix construction - keeping original approach for quality
+   pred_aln_trg = torch.zeros(input_lengths, pred_dur.sum().item(), device=device)
    c_frame = 0
    for i in range(pred_aln_trg.size(0)):
-       pred_aln_trg[i, c_frame : c_frame + pred_dur[0, i].item()] = 1
+       pred_aln_trg[i, c_frame:c_frame + pred_dur[0, i].item()] = 1
        c_frame += pred_dur[0, i].item()
-   en = d.transpose(-1, -2) @ pred_aln_trg.unsqueeze(0).to(device)
-   F0_pred, N_pred = model.predictor.F0Ntrain(en, s)
+
+   # Matrix multiplications - reuse unsqueezed tensor
+   pred_aln_trg = pred_aln_trg.unsqueeze(0)  # Do unsqueeze once
+   en = d.transpose(-1, -2) @ pred_aln_trg
+   F0_pred, N_pred = model.predictor.F0Ntrain(en, s_content)
+
+   # Text encoding and final decoding
    t_en = model.text_encoder(tokens, input_lengths, text_mask)
-   asr = t_en @ pred_aln_trg.unsqueeze(0).to(device)
-   return model.decoder(asr, F0_pred, N_pred, ref_s[:, :128]).squeeze().cpu().numpy()
+   asr = t_en @ pred_aln_trg
+
+   return model.decoder(asr, F0_pred, N_pred, s_ref).squeeze().cpu().numpy()
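
The alignment-matrix construction is the one step deliberately kept in its original form; a standalone sketch with dummy durations shows the hard monotonic alignment it builds:

```python
import torch

pred_dur = torch.tensor([[2, 3, 1]])  # hypothetical frames-per-token durations
n_tokens, n_frames = pred_dur.shape[-1], pred_dur.sum().item()

pred_aln_trg = torch.zeros(n_tokens, n_frames)
c_frame = 0
for i in range(n_tokens):
    pred_aln_trg[i, c_frame:c_frame + pred_dur[0, i].item()] = 1  # token i owns its frames
    c_frame += pred_dur[0, i].item()

print(pred_aln_trg)
# tensor([[1., 1., 0., 0., 0., 0.],
#         [0., 0., 1., 1., 1., 0.],
#         [0., 0., 0., 0., 0., 1.]])
```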
+# def length_to_mask(lengths):
+#     """Create attention mask from lengths"""
+#     mask = (
+#         torch.arange(lengths.max())
+#         .unsqueeze(0)
+#         .expand(lengths.shape[0], -1)
+#         .type_as(lengths)
+#     )
+#     mask = torch.gt(mask + 1, lengths.unsqueeze(1))
+#     return mask


def length_to_mask(lengths):
-   """Create attention mask from lengths"""
-   mask = (
-       torch.arange(lengths.max())
-       .unsqueeze(0)
-       .expand(lengths.shape[0], -1)
-       .type_as(lengths)
-   )
-   mask = torch.gt(mask + 1, lengths.unsqueeze(1))
-   return mask
+   """Create attention mask from lengths - possibly optimized version"""
+   max_len = lengths.max()
+   # Create mask directly on the same device as lengths
+   mask = torch.arange(max_len, device=lengths.device)[None, :].expand(lengths.shape[0], -1)
+   # Avoid type_as by using the correct dtype from the start
+   if lengths.dtype != mask.dtype:
+       mask = mask.to(dtype=lengths.dtype)
+   # Fuse operations using broadcasting
+   return mask + 1 > lengths[:, None]
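
Both versions produce the same boolean mask; a quick self-contained check (runnable on CPU):

```python
import torch

def length_to_mask(lengths):
    max_len = lengths.max()
    mask = torch.arange(max_len, device=lengths.device)[None, :].expand(lengths.shape[0], -1)
    if lengths.dtype != mask.dtype:
        mask = mask.to(dtype=lengths.dtype)
    return mask + 1 > lengths[:, None]

print(length_to_mask(torch.tensor([2, 4])))
# tensor([[False, False,  True,  True],
#         [False, False, False, False]])
```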

class TTSGPUModel(TTSBaseModel):
    _instance = None

View file

@@ -8,7 +8,7 @@ from functools import lru_cache
import numpy as np
import torch
import scipy.io.wavfile as wavfile
-from .text_processing import normalize_text
+from .text_processing import normalize_text, chunker
from loguru import logger
from ..core.config import settings
@@ -20,40 +20,6 @@ class TTSService:
    def __init__(self, output_dir: str = None):
        self.output_dir = output_dir

-   def _split_text(self, text: str):
-       """Generate text chunks one at a time, splitting on natural pause points"""
-       if not isinstance(text, str):
-           text = str(text) if text is not None else ""
-       # First split into sentences
-       sentences = re.split(r"(?<=[.!?])\s+", text)
-       for sentence in sentences:
-           sentence = sentence.strip()
-           if not sentence:
-               continue
-           # For longer sentences, split on commas and semicolons
-           if len(sentence) > 300:  # Only split long sentences
-               # Split on pause points while preserving the punctuation
-               chunks = re.split(r"((?<=[,;])\s+)", sentence)
-               # Reassemble chunks with their trailing punctuation
-               current_chunk = ""
-               for i, chunk in enumerate(chunks):
-                   if i % 2 == 0:  # Text chunk
-                       current_chunk += chunk
-                   else:  # Punctuation/whitespace chunk
-                       current_chunk += chunk
-                       if current_chunk.strip():
-                           yield current_chunk.strip()
-                           current_chunk = ""
-               # Yield any remaining text
-               if current_chunk.strip():
-                   yield current_chunk.strip()
-           else:
-               yield sentence

    @staticmethod
    @lru_cache(maxsize=20)  # Cache up to 8 most recently used voices
@@ -96,28 +62,32 @@ class TTSService:
            # Load voice using cached loader
            voicepack = self._load_voice(voice_path)

-           # Generate audio with or without stitching
+           # For non-streaming, preprocess all chunks first
            if stitch_long_output:
-               audio_chunks = []
-               chunk_count = 0
-
-               # Process chunks as they're generated
-               for chunk in self._split_text(text):
+               # Preprocess all chunks to phonemes/tokens
+               chunks_data = []
+               for chunk in chunker.split_text(text):
                    try:
-                       # Process text and generate audio
                        phonemes, tokens = TTSModel.process_text(chunk, voice[0])
+                       chunks_data.append((chunk, tokens))
+                   except Exception as e:
+                       logger.error(f"Failed to process chunk: '{chunk}'. Error: {str(e)}")
+                       continue
+
+               if not chunks_data:
+                   raise ValueError("No chunks were processed successfully")
+
+               # Generate audio for all chunks
+               audio_chunks = []
+               for chunk, tokens in chunks_data:
+                   try:
                        chunk_audio = TTSModel.generate_from_tokens(tokens, voicepack, speed)
                        if chunk_audio is not None:
                            audio_chunks.append(chunk_audio)
-                           chunk_count += 1
                        else:
-                           logger.error(f"No audio generated for chunk {chunk_count + 1}")
+                           logger.error(f"No audio generated for chunk: '{chunk}'")
                    except Exception as e:
-                       logger.error(
-                           f"Failed to generate audio for chunk {chunk_count + 1}: '{chunk}'. Error: {str(e)}"
-                       )
+                       logger.error(f"Failed to generate audio for chunk: '{chunk}'. Error: {str(e)}")
                        continue

            if not audio_chunks:
@@ -138,53 +108,93 @@ class TTSService:
            raise

    async def generate_audio_stream(
-       self, text: str, voice: str, speed: float, output_format: str = "wav"
+       self, text: str, voice: str, speed: float, output_format: str = "wav", silent=False
    ):
        """Generate and yield audio chunks as they're generated for real-time streaming"""
        try:
+           stream_start = time.time()
            # Create normalizer for consistent audio levels
            stream_normalizer = AudioNormalizer()

            # Input validation and preprocessing
            if not text:
                raise ValueError("Text is empty")
+           preprocess_start = time.time()
            normalized = normalize_text(text)
            if not normalized:
                raise ValueError("Text is empty after preprocessing")
            text = str(normalized)
+           logger.debug(f"Text preprocessing took: {(time.time() - preprocess_start)*1000:.1f}ms")

            # Voice validation and loading
+           voice_start = time.time()
            voice_path = self._get_voice_path(voice)
            if not voice_path:
                raise ValueError(f"Voice not found: {voice}")
            voicepack = self._load_voice(voice_path)
+           logger.debug(f"Voice loading took: {(time.time() - voice_start)*1000:.1f}ms")

            # Process chunks as they're generated
            is_first = True
-           for chunk in self._split_text(text):
+           chunks_processed = 0
+           # last_chunk_end = time.time()
+
+           # Process chunks as they come from generator
+           chunk_gen = chunker.split_text(text)
+           current_chunk = next(chunk_gen, None)
+
+           while current_chunk is not None:
+               next_chunk = next(chunk_gen, None)  # Peek at next chunk
+               # chunk_start = time.time()
+               chunks_processed += 1
                try:
                    # Process text and generate audio
-                   phonemes, tokens = TTSModel.process_text(chunk, voice[0])
+                   # text_process_start = time.time()
+                   phonemes, tokens = TTSModel.process_text(current_chunk, voice[0])
+                   # text_process_time = time.time() - text_process_start
+
+                   # audio_gen_start = time.time()
                    chunk_audio = TTSModel.generate_from_tokens(tokens, voicepack, speed)
+                   # audio_gen_time = time.time() - audio_gen_start

                    if chunk_audio is not None:
                        # Convert chunk with proper header handling
+                       convert_start = time.time()
                        chunk_bytes = AudioService.convert_audio(
                            chunk_audio,
                            24000,
                            output_format,
                            is_first_chunk=is_first,
-                           normalizer=stream_normalizer
+                           normalizer=stream_normalizer,
+                           is_last_chunk=(next_chunk is None)  # Last if no next chunk
                        )
+                       # convert_time = time.time() - convert_start
+
+                       # Calculate gap from last chunk
+                       # gap_time = chunk_start - last_chunk_end
+
+                       # Log timing details if not silent
+                       # if not silent:
+                       #     logger.debug(
+                       #         f"\nChunk {chunks_processed} timing:"
+                       #         f"\n  Gap from last chunk: {gap_time*1000:.1f}ms"
+                       #         f"\n  Text processing: {text_process_time*1000:.1f}ms"
+                       #         f"\n  Audio generation: {audio_gen_time*1000:.1f}ms"
+                       #         f"\n  Audio conversion: {convert_time*1000:.1f}ms"
+                       #         f"\n  Total chunk time: {(time.time() - chunk_start)*1000:.1f}ms"
+                       #     )

                        yield chunk_bytes
                        is_first = False
+                       # last_chunk_end = time.time()
                    else:
-                       logger.error(f"No audio generated for chunk: '{chunk}'")
+                       logger.error(f"No audio generated for chunk: '{current_chunk}'")
                except Exception as e:
-                   logger.error(f"Failed to generate audio for chunk: '{chunk}'. Error: {str(e)}")
-                   continue
+                   logger.error(f"Failed to generate audio for chunk: '{current_chunk}'. Error: {str(e)}")
+
+               current_chunk = next_chunk  # Move to next chunk

        except Exception as e:
            logger.error(f"Error in audio generation stream: {str(e)}")
            raise
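
The look-ahead pattern above (fetching the next chunk before converting the current one, so the final chunk can be flagged and skip end-trimming) works with any generator; a minimal standalone sketch:

```python
def chunks():
    yield from ["first.", "second.", "third."]

gen = chunks()
current = next(gen, None)
while current is not None:
    nxt = next(gen, None)       # peek ahead without losing the value
    is_last = nxt is None       # only the final chunk keeps its trailing audio
    print(current, "(last)" if is_last else "")
    current = nxt
```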

View file

@@ -0,0 +1,52 @@
import os
from typing import List, Tuple

import torch
from loguru import logger

from .tts_service import TTSService
from .tts_model import TTSModel


class WarmupService:
    """Service for warming up TTS models and voice caches"""

    def __init__(self):
        self.tts_service = TTSService()

    def load_voices(self) -> List[Tuple[str, torch.Tensor]]:
        """Load and cache voices up to LRU limit"""
        # Get all voices sorted by filename length (shorter names first, usually base voices)
        voice_files = sorted(
            [f for f in os.listdir(TTSModel.VOICES_DIR) if f.endswith(".pt")],
            key=len
        )

        # Load up to LRU cache limit (20)
        loaded_voices = []
        for voice_file in voice_files[:20]:
            try:
                voice_path = os.path.join(TTSModel.VOICES_DIR, voice_file)
                voicepack = torch.load(voice_path, map_location=TTSModel.get_device(), weights_only=True)
                loaded_voices.append((voice_file[:-3], voicepack))  # Store name and tensor
                # logger.info(f"Loaded voice {voice_file[:-3]} into cache")
            except Exception as e:
                logger.error(f"Failed to load voice {voice_file}: {e}")
        logger.info(f"Pre-loaded {len(loaded_voices)} voices into cache")
        return loaded_voices

    async def warmup_voices(self, warmup_text: str, loaded_voices: List[Tuple[str, torch.Tensor]]):
        """Warm up voice inference and streaming"""
        n_warmups = 1
        for voice_name, _ in loaded_voices[:n_warmups]:
            try:
                logger.info(f"Running warmup inference on voice {voice_name}")
                async for _ in self.tts_service.generate_audio_stream(
                    warmup_text,
                    voice_name,
                    1.0,
                    "pcm"
                ):
                    pass  # Process all chunks to properly warm up
                logger.info(f"Completed warmup for voice {voice_name}")
            except Exception as e:
                logger.warning(f"Warmup failed for voice {voice_name}: {e}")

api/tests/test_chunker.py (new file, 35 lines)
View file

@@ -0,0 +1,35 @@
"""Tests for text chunking service"""

import pytest
from api.src.services.text_processing import chunker


def test_split_text():
    """Test text splitting into sentences"""
    text = "First sentence. Second sentence! Third sentence?"
    sentences = list(chunker.split_text(text))
    assert len(sentences) == 3
    assert sentences[0] == "First sentence."
    assert sentences[1] == "Second sentence!"
    assert sentences[2] == "Third sentence?"


def test_split_text_empty():
    """Test splitting empty text"""
    assert list(chunker.split_text("")) == []


def test_split_text_single_sentence():
    """Test splitting single sentence"""
    text = "Just one sentence."
    assert list(chunker.split_text(text)) == ["Just one sentence."]


def test_split_text_with_custom_chunk_size():
    """Test splitting with custom max chunk size"""
    text = "First part, second part, third part."
    chunks = list(chunker.split_text(text, max_chunk=15))
    assert len(chunks) == 3
    assert chunks[0] == "First part,"
    assert chunks[1] == "second part,"
    assert chunks[2] == "third part."

View file

@@ -1,7 +1,8 @@
-from unittest.mock import Mock
+from unittest.mock import Mock, AsyncMock
import pytest
import pytest_asyncio
+import asyncio
from fastapi.testclient import TestClient
from httpx import AsyncClient

@@ -22,6 +23,12 @@ async def async_client():
def mock_tts_service(monkeypatch):
    mock_service = Mock()
    mock_service._generate_audio.return_value = (bytes([0, 1, 2, 3]), 1.0)
+
+   # Create proper async generator mock
+   async def mock_stream(*args, **kwargs):
+       for chunk in [b"chunk1", b"chunk2"]:
+           yield chunk
+   mock_service.generate_audio_stream = mock_stream
    mock_service.list_voices.return_value = [
        "af",
        "bm_lewis",
@@ -65,6 +72,7 @@ def test_openai_speech_endpoint(mock_tts_service, mock_audio_service):
        "voice": "bm_lewis",
        "response_format": "wav",
        "speed": 1.0,
+       "stream": False  # Explicitly disable streaming
    }
    response = client.post("/v1/audio/speech", json=test_request)
    assert response.status_code == 200

@@ -84,6 +92,7 @@ def test_openai_speech_invalid_voice(mock_tts_service):
        "voice": "invalid_voice",
        "response_format": "wav",
        "speed": 1.0,
+       "stream": False  # Explicitly disable streaming
    }
    response = client.post("/v1/audio/speech", json=test_request)
    assert response.status_code == 400  # Bad request

@@ -98,6 +107,7 @@ def test_openai_speech_invalid_speed(mock_tts_service):
        "voice": "af",
        "response_format": "wav",
        "speed": -1.0,  # Invalid speed
+       "stream": False  # Explicitly disable streaming
    }
    response = client.post("/v1/audio/speech", json=test_request)
    assert response.status_code == 422  # Validation error

@@ -112,6 +122,7 @@ def test_openai_speech_generation_error(mock_tts_service):
        "voice": "af",
        "response_format": "wav",
        "speed": 1.0,
+       "stream": False  # Explicitly disable streaming
    }
    response = client.post("/v1/audio/speech", json=test_request)
    assert response.status_code == 500
@@ -171,13 +182,14 @@ async def test_openai_speech_pcm_streaming(mock_tts_service, async_client):
        "input": "Hello world",
        "voice": "af",
        "response_format": "pcm",
+       "stream": True
    }

-   # Mock streaming response
-   async def mock_stream():
-       yield b"chunk1"
-       yield b"chunk2"
-   mock_tts_service.generate_audio_stream.return_value = mock_stream()
+   # Create streaming mock for this test
+   async def mock_stream(*args, **kwargs):
+       for chunk in [b"chunk1", b"chunk2"]:
+           yield chunk
+   mock_tts_service.generate_audio_stream = mock_stream

    # Add streaming header
    headers = {"x-raw-response": "stream"}

@@ -198,13 +210,14 @@ async def test_openai_speech_streaming_mp3(mock_tts_service, async_client):
        "input": "Hello world",
        "voice": "af",
        "response_format": "mp3",
+       "stream": True
    }

-   # Mock streaming response
-   async def mock_stream():
-       yield b"mp3header"
-       yield b"mp3data"
-   mock_tts_service.generate_audio_stream.return_value = mock_stream()
+   # Create streaming mock for this test
+   async def mock_stream(*args, **kwargs):
+       for chunk in [b"mp3header", b"mp3data"]:
+           yield chunk
+   mock_tts_service.generate_audio_stream = mock_stream

    # Add streaming header
    headers = {"x-raw-response": "stream"}

@@ -227,14 +240,14 @@ async def test_openai_speech_streaming_generator(mock_tts_service, async_client):
        "input": "Hello world",
        "voice": "af",
        "response_format": "pcm",
+       "stream": True
    }

-   # Mock streaming response
-   async def mock_stream():
-       yield b"chunk1"
-       yield b"chunk2"
-   mock_tts_service.generate_audio_stream.return_value = mock_stream()
+   # Create streaming mock for this test
+   async def mock_stream(*args, **kwargs):
+       for chunk in [b"chunk1", b"chunk2"]:
+           yield chunk
+   mock_tts_service.generate_audio_stream = mock_stream

    # Add streaming header
    headers = {"x-raw-response": "stream"}

View file

@@ -28,29 +28,34 @@ async def test_lifespan_successful_warmup(mock_logger, mock_tts_model):
    """Test successful model warmup in lifespan"""
    # Mock file system for voice counting
    mock_tts_model.VOICES_DIR = "/mock/voices"
-   with patch("os.listdir", return_value=["voice1.pt", "voice2.pt", "voice3.pt"]):
-       mock_tts_model.setup.return_value = 3  # 3 voice files
-       mock_tts_model.get_device.return_value = "cuda"
-
-       # Create an async generator from the lifespan context manager
-       async_gen = lifespan(MagicMock())
-       # Start the context manager
-       await async_gen.__aenter__()
-
-       # Verify the expected logging sequence
-       mock_logger.info.assert_any_call("Loading TTS model and voice packs...")
-
-       # Check for the startup message containing the required info
-       startup_calls = [call[0][0] for call in mock_logger.info.call_args_list]
-       startup_msg = next(msg for msg in startup_calls if "Model loaded and warmed up on" in msg)
-       assert "Model loaded and warmed up on cuda" in startup_msg
-       assert "3 voice packs loaded successfully" in startup_msg
-
-       # Verify model setup was called
-       mock_tts_model.setup.assert_called_once()
-
-   # Clean up
-   await async_gen.__aexit__(None, None, None)
+
+   # Create async mock
+   async def async_setup():
+       return 3
+   mock_tts_model.setup = MagicMock()
+   mock_tts_model.setup.side_effect = async_setup
+   mock_tts_model.get_device.return_value = "cuda"
+
+   with patch("os.listdir", return_value=["voice1.pt", "voice2.pt", "voice3.pt"]):
+       # Create an async generator from the lifespan context manager
+       async_gen = lifespan(MagicMock())
+       # Start the context manager
+       await async_gen.__aenter__()
+
+       # Verify the expected logging sequence
+       mock_logger.info.assert_any_call("Loading TTS model and voice packs...")
+
+       # Check for the startup message containing the required info
+       startup_calls = [call[0][0] for call in mock_logger.info.call_args_list]
+       startup_msg = next(msg for msg in startup_calls if "Model warmed up on" in msg)
+       assert "Model warmed up on" in startup_msg
+       assert "3 voice packs loaded" in startup_msg
+
+       # Verify model setup was called
+       mock_tts_model.setup.assert_called_once()
+
+       # Clean up
+       await async_gen.__aexit__(None, None, None)
@pytest.mark.asyncio @pytest.mark.asyncio
@@ -81,39 +86,21 @@ async def test_lifespan_cuda_warmup(mock_tts_model):
    """Test model warmup specifically on CUDA"""
    # Mock file system for voice counting
    mock_tts_model.VOICES_DIR = "/mock/voices"
+
+   # Create async mock
+   async def async_setup():
+       return 2
+   mock_tts_model.setup = MagicMock()
+   mock_tts_model.setup.side_effect = async_setup
+   mock_tts_model.get_device.return_value = "cuda"
+
    with patch("os.listdir", return_value=["voice1.pt", "voice2.pt"]):
-       mock_tts_model.setup.return_value = 2  # 2 voice files
-       mock_tts_model.get_device.return_value = "cuda"
-
        # Create an async generator from the lifespan context manager
        async_gen = lifespan(MagicMock())
        await async_gen.__aenter__()

        # Verify model setup was called
        mock_tts_model.setup.assert_called_once()

        # Clean up
        await async_gen.__aexit__(None, None, None)

-
-@pytest.mark.asyncio
-@patch("api.src.main.TTSModel")
-async def test_lifespan_cpu_fallback(mock_tts_model):
-   """Test model warmup falling back to CPU"""
-   # Mock file system for voice counting
-   mock_tts_model.VOICES_DIR = "/mock/voices"
-   with patch(
-       "os.listdir", return_value=["voice1.pt", "voice2.pt", "voice3.pt", "voice4.pt"]
-   ):
-       mock_tts_model.setup.return_value = 4  # 4 voice files
-       mock_tts_model.get_device.return_value = "cpu"
-       # Create an async generator from the lifespan context manager
-       async_gen = lifespan(MagicMock())
-       await async_gen.__aenter__()
-       # Verify model setup was called
-       mock_tts_model.setup.assert_called_once()
-       # Clean up
-       await async_gen.__aexit__(None, None, None)

View file

@@ -16,13 +16,14 @@ def test_get_device_error():
    with pytest.raises(RuntimeError, match="Model not initialized"):
        TTSBaseModel.get_device()

+@pytest.mark.asyncio
@patch('torch.cuda.is_available')
@patch('os.path.exists')
@patch('os.path.join')
@patch('os.listdir')
@patch('torch.load')
@patch('torch.save')
-def test_setup_cuda_available(mock_save, mock_load, mock_listdir, mock_join, mock_exists, mock_cuda_available):
+async def test_setup_cuda_available(mock_save, mock_load, mock_listdir, mock_join, mock_exists, mock_cuda_available):
    """Test setup with CUDA available"""
    TTSBaseModel._device = None
    mock_cuda_available.return_value = True

@@ -36,17 +37,18 @@ def test_setup_cuda_available(mock_save, mock_load, mock_listdir, mock_join, mock_exists, mock_cuda_available):
    TTSBaseModel.process_text = MagicMock(return_value=("dummy", [1,2,3]))
    TTSBaseModel.generate_from_tokens = MagicMock(return_value=np.zeros(1000))

-   voice_count = TTSBaseModel.setup()
+   voice_count = await TTSBaseModel.setup()
    assert TTSBaseModel._device == "cuda"
    assert voice_count == 2

+@pytest.mark.asyncio
@patch('torch.cuda.is_available')
@patch('os.path.exists')
@patch('os.path.join')
@patch('os.listdir')
@patch('torch.load')
@patch('torch.save')
-def test_setup_cuda_unavailable(mock_save, mock_load, mock_listdir, mock_join, mock_exists, mock_cuda_available):
+async def test_setup_cuda_unavailable(mock_save, mock_load, mock_listdir, mock_join, mock_exists, mock_cuda_available):
    """Test setup with CUDA unavailable"""
    TTSBaseModel._device = None
    mock_cuda_available.return_value = False

@@ -60,7 +62,7 @@ def test_setup_cuda_unavailable(mock_save, mock_load, mock_listdir, mock_join, mock_exists, mock_cuda_available):
    TTSBaseModel.process_text = MagicMock(return_value=("dummy", [1,2,3]))
    TTSBaseModel.generate_from_tokens = MagicMock(return_value=np.zeros(1000))

-   voice_count = TTSBaseModel.setup()
+   voice_count = await TTSBaseModel.setup()
    assert TTSBaseModel._device == "cpu"
    assert voice_count == 2

View file

@@ -31,27 +31,6 @@ def sample_audio():
    return np.sin(2 * np.pi * frequency * t).astype(np.float32)

-def test_split_text(tts_service):
-   """Test text splitting into sentences"""
-   text = "First sentence. Second sentence! Third sentence?"
-   sentences = tts_service._split_text(text)
-   assert len(sentences) == 3
-   assert sentences[0] == "First sentence."
-   assert sentences[1] == "Second sentence!"
-   assert sentences[2] == "Third sentence?"

-def test_split_text_empty(tts_service):
-   """Test splitting empty text"""
-   assert tts_service._split_text("") == []

-def test_split_text_single_sentence(tts_service):
-   """Test splitting single sentence"""
-   text = "Just one sentence."
-   assert tts_service._split_text(text) == ["Just one sentence."]

def test_audio_to_bytes(tts_service, sample_audio):
    """Test converting audio tensor to bytes"""
    audio_bytes = tts_service._audio_to_bytes(sample_audio)

@@ -152,7 +131,7 @@ def test_generate_audio_phonemize_error(
    mock_torch_load.return_value = torch.zeros((10, 24000))
    mock_generate.return_value = (None, None)

-   with pytest.raises(ValueError, match="No audio chunks were generated successfully"):
+   with pytest.raises(ValueError, match="No chunks were processed successfully"):
        tts_service._generate_audio("Test text", "af", 1.0)

@@ -185,7 +164,7 @@ def test_generate_audio_error(
    mock_exists.return_value = True
    mock_torch_load.return_value = torch.zeros((10, 24000))

-   with pytest.raises(ValueError, match="No audio chunks were generated successfully"):
+   with pytest.raises(ValueError, match="No chunks were processed successfully"):
        tts_service._generate_audio("Test text", "af", 1.0)

[Ten new binary image assets added, including assets/voice_analysis.png (958 KiB) and nine benchmark plot images (~234-775 KiB each); binary files not shown.]

View file

@@ -43,6 +43,7 @@ services:
      - ONNX_OPTIMIZATION_LEVEL=all
      - ONNX_MEMORY_PATTERN=true
      - ONNX_ARENA_EXTEND_STRATEGY=kNextPowerOfTwo
    depends_on:
      model-fetcher:
        condition: service_healthy

View file

@@ -2,7 +2,7 @@ services:
  model-fetcher:
    image: datamachines/git-lfs:latest
    environment:
-     - SKIP_MODEL_FETCH=${SKIP_MODEL_FETCH:-true}
+     - SKIP_MODEL_FETCH=${SKIP_MODEL_FETCH:-false}
    volumes:
      - ./Kokoro-82M:/app/Kokoro-82M
    working_dir: /app/Kokoro-82M

@@ -32,10 +32,10 @@ services:
        start_period: 1s

  kokoro-tts:
-   image: ghcr.io/remsky/kokoro-fastapi:latest
+   # image: ghcr.io/remsky/kokoro-fastapi:latest
    # Uncomment below to build from source instead of using the released image
-   # build:
-   #   context: .
+   build:
+     context: .
    volumes:
      - ./api/src:/app/api/src
      - ./Kokoro-82M:/app/Kokoro-82M

@@ -54,14 +54,14 @@ services:
      model-fetcher:
        condition: service_healthy

-  # # Gradio UI service [Comment out everything below if you don't need it]
-  # gradio-ui:
-  #   build:
-  #     context: ./ui
-  #   ports:
-  #     - "7860:7860"
-  #   volumes:
-  #     - ./ui/data:/app/ui/data
-  #     - ./ui/app.py:/app/app.py  # Mount app.py for hot reload
-  #   environment:
-  #     - GRADIO_WATCH=True  # Enable hot reloading
+  # Gradio UI service [Comment out everything below if you don't need it]
+  gradio-ui:
+    build:
+      context: ./ui
+    ports:
+      - "7860:7860"
+    volumes:
+      - ./ui/data:/app/ui/data
+      - ./ui/app.py:/app/app.py  # Mount app.py for hot reload
+    environment:
+      - GRADIO_WATCH=True  # Enable hot reloading

View file

@@ -1,15 +1,19 @@
#!/usr/bin/env python3
import os
-import time
import json
-import numpy as np
-import requests
-import pandas as pd
-from lib.shared_benchmark_utils import get_text_for_tokens, enc
-from lib.shared_utils import save_json_results
-from lib.shared_plotting import plot_correlation, plot_timeline
-def measure_first_token(text: str, output_dir: str, tokens: int, run_number: int) -> dict:
+import time
+
+import numpy as np
+import pandas as pd
+import requests
+from lib.shared_utils import save_json_results
+from lib.shared_plotting import plot_timeline, plot_correlation
+from lib.shared_benchmark_utils import enc, get_text_for_tokens
+
+
+def measure_first_token(
+    text: str, output_dir: str, tokens: int, run_number: int
+) -> dict:
    """Measure time to audio via API calls and save the audio output"""
    results = {
        "text_length": len(text),
@@ -18,12 +22,12 @@ def measure_first_token(
        "time_to_first_chunk": None,
        "error": None,
        "audio_path": None,
-       "audio_length": None  # Length of output audio in seconds
+       "audio_length": None,  # Length of output audio in seconds
    }

    try:
        start_time = time.time()

        # Make request without streaming
        response = requests.post(
            "http://localhost:8880/v1/audio/speech",
@@ -32,58 +36,62 @@ def measure_first_token(
                "input": text,
                "voice": "af",
                "response_format": "wav",
-               "stream": False
+               "stream": False,
            },
-           timeout=1800
+           timeout=1800,
        )
        response.raise_for_status()

        # Save complete audio
        audio_filename = f"benchmark_tokens{tokens}_run{run_number}.wav"
        audio_path = os.path.join(output_dir, audio_filename)
        results["audio_path"] = audio_path

        content = response.content
-       with open(audio_path, 'wb') as f:
+       with open(audio_path, "wb") as f:
            f.write(content)

        # Calculate audio length using scipy
        import scipy.io.wavfile as wavfile
        sample_rate, audio_data = wavfile.read(audio_path)
        results["audio_length"] = len(audio_data) / sample_rate  # Length in seconds
        results["time_to_first_chunk"] = time.time() - start_time

        results["total_time"] = time.time() - start_time
        return results

    except Exception as e:
        results["error"] = str(e)
        return results


def main():
    # Set up paths
    script_dir = os.path.dirname(os.path.abspath(__file__))
    output_dir = os.path.join(script_dir, "output_audio")
    output_data_dir = os.path.join(script_dir, "output_data")

    # Create output directories
    os.makedirs(output_dir, exist_ok=True)
    os.makedirs(output_data_dir, exist_ok=True)

    # Load sample text
-   with open(os.path.join(script_dir, "the_time_machine_hg_wells.txt"), "r", encoding="utf-8") as f:
+   with open(
+       os.path.join(script_dir, "the_time_machine_hg_wells.txt"), "r", encoding="utf-8"
+   ) as f:
        text = f.read()

    # Test specific token counts
    token_sizes = [10, 25, 50, 100, 200, 500]
    all_results = []

    for tokens in token_sizes:
        print(f"\nTesting {tokens} tokens")
        test_text = get_text_for_tokens(text, tokens)
        actual_tokens = len(enc.encode(test_text))
        print(f"Text preview: {test_text[:50]}...")

        # Run test 3 times for each size to get average
        for i in range(5):
            print(f"Run {i+1}/3...")
@@ -91,67 +99,74 @@ def main():
            result["target_tokens"] = tokens
            result["actual_tokens"] = actual_tokens
            result["run_number"] = i + 1

            print(f"Time to Audio: {result.get('time_to_first_chunk', 'N/A'):.3f}s")
            print(f"Total time: {result.get('total_time', 'N/A'):.3f}s")

            if result["error"]:
                print(f"Error: {result['error']}")

            all_results.append(result)

    # Calculate averages per token size
    summary = {}
    for tokens in token_sizes:
-       matching_results = [r for r in all_results if r["target_tokens"] == tokens and not r["error"]]
+       matching_results = [
+           r for r in all_results if r["target_tokens"] == tokens and not r["error"]
+       ]
        if matching_results:
-           avg_first_chunk = sum(r["time_to_first_chunk"] for r in matching_results) / len(matching_results)
-           avg_total = sum(r["total_time"] for r in matching_results) / len(matching_results)
-           avg_audio_length = sum(r["audio_length"] for r in matching_results) / len(matching_results)
+           avg_first_chunk = sum(
+               r["time_to_first_chunk"] for r in matching_results
+           ) / len(matching_results)
+           avg_total = sum(r["total_time"] for r in matching_results) / len(
+               matching_results
+           )
+           avg_audio_length = sum(r["audio_length"] for r in matching_results) / len(
+               matching_results
+           )
            summary[tokens] = {
                "avg_time_to_first_chunk": round(avg_first_chunk, 3),
                "avg_total_time": round(avg_total, 3),
                "avg_audio_length": round(avg_audio_length, 3),
-               "num_successful_runs": len(matching_results)
+               "num_successful_runs": len(matching_results),
            }

    # Save results
    # Save results
    results_data = {
        "individual_runs": all_results,
        "summary": summary,
-       "timestamp": time.strftime("%Y-%m-%d %H:%M:%S")
+       "timestamp": time.strftime("%Y-%m-%d %H:%M:%S"),
    }
    save_json_results(
-       results_data,
-       os.path.join(output_data_dir, "first_token_benchmark.json")
+       results_data, os.path.join(output_data_dir, "first_token_benchmark.json")
    )

    # Create plot directory if it doesn't exist
    output_plots_dir = os.path.join(script_dir, "output_plots")
    os.makedirs(output_plots_dir, exist_ok=True)

    # Create DataFrame for plotting
    df = pd.DataFrame(all_results)

    # Create both plots
    plot_correlation(
-       df, "target_tokens", "time_to_first_chunk",
+       df,
+       "target_tokens",
+       "time_to_first_chunk",
        "Time to Audio vs Input Size",
        "Number of Input Tokens",
        "Time to Audio (seconds)",
-       os.path.join(output_plots_dir, "first_token_latency.png")
+       os.path.join(output_plots_dir, "first_token_latency.png"),
    )

-   plot_timeline(
-       df,
-       os.path.join(output_plots_dir, "first_token_timeline.png")
-   )
+   plot_timeline(df, os.path.join(output_plots_dir, "first_token_timeline.png"))

    print("\nResults and plots saved to:")
    print(f"- {os.path.join(output_data_dir, 'first_token_benchmark.json')}")
    print(f"- {os.path.join(output_plots_dir, 'first_token_latency.png')}")
    print(f"- {os.path.join(output_plots_dir, 'first_token_timeline.png')}")


if __name__ == "__main__":
    main()

View file

@ -1,193 +0,0 @@
#!/usr/bin/env python3
import os
import time
import json
import numpy as np
import requests
import pandas as pd
from lib.shared_benchmark_utils import get_text_for_tokens, enc
from lib.shared_utils import save_json_results
from lib.shared_plotting import plot_correlation, plot_timeline
def measure_first_token(text: str, output_dir: str, tokens: int, run_number: int) -> dict:
"""Measure time to audio via API calls and save the audio output"""
results = {
"text_length": len(text),
"token_count": len(enc.encode(text)),
"total_time": None,
"time_to_first_chunk": None,
"error": None,
"audio_path": None,
"audio_length": None # Length of output audio in seconds
}
try:
start_time = time.time()
# Make request with streaming enabled
response = requests.post(
"http://localhost:8880/v1/audio/speech",
json={
"model": "kokoro",
"input": text,
"voice": "af",
"response_format": "pcm",
"stream": True
},
stream=True,
timeout=1800
)
response.raise_for_status()
# Save complete audio
audio_filename = f"benchmark_tokens{tokens}_run{run_number}_stream.wav"
audio_path = os.path.join(output_dir, audio_filename)
results["audio_path"] = audio_path
first_chunk_time = None
chunks = []
for chunk in response.iter_content(chunk_size=1024):
if chunk:
if first_chunk_time is None:
first_chunk_time = time.time()
results["time_to_first_chunk"] = first_chunk_time - start_time
chunks.append(chunk)
# Concatenate all PCM chunks
if not chunks:
raise ValueError("No audio chunks received")
all_audio_data = b''.join(chunks)
# Write as WAV file
import wave
with wave.open(audio_path, 'wb') as wav_file:
wav_file.setnchannels(1) # Mono
wav_file.setsampwidth(2) # 2 bytes per sample (16-bit)
wav_file.setframerate(24000) # Known sample rate for Kokoro
wav_file.writeframes(all_audio_data)
# Calculate audio length using scipy
import scipy.io.wavfile as wavfile
sample_rate, audio_data = wavfile.read(audio_path)
results["audio_length"] = len(audio_data) / sample_rate # Length in seconds
results["total_time"] = time.time() - start_time
# Print debug info
print(f"Complete audio size: {len(all_audio_data)} bytes")
print(f"Number of chunks received: {len(chunks)}")
print(f"Audio length: {results['audio_length']:.3f}s")
return results
except Exception as e:
results["error"] = str(e)
return results
def main():
# Set up paths with _stream suffix
script_dir = os.path.dirname(os.path.abspath(__file__))
output_dir = os.path.join(script_dir, "output_audio_stream")
output_data_dir = os.path.join(script_dir, "output_data")
# Create output directories
os.makedirs(output_dir, exist_ok=True)
os.makedirs(output_data_dir, exist_ok=True)
# Load sample text
with open(os.path.join(script_dir, "the_time_machine_hg_wells.txt"), "r", encoding="utf-8") as f:
text = f.read()
# Test specific token counts
token_sizes = [50, 100, 200, 500, 1000, 2000, 5000, 10000]
all_results = []
for tokens in token_sizes:
print(f"\nTesting {tokens} tokens (streaming)")
test_text = get_text_for_tokens(text, tokens)
actual_tokens = len(enc.encode(test_text))
print(f"Text preview: {test_text[:50]}...")
# Run test 5 times for each size to get average
for i in range(5):
    print(f"Run {i+1}/5...")
result = measure_first_token(test_text, output_dir, tokens, i + 1)
result["target_tokens"] = tokens
result["actual_tokens"] = actual_tokens
result["run_number"] = i + 1
print(f"Time to First Audio: {result.get('time_to_first_chunk', 'N/A'):.3f}s")
print(f"Time to Save Complete: {result.get('total_time', 'N/A'):.3f}s")
print(f"Audio length: {result.get('audio_length', 'N/A'):.3f}s")
print(f"Streaming overhead: {(result.get('total_time', 0) - result.get('time_to_first_chunk', 0)):.3f}s")
if result["error"]:
print(f"Error: {result['error']}")
all_results.append(result)
# Calculate averages per token size
summary = {}
for tokens in token_sizes:
matching_results = [r for r in all_results if r["target_tokens"] == tokens and not r["error"]]
if matching_results:
avg_first_chunk = sum(r["time_to_first_chunk"] for r in matching_results) / len(matching_results)
avg_total = sum(r["total_time"] for r in matching_results) / len(matching_results)
avg_audio_length = sum(r["audio_length"] for r in matching_results) / len(matching_results)
summary[tokens] = {
"avg_time_to_first_chunk": round(avg_first_chunk, 3),
"avg_total_time": round(avg_total, 3),
"avg_audio_length": round(avg_audio_length, 3),
"num_successful_runs": len(matching_results)
}
# Save results with _stream suffix
results_data = {
"individual_runs": all_results,
"summary": summary,
"timestamp": time.strftime("%Y-%m-%d %H:%M:%S")
}
save_json_results(
results_data,
os.path.join(output_data_dir, "first_token_benchmark_stream.json")
)
# Create plot directory if it doesn't exist
output_plots_dir = os.path.join(script_dir, "output_plots")
os.makedirs(output_plots_dir, exist_ok=True)
# Create DataFrame for plotting
df = pd.DataFrame(all_results)
# Create both plots with _stream suffix
# Plot correlation for both metrics
plot_correlation(
df, "target_tokens", "time_to_first_chunk",
"Time to First Audio vs Input Size (Streaming)",
"Number of Input Tokens",
"Time to First Audio (seconds)",
os.path.join(output_plots_dir, "first_token_latency_stream.png")
)
plot_correlation(
df, "target_tokens", "total_time",
"Total Time vs Input Size (Streaming)",
"Number of Input Tokens",
"Total Time (seconds)",
os.path.join(output_plots_dir, "total_time_latency_stream.png")
)
plot_timeline(
    df,
    os.path.join(output_plots_dir, "first_token_timeline_stream.png"),
    suffix="(Streaming)"
)
print("\nResults and plots saved to:")
print(f"- {os.path.join(output_data_dir, 'first_token_benchmark_stream.json')}")
print(f"- {os.path.join(output_plots_dir, 'first_token_latency_stream.png')}")
print(f"- {os.path.join(output_plots_dir, 'total_time_latency_stream.png')}")
print(f"- {os.path.join(output_plots_dir, 'first_token_timeline_stream.png')}")
if __name__ == "__main__":
main()

View file

@@ -1,184 +0,0 @@
#!/usr/bin/env python3
import os
import time
import json
import numpy as np
import pandas as pd
from openai import OpenAI
from lib.shared_benchmark_utils import get_text_for_tokens, enc
from lib.shared_utils import save_json_results
from lib.shared_plotting import plot_correlation, plot_timeline
def measure_first_token(text: str, output_dir: str, tokens: int, run_number: int) -> dict:
"""Measure time to audio via OpenAI API calls and save the audio output"""
results = {
"text_length": len(text),
"token_count": len(enc.encode(text)),
"total_time": None,
"time_to_first_chunk": None,
"error": None,
"audio_path": None,
"audio_length": None # Length of output audio in seconds
}
try:
start_time = time.time()
# Initialize OpenAI client
openai = OpenAI(base_url="http://localhost:8880/v1", api_key="not-needed-for-local")
# Save complete audio
audio_filename = f"benchmark_tokens{tokens}_run{run_number}_stream_openai.wav"
audio_path = os.path.join(output_dir, audio_filename)
results["audio_path"] = audio_path
first_chunk_time = None
all_audio_data = bytearray()
chunk_count = 0
# Make streaming request using OpenAI client
with openai.audio.speech.with_streaming_response.create(
model="kokoro",
voice="af",
response_format="pcm",
input=text,
) as response:
for chunk in response.iter_bytes(chunk_size=1024):
if chunk:
chunk_count += 1
if first_chunk_time is None:
first_chunk_time = time.time()
results["time_to_first_chunk"] = first_chunk_time - start_time
all_audio_data.extend(chunk)
# Write as WAV file
import wave
with wave.open(audio_path, 'wb') as wav_file:
wav_file.setnchannels(1) # Mono
wav_file.setsampwidth(2) # 2 bytes per sample (16-bit)
wav_file.setframerate(24000) # Known sample rate for Kokoro
wav_file.writeframes(all_audio_data)
# Calculate audio length using scipy
import scipy.io.wavfile as wavfile
sample_rate, audio_data = wavfile.read(audio_path)
results["audio_length"] = len(audio_data) / sample_rate # Length in seconds
results["total_time"] = time.time() - start_time
# Print debug info
print(f"Complete audio size: {len(all_audio_data)} bytes")
print(f"Number of chunks received: {chunk_count}")
print(f"Audio length: {results['audio_length']:.3f}s")
return results
except Exception as e:
results["error"] = str(e)
return results
def main():
# Set up paths with _stream_openai suffix
script_dir = os.path.dirname(os.path.abspath(__file__))
output_dir = os.path.join(script_dir, "output_audio_stream_openai")
output_data_dir = os.path.join(script_dir, "output_data")
# Create output directories
os.makedirs(output_dir, exist_ok=True)
os.makedirs(output_data_dir, exist_ok=True)
# Load sample text
with open(os.path.join(script_dir, "the_time_machine_hg_wells.txt"), "r", encoding="utf-8") as f:
text = f.read()
# Test specific token counts
token_sizes = [50, 100, 200, 500]
all_results = []
for tokens in token_sizes:
print(f"\nTesting {tokens} tokens (streaming)")
test_text = get_text_for_tokens(text, tokens)
actual_tokens = len(enc.encode(test_text))
print(f"Text preview: {test_text[:50]}...")
# Run test 5 times for each size to get average
for i in range(5):
print(f"Run {i+1}/5...")
result = measure_first_token(test_text, output_dir, tokens, i + 1)
result["target_tokens"] = tokens
result["actual_tokens"] = actual_tokens
result["run_number"] = i + 1
print(f"Time to First Audio: {result.get('time_to_first_chunk', 'N/A'):.3f}s")
print(f"Time to Save Complete: {result.get('total_time', 'N/A'):.3f}s")
print(f"Audio length: {result.get('audio_length', 'N/A'):.3f}s")
print(f"Streaming overhead: {(result.get('total_time', 0) - result.get('time_to_first_chunk', 0)):.3f}s")
if result["error"]:
print(f"Error: {result['error']}")
all_results.append(result)
# Calculate averages per token size
summary = {}
for tokens in token_sizes:
matching_results = [r for r in all_results if r["target_tokens"] == tokens and not r["error"]]
if matching_results:
avg_first_chunk = sum(r["time_to_first_chunk"] for r in matching_results) / len(matching_results)
avg_total = sum(r["total_time"] for r in matching_results) / len(matching_results)
avg_audio_length = sum(r["audio_length"] for r in matching_results) / len(matching_results)
summary[tokens] = {
"avg_time_to_first_chunk": round(avg_first_chunk, 3),
"avg_total_time": round(avg_total, 3),
"avg_audio_length": round(avg_audio_length, 3),
"num_successful_runs": len(matching_results)
}
# Save results with _stream_openai suffix
results_data = {
"individual_runs": all_results,
"summary": summary,
"timestamp": time.strftime("%Y-%m-%d %H:%M:%S")
}
save_json_results(
results_data,
os.path.join(output_data_dir, "first_token_benchmark_stream_openai.json")
)
# Create plot directory if it doesn't exist
output_plots_dir = os.path.join(script_dir, "output_plots")
os.makedirs(output_plots_dir, exist_ok=True)
# Create DataFrame for plotting
df = pd.DataFrame(all_results)
# Create plots with _stream_openai suffix
plot_correlation(
df, "target_tokens", "time_to_first_chunk",
"Time to First Audio vs Input Size (OpenAI Streaming)",
"Number of Input Tokens",
"Time to First Audio (seconds)",
os.path.join(output_plots_dir, "first_token_latency_stream_openai.png")
)
plot_correlation(
df, "target_tokens", "total_time",
"Total Time vs Input Size (OpenAI Streaming)",
"Number of Input Tokens",
"Total Time (seconds)",
os.path.join(output_plots_dir, "total_time_latency_stream_openai.png")
)
plot_timeline(
df,
os.path.join(output_plots_dir, "first_token_timeline_stream_openai.png")
)
print("\nResults and plots saved to:")
print(f"- {os.path.join(output_data_dir, 'first_token_benchmark_stream_openai.json')}")
print(f"- {os.path.join(output_plots_dir, 'first_token_latency_stream_openai.png')}")
print(f"- {os.path.join(output_plots_dir, 'total_time_latency_stream_openai.png')}")
print(f"- {os.path.join(output_plots_dir, 'first_token_timeline_stream_openai.png')}")
if __name__ == "__main__":
main()

View file

@@ -0,0 +1,195 @@
#!/usr/bin/env python3
import os
import time
import requests
from openai import OpenAI
from lib.stream_utils import run_benchmark
OPENAI_CLIENT = OpenAI(
base_url="http://localhost:8880/v1", api_key="not-needed-for-local"
)
def measure_first_token_requests(
text: str, output_dir: str, tokens: int, run_number: int
) -> dict:
"""Measure time to audio via direct API calls and save the audio output"""
results = {
"text_length": len(text),
"token_count": None, # Will be set by run_benchmark
"total_time": None,
"time_to_first_chunk": None,
"error": None,
"audio_path": None,
"audio_length": None,
}
try:
start_time = time.time()
# Make request with streaming enabled
response = requests.post(
"http://localhost:8880/v1/audio/speech",
json={
"model": "kokoro",
"input": text,
"voice": "af",
"response_format": "pcm",
"stream": True,
},
stream=True,
timeout=1800,
)
response.raise_for_status()
# Save complete audio
audio_filename = f"benchmark_tokens{tokens}_run{run_number}_stream.wav"
audio_path = os.path.join(output_dir, audio_filename)
results["audio_path"] = audio_path
first_chunk_time = None
chunks = []
for chunk in response.iter_content(chunk_size=1024):
if chunk:
if first_chunk_time is None:
first_chunk_time = time.time()
results["time_to_first_chunk"] = first_chunk_time - start_time
chunks.append(chunk)
# Concatenate all PCM chunks
if not chunks:
raise ValueError("No audio chunks received")
all_audio_data = b"".join(chunks)
# Write as WAV file
import wave
with wave.open(audio_path, "wb") as wav_file:
wav_file.setnchannels(1) # Mono
wav_file.setsampwidth(2) # 2 bytes per sample (16-bit)
wav_file.setframerate(24000) # Known sample rate for Kokoro
wav_file.writeframes(all_audio_data)
# Calculate audio length using scipy
import scipy.io.wavfile as wavfile
sample_rate, audio_data = wavfile.read(audio_path)
results["audio_length"] = len(audio_data) / sample_rate # Length in seconds
results["total_time"] = time.time() - start_time
# Print debug info
print(f"Complete audio size: {len(all_audio_data)} bytes")
print(f"Number of chunks received: {len(chunks)}")
print(f"Audio length: {results['audio_length']:.3f}s")
return results
except Exception as e:
results["error"] = str(e)
return results
def measure_first_token_openai(
text: str, output_dir: str, tokens: int, run_number: int
) -> dict:
"""Measure time to audio via OpenAI API calls and save the audio output"""
results = {
"text_length": len(text),
"token_count": None, # Will be set by run_benchmark
"total_time": None,
"time_to_first_chunk": None,
"error": None,
"audio_path": None,
"audio_length": None,
}
try:
start_time = time.time()
# Save complete audio
audio_filename = f"benchmark_tokens{tokens}_run{run_number}_stream_openai.wav"
audio_path = os.path.join(output_dir, audio_filename)
results["audio_path"] = audio_path
first_chunk_time = None
all_audio_data = bytearray()
chunk_count = 0
# Make streaming request using OpenAI client
with OPENAI_CLIENT.audio.speech.with_streaming_response.create(
model="kokoro",
voice="af",
response_format="pcm",
input=text,
) as response:
for chunk in response.iter_bytes(chunk_size=1024):
if chunk:
chunk_count += 1
if first_chunk_time is None:
first_chunk_time = time.time()
results["time_to_first_chunk"] = first_chunk_time - start_time
all_audio_data.extend(chunk)
# Write as WAV file
import wave
with wave.open(audio_path, "wb") as wav_file:
wav_file.setnchannels(1) # Mono
wav_file.setsampwidth(2) # 2 bytes per sample (16-bit)
wav_file.setframerate(24000) # Known sample rate for Kokoro
wav_file.writeframes(all_audio_data)
# Calculate audio length using scipy
import scipy.io.wavfile as wavfile
sample_rate, audio_data = wavfile.read(audio_path)
results["audio_length"] = len(audio_data) / sample_rate # Length in seconds
results["total_time"] = time.time() - start_time
# Print debug info
print(f"Complete audio size: {len(all_audio_data)} bytes")
print(f"Number of chunks received: {chunk_count}")
print(f"Audio length: {results['audio_length']:.3f}s")
return results
except Exception as e:
results["error"] = str(e)
return results
def main():
script_dir = os.path.dirname(os.path.abspath(__file__))
prefix = "cpu"
# Run requests benchmark
print("\n=== Running Direct Requests Benchmark ===")
run_benchmark(
measure_first_token_requests,
output_dir=os.path.join(script_dir, "output_audio_stream"),
output_data_dir=os.path.join(script_dir, "output_data"),
output_plots_dir=os.path.join(script_dir, "output_plots"),
suffix="_stream",
plot_title_suffix="(Streaming)",
prefix=prefix
)
# Run OpenAI benchmark
print("\n=== Running OpenAI Library Benchmark ===")
run_benchmark(
measure_first_token_openai,
output_dir=os.path.join(script_dir, "output_audio_stream_openai"),
output_data_dir=os.path.join(script_dir, "output_data"),
output_plots_dir=os.path.join(script_dir, "output_plots"),
suffix="_stream_openai",
plot_title_suffix="(OpenAI Streaming)",
prefix=prefix
)
if __name__ == "__main__":
main()
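The consolidated script delegates the measurement loop, aggregation, and plotting to `lib.stream_utils.run_benchmark`, which is not included in this diff. Judging from the call sites above and the two standalone scripts it replaces, its core loop is presumably along these lines (a sketch; the names, defaults, and text source here are assumptions, not the actual implementation):

```
import os

def run_benchmark(
    measure_func,
    output_dir: str,
    output_data_dir: str,
    output_plots_dir: str,
    suffix: str = "",
    plot_title_suffix: str = "",
    prefix: str = "",
    token_sizes=(50, 100, 200, 500),  # assumed defaults
    runs_per_size: int = 5,           # assumed default
):
    for path in (output_dir, output_data_dir, output_plots_dir):
        os.makedirs(path, exist_ok=True)

    all_results = []
    for tokens in token_sizes:
        # The real helper slices a shared corpus to the target token count;
        # a constant string stands in here.
        text = "This is a placeholder sentence. " * tokens
        for run in range(1, runs_per_size + 1):
            result = measure_func(text, output_dir, tokens, run)
            result["target_tokens"] = tokens
            result["run_number"] = run
            all_results.append(result)

    # The real helper then averages per token size, saves the JSON results
    # with the given suffix, and renders the latency/timeline plots using
    # prefix and plot_title_suffix.
    return all_results
```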

View file

@@ -1,30 +1,37 @@
#!/usr/bin/env python3
import os
import json
import time
import queue
import threading
import sys
from datetime import datetime

import pandas as pd

from lib.shared_utils import (
    real_time_factor,
    save_json_results,
    get_system_metrics,
    write_benchmark_stats,
)
from lib.shared_plotting import plot_correlation, plot_system_metrics
from lib.shared_benchmark_utils import (
    enc,
    make_tts_request,
    get_text_for_tokens,
    generate_token_sizes,
)


class SystemMonitor:
    def __init__(self, interval=1.0):
        """Rough system tracker: Not always accurate"""
        self.interval = interval
        self.metrics_queue = queue.Queue()
        self.stop_event = threading.Event()
        self.metrics_timeline = []
        self.start_time = None

    def _monitor_loop(self):
        """Background thread function to collect system metrics."""
        while not self.stop_event.is_set():
@@ -32,20 +39,20 @@ class SystemMonitor:
            metrics["relative_time"] = time.time() - self.start_time
            self.metrics_queue.put(metrics)
            time.sleep(self.interval)

    def start(self):
        """Start the monitoring thread."""
        self.start_time = time.time()
        self.monitor_thread = threading.Thread(target=self._monitor_loop)
        self.monitor_thread.daemon = True
        self.monitor_thread.start()

    def stop(self):
        """Stop the monitoring thread and collect final metrics."""
        self.stop_event.set()
        if hasattr(self, "monitor_thread"):
            self.monitor_thread.join(timeout=2)

        # Collect all metrics from queue
        while True:
            try:
@@ -53,23 +60,24 @@ class SystemMonitor:
                self.metrics_timeline.append(metrics)
            except queue.Empty:
                break

        return self.metrics_timeline
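The monitor is driven by the start/stop pair; a typical wrapping of a benchmark run looks like this (a sketch using the class exactly as defined above):

```
# Sample the system once per second for the duration of a benchmark run.
monitor = SystemMonitor(interval=1.0)
monitor.start()
# ... issue TTS requests here ...
metrics_timeline = monitor.stop()  # drains the queue; one dict per sample
```

Because the worker is a daemon thread, an aborted benchmark cannot keep the process alive; `stop()` joins with a two-second timeout before draining whatever samples made it into the queue.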
def main():
    # Initialize system monitor
    monitor = SystemMonitor(interval=1.0)  # 1 second interval

    # Set prefix for output files (e.g. "gpu", "cpu", "onnx", etc.)
    prefix = "cpu"
    # Generate token sizes
    if "gpu" in prefix:
        token_sizes = generate_token_sizes(
            max_tokens=1000, dense_step=150, dense_max=1000, sparse_step=1000
        )
    elif "cpu" in prefix:
        token_sizes = generate_token_sizes(
            max_tokens=1000, dense_step=100, dense_max=500, sparse_step=250
        )
    else:
        token_sizes = generate_token_sizes(max_tokens=3000)
@@ -78,7 +86,7 @@ def main():
    output_dir = os.path.join(script_dir, "output_audio")
    output_data_dir = os.path.join(script_dir, "output_data")
    output_plots_dir = os.path.join(script_dir, "output_plots")

    # Create output directories
    os.makedirs(output_dir, exist_ok=True)
    os.makedirs(output_data_dir, exist_ok=True)
@@ -90,7 +98,9 @@ def main():
            filename = f"{prefix}_{filename}"
        return os.path.join(path, filename)

    with open(
        os.path.join(script_dir, "the_time_machine_hg_wells.txt"), "r", encoding="utf-8"
    ) as f:
        text = f.read()
    total_tokens = len(enc.encode(text))
@@ -100,7 +110,7 @@ def main():
    results = []
    test_start_time = time.time()

    # Start system monitoring
    monitor.start()
@@ -114,7 +124,8 @@ def main():
        processing_time, audio_length = make_tts_request(
            chunk,
            output_dir=output_dir,
            prefix=prefix,
            stream=False,  # Use non-streaming mode for RTF benchmarking
        )
        if processing_time is None or audio_length is None:
            print("Breaking loop due to error")
@@ -123,14 +134,16 @@ def main():
        # Calculate RTF using the correct formula
        rtf = real_time_factor(processing_time, audio_length)
        print(f"Real-Time Factor: {rtf:.5f}")

        results.append(
            {
                "tokens": actual_tokens,
                "processing_time": processing_time,
                "output_length": audio_length,
                "rtf": rtf,
                "elapsed_time": round(time.time() - test_start_time, 5),
            }
        )

    df = pd.DataFrame(results)
    if df.empty:
@@ -144,89 +157,101 @@ def main():
        {
            "title": "Benchmark Statistics (with correct RTF)",
            "stats": {
                "Total tokens processed": df["tokens"].sum(),
                "Total audio generated (s)": df["output_length"].sum(),
                "Total test duration (s)": df["elapsed_time"].max(),
                "Average processing rate (tokens/s)": df["tokens_per_second"].mean(),
                "Average RTF": df["rtf"].mean(),
                "Average Real Time Speed": 1 / df["rtf"].mean(),
            },
        },
        {
            "title": "Per-chunk Stats",
            "stats": {
                "Average chunk size (tokens)": df["tokens"].mean(),
                "Min chunk size (tokens)": df["tokens"].min(),
                "Max chunk size (tokens)": df["tokens"].max(),
                "Average processing time (s)": df["processing_time"].mean(),
                "Average output length (s)": df["output_length"].mean(),
            },
        },
        {
            "title": "Performance Ranges",
            "stats": {
                "Processing rate range (tokens/s)": f"{df['tokens_per_second'].min():.2f} - {df['tokens_per_second'].max():.2f}",
                "RTF range": f"{df['rtf'].min():.2f}x - {df['rtf'].max():.2f}x",
                "Real Time Speed range": f"{1/df['rtf'].max():.2f}x - {1/df['rtf'].min():.2f}x",
            },
        },
    ]
    write_benchmark_stats(
        stats, prefix_path(output_data_dir, "benchmark_stats_rtf.txt")
    )

    # Plot Processing Time vs Token Count
    plot_correlation(
        df,
        "tokens",
        "processing_time",
        "Processing Time vs Input Size",
        "Number of Input Tokens",
        "Processing Time (seconds)",
        prefix_path(output_plots_dir, "processing_time_rtf.png"),
    )

    # Plot RTF vs Token Count
    plot_correlation(
        df,
        "tokens",
        "rtf",
        "Real-Time Factor vs Input Size",
        "Number of Input Tokens",
        "Real-Time Factor (processing time / audio length)",
        prefix_path(output_plots_dir, "realtime_factor_rtf.png"),
    )

    # Stop monitoring and get final metrics
    final_metrics = monitor.stop()

    # Convert metrics timeline to DataFrame for stats
    metrics_df = pd.DataFrame(final_metrics)

    # Add system usage stats
    if not metrics_df.empty:
        stats.append(
            {
                "title": "System Usage Statistics",
                "stats": {
                    "Peak CPU Usage (%)": metrics_df["cpu_percent"].max(),
                    "Avg CPU Usage (%)": metrics_df["cpu_percent"].mean(),
                    "Peak RAM Usage (%)": metrics_df["ram_percent"].max(),
                    "Avg RAM Usage (%)": metrics_df["ram_percent"].mean(),
                    "Peak RAM Used (GB)": metrics_df["ram_used_gb"].max(),
                    "Avg RAM Used (GB)": metrics_df["ram_used_gb"].mean(),
                },
            }
        )
        if "gpu_memory_used" in metrics_df:
            stats[-1]["stats"].update(
                {
                    "Peak GPU Memory (MB)": metrics_df["gpu_memory_used"].max(),
                    "Avg GPU Memory (MB)": metrics_df["gpu_memory_used"].mean(),
                }
            )

    # Plot system metrics
    plot_system_metrics(
        final_metrics, prefix_path(output_plots_dir, "system_usage_rtf.png")
    )

    # Save final results
    save_json_results(
        {
            "results": results,
            "system_metrics": final_metrics,
            "test_duration": time.time() - test_start_time,
        },
        prefix_path(output_data_dir, "benchmark_results_rtf.json"),
    )

    print("\nResults saved to:")

View file

@@ -1,19 +1,30 @@
import os
import json
import time

import pandas as pd

from examples.assorted_checks.lib.shared_utils import (
    save_json_results,
    get_system_metrics,
    write_benchmark_stats,
)
from examples.assorted_checks.lib.shared_plotting import (
    plot_correlation,
    plot_system_metrics,
)
from examples.assorted_checks.lib.shared_benchmark_utils import (
    enc,
    make_tts_request,
    get_text_for_tokens,
    generate_token_sizes,
)


def main():
    # Get optional prefix from first command line argument
    import sys

    prefix = sys.argv[1] if len(sys.argv) > 1 else ""

    # Set up paths relative to this file
@@ -21,7 +32,7 @@ def main():
    output_dir = os.path.join(script_dir, "output_audio")
    output_data_dir = os.path.join(script_dir, "output_data")
    output_plots_dir = os.path.join(script_dir, "output_plots")

    # Create output directories
    os.makedirs(output_dir, exist_ok=True)
    os.makedirs(output_data_dir, exist_ok=True)
@@ -43,7 +54,6 @@ def main():
    total_tokens = len(enc.encode(text))
    print(f"Total tokens in file: {total_tokens}")

    token_sizes = generate_token_sizes(total_tokens)
    print(f"Testing sizes: {token_sizes}")
@@ -85,7 +95,7 @@ def main():
        # Save intermediate results
        save_json_results(
            {"results": results, "system_metrics": system_metrics},
            prefix_path(output_data_dir, "benchmark_results.json"),
        )

    # Create DataFrame and calculate stats
@@ -102,53 +112,59 @@ def main():
        {
            "title": "Benchmark Statistics",
            "stats": {
                "Total tokens processed": df["tokens"].sum(),
                "Total audio generated (s)": df["output_length"].sum(),
                "Total test duration (s)": df["elapsed_time"].max(),
                "Average processing rate (tokens/s)": df["tokens_per_second"].mean(),
                "Average realtime factor": df["realtime_factor"].mean(),
            },
        },
        {
            "title": "Per-chunk Stats",
            "stats": {
                "Average chunk size (tokens)": df["tokens"].mean(),
                "Min chunk size (tokens)": df["tokens"].min(),
                "Max chunk size (tokens)": df["tokens"].max(),
                "Average processing time (s)": df["processing_time"].mean(),
                "Average output length (s)": df["output_length"].mean(),
            },
        },
        {
            "title": "Performance Ranges",
            "stats": {
                "Processing rate range (tokens/s)": f"{df['tokens_per_second'].min():.2f} - {df['tokens_per_second'].max():.2f}",
                "Realtime factor range": f"{df['realtime_factor'].min():.2f}x - {df['realtime_factor'].max():.2f}x",
            },
        },
    ]
    write_benchmark_stats(stats, prefix_path(output_data_dir, "benchmark_stats.txt"))

    # Plot Processing Time vs Token Count
    plot_correlation(
        df,
        "tokens",
        "processing_time",
        "Processing Time vs Input Size",
        "Number of Input Tokens",
        "Processing Time (seconds)",
        prefix_path(output_plots_dir, "processing_time.png"),
    )

    # Plot Realtime Factor vs Token Count
    plot_correlation(
        df,
        "tokens",
        "realtime_factor",
        "Realtime Factor vs Input Size",
        "Number of Input Tokens",
        "Realtime Factor (output length / processing time)",
        prefix_path(output_plots_dir, "realtime_factor.png"),
    )

    # Plot system metrics
    plot_system_metrics(
        system_metrics, prefix_path(output_plots_dir, "system_usage.png")
    )

    print("\nResults saved to:")
    print(f"- {prefix_path(output_data_dir, 'benchmark_results.json')}")

View file

@@ -1,11 +1,12 @@
"""Shared utilities specific to TTS benchmarking."""

import time
from typing import List, Tuple, Optional

import requests
import tiktoken

from .shared_utils import save_audio_file, get_audio_length

# Global tokenizer instance
enc = tiktoken.get_encoding("cl100k_base")
@@ -13,11 +14,11 @@ enc = tiktoken.get_encoding("cl100k_base")
def get_text_for_tokens(text: str, num_tokens: int) -> str:
    """Get a slice of text that contains exactly num_tokens tokens.

    Args:
        text: Input text to slice
        num_tokens: Desired number of tokens

    Returns:
        str: Text slice containing exactly num_tokens tokens
    """
@@ -31,44 +32,69 @@ def make_tts_request(
    text: str,
    output_dir: str = None,
    timeout: int = 1800,
    prefix: str = "",
    stream: bool = True,
) -> Tuple[Optional[float], Optional[float]]:
    """Make TTS request using OpenAI-compatible endpoint.

    Args:
        text: Input text to convert to speech
        output_dir: Directory to save audio files. If None, audio won't be saved.
        timeout: Request timeout in seconds
        prefix: Optional prefix for output filenames
        stream: If True, use the streaming endpoint and collect the chunks

    Returns:
        tuple: (processing_time, audio_length) in seconds, or (None, None) on error
    """
    try:
        start_time = time.time()

        if stream:
            # For streaming, we need to collect all chunks
            audio_chunks = []
            response = requests.post(
                "http://localhost:8880/v1/audio/speech",
                json={
                    "model": "kokoro",
                    "input": text,
                    "voice": "af",
                    "response_format": "wav",
                    "stream": True,
                },
                timeout=timeout,
                stream=True,
            )
            response.raise_for_status()

            for chunk in response.iter_content(chunk_size=8192):
                if chunk:
                    audio_chunks.append(chunk)

            # Combine all chunks
            audio_data = b"".join(audio_chunks)
        else:
            response = requests.post(
                "http://localhost:8880/v1/audio/speech",
                json={
                    "model": "kokoro",
                    "input": text,
                    "voice": "af",
                    "response_format": "wav",
                    "stream": False,
                },
                timeout=timeout,
            )
            response.raise_for_status()
            audio_data = response.content

        processing_time = round(time.time() - start_time, 2)
        # Calculate audio length from audio data
        audio_length = get_audio_length(audio_data)

        # Save the audio file if output_dir is provided
        if output_dir:
            token_count = len(enc.encode(text))
            output_file = save_audio_file(
                audio_data, f"chunk_{token_count}_tokens", output_dir
            )
            print(f"Saved audio to {output_file}")
@@ -86,26 +112,26 @@ def generate_token_sizes(
    max_tokens: int,
    dense_step: int = 100,
    dense_max: int = 1000,
    sparse_step: int = 1000,
) -> List[int]:
    """Generate token size ranges with dense sampling at start.

    Args:
        max_tokens: Maximum number of tokens to generate sizes up to
        dense_step: Step size for dense sampling range
        dense_max: Maximum value for dense sampling
        sparse_step: Step size for sparse sampling range

    Returns:
        list: Sorted list of token sizes
    """
    # Dense sampling at start
    dense_range = list(range(dense_step, dense_max + 1, dense_step))
    if max_tokens <= dense_max or sparse_step < dense_max:
        return sorted(dense_range)

    # Sparse sampling for larger sizes
    sparse_range = list(range(dense_max + sparse_step, max_tokens + 1, sparse_step))

    # Combine and deduplicate
    return sorted(list(set(dense_range + sparse_range)))
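Two worked examples of the sampling behavior, using the function exactly as defined above:

```
# Dense sampling below 1000 tokens, sparse thousands above:
generate_token_sizes(max_tokens=3000)
# -> [100, 200, 300, 400, 500, 600, 700, 800, 900, 1000, 2000, 3000]

# With the CPU settings used in the RTF benchmark, sparse_step (250) is below
# dense_max (500), so only the dense range is returned:
generate_token_sizes(max_tokens=1000, dense_step=100, dense_max=500, sparse_step=250)
# -> [100, 200, 300, 400, 500]
```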

View file

@@ -1,7 +1,8 @@
"""Shared plotting utilities for benchmarks and tests."""

import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.patches as patches
@@ -12,66 +13,71 @@ STYLE_CONFIG = {
    "secondary_color": "#05d9e8",
    "grid_color": "#ffffff",
    "text_color": "#ffffff",
    "font_sizes": {"title": 16, "label": 14, "tick": 12, "text": 10},
}


def setup_plot(fig, ax, title, xlabel=None, ylabel=None):
    """Configure plot styling with consistent theme.

    Args:
        fig: matplotlib figure object
        ax: matplotlib axis object
        title: str, plot title
        xlabel: str, optional x-axis label
        ylabel: str, optional y-axis label

    Returns:
        tuple: (fig, ax) with applied styling
    """
    # Grid styling
    ax.grid(True, linestyle="--", alpha=0.3, color=STYLE_CONFIG["grid_color"])

    # Title and labels
    ax.set_title(
        title,
        pad=20,
        fontsize=STYLE_CONFIG["font_sizes"]["title"],
        fontweight="bold",
        color=STYLE_CONFIG["text_color"],
    )
    if xlabel:
        ax.set_xlabel(
            xlabel,
            fontsize=STYLE_CONFIG["font_sizes"]["label"],
            fontweight="medium",
            color=STYLE_CONFIG["text_color"],
        )
    if ylabel:
        ax.set_ylabel(
            ylabel,
            fontsize=STYLE_CONFIG["font_sizes"]["label"],
            fontweight="medium",
            color=STYLE_CONFIG["text_color"],
        )

    # Tick styling
    ax.tick_params(
        labelsize=STYLE_CONFIG["font_sizes"]["tick"], colors=STYLE_CONFIG["text_color"]
    )

    # Spine styling
    for spine in ax.spines.values():
        spine.set_color(STYLE_CONFIG["text_color"])
        spine.set_alpha(0.3)
        spine.set_linewidth(0.5)

    # Background colors
    ax.set_facecolor(STYLE_CONFIG["background_color"])
    fig.patch.set_facecolor(STYLE_CONFIG["background_color"])

    return fig, ax


def plot_system_metrics(metrics_data, output_path):
    """Create plots for system metrics over time.

    Args:
        metrics_data: list of dicts containing system metrics
        output_path: str, path to save the output plot
@@ -79,68 +85,118 @@ def plot_system_metrics(metrics_data, output_path):
    df = pd.DataFrame(metrics_data)
    df["timestamp"] = pd.to_datetime(df["timestamp"])
    elapsed_time = (df["timestamp"] - df["timestamp"].iloc[0]).dt.total_seconds()

    # Get baseline values
    baseline_cpu = df["cpu_percent"].iloc[0]
    baseline_ram = df["ram_used_gb"].iloc[0]
    baseline_gpu = (
        df["gpu_memory_used"].iloc[0] / 1024
        if "gpu_memory_used" in df.columns
        else None
    )

    # Convert GPU memory to GB if present
    if "gpu_memory_used" in df.columns:
        df["gpu_memory_gb"] = df["gpu_memory_used"] / 1024

    plt.style.use("dark_background")

    # Create subplots based on available metrics
    has_gpu = "gpu_memory_used" in df.columns
    num_plots = 3 if has_gpu else 2
    fig, axes = plt.subplots(num_plots, 1, figsize=(15, 5 * num_plots))
    fig.patch.set_facecolor(STYLE_CONFIG["background_color"])

    # Smoothing window
    window = min(5, len(df) // 2)

    # Plot CPU Usage
    smoothed_cpu = df["cpu_percent"].rolling(window=window, center=True).mean()
    sns.lineplot(
        x=elapsed_time,
        y=smoothed_cpu,
        ax=axes[0],
        color=STYLE_CONFIG["primary_color"],
        linewidth=2,
    )
    axes[0].axhline(
        y=baseline_cpu,
        color=STYLE_CONFIG["secondary_color"],
        linestyle="--",
        alpha=0.5,
        label="Baseline",
    )
    setup_plot(
        fig,
        axes[0],
        "CPU Usage Over Time",
        xlabel="Time (seconds)",
        ylabel="CPU Usage (%)",
    )
    axes[0].set_ylim(0, max(df["cpu_percent"]) * 1.1)
    axes[0].legend()

    # Plot RAM Usage
    smoothed_ram = df["ram_used_gb"].rolling(window=window, center=True).mean()
    sns.lineplot(
        x=elapsed_time,
        y=smoothed_ram,
        ax=axes[1],
        color=STYLE_CONFIG["secondary_color"],
        linewidth=2,
    )
    axes[1].axhline(
        y=baseline_ram,
        color=STYLE_CONFIG["primary_color"],
        linestyle="--",
        alpha=0.5,
        label="Baseline",
    )
    setup_plot(
        fig,
        axes[1],
        "RAM Usage Over Time",
        xlabel="Time (seconds)",
        ylabel="RAM Usage (GB)",
    )
    axes[1].set_ylim(0, max(df["ram_used_gb"]) * 1.1)
    axes[1].legend()

    # Plot GPU Memory if available
    if has_gpu:
        smoothed_gpu = df["gpu_memory_gb"].rolling(window=window, center=True).mean()
        sns.lineplot(
            x=elapsed_time,
            y=smoothed_gpu,
            ax=axes[2],
            color=STYLE_CONFIG["primary_color"],
            linewidth=2,
        )
        axes[2].axhline(
            y=baseline_gpu,
            color=STYLE_CONFIG["secondary_color"],
            linestyle="--",
            alpha=0.5,
            label="Baseline",
        )
        setup_plot(
            fig,
            axes[2],
            "GPU Memory Usage Over Time",
            xlabel="Time (seconds)",
            ylabel="GPU Memory (GB)",
        )
        axes[2].set_ylim(0, max(df["gpu_memory_gb"]) * 1.1)
        axes[2].legend()

    plt.tight_layout()
    plt.savefig(output_path, dpi=300, bbox_inches="tight")
    plt.close()


def plot_timeline(df, output_path, suffix="", prefix=""):
    """Create timeline plot showing latency for each run.

    Args:
        df: pandas DataFrame containing run data with columns:
            - target_tokens: number of tokens
@@ -149,124 +205,161 @@ def plot_timeline(df, output_path, suffix=""):
        output_path: str, path to save the output plot
    """
    plt.style.use("dark_background")

    # Sort by tokens and run number
    df = df.sort_values(["target_tokens", "run_number"])

    # Create figure and axis
    fig, ax = plt.subplots(figsize=(12, 6))

    # Calculate y positions for each run with tighter grouping
    unique_tokens = sorted(df["target_tokens"].unique())
    y_positions = {}
    current_y = 0
    group_spacing = 0.8  # Space between groups
    run_spacing = 0.2  # Space between runs in a group

    for tokens in unique_tokens:
        runs = df[df["target_tokens"] == tokens]
        base_y = current_y
        for i, (_, run) in enumerate(runs.iterrows()):
            y_positions[(tokens, run["run_number"])] = base_y + (i * run_spacing)
        current_y = base_y + (len(runs) * run_spacing) + group_spacing

    # Plot bars and points with more transparency
    bar_height = 0.15
    for _, row in df.iterrows():
        y = y_positions[(row["target_tokens"], row["run_number"])]
        latency = row["time_to_first_chunk"]

        # Latency bar
        ax.add_patch(
            patches.Rectangle(
                (0, y - bar_height / 2),
                latency,
                bar_height,
                facecolor=STYLE_CONFIG["primary_color"],
                alpha=0.3,
            )
        )

        # End point
        ax.plot(
            latency,
            y,
            "o",
            color=STYLE_CONFIG["secondary_color"],
            markersize=4,
            alpha=0.5,
        )

    # Add mean lines and values for each token group
    for tokens in unique_tokens:
        token_runs = df[df["target_tokens"] == tokens]
        mean_latency = token_runs["time_to_first_chunk"].mean()
        y_positions_for_token = [
            y_positions[(tokens, run["run_number"])] for _, run in token_runs.iterrows()
        ]
        min_y = min(y_positions_for_token)
        max_y = max(y_positions_for_token)
        group_center = (min_y + max_y) / 2

        # Plot mean line with gradient alpha
        gradient = np.linspace(0.2, 0.8, 100)
        for i in range(len(gradient) - 1):
            y1 = (
                min_y
                - bar_height
                + (max_y - min_y + 2 * bar_height) * (i / len(gradient))
            )
            y2 = (
                min_y
                - bar_height
                + (max_y - min_y + 2 * bar_height) * ((i + 1) / len(gradient))
            )
            ax.plot(
                [mean_latency, mean_latency],
                [y1, y2],
                "-",
                color=STYLE_CONFIG["secondary_color"],
                linewidth=3,
                alpha=gradient[i],
            )

        # Add mean value label with background
        label_text = f"Mean: {mean_latency:.3f}s"
        bbox_props = dict(
            facecolor=STYLE_CONFIG["background_color"],
            edgecolor=STYLE_CONFIG["secondary_color"],
            alpha=0.8,
            pad=3,
            linewidth=1,
        )
        ax.text(
            mean_latency + 0.02,
            group_center,
            label_text,
            color=STYLE_CONFIG["secondary_color"],
            va="center",
            fontsize=10,
            fontweight="bold",
            bbox=bbox_props,
        )

    # Customize plot
    ax.set_ylim(-1, current_y)
    ax.set_xlim(0, df["time_to_first_chunk"].max() * 1.3)  # Extra space for labels

    # Add labels for token groups with tighter spacing
    group_positions = {}
    for tokens in unique_tokens:
        runs = df[df["target_tokens"] == tokens]
        y_positions_for_token = [
            y_positions[(tokens, run["run_number"])] for _, run in runs.iterrows()
        ]
        group_positions[tokens] = sum(y_positions_for_token) / len(
            y_positions_for_token
        )
        plt.axhline(
            y=min(y_positions_for_token) - bar_height,
            color="white",
            alpha=0.1,
            linestyle="-",
        )

    # Calculate mean audio length for each token group
    audio_lengths = {}
    for tokens in unique_tokens:
        token_runs = df[df["target_tokens"] == tokens]
        audio_lengths[tokens] = token_runs["audio_length"].mean()

    # Set y-ticks at group centers with token counts and audio lengths
    plt.yticks(
        list(group_positions.values()),
        [
            f"{tokens} tokens\n({audio_lengths[tokens]:.1f}s)"
            for tokens in group_positions.keys()
        ],
        fontsize=10,
    )

    # Customize appearance
    setup_plot(
        fig,
        ax,
        prefix.upper() + " Time-To-Audio Latency " + suffix,
        xlabel="Time (seconds)",
        ylabel="Input Size",
    )

    plt.tight_layout()
    plt.savefig(output_path, dpi=300, bbox_inches="tight")
    plt.close()


def plot_correlation(df, x, y, title, xlabel, ylabel, output_path):
    """Create correlation plot with regression line and correlation coefficient.

    Args:
        df: pandas DataFrame containing the data
        x: str, column name for x-axis
@@ -277,28 +370,40 @@ def plot_correlation(df, x, y, title, xlabel, ylabel, output_path):
        output_path: str, path to save the output plot
    """
    plt.style.use("dark_background")

    fig, ax = plt.subplots(figsize=(12, 8))

    # Scatter plot
    sns.scatterplot(
        data=df, x=x, y=y, s=100, alpha=0.6, color=STYLE_CONFIG["primary_color"]
    )

    # Regression line
    sns.regplot(
        data=df,
        x=x,
        y=y,
        scatter=False,
        color=STYLE_CONFIG["secondary_color"],
        line_kws={"linewidth": 2},
    )

    # Add correlation coefficient
    corr = df[x].corr(df[y])
    plt.text(
        0.05,
        0.95,
        f"Correlation: {corr:.2f}",
        transform=ax.transAxes,
        fontsize=STYLE_CONFIG["font_sizes"]["text"],
        color=STYLE_CONFIG["text_color"],
        bbox=dict(
            facecolor=STYLE_CONFIG["background_color"],
            edgecolor=STYLE_CONFIG["text_color"],
            alpha=0.7,
        ),
    )

    setup_plot(fig, ax, title, xlabel=xlabel, ylabel=ylabel)
    plt.savefig(output_path, dpi=300, bbox_inches="tight")
    plt.close()

View file

@@ -1,9 +1,10 @@

"""Shared utilities for benchmarks and tests."""

import os
import json
import subprocess
from datetime import datetime
from typing import Any, Dict, List, Optional, Union

import psutil
import scipy.io.wavfile as wavfile

@@ -12,28 +13,46 @@ import scipy.io.wavfile as wavfile

TORCH_AVAILABLE = False
try:
    import torch

    TORCH_AVAILABLE = torch.cuda.is_available()
except ImportError:
    pass


def check_audio_file_is_silent(audio_path: str, threshold: float = 0.01) -> bool:
    """Check if an audio file is silent by comparing peak amplitude to a threshold.

    Args:
        audio_path: Path to the audio file
        threshold: Peak amplitude threshold for silence

    Returns:
        bool: True if audio is silent, False otherwise
    """
    rate, data = wavfile.read(audio_path)
    peak_amplitude = max(abs(data.min()), abs(data.max())) / 32768.0  # 16-bit audio
    return peak_amplitude < threshold
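
# Usage sketch (added for illustration; the path is hypothetical). With the
# default threshold of 0.01 (~1% of int16 full scale, i.e. ~328), a file whose
# loudest sample is quieter than that is reported as silent:
#
#   if check_audio_file_is_silent("output_audio/benchmark_tokens10_run1.wav"):
#       print("WARNING: silent output")
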
def get_audio_length(audio_data: bytes, temp_dir: str = None) -> float:
    """Get audio length in seconds from bytes data.

    Args:
        audio_data: Raw audio bytes
        temp_dir: Directory for temporary file. If None, uses system temp directory.

    Returns:
        float: Audio length in seconds
    """
    if temp_dir is None:
        import tempfile

        temp_dir = tempfile.gettempdir()

    temp_path = os.path.join(temp_dir, "temp.wav")
    os.makedirs(temp_dir, exist_ok=True)

    with open(temp_path, "wb") as f:
        f.write(audio_data)

@@ -47,11 +66,11 @@ def get_audio_length(audio_data: bytes, temp_dir: str = None) -> float:

def get_gpu_memory(average: bool = True) -> Optional[Union[float, List[float]]]:
    """Get GPU memory usage using PyTorch if available, falling back to nvidia-smi.

    Args:
        average: If True and multiple GPUs present, returns average memory usage.
                If False, returns list of memory usage per GPU.

    Returns:
        float or List[float] or None: GPU memory usage in MB. Returns None if no GPU available.
            If average=False and multiple GPUs present, returns list of values.

@@ -60,19 +79,23 @@ def get_gpu_memory(average: bool = True) -> Optional[Union[float, List[float]]]:

        n_gpus = torch.cuda.device_count()
        memory_used = []
        for i in range(n_gpus):
            memory_used.append(
                torch.cuda.memory_allocated(i) / 1024**2
            )  # Convert to MB

        if average and len(memory_used) > 0:
            return sum(memory_used) / len(memory_used)
        return memory_used if len(memory_used) > 1 else memory_used[0]

    # Fall back to nvidia-smi
    try:
        result = subprocess.check_output(
            ["nvidia-smi", "--query-gpu=memory.used", "--format=csv,nounits,noheader"]
        )
        memory_values = [
            float(x.strip()) for x in result.decode("utf-8").split("\n") if x.strip()
        ]
        if average and len(memory_values) > 0:
            return sum(memory_values) / len(memory_values)
        return memory_values if len(memory_values) > 1 else memory_values[0]

@@ -82,14 +105,14 @@ def get_gpu_memory(average: bool = True) -> Optional[Union[float, List[float]]]:

def get_system_metrics() -> Dict[str, Union[str, float]]:
    """Get current system metrics including CPU, RAM, and GPU if available.

    Returns:
        dict: System metrics including timestamp, CPU%, RAM%, RAM GB, and GPU MB if available
    """
    # Get per-CPU percentages and calculate average
    cpu_percentages = psutil.cpu_percent(percpu=True)
    avg_cpu = sum(cpu_percentages) / len(cpu_percentages)

    metrics = {
        "timestamp": datetime.now().isoformat(),
        "cpu_percent": round(avg_cpu, 2),

@@ -106,40 +129,40 @@ def get_system_metrics() -> Dict[str, Union[str, float]]:

def save_audio_file(audio_data: bytes, identifier: str, output_dir: str) -> str:
    """Save audio data to a file with proper naming and directory creation.

    Args:
        audio_data: Raw audio bytes
        identifier: String to identify this audio file (e.g. token count, test name)
        output_dir: Directory to save the file

    Returns:
        str: Path to the saved audio file
    """
    os.makedirs(output_dir, exist_ok=True)
    output_file = os.path.join(output_dir, f"{identifier}.wav")

    with open(output_file, "wb") as f:
        f.write(audio_data)

    return output_file


def write_benchmark_stats(stats: List[Dict[str, Any]], output_file: str) -> None:
    """Write benchmark statistics to a file in a clean, organized format.

    Args:
        stats: List of dictionaries containing stat name/value pairs
        output_file: Path to output file
    """
    os.makedirs(os.path.dirname(output_file), exist_ok=True)

    with open(output_file, "w") as f:
        for section in stats:
            # Write section header
            f.write(f"=== {section['title']} ===\n\n")

            # Write stats
            for label, value in section["stats"].items():
                if isinstance(value, float):
                    f.write(f"{label}: {value:.2f}\n")
                else:

@@ -149,7 +172,7 @@ def write_benchmark_stats(stats: List[Dict[str, Any]], output_file: str) -> None

def save_json_results(results: Dict[str, Any], output_file: str) -> None:
    """Save benchmark results to a JSON file with proper formatting.

    Args:
        results: Dictionary of results to save
        output_file: Path to output file

@@ -159,14 +182,16 @@ def save_json_results(results: Dict[str, Any], output_file: str) -> None:

        json.dump(results, f, indent=2)


def real_time_factor(
    processing_time: float, audio_length: float, decimals: int = 2
) -> float:
    """Calculate Real-Time Factor (RTF) as processing-time / length-of-audio.

    Args:
        processing_time: Time taken to process/generate audio
        audio_length: Length of the generated audio
        decimals: Number of decimal places to round to

    Returns:
        float: RTF value
    """

View file

@@ -0,0 +1,205 @@
#!/usr/bin/env python3
import os
import time
import wave
from typing import Any, Dict, List, Callable, Optional
import pandas as pd
import scipy.io.wavfile as wavfile
from .shared_utils import save_json_results
from .shared_plotting import plot_timeline, plot_correlation
from .shared_benchmark_utils import enc, get_text_for_tokens
def check_audio_silence(audio_path: str) -> bool:
"""Check if audio file contains only silence"""
sample_rate, audio_data = wavfile.read(audio_path)
# Convert to float for RMS calculation
audio_float = audio_data.astype(float)
# Calculate RMS value
rms = (audio_float**2).mean() ** 0.5
# Define silence threshold (adjust if needed)
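    # NOTE: RMS here is computed on raw samples (int16 full scale 32767,
    # assuming 16-bit PCM output), so 50.0 is roughly 0.15% of full scale.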
SILENCE_THRESHOLD = 50.0
return rms < SILENCE_THRESHOLD
def process_benchmark_results(
all_results: List[Dict[str, Any]], token_sizes: List[int]
) -> Dict[str, Any]:
"""Process benchmark results and generate summary"""
summary = {}
for tokens in token_sizes:
matching_results = [
r for r in all_results if r["target_tokens"] == tokens and not r["error"]
]
if matching_results:
avg_first_chunk = sum(
r["time_to_first_chunk"] for r in matching_results
) / len(matching_results)
avg_total = sum(r["total_time"] for r in matching_results) / len(
matching_results
)
avg_audio_length = sum(r["audio_length"] for r in matching_results) / len(
matching_results
)
summary[tokens] = {
"avg_time_to_first_chunk": round(avg_first_chunk, 3),
"avg_total_time": round(avg_total, 3),
"avg_audio_length": round(avg_audio_length, 3),
"num_successful_runs": len(matching_results),
}
return summary
def save_benchmark_results(
all_results: List[Dict[str, Any]],
summary: Dict[str, Any],
output_data_dir: str,
output_plots_dir: str,
suffix: str,
plot_title_suffix: str,
prefix: str = "",
):
"""Save benchmark results and generate plots"""
# Save results
results_data = {
"individual_runs": all_results,
"summary": summary,
"timestamp": time.strftime("%Y-%m-%d %H:%M:%S"),
}
save_json_results(
results_data,
os.path.join(output_data_dir, f"{prefix}first_token_benchmark{suffix}.json"),
)
# Create DataFrame for plotting
df = pd.DataFrame(all_results)
# Create plots
plot_correlation(
df,
"target_tokens",
"time_to_first_chunk",
f"Time to First Audio vs Input Size {plot_title_suffix}",
"Number of Input Tokens",
"Time to First Audio (seconds)",
os.path.join(output_plots_dir, f"{prefix}first_token_latency{suffix}.png"),
)
plot_correlation(
df,
"target_tokens",
"total_time",
f"Total Time vs Input Size {plot_title_suffix}",
"Number of Input Tokens",
"Total Time (seconds)",
os.path.join(output_plots_dir, f"{prefix}total_time_latency{suffix}.png"),
)
plot_timeline(
df,
os.path.join(output_plots_dir, f"{prefix}first_token_timeline{suffix}.png"),
suffix=plot_title_suffix,
)
def run_benchmark(
measure_func: Callable,
output_dir: str,
output_data_dir: str,
output_plots_dir: str,
suffix: str = "",
plot_title_suffix: str = "",
num_runs: int = 5,
client=None,
prefix="",
):
"""Run benchmark with the given measurement function"""
# Create output directories
os.makedirs(output_dir, exist_ok=True)
os.makedirs(output_data_dir, exist_ok=True)
os.makedirs(output_plots_dir, exist_ok=True)
# Load sample text
script_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
with open(
os.path.join(script_dir, "the_time_machine_hg_wells.txt"), "r", encoding="utf-8"
) as f:
text = f.read()
# Test specific token counts
token_sizes = [10, 50, 100, 250, 500]
all_results = []
silent_files = []
for tokens in token_sizes:
print(
f"\nTesting {tokens} tokens{' ' + plot_title_suffix if plot_title_suffix else ''}"
)
test_text = get_text_for_tokens(text, tokens)
actual_tokens = len(enc.encode(test_text))
print(f"Text preview: {test_text[:50]}...")
for i in range(num_runs):
print(f"Run {i+1}/{num_runs}...")
result = measure_func(test_text, output_dir, tokens, i + 1)
result["target_tokens"] = tokens
result["actual_tokens"] = actual_tokens
result["run_number"] = i + 1
            # Handle time to first audio
            first_chunk = result.get("time_to_first_chunk")
            print(
                f"Time to First Audio: {f'{first_chunk:.3f}s' if first_chunk is not None else 'N/A'}"
            )

            # Handle total time
            total_time = result.get("total_time")
            print(
                f"Time to Save Complete: {f'{total_time:.3f}s' if total_time is not None else 'N/A'}"
            )

            # Handle audio length
            audio_length = result.get("audio_length")
            print(
                f"Audio length: {f'{audio_length:.3f}s' if audio_length is not None else 'N/A'}"
            )
# Calculate streaming overhead only if both values exist
if total_time is not None and first_chunk is not None:
print(f"Streaming overhead: {(total_time - first_chunk):.3f}s")
else:
print("Streaming overhead: N/A")
if result["error"]:
print(f"Error: {result['error']}")
elif result["audio_path"] and check_audio_silence(result["audio_path"]):
silent_files.append(result["audio_path"])
all_results.append(result)
# Process and save results
summary = process_benchmark_results(all_results, token_sizes)
save_benchmark_results(
all_results,
summary,
output_data_dir,
output_plots_dir,
suffix,
plot_title_suffix,
)
# Print paths
print("\nResults and plots saved to:")
print(f"- {os.path.join(output_data_dir, f'{prefix}first_token_benchmark{suffix}.json')}")
print(f"- {os.path.join(output_plots_dir, f'{prefix}first_token_latency{suffix}.png')}")
print(f"- {os.path.join(output_plots_dir, f'{prefix}total_time_latency{suffix}.png')}")
print(f"- {os.path.join(output_plots_dir, f'{prefix}first_token_timeline{suffix}.png')}")
# Print silence check summary
if silent_files:
print("\nWARNING: The following files contain only silence:")
for file in silent_files:
print(f"- {file}")
else:
print("\nAll generated audio files contain valid audio content.")

View file

@@ -1,111 +0,0 @@
{
"results": [
{
"tokens": 100,
"processing_time": 18.833295583724976,
"output_length": 31.15,
"realtime_factor": 1.6539856161403135,
"elapsed_time": 19.024322748184204
},
{
"tokens": 200,
"processing_time": 38.95506024360657,
"output_length": 62.6,
"realtime_factor": 1.6069799304257042,
"elapsed_time": 58.21527123451233
},
{
"tokens": 300,
"processing_time": 49.74252939224243,
"output_length": 96.325,
"realtime_factor": 1.9364716908630366,
"elapsed_time": 108.19673728942871
},
{
"tokens": 400,
"processing_time": 61.349056243896484,
"output_length": 128.575,
"realtime_factor": 2.095794261102292,
"elapsed_time": 169.733656167984
},
{
"tokens": 500,
"processing_time": 82.86568236351013,
"output_length": 158.575,
"realtime_factor": 1.9136389815071193,
"elapsed_time": 252.7968451976776
}
],
"system_metrics": [
{
"timestamp": "2025-01-03T00:13:49.865330",
"cpu_percent": 8.0,
"ram_percent": 39.4,
"ram_used_gb": 25.03811264038086,
"gpu_memory_used": 1204.0
},
{
"timestamp": "2025-01-03T00:14:08.781551",
"cpu_percent": 26.8,
"ram_percent": 42.6,
"ram_used_gb": 27.090862274169922,
"gpu_memory_used": 1225.0
},
{
"timestamp": "2025-01-03T00:14:08.916973",
"cpu_percent": 16.1,
"ram_percent": 42.6,
"ram_used_gb": 27.089553833007812,
"gpu_memory_used": 1225.0
},
{
"timestamp": "2025-01-03T00:14:47.979053",
"cpu_percent": 31.5,
"ram_percent": 43.6,
"ram_used_gb": 27.714427947998047,
"gpu_memory_used": 1225.0
},
{
"timestamp": "2025-01-03T00:14:48.098976",
"cpu_percent": 20.0,
"ram_percent": 43.6,
"ram_used_gb": 27.704315185546875,
"gpu_memory_used": 1211.0
},
{
"timestamp": "2025-01-03T00:15:37.944729",
"cpu_percent": 29.7,
"ram_percent": 38.6,
"ram_used_gb": 24.53925323486328,
"gpu_memory_used": 1217.0
},
{
"timestamp": "2025-01-03T00:15:38.071915",
"cpu_percent": 8.6,
"ram_percent": 38.5,
"ram_used_gb": 24.51690673828125,
"gpu_memory_used": 1208.0
},
{
"timestamp": "2025-01-03T00:16:39.525449",
"cpu_percent": 23.4,
"ram_percent": 38.8,
"ram_used_gb": 24.71230697631836,
"gpu_memory_used": 1221.0
},
{
"timestamp": "2025-01-03T00:16:39.612442",
"cpu_percent": 5.5,
"ram_percent": 38.9,
"ram_used_gb": 24.72066879272461,
"gpu_memory_used": 1221.0
},
{
"timestamp": "2025-01-03T00:18:02.569076",
"cpu_percent": 27.4,
"ram_percent": 39.1,
"ram_used_gb": 24.868202209472656,
"gpu_memory_used": 1264.0
}
]
}

View file

@@ -1,216 +0,0 @@
{
"results": [
{
"tokens": 100,
"processing_time": 14.349808931350708,
"output_length": 31.15,
"rtf": 0.46,
"elapsed_time": 14.716031074523926
},
{
"tokens": 200,
"processing_time": 28.341803312301636,
"output_length": 62.6,
"rtf": 0.45,
"elapsed_time": 43.44207406044006
},
{
"tokens": 300,
"processing_time": 43.352553606033325,
"output_length": 96.325,
"rtf": 0.45,
"elapsed_time": 87.26906609535217
},
{
"tokens": 400,
"processing_time": 71.02449822425842,
"output_length": 128.575,
"rtf": 0.55,
"elapsed_time": 158.7198133468628
},
{
"tokens": 500,
"processing_time": 70.92521691322327,
"output_length": 158.575,
"rtf": 0.45,
"elapsed_time": 230.01379895210266
},
{
"tokens": 600,
"processing_time": 83.6328592300415,
"output_length": 189.25,
"rtf": 0.44,
"elapsed_time": 314.02610969543457
},
{
"tokens": 700,
"processing_time": 103.0810194015503,
"output_length": 222.075,
"rtf": 0.46,
"elapsed_time": 417.5678551197052
},
{
"tokens": 800,
"processing_time": 127.02162909507751,
"output_length": 253.85,
"rtf": 0.5,
"elapsed_time": 545.0128681659698
},
{
"tokens": 900,
"processing_time": 130.49781227111816,
"output_length": 283.775,
"rtf": 0.46,
"elapsed_time": 675.8943417072296
},
{
"tokens": 1000,
"processing_time": 154.76425909996033,
"output_length": 315.475,
"rtf": 0.49,
"elapsed_time": 831.0677945613861
}
],
"system_metrics": [
{
"timestamp": "2025-01-03T00:23:52.896889",
"cpu_percent": 4.5,
"ram_percent": 39.1,
"ram_used_gb": 24.86032485961914,
"gpu_memory_used": 1281.0
},
{
"timestamp": "2025-01-03T00:24:07.429461",
"cpu_percent": 4.5,
"ram_percent": 39.1,
"ram_used_gb": 24.847564697265625,
"gpu_memory_used": 1285.0
},
{
"timestamp": "2025-01-03T00:24:07.620587",
"cpu_percent": 2.7,
"ram_percent": 39.1,
"ram_used_gb": 24.846607208251953,
"gpu_memory_used": 1275.0
},
{
"timestamp": "2025-01-03T00:24:36.140754",
"cpu_percent": 5.4,
"ram_percent": 39.1,
"ram_used_gb": 24.857810974121094,
"gpu_memory_used": 1267.0
},
{
"timestamp": "2025-01-03T00:24:36.340675",
"cpu_percent": 6.2,
"ram_percent": 39.1,
"ram_used_gb": 24.85773468017578,
"gpu_memory_used": 1267.0
},
{
"timestamp": "2025-01-03T00:25:19.905634",
"cpu_percent": 29.1,
"ram_percent": 39.2,
"ram_used_gb": 24.920318603515625,
"gpu_memory_used": 1256.0
},
{
"timestamp": "2025-01-03T00:25:20.182219",
"cpu_percent": 20.0,
"ram_percent": 39.2,
"ram_used_gb": 24.930198669433594,
"gpu_memory_used": 1256.0
},
{
"timestamp": "2025-01-03T00:26:31.414760",
"cpu_percent": 5.3,
"ram_percent": 39.5,
"ram_used_gb": 25.127891540527344,
"gpu_memory_used": 1259.0
},
{
"timestamp": "2025-01-03T00:26:31.617256",
"cpu_percent": 3.6,
"ram_percent": 39.5,
"ram_used_gb": 25.126346588134766,
"gpu_memory_used": 1252.0
},
{
"timestamp": "2025-01-03T00:27:42.736097",
"cpu_percent": 10.5,
"ram_percent": 39.5,
"ram_used_gb": 25.100231170654297,
"gpu_memory_used": 1249.0
},
{
"timestamp": "2025-01-03T00:27:42.912870",
"cpu_percent": 5.3,
"ram_percent": 39.5,
"ram_used_gb": 25.098285675048828,
"gpu_memory_used": 1249.0
},
{
"timestamp": "2025-01-03T00:29:06.725264",
"cpu_percent": 8.9,
"ram_percent": 39.5,
"ram_used_gb": 25.123123168945312,
"gpu_memory_used": 1239.0
},
{
"timestamp": "2025-01-03T00:29:06.928826",
"cpu_percent": 5.5,
"ram_percent": 39.5,
"ram_used_gb": 25.128646850585938,
"gpu_memory_used": 1239.0
},
{
"timestamp": "2025-01-03T00:30:50.206349",
"cpu_percent": 49.6,
"ram_percent": 39.6,
"ram_used_gb": 25.162948608398438,
"gpu_memory_used": 1245.0
},
{
"timestamp": "2025-01-03T00:30:50.491837",
"cpu_percent": 14.8,
"ram_percent": 39.5,
"ram_used_gb": 25.13379669189453,
"gpu_memory_used": 1245.0
},
{
"timestamp": "2025-01-03T00:32:57.721467",
"cpu_percent": 6.2,
"ram_percent": 39.6,
"ram_used_gb": 25.187721252441406,
"gpu_memory_used": 1384.0
},
{
"timestamp": "2025-01-03T00:32:57.913350",
"cpu_percent": 3.6,
"ram_percent": 39.6,
"ram_used_gb": 25.199390411376953,
"gpu_memory_used": 1384.0
},
{
"timestamp": "2025-01-03T00:35:08.608730",
"cpu_percent": 6.3,
"ram_percent": 39.8,
"ram_used_gb": 25.311710357666016,
"gpu_memory_used": 1330.0
},
{
"timestamp": "2025-01-03T00:35:08.791851",
"cpu_percent": 5.3,
"ram_percent": 39.8,
"ram_used_gb": 25.326683044433594,
"gpu_memory_used": 1333.0
},
{
"timestamp": "2025-01-03T00:37:43.782406",
"cpu_percent": 6.8,
"ram_percent": 40.6,
"ram_used_gb": 25.803058624267578,
"gpu_memory_used": 1409.0
}
]
}

View file

@@ -1,300 +0,0 @@
{
"results": [
{
"tokens": 100,
"processing_time": 0.96,
"output_length": 31.1,
"rtf": 0.03,
"elapsed_time": 1.11
},
{
"tokens": 250,
"processing_time": 2.23,
"output_length": 77.17,
"rtf": 0.03,
"elapsed_time": 3.49
},
{
"tokens": 400,
"processing_time": 4.05,
"output_length": 128.05,
"rtf": 0.03,
"elapsed_time": 7.77
},
{
"tokens": 550,
"processing_time": 4.06,
"output_length": 171.45,
"rtf": 0.02,
"elapsed_time": 12.0
},
{
"tokens": 700,
"processing_time": 6.01,
"output_length": 221.6,
"rtf": 0.03,
"elapsed_time": 18.16
},
{
"tokens": 850,
"processing_time": 6.9,
"output_length": 269.1,
"rtf": 0.03,
"elapsed_time": 25.21
},
{
"tokens": 1000,
"processing_time": 7.65,
"output_length": 315.05,
"rtf": 0.02,
"elapsed_time": 33.03
},
{
"tokens": 6000,
"processing_time": 48.7,
"output_length": 1837.1,
"rtf": 0.03,
"elapsed_time": 82.21
},
{
"tokens": 11000,
"processing_time": 92.44,
"output_length": 3388.57,
"rtf": 0.03,
"elapsed_time": 175.46
},
{
"tokens": 16000,
"processing_time": 163.61,
"output_length": 4977.32,
"rtf": 0.03,
"elapsed_time": 340.46
},
{
"tokens": 21000,
"processing_time": 209.72,
"output_length": 6533.3,
"rtf": 0.03,
"elapsed_time": 551.92
},
{
"tokens": 26000,
"processing_time": 329.35,
"output_length": 8068.15,
"rtf": 0.04,
"elapsed_time": 883.37
},
{
"tokens": 31000,
"processing_time": 473.52,
"output_length": 9611.48,
"rtf": 0.05,
"elapsed_time": 1359.28
},
{
"tokens": 36000,
"processing_time": 650.98,
"output_length": 11157.15,
"rtf": 0.06,
"elapsed_time": 2012.9
}
],
"system_metrics": [
{
"timestamp": "2025-01-03T14:41:01.331735",
"cpu_percent": 7.5,
"ram_percent": 50.2,
"ram_used_gb": 31.960269927978516,
"gpu_memory_used": 3191.0
},
{
"timestamp": "2025-01-03T14:41:02.357116",
"cpu_percent": 17.01,
"ram_percent": 50.2,
"ram_used_gb": 31.96163558959961,
"gpu_memory_used": 3426.0
},
{
"timestamp": "2025-01-03T14:41:02.445009",
"cpu_percent": 9.5,
"ram_percent": 50.3,
"ram_used_gb": 31.966781616210938,
"gpu_memory_used": 3426.0
},
{
"timestamp": "2025-01-03T14:41:04.742152",
"cpu_percent": 18.27,
"ram_percent": 50.4,
"ram_used_gb": 32.08788299560547,
"gpu_memory_used": 3642.0
},
{
"timestamp": "2025-01-03T14:41:04.847795",
"cpu_percent": 16.27,
"ram_percent": 50.5,
"ram_used_gb": 32.094364166259766,
"gpu_memory_used": 3640.0
},
{
"timestamp": "2025-01-03T14:41:09.019590",
"cpu_percent": 15.97,
"ram_percent": 50.7,
"ram_used_gb": 32.23244094848633,
"gpu_memory_used": 3640.0
},
{
"timestamp": "2025-01-03T14:41:09.110324",
"cpu_percent": 3.54,
"ram_percent": 50.7,
"ram_used_gb": 32.234458923339844,
"gpu_memory_used": 3640.0
},
{
"timestamp": "2025-01-03T14:41:13.252607",
"cpu_percent": 13.4,
"ram_percent": 50.6,
"ram_used_gb": 32.194271087646484,
"gpu_memory_used": 3935.0
},
{
"timestamp": "2025-01-03T14:41:13.327557",
"cpu_percent": 4.69,
"ram_percent": 50.6,
"ram_used_gb": 32.191776275634766,
"gpu_memory_used": 3935.0
},
{
"timestamp": "2025-01-03T14:41:19.413633",
"cpu_percent": 12.92,
"ram_percent": 50.9,
"ram_used_gb": 32.3467903137207,
"gpu_memory_used": 4250.0
},
{
"timestamp": "2025-01-03T14:41:19.492758",
"cpu_percent": 7.5,
"ram_percent": 50.8,
"ram_used_gb": 32.34375,
"gpu_memory_used": 4250.0
},
{
"timestamp": "2025-01-03T14:41:26.467284",
"cpu_percent": 13.09,
"ram_percent": 51.2,
"ram_used_gb": 32.56281280517578,
"gpu_memory_used": 4249.0
},
{
"timestamp": "2025-01-03T14:41:26.553559",
"cpu_percent": 8.39,
"ram_percent": 51.2,
"ram_used_gb": 32.56183624267578,
"gpu_memory_used": 4249.0
},
{
"timestamp": "2025-01-03T14:41:34.284362",
"cpu_percent": 12.61,
"ram_percent": 51.7,
"ram_used_gb": 32.874778747558594,
"gpu_memory_used": 4250.0
},
{
"timestamp": "2025-01-03T14:41:34.362353",
"cpu_percent": 1.25,
"ram_percent": 51.7,
"ram_used_gb": 32.87461471557617,
"gpu_memory_used": 4250.0
},
{
"timestamp": "2025-01-03T14:42:23.471312",
"cpu_percent": 11.64,
"ram_percent": 54.9,
"ram_used_gb": 34.90264129638672,
"gpu_memory_used": 4647.0
},
{
"timestamp": "2025-01-03T14:42:23.547203",
"cpu_percent": 5.31,
"ram_percent": 54.9,
"ram_used_gb": 34.91563415527344,
"gpu_memory_used": 4647.0
},
{
"timestamp": "2025-01-03T14:43:56.724933",
"cpu_percent": 12.97,
"ram_percent": 59.5,
"ram_used_gb": 37.84241485595703,
"gpu_memory_used": 4655.0
},
{
"timestamp": "2025-01-03T14:43:56.815453",
"cpu_percent": 11.75,
"ram_percent": 59.5,
"ram_used_gb": 37.832679748535156,
"gpu_memory_used": 4655.0
},
{
"timestamp": "2025-01-03T14:46:41.705155",
"cpu_percent": 12.94,
"ram_percent": 66.3,
"ram_used_gb": 42.1534538269043,
"gpu_memory_used": 4729.0
},
{
"timestamp": "2025-01-03T14:46:41.835177",
"cpu_percent": 7.73,
"ram_percent": 66.2,
"ram_used_gb": 42.13554000854492,
"gpu_memory_used": 4729.0
},
{
"timestamp": "2025-01-03T14:50:13.166236",
"cpu_percent": 11.62,
"ram_percent": 73.4,
"ram_used_gb": 46.71288299560547,
"gpu_memory_used": 4676.0
},
{
"timestamp": "2025-01-03T14:50:13.261611",
"cpu_percent": 8.16,
"ram_percent": 73.4,
"ram_used_gb": 46.71356201171875,
"gpu_memory_used": 4676.0
},
{
"timestamp": "2025-01-03T14:55:44.623607",
"cpu_percent": 12.92,
"ram_percent": 82.8,
"ram_used_gb": 52.65533447265625,
"gpu_memory_used": 4636.0
},
{
"timestamp": "2025-01-03T14:55:44.735410",
"cpu_percent": 15.29,
"ram_percent": 82.7,
"ram_used_gb": 52.63290786743164,
"gpu_memory_used": 4636.0
},
{
"timestamp": "2025-01-03T15:03:40.534449",
"cpu_percent": 13.88,
"ram_percent": 85.0,
"ram_used_gb": 54.050071716308594,
"gpu_memory_used": 4771.0
},
{
"timestamp": "2025-01-03T15:03:40.638708",
"cpu_percent": 12.21,
"ram_percent": 85.0,
"ram_used_gb": 54.053733825683594,
"gpu_memory_used": 4771.0
},
{
"timestamp": "2025-01-03T15:14:34.159142",
"cpu_percent": 14.51,
"ram_percent": 78.1,
"ram_used_gb": 49.70396423339844,
"gpu_memory_used": 4739.0
}
]
}

View file

@@ -1,19 +0,0 @@
=== Benchmark Statistics (with correct RTF) ===
Overall Stats:
Total tokens processed: 5500
Total audio generated: 1741.65s
Total test duration: 831.07s
Average processing rate: 6.72 tokens/second
Average RTF: 0.47x
Per-chunk Stats:
Average chunk size: 550.00 tokens
Min chunk size: 100.00 tokens
Max chunk size: 1000.00 tokens
Average processing time: 82.70s
Average output length: 174.17s
Performance Ranges:
Processing rate range: 5.63 - 7.17 tokens/second
RTF range: 0.44x - 0.55x

View file

@@ -1,9 +0,0 @@
=== Benchmark Statistics (with correct RTF) ===
Overall Stats:
Total tokens processed: 150850
Total audio generated: 46786.59s
Total test duration: 2012.90s
Average processing rate: 104.34 tokens/second
Average RTF: 0.03x

View file

@@ -1,23 +0,0 @@
=== Benchmark Statistics (with correct RTF) ===
Total tokens processed: 1800
Total audio generated (s): 568.53
Total test duration (s): 244.10
Average processing rate (tokens/s): 7.34
Average RTF: 0.43
Average Real Time Speed: 2.33
=== Per-chunk Stats ===
Average chunk size (tokens): 600.00
Min chunk size (tokens): 300
Max chunk size (tokens): 900
Average processing time (s): 81.30
Average output length (s): 189.51
=== Performance Ranges ===
Processing rate range (tokens/s): 7.21 - 7.47
RTF range: 0.43x - 0.43x
Real Time Speed range: 2.33x - 2.33x

View file

@@ -1,403 +0,0 @@
{
"individual_runs": [
{
"text_length": 37,
"token_count": 10,
"total_time": 0.16574740409851074,
"time_to_first_chunk": 0.16574740409851074,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio\\benchmark_tokens10_run1.wav",
"audio_length": 3.45,
"target_tokens": 10,
"actual_tokens": 10,
"run_number": 1
},
{
"text_length": 37,
"token_count": 10,
"total_time": 0.18812799453735352,
"time_to_first_chunk": 0.18812799453735352,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio\\benchmark_tokens10_run2.wav",
"audio_length": 3.45,
"target_tokens": 10,
"actual_tokens": 10,
"run_number": 2
},
{
"text_length": 37,
"token_count": 10,
"total_time": 0.18645429611206055,
"time_to_first_chunk": 0.18645429611206055,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio\\benchmark_tokens10_run3.wav",
"audio_length": 3.45,
"target_tokens": 10,
"actual_tokens": 10,
"run_number": 3
},
{
"text_length": 37,
"token_count": 10,
"total_time": 0.17632031440734863,
"time_to_first_chunk": 0.17632031440734863,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio\\benchmark_tokens10_run4.wav",
"audio_length": 3.45,
"target_tokens": 10,
"actual_tokens": 10,
"run_number": 4
},
{
"text_length": 37,
"token_count": 10,
"total_time": 0.13381195068359375,
"time_to_first_chunk": 0.13381195068359375,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio\\benchmark_tokens10_run5.wav",
"audio_length": 3.45,
"target_tokens": 10,
"actual_tokens": 10,
"run_number": 5
},
{
"text_length": 102,
"token_count": 25,
"total_time": 0.2086498737335205,
"time_to_first_chunk": 0.2086498737335205,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio\\benchmark_tokens25_run1.wav",
"audio_length": 7.225,
"target_tokens": 25,
"actual_tokens": 25,
"run_number": 1
},
{
"text_length": 102,
"token_count": 25,
"total_time": 0.2727653980255127,
"time_to_first_chunk": 0.2727653980255127,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio\\benchmark_tokens25_run2.wav",
"audio_length": 7.225,
"target_tokens": 25,
"actual_tokens": 25,
"run_number": 2
},
{
"text_length": 102,
"token_count": 25,
"total_time": 0.2096250057220459,
"time_to_first_chunk": 0.2096250057220459,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio\\benchmark_tokens25_run3.wav",
"audio_length": 7.225,
"target_tokens": 25,
"actual_tokens": 25,
"run_number": 3
},
{
"text_length": 102,
"token_count": 25,
"total_time": 0.2256758213043213,
"time_to_first_chunk": 0.2256758213043213,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio\\benchmark_tokens25_run4.wav",
"audio_length": 7.225,
"target_tokens": 25,
"actual_tokens": 25,
"run_number": 4
},
{
"text_length": 102,
"token_count": 25,
"total_time": 0.1945042610168457,
"time_to_first_chunk": 0.1945042610168457,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio\\benchmark_tokens25_run5.wav",
"audio_length": 7.225,
"target_tokens": 25,
"actual_tokens": 25,
"run_number": 5
},
{
"text_length": 212,
"token_count": 50,
"total_time": 0.4975121021270752,
"time_to_first_chunk": 0.4975121021270752,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio\\benchmark_tokens50_run1.wav",
"audio_length": 16.325,
"target_tokens": 50,
"actual_tokens": 50,
"run_number": 1
},
{
"text_length": 212,
"token_count": 50,
"total_time": 0.4518404006958008,
"time_to_first_chunk": 0.4518404006958008,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio\\benchmark_tokens50_run2.wav",
"audio_length": 16.325,
"target_tokens": 50,
"actual_tokens": 50,
"run_number": 2
},
{
"text_length": 212,
"token_count": 50,
"total_time": 0.5640325546264648,
"time_to_first_chunk": 0.5640325546264648,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio\\benchmark_tokens50_run3.wav",
"audio_length": 16.325,
"target_tokens": 50,
"actual_tokens": 50,
"run_number": 3
},
{
"text_length": 212,
"token_count": 50,
"total_time": 0.5305957794189453,
"time_to_first_chunk": 0.5305957794189453,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio\\benchmark_tokens50_run4.wav",
"audio_length": 16.325,
"target_tokens": 50,
"actual_tokens": 50,
"run_number": 4
},
{
"text_length": 212,
"token_count": 50,
"total_time": 0.5540030002593994,
"time_to_first_chunk": 0.5540030002593994,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio\\benchmark_tokens50_run5.wav",
"audio_length": 16.325,
"target_tokens": 50,
"actual_tokens": 50,
"run_number": 5
},
{
"text_length": 448,
"token_count": 100,
"total_time": 0.7963137626647949,
"time_to_first_chunk": 0.7963137626647949,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio\\benchmark_tokens100_run1.wav",
"audio_length": 31.1,
"target_tokens": 100,
"actual_tokens": 100,
"run_number": 1
},
{
"text_length": 448,
"token_count": 100,
"total_time": 0.9320805072784424,
"time_to_first_chunk": 0.9320805072784424,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio\\benchmark_tokens100_run2.wav",
"audio_length": 31.1,
"target_tokens": 100,
"actual_tokens": 100,
"run_number": 2
},
{
"text_length": 448,
"token_count": 100,
"total_time": 0.824256181716919,
"time_to_first_chunk": 0.824256181716919,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio\\benchmark_tokens100_run3.wav",
"audio_length": 31.1,
"target_tokens": 100,
"actual_tokens": 100,
"run_number": 3
},
{
"text_length": 448,
"token_count": 100,
"total_time": 0.9034836292266846,
"time_to_first_chunk": 0.9034836292266846,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio\\benchmark_tokens100_run4.wav",
"audio_length": 31.1,
"target_tokens": 100,
"actual_tokens": 100,
"run_number": 4
},
{
"text_length": 448,
"token_count": 100,
"total_time": 0.8364357948303223,
"time_to_first_chunk": 0.8364357948303223,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio\\benchmark_tokens100_run5.wav",
"audio_length": 31.1,
"target_tokens": 100,
"actual_tokens": 100,
"run_number": 5
},
{
"text_length": 906,
"token_count": 200,
"total_time": 1.8122682571411133,
"time_to_first_chunk": 1.8122682571411133,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio\\benchmark_tokens200_run1.wav",
"audio_length": 62.625,
"target_tokens": 200,
"actual_tokens": 200,
"run_number": 1
},
{
"text_length": 906,
"token_count": 200,
"total_time": 1.7290427684783936,
"time_to_first_chunk": 1.7290427684783936,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio\\benchmark_tokens200_run2.wav",
"audio_length": 62.625,
"target_tokens": 200,
"actual_tokens": 200,
"run_number": 2
},
{
"text_length": 906,
"token_count": 200,
"total_time": 2.141728401184082,
"time_to_first_chunk": 2.141728401184082,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio\\benchmark_tokens200_run3.wav",
"audio_length": 62.625,
"target_tokens": 200,
"actual_tokens": 200,
"run_number": 3
},
{
"text_length": 906,
"token_count": 200,
"total_time": 2.0155680179595947,
"time_to_first_chunk": 2.0155680179595947,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio\\benchmark_tokens200_run4.wav",
"audio_length": 62.625,
"target_tokens": 200,
"actual_tokens": 200,
"run_number": 4
},
{
"text_length": 906,
"token_count": 200,
"total_time": 1.8707575798034668,
"time_to_first_chunk": 1.8707575798034668,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio\\benchmark_tokens200_run5.wav",
"audio_length": 62.625,
"target_tokens": 200,
"actual_tokens": 200,
"run_number": 5
},
{
"text_length": 2232,
"token_count": 500,
"total_time": 4.822713851928711,
"time_to_first_chunk": 4.822713851928711,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio\\benchmark_tokens500_run1.wav",
"audio_length": 157.875,
"target_tokens": 500,
"actual_tokens": 500,
"run_number": 1
},
{
"text_length": 2232,
"token_count": 500,
"total_time": 4.227782726287842,
"time_to_first_chunk": 4.227782726287842,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio\\benchmark_tokens500_run2.wav",
"audio_length": 157.875,
"target_tokens": 500,
"actual_tokens": 500,
"run_number": 2
},
{
"text_length": 2232,
"token_count": 500,
"total_time": 4.414916276931763,
"time_to_first_chunk": 4.414916276931763,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio\\benchmark_tokens500_run3.wav",
"audio_length": 157.875,
"target_tokens": 500,
"actual_tokens": 500,
"run_number": 3
},
{
"text_length": 2232,
"token_count": 500,
"total_time": 4.579505681991577,
"time_to_first_chunk": 4.579505681991577,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio\\benchmark_tokens500_run4.wav",
"audio_length": 157.875,
"target_tokens": 500,
"actual_tokens": 500,
"run_number": 4
},
{
"text_length": 2232,
"token_count": 500,
"total_time": 4.332529067993164,
"time_to_first_chunk": 4.332529067993164,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio\\benchmark_tokens500_run5.wav",
"audio_length": 157.875,
"target_tokens": 500,
"actual_tokens": 500,
"run_number": 5
}
],
"summary": {
"10": {
"avg_time_to_first_chunk": 0.17,
"avg_total_time": 0.17,
"avg_audio_length": 3.45,
"num_successful_runs": 5
},
"25": {
"avg_time_to_first_chunk": 0.222,
"avg_total_time": 0.222,
"avg_audio_length": 7.225,
"num_successful_runs": 5
},
"50": {
"avg_time_to_first_chunk": 0.52,
"avg_total_time": 0.52,
"avg_audio_length": 16.325,
"num_successful_runs": 5
},
"100": {
"avg_time_to_first_chunk": 0.859,
"avg_total_time": 0.859,
"avg_audio_length": 31.1,
"num_successful_runs": 5
},
"200": {
"avg_time_to_first_chunk": 1.914,
"avg_total_time": 1.914,
"avg_audio_length": 62.625,
"num_successful_runs": 5
},
"500": {
"avg_time_to_first_chunk": 4.475,
"avg_total_time": 4.475,
"avg_audio_length": 157.875,
"num_successful_runs": 5
}
},
"timestamp": "2025-01-04 13:52:28"
}

View file

@ -1,271 +1,337 @@
{ {
"individual_runs": [ "individual_runs": [
{ {
"text_length": 212, "text_length": 37,
"token_count": 50, "token_count": null,
"total_time": 0.7278211116790771, "total_time": 0.4376556873321533,
"time_to_first_chunk": 0.3613290786743164, "time_to_first_chunk": 0.4189143180847168,
"error": null, "error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens50_run1_stream.wav", "audio_path": "C:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens10_run1_stream.wav",
"audio_length": 16.325, "audio_length": 3.45,
"target_tokens": 10,
"actual_tokens": 10,
"run_number": 1
},
{
"text_length": 37,
"token_count": null,
"total_time": 0.37163758277893066,
"time_to_first_chunk": 0.34892702102661133,
"error": null,
"audio_path": "C:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens10_run2_stream.wav",
"audio_length": 3.45,
"target_tokens": 10,
"actual_tokens": 10,
"run_number": 2
},
{
"text_length": 37,
"token_count": null,
"total_time": 0.2654602527618408,
"time_to_first_chunk": 0.2409076690673828,
"error": null,
"audio_path": "C:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens10_run3_stream.wav",
"audio_length": 3.45,
"target_tokens": 10,
"actual_tokens": 10,
"run_number": 3
},
{
"text_length": 37,
"token_count": null,
"total_time": 0.24376440048217773,
"time_to_first_chunk": 0.23003816604614258,
"error": null,
"audio_path": "C:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens10_run4_stream.wav",
"audio_length": 3.45,
"target_tokens": 10,
"actual_tokens": 10,
"run_number": 4
},
{
"text_length": 37,
"token_count": null,
"total_time": 0.25968003273010254,
"time_to_first_chunk": 0.24081206321716309,
"error": null,
"audio_path": "C:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens10_run5_stream.wav",
"audio_length": 3.45,
"target_tokens": 10,
"actual_tokens": 10,
"run_number": 5
},
{
"text_length": 212,
"token_count": null,
"total_time": 1.049060344696045,
"time_to_first_chunk": 0.3336215019226074,
"error": null,
"audio_path": "C:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens50_run1_stream.wav",
"audio_length": 15.925,
"target_tokens": 50, "target_tokens": 50,
"actual_tokens": 50, "actual_tokens": 50,
"run_number": 1 "run_number": 1
}, },
{ {
"text_length": 212, "text_length": 212,
"token_count": 50, "token_count": null,
"total_time": 0.4556088447570801, "total_time": 0.8934676647186279,
"time_to_first_chunk": 0.18642044067382812, "time_to_first_chunk": 0.3011031150817871,
"error": null, "error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens50_run2_stream.wav", "audio_path": "C:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens50_run2_stream.wav",
"audio_length": 16.325, "audio_length": 15.925,
"target_tokens": 50, "target_tokens": 50,
"actual_tokens": 50, "actual_tokens": 50,
"run_number": 2 "run_number": 2
}, },
{ {
"text_length": 212, "text_length": 212,
"token_count": 50, "token_count": null,
"total_time": 0.5538768768310547, "total_time": 0.9444286823272705,
"time_to_first_chunk": 0.2720797061920166, "time_to_first_chunk": 0.3198091983795166,
"error": null, "error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens50_run3_stream.wav", "audio_path": "C:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens50_run3_stream.wav",
"audio_length": 16.325, "audio_length": 15.925,
"target_tokens": 50, "target_tokens": 50,
"actual_tokens": 50, "actual_tokens": 50,
"run_number": 3 "run_number": 3
}, },
{ {
"text_length": 212, "text_length": 212,
"token_count": 50, "token_count": null,
"total_time": 0.4395604133605957, "total_time": 0.9735183715820312,
"time_to_first_chunk": 0.15613913536071777, "time_to_first_chunk": 0.369948148727417,
"error": null, "error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens50_run4_stream.wav", "audio_path": "C:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens50_run4_stream.wav",
"audio_length": 16.325, "audio_length": 15.925,
"target_tokens": 50, "target_tokens": 50,
"actual_tokens": 50, "actual_tokens": 50,
"run_number": 4 "run_number": 4
}, },
{ {
"text_length": 212, "text_length": 212,
"token_count": 50, "token_count": null,
"total_time": 0.45748305320739746, "total_time": 0.8089118003845215,
"time_to_first_chunk": 0.18805718421936035, "time_to_first_chunk": 0.30179858207702637,
"error": null, "error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens50_run5_stream.wav", "audio_path": "C:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens50_run5_stream.wav",
"audio_length": 16.325, "audio_length": 15.925,
"target_tokens": 50, "target_tokens": 50,
"actual_tokens": 50, "actual_tokens": 50,
"run_number": 5 "run_number": 5
}, },
{ {
"text_length": 448, "text_length": 448,
"token_count": 100, "token_count": null,
"total_time": 0.7347762584686279, "total_time": 1.641003131866455,
"time_to_first_chunk": 0.16963744163513184, "time_to_first_chunk": 0.2979745864868164,
"error": null, "error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens100_run1_stream.wav", "audio_path": "C:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens100_run1_stream.wav",
"audio_length": 31.1, "audio_length": 30.5,
"target_tokens": 100, "target_tokens": 100,
"actual_tokens": 100, "actual_tokens": 100,
"run_number": 1 "run_number": 1
}, },
{ {
"text_length": 448, "text_length": 448,
"token_count": 100, "token_count": null,
"total_time": 0.8288509845733643, "total_time": 1.3709619045257568,
"time_to_first_chunk": 0.20123004913330078, "time_to_first_chunk": 0.4272146224975586,
"error": null, "error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens100_run2_stream.wav", "audio_path": "C:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens100_run2_stream.wav",
"audio_length": 31.1, "audio_length": 30.5,
"target_tokens": 100, "target_tokens": 100,
"actual_tokens": 100, "actual_tokens": 100,
"run_number": 2 "run_number": 2
}, },
{ {
"text_length": 448, "text_length": 448,
"token_count": 100, "token_count": null,
"total_time": 0.7503848075866699, "total_time": 1.2554471492767334,
"time_to_first_chunk": 0.21662068367004395, "time_to_first_chunk": 0.29790568351745605,
"error": null, "error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens100_run3_stream.wav", "audio_path": "C:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens100_run3_stream.wav",
"audio_length": 31.1, "audio_length": 30.5,
"target_tokens": 100, "target_tokens": 100,
"actual_tokens": 100, "actual_tokens": 100,
"run_number": 3 "run_number": 3
}, },
{ {
"text_length": 448, "text_length": 448,
"token_count": 100, "token_count": null,
"total_time": 0.694899320602417, "total_time": 1.3761844635009766,
"time_to_first_chunk": 0.1966841220855713, "time_to_first_chunk": 0.32633328437805176,
"error": null, "error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens100_run4_stream.wav", "audio_path": "C:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens100_run4_stream.wav",
"audio_length": 31.1, "audio_length": 30.5,
"target_tokens": 100, "target_tokens": 100,
"actual_tokens": 100, "actual_tokens": 100,
"run_number": 4 "run_number": 4
}, },
{ {
"text_length": 448, "text_length": 448,
"token_count": 100, "token_count": null,
"total_time": 0.68701171875, "total_time": 1.56705904006958,
"time_to_first_chunk": 0.19341063499450684, "time_to_first_chunk": 0.32801246643066406,
"error": null, "error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens100_run5_stream.wav", "audio_path": "C:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens100_run5_stream.wav",
"audio_length": 31.1, "audio_length": 30.5,
"target_tokens": 100, "target_tokens": 100,
"actual_tokens": 100, "actual_tokens": 100,
"run_number": 5 "run_number": 5
}, },
{ {
"text_length": 906, "text_length": 1140,
"token_count": 200, "token_count": null,
"total_time": 1.6845426559448242, "total_time": 5.086699962615967,
"time_to_first_chunk": 0.21096158027648926, "time_to_first_chunk": 0.33925390243530273,
"error": null, "error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens200_run1_stream.wav", "audio_path": "C:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens250_run1_stream.wav",
"audio_length": 62.625, "audio_length": 78.775,
"target_tokens": 200, "target_tokens": 250,
"actual_tokens": 200, "actual_tokens": 250,
"run_number": 1 "run_number": 1
}, },
{ {
"text_length": 906, "text_length": 1140,
"token_count": 200, "token_count": null,
"total_time": 1.3545098304748535, "total_time": 3.827953338623047,
"time_to_first_chunk": 0.18648386001586914, "time_to_first_chunk": 0.39266157150268555,
"error": null, "error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens200_run2_stream.wav", "audio_path": "C:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens250_run2_stream.wav",
"audio_length": 62.625, "audio_length": 78.775,
"target_tokens": 200, "target_tokens": 250,
"actual_tokens": 200, "actual_tokens": 250,
"run_number": 2 "run_number": 2
}, },
{ {
"text_length": 906, "text_length": 1140,
"token_count": 200, "token_count": null,
"total_time": 1.426060676574707, "total_time": 3.9389824867248535,
"time_to_first_chunk": 0.20081472396850586, "time_to_first_chunk": 0.3231511116027832,
"error": null, "error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens200_run3_stream.wav", "audio_path": "C:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens250_run3_stream.wav",
"audio_length": 62.625, "audio_length": 78.775,
"target_tokens": 200, "target_tokens": 250,
"actual_tokens": 200, "actual_tokens": 250,
"run_number": 3 "run_number": 3
}, },
{ {
"text_length": 906, "text_length": 1140,
"token_count": 200, "token_count": null,
"total_time": 1.4084081649780273, "total_time": 3.942399740219116,
"time_to_first_chunk": 0.18551135063171387, "time_to_first_chunk": 0.34731340408325195,
"error": null, "error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens200_run4_stream.wav", "audio_path": "C:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens250_run4_stream.wav",
"audio_length": 62.625, "audio_length": 78.775,
"target_tokens": 200, "target_tokens": 250,
"actual_tokens": 200, "actual_tokens": 250,
"run_number": 4 "run_number": 4
}, },
{ {
"text_length": 906, "text_length": 1140,
"token_count": 200, "token_count": null,
"total_time": 1.4703152179718018, "total_time": 3.7748308181762695,
"time_to_first_chunk": 0.17750859260559082, "time_to_first_chunk": 0.40787601470947266,
"error": null, "error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens200_run5_stream.wav", "audio_path": "C:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens250_run5_stream.wav",
"audio_length": 62.625, "audio_length": 78.775,
"target_tokens": 200, "target_tokens": 250,
"actual_tokens": 200, "actual_tokens": 250,
"run_number": 5 "run_number": 5
}, },
{ {
"text_length": 2232, "text_length": 2232,
"token_count": 500, "token_count": null,
"total_time": 4.289574384689331, "total_time": 9.003147840499878,
"time_to_first_chunk": 0.1997976303100586, "time_to_first_chunk": 0.5455703735351562,
"error": null, "error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens500_run1_stream.wav", "audio_path": "C:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens500_run1_stream.wav",
"audio_length": 157.875, "audio_length": 156.475,
"target_tokens": 500, "target_tokens": 500,
"actual_tokens": 500, "actual_tokens": 500,
"run_number": 1 "run_number": 1
}, },
{ {
"text_length": 2232, "text_length": 2232,
"token_count": 500, "token_count": null,
"total_time": 3.7089381217956543, "total_time": 10.081491231918335,
"time_to_first_chunk": 0.25969815254211426, "time_to_first_chunk": 0.4591703414916992,
"error": null, "error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens500_run2_stream.wav", "audio_path": "C:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens500_run2_stream.wav",
"audio_length": 157.875, "audio_length": 156.475,
"target_tokens": 500, "target_tokens": 500,
"actual_tokens": 500, "actual_tokens": 500,
"run_number": 2 "run_number": 2
}, },
{ {
"text_length": 2232, "text_length": 2232,
"token_count": 500, "token_count": null,
"total_time": 4.138366222381592, "total_time": 9.767668962478638,
"time_to_first_chunk": 0.1831505298614502, "time_to_first_chunk": 0.31237053871154785,
"error": null, "error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens500_run3_stream.wav", "audio_path": "C:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens500_run3_stream.wav",
"audio_length": 157.875, "audio_length": 156.475,
"target_tokens": 500, "target_tokens": 500,
"actual_tokens": 500, "actual_tokens": 500,
"run_number": 3 "run_number": 3
}, },
{ {
"text_length": 2232, "text_length": 2232,
"token_count": 500, "token_count": null,
"total_time": 3.980635643005371, "total_time": 9.090342998504639,
"time_to_first_chunk": 0.20493030548095703, "time_to_first_chunk": 0.41753244400024414,
"error": null, "error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens500_run4_stream.wav", "audio_path": "C:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens500_run4_stream.wav",
"audio_length": 157.875, "audio_length": 156.475,
"target_tokens": 500, "target_tokens": 500,
"actual_tokens": 500, "actual_tokens": 500,
"run_number": 4 "run_number": 4
}, },
{ {
"text_length": 2232, "text_length": 2232,
"token_count": 500, "token_count": null,
"total_time": 4.1370298862457275, "total_time": 9.876578330993652,
"time_to_first_chunk": 0.19150757789611816, "time_to_first_chunk": 0.3965120315551758,
"error": null, "error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens500_run5_stream.wav", "audio_path": "C:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens500_run5_stream.wav",
"audio_length": 157.875, "audio_length": 156.475,
"target_tokens": 500, "target_tokens": 500,
"actual_tokens": 500, "actual_tokens": 500,
"run_number": 5 "run_number": 5
} }
], ],
"summary": { "summary": {
"10": {
"avg_time_to_first_chunk": 0.296,
"avg_total_time": 0.316,
"avg_audio_length": 3.45,
"num_successful_runs": 5
},
"50": { "50": {
"avg_time_to_first_chunk": 0.233, "avg_time_to_first_chunk": 0.325,
"avg_total_time": 0.527, "avg_total_time": 0.934,
"avg_audio_length": 16.325, "avg_audio_length": 15.925,
"num_successful_runs": 5 "num_successful_runs": 5
}, },
"100": { "100": {
"avg_time_to_first_chunk": 0.196, "avg_time_to_first_chunk": 0.335,
"avg_total_time": 0.739, "avg_total_time": 1.442,
"avg_audio_length": 31.1, "avg_audio_length": 30.5,
"num_successful_runs": 5 "num_successful_runs": 5
}, },
"200": { "250": {
"avg_time_to_first_chunk": 0.192, "avg_time_to_first_chunk": 0.362,
"avg_total_time": 1.469, "avg_total_time": 4.114,
"avg_audio_length": 62.625, "avg_audio_length": 78.775,
"num_successful_runs": 5 "num_successful_runs": 5
}, },
"500": { "500": {
"avg_time_to_first_chunk": 0.208, "avg_time_to_first_chunk": 0.426,
"avg_total_time": 4.051, "avg_total_time": 9.564,
"avg_audio_length": 157.875, "avg_audio_length": 156.475,
"num_successful_runs": 5 "num_successful_runs": 5
} }
}, },
"timestamp": "2025-01-04 22:16:30" "timestamp": "2025-01-06 00:00:43"
} }
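For readers skimming these results: each summary block is simply the mean over the five runs in the corresponding token bucket. A minimal sketch of how the summary could be recomputed from the raw runs (the filename is hypothetical, and rounding to three decimals is an assumption):

```
import json
from collections import defaultdict

# Recompute the per-bucket averages from the individual runs.
# Assumes the benchmark JSON layout shown above; filename is hypothetical.
with open("benchmark_results_stream.json") as f:
    results = json.load(f)

buckets = defaultdict(list)
for run in results["individual_runs"]:
    if run["error"] is None:
        buckets[run["target_tokens"]].append(run)

summary = {
    str(tokens): {
        "avg_time_to_first_chunk": round(
            sum(r["time_to_first_chunk"] for r in runs) / len(runs), 3
        ),
        "avg_total_time": round(sum(r["total_time"] for r in runs) / len(runs), 3),
        "avg_audio_length": round(sum(r["audio_length"] for r in runs) / len(runs), 3),
        "num_successful_runs": len(runs),
    }
    for tokens, runs in buckets.items()
}
print(json.dumps(summary, indent=2))
```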

View file

@ -1,271 +1,337 @@
{ {
"individual_runs": [ "individual_runs": [
{ {
"text_length": 212, "text_length": 37,
"token_count": 50, "token_count": null,
"total_time": 1.149611473083496, "total_time": 0.7105245590209961,
"time_to_first_chunk": 0.8767304420471191, "time_to_first_chunk": 0.6905441284179688,
"error": null, "error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream_openai\\benchmark_tokens50_run1_stream_openai.wav", "audio_path": "C:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream_openai\\benchmark_tokens10_run1_stream_openai.wav",
"audio_length": 16.325, "audio_length": 3.45,
"target_tokens": 10,
"actual_tokens": 10,
"run_number": 1
},
{
"text_length": 37,
"token_count": null,
"total_time": 0.35063982009887695,
"time_to_first_chunk": 0.32647228240966797,
"error": null,
"audio_path": "C:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream_openai\\benchmark_tokens10_run2_stream_openai.wav",
"audio_length": 3.45,
"target_tokens": 10,
"actual_tokens": 10,
"run_number": 2
},
{
"text_length": 37,
"token_count": null,
"total_time": 0.43519043922424316,
"time_to_first_chunk": 0.41011548042297363,
"error": null,
"audio_path": "C:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream_openai\\benchmark_tokens10_run3_stream_openai.wav",
"audio_length": 3.45,
"target_tokens": 10,
"actual_tokens": 10,
"run_number": 3
},
{
"text_length": 37,
"token_count": null,
"total_time": 0.33886170387268066,
"time_to_first_chunk": 0.32068943977355957,
"error": null,
"audio_path": "C:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream_openai\\benchmark_tokens10_run4_stream_openai.wav",
"audio_length": 3.45,
"target_tokens": 10,
"actual_tokens": 10,
"run_number": 4
},
{
"text_length": 37,
"token_count": null,
"total_time": 0.31725525856018066,
"time_to_first_chunk": 0.29624342918395996,
"error": null,
"audio_path": "C:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream_openai\\benchmark_tokens10_run5_stream_openai.wav",
"audio_length": 3.45,
"target_tokens": 10,
"actual_tokens": 10,
"run_number": 5
},
{
"text_length": 212,
"token_count": null,
"total_time": 1.0215234756469727,
"time_to_first_chunk": 0.38323354721069336,
"error": null,
"audio_path": "C:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream_openai\\benchmark_tokens50_run1_stream_openai.wav",
"audio_length": 15.925,
"target_tokens": 50, "target_tokens": 50,
"actual_tokens": 50, "actual_tokens": 50,
"run_number": 1 "run_number": 1
}, },
{ {
"text_length": 212, "text_length": 212,
"token_count": 50, "token_count": null,
"total_time": 0.9325947761535645, "total_time": 1.38511061668396,
"time_to_first_chunk": 0.5965914726257324, "time_to_first_chunk": 0.47052764892578125,
"error": null, "error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream_openai\\benchmark_tokens50_run2_stream_openai.wav", "audio_path": "C:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream_openai\\benchmark_tokens50_run2_stream_openai.wav",
"audio_length": 16.325, "audio_length": 15.925,
"target_tokens": 50, "target_tokens": 50,
"actual_tokens": 50, "actual_tokens": 50,
"run_number": 2 "run_number": 2
}, },
{ {
"text_length": 212, "text_length": 212,
"token_count": 50, "token_count": null,
"total_time": 0.9205234050750732, "total_time": 1.0185234546661377,
"time_to_first_chunk": 0.5961906909942627, "time_to_first_chunk": 0.3535764217376709,
"error": null, "error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream_openai\\benchmark_tokens50_run3_stream_openai.wav", "audio_path": "C:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream_openai\\benchmark_tokens50_run3_stream_openai.wav",
"audio_length": 16.325, "audio_length": 15.925,
"target_tokens": 50, "target_tokens": 50,
"actual_tokens": 50, "actual_tokens": 50,
"run_number": 3 "run_number": 3
}, },
{ {
"text_length": 212, "text_length": 212,
"token_count": 50, "token_count": null,
"total_time": 1.1321916580200195, "total_time": 0.8875925540924072,
"time_to_first_chunk": 0.6946916580200195, "time_to_first_chunk": 0.3373105525970459,
"error": null, "error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream_openai\\benchmark_tokens50_run4_stream_openai.wav", "audio_path": "C:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream_openai\\benchmark_tokens50_run4_stream_openai.wav",
"audio_length": 16.325, "audio_length": 15.925,
"target_tokens": 50, "target_tokens": 50,
"actual_tokens": 50, "actual_tokens": 50,
"run_number": 4 "run_number": 4
}, },
{ {
"text_length": 212, "text_length": 212,
"token_count": 50, "token_count": null,
"total_time": 1.1146185398101807, "total_time": 0.9557526111602783,
"time_to_first_chunk": 0.6918885707855225, "time_to_first_chunk": 0.3364882469177246,
"error": null, "error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream_openai\\benchmark_tokens50_run5_stream_openai.wav", "audio_path": "C:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream_openai\\benchmark_tokens50_run5_stream_openai.wav",
"audio_length": 16.325, "audio_length": 15.925,
"target_tokens": 50, "target_tokens": 50,
"actual_tokens": 50, "actual_tokens": 50,
"run_number": 5 "run_number": 5
}, },
{ {
"text_length": 448, "text_length": 448,
"token_count": 100, "token_count": null,
"total_time": 1.3645410537719727, "total_time": 1.569596767425537,
"time_to_first_chunk": 0.6802399158477783, "time_to_first_chunk": 0.42070746421813965,
"error": null, "error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream_openai\\benchmark_tokens100_run1_stream_openai.wav", "audio_path": "C:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream_openai\\benchmark_tokens100_run1_stream_openai.wav",
"audio_length": 31.1, "audio_length": 30.5,
"target_tokens": 100, "target_tokens": 100,
"actual_tokens": 100, "actual_tokens": 100,
"run_number": 1 "run_number": 1
}, },
{ {
"text_length": 448, "text_length": 448,
"token_count": 100, "token_count": null,
"total_time": 1.4154777526855469, "total_time": 1.5172030925750732,
"time_to_first_chunk": 0.7297353744506836, "time_to_first_chunk": 0.3982264995574951,
"error": null, "error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream_openai\\benchmark_tokens100_run2_stream_openai.wav", "audio_path": "C:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream_openai\\benchmark_tokens100_run2_stream_openai.wav",
"audio_length": 31.1, "audio_length": 30.5,
"target_tokens": 100, "target_tokens": 100,
"actual_tokens": 100, "actual_tokens": 100,
"run_number": 2 "run_number": 2
}, },
{ {
"text_length": 448, "text_length": 448,
"token_count": 100, "token_count": null,
"total_time": 1.3589520454406738, "total_time": 1.5318474769592285,
"time_to_first_chunk": 0.698603630065918, "time_to_first_chunk": 0.3533785343170166,
"error": null, "error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream_openai\\benchmark_tokens100_run3_stream_openai.wav", "audio_path": "C:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream_openai\\benchmark_tokens100_run3_stream_openai.wav",
"audio_length": 31.1, "audio_length": 30.5,
"target_tokens": 100, "target_tokens": 100,
"actual_tokens": 100, "actual_tokens": 100,
"run_number": 3 "run_number": 3
}, },
{ {
"text_length": 448, "text_length": 448,
"token_count": 100, "token_count": null,
"total_time": 1.2276430130004883, "total_time": 1.3858752250671387,
"time_to_first_chunk": 0.6705801486968994, "time_to_first_chunk": 0.3360786437988281,
"error": null, "error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream_openai\\benchmark_tokens100_run4_stream_openai.wav", "audio_path": "C:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream_openai\\benchmark_tokens100_run4_stream_openai.wav",
"audio_length": 31.1, "audio_length": 30.5,
"target_tokens": 100, "target_tokens": 100,
"actual_tokens": 100, "actual_tokens": 100,
"run_number": 4 "run_number": 4
}, },
{ {
"text_length": 448, "text_length": 448,
"token_count": 100, "token_count": null,
"total_time": 1.0949454307556152, "total_time": 1.7841475009918213,
"time_to_first_chunk": 0.5698442459106445, "time_to_first_chunk": 0.34446048736572266,
"error": null, "error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream_openai\\benchmark_tokens100_run5_stream_openai.wav", "audio_path": "C:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream_openai\\benchmark_tokens100_run5_stream_openai.wav",
"audio_length": 31.1, "audio_length": 30.5,
"target_tokens": 100, "target_tokens": 100,
"actual_tokens": 100, "actual_tokens": 100,
"run_number": 5 "run_number": 5
}, },
{ {
"text_length": 906, "text_length": 1140,
"token_count": 200, "token_count": null,
"total_time": 1.8211240768432617, "total_time": 4.334965467453003,
"time_to_first_chunk": 0.6070489883422852, "time_to_first_chunk": 0.4336512088775635,
"error": null, "error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream_openai\\benchmark_tokens200_run1_stream_openai.wav", "audio_path": "C:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream_openai\\benchmark_tokens250_run1_stream_openai.wav",
"audio_length": 62.625, "audio_length": 78.775,
"target_tokens": 200, "target_tokens": 250,
"actual_tokens": 200, "actual_tokens": 250,
"run_number": 1 "run_number": 1
}, },
{ {
"text_length": 906, "text_length": 1140,
"token_count": 200, "token_count": null,
"total_time": 1.8376774787902832, "total_time": 5.265941858291626,
"time_to_first_chunk": 0.6538689136505127, "time_to_first_chunk": 0.5461773872375488,
"error": null, "error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream_openai\\benchmark_tokens200_run2_stream_openai.wav", "audio_path": "C:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream_openai\\benchmark_tokens250_run2_stream_openai.wav",
"audio_length": 62.625, "audio_length": 78.775,
"target_tokens": 200, "target_tokens": 250,
"actual_tokens": 200, "actual_tokens": 250,
"run_number": 2 "run_number": 2
}, },
{ {
"text_length": 906, "text_length": 1140,
"token_count": 200, "token_count": null,
"total_time": 1.6953792572021484, "total_time": 5.66066575050354,
"time_to_first_chunk": 0.5554308891296387, "time_to_first_chunk": 0.4757547378540039,
"error": null, "error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream_openai\\benchmark_tokens200_run3_stream_openai.wav", "audio_path": "C:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream_openai\\benchmark_tokens250_run3_stream_openai.wav",
"audio_length": 62.625, "audio_length": 78.775,
"target_tokens": 200, "target_tokens": 250,
"actual_tokens": 200, "actual_tokens": 250,
"run_number": 3 "run_number": 3
}, },
{ {
"text_length": 906, "text_length": 1140,
"token_count": 200, "token_count": null,
"total_time": 1.887030839920044, "total_time": 9.289174318313599,
"time_to_first_chunk": 0.5866930484771729, "time_to_first_chunk": 0.40159058570861816,
"error": null, "error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream_openai\\benchmark_tokens200_run4_stream_openai.wav", "audio_path": "C:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream_openai\\benchmark_tokens250_run4_stream_openai.wav",
"audio_length": 62.625, "audio_length": 78.775,
"target_tokens": 200, "target_tokens": 250,
"actual_tokens": 200, "actual_tokens": 250,
"run_number": 4 "run_number": 4
}, },
{ {
"text_length": 906, "text_length": 1140,
"token_count": 200, "token_count": null,
"total_time": 1.7908406257629395, "total_time": 4.425869703292847,
"time_to_first_chunk": 0.5897490978240967, "time_to_first_chunk": 0.40808558464050293,
"error": null, "error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream_openai\\benchmark_tokens200_run5_stream_openai.wav", "audio_path": "C:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream_openai\\benchmark_tokens250_run5_stream_openai.wav",
"audio_length": 62.625, "audio_length": 78.775,
"target_tokens": 200, "target_tokens": 250,
"actual_tokens": 200, "actual_tokens": 250,
"run_number": 5 "run_number": 5
}, },
{ {
"text_length": 2232, "text_length": 2232,
"token_count": 500, "token_count": null,
"total_time": 4.228837013244629, "total_time": 9.600461483001709,
"time_to_first_chunk": 0.5315976142883301, "time_to_first_chunk": 0.3966805934906006,
"error": null, "error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream_openai\\benchmark_tokens500_run1_stream_openai.wav", "audio_path": "C:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream_openai\\benchmark_tokens500_run1_stream_openai.wav",
"audio_length": 157.875, "audio_length": 156.475,
"target_tokens": 500, "target_tokens": 500,
"actual_tokens": 500, "actual_tokens": 500,
"run_number": 1 "run_number": 1
}, },
{ {
"text_length": 2232, "text_length": 2232,
"token_count": 500, "token_count": null,
"total_time": 4.489210367202759, "total_time": 8.82239580154419,
"time_to_first_chunk": 0.5261838436126709, "time_to_first_chunk": 0.3900904655456543,
"error": null, "error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream_openai\\benchmark_tokens500_run2_stream_openai.wav", "audio_path": "C:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream_openai\\benchmark_tokens500_run2_stream_openai.wav",
"audio_length": 157.875, "audio_length": 156.475,
"target_tokens": 500, "target_tokens": 500,
"actual_tokens": 500, "actual_tokens": 500,
"run_number": 2 "run_number": 2
}, },
{ {
"text_length": 2232, "text_length": 2232,
"token_count": 500, "token_count": null,
"total_time": 4.5290446281433105, "total_time": 10.99152159690857,
"time_to_first_chunk": 0.6186764240264893, "time_to_first_chunk": 0.4041757583618164,
"error": null, "error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream_openai\\benchmark_tokens500_run3_stream_openai.wav", "audio_path": "C:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream_openai\\benchmark_tokens500_run3_stream_openai.wav",
"audio_length": 157.875, "audio_length": 156.475,
"target_tokens": 500, "target_tokens": 500,
"actual_tokens": 500, "actual_tokens": 500,
"run_number": 3 "run_number": 3
}, },
{ {
"text_length": 2232, "text_length": 2232,
"token_count": 500, "token_count": null,
"total_time": 4.209261178970337, "total_time": 9.12995958328247,
"time_to_first_chunk": 0.5990591049194336, "time_to_first_chunk": 0.43430614471435547,
"error": null, "error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream_openai\\benchmark_tokens500_run4_stream_openai.wav", "audio_path": "C:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream_openai\\benchmark_tokens500_run4_stream_openai.wav",
"audio_length": 157.875, "audio_length": 156.475,
"target_tokens": 500, "target_tokens": 500,
"actual_tokens": 500, "actual_tokens": 500,
"run_number": 4 "run_number": 4
}, },
{ {
"text_length": 2232, "text_length": 2232,
"token_count": 500, "token_count": null,
"total_time": 4.218762636184692, "total_time": 10.043727159500122,
"time_to_first_chunk": 0.5466251373291016, "time_to_first_chunk": 0.41181445121765137,
"error": null, "error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream_openai\\benchmark_tokens500_run5_stream_openai.wav", "audio_path": "C:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream_openai\\benchmark_tokens500_run5_stream_openai.wav",
"audio_length": 157.875, "audio_length": 156.475,
"target_tokens": 500, "target_tokens": 500,
"actual_tokens": 500, "actual_tokens": 500,
"run_number": 5 "run_number": 5
} }
], ],
"summary": { "summary": {
"10": {
"avg_time_to_first_chunk": 0.409,
"avg_total_time": 0.43,
"avg_audio_length": 3.45,
"num_successful_runs": 5
},
"50": { "50": {
"avg_time_to_first_chunk": 0.691, "avg_time_to_first_chunk": 0.376,
"avg_total_time": 1.05, "avg_total_time": 1.054,
"avg_audio_length": 16.325, "avg_audio_length": 15.925,
"num_successful_runs": 5 "num_successful_runs": 5
}, },
"100": { "100": {
"avg_time_to_first_chunk": 0.67, "avg_time_to_first_chunk": 0.371,
"avg_total_time": 1.292, "avg_total_time": 1.558,
"avg_audio_length": 31.1, "avg_audio_length": 30.5,
"num_successful_runs": 5 "num_successful_runs": 5
}, },
"200": { "250": {
"avg_time_to_first_chunk": 0.599, "avg_time_to_first_chunk": 0.453,
"avg_total_time": 1.806, "avg_total_time": 5.795,
"avg_audio_length": 62.625, "avg_audio_length": 78.775,
"num_successful_runs": 5 "num_successful_runs": 5
}, },
"500": { "500": {
"avg_time_to_first_chunk": 0.564, "avg_time_to_first_chunk": 0.407,
"avg_total_time": 4.335, "avg_total_time": 9.718,
"avg_audio_length": 157.875, "avg_audio_length": 156.475,
"num_successful_runs": 5 "num_successful_runs": 5
} }
}, },
"timestamp": "2025-01-04 22:18:03" "timestamp": "2025-01-06 00:02:21"
} }

View file

@ -1,23 +1,23 @@
=== Benchmark Statistics (with correct RTF) === === Benchmark Statistics (with correct RTF) ===
Total tokens processed: 17150 Total tokens processed: 3150
Total audio generated (s): 5296.38 Total audio generated (s): 1056.03
Total test duration (s): 155.23 Total test duration (s): 70.20
Average processing rate (tokens/s): 102.86 Average processing rate (tokens/s): 46.46
Average RTF: 0.03 Average RTF: 0.07
Average Real Time Speed: 31.25 Average Real Time Speed: 15.00
=== Per-chunk Stats === === Per-chunk Stats ===
Average chunk size (tokens): 1715.00 Average chunk size (tokens): 525.00
Min chunk size (tokens): 150 Min chunk size (tokens): 150
Max chunk size (tokens): 5000 Max chunk size (tokens): 900
Average processing time (s): 15.39 Average processing time (s): 11.57
Average output length (s): 529.64 Average output length (s): 176.00
=== Performance Ranges === === Performance Ranges ===
Processing rate range (tokens/s): 80.65 - 125.10 Processing rate range (tokens/s): 40.07 - 53.57
RTF range: 0.03x - 0.04x RTF range: 0.06x - 0.08x
Real Time Speed range: 25.00x - 33.33x Real Time Speed range: 12.50x - 16.67x
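For context on the figures above: RTF is processing time divided by generated audio duration, and Real Time Speed is its reciprocal. A quick sanity check against the new column's totals (the reported 46.46 tokens/s is presumably a per-chunk mean rather than this overall ratio):

```
# Sanity-check the headline numbers above (values copied from the stats).
total_audio_s = 1056.03   # total audio generated (s)
total_test_s = 70.20      # total wall-clock test duration (s)
tokens = 3150

rtf = total_test_s / total_audio_s    # ~0.066, reported as 0.07
real_time_speed = 1 / rtf             # ~15x faster than realtime
tokens_per_s = tokens / total_test_s  # ~44.9 overall; 46.46 is likely a per-chunk mean
print(f"RTF={rtf:.2f}, speed={real_time_speed:.1f}x, {tokens_per_s:.1f} tok/s")
```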

Binary plot files not shown (16 PNG images: 12 regenerated, 181–459 KiB before vs 206–491 KiB after; 4 removed, 198–764 KiB).

View file

@ -0,0 +1,198 @@
#!/usr/bin/env python3
"""Script to generate all plots needed for the README."""
import os
import sys
import shutil
from pathlib import Path

from validate_wav import validate_tts

# Get absolute paths
script_dir = Path(__file__).parent.resolve()
project_root = script_dir.parent.parent

# Add directories to Python path for imports
sys.path.append(str(script_dir))
sys.path.append(str(script_dir / "benchmarks"))

# Import test scripts
from benchmark_tts_rtf import main as benchmark_rtf
from test_formats.test_audio_formats import main as test_formats
from benchmark_first_token_stream_unified import main as benchmark_stream
from test_combinations.test_analyze_combined_voices import main as test_voice_analysis

# Remove directories from path after imports
sys.path.remove(str(script_dir))
sys.path.remove(str(script_dir / "benchmarks"))


def ensure_assets_dir():
    """Create assets directory if it doesn't exist."""
    assets_dir = project_root / "assets"
    assets_dir.mkdir(exist_ok=True)
    return assets_dir


def copy_plot(src_path: str, dest_name: str, assets_dir: Path):
    """Copy a plot to the assets directory with a new name."""
    if os.path.exists(src_path):
        shutil.copy2(src_path, assets_dir / dest_name)
        print(f"Copied {src_path} to {assets_dir / dest_name}")
    else:
        print(f"Warning: Source plot not found at {src_path}")


def validate_and_print(wav_path: str, category: str):
    """Validate a WAV file and print results."""
    if not os.path.exists(wav_path):
        print(f"Warning: WAV file not found at {wav_path}")
        return

    print(f"\n=== Validating {category} Audio ===")
    result = validate_tts(wav_path)
    if "error" in result:
        print(f"Error: {result['error']}")
    else:
        print(f"Duration: {result['duration']}")
        print(f"Sample Rate: {result['sample_rate']} Hz")
        print(f"Peak Amplitude: {result['peak_amplitude']}")
        print(f"RMS Level: {result['rms_level']}")
        if result["issues"]:
            print("\nIssues Found:")
            for issue in result["issues"]:
                print(f"- {issue}")
        else:
            print("\nNo issues found")


def main():
    """Generate all plots needed for the README."""
    prefix = "gpu"
    # Ensure assets directory exists
    assets_dir = ensure_assets_dir()

    print("\n=== Generating Format Comparison Plot ===")
    test_formats()
    copy_plot(
        str(script_dir / "test_formats/output/test_formats/format_comparison.png"),
        "format_comparison.png",
        assets_dir,
    )
    # Validate WAV output from format test
    validate_and_print(
        str(script_dir / "test_formats/output/test_formats/speech.wav"),
        "Format Test WAV",
    )

    print("\n=== Generating Voice Analysis Plot ===")
    test_voice_analysis()
    copy_plot(
        str(script_dir / "test_combinations/output/analysis_comparison.png"),
        "voice_analysis.png",
        assets_dir,
    )
    # Validate combined voice output
    validate_and_print(
        str(
            script_dir
            / "test_combinations/output/analysis_combined_af_bella_af_nicole.wav"
        ),
        "Combined Voice",
    )

    print("\n=== Generating Performance Benchmark Plots ===")
    benchmark_rtf()
    copy_plot(
        str(script_dir / f"benchmarks/output_plots/{prefix}_processing_time_rtf.png"),
        f"{prefix}_processing_time.png",
        assets_dir,
    )
    copy_plot(
        str(script_dir / f"benchmarks/output_plots/{prefix}_realtime_factor_rtf.png"),
        f"{prefix}_realtime_factor.png",
        assets_dir,
    )
    # Validate RTF benchmark output (~500 tokens)
    validate_and_print(
        str(script_dir / "benchmarks/output_audio/chunk_450_tokens.wav"),
        "RTF Benchmark",
    )

    print("\n=== Generating Streaming Benchmark Plots ===")
    benchmark_stream()
    # Copy direct streaming plots
    copy_plot(
        str(script_dir / "benchmarks/output_plots/first_token_latency_stream.png"),
        f"{prefix}_first_token_latency_direct.png",
        assets_dir,
    )
    copy_plot(
        str(script_dir / "benchmarks/output_plots/first_token_timeline_stream.png"),
        f"{prefix}_first_token_timeline_direct.png",
        assets_dir,
    )
    copy_plot(
        str(script_dir / "benchmarks/output_plots/total_time_latency_stream.png"),
        f"{prefix}_total_time_latency_direct.png",
        assets_dir,
    )
    # Copy OpenAI streaming plots
    copy_plot(
        str(
            script_dir / "benchmarks/output_plots/first_token_latency_stream_openai.png"
        ),
        f"{prefix}_first_token_latency_openai.png",
        assets_dir,
    )
    copy_plot(
        str(
            script_dir
            / "benchmarks/output_plots/first_token_timeline_stream_openai.png"
        ),
        f"{prefix}_first_token_timeline_openai.png",
        assets_dir,
    )
    copy_plot(
        str(
            script_dir / "benchmarks/output_plots/total_time_latency_stream_openai.png"
        ),
        f"{prefix}_total_time_latency_openai.png",
        assets_dir,
    )

    # Wait a moment for files to be generated
    import time

    time.sleep(2)

    # Validate streaming outputs (~500 tokens)
    validate_and_print(
        str(
            script_dir
            / "benchmarks/output_audio_stream/benchmark_tokens500_run1_stream.wav"
        ),
        "Direct Streaming",
    )
    validate_and_print(
        str(
            script_dir
            / "benchmarks/output_audio_stream_openai/benchmark_tokens500_run1_stream_openai.wav"
        ),
        "OpenAI Streaming",
    )
    validate_and_print(
        str(script_dir / "test_formats/output/test_formats/test_audio.wav"),
        "Format Test WAV",
    )

    print("\nAll plots have been generated and copied to the assets directory")


if __name__ == "__main__":
    main()

View file

@ -73,6 +73,7 @@ def generate_speech(
"voice": voice, "voice": voice,
"speed": 1.0, "speed": 1.0,
"response_format": "wav", # Use WAV for analysis "response_format": "wav", # Use WAV for analysis
"stream": False,
}, },
) )
@ -193,9 +194,10 @@ def plot_analysis(audio_files: Dict[str, str], output_dir: str):
fig.patch.set_facecolor("#1a1a2e") fig.patch.set_facecolor("#1a1a2e")
num_files = len(audio_files) num_files = len(audio_files)
# Create subplot grid with proper spacing # Create subplot grid with proper spacing for waveforms and metrics
total_rows = num_files + 2 # Add one more row for metrics
gs = plt.GridSpec( gs = plt.GridSpec(
num_files + 1, 2, height_ratios=[1.5] * num_files + [1], hspace=0.4, wspace=0.3 total_rows, 2, height_ratios=[1.5] * num_files + [1, 1], hspace=0.4, wspace=0.3
) )
# Analyze all files first # Analyze all files first
@ -216,48 +218,74 @@ def plot_analysis(audio_files: Dict[str, str], output_dir: str):
# Colors for voices # Colors for voices
colors = ["#ff2a6d", "#05d9e8", "#d1f7ff"] colors = ["#ff2a6d", "#05d9e8", "#d1f7ff"]
# Create two subplots for metrics with similar scales # Create metrics for each subplot
# Left subplot: Brightness and Volume metrics = [
ax1 = plt.subplot(gs[num_files, 0])
metrics1 = [
( (
"Brightness", plt.subplot(gs[num_files, 0]),
[chars["spectral_centroid"] / 1000 for chars in all_chars.values()], [
"kHz", (
), "Volume",
("Volume", [chars["rms"] * 100 for chars in all_chars.values()], "RMS×100"), [chars["rms"] * 100 for chars in all_chars.values()],
] "RMS×100",
)
# Right subplot: Voice Pitch and Texture ],
ax2 = plt.subplot(gs[num_files, 1])
metrics2 = [
(
"Voice Pitch",
[min(chars["dominant_frequencies"]) for chars in all_chars.values()],
"Hz",
), ),
( (
"Texture", plt.subplot(gs[num_files, 1]),
[chars["zero_crossing_rate"] * 1000 for chars in all_chars.values()], [
"ZCR×1000", (
"Brightness",
[chars["spectral_centroid"] / 1000 for chars in all_chars.values()],
"kHz",
)
],
),
(
plt.subplot(gs[num_files + 1, 0]),
[
(
"Voice Pitch",
[
min(chars["dominant_frequencies"])
for chars in all_chars.values()
],
"Hz",
)
],
),
(
plt.subplot(gs[num_files + 1, 1]),
[
(
"Texture",
[
chars["zero_crossing_rate"] * 1000
for chars in all_chars.values()
],
"ZCR×1000",
)
],
), ),
] ]
def plot_grouped_bars(ax, metrics, show_legend=True): # Plot each metric
n_groups = len(metrics) for i, (ax, metric_data) in enumerate(metrics):
n_voices = len(audio_files) n_voices = len(audio_files)
bar_width = 0.25 bar_width = 0.25
indices = np.array([0])
indices = np.arange(n_groups) values = metric_data[0][1]
max_val = max(values)
# Get max value for y-axis scaling for j, (voice, color) in enumerate(zip(audio_files.keys(), colors)):
max_val = max(max(m[1]) for m in metrics) offset = (j - n_voices / 2 + 0.5) * bar_width
for i, (voice, color) in enumerate(zip(audio_files.keys(), colors)):
values = [m[1][i] for m in metrics]
offset = (i - n_voices / 2 + 0.5) * bar_width
bars = ax.bar( bars = ax.bar(
indices + offset, values, bar_width, label=voice, color=color, alpha=0.8 indices + offset,
[values[j]],
bar_width,
label=voice,
color=color,
alpha=0.8,
) )
# Add value labels on top of bars # Add value labels on top of bars
@ -274,12 +302,12 @@ def plot_analysis(audio_files: Dict[str, str], output_dir: str):
) )
ax.set_xticks(indices) ax.set_xticks(indices)
ax.set_xticklabels([f"{m[0]}\n({m[2]})" for m in metrics]) ax.set_xticklabels([f"{metric_data[0][0]}\n({metric_data[0][2]})"])
# Set y-axis limits with some padding
ax.set_ylim(0, max_val * 1.2) ax.set_ylim(0, max_val * 1.2)
ax.set_ylabel("Value")
if show_legend: # Only show legend on first metric plot
if i == 0:
ax.legend( ax.legend(
bbox_to_anchor=(1.05, 1), bbox_to_anchor=(1.05, 1),
loc="upper left", loc="upper left",
@ -287,22 +315,11 @@ def plot_analysis(audio_files: Dict[str, str], output_dir: str):
edgecolor="#ffffff", edgecolor="#ffffff",
) )
# Plot both subplots # Style the subplot
plot_grouped_bars(ax1, metrics1, show_legend=True) setup_plot(fig, ax, metric_data[0][0])
plot_grouped_bars(ax2, metrics2, show_legend=False)
# Style both subplots # Adjust the figure size and padding
setup_plot(fig, ax1, "Brightness and Volume") fig.set_size_inches(15, 20)
setup_plot(fig, ax2, "Voice Pitch and Texture")
# Add y-axis labels
ax1.set_ylabel("Value")
ax2.set_ylabel("Value")
# Adjust the figure size to accommodate the legend
fig.set_size_inches(15, 15)
# Add padding around the entire figure
plt.subplots_adjust(right=0.85, top=0.95, bottom=0.05, left=0.1) plt.subplots_adjust(right=0.85, top=0.95, bottom=0.05, left=0.1)
plt.savefig(os.path.join(output_dir, "analysis_comparison.png"), dpi=300) plt.savefig(os.path.join(output_dir, "analysis_comparison.png"), dpi=300)
print(f"Saved analysis comparison to {output_dir}/analysis_comparison.png") print(f"Saved analysis comparison to {output_dir}/analysis_comparison.png")
@ -332,7 +349,7 @@ def main():
) )
parser.add_argument("--url", default="http://localhost:8880", help="API base URL") parser.add_argument("--url", default="http://localhost:8880", help="API base URL")
parser.add_argument( parser.add_argument(
"--output-dir", "--output-dir",
default="examples/assorted_checks/test_combinations/output", default="examples/assorted_checks/test_combinations/output",
help="Output directory for audio files", help="Output directory for audio files",
) )
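The four bars this script plots map to standard signal statistics. A rough numpy-only recomputation for a single file, to make the units concrete (the peak-picking used for "Voice Pitch" here is a simplification; the script's actual method may differ):

```
import numpy as np
import soundfile as sf

# Rough recomputation of the four plotted metrics for one voice file.
# Windowing and peak-picking details are assumptions, not the script's exact math.
audio, sr = sf.read("analysis_combined_af_bella_af_nicole.wav")
if audio.ndim > 1:
    audio = audio.mean(axis=1)

rms = np.sqrt(np.mean(audio**2))                    # "Volume" (plotted as RMS x100)
zcr = np.mean(np.abs(np.diff(np.sign(audio))) > 0)  # "Texture" (plotted as ZCR x1000)

spectrum = np.abs(np.fft.rfft(audio))
freqs = np.fft.rfftfreq(len(audio), d=1 / sr)
centroid = np.sum(freqs * spectrum) / np.sum(spectrum)  # "Brightness" (plotted in kHz)

dominant = freqs[np.argsort(spectrum)[-5:]]  # crude top-5 spectral peaks
pitch = dominant.min()                       # "Voice Pitch" = lowest dominant frequency
print(f"RMS={rms:.3f} ZCR={zcr:.4f} centroid={centroid/1000:.2f}kHz pitch={pitch:.0f}Hz")
```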

View file

@ -66,26 +66,27 @@ def plot_format_comparison(stats: list, output_dir: str):
for i, stat in enumerate(stats): for i, stat in enumerate(stats):
format_name = stat["format"].upper() format_name = stat["format"].upper()
try: try:
# Handle PCM format differently file_path = os.path.join(output_dir, f"test_audio.{stat['format']}")
if stat["format"] == "pcm":
# Read raw PCM data (16-bit mono)
with open(
os.path.join(output_dir, f"test_audio.{stat['format']}"), "rb"
) as f:
raw_data = f.read()
data = np.frombuffer(raw_data, dtype=np.int16)
data = data.astype(np.float32) / 32768.0 # Convert to float [-1, 1]
sr = 24000
else:
# Read other formats with soundfile
data, sr = sf.read(
os.path.join(output_dir, f"test_audio.{stat['format']}")
)
# Plot waveform if stat["format"] == "wav":
# Use scipy.io.wavfile for WAV files
sr, data = wavfile.read(file_path)
data = data.astype(np.float32) / 32768.0 # Convert to float [-1, 1]
elif stat["format"] == "pcm":
# Read raw 16-bit signed little-endian PCM data at 24kHz
data = np.frombuffer(
open(file_path, "rb").read(), dtype="<i2"
) # '<i2' means little-endian 16-bit signed int
data = data.astype(np.float32) / 32768.0 # Convert to float [-1, 1]
sr = 24000 # Known sample rate for our endpoint
else:
# Use soundfile for other formats (mp3, opus, flac)
data, sr = sf.read(file_path)
# Plot waveform with consistent normalization
ax = plt.subplot(gs_waves[i]) ax = plt.subplot(gs_waves[i])
time = np.arange(len(data)) / sr time = np.arange(len(data)) / sr
plt.plot(time, data / np.max(np.abs(data)), linewidth=0.5, color="#ff2a6d") plt.plot(time, data, linewidth=0.5, color="#ff2a6d")
ax.set_xlabel("Time (seconds)") ax.set_xlabel("Time (seconds)")
ax.set_ylabel("") ax.set_ylabel("")
ax.set_ylim(-1.1, 1.1) ax.set_ylim(-1.1, 1.1)
@ -200,41 +201,42 @@ def get_audio_stats(file_path: str) -> dict:
"""Get audio file statistics""" """Get audio file statistics"""
file_size = os.path.getsize(file_path) file_size = os.path.getsize(file_path)
file_size_kb = file_size / 1024 # Convert to KB file_size_kb = file_size / 1024 # Convert to KB
format_name = Path(file_path).suffix[1:]
try: if format_name == "wav":
# Try reading with soundfile first # Use scipy.io.wavfile for WAV files
sample_rate, data = wavfile.read(file_path)
data = data.astype(np.float32) / 32768.0 # Convert to float [-1, 1]
duration = len(data) / sample_rate
channels = 1 if len(data.shape) == 1 else data.shape[1]
elif format_name == "pcm":
# For PCM, read raw 16-bit signed little-endian PCM data at 24kHz
data = np.frombuffer(
open(file_path, "rb").read(), dtype="<i2"
) # '<i2' means little-endian 16-bit signed int
data = data.astype(np.float32) / 32768.0 # Normalize to [-1, 1]
sample_rate = 24000 # Known sample rate for our endpoint
duration = len(data) / sample_rate
channels = 1
else:
# Use soundfile for other formats (mp3, opus, flac)
data, sample_rate = sf.read(file_path) data, sample_rate = sf.read(file_path)
duration = len(data) / sample_rate duration = len(data) / sample_rate
channels = 1 if len(data.shape) == 1 else data.shape[1] channels = 1 if len(data.shape) == 1 else data.shape[1]
# Calculate audio statistics # Calculate audio statistics
stats = { stats = {
"format": Path(file_path).suffix[1:], "format": format_name,
"file_size_kb": round(file_size_kb, 2), "file_size_kb": round(file_size_kb, 2),
"duration_seconds": round(duration, 2), "duration_seconds": round(duration, 2),
"sample_rate": sample_rate, "sample_rate": sample_rate,
"channels": channels, "channels": channels,
"min_amplitude": float(np.min(data)), "min_amplitude": float(np.min(data)),
"max_amplitude": float(np.max(data)), "max_amplitude": float(np.max(data)),
"mean_amplitude": float(np.mean(np.abs(data))), "mean_amplitude": float(np.mean(np.abs(data))),
"rms_amplitude": float(np.sqrt(np.mean(np.square(data)))), "rms_amplitude": float(np.sqrt(np.mean(np.square(data)))),
} }
return stats return stats
except:
# For PCM, read raw bytes and estimate duration
with open(file_path, "rb") as f:
data = f.read()
# Assuming 16-bit PCM mono at 24kHz
samples = len(data) // 2 # 2 bytes per sample
duration = samples / 24000
return {
"format": "pcm",
"file_size_kb": round(file_size_kb, 2),
"duration_seconds": round(duration, 2),
"sample_rate": 24000,
"channels": 1,
"note": "PCM stats are estimated from raw bytes",
}
def main(): def main():
@ -254,13 +256,49 @@ def main():
# Generate and save # Generate and save
start_time = time.time() start_time = time.time()
response = client.audio.speech.create(
model="kokoro", voice=voice, input=SAMPLE_TEXT, response_format=fmt # Use requests with stream=False for consistent data handling
response = requests.post(
"http://localhost:8880/v1/audio/speech",
json={
"model": "kokoro",
"voice": voice,
"input": SAMPLE_TEXT,
"response_format": fmt,
"stream": False, # Explicitly disable streaming to get single complete chunk
},
stream=False,
headers={"Accept": f"audio/{fmt}"}, # Explicitly request audio format
) )
generation_time = time.time() - start_time generation_time = time.time() - start_time
with open(output_path, "wb") as f: print(f"\nResponse headers for {fmt}:")
f.write(response.content) for header, value in response.headers.items():
print(f"{header}: {value}")
print(f"Content length: {len(response.content)} bytes")
print(f"First few bytes: {response.content[:20].hex()}")
# Write the file and verify it was written correctly
try:
with open(output_path, "wb") as f:
f.write(response.content)
# Verify file was written
if not output_path.exists():
raise Exception(f"Failed to write {fmt} file")
# Check file size matches content length
written_size = output_path.stat().st_size
if written_size != len(response.content):
raise Exception(
f"File size mismatch: expected {len(response.content)} bytes, got {written_size}"
)
print(f"Successfully wrote {fmt} file")
except Exception as e:
print(f"Error writing {fmt} file: {e}")
continue
# Get stats # Get stats
file_stats = get_audio_stats(str(output_path)) file_stats = get_audio_stats(str(output_path))
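The PCM branch deserves a note: a .pcm file is headerless, so the dtype ('<i2', i.e. little-endian int16), the channel count, and the 24 kHz rate all have to be known out of band, exactly as the comments say. Standalone, the decode looks like this (filename hypothetical):

```
import numpy as np

# Decode headerless PCM: 16-bit signed little-endian ('<i2'), mono, 24 kHz.
# None of this is stored in the file itself; it must be known out of band.
with open("test_audio.pcm", "rb") as f:
    data = np.frombuffer(f.read(), dtype="<i2")

audio = data.astype(np.float32) / 32768.0  # scale int16 range to [-1.0, 1.0]
sample_rate = 24000
duration = len(audio) / sample_rate
print(f"{duration:.2f}s of audio, peak {np.abs(audio).max():.3f}")
```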

View file

@ -4,15 +4,19 @@ import random
import string import string
from typing import List, Tuple from typing import List, Tuple
def create_test_cases() -> List[str]: def create_test_cases() -> List[str]:
"""Create a variety of test cases with different characteristics""" """Create a variety of test cases with different characteristics"""
# Helper to create random text with specific patterns # Helper to create random text with specific patterns
def random_text(length: int) -> str: def random_text(length: int) -> str:
return ''.join(random.choice(string.ascii_letters + string.digits + " .,!?") for _ in range(length)) return "".join(
random.choice(string.ascii_letters + string.digits + " .,!?")
for _ in range(length)
)
test_cases = [] test_cases = []
# Base test cases that hit specific patterns # Base test cases that hit specific patterns
base_cases = [ base_cases = [
"Dr. Smith and Mr. Jones discussed the $1,234.56 million investment.", "Dr. Smith and Mr. Jones discussed the $1,234.56 million investment.",
@ -21,10 +25,10 @@ def create_test_cases() -> List[str]:
"X's and Y's properties cost £50 million in the 1990s", "X's and Y's properties cost £50 million in the 1990s",
"こんにちは。今日は!", "こんにちは。今日は!",
] ]
# Add base cases # Add base cases
test_cases.extend(base_cases) test_cases.extend(base_cases)
# Add variations with random content # Add variations with random content
for length in [100, 1000, 10000]: for length in [100, 1000, 10000]:
# Create 3 variations of each length # Create 3 variations of each length
@ -35,23 +39,24 @@ def create_test_cases() -> List[str]:
text = text.replace(text[30:40], "$1,234.56") text = text.replace(text[30:40], "$1,234.56")
text = text.replace(text[50:60], "A.B.C. xyz") text = text.replace(text[50:60], "A.B.C. xyz")
test_cases.append(text) test_cases.append(text)
return test_cases return test_cases
class TextNormalizerInline: class TextNormalizerInline:
"""Text normalizer using inline patterns""" """Text normalizer using inline patterns"""
def normalize(self, text: str) -> str: def normalize(self, text: str) -> str:
# Replace quotes and brackets # Replace quotes and brackets
text = text.replace(chr(8216), "'").replace(chr(8217), "'") text = text.replace(chr(8216), "'").replace(chr(8217), "'")
text = text.replace("«", chr(8220)).replace("»", chr(8221)) text = text.replace("«", chr(8220)).replace("»", chr(8221))
text = text.replace(chr(8220), '"').replace(chr(8221), '"') text = text.replace(chr(8220), '"').replace(chr(8221), '"')
text = text.replace("(", "«").replace(")", "»") text = text.replace("(", "«").replace(")", "»")
# Handle CJK punctuation # Handle CJK punctuation
for a, b in zip("、。!,:;?", ",.!,:;?"): for a, b in zip("、。!,:;?", ",.!,:;?"):
text = text.replace(a, b + " ") text = text.replace(a, b + " ")
text = re.sub(r"[^\S \n]", " ", text) text = re.sub(r"[^\S \n]", " ", text)
text = re.sub(r" +", " ", text) text = re.sub(r" +", " ", text)
text = re.sub(r"(?<=\n) +(?=\n)", "", text) text = re.sub(r"(?<=\n) +(?=\n)", "", text)
@ -61,108 +66,132 @@ class TextNormalizerInline:
text = re.sub(r"\b(?:Mrs\.|MRS\.(?= [A-Z]))", "Mrs", text) text = re.sub(r"\b(?:Mrs\.|MRS\.(?= [A-Z]))", "Mrs", text)
text = re.sub(r"\betc\.(?! [A-Z])", "etc", text) text = re.sub(r"\betc\.(?! [A-Z])", "etc", text)
text = re.sub(r"(?i)\b(y)eah?\b", r"\1e'a", text) text = re.sub(r"(?i)\b(y)eah?\b", r"\1e'a", text)
text = re.sub(r"\d*\.\d+|\b\d{4}s?\b|(?<!:)\b(?:[1-9]|1[0-2]):[0-5]\d\b(?!:)", split_num, text) text = re.sub(
r"\d*\.\d+|\b\d{4}s?\b|(?<!:)\b(?:[1-9]|1[0-2]):[0-5]\d\b(?!:)",
split_num,
text,
)
text = re.sub(r"(?<=\d),(?=\d)", "", text) text = re.sub(r"(?<=\d),(?=\d)", "", text)
text = re.sub(r"(?i)[$£]\d+(?:\.\d+)?(?: hundred| thousand| (?:[bm]|tr)illion)*\b|[$£]\d+\.\d\d?\b", handle_money, text) text = re.sub(
r"(?i)[$£]\d+(?:\.\d+)?(?: hundred| thousand| (?:[bm]|tr)illion)*\b|[$£]\d+\.\d\d?\b",
handle_money,
text,
)
text = re.sub(r"\d*\.\d+", handle_decimal, text) text = re.sub(r"\d*\.\d+", handle_decimal, text)
text = re.sub(r"(?<=\d)-(?=\d)", " to ", text) text = re.sub(r"(?<=\d)-(?=\d)", " to ", text)
text = re.sub(r"(?<=\d)S", " S", text) text = re.sub(r"(?<=\d)S", " S", text)
text = re.sub(r"(?<=[BCDFGHJ-NP-TV-Z])'?s\b", "'S", text) text = re.sub(r"(?<=[BCDFGHJ-NP-TV-Z])'?s\b", "'S", text)
text = re.sub(r"(?<=X')S\b", "s", text) text = re.sub(r"(?<=X')S\b", "s", text)
text = re.sub(r"(?:[A-Za-z]\.){2,} [a-z]", lambda m: m.group().replace(".", "-"), text) text = re.sub(
r"(?:[A-Za-z]\.){2,} [a-z]", lambda m: m.group().replace(".", "-"), text
)
text = re.sub(r"(?i)(?<=[A-Z])\.(?=[A-Z])", "-", text) text = re.sub(r"(?i)(?<=[A-Z])\.(?=[A-Z])", "-", text)
return text.strip() return text.strip()
class TextNormalizerCompiled: class TextNormalizerCompiled:
"""Text normalizer using all compiled patterns""" """Text normalizer using all compiled patterns"""
def __init__(self): def __init__(self):
self.patterns = { self.patterns = {
'whitespace': re.compile(r"[^\S \n]"), "whitespace": re.compile(r"[^\S \n]"),
'multi_space': re.compile(r" +"), "multi_space": re.compile(r" +"),
'newline_space': re.compile(r"(?<=\n) +(?=\n)"), "newline_space": re.compile(r"(?<=\n) +(?=\n)"),
'doctor': re.compile(r"\bD[Rr]\.(?= [A-Z])"), "doctor": re.compile(r"\bD[Rr]\.(?= [A-Z])"),
'mister': re.compile(r"\b(?:Mr\.|MR\.(?= [A-Z]))"), "mister": re.compile(r"\b(?:Mr\.|MR\.(?= [A-Z]))"),
'miss': re.compile(r"\b(?:Ms\.|MS\.(?= [A-Z]))"), "miss": re.compile(r"\b(?:Ms\.|MS\.(?= [A-Z]))"),
'mrs': re.compile(r"\b(?:Mrs\.|MRS\.(?= [A-Z]))"), "mrs": re.compile(r"\b(?:Mrs\.|MRS\.(?= [A-Z]))"),
'etc': re.compile(r"\betc\.(?! [A-Z])"), "etc": re.compile(r"\betc\.(?! [A-Z])"),
'yeah': re.compile(r"(?i)\b(y)eah?\b"), "yeah": re.compile(r"(?i)\b(y)eah?\b"),
'numbers': re.compile(r"\d*\.\d+|\b\d{4}s?\b|(?<!:)\b(?:[1-9]|1[0-2]):[0-5]\d\b(?!:)"), "numbers": re.compile(
'comma_in_number': re.compile(r"(?<=\d),(?=\d)"), r"\d*\.\d+|\b\d{4}s?\b|(?<!:)\b(?:[1-9]|1[0-2]):[0-5]\d\b(?!:)"
'money': re.compile(r"(?i)[$£]\d+(?:\.\d+)?(?: hundred| thousand| (?:[bm]|tr)illion)*\b|[$£]\d+\.\d\d?\b"), ),
'decimal': re.compile(r"\d*\.\d+"), "comma_in_number": re.compile(r"(?<=\d),(?=\d)"),
'range': re.compile(r"(?<=\d)-(?=\d)"), "money": re.compile(
's_after_number': re.compile(r"(?<=\d)S"), r"(?i)[$£]\d+(?:\.\d+)?(?: hundred| thousand| (?:[bm]|tr)illion)*\b|[$£]\d+\.\d\d?\b"
'possessive_s': re.compile(r"(?<=[BCDFGHJ-NP-TV-Z])'?s\b"), ),
'x_possessive': re.compile(r"(?<=X')S\b"), "decimal": re.compile(r"\d*\.\d+"),
'initials': re.compile(r"(?:[A-Za-z]\.){2,} [a-z]"), "range": re.compile(r"(?<=\d)-(?=\d)"),
'single_initial': re.compile(r"(?i)(?<=[A-Z])\.(?=[A-Z])") "s_after_number": re.compile(r"(?<=\d)S"),
"possessive_s": re.compile(r"(?<=[BCDFGHJ-NP-TV-Z])'?s\b"),
"x_possessive": re.compile(r"(?<=X')S\b"),
"initials": re.compile(r"(?:[A-Za-z]\.){2,} [a-z]"),
"single_initial": re.compile(r"(?i)(?<=[A-Z])\.(?=[A-Z])"),
} }
def normalize(self, text: str) -> str: def normalize(self, text: str) -> str:
# Replace quotes and brackets # Replace quotes and brackets
text = text.replace(chr(8216), "'").replace(chr(8217), "'") text = text.replace(chr(8216), "'").replace(chr(8217), "'")
text = text.replace("«", chr(8220)).replace("»", chr(8221)) text = text.replace("«", chr(8220)).replace("»", chr(8221))
text = text.replace(chr(8220), '"').replace(chr(8221), '"') text = text.replace(chr(8220), '"').replace(chr(8221), '"')
text = text.replace("(", "«").replace(")", "»") text = text.replace("(", "«").replace(")", "»")
# Handle CJK punctuation # Handle CJK punctuation
for a, b in zip("、。!,:;?", ",.!,:;?"): for a, b in zip("、。!,:;?", ",.!,:;?"):
text = text.replace(a, b + " ") text = text.replace(a, b + " ")
# Use compiled patterns # Use compiled patterns
text = self.patterns['whitespace'].sub(" ", text) text = self.patterns["whitespace"].sub(" ", text)
text = self.patterns['multi_space'].sub(" ", text) text = self.patterns["multi_space"].sub(" ", text)
text = self.patterns['newline_space'].sub("", text) text = self.patterns["newline_space"].sub("", text)
text = self.patterns['doctor'].sub("Doctor", text) text = self.patterns["doctor"].sub("Doctor", text)
text = self.patterns['mister'].sub("Mister", text) text = self.patterns["mister"].sub("Mister", text)
text = self.patterns['miss'].sub("Miss", text) text = self.patterns["miss"].sub("Miss", text)
text = self.patterns['mrs'].sub("Mrs", text) text = self.patterns["mrs"].sub("Mrs", text)
text = self.patterns['etc'].sub("etc", text) text = self.patterns["etc"].sub("etc", text)
text = self.patterns['yeah'].sub(r"\1e'a", text) text = self.patterns["yeah"].sub(r"\1e'a", text)
text = self.patterns['numbers'].sub(split_num, text) text = self.patterns["numbers"].sub(split_num, text)
text = self.patterns['comma_in_number'].sub("", text) text = self.patterns["comma_in_number"].sub("", text)
text = self.patterns['money'].sub(handle_money, text) text = self.patterns["money"].sub(handle_money, text)
text = self.patterns['decimal'].sub(handle_decimal, text) text = self.patterns["decimal"].sub(handle_decimal, text)
text = self.patterns['range'].sub(" to ", text) text = self.patterns["range"].sub(" to ", text)
text = self.patterns['s_after_number'].sub(" S", text) text = self.patterns["s_after_number"].sub(" S", text)
text = self.patterns['possessive_s'].sub("'S", text) text = self.patterns["possessive_s"].sub("'S", text)
text = self.patterns['x_possessive'].sub("s", text) text = self.patterns["x_possessive"].sub("s", text)
text = self.patterns['initials'].sub(lambda m: m.group().replace(".", "-"), text) text = self.patterns["initials"].sub(
text = self.patterns['single_initial'].sub("-", text) lambda m: m.group().replace(".", "-"), text
)
text = self.patterns["single_initial"].sub("-", text)
return text.strip() return text.strip()
class TextNormalizerHybrid: class TextNormalizerHybrid:
"""Text normalizer using hybrid approach - compile only complex/frequent patterns""" """Text normalizer using hybrid approach - compile only complex/frequent patterns"""
def __init__(self): def __init__(self):
# Only compile patterns that are complex or frequently used # Only compile patterns that are complex or frequently used
self.patterns = { self.patterns = {
'whitespace': re.compile(r"[^\S \n]"), "whitespace": re.compile(r"[^\S \n]"),
'numbers': re.compile(r"\d*\.\d+|\b\d{4}s?\b|(?<!:)\b(?:[1-9]|1[0-2]):[0-5]\d\b(?!:)"), "numbers": re.compile(
'money': re.compile(r"(?i)[$£]\d+(?:\.\d+)?(?: hundred| thousand| (?:[bm]|tr)illion)*\b|[$£]\d+\.\d\d?\b"), r"\d*\.\d+|\b\d{4}s?\b|(?<!:)\b(?:[1-9]|1[0-2]):[0-5]\d\b(?!:)"
'initials': re.compile(r"(?:[A-Za-z]\.){2,} [a-z]") ),
"money": re.compile(
r"(?i)[$£]\d+(?:\.\d+)?(?: hundred| thousand| (?:[bm]|tr)illion)*\b|[$£]\d+\.\d\d?\b"
),
"initials": re.compile(r"(?:[A-Za-z]\.){2,} [a-z]"),
} }
def normalize(self, text: str) -> str: def normalize(self, text: str) -> str:
# Replace quotes and brackets # Replace quotes and brackets
text = text.replace(chr(8216), "'").replace(chr(8217), "'") text = text.replace(chr(8216), "'").replace(chr(8217), "'")
text = text.replace("«", chr(8220)).replace("»", chr(8221)) text = text.replace("«", chr(8220)).replace("»", chr(8221))
text = text.replace(chr(8220), '"').replace(chr(8221), '"') text = text.replace(chr(8220), '"').replace(chr(8221), '"')
text = text.replace("(", "«").replace(")", "»") text = text.replace("(", "«").replace(")", "»")
# Handle CJK punctuation # Handle CJK punctuation
for a, b in zip("、。!,:;?", ",.!,:;?"): for a, b in zip("、。!,:;?", ",.!,:;?"):
text = text.replace(a, b + " ") text = text.replace(a, b + " ")
# Use compiled patterns for complex operations # Use compiled patterns for complex operations
text = self.patterns['whitespace'].sub(" ", text) text = self.patterns["whitespace"].sub(" ", text)
text = self.patterns['numbers'].sub(split_num, text) text = self.patterns["numbers"].sub(split_num, text)
text = self.patterns['money'].sub(handle_money, text) text = self.patterns["money"].sub(handle_money, text)
text = self.patterns['initials'].sub(lambda m: m.group().replace(".", "-"), text) text = self.patterns["initials"].sub(
lambda m: m.group().replace(".", "-"), text
)
# Use inline patterns for simpler operations # Use inline patterns for simpler operations
text = re.sub(r" +", " ", text) text = re.sub(r" +", " ", text)
text = re.sub(r"(?<=\n) +(?=\n)", "", text) text = re.sub(r"(?<=\n) +(?=\n)", "", text)
@ -179,9 +208,10 @@ class TextNormalizerHybrid:
text = re.sub(r"(?<=[BCDFGHJ-NP-TV-Z])'?s\b", "'S", text) text = re.sub(r"(?<=[BCDFGHJ-NP-TV-Z])'?s\b", "'S", text)
text = re.sub(r"(?<=X')S\b", "s", text) text = re.sub(r"(?<=X')S\b", "s", text)
text = re.sub(r"(?i)(?<=[A-Z])\.(?=[A-Z])", "-", text) text = re.sub(r"(?i)(?<=[A-Z])\.(?=[A-Z])", "-", text)
return text.strip() return text.strip()
def split_num(match: re.Match) -> str: def split_num(match: re.Match) -> str:
"""Split numbers for TTS processing""" """Split numbers for TTS processing"""
num = match.group(0) num = match.group(0)
@ -192,61 +222,70 @@ def split_num(match: re.Match) -> str:
return f"{num[:-1]} s" return f"{num[:-1]} s"
return num return num
def handle_money(match: re.Match) -> str: def handle_money(match: re.Match) -> str:
"""Format money strings for TTS""" """Format money strings for TTS"""
text = match.group(0) text = match.group(0)
return text.replace("$", " dollars ").replace("£", " pounds ") return text.replace("$", " dollars ").replace("£", " pounds ")
def handle_decimal(match: re.Match) -> str: def handle_decimal(match: re.Match) -> str:
"""Format decimal numbers for TTS""" """Format decimal numbers for TTS"""
num = match.group(0) num = match.group(0)
return num.replace(".", " point ") return num.replace(".", " point ")
def benchmark_normalizers(test_cases: List[str], iterations: int = 100) -> Tuple[float, float, float]:
def benchmark_normalizers(
test_cases: List[str], iterations: int = 100
) -> Tuple[float, float, float]:
"""Benchmark all three implementations""" """Benchmark all three implementations"""
normalizers = { normalizers = {
'inline': TextNormalizerInline(), "inline": TextNormalizerInline(),
'compiled': TextNormalizerCompiled(), "compiled": TextNormalizerCompiled(),
'hybrid': TextNormalizerHybrid() "hybrid": TextNormalizerHybrid(),
} }
results = {} results = {}
# Test each normalizer # Test each normalizer
for name, normalizer in normalizers.items(): for name, normalizer in normalizers.items():
start = time.perf_counter() start = time.perf_counter()
# Run normalizations # Run normalizations
for _ in range(iterations): for _ in range(iterations):
for test in test_cases: for test in test_cases:
normalizer.normalize(test) normalizer.normalize(test)
results[name] = time.perf_counter() - start results[name] = time.perf_counter() - start
return results return results
def verify_outputs(test_cases: List[str]) -> bool: def verify_outputs(test_cases: List[str]) -> bool:
"""Verify that all implementations produce identical output""" """Verify that all implementations produce identical output"""
normalizers = { normalizers = {
'inline': TextNormalizerInline(), "inline": TextNormalizerInline(),
'compiled': TextNormalizerCompiled(), "compiled": TextNormalizerCompiled(),
'hybrid': TextNormalizerHybrid() "hybrid": TextNormalizerHybrid(),
} }
for test in test_cases: for test in test_cases:
results = [norm.normalize(test) for norm in normalizers.values()] results = [norm.normalize(test) for norm in normalizers.values()]
if not all(r == results[0] for r in results): if not all(r == results[0] for r in results):
return False return False
return True return True
def main(): def main():
# Create test cases # Create test cases
print("Generating test cases...") print("Generating test cases...")
test_cases = create_test_cases() test_cases = create_test_cases()
total_chars = sum(len(t) for t in test_cases) total_chars = sum(len(t) for t in test_cases)
print(f"Created {len(test_cases)} test cases, total size: {total_chars:,} characters") print(
f"Created {len(test_cases)} test cases, total size: {total_chars:,} characters"
)
# Verify output consistency # Verify output consistency
print("\nVerifying output consistency...") print("\nVerifying output consistency...")
if verify_outputs(test_cases): if verify_outputs(test_cases):
@@ -254,15 +293,16 @@ def main():
    else:
        print("✗ Warning: Implementations produce different outputs!")
        return

    # Run benchmarks
    print("\nRunning benchmarks...")
    iterations = 100
    results = benchmark_normalizers(test_cases, iterations)

    # Print results
    print(f"\nResults for {iterations} iterations:")
    for name, time_taken in results.items():
        print(f"{name.capitalize()}: {time_taken:.3f}s")

main()
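The harness above checks output parity before timing, so the measured differences reflect the normalizers rather than the data. As a minimal sketch of the same timing pattern in isolation (the `TextNormalizer*` classes and `create_test_cases` belong to the script above; the placeholder callables and inputs here are hypothetical):

```python
import time
from typing import Callable, Dict, List

def time_normalizer(
    fn: Callable[[str], str], cases: List[str], iterations: int = 100
) -> float:
    """Total wall-clock time to normalize every case `iterations` times."""
    start = time.perf_counter()
    for _ in range(iterations):
        for case in cases:
            fn(case)
    return time.perf_counter() - start

# Hypothetical usage: compare two candidate implementations on identical inputs
cases = ["It costs $5.50 today.", "Call at 3.14 o'clock."]
timings: Dict[str, float] = {
    "baseline": time_normalizer(str.strip, cases),  # placeholder normalizer
    "upper": time_normalizer(str.upper, cases),     # placeholder normalizer
}
for name, seconds in timings.items():
    print(f"{name}: {seconds:.3f}s")
```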

View file

@@ -1,8 +1,11 @@
import argparse
from typing import Any, Dict
from pathlib import Path

import numpy as np
import soundfile as sf
from tqdm import tqdm

def validate_tts(wav_path: str) -> dict:
    """
@@ -13,34 +16,40 @@ def validate_tts(wav_path: str) -> dict:
        audio, sr = sf.read(wav_path)
        if len(audio.shape) > 1:
            audio = np.mean(audio, axis=1)

        duration = len(audio) / sr
        issues = []

        # Basic quality checks
        abs_audio = np.abs(audio)
        stats = {
            "rms": float(np.sqrt(np.mean(audio**2))),
            "peak": float(np.max(abs_audio)),
            "dc_offset": float(np.mean(audio)),
        }

        clip_count = np.sum(abs_audio >= 0.99)
        clip_percent = (clip_count / len(audio)) * 100

        if duration < 0.1:
            issues.append(
                "WARNING: Audio is suspiciously short - possible failed generation"
            )

        if stats["peak"] >= 1.0:
            if clip_percent > 1.0:
                issues.append(
                    f"WARNING: Significant clipping detected ({clip_percent:.2e}% of samples)"
                )
            elif clip_percent > 0.01:
                issues.append(
                    f"INFO: Minor peak limiting detected ({clip_percent:.2e}% of samples)"
                )

        if stats["rms"] < 0.01:
            issues.append("WARNING: Audio is very quiet - possible failed generation")

        if abs(stats["dc_offset"]) > 0.1:
            issues.append(f"WARNING: High DC offset ({stats['dc_offset']:.3f})")

        # Check for long silence gaps
@@ -51,66 +60,79 @@ def validate_tts(wav_path: str) -> dict:
        window_size = int(min_silence * sr)
        silence_count = 0
        last_silence = -1

        start_idx = int(0.2 * sr)  # Skip first 0.2s
        for i in tqdm(
            range(start_idx, len(db) - window_size, window_size),
            desc="Checking for silence",
        ):
            window = db[i : i + window_size]
            if np.mean(window) < silence_threshold:
                silent_ratio = np.mean(window < silence_threshold)
                if silent_ratio > 0.9:
                    if last_silence == -1 or (i / sr - last_silence) > 2.0:
                        silence_count += 1
                        last_silence = i / sr
                        issues.append(
                            f"WARNING: Long silence detected at {i/sr:.2f}s (duration: {min_silence:.1f}s)"
                        )

        if silence_count > 2:
            issues.append(
                f"WARNING: Multiple long silences found ({silence_count} total)"
            )

        # Detect audio artifacts: sample-to-sample jumps that stand far
        # above the local average difference
        diff = np.diff(audio)
        abs_diff = np.abs(diff)
        window_size = min(int(0.005 * sr), 256)
        window = np.ones(window_size) / window_size
        local_avg_diff = np.convolve(abs_diff, window, mode="same")

        spikes = (abs_diff > (10 * local_avg_diff)) & (abs_diff > 0.1)
        artifact_indices = np.nonzero(spikes)[0]

        artifacts = []
        if len(artifact_indices) > 0:
            # Group spike indices separated by less than 5 ms into single events
            gaps = np.diff(artifact_indices)
            min_gap = int(0.005 * sr)
            break_points = np.nonzero(gaps > min_gap)[0] + 1
            groups = np.split(artifact_indices, break_points)

            for group in groups:
                if len(group) >= 5:
                    severity = np.max(abs_diff[group])
                    if severity > 0.2:
                        center_idx = group[len(group) // 2]
                        artifacts.append(
                            {
                                "time": float(
                                    center_idx / sr
                                ),  # Ensure float for consistent timing
                                "severity": float(severity),
                            }
                        )
                        issues.append(
                            f"WARNING: Audio discontinuity at {center_idx/sr:.3f}s "
                            f"(severity: {severity:.3f})"
                        )

        # Check for repeated speech segments by correlating adjacent chunks
        for chunk_duration in tqdm(
            [0.5, 2.5, 5.0, 10.0], desc="Checking for repeated speech"
        ):
            chunk_size = int(chunk_duration * sr)
            overlap = int(0.2 * chunk_size)

            for i in range(0, len(audio) - 2 * chunk_size, overlap):
                chunk1 = audio[i : i + chunk_size]
                chunk2 = audio[i + chunk_size : i + 2 * chunk_size]

                # Skip near-silent chunks
                if np.mean(np.abs(chunk1)) < 0.01 or np.mean(np.abs(chunk2)) < 0.01:
                    continue

                try:
                    correlation = np.corrcoef(chunk1, chunk2)[0, 1]
                    if not np.isnan(correlation) and correlation > 0.92:
                        issues.append(
                            f"WARNING: Possible repeated speech at {i/sr:.1f}s "
@@ -128,92 +150,113 @@ def validate_tts(wav_path: str) -> dict:
"rms_level": f"{stats['rms']:.3f}", "rms_level": f"{stats['rms']:.3f}",
"dc_offset": f"{stats['dc_offset']:.3f}", "dc_offset": f"{stats['dc_offset']:.3f}",
"artifact_count": len(artifacts), "artifact_count": len(artifacts),
"artifact_locations": [a['time'] for a in artifacts], "artifact_locations": [a["time"] for a in artifacts],
"artifact_severities": [a['severity'] for a in artifacts], "artifact_severities": [a["severity"] for a in artifacts],
"issues": issues, "issues": issues,
"valid": len(issues) == 0 "valid": len(issues) == 0,
}
except Exception as e:
return {
"file": wav_path,
"error": str(e),
"valid": False
} }
def generate_analysis_plots(wav_path: str, output_dir: str, validation_result: Dict[str, Any]): except Exception as e:
return {"file": wav_path, "error": str(e), "valid": False}
def generate_analysis_plots(
wav_path: str, output_dir: str, validation_result: Dict[str, Any]
):
""" """
Generate analysis plots for audio file with time-aligned visualizations. Generate analysis plots for audio file with time-aligned visualizations.
""" """
import matplotlib.pyplot as plt import matplotlib.pyplot as plt
from scipy.signal import spectrogram from scipy.signal import spectrogram
# Load audio # Load audio
audio, sr = sf.read(wav_path) audio, sr = sf.read(wav_path)
if len(audio.shape) > 1: if len(audio.shape) > 1:
audio = np.mean(audio, axis=1) audio = np.mean(audio, axis=1)
# Create figure with shared x-axis # Create figure with shared x-axis
fig = plt.figure(figsize=(15, 8)) fig = plt.figure(figsize=(15, 8))
gs = plt.GridSpec(2, 1, height_ratios=[1.2, 0.8], hspace=0.1) gs = plt.GridSpec(2, 1, height_ratios=[1.2, 0.8], hspace=0.1)
ax1 = fig.add_subplot(gs[0]) ax1 = fig.add_subplot(gs[0])
ax2 = fig.add_subplot(gs[1], sharex=ax1) ax2 = fig.add_subplot(gs[1], sharex=ax1)
# Calculate spectrogram # Calculate spectrogram
nperseg = 2048 nperseg = 2048
noverlap = 1536 noverlap = 1536
f, t, Sxx = spectrogram(audio, sr, nperseg=nperseg, noverlap=noverlap, f, t, Sxx = spectrogram(
window='hann', scaling='spectrum') audio, sr, nperseg=nperseg, noverlap=noverlap, window="hann", scaling="spectrum"
)
# Plot spectrogram # Plot spectrogram
im = ax1.pcolormesh(t, f, 10 * np.log10(Sxx + 1e-10), im = ax1.pcolormesh(
shading='gouraud', cmap='viridis', t,
vmin=-100, vmax=-20) f,
ax1.set_ylabel('Frequency [Hz]', fontsize=10) 10 * np.log10(Sxx + 1e-10),
cbar = plt.colorbar(im, ax=ax1, label='dB') shading="gouraud",
ax1.set_title('Spectrogram', pad=10, fontsize=12) cmap="viridis",
vmin=-100,
vmax=-20,
)
ax1.set_ylabel("Frequency [Hz]", fontsize=10)
cbar = plt.colorbar(im, ax=ax1, label="dB")
ax1.set_title("Spectrogram", pad=10, fontsize=12)
# Plot waveform with exact time alignment # Plot waveform with exact time alignment
times = np.arange(len(audio)) / sr times = np.arange(len(audio)) / sr
ax2.plot(times, audio, color='#2E5596', alpha=0.7, linewidth=0.5, label='Audio') ax2.plot(times, audio, color="#2E5596", alpha=0.7, linewidth=0.5, label="Audio")
ax2.set_ylabel('Amplitude', fontsize=10) ax2.set_ylabel("Amplitude", fontsize=10)
ax2.set_xlabel('Time [sec]', fontsize=10) ax2.set_xlabel("Time [sec]", fontsize=10)
ax2.grid(True, alpha=0.2) ax2.grid(True, alpha=0.2)
# Add artifact markers # Add artifact markers
if 'artifact_locations' in validation_result and validation_result['artifact_locations']: if (
for loc in validation_result['artifact_locations']: "artifact_locations" in validation_result
ax1.axvline(x=loc, color='red', alpha=0.7, linewidth=2) and validation_result["artifact_locations"]
ax2.axvline(x=loc, color='red', alpha=0.7, linewidth=2, label='Detected Artifacts') ):
for loc in validation_result["artifact_locations"]:
ax1.axvline(x=loc, color="red", alpha=0.7, linewidth=2)
ax2.axvline(
x=loc, color="red", alpha=0.7, linewidth=2, label="Detected Artifacts"
)
# Add legend to both plots # Add legend to both plots
if len(validation_result['artifact_locations']) > 0: if len(validation_result["artifact_locations"]) > 0:
ax1.plot([], [], color='red', linewidth=2, label='Detected Artifacts') ax1.plot([], [], color="red", linewidth=2, label="Detected Artifacts")
ax1.legend(loc='upper right', fontsize=8) ax1.legend(loc="upper right", fontsize=8)
# Only add unique labels to legend # Only add unique labels to legend
handles, labels = ax2.get_legend_handles_labels() handles, labels = ax2.get_legend_handles_labels()
unique_labels = dict(zip(labels, handles)) unique_labels = dict(zip(labels, handles))
ax2.legend(unique_labels.values(), unique_labels.keys(), ax2.legend(
loc='upper right', fontsize=8) unique_labels.values(),
unique_labels.keys(),
loc="upper right",
fontsize=8,
)
# Set common x limits # Set common x limits
xlim = (0, len(audio)/sr) xlim = (0, len(audio) / sr)
ax1.set_xlim(xlim) ax1.set_xlim(xlim)
ax2.set_xlim(xlim) ax2.set_xlim(xlim)
og_filename = Path(wav_path).name.split(".")[0] og_filename = Path(wav_path).name.split(".")[0]
# Save plot # Save plot
plt.savefig(Path(output_dir) / f"{og_filename}_audio_analysis.png", dpi=300, bbox_inches='tight') plt.savefig(
Path(output_dir) / f"{og_filename}_audio_analysis.png",
dpi=300,
bbox_inches="tight",
)
plt.close() plt.close()
if __name__ == "__main__":
wav_file = r"C:\Users\jerem\Desktop\Kokoro-FastAPI\examples\output.wav"
silent=False
if __name__ == "__main__":
wav_file = r"C:\Users\jerem\Desktop\Kokoro-FastAPI\examples\assorted_checks\benchmarks\output_audio\chunk_600_tokens.wav"
silent = False
print(f"\n\n Processing:\n\t{wav_file}")
result = validate_tts(wav_file) result = validate_tts(wav_file)
if not silent: if not silent:
wav_root_dir = Path(wav_file).parent wav_root_dir = Path(wav_file).parent
generate_analysis_plots(wav_file, wav_root_dir, result) generate_analysis_plots(wav_file, wav_root_dir, result)
print(f"\nValidating: {result['file']}") print(f"\nValidating: {result['file']}")
if "error" in result: if "error" in result:
print(f"Error: {result['error']}") print(f"Error: {result['error']}")
@ -224,10 +267,10 @@ if __name__ == "__main__":
print(f"RMS Level: {result['rms_level']}") print(f"RMS Level: {result['rms_level']}")
print(f"DC Offset: {result['dc_offset']}") print(f"DC Offset: {result['dc_offset']}")
print(f"Detected Artifacts: {result['artifact_count']}") print(f"Detected Artifacts: {result['artifact_count']}")
if result["issues"]: if result["issues"]:
print("\nIssues Found:") print("\nIssues Found:")
for issue in result["issues"]: for issue in result["issues"]:
print(f"- {issue}") print(f"- {issue}")
else: else:
print("\nNo issues found") print("\nNo issues found")

View file

@ -1,7 +1,9 @@
import argparse import argparse
from pathlib import Path from pathlib import Path
from validate_wav import validate_tts from validate_wav import validate_tts
def print_validation_result(result: dict, rel_path: Path): def print_validation_result(result: dict, rel_path: Path):
"""Print full validation details for a single file.""" """Print full validation details for a single file."""
print(f"\nValidating: {rel_path}") print(f"\nValidating: {rel_path}")
@ -13,7 +15,7 @@ def print_validation_result(result: dict, rel_path: Path):
print(f"Peak Amplitude: {result['peak_amplitude']}") print(f"Peak Amplitude: {result['peak_amplitude']}")
print(f"RMS Level: {result['rms_level']}") print(f"RMS Level: {result['rms_level']}")
print(f"DC Offset: {result['dc_offset']}") print(f"DC Offset: {result['dc_offset']}")
if result["issues"]: if result["issues"]:
print("\nIssues Found:") print("\nIssues Found:")
for issue in result["issues"]: for issue in result["issues"]:
@ -21,25 +23,26 @@ def print_validation_result(result: dict, rel_path: Path):
else: else:
print("\nNo issues found") print("\nNo issues found")
def validate_directory(directory: str): def validate_directory(directory: str):
"""Validate all wav files in a directory with detailed output and summary.""" """Validate all wav files in a directory with detailed output and summary."""
dir_path = Path(directory) dir_path = Path(directory)
# Find all wav files (including nested directories) # Find all wav files (including nested directories)
wav_files = list(dir_path.rglob("*.wav")) wav_files = list(dir_path.rglob("*.wav"))
wav_files.extend(dir_path.rglob("*.mp3")) # Also check mp3s wav_files.extend(dir_path.rglob("*.mp3")) # Also check mp3s
wav_files = sorted(wav_files) wav_files = sorted(wav_files)
if not wav_files: if not wav_files:
print(f"No .wav or .mp3 files found in {directory}") print(f"No .wav or .mp3 files found in {directory}")
return return
print(f"Found {len(wav_files)} files in {directory}") print(f"Found {len(wav_files)} files in {directory}")
print("=" * 80) print("=" * 80)
# Store results for summary # Store results for summary
results = [] results = []
# Detailed validation output # Detailed validation output
for wav_file in wav_files: for wav_file in wav_files:
result = validate_tts(str(wav_file)) result = validate_tts(str(wav_file))
@ -47,7 +50,7 @@ def validate_directory(directory: str):
print_validation_result(result, rel_path) print_validation_result(result, rel_path)
results.append((rel_path, result)) results.append((rel_path, result))
print("=" * 80) print("=" * 80)
# Summary with detailed issues # Summary with detailed issues
print("\nSUMMARY:") print("\nSUMMARY:")
for rel_path, result in results: for rel_path, result in results:
@ -58,15 +61,18 @@ def validate_directory(directory: str):
issues = result["issues"] issues = result["issues"]
first_issue = issues[0].replace("WARNING: ", "") first_issue = issues[0].replace("WARNING: ", "")
if len(issues) > 1: if len(issues) > 1:
print(f"{rel_path}: FAIL - {first_issue} (+{len(issues)-1} more issues)") print(
f"{rel_path}: FAIL - {first_issue} (+{len(issues)-1} more issues)"
)
else: else:
print(f"{rel_path}: FAIL - {first_issue}") print(f"{rel_path}: FAIL - {first_issue}")
else: else:
print(f"{rel_path}: PASS") print(f"{rel_path}: PASS")
if __name__ == "__main__": if __name__ == "__main__":
parser = argparse.ArgumentParser(description="Batch validate TTS wav files") parser = argparse.ArgumentParser(description="Batch validate TTS wav files")
parser.add_argument("directory", help="Directory containing wav files to validate") parser.add_argument("directory", help="Directory containing wav files to validate")
args = parser.parse_args() args = parser.parse_args()
validate_directory(args.directory) validate_directory(args.directory)
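Typical use is `python validate_wavs.py <directory>`, but the entry point can also be called directly. A short sketch (the directory name is illustrative):

```python
from validate_wavs import validate_directory

# Recursively validates every .wav/.mp3 under the folder,
# printing per-file details followed by a PASS/FAIL summary
validate_directory("output_audio")
```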

View file

@@ -13,7 +13,7 @@ numpy==2.2.1
scipy==1.14.1

# Audio processing
soundfile==0.13.0  # was 0.12.1

# Text processing
phonemizer==3.3.0
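The soundfile pin moves from 0.12.1 to 0.13.0. A quick round-trip sanity check after upgrading, as a sketch (the tone and filename are arbitrary; 24 kHz matches the sample rate used elsewhere in the project):

```python
import numpy as np
import soundfile as sf

print(sf.__version__)  # expect 0.13.0 after the upgrade

# Write and re-read one second of a 440 Hz sine at 24 kHz
sr = 24000
t = np.linspace(0, 1.0, sr, endpoint=False)
tone = 0.5 * np.sin(2 * np.pi * 440 * t)
sf.write("roundtrip.wav", tone, sr)
audio, sr_read = sf.read("roundtrip.wav")
assert sr_read == sr and len(audio) == len(tone)
```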