First streaming attempt

BIN  .coverage

.gitignore
@@ -23,4 +23,5 @@ examples/assorted_checks/test_openai/output/*
 examples/assorted_checks/test_voices/output/*
 examples/assorted_checks/test_formats/output/*
+examples/assorted_checks/benchmarks/output_audio_stream/*
 ui/RepoScreenshot.png

@@ -23,8 +23,16 @@ async def lifespan(app: FastAPI):
     # Initialize the main model with warm-up
     voicepack_count = TTSModel.setup()
+    logger.info("""
+███████╗ █████╗ ███████╗████████╗██╗  ██╗ ██████╗ ██╗  ██╗ ██████╗
+██╔════╝██╔══██╗██╔════╝╚══██╔══╝██║ ██╔╝██╔═══██╗██║ ██╔╝██╔═══██╗
+█████╗  ███████║███████╗   ██║   █████╔╝ ██║   ██║█████╔╝ ██║   ██║
+██╔══╝  ██╔══██║╚════██║   ██║   ██╔═██╗ ██║   ██║██╔═██╗ ██║   ██║
+██║     ██║  ██║███████║   ██║   ██║  ██╗╚██████╔╝██║  ██╗╚██████╔╝
+╚═╝     ╚═╝  ╚═╝╚══════╝   ╚═╝   ╚═╝  ╚═╝ ╚═════╝ ╚═╝  ╚═╝ ╚═════╝ """)
     logger.info(f"Model loaded and warmed up on {TTSModel.get_device()}")
     logger.info(f"{voicepack_count} voice packs loaded successfully")
+    logger.info("#" * 80)
     yield

@@ -2,10 +2,12 @@ from typing import List

 from loguru import logger
 from fastapi import Depends, Response, APIRouter, HTTPException
+from fastapi.responses import StreamingResponse

 from ..services.tts_service import TTSService
 from ..services.audio import AudioService
 from ..structures.schemas import OpenAISpeechRequest
+from typing import AsyncGenerator

 router = APIRouter(
     tags=["OpenAI Compatible TTS"],

@@ -18,6 +20,16 @@ def get_tts_service() -> TTSService:
     return TTSService()  # Initialize TTSService with default settings

+async def stream_audio_chunks(tts_service: TTSService, request: OpenAISpeechRequest) -> AsyncGenerator[bytes, None]:
+    """Stream audio chunks as they're generated"""
+    async for chunk in tts_service.generate_audio_stream(
+        text=request.input,
+        voice=request.voice,
+        speed=request.speed,
+        output_format=request.response_format
+    ):
+        yield chunk
+
+
 @router.post("/audio/speech")
 async def create_speech(
     request: OpenAISpeechRequest, tts_service: TTSService = Depends(get_tts_service)

@@ -31,24 +43,52 @@ async def create_speech(
                 f"Voice '{request.voice}' not found. Available voices: {', '.join(sorted(available_voices))}"
             )

-        # Generate audio directly using TTSService's method
-        audio, _ = tts_service._generate_audio(
-            text=request.input,
-            voice=request.voice,
-            speed=request.speed,
-            stitch_long_output=True,
-        )
+        # Set content type based on format
+        content_type = {
+            "mp3": "audio/mpeg",
+            "opus": "audio/opus",
+            "aac": "audio/aac",
+            "flac": "audio/flac",
+            "wav": "audio/wav",
+            "pcm": "audio/pcm",
+        }.get(request.response_format, f"audio/{request.response_format}")

-        # Convert to requested format
-        content = AudioService.convert_audio(audio, 24000, request.response_format)
+        if request.stream:
+            # Stream audio chunks as they're generated
+            return StreamingResponse(
+                stream_audio_chunks(tts_service, request),
+                media_type=content_type,
+                headers={
+                    "Content-Disposition": f"attachment; filename=speech.{request.response_format}",
+                    "X-Accel-Buffering": "no",  # Disable proxy buffering
+                    "Cache-Control": "no-cache",  # Prevent caching
+                },
+            )
+        else:
+            # Generate complete audio
+            audio, _ = tts_service._generate_audio(
+                text=request.input,
+                voice=request.voice,
+                speed=request.speed,
+                stitch_long_output=True,
+            )

-        return Response(
-            content=content,
-            media_type=f"audio/{request.response_format}",
-            headers={
-                "Content-Disposition": f"attachment; filename=speech.{request.response_format}"
-            },
-        )
+            # Convert to requested format
+            content = AudioService.convert_audio(
+                audio,
+                24000,
+                request.response_format,
+                is_first_chunk=True
+            )
+
+            return Response(
+                content=content,
+                media_type=content_type,
+                headers={
+                    "Content-Disposition": f"attachment; filename=speech.{request.response_format}",
+                    "Cache-Control": "no-cache",  # Prevent caching
+                },
+            )

     except ValueError as e:
         logger.error(f"Invalid request: {str(e)}")

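For reviewers trying this out: a client opts into streaming with the new `stream` flag and reads the response body incrementally. A minimal sketch, assuming the server is on localhost:8880 as in the benchmark scripts added below (input text and output filename are illustrative):

```python
import requests

response = requests.post(
    "http://localhost:8880/v1/audio/speech",
    json={
        "model": "kokoro",
        "input": "Hello world. This is a streaming test.",
        "voice": "af",
        "response_format": "pcm",  # raw samples: easy to play as they arrive
        "stream": True,
    },
    stream=True,  # let requests yield bytes as the server sends them
)
response.raise_for_status()

with open("speech.pcm", "wb") as f:
    for chunk in response.iter_content(chunk_size=1024):
        if chunk:
            f.write(chunk)  # first bytes arrive before synthesis finishes
```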
@@ -7,12 +7,35 @@ import soundfile as sf
 from loguru import logger


+class AudioNormalizer:
+    """Handles audio normalization state for a single stream"""
+
+    def __init__(self):
+        self.int16_max = np.iinfo(np.int16).max
+
+    def normalize(self, audio_data: np.ndarray) -> np.ndarray:
+        """Normalize audio data to int16 range"""
+        # Convert to float64 for accurate scaling
+        audio_float = audio_data.astype(np.float64)
+
+        # Scale to int16 range while preserving relative amplitudes
+        max_val = np.abs(audio_float).max()
+        if max_val > 0:
+            scaling = self.int16_max / max_val
+            audio_float *= scaling
+
+        # Clip to int16 range and convert
+        return np.clip(audio_float, -self.int16_max, self.int16_max).astype(np.int16)
+
+
 class AudioService:
     """Service for audio format conversions"""

     @staticmethod
     def convert_audio(
-        audio_data: np.ndarray, sample_rate: int, output_format: str
+        audio_data: np.ndarray,
+        sample_rate: int,
+        output_format: str,
+        is_first_chunk: bool = True,
+        normalizer: AudioNormalizer = None
     ) -> bytes:
         """Convert audio data to specified format

@@ -20,6 +43,7 @@ class AudioService:
             audio_data: Numpy array of audio samples
             sample_rate: Sample rate of the audio
             output_format: Target format (wav, mp3, opus, flac, pcm)
+            is_first_chunk: Whether this is the first chunk of a stream

         Returns:
             Bytes of the converted audio

@@ -27,30 +51,34 @@ class AudioService:
         buffer = BytesIO()

         try:
-            if output_format == "wav":
-                logger.info("Writing to WAV format...")
-                # Ensure audio_data is in int16 format for WAV
-                audio_data_wav = (
-                    audio_data / np.abs(audio_data).max() * np.iinfo(np.int16).max
-                ).astype(np.int16)  # Normalize
-                sf.write(buffer, audio_data_wav, sample_rate, format="WAV")
-            elif output_format == "mp3":
-                logger.info("Converting to MP3 format...")
-                # soundfile can write MP3 if ffmpeg or libsox is installed
-                sf.write(buffer, audio_data, sample_rate, format="MP3")
-            elif output_format == "opus":
-                logger.info("Converting to Opus format...")
-                sf.write(buffer, audio_data, sample_rate, format="OGG", subtype="OPUS")
-            elif output_format == "flac":
-                logger.info("Converting to FLAC format...")
-                sf.write(buffer, audio_data, sample_rate, format="FLAC")
-            elif output_format == "pcm":
-                logger.info("Extracting PCM data...")
-                # Ensure audio_data is in int16 format for PCM
-                audio_data_pcm = (
-                    audio_data / np.abs(audio_data).max() * np.iinfo(np.int16).max
-                ).astype(np.int16)  # Normalize
-                buffer.write(audio_data_pcm.tobytes())
+            # Normalize audio if normalizer provided, otherwise just convert to int16
+            if normalizer is not None:
+                normalized_audio = normalizer.normalize(audio_data)
+            else:
+                normalized_audio = audio_data.astype(np.int16)
+
+            if output_format == "pcm":
+                logger.info("Writing PCM data...")
+                # Raw 16-bit PCM samples, no header
+                buffer.write(normalized_audio.tobytes())
+            elif output_format == "wav":
+                logger.info("Writing to WAV format...")
+                # Always include WAV header for WAV format
+                sf.write(buffer, normalized_audio, sample_rate, format="WAV", subtype='PCM_16')
+            elif output_format in ["mp3", "aac"]:
+                logger.info(f"Converting to {output_format.upper()} format...")
+                # Use lower bitrate for streaming
+                sf.write(buffer, normalized_audio, sample_rate, format=output_format.upper(),
+                         subtype='COMPRESSED')
+            elif output_format == "opus":
+                logger.info("Converting to Opus format...")
+                # Use lower bitrate and smaller frame size for streaming
+                sf.write(buffer, normalized_audio, sample_rate, format="OGG", subtype="OPUS")
+            elif output_format == "flac":
+                logger.info("Converting to FLAC format...")
+                # Use smaller block size for streaming
+                sf.write(buffer, normalized_audio, sample_rate, format="FLAC",
+                         subtype='PCM_16')
             else:
                 raise ValueError(
                     f"Format {output_format} not supported. Supported formats are: wav, mp3, opus, flac, pcm."

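One behavior worth noting in `AudioNormalizer.normalize`: despite the docstring's mention of per-stream state, no gain is carried across calls, so every chunk is peak-normalized independently and a quiet chunk comes out as loud as a loud one. A small sketch of that behavior, assuming the class above:

```python
import numpy as np

norm = AudioNormalizer()

loud = np.array([0.0, 0.5, -1.0])    # peak amplitude 1.0
quiet = np.array([0.0, 0.05, -0.1])  # peak amplitude 0.1

print(norm.normalize(loud))   # [0, 16383, -32767] - peak scaled to int16 max
print(norm.normalize(quiet))  # [0, 16383, -32767] - same output: each call rescales to full range
```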
@@ -1,4 +1,5 @@
 import re
+from functools import lru_cache

 def split_num(num: re.Match) -> str:
     """Handle number splitting for various formats"""

@@ -48,6 +49,7 @@ def handle_decimal(num: re.Match) -> str:
     a, b = num.group().split(".")
     return " point ".join([a, " ".join(b)])


+@lru_cache(maxsize=1000)  # Cache normalized text results
 def normalize_text(text: str) -> str:
     """Normalize text for TTS processing

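`lru_cache` memoizes on the argument value, which is safe here because `normalize_text` takes a plain string and returns a deterministic result. A quick way to observe the cache (the function body below is a stand-in, not the real normalizer):

```python
from functools import lru_cache

@lru_cache(maxsize=1000)
def normalize_text(text: str) -> str:  # stand-in body for illustration
    return text.strip().lower()

normalize_text("Hello World. ")
normalize_text("Hello World. ")     # identical argument: served from cache
print(normalize_text.cache_info())  # CacheInfo(hits=1, misses=1, maxsize=1000, currsize=1)
```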
@@ -3,6 +3,7 @@ import os
 import re
 import time
 from typing import List, Tuple, Optional
+from functools import lru_cache

 import numpy as np
 import torch

@@ -12,6 +13,7 @@ from loguru import logger

 from ..core.config import settings
 from .tts_model import TTSModel
+from .audio import AudioService, AudioNormalizer


 class TTSService:

@@ -24,6 +26,12 @@ class TTSService:
         text = str(text) if text is not None else ""
         return [s.strip() for s in re.split(r"(?<=[.!?])\s+", text) if s.strip()]

+    @staticmethod
+    @lru_cache(maxsize=20)  # Cache up to 20 most recently used voices
+    def _load_voice(voice_path: str) -> torch.Tensor:
+        """Load and cache a voice model"""
+        return torch.load(voice_path, map_location=TTSModel.get_device(), weights_only=True)
+
     def _get_voice_path(self, voice_name: str) -> Optional[str]:
         """Get the path to a voice file"""
         voice_path = os.path.join(TTSModel.VOICES_DIR, f"{voice_name}.pt")

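The decorator order matters here: `@staticmethod` sits on top so that `lru_cache` wraps the plain function and the cache is shared class-wide, keyed on the path string. A toy model of the behavior (stand-in types, not the real loader):

```python
from functools import lru_cache

class VoiceLoader:
    @staticmethod
    @lru_cache(maxsize=20)
    def load(path: str) -> str:  # stand-in: returns a string instead of a torch.Tensor
        print(f"cache miss, loading {path}")
        return f"<voicepack:{path}>"

VoiceLoader.load("voices/af.pt")           # prints: cache miss, loading voices/af.pt
VoiceLoader.load("voices/af.pt")           # silent; result served from the cache
print(VoiceLoader.load.cache_info().hits)  # 1
```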
@@ -31,6 +39,13 @@ class TTSService:

     def _generate_audio(
         self, text: str, voice: str, speed: float, stitch_long_output: bool = True
+    ) -> Tuple[torch.Tensor, float]:
+        """Generate complete audio and return with processing time"""
+        audio, processing_time = self._generate_audio_internal(text, voice, speed, stitch_long_output)
+        return audio, processing_time
+
+    def _generate_audio_internal(
+        self, text: str, voice: str, speed: float, stitch_long_output: bool = True
     ) -> Tuple[torch.Tensor, float]:
         """Generate audio and measure processing time"""
         start_time = time.time()

@@ -49,10 +64,8 @@ class TTSService:
         if not voice_path:
             raise ValueError(f"Voice not found: {voice}")

-        # Load voice
-        voicepack = torch.load(
-            voice_path, map_location=TTSModel.get_device(), weights_only=True
-        )
+        # Load voice using cached loader
+        voicepack = self._load_voice(voice_path)

         # Generate audio with or without stitching
         if stitch_long_output:

@@ -97,6 +110,78 @@ class TTSService:
             logger.error(f"Error in audio generation: {str(e)}")
             raise

+    async def generate_audio_stream(
+        self, text: str, voice: str, speed: float, output_format: str = "wav"
+    ):
+        """Generate and yield audio chunks as they're generated for real-time streaming"""
+        try:
+            # Create normalizer for consistent audio levels
+            stream_normalizer = AudioNormalizer()
+
+            # Input validation and preprocessing
+            if not text:
+                raise ValueError("Text is empty")
+            normalized = normalize_text(text)
+            if not normalized:
+                raise ValueError("Text is empty after preprocessing")
+            text = str(normalized)
+
+            # Voice validation and loading
+            voice_path = self._get_voice_path(voice)
+            if not voice_path:
+                raise ValueError(f"Voice not found: {voice}")
+            voicepack = self._load_voice(voice_path)
+
+            # Split text into smaller chunks for faster streaming
+            # Use shorter chunks for real-time delivery
+            chunks = []
+            sentences = self._split_text(text)
+            current_chunk = []
+            current_length = 0
+            target_length = 100  # Target ~100 characters per chunk for faster processing
+
+            for sentence in sentences:
+                current_chunk.append(sentence)
+                current_length += len(sentence)
+                if current_length >= target_length:
+                    chunks.append(" ".join(current_chunk))
+                    current_chunk = []
+                    current_length = 0
+
+            if current_chunk:
+                chunks.append(" ".join(current_chunk))
+
+            # Process and stream chunks
+            for i, chunk in enumerate(chunks):
+                try:
+                    # Process text and generate audio
+                    phonemes, tokens = TTSModel.process_text(chunk, voice[0])
+                    chunk_audio = TTSModel.generate_from_tokens(tokens, voicepack, speed)
+
+                    if chunk_audio is not None:
+                        # Convert chunk with proper header handling
+                        chunk_bytes = AudioService.convert_audio(
+                            chunk_audio,
+                            24000,
+                            output_format,
+                            is_first_chunk=(i == 0),
+                            normalizer=stream_normalizer
+                        )
+                        yield chunk_bytes
+                    else:
+                        logger.error(f"No audio generated for chunk {i + 1}/{len(chunks)}")
+
+                except Exception as e:
+                    logger.error(
+                        f"Failed to generate audio for chunk {i + 1}/{len(chunks)}: '{chunk}'. Error: {str(e)}"
+                    )
+                    continue
+
+        except Exception as e:
+            logger.error(f"Error in audio generation stream: {str(e)}")
+            raise
+
     def _save_audio(self, audio: torch.Tensor, filepath: str):
         """Save audio to file"""
         os.makedirs(os.path.dirname(filepath), exist_ok=True)

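The generator can also be exercised directly, without going through FastAPI. A hedged sketch, assuming the model and voice packs are already initialized (voice name and output path are illustrative):

```python
import asyncio

async def save_stream_to_file():
    service = TTSService()
    with open("out.pcm", "wb") as f:
        async for chunk in service.generate_audio_stream(
            text="First sentence. Second sentence. Third one is a little longer.",
            voice="af",
            speed=1.0,
            output_format="pcm",
        ):
            f.write(chunk)  # each chunk covers roughly 100 characters of input text

asyncio.run(save_stream_to_file())
```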
@@ -22,7 +22,7 @@ class OpenAISpeechRequest(BaseModel):
     )
     response_format: Literal["mp3", "opus", "aac", "flac", "wav", "pcm"] = Field(
         default="mp3",
-        description="The format to return audio in. Supported formats: mp3, opus, flac, wav. AAC and PCM are not currently supported.",
+        description="The format to return audio in. Supported formats: mp3, opus, flac, wav, pcm. PCM format returns raw 16-bit samples without headers. AAC is not currently supported.",
     )
     speed: float = Field(
         default=1.0,

@@ -30,3 +30,7 @@ class OpenAISpeechRequest(BaseModel):
         le=4.0,
         description="The speed of the generated audio. Select a value from 0.25 to 4.0.",
     )
+    stream: bool = Field(
+        default=False,
+        description="If true, audio will be streamed as it's generated. Each chunk will be a complete sentence.",
+    )

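With the new field, a request body that opts into streaming looks like this (values besides `stream` mirror the defaults above and the benchmark scripts below):

```json
{
  "model": "kokoro",
  "input": "Text to synthesize.",
  "voice": "af",
  "response_format": "mp3",
  "speed": 1.0,
  "stream": true
}
```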
@@ -46,14 +46,14 @@ services:
       model-fetcher:
         condition: service_healthy

-  # Gradio UI service [Comment out everything below if you don't need it]
-  gradio-ui:
-    build:
-      context: ./ui
-    ports:
-      - "7860:7860"
-    volumes:
-      - ./ui/data:/app/ui/data
-      - ./ui/app.py:/app/app.py  # Mount app.py for hot reload
-    environment:
-      - GRADIO_WATCH=True  # Enable hot reloading
+  # # Gradio UI service [Comment out everything below if you don't need it]
+  # gradio-ui:
+  #   build:
+  #     context: ./ui
+  #   ports:
+  #     - "7860:7860"
+  #   volumes:
+  #     - ./ui/data:/app/ui/data
+  #     - ./ui/app.py:/app/app.py  # Mount app.py for hot reload
+  #   environment:
+  #     - GRADIO_WATCH=True  # Enable hot reloading

examples/assorted_checks/benchmarks/benchmark_first_token.py (new file, 157 lines)
@@ -0,0 +1,157 @@
#!/usr/bin/env python3
import os
import time
import json
import numpy as np
import requests
import pandas as pd

from lib.shared_benchmark_utils import get_text_for_tokens, enc
from lib.shared_utils import save_json_results
from lib.shared_plotting import plot_correlation, plot_timeline


def measure_first_token(text: str, output_dir: str, tokens: int, run_number: int) -> dict:
    """Measure time to audio via API calls and save the audio output"""
    results = {
        "text_length": len(text),
        "token_count": len(enc.encode(text)),
        "total_time": None,
        "time_to_first_chunk": None,
        "error": None,
        "audio_path": None,
        "audio_length": None,  # Length of output audio in seconds
    }

    try:
        start_time = time.time()

        # Make request without streaming
        response = requests.post(
            "http://localhost:8880/v1/audio/speech",
            json={
                "model": "kokoro",
                "input": text,
                "voice": "af",
                "response_format": "wav",
                "stream": False,
            },
            timeout=1800,
        )
        response.raise_for_status()

        # Save complete audio
        audio_filename = f"benchmark_tokens{tokens}_run{run_number}.wav"
        audio_path = os.path.join(output_dir, audio_filename)
        results["audio_path"] = audio_path

        content = response.content
        with open(audio_path, "wb") as f:
            f.write(content)

        # Calculate audio length using scipy
        import scipy.io.wavfile as wavfile
        sample_rate, audio_data = wavfile.read(audio_path)
        results["audio_length"] = len(audio_data) / sample_rate  # Length in seconds
        # Without streaming there is no partial delivery, so time-to-first-chunk
        # equals total request time
        results["time_to_first_chunk"] = time.time() - start_time

        results["total_time"] = time.time() - start_time
        return results

    except Exception as e:
        results["error"] = str(e)
        return results


def main():
    # Set up paths
    script_dir = os.path.dirname(os.path.abspath(__file__))
    output_dir = os.path.join(script_dir, "output_audio")
    output_data_dir = os.path.join(script_dir, "output_data")

    # Create output directories
    os.makedirs(output_dir, exist_ok=True)
    os.makedirs(output_data_dir, exist_ok=True)

    # Load sample text
    with open(os.path.join(script_dir, "the_time_machine_hg_wells.txt"), "r", encoding="utf-8") as f:
        text = f.read()

    # Test specific token counts
    token_sizes = [10, 25, 50, 100, 200, 500]
    all_results = []

    for tokens in token_sizes:
        print(f"\nTesting {tokens} tokens")
        test_text = get_text_for_tokens(text, tokens)
        actual_tokens = len(enc.encode(test_text))
        print(f"Text preview: {test_text[:50]}...")

        # Run test 5 times for each size to get average
        for i in range(5):
            print(f"Run {i+1}/5...")
            result = measure_first_token(test_text, output_dir, tokens, i + 1)
            result["target_tokens"] = tokens
            result["actual_tokens"] = actual_tokens
            result["run_number"] = i + 1

            if result["error"]:
                print(f"Error: {result['error']}")
            else:
                print(f"Time to Audio: {result['time_to_first_chunk']:.3f}s")
                print(f"Total time: {result['total_time']:.3f}s")

            all_results.append(result)

    # Calculate averages per token size
    summary = {}
    for tokens in token_sizes:
        matching_results = [r for r in all_results if r["target_tokens"] == tokens and not r["error"]]
        if matching_results:
            avg_first_chunk = sum(r["time_to_first_chunk"] for r in matching_results) / len(matching_results)
            avg_total = sum(r["total_time"] for r in matching_results) / len(matching_results)
            avg_audio_length = sum(r["audio_length"] for r in matching_results) / len(matching_results)
            summary[tokens] = {
                "avg_time_to_first_chunk": round(avg_first_chunk, 3),
                "avg_total_time": round(avg_total, 3),
                "avg_audio_length": round(avg_audio_length, 3),
                "num_successful_runs": len(matching_results),
            }

    # Save results
    results_data = {
        "individual_runs": all_results,
        "summary": summary,
        "timestamp": time.strftime("%Y-%m-%d %H:%M:%S"),
    }
    save_json_results(
        results_data,
        os.path.join(output_data_dir, "first_token_benchmark.json"),
    )

    # Create plot directory if it doesn't exist
    output_plots_dir = os.path.join(script_dir, "output_plots")
    os.makedirs(output_plots_dir, exist_ok=True)

    # Create DataFrame for plotting
    df = pd.DataFrame(all_results)

    # Create both plots
    plot_correlation(
        df, "target_tokens", "time_to_first_chunk",
        "Time to Audio vs Input Size",
        "Number of Input Tokens",
        "Time to Audio (seconds)",
        os.path.join(output_plots_dir, "first_token_latency.png"),
    )

    plot_timeline(
        df,
        os.path.join(output_plots_dir, "first_token_timeline.png"),
    )

    print("\nResults and plots saved to:")
    print(f"- {os.path.join(output_data_dir, 'first_token_benchmark.json')}")
    print(f"- {os.path.join(output_plots_dir, 'first_token_latency.png')}")
    print(f"- {os.path.join(output_plots_dir, 'first_token_timeline.png')}")


if __name__ == "__main__":
    main()

@@ -0,0 +1,207 @@
#!/usr/bin/env python3
import os
import time
import json
import numpy as np
import requests
import pandas as pd

from lib.shared_benchmark_utils import get_text_for_tokens, enc
from lib.shared_utils import save_json_results
from lib.shared_plotting import plot_correlation, plot_timeline


def measure_first_token(text: str, output_dir: str, tokens: int, run_number: int) -> dict:
    """Measure time to audio via API calls and save the audio output"""
    results = {
        "text_length": len(text),
        "token_count": len(enc.encode(text)),
        "total_time": None,
        "time_to_first_chunk": None,
        "error": None,
        "audio_path": None,
        "audio_length": None,  # Length of output audio in seconds
    }

    try:
        start_time = time.time()

        # Make request with streaming enabled
        response = requests.post(
            "http://localhost:8880/v1/audio/speech",
            json={
                "model": "kokoro",
                "input": text,
                "voice": "af",
                "response_format": "wav",
                "stream": True,
            },
            stream=True,
            timeout=1800,
        )
        response.raise_for_status()

        # Save complete audio
        audio_filename = f"benchmark_tokens{tokens}_run{run_number}_stream.wav"
        audio_path = os.path.join(output_dir, audio_filename)
        results["audio_path"] = audio_path

        first_chunk_time = None
        chunks = []
        for chunk in response.iter_content(chunk_size=1024):
            if chunk:
                if first_chunk_time is None:
                    first_chunk_time = time.time()
                    results["time_to_first_chunk"] = first_chunk_time - start_time
                chunks.append(chunk)

        # Extract WAV header and data separately
        # First chunk has header + data, subsequent chunks are raw PCM
        if not chunks:
            raise ValueError("No audio chunks received")

        first_chunk = chunks[0]
        remaining_chunks = chunks[1:]

        # Find end of WAV header (44 bytes for standard WAV)
        header = first_chunk[:44]
        first_data = first_chunk[44:]

        # Concatenate all PCM data
        all_data = first_data + b"".join(remaining_chunks)

        # Update WAV header with total data size
        import struct
        data_size = len(all_data)
        # Update RIFF chunk size field (bytes 4-7): data size + 36 remaining header bytes
        header = header[:4] + struct.pack("<I", data_size + 36) + header[8:]
        # Update data subchunk size field (bytes 40-43)
        header = header[:40] + struct.pack("<I", data_size) + header[44:]

        # Write complete WAV file
        complete_audio = header + all_data
        with open(audio_path, "wb") as f:
            f.write(complete_audio)

        # Calculate audio length using scipy
        import scipy.io.wavfile as wavfile
        sample_rate, audio_data = wavfile.read(audio_path)
        results["audio_length"] = len(audio_data) / sample_rate  # Length in seconds

        results["total_time"] = time.time() - start_time

        # Print debug info
        print(f"Complete audio size: {len(complete_audio)} bytes")
        print(f"Number of chunks received: {len(chunks)}")
        print(f"Audio length: {results['audio_length']:.3f}s")

        return results

    except Exception as e:
        results["error"] = str(e)
        return results


def main():
    # Set up paths with _stream suffix
    script_dir = os.path.dirname(os.path.abspath(__file__))
    output_dir = os.path.join(script_dir, "output_audio_stream")
    output_data_dir = os.path.join(script_dir, "output_data")

    # Create output directories
    os.makedirs(output_dir, exist_ok=True)
    os.makedirs(output_data_dir, exist_ok=True)

    # Load sample text
    with open(os.path.join(script_dir, "the_time_machine_hg_wells.txt"), "r", encoding="utf-8") as f:
        text = f.read()

    # Test specific token counts
    token_sizes = [50, 100, 200, 500]
    all_results = []

    for tokens in token_sizes:
        print(f"\nTesting {tokens} tokens (streaming)")
        test_text = get_text_for_tokens(text, tokens)
        actual_tokens = len(enc.encode(test_text))
        print(f"Text preview: {test_text[:50]}...")

        # Run test 3 times for each size to get average
        for i in range(3):
            print(f"Run {i+1}/3...")
            result = measure_first_token(test_text, output_dir, tokens, i + 1)
            result["target_tokens"] = tokens
            result["actual_tokens"] = actual_tokens
            result["run_number"] = i + 1

            if result["error"]:
                print(f"Error: {result['error']}")
            else:
                print(f"Time to First Audio: {result['time_to_first_chunk']:.3f}s")
                print(f"Time to Save Complete: {result['total_time']:.3f}s")
                print(f"Audio length: {result['audio_length']:.3f}s")
                print(f"Streaming overhead: {result['total_time'] - result['time_to_first_chunk']:.3f}s")

            all_results.append(result)

    # Calculate averages per token size
    summary = {}
    for tokens in token_sizes:
        matching_results = [r for r in all_results if r["target_tokens"] == tokens and not r["error"]]
        if matching_results:
            avg_first_chunk = sum(r["time_to_first_chunk"] for r in matching_results) / len(matching_results)
            avg_total = sum(r["total_time"] for r in matching_results) / len(matching_results)
            avg_audio_length = sum(r["audio_length"] for r in matching_results) / len(matching_results)
            summary[tokens] = {
                "avg_time_to_first_chunk": round(avg_first_chunk, 3),
                "avg_total_time": round(avg_total, 3),
                "avg_audio_length": round(avg_audio_length, 3),
                "num_successful_runs": len(matching_results),
            }

    # Save results with _stream suffix
    results_data = {
        "individual_runs": all_results,
        "summary": summary,
        "timestamp": time.strftime("%Y-%m-%d %H:%M:%S"),
    }
    save_json_results(
        results_data,
        os.path.join(output_data_dir, "first_token_benchmark_stream.json"),
    )

    # Create plot directory if it doesn't exist
    output_plots_dir = os.path.join(script_dir, "output_plots")
    os.makedirs(output_plots_dir, exist_ok=True)

    # Create DataFrame for plotting
    df = pd.DataFrame(all_results)

    # Create both plots with _stream suffix
    # Plot correlation for both metrics
    plot_correlation(
        df, "target_tokens", "time_to_first_chunk",
        "Time to First Audio vs Input Size (Streaming)",
        "Number of Input Tokens",
        "Time to First Audio (seconds)",
        os.path.join(output_plots_dir, "first_token_latency_stream.png"),
    )

    plot_correlation(
        df, "target_tokens", "total_time",
        "Total Time vs Input Size (Streaming)",
        "Number of Input Tokens",
        "Total Time (seconds)",
        os.path.join(output_plots_dir, "total_time_latency_stream.png"),
    )

    plot_timeline(
        df,
        os.path.join(output_plots_dir, "first_token_timeline_stream.png"),
    )

    print("\nResults and plots saved to:")
    print(f"- {os.path.join(output_data_dir, 'first_token_benchmark_stream.json')}")
    print(f"- {os.path.join(output_plots_dir, 'first_token_latency_stream.png')}")
    print(f"- {os.path.join(output_plots_dir, 'total_time_latency_stream.png')}")
    print(f"- {os.path.join(output_plots_dir, 'first_token_timeline_stream.png')}")


if __name__ == "__main__":
    main()

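The header patching in the script above leans on the canonical 44-byte PCM WAV layout: the RIFF chunk size lives at byte offset 4 (total file size minus 8, i.e. data size + 36) and the data subchunk size at offset 40. A self-contained check of those offsets using only the standard library:

```python
import io
import struct
import wave

# Build a tiny WAV with the wave module, then read the two size fields
# that the benchmark script patches.
pcm = b"\x00\x01" * 100  # 100 dummy 16-bit mono samples (200 bytes)
buf = io.BytesIO()
with wave.open(buf, "wb") as w:
    w.setnchannels(1)
    w.setsampwidth(2)
    w.setframerate(24000)
    w.writeframes(pcm)

raw = buf.getvalue()
riff_size = struct.unpack_from("<I", raw, 4)[0]   # bytes 4-7
data_size = struct.unpack_from("<I", raw, 40)[0]  # bytes 40-43
assert riff_size == data_size + 36
assert data_size == len(pcm)
print(riff_size, data_size)  # 236 200
```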
@@ -1,7 +1,9 @@
 """Shared plotting utilities for benchmarks and tests."""
 import pandas as pd
 import seaborn as sns
+import numpy as np
 import matplotlib.pyplot as plt
+import matplotlib.patches as patches

 # Common style configurations
 STYLE_CONFIG = {

@@ -136,6 +138,132 @@ def plot_system_metrics(metrics_data, output_path):
     plt.savefig(output_path, dpi=300, bbox_inches="tight")
     plt.close()

+
+def plot_timeline(df, output_path):
+    """Create timeline plot showing latency for each run.
+
+    Args:
+        df: pandas DataFrame containing run data with columns:
+            - target_tokens: number of tokens
+            - run_number: run iteration
+            - time_to_first_chunk: latency to first audio chunk
+        output_path: str, path to save the output plot
+    """
+    plt.style.use("dark_background")
+
+    # Sort by tokens and run number
+    df = df.sort_values(['target_tokens', 'run_number'])
+
+    # Create figure and axis
+    fig, ax = plt.subplots(figsize=(12, 6))
+
+    # Calculate y positions for each run with tighter grouping
+    unique_tokens = sorted(df['target_tokens'].unique())
+    y_positions = {}
+    current_y = 0
+    group_spacing = 0.8  # Space between groups
+    run_spacing = 0.2  # Space between runs in a group
+
+    for tokens in unique_tokens:
+        runs = df[df['target_tokens'] == tokens]
+        base_y = current_y
+        for i, (_, run) in enumerate(runs.iterrows()):
+            y_positions[(tokens, run['run_number'])] = base_y + (i * run_spacing)
+        current_y = base_y + (len(runs) * run_spacing) + group_spacing
+
+    # Plot bars and points with more transparency
+    bar_height = 0.15
+    for _, row in df.iterrows():
+        y = y_positions[(row['target_tokens'], row['run_number'])]
+        latency = row['time_to_first_chunk']
+
+        # Latency bar
+        ax.add_patch(patches.Rectangle(
+            (0, y - bar_height/2),
+            latency,
+            bar_height,
+            facecolor=STYLE_CONFIG["primary_color"],
+            alpha=0.3
+        ))
+
+        # End point
+        ax.plot(latency, y, 'o',
+                color=STYLE_CONFIG["secondary_color"],
+                markersize=4,
+                alpha=0.5)
+
+    # Add mean lines and values for each token group
+    for tokens in unique_tokens:
+        token_runs = df[df['target_tokens'] == tokens]
+        mean_latency = token_runs['time_to_first_chunk'].mean()
+        y_positions_for_token = [y_positions[(tokens, run['run_number'])] for _, run in token_runs.iterrows()]
+        min_y = min(y_positions_for_token)
+        max_y = max(y_positions_for_token)
+        group_center = (min_y + max_y) / 2
+
+        # Plot mean line with gradient alpha
+        gradient = np.linspace(0.2, 0.8, 100)
+        for i in range(len(gradient)-1):
+            y1 = min_y - bar_height + (max_y - min_y + 2*bar_height) * (i/len(gradient))
+            y2 = min_y - bar_height + (max_y - min_y + 2*bar_height) * ((i+1)/len(gradient))
+            ax.plot([mean_latency, mean_latency], [y1, y2],
+                    '-', color=STYLE_CONFIG["secondary_color"],
+                    linewidth=3, alpha=gradient[i])
+
+        # Add mean value label with background
+        label_text = f'Mean: {mean_latency:.3f}s'
+        bbox_props = dict(
+            facecolor=STYLE_CONFIG["background_color"],
+            edgecolor=STYLE_CONFIG["secondary_color"],
+            alpha=0.8,
+            pad=3,
+            linewidth=1
+        )
+        ax.text(mean_latency + 0.02, group_center,
+                label_text,
+                color=STYLE_CONFIG["secondary_color"],
+                va='center',
+                fontsize=10,
+                fontweight='bold',
+                bbox=bbox_props)
+
+    # Customize plot
+    ax.set_ylim(-1, current_y)
+    ax.set_xlim(0, df['time_to_first_chunk'].max() * 1.3)  # Extra space for labels
+
+    # Add labels for token groups with tighter spacing
+    group_positions = {}
+    for tokens in unique_tokens:
+        runs = df[df['target_tokens'] == tokens]
+        y_positions_for_token = [y_positions[(tokens, run['run_number'])] for _, run in runs.iterrows()]
+        group_positions[tokens] = sum(y_positions_for_token) / len(y_positions_for_token)
+        plt.axhline(y=min(y_positions_for_token) - bar_height,
+                    color='white', alpha=0.1, linestyle='-')
+
+    # Calculate mean audio length for each token group
+    audio_lengths = {}
+    for tokens in unique_tokens:
+        token_runs = df[df['target_tokens'] == tokens]
+        audio_lengths[tokens] = token_runs['audio_length'].mean()
+
+    # Set y-ticks at group centers with token counts and audio lengths
+    plt.yticks(
+        list(group_positions.values()),
+        [f'{tokens} tokens\n({audio_lengths[tokens]:.1f}s)' for tokens in group_positions.keys()],
+        fontsize=10
+    )
+
+    # Customize appearance
+    setup_plot(
+        fig, ax,
+        "Time-To-Audio Latency",
+        xlabel="Time (seconds)",
+        ylabel="Input Size"
+    )
+
+    plt.tight_layout()
+    plt.savefig(output_path, dpi=300, bbox_inches="tight")
+    plt.close()
+
+
 def plot_correlation(df, x, y, title, xlabel, ylabel, output_path):
     """Create correlation plot with regression line and correlation coefficient.

@ -0,0 +1,403 @@
|
||||||
|
{
|
||||||
|
"individual_runs": [
|
||||||
|
{
|
||||||
|
"text_length": 37,
|
||||||
|
"token_count": 10,
|
||||||
|
"total_time": 0.16574740409851074,
|
||||||
|
"time_to_first_chunk": 0.16574740409851074,
|
||||||
|
"error": null,
|
||||||
|
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio\\benchmark_tokens10_run1.wav",
|
||||||
|
"audio_length": 3.45,
|
||||||
|
"target_tokens": 10,
|
||||||
|
"actual_tokens": 10,
|
||||||
|
"run_number": 1
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"text_length": 37,
|
||||||
|
"token_count": 10,
|
||||||
|
"total_time": 0.18812799453735352,
|
||||||
|
"time_to_first_chunk": 0.18812799453735352,
|
||||||
|
"error": null,
|
||||||
|
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio\\benchmark_tokens10_run2.wav",
|
||||||
|
"audio_length": 3.45,
|
||||||
|
"target_tokens": 10,
|
||||||
|
"actual_tokens": 10,
|
||||||
|
"run_number": 2
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"text_length": 37,
|
||||||
|
"token_count": 10,
|
||||||
|
"total_time": 0.18645429611206055,
|
||||||
|
"time_to_first_chunk": 0.18645429611206055,
|
||||||
|
"error": null,
|
||||||
|
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio\\benchmark_tokens10_run3.wav",
|
||||||
|
"audio_length": 3.45,
|
||||||
|
"target_tokens": 10,
|
||||||
|
"actual_tokens": 10,
|
||||||
|
"run_number": 3
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"text_length": 37,
|
||||||
|
"token_count": 10,
|
||||||
|
"total_time": 0.17632031440734863,
|
||||||
|
"time_to_first_chunk": 0.17632031440734863,
|
||||||
|
"error": null,
|
||||||
|
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio\\benchmark_tokens10_run4.wav",
|
||||||
|
"audio_length": 3.45,
|
||||||
|
"target_tokens": 10,
|
||||||
|
"actual_tokens": 10,
|
||||||
|
"run_number": 4
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"text_length": 37,
|
||||||
|
"token_count": 10,
|
||||||
|
"total_time": 0.13381195068359375,
|
||||||
|
"time_to_first_chunk": 0.13381195068359375,
|
||||||
|
"error": null,
|
||||||
|
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio\\benchmark_tokens10_run5.wav",
|
||||||
|
"audio_length": 3.45,
|
||||||
|
"target_tokens": 10,
|
||||||
|
"actual_tokens": 10,
|
||||||
|
"run_number": 5
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"text_length": 102,
|
||||||
|
"token_count": 25,
|
||||||
|
"total_time": 0.2086498737335205,
|
||||||
|
"time_to_first_chunk": 0.2086498737335205,
|
||||||
|
"error": null,
|
||||||
|
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio\\benchmark_tokens25_run1.wav",
|
||||||
|
"audio_length": 7.225,
|
||||||
|
"target_tokens": 25,
|
||||||
|
"actual_tokens": 25,
|
||||||
|
"run_number": 1
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"text_length": 102,
|
||||||
|
"token_count": 25,
|
||||||
|
"total_time": 0.2727653980255127,
|
||||||
|
"time_to_first_chunk": 0.2727653980255127,
|
||||||
|
"error": null,
|
||||||
|
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio\\benchmark_tokens25_run2.wav",
|
||||||
|
"audio_length": 7.225,
|
||||||
|
"target_tokens": 25,
|
||||||
|
"actual_tokens": 25,
|
||||||
|
"run_number": 2
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"text_length": 102,
|
||||||
|
"token_count": 25,
|
||||||
|
"total_time": 0.2096250057220459,
|
||||||
|
"time_to_first_chunk": 0.2096250057220459,
|
||||||
|
"error": null,
|
||||||
|
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio\\benchmark_tokens25_run3.wav",
|
||||||
|
"audio_length": 7.225,
|
||||||
|
"target_tokens": 25,
|
||||||
|
"actual_tokens": 25,
|
||||||
|
"run_number": 3
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"text_length": 102,
|
||||||
|
"token_count": 25,
|
||||||
|
"total_time": 0.2256758213043213,
|
||||||
|
"time_to_first_chunk": 0.2256758213043213,
|
||||||
|
"error": null,
|
||||||
|
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio\\benchmark_tokens25_run4.wav",
|
||||||
|
"audio_length": 7.225,
|
||||||
|
"target_tokens": 25,
|
||||||
|
"actual_tokens": 25,
|
||||||
|
"run_number": 4
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"text_length": 102,
|
||||||
|
"token_count": 25,
|
||||||
|
"total_time": 0.1945042610168457,
|
||||||
|
"time_to_first_chunk": 0.1945042610168457,
|
||||||
|
"error": null,
|
||||||
|
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio\\benchmark_tokens25_run5.wav",
|
||||||
|
"audio_length": 7.225,
|
||||||
|
"target_tokens": 25,
|
||||||
|
"actual_tokens": 25,
|
||||||
|
"run_number": 5
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"text_length": 212,
|
||||||
|
"token_count": 50,
|
||||||
|
"total_time": 0.4975121021270752,
|
||||||
|
"time_to_first_chunk": 0.4975121021270752,
|
||||||
|
"error": null,
|
||||||
|
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio\\benchmark_tokens50_run1.wav",
|
||||||
|
"audio_length": 16.325,
|
||||||
|
"target_tokens": 50,
|
||||||
|
"actual_tokens": 50,
|
||||||
|
"run_number": 1
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"text_length": 212,
|
||||||
|
"token_count": 50,
|
||||||
|
"total_time": 0.4518404006958008,
|
||||||
|
"time_to_first_chunk": 0.4518404006958008,
|
||||||
|
"error": null,
|
||||||
|
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio\\benchmark_tokens50_run2.wav",
|
||||||
|
"audio_length": 16.325,
|
||||||
|
"target_tokens": 50,
|
||||||
|
"actual_tokens": 50,
|
||||||
|
"run_number": 2
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"text_length": 212,
|
||||||
|
"token_count": 50,
|
||||||
|
"total_time": 0.5640325546264648,
|
||||||
|
"time_to_first_chunk": 0.5640325546264648,
|
||||||
|
"error": null,
|
||||||
|
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio\\benchmark_tokens50_run3.wav",
|
||||||
|
"audio_length": 16.325,
|
||||||
|
"target_tokens": 50,
|
||||||
|
"actual_tokens": 50,
|
||||||
|
"run_number": 3
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"text_length": 212,
|
||||||
|
"token_count": 50,
|
||||||
|
"total_time": 0.5305957794189453,
|
||||||
|
"time_to_first_chunk": 0.5305957794189453,
|
||||||
|
"error": null,
|
||||||
|
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio\\benchmark_tokens50_run4.wav",
|
||||||
|
"audio_length": 16.325,
|
||||||
|
"target_tokens": 50,
|
||||||
|
"actual_tokens": 50,
|
||||||
|
"run_number": 4
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"text_length": 212,
|
||||||
|
"token_count": 50,
|
||||||
|
"total_time": 0.5540030002593994,
|
||||||
|
"time_to_first_chunk": 0.5540030002593994,
|
||||||
|
"error": null,
|
||||||
|
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio\\benchmark_tokens50_run5.wav",
|
||||||
|
"audio_length": 16.325,
|
||||||
|
"target_tokens": 50,
|
||||||
|
"actual_tokens": 50,
|
||||||
|
"run_number": 5
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"text_length": 448,
|
||||||
|
"token_count": 100,
|
||||||
|
"total_time": 0.7963137626647949,
|
||||||
|
"time_to_first_chunk": 0.7963137626647949,
|
||||||
|
"error": null,
|
||||||
|
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio\\benchmark_tokens100_run1.wav",
|
||||||
|
"audio_length": 31.1,
|
||||||
|
"target_tokens": 100,
|
||||||
|
"actual_tokens": 100,
|
||||||
|
"run_number": 1
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"text_length": 448,
|
||||||
|
"token_count": 100,
|
||||||
|
"total_time": 0.9320805072784424,
|
||||||
|
"time_to_first_chunk": 0.9320805072784424,
|
||||||
|
"error": null,
|
||||||
|
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio\\benchmark_tokens100_run2.wav",
|
||||||
|
"audio_length": 31.1,
|
||||||
|
"target_tokens": 100,
|
||||||
|
"actual_tokens": 100,
|
||||||
|
"run_number": 2
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"text_length": 448,
|
||||||
|
"token_count": 100,
|
||||||
|
"total_time": 0.824256181716919,
|
||||||
|
"time_to_first_chunk": 0.824256181716919,
|
||||||
|
"error": null,
|
||||||
|
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio\\benchmark_tokens100_run3.wav",
|
||||||
|
"audio_length": 31.1,
|
||||||
|
"target_tokens": 100,
|
||||||
|
"actual_tokens": 100,
|
||||||
|
"run_number": 3
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"text_length": 448,
|
||||||
|
"token_count": 100,
|
||||||
|
"total_time": 0.9034836292266846,
|
||||||
|
"time_to_first_chunk": 0.9034836292266846,
|
||||||
|
"error": null,
|
||||||
|
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio\\benchmark_tokens100_run4.wav",
|
||||||
|
"audio_length": 31.1,
|
||||||
|
"target_tokens": 100,
|
||||||
|
"actual_tokens": 100,
|
||||||
|
"run_number": 4
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"text_length": 448,
|
||||||
|
"token_count": 100,
|
||||||
|
"total_time": 0.8364357948303223,
|
||||||
|
"time_to_first_chunk": 0.8364357948303223,
|
||||||
|
"error": null,
|
||||||
|
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio\\benchmark_tokens100_run5.wav",
|
||||||
|
"audio_length": 31.1,
|
||||||
|
"target_tokens": 100,
|
||||||
|
"actual_tokens": 100,
|
||||||
|
"run_number": 5
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"text_length": 906,
|
||||||
|
"token_count": 200,
|
||||||
|
"total_time": 1.8122682571411133,
|
||||||
|
"time_to_first_chunk": 1.8122682571411133,
|
||||||
|
"error": null,
|
||||||
|
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio\\benchmark_tokens200_run1.wav",
|
||||||
|
"audio_length": 62.625,
|
||||||
|
"target_tokens": 200,
|
||||||
|
"actual_tokens": 200,
|
||||||
|
"run_number": 1
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"text_length": 906,
|
||||||
|
"token_count": 200,
|
||||||
|
"total_time": 1.7290427684783936,
|
||||||
|
"time_to_first_chunk": 1.7290427684783936,
|
||||||
|
"error": null,
|
||||||
|
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio\\benchmark_tokens200_run2.wav",
|
||||||
|
"audio_length": 62.625,
|
||||||
|
"target_tokens": 200,
|
||||||
|
"actual_tokens": 200,
|
||||||
|
"run_number": 2
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"text_length": 906,
|
||||||
|
"token_count": 200,
|
||||||
|
"total_time": 2.141728401184082,
|
||||||
|
"time_to_first_chunk": 2.141728401184082,
|
||||||
|
"error": null,
|
||||||
|
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio\\benchmark_tokens200_run3.wav",
|
||||||
|
"audio_length": 62.625,
|
||||||
|
"target_tokens": 200,
|
||||||
|
"actual_tokens": 200,
|
||||||
|
"run_number": 3
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"text_length": 906,
|
||||||
|
"token_count": 200,
|
||||||
|
"total_time": 2.0155680179595947,
|
||||||
|
"time_to_first_chunk": 2.0155680179595947,
|
||||||
|
"error": null,
|
||||||
|
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio\\benchmark_tokens200_run4.wav",
|
||||||
|
"audio_length": 62.625,
|
||||||
|
"target_tokens": 200,
|
||||||
|
"actual_tokens": 200,
|
||||||
|
"run_number": 4
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"text_length": 906,
|
||||||
|
"token_count": 200,
|
||||||
|
"total_time": 1.8707575798034668,
|
||||||
|
"time_to_first_chunk": 1.8707575798034668,
|
||||||
|
"error": null,
|
||||||
|
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio\\benchmark_tokens200_run5.wav",
|
||||||
|
"audio_length": 62.625,
|
||||||
|
"target_tokens": 200,
|
||||||
|
"actual_tokens": 200,
|
||||||
|
"run_number": 5
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"text_length": 2232,
|
||||||
|
"token_count": 500,
|
||||||
|
"total_time": 4.822713851928711,
|
||||||
|
"time_to_first_chunk": 4.822713851928711,
|
||||||
|
"error": null,
|
||||||
|
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio\\benchmark_tokens500_run1.wav",
|
||||||
|
"audio_length": 157.875,
|
||||||
|
"target_tokens": 500,
|
||||||
|
"actual_tokens": 500,
|
||||||
|
"run_number": 1
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"text_length": 2232,
|
||||||
|
"token_count": 500,
|
||||||
|
"total_time": 4.227782726287842,
|
||||||
|
"time_to_first_chunk": 4.227782726287842,
|
||||||
|
"error": null,
|
||||||
|
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio\\benchmark_tokens500_run2.wav",
|
||||||
|
"audio_length": 157.875,
|
||||||
|
"target_tokens": 500,
|
||||||
|
"actual_tokens": 500,
|
||||||
|
"run_number": 2
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"text_length": 2232,
|
||||||
|
"token_count": 500,
|
||||||
|
"total_time": 4.414916276931763,
|
||||||
|
"time_to_first_chunk": 4.414916276931763,
|
||||||
|
"error": null,
|
||||||
|
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio\\benchmark_tokens500_run3.wav",
|
||||||
|
"audio_length": 157.875,
|
||||||
|
"target_tokens": 500,
|
||||||
|
"actual_tokens": 500,
|
||||||
|
"run_number": 3
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"text_length": 2232,
|
||||||
|
"token_count": 500,
|
||||||
|
"total_time": 4.579505681991577,
|
||||||
|
"time_to_first_chunk": 4.579505681991577,
|
||||||
|
"error": null,
|
||||||
|
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio\\benchmark_tokens500_run4.wav",
|
||||||
|
"audio_length": 157.875,
|
||||||
|
"target_tokens": 500,
|
||||||
|
"actual_tokens": 500,
|
||||||
|
"run_number": 4
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"text_length": 2232,
|
||||||
|
"token_count": 500,
|
||||||
|
"total_time": 4.332529067993164,
|
||||||
|
"time_to_first_chunk": 4.332529067993164,
|
||||||
|
"error": null,
|
||||||
|
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio\\benchmark_tokens500_run5.wav",
|
||||||
|
"audio_length": 157.875,
|
||||||
|
"target_tokens": 500,
|
||||||
|
"actual_tokens": 500,
|
||||||
|
"run_number": 5
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"summary": {
|
||||||
|
"10": {
|
||||||
|
"avg_time_to_first_chunk": 0.17,
|
||||||
|
"avg_total_time": 0.17,
|
||||||
|
"avg_audio_length": 3.45,
|
||||||
|
"num_successful_runs": 5
|
||||||
|
},
|
||||||
|
"25": {
|
||||||
|
"avg_time_to_first_chunk": 0.222,
|
||||||
|
"avg_total_time": 0.222,
|
||||||
|
"avg_audio_length": 7.225,
|
||||||
|
"num_successful_runs": 5
|
||||||
|
},
|
||||||
|
"50": {
|
||||||
|
"avg_time_to_first_chunk": 0.52,
|
||||||
|
"avg_total_time": 0.52,
|
||||||
|
"avg_audio_length": 16.325,
|
||||||
|
"num_successful_runs": 5
|
||||||
|
},
|
||||||
|
"100": {
|
||||||
|
"avg_time_to_first_chunk": 0.859,
|
||||||
|
"avg_total_time": 0.859,
|
||||||
|
"avg_audio_length": 31.1,
|
||||||
|
"num_successful_runs": 5
|
||||||
|
},
|
||||||
|
"200": {
|
||||||
|
"avg_time_to_first_chunk": 1.914,
|
||||||
|
"avg_total_time": 1.914,
|
||||||
|
"avg_audio_length": 62.625,
|
||||||
|
"num_successful_runs": 5
|
||||||
|
},
|
||||||
|
"500": {
|
||||||
|
"avg_time_to_first_chunk": 4.475,
|
||||||
|
"avg_total_time": 4.475,
|
||||||
|
"avg_audio_length": 157.875,
|
||||||
|
"num_successful_runs": 5
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"timestamp": "2025-01-04 13:52:28"
|
||||||
|
}
|
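Note: in the non-streaming results above, "time_to_first_chunk" always equals "total_time", since the whole file is returned in a single response. Each "summary" entry is simply the per-token-count mean over the successful individual runs; a minimal sketch of how it can be recomputed (the benchmark_results.json file name is an assumption about where this JSON is saved):

import json
from collections import defaultdict
from statistics import mean

# Assumed file name; point this at wherever the benchmark JSON above lives.
with open("benchmark_results.json") as f:
    results = json.load(f)

# Group successful runs by their target token count.
by_tokens = defaultdict(list)
for run in results["individual_runs"]:
    if run["error"] is None:
        by_tokens[run["target_tokens"]].append(run)

# Recompute the per-token-count averages reported in "summary".
for tokens, runs in sorted(by_tokens.items()):
    print(f"{tokens} tokens: "
          f"avg_time_to_first_chunk={mean(r['time_to_first_chunk'] for r in runs):.3f}s, "
          f"avg_total_time={mean(r['total_time'] for r in runs):.3f}s, "
          f"avg_audio_length={mean(r['audio_length'] for r in runs):.3f}s "
          f"({len(runs)} successful runs)")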
@ -0,0 +1,175 @@
{
  "individual_runs": [
    {
      "text_length": 212,
      "token_count": 50,
      "total_time": 0.9603095054626465,
      "time_to_first_chunk": 0.5916037559509277,
      "error": null,
      "audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens50_run1_stream.wav",
      "audio_length": 15.45,
      "target_tokens": 50,
      "actual_tokens": 50,
      "run_number": 1
    },
    {
      "text_length": 212,
      "token_count": 50,
      "total_time": 0.5130870342254639,
      "time_to_first_chunk": 0.27448558807373047,
      "error": null,
      "audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens50_run2_stream.wav",
      "audio_length": 15.45,
      "target_tokens": 50,
      "actual_tokens": 50,
      "run_number": 2
    },
    {
      "text_length": 212,
      "token_count": 50,
      "total_time": 0.4667215347290039,
      "time_to_first_chunk": 0.22882533073425293,
      "error": null,
      "audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens50_run3_stream.wav",
      "audio_length": 15.45,
      "target_tokens": 50,
      "actual_tokens": 50,
      "run_number": 3
    },
    {
      "text_length": 448,
      "token_count": 100,
      "total_time": 0.9051008224487305,
      "time_to_first_chunk": 0.2526383399963379,
      "error": null,
      "audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens100_run1_stream.wav",
      "audio_length": 30.25,
      "target_tokens": 100,
      "actual_tokens": 100,
      "run_number": 1
    },
    {
      "text_length": 448,
      "token_count": 100,
      "total_time": 0.8579132556915283,
      "time_to_first_chunk": 0.25691914558410645,
      "error": null,
      "audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens100_run2_stream.wav",
      "audio_length": 30.25,
      "target_tokens": 100,
      "actual_tokens": 100,
      "run_number": 2
    },
    {
      "text_length": 448,
      "token_count": 100,
      "total_time": 0.9683890342712402,
      "time_to_first_chunk": 0.26229000091552734,
      "error": null,
      "audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens100_run3_stream.wav",
      "audio_length": 30.25,
      "target_tokens": 100,
      "actual_tokens": 100,
      "run_number": 3
    },
    {
      "text_length": 906,
      "token_count": 200,
      "total_time": 1.8075971603393555,
      "time_to_first_chunk": 0.22536945343017578,
      "error": null,
      "audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens200_run1_stream.wav",
      "audio_length": 60.75,
      "target_tokens": 200,
      "actual_tokens": 200,
      "run_number": 1
    },
    {
      "text_length": 906,
      "token_count": 200,
      "total_time": 1.493518590927124,
      "time_to_first_chunk": 0.21502947807312012,
      "error": null,
      "audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens200_run2_stream.wav",
      "audio_length": 60.75,
      "target_tokens": 200,
      "actual_tokens": 200,
      "run_number": 2
    },
    {
      "text_length": 906,
      "token_count": 200,
      "total_time": 1.4910809993743896,
      "time_to_first_chunk": 0.21600556373596191,
      "error": null,
      "audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens200_run3_stream.wav",
      "audio_length": 60.75,
      "target_tokens": 200,
      "actual_tokens": 200,
      "run_number": 3
    },
    {
      "text_length": 2232,
      "token_count": 500,
      "total_time": 4.223623275756836,
      "time_to_first_chunk": 0.20010590553283691,
      "error": null,
      "audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens500_run1_stream.wav",
      "audio_length": 147.775,
      "target_tokens": 500,
      "actual_tokens": 500,
      "run_number": 1
    },
    {
      "text_length": 2232,
      "token_count": 500,
      "total_time": 3.8811349868774414,
      "time_to_first_chunk": 0.24638962745666504,
      "error": null,
      "audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens500_run2_stream.wav",
      "audio_length": 147.775,
      "target_tokens": 500,
      "actual_tokens": 500,
      "run_number": 2
    },
    {
      "text_length": 2232,
      "token_count": 500,
      "total_time": 4.045536994934082,
      "time_to_first_chunk": 0.2252039909362793,
      "error": null,
      "audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens500_run3_stream.wav",
      "audio_length": 147.775,
      "target_tokens": 500,
      "actual_tokens": 500,
      "run_number": 3
    }
  ],
  "summary": {
    "50": {
      "avg_time_to_first_chunk": 0.365,
      "avg_total_time": 0.647,
      "avg_audio_length": 15.45,
      "num_successful_runs": 3
    },
    "100": {
      "avg_time_to_first_chunk": 0.257,
      "avg_total_time": 0.91,
      "avg_audio_length": 30.25,
      "num_successful_runs": 3
    },
    "200": {
      "avg_time_to_first_chunk": 0.219,
      "avg_total_time": 1.597,
      "avg_audio_length": 60.75,
      "num_successful_runs": 3
    },
    "500": {
      "avg_time_to_first_chunk": 0.224,
      "avg_total_time": 4.05,
      "avg_audio_length": 147.775,
      "num_successful_runs": 3
    }
  },
  "timestamp": "2025-01-04 14:59:28"
}
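Note: these streaming results carry the headline number of this PR: average time-to-first-chunk stays roughly flat (about 0.22-0.37s) regardless of input length, whereas the non-streaming endpoint's first byte scales with input length (about 4.5s at 500 tokens), for essentially the same total generation time. A small comparison sketch, assuming the two result sets above are saved as benchmark_results.json and benchmark_results_stream.json (both file names are assumptions):

import json

# Assumed file names for the two result sets shown above.
with open("benchmark_results.json") as f:
    non_stream = json.load(f)["summary"]
with open("benchmark_results_stream.json") as f:
    stream = json.load(f)["summary"]

# Compare average time-to-first-chunk at the token counts both runs covered.
print(f"{'tokens':>6} {'TTFC (full)':>12} {'TTFC (stream)':>14} {'speedup':>8}")
for tokens in sorted(set(non_stream) & set(stream), key=int):
    full_ttfc = non_stream[tokens]["avg_time_to_first_chunk"]
    stream_ttfc = stream[tokens]["avg_time_to_first_chunk"]
    print(f"{tokens:>6} {full_ttfc:>11.3f}s {stream_ttfc:>13.3f}s "
          f"{full_ttfc / stream_ttfc:>7.1f}x")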
[5 binary images added: 246 KiB, 214 KiB, 233 KiB, 189 KiB, 243 KiB]
@ -2,199 +2,134 @@ import numpy as np
 import soundfile as sf
 import argparse
 from pathlib import Path
+from typing import Dict, Any
+
 def validate_tts(wav_path: str) -> dict:
     """
-    Quick validation checks for TTS-generated audio files to detect common artifacts.
-
-    Checks for:
-    - Unnatural silence gaps
-    - Audio glitches and artifacts
-    - Repeated speech segments (stuck/looping)
-    - Abrupt changes in speech
-    - Audio quality issues
-
-    Args:
-        wav_path: Path to audio file (wav, mp3, etc)
-    Returns:
-        Dictionary with validation results
+    Validation checks for TTS-generated audio files to detect common artifacts.
     """
     try:
-        # Load audio
+        # Load and process audio
         audio, sr = sf.read(wav_path)
         if len(audio.shape) > 1:
-            audio = audio.mean(axis=1)  # Convert to mono
+            audio = np.mean(audio, axis=1)
 
-        # Basic audio stats
         duration = len(audio) / sr
-        rms = np.sqrt(np.mean(audio**2))
-        peak = np.max(np.abs(audio))
-        dc_offset = np.mean(audio)
-
-        # Calculate clipping stats if we're near peak
-        clip_count = np.sum(np.abs(audio) >= 0.99)
-        clip_percent = (clip_count / len(audio)) * 100
-        if clip_percent > 0:
-            clip_stats = f" ({clip_percent:.2e} ratio near peak)"
-        else:
-            clip_stats = " (no samples near peak)"
-
-        # Convert to dB for analysis
-        eps = np.finfo(float).eps
-        db = 20 * np.log10(np.abs(audio) + eps)
-
         issues = []
 
-        # Check if audio is too short (likely failed generation)
-        if duration < 0.1:  # Less than 100ms
+        # Basic quality checks
+        abs_audio = np.abs(audio)
+        stats = {
+            'rms': float(np.sqrt(np.mean(audio**2))),
+            'peak': float(np.max(abs_audio)),
+            'dc_offset': float(np.mean(audio))
+        }
+
+        clip_count = np.sum(abs_audio >= 0.99)
+        clip_percent = (clip_count / len(audio)) * 100
+
+        if duration < 0.1:
             issues.append("WARNING: Audio is suspiciously short - possible failed generation")
 
-        # 1. Check for basic audio quality
-        if peak >= 1.0:
-            # Calculate percentage of samples that are clipping
-            clip_count = np.sum(np.abs(audio) >= 0.99)
-            clip_percent = (clip_count / len(audio)) * 100
-
-            if clip_percent > 1.0:  # Only warn if more than 1% of samples clip
+        if stats['peak'] >= 1.0:
+            if clip_percent > 1.0:
                 issues.append(f"WARNING: Significant clipping detected ({clip_percent:.2e}% of samples)")
-            elif clip_percent > 0.01:  # Add info if more than 0.01% but less than 1%
-                issues.append(f"INFO: Minor peak limiting detected ({clip_percent:.2e}% of samples) - likely intentional normalization")
+            elif clip_percent > 0.01:
+                issues.append(f"INFO: Minor peak limiting detected ({clip_percent:.2e}% of samples)")
 
-        if rms < 0.01:
+        if stats['rms'] < 0.01:
             issues.append("WARNING: Audio is very quiet - possible failed generation")
-        if abs(dc_offset) > 0.1:  # DC offset is particularly bad for speech
-            issues.append(f"WARNING: High DC offset ({dc_offset:.3f}) - may cause audio artifacts")
 
-        # 2. Check for long silence gaps (potential TTS failures)
+        if abs(stats['dc_offset']) > 0.1:
+            issues.append(f"WARNING: High DC offset ({stats['dc_offset']:.3f})")
+
+        # Check for long silence gaps
+        eps = np.finfo(float).eps
+        db = 20 * np.log10(abs_audio + eps)
         silence_threshold = -45  # dB
-        min_silence = 2.0  # Only detect silences longer than 2 seconds
+        min_silence = 2.0  # seconds
         window_size = int(min_silence * sr)
         silence_count = 0
         last_silence = -1
 
-        # Skip the first 0.2s for silence detection (avoid false positives at start)
-        start_idx = int(0.2 * sr)
+        start_idx = int(0.2 * sr)  # Skip first 0.2s
         for i in range(start_idx, len(db) - window_size, window_size):
             window = db[i:i+window_size]
             if np.mean(window) < silence_threshold:
-                # Verify the entire window is mostly silence
                 silent_ratio = np.mean(window < silence_threshold)
-                if silent_ratio > 0.9:  # 90% of the window should be below threshold
-                    if last_silence == -1 or (i/sr - last_silence) > 2.0:  # Only count silences more than 2s apart
+                if silent_ratio > 0.9:
+                    if last_silence == -1 or (i/sr - last_silence) > 2.0:
                         silence_count += 1
                         last_silence = i/sr
                         issues.append(f"WARNING: Long silence detected at {i/sr:.2f}s (duration: {min_silence:.1f}s)")
 
-        if silence_count > 2:  # Only warn if there are multiple long silences
-            issues.append(f"WARNING: Multiple long silences found ({silence_count} total) - possible generation issue")
+        if silence_count > 2:
+            issues.append(f"WARNING: Multiple long silences found ({silence_count} total)")
 
-        # 3. Check for extreme audio artifacts (changes too rapid for natural speech)
-        # Use a longer window to avoid flagging normal phoneme transitions
-        window_size = int(0.02 * sr)  # 20ms window
-        db_smooth = np.convolve(db, np.ones(window_size)/window_size, 'same')
-        db_diff = np.abs(np.diff(db_smooth))
-
-        # Much higher threshold to only catch truly unnatural changes
-        artifact_threshold = 40  # dB
-        min_duration = int(0.01 * sr)  # Minimum 10ms duration
-
-        # Find regions where the smoothed dB change is extreme
-        artifact_points = np.where(db_diff > artifact_threshold)[0]
-
-        if len(artifact_points) > 0:
-            # Group artifacts that are very close together
-            grouped_artifacts = []
-            current_group = [artifact_points[0]]
-
-            for i in range(1, len(artifact_points)):
-                if (artifact_points[i] - current_group[-1]) < min_duration:
-                    current_group.append(artifact_points[i])
-                else:
-                    if len(current_group) * (1/sr) >= 0.01:  # Only keep groups lasting >= 10ms
-                        grouped_artifacts.append(current_group)
-                    current_group = [artifact_points[i]]
-
-            if len(current_group) * (1/sr) >= 0.01:
-                grouped_artifacts.append(current_group)
-
-            # Report only the most severe artifacts
-            for group in grouped_artifacts[:2]:  # Report up to 2 worst artifacts
-                center_idx = group[len(group)//2]
-                db_change = db_diff[center_idx]
-                if db_change > 45:  # Only report very extreme changes
-                    issues.append(
-                        f"WARNING: Possible audio artifact at {center_idx/sr:.2f}s "
-                        f"({db_change:.1f}dB change over {len(group)/sr*1000:.0f}ms)"
-                    )
+        # Detect audio artifacts
+        diff = np.diff(audio)
+        abs_diff = np.abs(diff)
+        window_size = min(int(0.005 * sr), 256)
+        window = np.ones(window_size)/window_size
+        local_avg_diff = np.convolve(abs_diff, window, mode='same')
+
+        spikes = (abs_diff > (10 * local_avg_diff)) & (abs_diff > 0.1)
+        artifact_indices = np.nonzero(spikes)[0]
+
+        artifacts = []
+        if len(artifact_indices) > 0:
+            gaps = np.diff(artifact_indices)
+            min_gap = int(0.005 * sr)
+            break_points = np.nonzero(gaps > min_gap)[0] + 1
+            groups = np.split(artifact_indices, break_points)
+
+            for group in groups:
+                if len(group) >= 5:
+                    severity = np.max(abs_diff[group])
+                    if severity > 0.2:
+                        center_idx = group[len(group)//2]
+                        artifacts.append({
+                            'time': float(center_idx/sr),  # Ensure float for consistent timing
+                            'severity': float(severity)
+                        })
+                        issues.append(
+                            f"WARNING: Audio discontinuity at {center_idx/sr:.3f}s "
+                            f"(severity: {severity:.3f})"
+                        )
 
-        # 4. Check for repeated speech segments (stuck/looping)
-        # Check both short and long sentence durations at audiobook speed (150-160 wpm)
-        for chunk_duration in [5.0, 10.0]:  # 5s (~12 words) and 10s (~25 words) at ~audiobook speed
+        # Check for repeated speech segments
+        for chunk_duration in [5.0, 10.0]:
             chunk_size = int(chunk_duration * sr)
-            overlap = int(0.2 * chunk_size)  # 20% overlap between chunks
+            overlap = int(0.2 * chunk_size)
 
             for i in range(0, len(audio) - 2*chunk_size, overlap):
                 chunk1 = audio[i:i+chunk_size]
                 chunk2 = audio[i+chunk_size:i+2*chunk_size]
 
-                # Ignore chunks that are mostly silence
                 if np.mean(np.abs(chunk1)) < 0.01 or np.mean(np.abs(chunk2)) < 0.01:
                     continue
 
                 try:
                     correlation = np.corrcoef(chunk1, chunk2)[0,1]
-                    if not np.isnan(correlation) and correlation > 0.92:  # Lower threshold for sentence-length chunks
+                    if not np.isnan(correlation) and correlation > 0.92:
                         issues.append(
                             f"WARNING: Possible repeated speech at {i/sr:.1f}s "
                             f"(~{int(chunk_duration*160/60):d} words, correlation: {correlation:.3f})"
                         )
-                        break  # Found repetition at this duration, try next duration
+                        break
                 except:
                     continue
 
-        # 5. Check for extreme amplitude discontinuities (common in failed TTS)
-        amplitude_envelope = np.abs(audio)
-        window_size = sr // 10  # 100ms window for smoother envelope
-        smooth_env = np.convolve(amplitude_envelope, np.ones(window_size)/float(window_size), 'same')
-        env_diff = np.abs(np.diff(smooth_env))
-
-        # Only detect very extreme amplitude changes
-        jump_threshold = 0.5  # Much higher threshold
-        jumps = np.where(env_diff > jump_threshold)[0]
-
-        if len(jumps) > 0:
-            # Group jumps that are close together
-            grouped_jumps = []
-            current_group = [jumps[0]]
-
-            for i in range(1, len(jumps)):
-                if (jumps[i] - current_group[-1]) < 0.05 * sr:  # Group within 50ms
-                    current_group.append(jumps[i])
-                else:
-                    if len(current_group) >= 3:  # Only keep significant discontinuities
-                        grouped_jumps.append(current_group)
-                    current_group = [jumps[i]]
-
-            if len(current_group) >= 3:
-                grouped_jumps.append(current_group)
-
-            # Report only the most severe discontinuities
-            for group in grouped_jumps[:2]:  # Report up to 2 worst cases
-                center_idx = group[len(group)//2]
-                jump_size = env_diff[center_idx]
-                if jump_size > 0.6:  # Only report very extreme changes
-                    issues.append(
-                        f"WARNING: Possible audio discontinuity at {center_idx/sr:.2f}s "
-                        f"({jump_size:.2f} amplitude ratio change)"
-                    )
-
         return {
             "file": wav_path,
             "duration": f"{duration:.2f}s",
             "sample_rate": sr,
-            "peak_amplitude": f"{peak:.3f}{clip_stats}",
-            "rms_level": f"{rms:.3f}",
-            "dc_offset": f"{dc_offset:.3f}",
+            "peak_amplitude": f"{stats['peak']:.3f}",
+            "rms_level": f"{stats['rms']:.3f}",
+            "dc_offset": f"{stats['dc_offset']:.3f}",
+            "artifact_count": len(artifacts),
+            "artifact_locations": [a['time'] for a in artifacts],
+            "artifact_severities": [a['severity'] for a in artifacts],
            "issues": issues,
            "valid": len(issues) == 0
        }
@ -206,12 +141,78 @@ def validate_tts(wav_path: str) -> dict:
             "valid": False
         }
 
-if __name__ == "__main__":
-    parser = argparse.ArgumentParser(description="TTS Output Validator")
-    parser.add_argument("wav_file", help="Path to audio file to validate")
-    args = parser.parse_args()
-
-    result = validate_tts(args.wav_file)
+def generate_analysis_plots(wav_path: str, output_dir: str, validation_result: Dict[str, Any]):
+    """
+    Generate analysis plots for audio file with time-aligned visualizations.
+    """
+    import matplotlib.pyplot as plt
+    from scipy.signal import spectrogram
+
+    # Load audio
+    audio, sr = sf.read(wav_path)
+    if len(audio.shape) > 1:
+        audio = np.mean(audio, axis=1)
+
+    # Create figure with shared x-axis
+    fig = plt.figure(figsize=(15, 8))
+    gs = plt.GridSpec(2, 1, height_ratios=[1.2, 0.8], hspace=0.1)
+    ax1 = fig.add_subplot(gs[0])
+    ax2 = fig.add_subplot(gs[1], sharex=ax1)
+
+    # Calculate spectrogram
+    nperseg = 2048
+    noverlap = 1536
+    f, t, Sxx = spectrogram(audio, sr, nperseg=nperseg, noverlap=noverlap,
+                            window='hann', scaling='spectrum')
+
+    # Plot spectrogram
+    im = ax1.pcolormesh(t, f, 10 * np.log10(Sxx + 1e-10),
+                        shading='gouraud', cmap='viridis',
+                        vmin=-100, vmax=-20)
+    ax1.set_ylabel('Frequency [Hz]', fontsize=10)
+    cbar = plt.colorbar(im, ax=ax1, label='dB')
+    ax1.set_title('Spectrogram', pad=10, fontsize=12)
+
+    # Plot waveform with exact time alignment
+    times = np.arange(len(audio)) / sr
+    ax2.plot(times, audio, color='#2E5596', alpha=0.7, linewidth=0.5, label='Audio')
+    ax2.set_ylabel('Amplitude', fontsize=10)
+    ax2.set_xlabel('Time [sec]', fontsize=10)
+    ax2.grid(True, alpha=0.2)
+
+    # Add artifact markers
+    if 'artifact_locations' in validation_result and validation_result['artifact_locations']:
+        for loc in validation_result['artifact_locations']:
+            ax1.axvline(x=loc, color='red', alpha=0.7, linewidth=2)
+            ax2.axvline(x=loc, color='red', alpha=0.7, linewidth=2, label='Detected Artifacts')
+
+        # Add legend to both plots
+        if len(validation_result['artifact_locations']) > 0:
+            ax1.plot([], [], color='red', linewidth=2, label='Detected Artifacts')
+            ax1.legend(loc='upper right', fontsize=8)
+            # Only add unique labels to legend
+            handles, labels = ax2.get_legend_handles_labels()
+            unique_labels = dict(zip(labels, handles))
+            ax2.legend(unique_labels.values(), unique_labels.keys(),
+                       loc='upper right', fontsize=8)
+
+    # Set common x limits
+    xlim = (0, len(audio)/sr)
+    ax1.set_xlim(xlim)
+    ax2.set_xlim(xlim)
+
+    og_filename = Path(wav_path).name.split(".")[0]
+    # Save plot
+    plt.savefig(Path(output_dir) / f"{og_filename}_audio_analysis.png", dpi=300, bbox_inches='tight')
+    plt.close()
+
+if __name__ == "__main__":
+    wav_file = r"C:\Users\jerem\Desktop\Kokoro-FastAPI\examples\output.wav"
+    silent = False
+
+    result = validate_tts(wav_file)
+    if not silent:
+        wav_root_dir = Path(wav_file).parent
+        generate_analysis_plots(wav_file, wav_root_dir, result)
 
     print(f"\nValidating: {result['file']}")
     if "error" in result:
@ -222,6 +223,7 @@ if __name__ == "__main__":
     print(f"Peak Amplitude: {result['peak_amplitude']}")
     print(f"RMS Level: {result['rms_level']}")
     print(f"DC Offset: {result['dc_offset']}")
+    print(f"Detected Artifacts: {result['artifact_count']}")
 
     if result["issues"]:
         print("\nIssues Found:")
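Note: the reworked validator above now returns machine-readable artifact data ("artifact_count", "artifact_locations", "artifact_severities") alongside the human-readable "issues" list, which makes it usable in batch over the benchmark outputs. A minimal sketch of such batch usage; the validate_wav module name and the output directory path are assumptions:

from pathlib import Path

# Assumed module name for the validator file shown in the diff above.
from validate_wav import validate_tts, generate_analysis_plots

# Assumed location of the streaming benchmark WAVs.
out_dir = Path("examples/assorted_checks/benchmarks/output_audio_stream")
for wav in sorted(out_dir.glob("*.wav")):
    result = validate_tts(str(wav))
    status = "OK" if result["valid"] else f"{len(result['issues'])} issue(s)"
    print(f"{wav.name}: {status}")
    # Only render plots for files that failed validation (but loaded fine).
    if not result["valid"] and "error" not in result:
        generate_analysis_plots(str(wav), str(out_dir), result)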
BIN  examples/audio_analysis.png (new file, 5.2 MiB)
BIN  examples/output.wav (new file)
BIN  examples/output_audio_analysis.png (new file, 142 KiB)
144  examples/stream_tts_playback.py (new file)
@ -0,0 +1,144 @@
#!/usr/bin/env python3
import requests
import sounddevice as sd
import numpy as np
import time
import os
import wave

def play_streaming_tts(text: str, output_file: str = None, voice: str = "af"):
    """Stream TTS audio and play it back in real-time"""

    print("\nStarting TTS stream request...")
    start_time = time.time()

    # Initialize variables
    sample_rate = 24000  # Known sample rate for Kokoro
    audio_started = False
    stream = None
    chunk_count = 0
    total_bytes = 0
    first_chunk_time = None
    all_audio_data = bytearray()  # Raw PCM audio data

    # Make streaming request to API
    try:
        response = requests.post(
            "http://localhost:8880/v1/audio/speech",
            json={
                "model": "kokoro",
                "input": text,
                "voice": voice,
                "response_format": "pcm",
                "stream": True
            },
            stream=True,
            timeout=1800
        )
        response.raise_for_status()
        print(f"Request started successfully after {time.time() - start_time:.2f}s")

        # Process streaming response
        for chunk in response.iter_content(chunk_size=1024):
            if chunk:
                chunk_count += 1
                total_bytes += len(chunk)

                # Handle first chunk
                if not audio_started:
                    first_chunk_time = time.time()
                    print(f"\nReceived first chunk after {first_chunk_time - start_time:.2f}s")
                    print(f"First chunk size: {len(chunk)} bytes")

                    # Accumulate raw audio data
                    all_audio_data.extend(chunk)

                    # Convert PCM to float32 for playback
                    audio_data = np.frombuffer(chunk, dtype=np.int16).astype(np.float32)
                    # Scale to [-1, 1] range for sounddevice
                    audio_data = audio_data / 32768.0

                    # Start audio stream
                    stream = sd.OutputStream(
                        samplerate=sample_rate,
                        channels=1,
                        dtype=np.float32
                    )
                    stream.start()
                    audio_started = True
                    print("Audio playback started")

                    # Play first chunk
                    if len(audio_data) > 0:
                        stream.write(audio_data)

                # Handle subsequent chunks
                else:
                    # Accumulate raw audio data
                    all_audio_data.extend(chunk)

                    # Convert PCM to float32 for playback
                    audio_data = np.frombuffer(chunk, dtype=np.int16).astype(np.float32)
                    audio_data = audio_data / 32768.0
                    if len(audio_data) > 0:
                        stream.write(audio_data)

                # Log progress every 10 chunks
                if chunk_count % 10 == 0:
                    elapsed = time.time() - start_time
                    print(f"Progress: {chunk_count} chunks, {total_bytes/1024:.1f}KB received, {elapsed:.1f}s elapsed")

        # Final stats
        total_time = time.time() - start_time
        print(f"\nStream complete:")
        print(f"Total chunks: {chunk_count}")
        print(f"Total data: {total_bytes/1024:.1f}KB")
        print(f"Total time: {total_time:.2f}s")
        print(f"Average speed: {(total_bytes/1024)/total_time:.1f}KB/s")

        # Save as WAV file
        if output_file:
            print(f"\nWriting audio to {output_file}")
            with wave.open(output_file, 'wb') as wav_file:
                wav_file.setnchannels(1)  # Mono
                wav_file.setsampwidth(2)  # 2 bytes per sample (16-bit)
                wav_file.setframerate(sample_rate)
                wav_file.writeframes(all_audio_data)
            print(f"Saved {len(all_audio_data)} bytes of audio data")

        # Clean up
        if stream is not None:
            stream.stop()
            stream.close()

    except requests.exceptions.ConnectionError as e:
        print(f"Connection error - Is the server running? Error: {str(e)}")
        if stream is not None:
            stream.stop()
            stream.close()
    except Exception as e:
        print(f"Error during streaming: {str(e)}")
        if stream is not None:
            stream.stop()
            stream.close()

def main():
    # Load sample text from HG Wells
    script_dir = os.path.dirname(os.path.abspath(__file__))
    wells_path = os.path.join(script_dir, "assorted_checks/benchmarks/the_time_machine_hg_wells.txt")
    output_path = os.path.join(script_dir, "output.wav")

    with open(wells_path, "r", encoding="utf-8") as f:
        full_text = f.read()
        # Take first few paragraphs
        text = " ".join(full_text.split("\n\n")[:2])

    print("\nStarting TTS stream playback...")
    print(f"Text length: {len(text)} characters")
    print("\nFirst 100 characters:")
    print(text[:100] + "...")

    play_streaming_tts(text, output_file=output_path)

if __name__ == "__main__":
    main()
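Note: the script above hardcodes the server URL (http://localhost:8880) and Kokoro's 24 kHz 16-bit PCM output; sounddevice additionally needs a working PortAudio install on the machine. A minimal sketch of reusing the player with custom text, assuming the file is importable as stream_tts_playback (that import path is an assumption):

# Assumes the API server from this PR is already running on localhost:8880.
from stream_tts_playback import play_streaming_tts  # assumed import path

play_streaming_tts(
    "Hello from the streaming endpoint.",
    output_file="hello.wav",  # optional; omit to play without saving
    voice="af",
)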