Swapped generator to preprocessing

This commit is contained in:
remsky 2025-01-04 22:23:59 -07:00
parent e799f0c7c1
commit 4c6cd83f85
23 changed files with 955 additions and 127 deletions

BIN
.coverage

Binary file not shown.

2
.gitignore vendored
View file

@ -25,3 +25,5 @@ examples/assorted_checks/test_voices/output/*
examples/assorted_checks/test_formats/output/* examples/assorted_checks/test_formats/output/*
examples/assorted_checks/benchmarks/output_audio_stream/* examples/assorted_checks/benchmarks/output_audio_stream/*
ui/RepoScreenshot.png ui/RepoScreenshot.png
examples/assorted_checks/benchmarks/output_audio_stream_openai/*

View file

@ -24,7 +24,7 @@ async def lifespan(app: FastAPI):
# Initialize the main model with warm-up # Initialize the main model with warm-up
voicepack_count = TTSModel.setup() voicepack_count = TTSModel.setup()
# boundary = "█████╗"*9 # boundary = "█████╗"*9
boundary = "" * 54 boundary = "" * 30
startup_msg =f""" startup_msg =f"""
{boundary} {boundary}

View file

@ -57,10 +57,8 @@ async def create_speech(
"pcm": "audio/pcm", "pcm": "audio/pcm",
}.get(request.response_format, f"audio/{request.response_format}") }.get(request.response_format, f"audio/{request.response_format}")
# Check if streaming is requested via header # Check if streaming is requested (default for OpenAI client)
is_streaming = x_raw_response == "stream" if request.stream:
if is_streaming:
# Stream audio chunks as they're generated # Stream audio chunks as they're generated
return StreamingResponse( return StreamingResponse(
stream_audio_chunks(tts_service, request), stream_audio_chunks(tts_service, request),

View file

@ -49,7 +49,7 @@ def handle_decimal(num: re.Match) -> str:
a, b = num.group().split(".") a, b = num.group().split(".")
return " point ".join([a, " ".join(b)]) return " point ".join([a, " ".join(b)])
@lru_cache(maxsize=1000) # Cache normalized text results # @lru_cache(maxsize=1000) # Cache normalized text results
def normalize_text(text: str) -> str: def normalize_text(text: str) -> str:
"""Normalize text for TTS processing """Normalize text for TTS processing

View file

@ -20,11 +20,40 @@ class TTSService:
def __init__(self, output_dir: str = None): def __init__(self, output_dir: str = None):
self.output_dir = output_dir self.output_dir = output_dir
def _split_text(self, text: str) -> List[str]: def _split_text(self, text: str):
"""Split text into sentences""" """Generate text chunks one at a time, splitting on natural pause points"""
if not isinstance(text, str): if not isinstance(text, str):
text = str(text) if text is not None else "" text = str(text) if text is not None else ""
return [s.strip() for s in re.split(r"(?<=[.!?])\s+", text) if s.strip()]
# First split into sentences
sentences = re.split(r"(?<=[.!?])\s+", text)
for sentence in sentences:
sentence = sentence.strip()
if not sentence:
continue
# For longer sentences, split on commas and semicolons
if len(sentence) > 300: # Only split long sentences
# Split on pause points while preserving the punctuation
chunks = re.split(r"((?<=[,;])\s+)", sentence)
# Reassemble chunks with their trailing punctuation
current_chunk = ""
for i, chunk in enumerate(chunks):
if i % 2 == 0: # Text chunk
current_chunk += chunk
else: # Punctuation/whitespace chunk
current_chunk += chunk
if current_chunk.strip():
yield current_chunk.strip()
current_chunk = ""
# Yield any remaining text
if current_chunk.strip():
yield current_chunk.strip()
else:
yield sentence
@staticmethod @staticmethod
@lru_cache(maxsize=20) # Cache up to 8 most recently used voices @lru_cache(maxsize=20) # Cache up to 8 most recently used voices
@ -69,11 +98,11 @@ class TTSService:
# Generate audio with or without stitching # Generate audio with or without stitching
if stitch_long_output: if stitch_long_output:
chunks = self._split_text(text)
audio_chunks = [] audio_chunks = []
chunk_count = 0
# Process all chunks # Process chunks as they're generated
for i, chunk in enumerate(chunks): for chunk in self._split_text(text):
try: try:
# Process text and generate audio # Process text and generate audio
phonemes, tokens = TTSModel.process_text(chunk, voice[0]) phonemes, tokens = TTSModel.process_text(chunk, voice[0])
@ -81,23 +110,21 @@ class TTSService:
if chunk_audio is not None: if chunk_audio is not None:
audio_chunks.append(chunk_audio) audio_chunks.append(chunk_audio)
chunk_count += 1
else: else:
logger.error(f"No audio generated for chunk {i + 1}/{len(chunks)}") logger.error(f"No audio generated for chunk {chunk_count + 1}")
except Exception as e: except Exception as e:
logger.error( logger.error(
f"Failed to generate audio for chunk {i + 1}/{len(chunks)}: '{chunk}'. Error: {str(e)}" f"Failed to generate audio for chunk {chunk_count + 1}: '{chunk}'. Error: {str(e)}"
) )
continue continue
if not audio_chunks: if not audio_chunks:
raise ValueError("No audio chunks were generated successfully") raise ValueError("No audio chunks were generated successfully")
audio = ( # Concatenate all chunks
np.concatenate(audio_chunks) audio = np.concatenate(audio_chunks) if len(audio_chunks) > 1 else audio_chunks[0]
if len(audio_chunks) > 1
else audio_chunks[0]
)
else: else:
# Process single chunk # Process single chunk
phonemes, tokens = TTSModel.process_text(text, voice[0]) phonemes, tokens = TTSModel.process_text(text, voice[0])
@ -132,11 +159,9 @@ class TTSService:
raise ValueError(f"Voice not found: {voice}") raise ValueError(f"Voice not found: {voice}")
voicepack = self._load_voice(voice_path) voicepack = self._load_voice(voice_path)
# Split text into sentences for natural boundaries # Process chunks as they're generated
chunks = self._split_text(text) is_first = True
for chunk in self._split_text(text):
# Process and stream chunks
for i, chunk in enumerate(chunks):
try: try:
# Process text and generate audio # Process text and generate audio
phonemes, tokens = TTSModel.process_text(chunk, voice[0]) phonemes, tokens = TTSModel.process_text(chunk, voice[0])
@ -148,17 +173,16 @@ class TTSService:
chunk_audio, chunk_audio,
24000, 24000,
output_format, output_format,
is_first_chunk=(i == 0), is_first_chunk=is_first,
normalizer=stream_normalizer normalizer=stream_normalizer
) )
yield chunk_bytes yield chunk_bytes
is_first = False
else: else:
logger.error(f"No audio generated for chunk {i + 1}/{len(chunks)}") logger.error(f"No audio generated for chunk: '{chunk}'")
except Exception as e: except Exception as e:
logger.error( logger.error(f"Failed to generate audio for chunk: '{chunk}'. Error: {str(e)}")
f"Failed to generate audio for chunk {i + 1}/{len(chunks)}: '{chunk}'. Error: {str(e)}"
)
continue continue
except Exception as e: except Exception as e:

View file

@ -31,6 +31,6 @@ class OpenAISpeechRequest(BaseModel):
description="The speed of the generated audio. Select a value from 0.25 to 4.0.", description="The speed of the generated audio. Select a value from 0.25 to 4.0.",
) )
stream: bool = Field( stream: bool = Field(
default=False, default=True, # Default to streaming for OpenAI compatibility
description="If true, audio will be streamed as it's generated. Each chunk will be a complete sentence.", description="If true (default), audio will be streamed as it's generated. Each chunk will be a complete sentence.",
) )

View file

@ -32,8 +32,10 @@ services:
start_period: 1s start_period: 1s
kokoro-tts: kokoro-tts:
build: image: ghcr.io/remsky/kokoro-fastapi:latest
context: . # Uncomment below to build from source instead of using the released image
# build:
# context: .
volumes: volumes:
- ./api/src:/app/api/src - ./api/src:/app/api/src
- ./Kokoro-82M:/app/Kokoro-82M - ./Kokoro-82M:/app/Kokoro-82M

View file

@ -31,7 +31,7 @@ def measure_first_token(text: str, output_dir: str, tokens: int, run_number: int
"model": "kokoro", "model": "kokoro",
"input": text, "input": text,
"voice": "af", "voice": "af",
"response_format": "wav", "response_format": "pcm",
"stream": True "stream": True
}, },
stream=True, stream=True,
@ -53,33 +53,19 @@ def measure_first_token(text: str, output_dir: str, tokens: int, run_number: int
results["time_to_first_chunk"] = first_chunk_time - start_time results["time_to_first_chunk"] = first_chunk_time - start_time
chunks.append(chunk) chunks.append(chunk)
# Extract WAV header and data separately # Concatenate all PCM chunks
# First chunk has header + data, subsequent chunks are raw PCM
if not chunks: if not chunks:
raise ValueError("No audio chunks received") raise ValueError("No audio chunks received")
first_chunk = chunks[0] all_audio_data = b''.join(chunks)
remaining_chunks = chunks[1:]
# Find end of WAV header (44 bytes for standard WAV) # Write as WAV file
header = first_chunk[:44] import wave
first_data = first_chunk[44:] with wave.open(audio_path, 'wb') as wav_file:
wav_file.setnchannels(1) # Mono
# Concatenate all PCM data wav_file.setsampwidth(2) # 2 bytes per sample (16-bit)
all_data = first_data + b''.join(remaining_chunks) wav_file.setframerate(24000) # Known sample rate for Kokoro
wav_file.writeframes(all_audio_data)
# Update WAV header with total data size
import struct
data_size = len(all_data)
# Update data size field (bytes 4-7)
header = header[:4] + struct.pack('<I', data_size + 36) + header[8:]
# Update subchunk2 size field (bytes 40-43)
header = header[:40] + struct.pack('<I', data_size) + header[44:]
# Write complete WAV file
complete_audio = header + all_data
with open(audio_path, 'wb') as f:
f.write(complete_audio)
# Calculate audio length using scipy # Calculate audio length using scipy
import scipy.io.wavfile as wavfile import scipy.io.wavfile as wavfile
@ -89,7 +75,7 @@ def measure_first_token(text: str, output_dir: str, tokens: int, run_number: int
results["total_time"] = time.time() - start_time results["total_time"] = time.time() - start_time
# Print debug info # Print debug info
print(f"Complete audio size: {len(complete_audio)} bytes") print(f"Complete audio size: {len(all_audio_data)} bytes")
print(f"Number of chunks received: {len(chunks)}") print(f"Number of chunks received: {len(chunks)}")
print(f"Audio length: {results['audio_length']:.3f}s") print(f"Audio length: {results['audio_length']:.3f}s")
@ -114,7 +100,7 @@ def main():
text = f.read() text = f.read()
# Test specific token counts # Test specific token counts
token_sizes = [50, 100, 200, 500] token_sizes = [50, 100, 200, 500, 1000, 2000, 5000, 10000]
all_results = [] all_results = []
for tokens in token_sizes: for tokens in token_sizes:
@ -124,7 +110,7 @@ def main():
print(f"Text preview: {test_text[:50]}...") print(f"Text preview: {test_text[:50]}...")
# Run test 3 times for each size to get average # Run test 3 times for each size to get average
for i in range(3): for i in range(5):
print(f"Run {i+1}/3...") print(f"Run {i+1}/3...")
result = measure_first_token(test_text, output_dir, tokens, i + 1) result = measure_first_token(test_text, output_dir, tokens, i + 1)
result["target_tokens"] = tokens result["target_tokens"] = tokens
@ -194,7 +180,7 @@ def main():
plot_timeline( plot_timeline(
df, df,
os.path.join(output_plots_dir, "first_token_timeline_stream.png") os.path.join(output_plots_dir, "first_token_timeline_stream.png", suffix="(Streaming)")
) )
print("\nResults and plots saved to:") print("\nResults and plots saved to:")

View file

@ -0,0 +1,184 @@
#!/usr/bin/env python3
import os
import time
import json
import numpy as np
import pandas as pd
from openai import OpenAI
from lib.shared_benchmark_utils import get_text_for_tokens, enc
from lib.shared_utils import save_json_results
from lib.shared_plotting import plot_correlation, plot_timeline
def measure_first_token(text: str, output_dir: str, tokens: int, run_number: int) -> dict:
    """Measure time to first audio chunk via the OpenAI client and save the output.

    Streams PCM audio from a local Kokoro server, records time-to-first-chunk and
    total time, and writes the concatenated PCM out as a WAV file.

    Args:
        text: Input text to synthesize.
        output_dir: Directory where the WAV output is written.
        tokens: Target token count (used only in the output filename).
        run_number: 1-based run index (used only in the output filename).

    Returns:
        dict of results; on failure "error" holds the message and any timing
        fields that were not reached remain None.
    """
    results = {
        "text_length": len(text),
        "token_count": len(enc.encode(text)),
        "total_time": None,
        "time_to_first_chunk": None,
        "error": None,
        "audio_path": None,
        "audio_length": None  # Length of output audio in seconds
    }

    try:
        start_time = time.time()

        # Initialize OpenAI client pointed at the local server
        openai = OpenAI(base_url="http://localhost:8880/v1", api_key="not-needed-for-local")

        # Save complete audio
        audio_filename = f"benchmark_tokens{tokens}_run{run_number}_stream_openai.wav"
        audio_path = os.path.join(output_dir, audio_filename)
        results["audio_path"] = audio_path

        first_chunk_time = None
        all_audio_data = bytearray()
        chunk_count = 0

        # Make streaming request using OpenAI client
        with openai.audio.speech.with_streaming_response.create(
            model="kokoro",
            voice="af",
            response_format="pcm",
            input=text,
        ) as response:
            for chunk in response.iter_bytes(chunk_size=1024):
                if chunk:
                    chunk_count += 1
                    if first_chunk_time is None:
                        first_chunk_time = time.time()
                        results["time_to_first_chunk"] = first_chunk_time - start_time
                    all_audio_data.extend(chunk)

        # Fail loudly if nothing streamed back, matching the raw-requests
        # benchmark script's behavior, instead of writing an empty WAV.
        if not all_audio_data:
            raise ValueError("No audio chunks received")

        # Write raw PCM as a WAV file (16-bit mono; 24kHz is the known
        # sample rate for Kokoro)
        import wave
        with wave.open(audio_path, 'wb') as wav_file:
            wav_file.setnchannels(1)  # Mono
            wav_file.setsampwidth(2)  # 2 bytes per sample (16-bit)
            wav_file.setframerate(24000)  # Known sample rate for Kokoro
            wav_file.writeframes(all_audio_data)

        # Calculate audio length using scipy
        import scipy.io.wavfile as wavfile
        sample_rate, audio_data = wavfile.read(audio_path)
        results["audio_length"] = len(audio_data) / sample_rate  # Length in seconds
        results["total_time"] = time.time() - start_time

        # Print debug info
        print(f"Complete audio size: {len(all_audio_data)} bytes")
        print(f"Number of chunks received: {chunk_count}")
        print(f"Audio length: {results['audio_length']:.3f}s")

        return results

    except Exception as e:
        results["error"] = str(e)
        return results
def main():
    """Run the streaming first-token benchmark against the OpenAI-compatible API.

    For each target token size, runs 5 measurements, prints per-run timings,
    saves aggregated JSON results, and renders correlation/timeline plots.
    """
    # Set up paths with _stream_openai suffix
    script_dir = os.path.dirname(os.path.abspath(__file__))
    output_dir = os.path.join(script_dir, "output_audio_stream_openai")
    output_data_dir = os.path.join(script_dir, "output_data")

    # Create output directories
    os.makedirs(output_dir, exist_ok=True)
    os.makedirs(output_data_dir, exist_ok=True)

    # Load sample text
    with open(os.path.join(script_dir, "the_time_machine_hg_wells.txt"), "r", encoding="utf-8") as f:
        text = f.read()

    # Test specific token counts
    token_sizes = [50, 100, 200, 500]
    all_results = []

    def fmt_secs(value):
        """Format a seconds value, tolerating None (set on the error path).

        BUG FIX: the previous f"{result.get(..., 'N/A'):.3f}s" raised TypeError
        whenever the stored value was None, because .get returns the stored
        None rather than the default — masking the actual error message.
        """
        return f"{value:.3f}s" if isinstance(value, (int, float)) else "N/A"

    for tokens in token_sizes:
        print(f"\nTesting {tokens} tokens (streaming)")
        test_text = get_text_for_tokens(text, tokens)
        actual_tokens = len(enc.encode(test_text))
        print(f"Text preview: {test_text[:50]}...")

        # Run test 5 times for each size to get average
        for i in range(5):
            print(f"Run {i+1}/5...")
            result = measure_first_token(test_text, output_dir, tokens, i + 1)
            result["target_tokens"] = tokens
            result["actual_tokens"] = actual_tokens
            result["run_number"] = i + 1

            print(f"Time to First Audio: {fmt_secs(result.get('time_to_first_chunk'))}")
            print(f"Time to Save Complete: {fmt_secs(result.get('total_time'))}")
            print(f"Audio length: {fmt_secs(result.get('audio_length'))}")
            # Guard against None on the error path (None - None raises TypeError)
            overhead = (result.get('total_time') or 0) - (result.get('time_to_first_chunk') or 0)
            print(f"Streaming overhead: {overhead:.3f}s")

            if result["error"]:
                print(f"Error: {result['error']}")

            all_results.append(result)

    # Calculate averages per token size
    summary = {}
    for tokens in token_sizes:
        matching_results = [r for r in all_results if r["target_tokens"] == tokens and not r["error"]]
        if matching_results:
            avg_first_chunk = sum(r["time_to_first_chunk"] for r in matching_results) / len(matching_results)
            avg_total = sum(r["total_time"] for r in matching_results) / len(matching_results)
            avg_audio_length = sum(r["audio_length"] for r in matching_results) / len(matching_results)
            summary[tokens] = {
                "avg_time_to_first_chunk": round(avg_first_chunk, 3),
                "avg_total_time": round(avg_total, 3),
                "avg_audio_length": round(avg_audio_length, 3),
                "num_successful_runs": len(matching_results)
            }

    # Save results with _stream_openai suffix
    results_data = {
        "individual_runs": all_results,
        "summary": summary,
        "timestamp": time.strftime("%Y-%m-%d %H:%M:%S")
    }
    save_json_results(
        results_data,
        os.path.join(output_data_dir, "first_token_benchmark_stream_openai.json")
    )

    # Create plot directory if it doesn't exist
    output_plots_dir = os.path.join(script_dir, "output_plots")
    os.makedirs(output_plots_dir, exist_ok=True)

    # Create DataFrame for plotting
    df = pd.DataFrame(all_results)

    # Create plots with _stream_openai suffix
    plot_correlation(
        df, "target_tokens", "time_to_first_chunk",
        "Time to First Audio vs Input Size (OpenAI Streaming)",
        "Number of Input Tokens",
        "Time to First Audio (seconds)",
        os.path.join(output_plots_dir, "first_token_latency_stream_openai.png")
    )

    plot_correlation(
        df, "target_tokens", "total_time",
        "Total Time vs Input Size (OpenAI Streaming)",
        "Number of Input Tokens",
        "Total Time (seconds)",
        os.path.join(output_plots_dir, "total_time_latency_stream_openai.png")
    )

    plot_timeline(
        df,
        os.path.join(output_plots_dir, "first_token_timeline_stream_openai.png")
    )

    print("\nResults and plots saved to:")
    print(f"- {os.path.join(output_data_dir, 'first_token_benchmark_stream_openai.json')}")
    print(f"- {os.path.join(output_plots_dir, 'first_token_latency_stream_openai.png')}")
    print(f"- {os.path.join(output_plots_dir, 'total_time_latency_stream_openai.png')}")
    print(f"- {os.path.join(output_plots_dir, 'first_token_timeline_stream_openai.png')}")
# Script entry point: run the full benchmark when executed directly.
if __name__ == "__main__":
    main()

View file

@ -138,7 +138,7 @@ def plot_system_metrics(metrics_data, output_path):
plt.savefig(output_path, dpi=300, bbox_inches="tight") plt.savefig(output_path, dpi=300, bbox_inches="tight")
plt.close() plt.close()
def plot_timeline(df, output_path): def plot_timeline(df, output_path, suffix=""):
"""Create timeline plot showing latency for each run. """Create timeline plot showing latency for each run.
Args: Args:
@ -255,7 +255,7 @@ def plot_timeline(df, output_path):
# Customize appearance # Customize appearance
setup_plot( setup_plot(
fig, ax, fig, ax,
"Time-To-Audio Latency", "Time-To-Audio Latency" + suffix,
xlabel="Time (seconds)", xlabel="Time (seconds)",
ylabel="Input Size" ylabel="Input Size"
) )

View file

@ -3,11 +3,11 @@
{ {
"text_length": 212, "text_length": 212,
"token_count": 50, "token_count": 50,
"total_time": 0.9603095054626465, "total_time": 0.7278211116790771,
"time_to_first_chunk": 0.5916037559509277, "time_to_first_chunk": 0.3613290786743164,
"error": null, "error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens50_run1_stream.wav", "audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens50_run1_stream.wav",
"audio_length": 15.45, "audio_length": 16.325,
"target_tokens": 50, "target_tokens": 50,
"actual_tokens": 50, "actual_tokens": 50,
"run_number": 1 "run_number": 1
@ -15,11 +15,11 @@
{ {
"text_length": 212, "text_length": 212,
"token_count": 50, "token_count": 50,
"total_time": 0.5130870342254639, "total_time": 0.4556088447570801,
"time_to_first_chunk": 0.27448558807373047, "time_to_first_chunk": 0.18642044067382812,
"error": null, "error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens50_run2_stream.wav", "audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens50_run2_stream.wav",
"audio_length": 15.45, "audio_length": 16.325,
"target_tokens": 50, "target_tokens": 50,
"actual_tokens": 50, "actual_tokens": 50,
"run_number": 2 "run_number": 2
@ -27,23 +27,47 @@
{ {
"text_length": 212, "text_length": 212,
"token_count": 50, "token_count": 50,
"total_time": 0.4667215347290039, "total_time": 0.5538768768310547,
"time_to_first_chunk": 0.22882533073425293, "time_to_first_chunk": 0.2720797061920166,
"error": null, "error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens50_run3_stream.wav", "audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens50_run3_stream.wav",
"audio_length": 15.45, "audio_length": 16.325,
"target_tokens": 50, "target_tokens": 50,
"actual_tokens": 50, "actual_tokens": 50,
"run_number": 3 "run_number": 3
}, },
{
"text_length": 212,
"token_count": 50,
"total_time": 0.4395604133605957,
"time_to_first_chunk": 0.15613913536071777,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens50_run4_stream.wav",
"audio_length": 16.325,
"target_tokens": 50,
"actual_tokens": 50,
"run_number": 4
},
{
"text_length": 212,
"token_count": 50,
"total_time": 0.45748305320739746,
"time_to_first_chunk": 0.18805718421936035,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens50_run5_stream.wav",
"audio_length": 16.325,
"target_tokens": 50,
"actual_tokens": 50,
"run_number": 5
},
{ {
"text_length": 448, "text_length": 448,
"token_count": 100, "token_count": 100,
"total_time": 0.9051008224487305, "total_time": 0.7347762584686279,
"time_to_first_chunk": 0.2526383399963379, "time_to_first_chunk": 0.16963744163513184,
"error": null, "error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens100_run1_stream.wav", "audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens100_run1_stream.wav",
"audio_length": 30.25, "audio_length": 31.1,
"target_tokens": 100, "target_tokens": 100,
"actual_tokens": 100, "actual_tokens": 100,
"run_number": 1 "run_number": 1
@ -51,11 +75,11 @@
{ {
"text_length": 448, "text_length": 448,
"token_count": 100, "token_count": 100,
"total_time": 0.8579132556915283, "total_time": 0.8288509845733643,
"time_to_first_chunk": 0.25691914558410645, "time_to_first_chunk": 0.20123004913330078,
"error": null, "error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens100_run2_stream.wav", "audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens100_run2_stream.wav",
"audio_length": 30.25, "audio_length": 31.1,
"target_tokens": 100, "target_tokens": 100,
"actual_tokens": 100, "actual_tokens": 100,
"run_number": 2 "run_number": 2
@ -63,23 +87,47 @@
{ {
"text_length": 448, "text_length": 448,
"token_count": 100, "token_count": 100,
"total_time": 0.9683890342712402, "total_time": 0.7503848075866699,
"time_to_first_chunk": 0.26229000091552734, "time_to_first_chunk": 0.21662068367004395,
"error": null, "error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens100_run3_stream.wav", "audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens100_run3_stream.wav",
"audio_length": 30.25, "audio_length": 31.1,
"target_tokens": 100, "target_tokens": 100,
"actual_tokens": 100, "actual_tokens": 100,
"run_number": 3 "run_number": 3
}, },
{
"text_length": 448,
"token_count": 100,
"total_time": 0.694899320602417,
"time_to_first_chunk": 0.1966841220855713,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens100_run4_stream.wav",
"audio_length": 31.1,
"target_tokens": 100,
"actual_tokens": 100,
"run_number": 4
},
{
"text_length": 448,
"token_count": 100,
"total_time": 0.68701171875,
"time_to_first_chunk": 0.19341063499450684,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens100_run5_stream.wav",
"audio_length": 31.1,
"target_tokens": 100,
"actual_tokens": 100,
"run_number": 5
},
{ {
"text_length": 906, "text_length": 906,
"token_count": 200, "token_count": 200,
"total_time": 1.8075971603393555, "total_time": 1.6845426559448242,
"time_to_first_chunk": 0.22536945343017578, "time_to_first_chunk": 0.21096158027648926,
"error": null, "error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens200_run1_stream.wav", "audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens200_run1_stream.wav",
"audio_length": 60.75, "audio_length": 62.625,
"target_tokens": 200, "target_tokens": 200,
"actual_tokens": 200, "actual_tokens": 200,
"run_number": 1 "run_number": 1
@ -87,11 +135,11 @@
{ {
"text_length": 906, "text_length": 906,
"token_count": 200, "token_count": 200,
"total_time": 1.493518590927124, "total_time": 1.3545098304748535,
"time_to_first_chunk": 0.21502947807312012, "time_to_first_chunk": 0.18648386001586914,
"error": null, "error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens200_run2_stream.wav", "audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens200_run2_stream.wav",
"audio_length": 60.75, "audio_length": 62.625,
"target_tokens": 200, "target_tokens": 200,
"actual_tokens": 200, "actual_tokens": 200,
"run_number": 2 "run_number": 2
@ -99,23 +147,47 @@
{ {
"text_length": 906, "text_length": 906,
"token_count": 200, "token_count": 200,
"total_time": 1.4910809993743896, "total_time": 1.426060676574707,
"time_to_first_chunk": 0.21600556373596191, "time_to_first_chunk": 0.20081472396850586,
"error": null, "error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens200_run3_stream.wav", "audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens200_run3_stream.wav",
"audio_length": 60.75, "audio_length": 62.625,
"target_tokens": 200, "target_tokens": 200,
"actual_tokens": 200, "actual_tokens": 200,
"run_number": 3 "run_number": 3
}, },
{
"text_length": 906,
"token_count": 200,
"total_time": 1.4084081649780273,
"time_to_first_chunk": 0.18551135063171387,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens200_run4_stream.wav",
"audio_length": 62.625,
"target_tokens": 200,
"actual_tokens": 200,
"run_number": 4
},
{
"text_length": 906,
"token_count": 200,
"total_time": 1.4703152179718018,
"time_to_first_chunk": 0.17750859260559082,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens200_run5_stream.wav",
"audio_length": 62.625,
"target_tokens": 200,
"actual_tokens": 200,
"run_number": 5
},
{ {
"text_length": 2232, "text_length": 2232,
"token_count": 500, "token_count": 500,
"total_time": 4.223623275756836, "total_time": 4.289574384689331,
"time_to_first_chunk": 0.20010590553283691, "time_to_first_chunk": 0.1997976303100586,
"error": null, "error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens500_run1_stream.wav", "audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens500_run1_stream.wav",
"audio_length": 147.775, "audio_length": 157.875,
"target_tokens": 500, "target_tokens": 500,
"actual_tokens": 500, "actual_tokens": 500,
"run_number": 1 "run_number": 1
@ -123,11 +195,11 @@
{ {
"text_length": 2232, "text_length": 2232,
"token_count": 500, "token_count": 500,
"total_time": 3.8811349868774414, "total_time": 3.7089381217956543,
"time_to_first_chunk": 0.24638962745666504, "time_to_first_chunk": 0.25969815254211426,
"error": null, "error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens500_run2_stream.wav", "audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens500_run2_stream.wav",
"audio_length": 147.775, "audio_length": 157.875,
"target_tokens": 500, "target_tokens": 500,
"actual_tokens": 500, "actual_tokens": 500,
"run_number": 2 "run_number": 2
@ -135,41 +207,65 @@
{ {
"text_length": 2232, "text_length": 2232,
"token_count": 500, "token_count": 500,
"total_time": 4.045536994934082, "total_time": 4.138366222381592,
"time_to_first_chunk": 0.2252039909362793, "time_to_first_chunk": 0.1831505298614502,
"error": null, "error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens500_run3_stream.wav", "audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens500_run3_stream.wav",
"audio_length": 147.775, "audio_length": 157.875,
"target_tokens": 500, "target_tokens": 500,
"actual_tokens": 500, "actual_tokens": 500,
"run_number": 3 "run_number": 3
},
{
"text_length": 2232,
"token_count": 500,
"total_time": 3.980635643005371,
"time_to_first_chunk": 0.20493030548095703,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens500_run4_stream.wav",
"audio_length": 157.875,
"target_tokens": 500,
"actual_tokens": 500,
"run_number": 4
},
{
"text_length": 2232,
"token_count": 500,
"total_time": 4.1370298862457275,
"time_to_first_chunk": 0.19150757789611816,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens500_run5_stream.wav",
"audio_length": 157.875,
"target_tokens": 500,
"actual_tokens": 500,
"run_number": 5
} }
], ],
"summary": { "summary": {
"50": { "50": {
"avg_time_to_first_chunk": 0.365, "avg_time_to_first_chunk": 0.233,
"avg_total_time": 0.647, "avg_total_time": 0.527,
"avg_audio_length": 15.45, "avg_audio_length": 16.325,
"num_successful_runs": 3 "num_successful_runs": 5
}, },
"100": { "100": {
"avg_time_to_first_chunk": 0.257, "avg_time_to_first_chunk": 0.196,
"avg_total_time": 0.91, "avg_total_time": 0.739,
"avg_audio_length": 30.25, "avg_audio_length": 31.1,
"num_successful_runs": 3 "num_successful_runs": 5
}, },
"200": { "200": {
"avg_time_to_first_chunk": 0.219, "avg_time_to_first_chunk": 0.192,
"avg_total_time": 1.597, "avg_total_time": 1.469,
"avg_audio_length": 60.75, "avg_audio_length": 62.625,
"num_successful_runs": 3 "num_successful_runs": 5
}, },
"500": { "500": {
"avg_time_to_first_chunk": 0.224, "avg_time_to_first_chunk": 0.208,
"avg_total_time": 4.05, "avg_total_time": 4.051,
"avg_audio_length": 147.775, "avg_audio_length": 157.875,
"num_successful_runs": 3 "num_successful_runs": 5
} }
}, },
"timestamp": "2025-01-04 14:59:28" "timestamp": "2025-01-04 22:16:30"
} }

View file

@ -0,0 +1,271 @@
{
"individual_runs": [
{
"text_length": 212,
"token_count": 50,
"total_time": 1.149611473083496,
"time_to_first_chunk": 0.8767304420471191,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream_openai\\benchmark_tokens50_run1_stream_openai.wav",
"audio_length": 16.325,
"target_tokens": 50,
"actual_tokens": 50,
"run_number": 1
},
{
"text_length": 212,
"token_count": 50,
"total_time": 0.9325947761535645,
"time_to_first_chunk": 0.5965914726257324,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream_openai\\benchmark_tokens50_run2_stream_openai.wav",
"audio_length": 16.325,
"target_tokens": 50,
"actual_tokens": 50,
"run_number": 2
},
{
"text_length": 212,
"token_count": 50,
"total_time": 0.9205234050750732,
"time_to_first_chunk": 0.5961906909942627,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream_openai\\benchmark_tokens50_run3_stream_openai.wav",
"audio_length": 16.325,
"target_tokens": 50,
"actual_tokens": 50,
"run_number": 3
},
{
"text_length": 212,
"token_count": 50,
"total_time": 1.1321916580200195,
"time_to_first_chunk": 0.6946916580200195,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream_openai\\benchmark_tokens50_run4_stream_openai.wav",
"audio_length": 16.325,
"target_tokens": 50,
"actual_tokens": 50,
"run_number": 4
},
{
"text_length": 212,
"token_count": 50,
"total_time": 1.1146185398101807,
"time_to_first_chunk": 0.6918885707855225,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream_openai\\benchmark_tokens50_run5_stream_openai.wav",
"audio_length": 16.325,
"target_tokens": 50,
"actual_tokens": 50,
"run_number": 5
},
{
"text_length": 448,
"token_count": 100,
"total_time": 1.3645410537719727,
"time_to_first_chunk": 0.6802399158477783,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream_openai\\benchmark_tokens100_run1_stream_openai.wav",
"audio_length": 31.1,
"target_tokens": 100,
"actual_tokens": 100,
"run_number": 1
},
{
"text_length": 448,
"token_count": 100,
"total_time": 1.4154777526855469,
"time_to_first_chunk": 0.7297353744506836,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream_openai\\benchmark_tokens100_run2_stream_openai.wav",
"audio_length": 31.1,
"target_tokens": 100,
"actual_tokens": 100,
"run_number": 2
},
{
"text_length": 448,
"token_count": 100,
"total_time": 1.3589520454406738,
"time_to_first_chunk": 0.698603630065918,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream_openai\\benchmark_tokens100_run3_stream_openai.wav",
"audio_length": 31.1,
"target_tokens": 100,
"actual_tokens": 100,
"run_number": 3
},
{
"text_length": 448,
"token_count": 100,
"total_time": 1.2276430130004883,
"time_to_first_chunk": 0.6705801486968994,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream_openai\\benchmark_tokens100_run4_stream_openai.wav",
"audio_length": 31.1,
"target_tokens": 100,
"actual_tokens": 100,
"run_number": 4
},
{
"text_length": 448,
"token_count": 100,
"total_time": 1.0949454307556152,
"time_to_first_chunk": 0.5698442459106445,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream_openai\\benchmark_tokens100_run5_stream_openai.wav",
"audio_length": 31.1,
"target_tokens": 100,
"actual_tokens": 100,
"run_number": 5
},
{
"text_length": 906,
"token_count": 200,
"total_time": 1.8211240768432617,
"time_to_first_chunk": 0.6070489883422852,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream_openai\\benchmark_tokens200_run1_stream_openai.wav",
"audio_length": 62.625,
"target_tokens": 200,
"actual_tokens": 200,
"run_number": 1
},
{
"text_length": 906,
"token_count": 200,
"total_time": 1.8376774787902832,
"time_to_first_chunk": 0.6538689136505127,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream_openai\\benchmark_tokens200_run2_stream_openai.wav",
"audio_length": 62.625,
"target_tokens": 200,
"actual_tokens": 200,
"run_number": 2
},
{
"text_length": 906,
"token_count": 200,
"total_time": 1.6953792572021484,
"time_to_first_chunk": 0.5554308891296387,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream_openai\\benchmark_tokens200_run3_stream_openai.wav",
"audio_length": 62.625,
"target_tokens": 200,
"actual_tokens": 200,
"run_number": 3
},
{
"text_length": 906,
"token_count": 200,
"total_time": 1.887030839920044,
"time_to_first_chunk": 0.5866930484771729,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream_openai\\benchmark_tokens200_run4_stream_openai.wav",
"audio_length": 62.625,
"target_tokens": 200,
"actual_tokens": 200,
"run_number": 4
},
{
"text_length": 906,
"token_count": 200,
"total_time": 1.7908406257629395,
"time_to_first_chunk": 0.5897490978240967,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream_openai\\benchmark_tokens200_run5_stream_openai.wav",
"audio_length": 62.625,
"target_tokens": 200,
"actual_tokens": 200,
"run_number": 5
},
{
"text_length": 2232,
"token_count": 500,
"total_time": 4.228837013244629,
"time_to_first_chunk": 0.5315976142883301,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream_openai\\benchmark_tokens500_run1_stream_openai.wav",
"audio_length": 157.875,
"target_tokens": 500,
"actual_tokens": 500,
"run_number": 1
},
{
"text_length": 2232,
"token_count": 500,
"total_time": 4.489210367202759,
"time_to_first_chunk": 0.5261838436126709,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream_openai\\benchmark_tokens500_run2_stream_openai.wav",
"audio_length": 157.875,
"target_tokens": 500,
"actual_tokens": 500,
"run_number": 2
},
{
"text_length": 2232,
"token_count": 500,
"total_time": 4.5290446281433105,
"time_to_first_chunk": 0.6186764240264893,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream_openai\\benchmark_tokens500_run3_stream_openai.wav",
"audio_length": 157.875,
"target_tokens": 500,
"actual_tokens": 500,
"run_number": 3
},
{
"text_length": 2232,
"token_count": 500,
"total_time": 4.209261178970337,
"time_to_first_chunk": 0.5990591049194336,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream_openai\\benchmark_tokens500_run4_stream_openai.wav",
"audio_length": 157.875,
"target_tokens": 500,
"actual_tokens": 500,
"run_number": 4
},
{
"text_length": 2232,
"token_count": 500,
"total_time": 4.218762636184692,
"time_to_first_chunk": 0.5466251373291016,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream_openai\\benchmark_tokens500_run5_stream_openai.wav",
"audio_length": 157.875,
"target_tokens": 500,
"actual_tokens": 500,
"run_number": 5
}
],
"summary": {
"50": {
"avg_time_to_first_chunk": 0.691,
"avg_total_time": 1.05,
"avg_audio_length": 16.325,
"num_successful_runs": 5
},
"100": {
"avg_time_to_first_chunk": 0.67,
"avg_total_time": 1.292,
"avg_audio_length": 31.1,
"num_successful_runs": 5
},
"200": {
"avg_time_to_first_chunk": 0.599,
"avg_total_time": 1.806,
"avg_audio_length": 62.625,
"num_successful_runs": 5
},
"500": {
"avg_time_to_first_chunk": 0.564,
"avg_total_time": 4.335,
"avg_audio_length": 157.875,
"num_successful_runs": 5
}
},
"timestamp": "2025-01-04 22:18:03"
}

Binary file not shown.

Before

Width:  |  Height:  |  Size: 214 KiB

After

Width:  |  Height:  |  Size: 210 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 268 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 189 KiB

After

Width:  |  Height:  |  Size: 193 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 196 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 243 KiB

After

Width:  |  Height:  |  Size: 252 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 258 KiB

View file

@ -0,0 +1,268 @@
import random
import re
import string
import time
from typing import Dict, List, Tuple
def create_test_cases() -> List[str]:
    """Build a varied set of normalization inputs.

    Returns five fixed samples that exercise specific normalizer patterns
    (titles, money, times, acronyms, CJK punctuation), followed by nine
    random texts (three each of 100/1,000/10,000 chars) with target
    patterns spliced in.
    """

    def random_text(length: int) -> str:
        # Random filler drawn from letters, digits, and light punctuation.
        alphabet = string.ascii_letters + string.digits + " .,!?"
        return "".join(random.choice(alphabet) for _ in range(length))

    base_cases = [
        "Dr. Smith and Mr. Jones discussed the $1,234.56 million investment.",
        "Yeah, they met at 10:30 and reviewed A.B.C. documentation with Mrs. Brown etc.",
        'The temperature was 72.5 degrees (quite normal) for "this time" of year.',
        "X's and Y's properties cost £50 million in the 1990s",
        "こんにちは。今日は!",
    ]

    cases = list(base_cases)

    # Three random variations at each size, each seeded with patterns the
    # normalizers are expected to hit.
    for length in (100, 1000, 10000):
        for _ in range(3):
            sample = random_text(length)
            sample = sample.replace(sample[10:20], "Dr. Smith")
            sample = sample.replace(sample[30:40], "$1,234.56")
            sample = sample.replace(sample[50:60], "A.B.C. xyz")
            cases.append(sample)

    return cases
class TextNormalizerInline:
    """Text normalizer using inline patterns"""

    def normalize(self, text: str) -> str:
        """Normalize *text* with uncompiled per-call ``re.sub`` passes."""
        # Quote/bracket canonicalization; order matters — the bracket step
        # deliberately reuses the guillemets freed by the previous steps.
        char_steps = (
            (chr(8216), "'"), (chr(8217), "'"),
            ("«", chr(8220)), ("»", chr(8221)),
            (chr(8220), '"'), (chr(8221), '"'),
            ("(", "«"), (")", "»"),
        )
        for old, new in char_steps:
            text = text.replace(old, new)

        # CJK punctuation -> ASCII equivalent plus a trailing space
        for cjk_mark, ascii_mark in zip("、。!,:;?", ",.!,:;?"):
            text = text.replace(cjk_mark, ascii_mark + " ")

        # Ordered regex passes — the sequence mirrors the reference
        # implementation exactly, so do not reorder entries.
        substitutions = (
            (r"[^\S \n]", " "),
            (r" +", " "),
            (r"(?<=\n) +(?=\n)", ""),
            (r"\bD[Rr]\.(?= [A-Z])", "Doctor"),
            (r"\b(?:Mr\.|MR\.(?= [A-Z]))", "Mister"),
            (r"\b(?:Ms\.|MS\.(?= [A-Z]))", "Miss"),
            (r"\b(?:Mrs\.|MRS\.(?= [A-Z]))", "Mrs"),
            (r"\betc\.(?! [A-Z])", "etc"),
            (r"(?i)\b(y)eah?\b", r"\1e'a"),
            (r"\d*\.\d+|\b\d{4}s?\b|(?<!:)\b(?:[1-9]|1[0-2]):[0-5]\d\b(?!:)", split_num),
            (r"(?<=\d),(?=\d)", ""),
            (r"(?i)[$£]\d+(?:\.\d+)?(?: hundred| thousand| (?:[bm]|tr)illion)*\b|[$£]\d+\.\d\d?\b", handle_money),
            (r"\d*\.\d+", handle_decimal),
            (r"(?<=\d)-(?=\d)", " to "),
            (r"(?<=\d)S", " S"),
            (r"(?<=[BCDFGHJ-NP-TV-Z])'?s\b", "'S"),
            (r"(?<=X')S\b", "s"),
            (r"(?:[A-Za-z]\.){2,} [a-z]", lambda m: m.group().replace(".", "-")),
            (r"(?i)(?<=[A-Z])\.(?=[A-Z])", "-"),
        )
        for pattern, repl in substitutions:
            text = re.sub(pattern, repl, text)

        return text.strip()
class TextNormalizerCompiled:
    """Text normalizer using all compiled patterns"""

    def __init__(self):
        # Every regex is compiled once up front; `normalize` only runs
        # the pre-built matchers.
        self.patterns = {
            'whitespace': re.compile(r"[^\S \n]"),
            'multi_space': re.compile(r" +"),
            'newline_space': re.compile(r"(?<=\n) +(?=\n)"),
            'doctor': re.compile(r"\bD[Rr]\.(?= [A-Z])"),
            'mister': re.compile(r"\b(?:Mr\.|MR\.(?= [A-Z]))"),
            'miss': re.compile(r"\b(?:Ms\.|MS\.(?= [A-Z]))"),
            'mrs': re.compile(r"\b(?:Mrs\.|MRS\.(?= [A-Z]))"),
            'etc': re.compile(r"\betc\.(?! [A-Z])"),
            'yeah': re.compile(r"(?i)\b(y)eah?\b"),
            'numbers': re.compile(r"\d*\.\d+|\b\d{4}s?\b|(?<!:)\b(?:[1-9]|1[0-2]):[0-5]\d\b(?!:)"),
            'comma_in_number': re.compile(r"(?<=\d),(?=\d)"),
            'money': re.compile(r"(?i)[$£]\d+(?:\.\d+)?(?: hundred| thousand| (?:[bm]|tr)illion)*\b|[$£]\d+\.\d\d?\b"),
            'decimal': re.compile(r"\d*\.\d+"),
            'range': re.compile(r"(?<=\d)-(?=\d)"),
            's_after_number': re.compile(r"(?<=\d)S"),
            'possessive_s': re.compile(r"(?<=[BCDFGHJ-NP-TV-Z])'?s\b"),
            'x_possessive': re.compile(r"(?<=X')S\b"),
            'initials': re.compile(r"(?:[A-Za-z]\.){2,} [a-z]"),
            'single_initial': re.compile(r"(?i)(?<=[A-Z])\.(?=[A-Z])")
        }

    def normalize(self, text: str) -> str:
        """Normalize *text* using the pre-compiled pattern table."""
        # Quote/bracket canonicalization — sequence is significant.
        char_steps = (
            (chr(8216), "'"), (chr(8217), "'"),
            ("«", chr(8220)), ("»", chr(8221)),
            (chr(8220), '"'), (chr(8221), '"'),
            ("(", "«"), (")", "»"),
        )
        for old, new in char_steps:
            text = text.replace(old, new)

        # CJK punctuation -> ASCII equivalent plus a trailing space
        for cjk_mark, ascii_mark in zip("、。!,:;?", ",.!,:;?"):
            text = text.replace(cjk_mark, ascii_mark + " ")

        # Apply the compiled patterns in the reference order — do not
        # reorder, the substitutions are not commutative.
        plan = (
            ('whitespace', " "),
            ('multi_space', " "),
            ('newline_space', ""),
            ('doctor', "Doctor"),
            ('mister', "Mister"),
            ('miss', "Miss"),
            ('mrs', "Mrs"),
            ('etc', "etc"),
            ('yeah', r"\1e'a"),
            ('numbers', split_num),
            ('comma_in_number', ""),
            ('money', handle_money),
            ('decimal', handle_decimal),
            ('range', " to "),
            ('s_after_number', " S"),
            ('possessive_s', "'S"),
            ('x_possessive', "s"),
            ('initials', lambda m: m.group().replace(".", "-")),
            ('single_initial', "-"),
        )
        for key, repl in plan:
            text = self.patterns[key].sub(repl, text)

        return text.strip()
class TextNormalizerHybrid:
    """Text normalizer using hybrid approach - compile only complex/frequent patterns"""

    def __init__(self):
        # Pre-compile only the heavyweight patterns; the simple ones stay
        # inline in `normalize`.
        self.patterns = {
            'whitespace': re.compile(r"[^\S \n]"),
            'numbers': re.compile(r"\d*\.\d+|\b\d{4}s?\b|(?<!:)\b(?:[1-9]|1[0-2]):[0-5]\d\b(?!:)"),
            'money': re.compile(r"(?i)[$£]\d+(?:\.\d+)?(?: hundred| thousand| (?:[bm]|tr)illion)*\b|[$£]\d+\.\d\d?\b"),
            'initials': re.compile(r"(?:[A-Za-z]\.){2,} [a-z]")
        }

    def normalize(self, text: str) -> str:
        """Normalize *text* with compiled passes first, then inline ones."""
        # Quote/bracket canonicalization — sequence is significant.
        char_steps = (
            (chr(8216), "'"), (chr(8217), "'"),
            ("«", chr(8220)), ("»", chr(8221)),
            (chr(8220), '"'), (chr(8221), '"'),
            ("(", "«"), (")", "»"),
        )
        for old, new in char_steps:
            text = text.replace(old, new)

        # CJK punctuation -> ASCII equivalent plus a trailing space
        for cjk_mark, ascii_mark in zip("、。!,:;?", ",.!,:;?"):
            text = text.replace(cjk_mark, ascii_mark + " ")

        # Compiled passes first — this matches the reference ordering,
        # which differs from the all-inline variant.
        text = self.patterns['whitespace'].sub(" ", text)
        text = self.patterns['numbers'].sub(split_num, text)
        text = self.patterns['money'].sub(handle_money, text)
        text = self.patterns['initials'].sub(lambda m: m.group().replace(".", "-"), text)

        # Remaining simple substitutions, applied inline in order.
        inline_steps = (
            (r" +", " "),
            (r"(?<=\n) +(?=\n)", ""),
            (r"\bD[Rr]\.(?= [A-Z])", "Doctor"),
            (r"\b(?:Mr\.|MR\.(?= [A-Z]))", "Mister"),
            (r"\b(?:Ms\.|MS\.(?= [A-Z]))", "Miss"),
            (r"\b(?:Mrs\.|MRS\.(?= [A-Z]))", "Mrs"),
            (r"\betc\.(?! [A-Z])", "etc"),
            (r"(?i)\b(y)eah?\b", r"\1e'a"),
            (r"(?<=\d),(?=\d)", ""),
            (r"\d*\.\d+", handle_decimal),
            (r"(?<=\d)-(?=\d)", " to "),
            (r"(?<=\d)S", " S"),
            (r"(?<=[BCDFGHJ-NP-TV-Z])'?s\b", "'S"),
            (r"(?<=X')S\b", "s"),
            (r"(?i)(?<=[A-Z])\.(?=[A-Z])", "-"),
        )
        for pattern, repl in inline_steps:
            text = re.sub(pattern, repl, text)

        return text.strip()
def split_num(match: re.Match) -> str:
    """Split numbers for TTS processing.

    Times ("10:30") become "10 30"; plural years ("1990s") become
    "1990 s"; anything else is returned unchanged.
    """
    token = match.group(0)
    if ":" in token:
        hours, _, minutes = token.partition(":")
        return f"{hours} {minutes}"
    if token.endswith("s"):
        return f"{token[:-1]} s"
    return token
def handle_money(match: re.Match) -> str:
    """Format money strings for TTS.

    Replaces each currency symbol with its spoken word, padded with
    spaces (the symbol's position relative to the digits is preserved).
    """
    spoken = match.group(0)
    for symbol, word in (("$", " dollars "), ("£", " pounds ")):
        spoken = spoken.replace(symbol, word)
    return spoken
def handle_decimal(match: re.Match) -> str:
    """Format decimal numbers for TTS: each '.' becomes ' point '."""
    return " point ".join(match.group(0).split("."))
def benchmark_normalizers(test_cases: List[str], iterations: int = 100) -> Dict[str, float]:
    """Benchmark all three normalizer implementations.

    Args:
        test_cases: Texts normalized on every iteration.
        iterations: Number of full passes over ``test_cases`` per
            implementation.

    Returns:
        Mapping of implementation name ('inline', 'compiled', 'hybrid')
        to total wall-clock seconds spent normalizing.

    Note: the original signature declared ``Tuple[float, float, float]``
    but the function has always returned this dict; the annotation is
    corrected to match the actual behavior.
    """
    normalizers = {
        'inline': TextNormalizerInline(),
        'compiled': TextNormalizerCompiled(),
        'hybrid': TextNormalizerHybrid()
    }

    results = {}

    # Time each normalizer over the identical workload.
    for name, normalizer in normalizers.items():
        start = time.perf_counter()

        for _ in range(iterations):
            for test in test_cases:
                normalizer.normalize(test)

        results[name] = time.perf_counter() - start

    return results
def verify_outputs(test_cases: List[str]) -> bool:
    """Return True when all three implementations agree on every input."""
    implementations = (
        TextNormalizerInline(),
        TextNormalizerCompiled(),
        TextNormalizerHybrid(),
    )
    for sample in test_cases:
        # Collapsing results into a set: more than one distinct output
        # means at least two implementations disagree.
        outputs = {impl.normalize(sample) for impl in implementations}
        if len(outputs) > 1:
            return False
    return True
def main() -> None:
    """Generate test cases, verify implementation parity, then benchmark.

    Aborts (after printing a warning) if the three implementations do not
    produce identical output, since timing divergent code is meaningless.
    """
    print("Generating test cases...")
    test_cases = create_test_cases()
    total_chars = sum(len(t) for t in test_cases)
    print(f"Created {len(test_cases)} test cases, total size: {total_chars:,} characters")

    print("\nVerifying output consistency...")
    if verify_outputs(test_cases):
        print("✓ All implementations produce identical output")
    else:
        print("✗ Warning: Implementations produce different outputs!")
        return

    print("\nRunning benchmarks...")
    iterations = 100
    results = benchmark_normalizers(test_cases, iterations)

    print(f"\nResults for {iterations} iterations: ")
    for name, time_taken in results.items():
        print(f"{name.capitalize()}: {time_taken:.3f}s")


# Guard the entry point: the original called main() unconditionally,
# which ran the full benchmark as a side effect of importing this module.
if __name__ == "__main__":
    main()

View file

@ -36,10 +36,7 @@ def stream_to_speakers() -> None:
model="kokoro", model="kokoro",
voice="af", voice="af",
response_format="pcm", # similar to WAV, but without a header chunk at the start. response_format="pcm", # similar to WAV, but without a header chunk at the start.
input="""I see skies of blue and clouds of white input="""My dear sir, that is just where you are wrong. That is just where the whole world has gone wrong. We are always getting away from the present moment. Our mental existences, which are immaterial and have no dimensions, are passing along the Time-Dimension with a uniform velocity from the cradle to the grave. Just as we should travel down if we began our existence fifty miles above the earths surface""",
The bright blessed days, the dark sacred nights
And I think to myself
What a wonderful world""",
) as response: ) as response:
print(f"Time to first byte: {int((time.time() - start_time) * 1000)}ms") print(f"Time to first byte: {int((time.time() - start_time) * 1000)}ms")
for chunk in response.iter_bytes(chunk_size=1024): for chunk in response.iter_bytes(chunk_size=1024):

BIN
examples/output.wav Normal file

Binary file not shown.

Binary file not shown.