Swapped generator to preprocessing

This commit is contained in:
remsky 2025-01-04 22:23:59 -07:00
parent e799f0c7c1
commit 4c6cd83f85
23 changed files with 955 additions and 127 deletions

BIN
.coverage

Binary file not shown.

2
.gitignore vendored
View file

@ -25,3 +25,5 @@ examples/assorted_checks/test_voices/output/*
examples/assorted_checks/test_formats/output/*
examples/assorted_checks/benchmarks/output_audio_stream/*
ui/RepoScreenshot.png
examples/assorted_checks/benchmarks/output_audio_stream_openai/*

View file

@ -24,16 +24,16 @@ async def lifespan(app: FastAPI):
# Initialize the main model with warm-up
voicepack_count = TTSModel.setup()
# boundary = "█████╗"*9
boundary = "" * 54
boundary = "" * 30
startup_msg =f"""
{boundary}
{boundary}
"""

View file

@ -57,10 +57,8 @@ async def create_speech(
"pcm": "audio/pcm",
}.get(request.response_format, f"audio/{request.response_format}")
# Check if streaming is requested via header
is_streaming = x_raw_response == "stream"
if is_streaming:
# Check if streaming is requested (default for OpenAI client)
if request.stream:
# Stream audio chunks as they're generated
return StreamingResponse(
stream_audio_chunks(tts_service, request),

View file

@ -49,7 +49,7 @@ def handle_decimal(num: re.Match) -> str:
a, b = num.group().split(".")
return " point ".join([a, " ".join(b)])
@lru_cache(maxsize=1000) # Cache normalized text results
# @lru_cache(maxsize=1000) # Cache normalized text results
def normalize_text(text: str) -> str:
"""Normalize text for TTS processing

View file

@ -20,11 +20,40 @@ class TTSService:
def __init__(self, output_dir: str = None):
self.output_dir = output_dir
def _split_text(self, text: str) -> List[str]:
"""Split text into sentences"""
def _split_text(self, text: str):
"""Generate text chunks one at a time, splitting on natural pause points"""
if not isinstance(text, str):
text = str(text) if text is not None else ""
return [s.strip() for s in re.split(r"(?<=[.!?])\s+", text) if s.strip()]
# First split into sentences
sentences = re.split(r"(?<=[.!?])\s+", text)
for sentence in sentences:
sentence = sentence.strip()
if not sentence:
continue
# For longer sentences, split on commas and semicolons
if len(sentence) > 300: # Only split long sentences
# Split on pause points while preserving the punctuation
chunks = re.split(r"((?<=[,;])\s+)", sentence)
# Reassemble chunks with their trailing punctuation
current_chunk = ""
for i, chunk in enumerate(chunks):
if i % 2 == 0: # Text chunk
current_chunk += chunk
else: # Punctuation/whitespace chunk
current_chunk += chunk
if current_chunk.strip():
yield current_chunk.strip()
current_chunk = ""
# Yield any remaining text
if current_chunk.strip():
yield current_chunk.strip()
else:
yield sentence
@staticmethod
@lru_cache(maxsize=20) # Cache up to 20 most recently used voices
@ -69,11 +98,11 @@ class TTSService:
# Generate audio with or without stitching
if stitch_long_output:
chunks = self._split_text(text)
audio_chunks = []
chunk_count = 0
# Process all chunks
for i, chunk in enumerate(chunks):
# Process chunks as they're generated
for chunk in self._split_text(text):
try:
# Process text and generate audio
phonemes, tokens = TTSModel.process_text(chunk, voice[0])
@ -81,23 +110,21 @@ class TTSService:
if chunk_audio is not None:
audio_chunks.append(chunk_audio)
chunk_count += 1
else:
logger.error(f"No audio generated for chunk {i + 1}/{len(chunks)}")
logger.error(f"No audio generated for chunk {chunk_count + 1}")
except Exception as e:
logger.error(
f"Failed to generate audio for chunk {i + 1}/{len(chunks)}: '{chunk}'. Error: {str(e)}"
f"Failed to generate audio for chunk {chunk_count + 1}: '{chunk}'. Error: {str(e)}"
)
continue
if not audio_chunks:
raise ValueError("No audio chunks were generated successfully")
audio = (
np.concatenate(audio_chunks)
if len(audio_chunks) > 1
else audio_chunks[0]
)
# Concatenate all chunks
audio = np.concatenate(audio_chunks) if len(audio_chunks) > 1 else audio_chunks[0]
else:
# Process single chunk
phonemes, tokens = TTSModel.process_text(text, voice[0])
@ -132,11 +159,9 @@ class TTSService:
raise ValueError(f"Voice not found: {voice}")
voicepack = self._load_voice(voice_path)
# Split text into sentences for natural boundaries
chunks = self._split_text(text)
# Process and stream chunks
for i, chunk in enumerate(chunks):
# Process chunks as they're generated
is_first = True
for chunk in self._split_text(text):
try:
# Process text and generate audio
phonemes, tokens = TTSModel.process_text(chunk, voice[0])
@ -148,17 +173,16 @@ class TTSService:
chunk_audio,
24000,
output_format,
is_first_chunk=(i == 0),
is_first_chunk=is_first,
normalizer=stream_normalizer
)
yield chunk_bytes
is_first = False
else:
logger.error(f"No audio generated for chunk {i + 1}/{len(chunks)}")
logger.error(f"No audio generated for chunk: '{chunk}'")
except Exception as e:
logger.error(
f"Failed to generate audio for chunk {i + 1}/{len(chunks)}: '{chunk}'. Error: {str(e)}"
)
logger.error(f"Failed to generate audio for chunk: '{chunk}'. Error: {str(e)}")
continue
except Exception as e:

View file

@ -31,6 +31,6 @@ class OpenAISpeechRequest(BaseModel):
description="The speed of the generated audio. Select a value from 0.25 to 4.0.",
)
stream: bool = Field(
default=False,
description="If true, audio will be streamed as it's generated. Each chunk will be a complete sentence.",
default=True, # Default to streaming for OpenAI compatibility
description="If true (default), audio will be streamed as it's generated. Each chunk will be a complete sentence.",
)

View file

@ -32,8 +32,10 @@ services:
start_period: 1s
kokoro-tts:
build:
context: .
image: ghcr.io/remsky/kokoro-fastapi:latest
# Uncomment below to build from source instead of using the released image
# build:
# context: .
volumes:
- ./api/src:/app/api/src
- ./Kokoro-82M:/app/Kokoro-82M

View file

@ -31,7 +31,7 @@ def measure_first_token(text: str, output_dir: str, tokens: int, run_number: int
"model": "kokoro",
"input": text,
"voice": "af",
"response_format": "wav",
"response_format": "pcm",
"stream": True
},
stream=True,
@ -53,33 +53,19 @@ def measure_first_token(text: str, output_dir: str, tokens: int, run_number: int
results["time_to_first_chunk"] = first_chunk_time - start_time
chunks.append(chunk)
# Extract WAV header and data separately
# First chunk has header + data, subsequent chunks are raw PCM
# Concatenate all PCM chunks
if not chunks:
raise ValueError("No audio chunks received")
first_chunk = chunks[0]
remaining_chunks = chunks[1:]
all_audio_data = b''.join(chunks)
# Find end of WAV header (44 bytes for standard WAV)
header = first_chunk[:44]
first_data = first_chunk[44:]
# Concatenate all PCM data
all_data = first_data + b''.join(remaining_chunks)
# Update WAV header with total data size
import struct
data_size = len(all_data)
# Update data size field (bytes 4-7)
header = header[:4] + struct.pack('<I', data_size + 36) + header[8:]
# Update subchunk2 size field (bytes 40-43)
header = header[:40] + struct.pack('<I', data_size) + header[44:]
# Write complete WAV file
complete_audio = header + all_data
with open(audio_path, 'wb') as f:
f.write(complete_audio)
# Write as WAV file
import wave
with wave.open(audio_path, 'wb') as wav_file:
wav_file.setnchannels(1) # Mono
wav_file.setsampwidth(2) # 2 bytes per sample (16-bit)
wav_file.setframerate(24000) # Known sample rate for Kokoro
wav_file.writeframes(all_audio_data)
# Calculate audio length using scipy
import scipy.io.wavfile as wavfile
@ -89,7 +75,7 @@ def measure_first_token(text: str, output_dir: str, tokens: int, run_number: int
results["total_time"] = time.time() - start_time
# Print debug info
print(f"Complete audio size: {len(complete_audio)} bytes")
print(f"Complete audio size: {len(all_audio_data)} bytes")
print(f"Number of chunks received: {len(chunks)}")
print(f"Audio length: {results['audio_length']:.3f}s")
@ -114,7 +100,7 @@ def main():
text = f.read()
# Test specific token counts
token_sizes = [50, 100, 200, 500]
token_sizes = [50, 100, 200, 500, 1000, 2000, 5000, 10000]
all_results = []
for tokens in token_sizes:
@ -124,7 +110,7 @@ def main():
print(f"Text preview: {test_text[:50]}...")
# Run test 3 times for each size to get average
for i in range(3):
for i in range(5):
print(f"Run {i+1}/3...")
result = measure_first_token(test_text, output_dir, tokens, i + 1)
result["target_tokens"] = tokens
@ -194,7 +180,7 @@ def main():
plot_timeline(
df,
os.path.join(output_plots_dir, "first_token_timeline_stream.png")
os.path.join(output_plots_dir, "first_token_timeline_stream.png", suffix="(Streaming)")
)
print("\nResults and plots saved to:")

View file

@ -0,0 +1,184 @@
#!/usr/bin/env python3
import os
import time
import json
import numpy as np
import pandas as pd
from openai import OpenAI
from lib.shared_benchmark_utils import get_text_for_tokens, enc
from lib.shared_utils import save_json_results
from lib.shared_plotting import plot_correlation, plot_timeline
def measure_first_token(text: str, output_dir: str, tokens: int, run_number: int) -> dict:
    """Benchmark one streaming TTS request made through the OpenAI client.

    Streams PCM audio for ``text`` from the local Kokoro server, records
    time-to-first-chunk and total wall time, saves the audio as a WAV file
    under ``output_dir``, and returns a dict of measurements. On failure,
    ``error`` is set and the unfilled fields stay None.
    """
    results = {
        "text_length": len(text),
        "token_count": len(enc.encode(text)),
        "total_time": None,
        "time_to_first_chunk": None,
        "error": None,
        "audio_path": None,
        "audio_length": None  # Length of output audio in seconds
    }
    try:
        t_start = time.time()

        # Client pointed at the local server; the API key is unused but required.
        client = OpenAI(base_url="http://localhost:8880/v1", api_key="not-needed-for-local")

        # Destination for the complete audio of this run.
        wav_name = f"benchmark_tokens{tokens}_run{run_number}_stream_openai.wav"
        audio_path = os.path.join(output_dir, wav_name)
        results["audio_path"] = audio_path

        first_chunk_at = None
        pcm_buffer = bytearray()
        n_chunks = 0

        # Stream raw PCM chunks and accumulate them as they arrive.
        with client.audio.speech.with_streaming_response.create(
            model="kokoro",
            voice="af",
            response_format="pcm",
            input=text,
        ) as response:
            for piece in response.iter_bytes(chunk_size=1024):
                if not piece:
                    continue
                n_chunks += 1
                if first_chunk_at is None:
                    first_chunk_at = time.time()
                    results["time_to_first_chunk"] = first_chunk_at - t_start
                pcm_buffer.extend(piece)

        # Wrap the raw PCM stream in a WAV container.
        import wave
        with wave.open(audio_path, 'wb') as wav_file:
            wav_file.setnchannels(1)      # Mono
            wav_file.setsampwidth(2)      # 2 bytes per sample (16-bit)
            wav_file.setframerate(24000)  # Known sample rate for Kokoro
            wav_file.writeframes(pcm_buffer)

        # Derive audio duration from the file we just wrote.
        import scipy.io.wavfile as wavfile
        sample_rate, audio_data = wavfile.read(audio_path)
        results["audio_length"] = len(audio_data) / sample_rate  # Length in seconds

        results["total_time"] = time.time() - t_start

        # Debug output.
        print(f"Complete audio size: {len(pcm_buffer)} bytes")
        print(f"Number of chunks received: {n_chunks}")
        print(f"Audio length: {results['audio_length']:.3f}s")
        return results
    except Exception as e:
        results["error"] = str(e)
        return results
def main():
    """Run the OpenAI-client streaming benchmark and save results and plots.

    For each target token count, benchmarks 5 streaming runs, prints per-run
    timings, aggregates per-size averages, writes a JSON summary, and renders
    latency/timeline plots (all artifacts use the _stream_openai suffix).
    """

    def _fmt(value):
        # Failed runs leave timing fields as None; the original formatted them
        # with ":.3f" unconditionally, which raised TypeError and aborted the
        # whole benchmark on the first errored run. Print "N/A" instead.
        return f"{value:.3f}s" if isinstance(value, (int, float)) else "N/A"

    # Set up paths with _stream_openai suffix
    script_dir = os.path.dirname(os.path.abspath(__file__))
    output_dir = os.path.join(script_dir, "output_audio_stream_openai")
    output_data_dir = os.path.join(script_dir, "output_data")

    # Create output directories
    os.makedirs(output_dir, exist_ok=True)
    os.makedirs(output_data_dir, exist_ok=True)

    # Load sample text
    with open(os.path.join(script_dir, "the_time_machine_hg_wells.txt"), "r", encoding="utf-8") as f:
        text = f.read()

    # Test specific token counts
    token_sizes = [50, 100, 200, 500]
    all_results = []

    for tokens in token_sizes:
        print(f"\nTesting {tokens} tokens (streaming)")
        test_text = get_text_for_tokens(text, tokens)
        actual_tokens = len(enc.encode(test_text))
        print(f"Text preview: {test_text[:50]}...")

        # Run test 5 times for each size to get average
        for i in range(5):
            print(f"Run {i+1}/5...")
            result = measure_first_token(test_text, output_dir, tokens, i + 1)
            result["target_tokens"] = tokens
            result["actual_tokens"] = actual_tokens
            result["run_number"] = i + 1

            first = result.get("time_to_first_chunk")
            total = result.get("total_time")
            print(f"Time to First Audio: {_fmt(first)}")
            print(f"Time to Save Complete: {_fmt(total)}")
            print(f"Audio length: {_fmt(result.get('audio_length'))}")
            # Overhead is only meaningful when both timings were captured.
            overhead = total - first if first is not None and total is not None else None
            print(f"Streaming overhead: {_fmt(overhead)}")

            if result["error"]:
                print(f"Error: {result['error']}")

            all_results.append(result)

    # Calculate averages per token size (error runs are excluded)
    summary = {}
    for tokens in token_sizes:
        matching_results = [r for r in all_results if r["target_tokens"] == tokens and not r["error"]]
        if matching_results:
            avg_first_chunk = sum(r["time_to_first_chunk"] for r in matching_results) / len(matching_results)
            avg_total = sum(r["total_time"] for r in matching_results) / len(matching_results)
            avg_audio_length = sum(r["audio_length"] for r in matching_results) / len(matching_results)
            summary[tokens] = {
                "avg_time_to_first_chunk": round(avg_first_chunk, 3),
                "avg_total_time": round(avg_total, 3),
                "avg_audio_length": round(avg_audio_length, 3),
                "num_successful_runs": len(matching_results)
            }

    # Save results with _stream_openai suffix
    results_data = {
        "individual_runs": all_results,
        "summary": summary,
        "timestamp": time.strftime("%Y-%m-%d %H:%M:%S")
    }
    save_json_results(
        results_data,
        os.path.join(output_data_dir, "first_token_benchmark_stream_openai.json")
    )

    # Create plot directory if it doesn't exist
    output_plots_dir = os.path.join(script_dir, "output_plots")
    os.makedirs(output_plots_dir, exist_ok=True)

    # Create DataFrame for plotting
    df = pd.DataFrame(all_results)

    # Create plots with _stream_openai suffix
    plot_correlation(
        df, "target_tokens", "time_to_first_chunk",
        "Time to First Audio vs Input Size (OpenAI Streaming)",
        "Number of Input Tokens",
        "Time to First Audio (seconds)",
        os.path.join(output_plots_dir, "first_token_latency_stream_openai.png")
    )
    plot_correlation(
        df, "target_tokens", "total_time",
        "Total Time vs Input Size (OpenAI Streaming)",
        "Number of Input Tokens",
        "Total Time (seconds)",
        os.path.join(output_plots_dir, "total_time_latency_stream_openai.png")
    )
    plot_timeline(
        df,
        os.path.join(output_plots_dir, "first_token_timeline_stream_openai.png")
    )

    print("\nResults and plots saved to:")
    print(f"- {os.path.join(output_data_dir, 'first_token_benchmark_stream_openai.json')}")
    print(f"- {os.path.join(output_plots_dir, 'first_token_latency_stream_openai.png')}")
    print(f"- {os.path.join(output_plots_dir, 'total_time_latency_stream_openai.png')}")
    print(f"- {os.path.join(output_plots_dir, 'first_token_timeline_stream_openai.png')}")


if __name__ == "__main__":
    main()

View file

@ -138,7 +138,7 @@ def plot_system_metrics(metrics_data, output_path):
plt.savefig(output_path, dpi=300, bbox_inches="tight")
plt.close()
def plot_timeline(df, output_path):
def plot_timeline(df, output_path, suffix=""):
"""Create timeline plot showing latency for each run.
Args:
@ -255,7 +255,7 @@ def plot_timeline(df, output_path):
# Customize appearance
setup_plot(
fig, ax,
"Time-To-Audio Latency",
"Time-To-Audio Latency" + suffix,
xlabel="Time (seconds)",
ylabel="Input Size"
)

View file

@ -3,11 +3,11 @@
{
"text_length": 212,
"token_count": 50,
"total_time": 0.9603095054626465,
"time_to_first_chunk": 0.5916037559509277,
"total_time": 0.7278211116790771,
"time_to_first_chunk": 0.3613290786743164,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens50_run1_stream.wav",
"audio_length": 15.45,
"audio_length": 16.325,
"target_tokens": 50,
"actual_tokens": 50,
"run_number": 1
@ -15,11 +15,11 @@
{
"text_length": 212,
"token_count": 50,
"total_time": 0.5130870342254639,
"time_to_first_chunk": 0.27448558807373047,
"total_time": 0.4556088447570801,
"time_to_first_chunk": 0.18642044067382812,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens50_run2_stream.wav",
"audio_length": 15.45,
"audio_length": 16.325,
"target_tokens": 50,
"actual_tokens": 50,
"run_number": 2
@ -27,23 +27,47 @@
{
"text_length": 212,
"token_count": 50,
"total_time": 0.4667215347290039,
"time_to_first_chunk": 0.22882533073425293,
"total_time": 0.5538768768310547,
"time_to_first_chunk": 0.2720797061920166,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens50_run3_stream.wav",
"audio_length": 15.45,
"audio_length": 16.325,
"target_tokens": 50,
"actual_tokens": 50,
"run_number": 3
},
{
"text_length": 212,
"token_count": 50,
"total_time": 0.4395604133605957,
"time_to_first_chunk": 0.15613913536071777,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens50_run4_stream.wav",
"audio_length": 16.325,
"target_tokens": 50,
"actual_tokens": 50,
"run_number": 4
},
{
"text_length": 212,
"token_count": 50,
"total_time": 0.45748305320739746,
"time_to_first_chunk": 0.18805718421936035,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens50_run5_stream.wav",
"audio_length": 16.325,
"target_tokens": 50,
"actual_tokens": 50,
"run_number": 5
},
{
"text_length": 448,
"token_count": 100,
"total_time": 0.9051008224487305,
"time_to_first_chunk": 0.2526383399963379,
"total_time": 0.7347762584686279,
"time_to_first_chunk": 0.16963744163513184,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens100_run1_stream.wav",
"audio_length": 30.25,
"audio_length": 31.1,
"target_tokens": 100,
"actual_tokens": 100,
"run_number": 1
@ -51,11 +75,11 @@
{
"text_length": 448,
"token_count": 100,
"total_time": 0.8579132556915283,
"time_to_first_chunk": 0.25691914558410645,
"total_time": 0.8288509845733643,
"time_to_first_chunk": 0.20123004913330078,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens100_run2_stream.wav",
"audio_length": 30.25,
"audio_length": 31.1,
"target_tokens": 100,
"actual_tokens": 100,
"run_number": 2
@ -63,23 +87,47 @@
{
"text_length": 448,
"token_count": 100,
"total_time": 0.9683890342712402,
"time_to_first_chunk": 0.26229000091552734,
"total_time": 0.7503848075866699,
"time_to_first_chunk": 0.21662068367004395,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens100_run3_stream.wav",
"audio_length": 30.25,
"audio_length": 31.1,
"target_tokens": 100,
"actual_tokens": 100,
"run_number": 3
},
{
"text_length": 448,
"token_count": 100,
"total_time": 0.694899320602417,
"time_to_first_chunk": 0.1966841220855713,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens100_run4_stream.wav",
"audio_length": 31.1,
"target_tokens": 100,
"actual_tokens": 100,
"run_number": 4
},
{
"text_length": 448,
"token_count": 100,
"total_time": 0.68701171875,
"time_to_first_chunk": 0.19341063499450684,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens100_run5_stream.wav",
"audio_length": 31.1,
"target_tokens": 100,
"actual_tokens": 100,
"run_number": 5
},
{
"text_length": 906,
"token_count": 200,
"total_time": 1.8075971603393555,
"time_to_first_chunk": 0.22536945343017578,
"total_time": 1.6845426559448242,
"time_to_first_chunk": 0.21096158027648926,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens200_run1_stream.wav",
"audio_length": 60.75,
"audio_length": 62.625,
"target_tokens": 200,
"actual_tokens": 200,
"run_number": 1
@ -87,11 +135,11 @@
{
"text_length": 906,
"token_count": 200,
"total_time": 1.493518590927124,
"time_to_first_chunk": 0.21502947807312012,
"total_time": 1.3545098304748535,
"time_to_first_chunk": 0.18648386001586914,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens200_run2_stream.wav",
"audio_length": 60.75,
"audio_length": 62.625,
"target_tokens": 200,
"actual_tokens": 200,
"run_number": 2
@ -99,23 +147,47 @@
{
"text_length": 906,
"token_count": 200,
"total_time": 1.4910809993743896,
"time_to_first_chunk": 0.21600556373596191,
"total_time": 1.426060676574707,
"time_to_first_chunk": 0.20081472396850586,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens200_run3_stream.wav",
"audio_length": 60.75,
"audio_length": 62.625,
"target_tokens": 200,
"actual_tokens": 200,
"run_number": 3
},
{
"text_length": 906,
"token_count": 200,
"total_time": 1.4084081649780273,
"time_to_first_chunk": 0.18551135063171387,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens200_run4_stream.wav",
"audio_length": 62.625,
"target_tokens": 200,
"actual_tokens": 200,
"run_number": 4
},
{
"text_length": 906,
"token_count": 200,
"total_time": 1.4703152179718018,
"time_to_first_chunk": 0.17750859260559082,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens200_run5_stream.wav",
"audio_length": 62.625,
"target_tokens": 200,
"actual_tokens": 200,
"run_number": 5
},
{
"text_length": 2232,
"token_count": 500,
"total_time": 4.223623275756836,
"time_to_first_chunk": 0.20010590553283691,
"total_time": 4.289574384689331,
"time_to_first_chunk": 0.1997976303100586,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens500_run1_stream.wav",
"audio_length": 147.775,
"audio_length": 157.875,
"target_tokens": 500,
"actual_tokens": 500,
"run_number": 1
@ -123,11 +195,11 @@
{
"text_length": 2232,
"token_count": 500,
"total_time": 3.8811349868774414,
"time_to_first_chunk": 0.24638962745666504,
"total_time": 3.7089381217956543,
"time_to_first_chunk": 0.25969815254211426,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens500_run2_stream.wav",
"audio_length": 147.775,
"audio_length": 157.875,
"target_tokens": 500,
"actual_tokens": 500,
"run_number": 2
@ -135,41 +207,65 @@
{
"text_length": 2232,
"token_count": 500,
"total_time": 4.045536994934082,
"time_to_first_chunk": 0.2252039909362793,
"total_time": 4.138366222381592,
"time_to_first_chunk": 0.1831505298614502,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens500_run3_stream.wav",
"audio_length": 147.775,
"audio_length": 157.875,
"target_tokens": 500,
"actual_tokens": 500,
"run_number": 3
},
{
"text_length": 2232,
"token_count": 500,
"total_time": 3.980635643005371,
"time_to_first_chunk": 0.20493030548095703,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens500_run4_stream.wav",
"audio_length": 157.875,
"target_tokens": 500,
"actual_tokens": 500,
"run_number": 4
},
{
"text_length": 2232,
"token_count": 500,
"total_time": 4.1370298862457275,
"time_to_first_chunk": 0.19150757789611816,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens500_run5_stream.wav",
"audio_length": 157.875,
"target_tokens": 500,
"actual_tokens": 500,
"run_number": 5
}
],
"summary": {
"50": {
"avg_time_to_first_chunk": 0.365,
"avg_total_time": 0.647,
"avg_audio_length": 15.45,
"num_successful_runs": 3
"avg_time_to_first_chunk": 0.233,
"avg_total_time": 0.527,
"avg_audio_length": 16.325,
"num_successful_runs": 5
},
"100": {
"avg_time_to_first_chunk": 0.257,
"avg_total_time": 0.91,
"avg_audio_length": 30.25,
"num_successful_runs": 3
"avg_time_to_first_chunk": 0.196,
"avg_total_time": 0.739,
"avg_audio_length": 31.1,
"num_successful_runs": 5
},
"200": {
"avg_time_to_first_chunk": 0.219,
"avg_total_time": 1.597,
"avg_audio_length": 60.75,
"num_successful_runs": 3
"avg_time_to_first_chunk": 0.192,
"avg_total_time": 1.469,
"avg_audio_length": 62.625,
"num_successful_runs": 5
},
"500": {
"avg_time_to_first_chunk": 0.224,
"avg_total_time": 4.05,
"avg_audio_length": 147.775,
"num_successful_runs": 3
"avg_time_to_first_chunk": 0.208,
"avg_total_time": 4.051,
"avg_audio_length": 157.875,
"num_successful_runs": 5
}
},
"timestamp": "2025-01-04 14:59:28"
"timestamp": "2025-01-04 22:16:30"
}

View file

@ -0,0 +1,271 @@
{
"individual_runs": [
{
"text_length": 212,
"token_count": 50,
"total_time": 1.149611473083496,
"time_to_first_chunk": 0.8767304420471191,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream_openai\\benchmark_tokens50_run1_stream_openai.wav",
"audio_length": 16.325,
"target_tokens": 50,
"actual_tokens": 50,
"run_number": 1
},
{
"text_length": 212,
"token_count": 50,
"total_time": 0.9325947761535645,
"time_to_first_chunk": 0.5965914726257324,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream_openai\\benchmark_tokens50_run2_stream_openai.wav",
"audio_length": 16.325,
"target_tokens": 50,
"actual_tokens": 50,
"run_number": 2
},
{
"text_length": 212,
"token_count": 50,
"total_time": 0.9205234050750732,
"time_to_first_chunk": 0.5961906909942627,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream_openai\\benchmark_tokens50_run3_stream_openai.wav",
"audio_length": 16.325,
"target_tokens": 50,
"actual_tokens": 50,
"run_number": 3
},
{
"text_length": 212,
"token_count": 50,
"total_time": 1.1321916580200195,
"time_to_first_chunk": 0.6946916580200195,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream_openai\\benchmark_tokens50_run4_stream_openai.wav",
"audio_length": 16.325,
"target_tokens": 50,
"actual_tokens": 50,
"run_number": 4
},
{
"text_length": 212,
"token_count": 50,
"total_time": 1.1146185398101807,
"time_to_first_chunk": 0.6918885707855225,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream_openai\\benchmark_tokens50_run5_stream_openai.wav",
"audio_length": 16.325,
"target_tokens": 50,
"actual_tokens": 50,
"run_number": 5
},
{
"text_length": 448,
"token_count": 100,
"total_time": 1.3645410537719727,
"time_to_first_chunk": 0.6802399158477783,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream_openai\\benchmark_tokens100_run1_stream_openai.wav",
"audio_length": 31.1,
"target_tokens": 100,
"actual_tokens": 100,
"run_number": 1
},
{
"text_length": 448,
"token_count": 100,
"total_time": 1.4154777526855469,
"time_to_first_chunk": 0.7297353744506836,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream_openai\\benchmark_tokens100_run2_stream_openai.wav",
"audio_length": 31.1,
"target_tokens": 100,
"actual_tokens": 100,
"run_number": 2
},
{
"text_length": 448,
"token_count": 100,
"total_time": 1.3589520454406738,
"time_to_first_chunk": 0.698603630065918,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream_openai\\benchmark_tokens100_run3_stream_openai.wav",
"audio_length": 31.1,
"target_tokens": 100,
"actual_tokens": 100,
"run_number": 3
},
{
"text_length": 448,
"token_count": 100,
"total_time": 1.2276430130004883,
"time_to_first_chunk": 0.6705801486968994,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream_openai\\benchmark_tokens100_run4_stream_openai.wav",
"audio_length": 31.1,
"target_tokens": 100,
"actual_tokens": 100,
"run_number": 4
},
{
"text_length": 448,
"token_count": 100,
"total_time": 1.0949454307556152,
"time_to_first_chunk": 0.5698442459106445,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream_openai\\benchmark_tokens100_run5_stream_openai.wav",
"audio_length": 31.1,
"target_tokens": 100,
"actual_tokens": 100,
"run_number": 5
},
{
"text_length": 906,
"token_count": 200,
"total_time": 1.8211240768432617,
"time_to_first_chunk": 0.6070489883422852,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream_openai\\benchmark_tokens200_run1_stream_openai.wav",
"audio_length": 62.625,
"target_tokens": 200,
"actual_tokens": 200,
"run_number": 1
},
{
"text_length": 906,
"token_count": 200,
"total_time": 1.8376774787902832,
"time_to_first_chunk": 0.6538689136505127,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream_openai\\benchmark_tokens200_run2_stream_openai.wav",
"audio_length": 62.625,
"target_tokens": 200,
"actual_tokens": 200,
"run_number": 2
},
{
"text_length": 906,
"token_count": 200,
"total_time": 1.6953792572021484,
"time_to_first_chunk": 0.5554308891296387,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream_openai\\benchmark_tokens200_run3_stream_openai.wav",
"audio_length": 62.625,
"target_tokens": 200,
"actual_tokens": 200,
"run_number": 3
},
{
"text_length": 906,
"token_count": 200,
"total_time": 1.887030839920044,
"time_to_first_chunk": 0.5866930484771729,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream_openai\\benchmark_tokens200_run4_stream_openai.wav",
"audio_length": 62.625,
"target_tokens": 200,
"actual_tokens": 200,
"run_number": 4
},
{
"text_length": 906,
"token_count": 200,
"total_time": 1.7908406257629395,
"time_to_first_chunk": 0.5897490978240967,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream_openai\\benchmark_tokens200_run5_stream_openai.wav",
"audio_length": 62.625,
"target_tokens": 200,
"actual_tokens": 200,
"run_number": 5
},
{
"text_length": 2232,
"token_count": 500,
"total_time": 4.228837013244629,
"time_to_first_chunk": 0.5315976142883301,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream_openai\\benchmark_tokens500_run1_stream_openai.wav",
"audio_length": 157.875,
"target_tokens": 500,
"actual_tokens": 500,
"run_number": 1
},
{
"text_length": 2232,
"token_count": 500,
"total_time": 4.489210367202759,
"time_to_first_chunk": 0.5261838436126709,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream_openai\\benchmark_tokens500_run2_stream_openai.wav",
"audio_length": 157.875,
"target_tokens": 500,
"actual_tokens": 500,
"run_number": 2
},
{
"text_length": 2232,
"token_count": 500,
"total_time": 4.5290446281433105,
"time_to_first_chunk": 0.6186764240264893,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream_openai\\benchmark_tokens500_run3_stream_openai.wav",
"audio_length": 157.875,
"target_tokens": 500,
"actual_tokens": 500,
"run_number": 3
},
{
"text_length": 2232,
"token_count": 500,
"total_time": 4.209261178970337,
"time_to_first_chunk": 0.5990591049194336,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream_openai\\benchmark_tokens500_run4_stream_openai.wav",
"audio_length": 157.875,
"target_tokens": 500,
"actual_tokens": 500,
"run_number": 4
},
{
"text_length": 2232,
"token_count": 500,
"total_time": 4.218762636184692,
"time_to_first_chunk": 0.5466251373291016,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream_openai\\benchmark_tokens500_run5_stream_openai.wav",
"audio_length": 157.875,
"target_tokens": 500,
"actual_tokens": 500,
"run_number": 5
}
],
"summary": {
"50": {
"avg_time_to_first_chunk": 0.691,
"avg_total_time": 1.05,
"avg_audio_length": 16.325,
"num_successful_runs": 5
},
"100": {
"avg_time_to_first_chunk": 0.67,
"avg_total_time": 1.292,
"avg_audio_length": 31.1,
"num_successful_runs": 5
},
"200": {
"avg_time_to_first_chunk": 0.599,
"avg_total_time": 1.806,
"avg_audio_length": 62.625,
"num_successful_runs": 5
},
"500": {
"avg_time_to_first_chunk": 0.564,
"avg_total_time": 4.335,
"avg_audio_length": 157.875,
"num_successful_runs": 5
}
},
"timestamp": "2025-01-04 22:18:03"
}

Binary file not shown.

Before

Width:  |  Height:  |  Size: 214 KiB

After

Width:  |  Height:  |  Size: 210 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 268 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 189 KiB

After

Width:  |  Height:  |  Size: 193 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 196 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 243 KiB

After

Width:  |  Height:  |  Size: 252 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 258 KiB

View file

@ -0,0 +1,268 @@
import random
import re
import string
import time
from typing import Dict, List, Tuple
def create_test_cases() -> List[str]:
    """Create a variety of test cases with different characteristics.

    Returns:
        A list of test strings: a handful of hand-written cases that hit
        specific normalization patterns, plus randomly generated texts of
        lengths 100, 1000 and 10000 characters (three variations each) with
        known patterns spliced in at fixed offsets.
    """

    # Helper to create random text with specific patterns
    def random_text(length: int) -> str:
        return ''.join(random.choice(string.ascii_letters + string.digits + " .,!?") for _ in range(length))

    # Base test cases that hit specific patterns
    base_cases = [
        "Dr. Smith and Mr. Jones discussed the $1,234.56 million investment.",
        "Yeah, they met at 10:30 and reviewed A.B.C. documentation with Mrs. Brown etc.",
        'The temperature was 72.5 degrees (quite normal) for "this time" of year.',
        "X's and Y's properties cost £50 million in the 1990s",
        "こんにちは。今日は!",
    ]

    # Add base cases
    test_cases = list(base_cases)

    # Patterns we want present in every generated case, with the (start, end)
    # windows they should occupy.
    insertions = [(10, 20, "Dr. Smith"), (30, 40, "$1,234.56"), (50, 60, "A.B.C. xyz")]

    # Add variations with random content
    for length in [100, 1000, 10000]:
        # Create 3 variations of each length
        for _ in range(3):
            text = random_text(length)
            # Splice the patterns in by index.  The previous implementation
            # used str.replace() on a random slice, which replaces *every*
            # occurrence of that slice and operates on offsets shifted by
            # earlier substitutions, so insertions could be duplicated or
            # silently land in the wrong place.  Splicing from the highest
            # offset down keeps the earlier offsets stable.
            for start, end, pattern in sorted(insertions, reverse=True):
                text = text[:start] + pattern + text[end:]
            test_cases.append(text)

    return test_cases
class TextNormalizerInline:
    """Text normalizer using inline patterns.

    Benchmark baseline: every regex is handed to ``re.sub`` as a raw string,
    so compiled-pattern reuse relies entirely on the ``re`` module's internal
    cache.  Substitution order matters — later rules operate on the output of
    earlier ones.
    """

    def normalize(self, text: str) -> str:
        """Return *text* rewritten for TTS (quotes, titles, numbers, money)."""
        # Replace quotes and brackets: curly single quotes -> ASCII apostrophe,
        # guillemets -> curly double quotes -> ASCII double quotes, then
        # parentheses are swapped to guillemets for downstream handling.
        text = text.replace(chr(8216), "'").replace(chr(8217), "'")
        text = text.replace("«", chr(8220)).replace("»", chr(8221))
        text = text.replace(chr(8220), '"').replace(chr(8221), '"')
        text = text.replace("(", "«").replace(")", "»")

        # Handle CJK punctuation: map each fullwidth mark to its ASCII
        # equivalent followed by a space.
        for a, b in zip("、。!,:;?", ",.!,:;?"):
            text = text.replace(a, b + " ")

        # Whitespace cleanup: odd whitespace (anything except space/newline)
        # -> space, collapse runs of spaces, drop spaces stranded between
        # newlines.
        text = re.sub(r"[^\S \n]", " ", text)
        text = re.sub(r" +", " ", text)
        text = re.sub(r"(?<=\n) +(?=\n)", "", text)

        # Expand honorifics before a capitalized name, and bare "etc."
        text = re.sub(r"\bD[Rr]\.(?= [A-Z])", "Doctor", text)
        text = re.sub(r"\b(?:Mr\.|MR\.(?= [A-Z]))", "Mister", text)
        text = re.sub(r"\b(?:Ms\.|MS\.(?= [A-Z]))", "Miss", text)
        text = re.sub(r"\b(?:Mrs\.|MRS\.(?= [A-Z]))", "Mrs", text)
        text = re.sub(r"\betc\.(?! [A-Z])", "etc", text)

        # Phonetic tweak: "yeah"/"yea" -> "ye'a" (case-preserving first letter)
        text = re.sub(r"(?i)\b(y)eah?\b", r"\1e'a", text)

        # Numbers: decimals / 4-digit years / clock times go through split_num,
        # then thousands separators are stripped, money amounts are expanded,
        # and remaining decimals are spelled with "point".
        text = re.sub(r"\d*\.\d+|\b\d{4}s?\b|(?<!:)\b(?:[1-9]|1[0-2]):[0-5]\d\b(?!:)", split_num, text)
        text = re.sub(r"(?<=\d),(?=\d)", "", text)
        text = re.sub(r"(?i)[$£]\d+(?:\.\d+)?(?: hundred| thousand| (?:[bm]|tr)illion)*\b|[$£]\d+\.\d\d?\b", handle_money, text)
        text = re.sub(r"\d*\.\d+", handle_decimal, text)

        # Numeric ranges ("3-5" -> "3 to 5") and letter spacing after digits.
        text = re.sub(r"(?<=\d)-(?=\d)", " to ", text)
        text = re.sub(r"(?<=\d)S", " S", text)

        # Possessives and initials: normalize 's after consonants, undo it
        # after X', and join runs of initials ("A.B.C." -> "A-B-C").
        text = re.sub(r"(?<=[BCDFGHJ-NP-TV-Z])'?s\b", "'S", text)
        text = re.sub(r"(?<=X')S\b", "s", text)
        text = re.sub(r"(?:[A-Za-z]\.){2,} [a-z]", lambda m: m.group().replace(".", "-"), text)
        text = re.sub(r"(?i)(?<=[A-Z])\.(?=[A-Z])", "-", text)

        return text.strip()
class TextNormalizerCompiled:
    """Text normalizer using all compiled patterns.

    Every regex is compiled once in ``__init__``; ``normalize`` then applies
    them as an ordered pipeline.  Order matters — later substitutions operate
    on the output of earlier ones.
    """

    def __init__(self) -> None:
        self.patterns = {
            'whitespace': re.compile(r"[^\S \n]"),
            'multi_space': re.compile(r" +"),
            'newline_space': re.compile(r"(?<=\n) +(?=\n)"),
            'doctor': re.compile(r"\bD[Rr]\.(?= [A-Z])"),
            'mister': re.compile(r"\b(?:Mr\.|MR\.(?= [A-Z]))"),
            'miss': re.compile(r"\b(?:Ms\.|MS\.(?= [A-Z]))"),
            'mrs': re.compile(r"\b(?:Mrs\.|MRS\.(?= [A-Z]))"),
            'etc': re.compile(r"\betc\.(?! [A-Z])"),
            'yeah': re.compile(r"(?i)\b(y)eah?\b"),
            'numbers': re.compile(r"\d*\.\d+|\b\d{4}s?\b|(?<!:)\b(?:[1-9]|1[0-2]):[0-5]\d\b(?!:)"),
            'comma_in_number': re.compile(r"(?<=\d),(?=\d)"),
            'money': re.compile(r"(?i)[$£]\d+(?:\.\d+)?(?: hundred| thousand| (?:[bm]|tr)illion)*\b|[$£]\d+\.\d\d?\b"),
            'decimal': re.compile(r"\d*\.\d+"),
            'range': re.compile(r"(?<=\d)-(?=\d)"),
            's_after_number': re.compile(r"(?<=\d)S"),
            'possessive_s': re.compile(r"(?<=[BCDFGHJ-NP-TV-Z])'?s\b"),
            'x_possessive': re.compile(r"(?<=X')S\b"),
            'initials': re.compile(r"(?:[A-Za-z]\.){2,} [a-z]"),
            'single_initial': re.compile(r"(?i)(?<=[A-Z])\.(?=[A-Z])")
        }
        # Ordered (pattern-name, replacement) pipeline applied by normalize().
        # The sequence mirrors the inline implementation exactly.
        self._pipeline = [
            ('whitespace', " "),
            ('multi_space', " "),
            ('newline_space', ""),
            ('doctor', "Doctor"),
            ('mister', "Mister"),
            ('miss', "Miss"),
            ('mrs', "Mrs"),
            ('etc', "etc"),
            ('yeah', r"\1e'a"),
            ('numbers', split_num),
            ('comma_in_number', ""),
            ('money', handle_money),
            ('decimal', handle_decimal),
            ('range', " to "),
            ('s_after_number', " S"),
            ('possessive_s', "'S"),
            ('x_possessive', "s"),
            ('initials', lambda m: m.group().replace(".", "-")),
            ('single_initial', "-"),
        ]

    def normalize(self, text: str) -> str:
        """Return *text* rewritten for TTS using the pre-compiled pipeline."""
        # Replace quotes and brackets
        text = text.replace(chr(8216), "'").replace(chr(8217), "'")
        text = text.replace("«", chr(8220)).replace("»", chr(8221))
        text = text.replace(chr(8220), '"').replace(chr(8221), '"')
        text = text.replace("(", "«").replace(")", "»")

        # Handle CJK punctuation
        for a, b in zip("、。!,:;?", ",.!,:;?"):
            text = text.replace(a, b + " ")

        # Apply every compiled pattern in its fixed order.
        for name, replacement in self._pipeline:
            text = self.patterns[name].sub(replacement, text)

        return text.strip()
class TextNormalizerHybrid:
    """Text normalizer using hybrid approach - compile only complex/frequent patterns.

    Middle ground for the benchmark: the four most complex patterns are
    pre-compiled; the simpler ones stay inline and rely on the ``re`` module's
    internal cache.  Substitution order matters — later rules operate on the
    output of earlier ones.
    """

    def __init__(self) -> None:
        # Only compile patterns that are complex or frequently used
        self.patterns = {
            'whitespace': re.compile(r"[^\S \n]"),
            'numbers': re.compile(r"\d*\.\d+|\b\d{4}s?\b|(?<!:)\b(?:[1-9]|1[0-2]):[0-5]\d\b(?!:)"),
            'money': re.compile(r"(?i)[$£]\d+(?:\.\d+)?(?: hundred| thousand| (?:[bm]|tr)illion)*\b|[$£]\d+\.\d\d?\b"),
            'initials': re.compile(r"(?:[A-Za-z]\.){2,} [a-z]")
        }

    def normalize(self, text: str) -> str:
        """Return *text* rewritten for TTS (quotes, titles, numbers, money)."""
        # Replace quotes and brackets: curly single quotes -> ASCII apostrophe,
        # then parentheses are swapped to guillemets for downstream handling.
        text = text.replace(chr(8216), "'").replace(chr(8217), "'")
        text = text.replace("«", chr(8220)).replace("»", chr(8221))
        text = text.replace(chr(8220), '"').replace(chr(8221), '"')
        text = text.replace("(", "«").replace(")", "»")

        # Handle CJK punctuation: fullwidth mark -> ASCII + trailing space
        for a, b in zip("、。!,:;?", ",.!,:;?"):
            text = text.replace(a, b + " ")

        # Use compiled patterns for complex operations
        text = self.patterns['whitespace'].sub(" ", text)
        text = self.patterns['numbers'].sub(split_num, text)
        text = self.patterns['money'].sub(handle_money, text)
        text = self.patterns['initials'].sub(lambda m: m.group().replace(".", "-"), text)

        # Use inline patterns for simpler operations: whitespace collapse,
        # honorific expansion, phonetic tweaks, number formatting, possessives.
        text = re.sub(r" +", " ", text)
        text = re.sub(r"(?<=\n) +(?=\n)", "", text)
        text = re.sub(r"\bD[Rr]\.(?= [A-Z])", "Doctor", text)
        text = re.sub(r"\b(?:Mr\.|MR\.(?= [A-Z]))", "Mister", text)
        text = re.sub(r"\b(?:Ms\.|MS\.(?= [A-Z]))", "Miss", text)
        text = re.sub(r"\b(?:Mrs\.|MRS\.(?= [A-Z]))", "Mrs", text)
        text = re.sub(r"\betc\.(?! [A-Z])", "etc", text)
        text = re.sub(r"(?i)\b(y)eah?\b", r"\1e'a", text)
        text = re.sub(r"(?<=\d),(?=\d)", "", text)
        text = re.sub(r"\d*\.\d+", handle_decimal, text)
        text = re.sub(r"(?<=\d)-(?=\d)", " to ", text)
        text = re.sub(r"(?<=\d)S", " S", text)
        text = re.sub(r"(?<=[BCDFGHJ-NP-TV-Z])'?s\b", "'S", text)
        text = re.sub(r"(?<=X')S\b", "s", text)
        text = re.sub(r"(?i)(?<=[A-Z])\.(?=[A-Z])", "-", text)

        return text.strip()
def split_num(match: re.Match) -> str:
    """Format a matched number/time token for TTS playback.

    Clock times like "10:30" become "10 30"; plural years like "1990s"
    become "1990 s"; any other token is returned unchanged.
    """
    token = match.group(0)
    if ":" in token:
        hours, minutes = token.split(":")
        return f"{hours} {minutes}"
    return f"{token[:-1]} s" if token.endswith("s") else token
def handle_money(match: re.Match) -> str:
    """Replace currency symbols in a matched money string with words for TTS."""
    result = match.group(0)
    for symbol, word in (("$", " dollars "), ("£", " pounds ")):
        result = result.replace(symbol, word)
    return result
def handle_decimal(match: re.Match) -> str:
    """Spell out the decimal point of a matched number for TTS."""
    return " point ".join(match.group(0).split("."))
def benchmark_normalizers(test_cases: List[str], iterations: int = 100) -> Dict[str, float]:
    """Benchmark all three implementations.

    Args:
        test_cases: Texts to normalize.
        iterations: Number of full passes over ``test_cases`` per implementation.

    Returns:
        Mapping of implementation name ("inline", "compiled", "hybrid") to
        total elapsed wall-clock seconds.  (The original annotation claimed
        ``Tuple[float, float, float]`` but a dict is what is actually built
        and what ``main`` iterates with ``.items()``.)
    """
    normalizers = {
        'inline': TextNormalizerInline(),
        'compiled': TextNormalizerCompiled(),
        'hybrid': TextNormalizerHybrid()
    }

    results: Dict[str, float] = {}

    # Test each normalizer
    for name, normalizer in normalizers.items():
        start = time.perf_counter()

        # Run normalizations
        for _ in range(iterations):
            for test in test_cases:
                normalizer.normalize(test)

        results[name] = time.perf_counter() - start

    return results
def verify_outputs(test_cases: List[str]) -> bool:
    """Check that all three implementations normalize every case identically."""
    implementations = [
        TextNormalizerInline(),
        TextNormalizerCompiled(),
        TextNormalizerHybrid(),
    ]
    for case in test_cases:
        # If the set of distinct outputs has more than one member, at least
        # two implementations disagree.
        outputs = {impl.normalize(case) for impl in implementations}
        if len(outputs) > 1:
            return False
    return True
def main():
    """Generate test data, verify implementation parity, then benchmark."""
    # Create test cases
    print("Generating test cases...")
    test_cases = create_test_cases()
    total_chars = sum(len(t) for t in test_cases)
    print(f"Created {len(test_cases)} test cases, total size: {total_chars:,} characters")

    # Verify output consistency before timing anything — benchmarking
    # implementations that disagree would be meaningless.
    print("\nVerifying output consistency...")
    if verify_outputs(test_cases):
        print("✓ All implementations produce identical output")
    else:
        print("✗ Warning: Implementations produce different outputs!")
        return

    # Run benchmarks
    print("\nRunning benchmarks...")
    iterations = 100
    results = benchmark_normalizers(test_cases, iterations)

    # Print results
    print(f"\nResults for {iterations} iterations: ")
    for name, time_taken in results.items():
        print(f"{name.capitalize()}: {time_taken:.3f}s")


if __name__ == "__main__":
    # Guard the entry point: the original called main() unconditionally,
    # so merely importing this module ran the entire benchmark.
    main()

View file

@ -36,10 +36,7 @@ def stream_to_speakers() -> None:
model="kokoro",
voice="af",
response_format="pcm", # similar to WAV, but without a header chunk at the start.
input="""I see skies of blue and clouds of white
The bright blessed days, the dark sacred nights
And I think to myself
What a wonderful world""",
input="""My dear sir, that is just where you are wrong. That is just where the whole world has gone wrong. We are always getting away from the present moment. Our mental existences, which are immaterial and have no dimensions, are passing along the Time-Dimension with a uniform velocity from the cradle to the grave. Just as we should travel down if we began our existence fifty miles above the earths surface""",
) as response:
print(f"Time to first byte: {int((time.time() - start_time) * 1000)}ms")
for chunk in response.iter_bytes(chunk_size=1024):

BIN
examples/output.wav Normal file

Binary file not shown.

Binary file not shown.