Swapped generator to preprocessing

This commit is contained in:
remsky 2025-01-04 22:23:59 -07:00
parent e799f0c7c1
commit 4c6cd83f85
23 changed files with 955 additions and 127 deletions

BIN
.coverage

Binary file not shown.

2
.gitignore vendored
View file

@ -25,3 +25,5 @@ examples/assorted_checks/test_voices/output/*
examples/assorted_checks/test_formats/output/* examples/assorted_checks/test_formats/output/*
examples/assorted_checks/benchmarks/output_audio_stream/* examples/assorted_checks/benchmarks/output_audio_stream/*
ui/RepoScreenshot.png ui/RepoScreenshot.png
examples/assorted_checks/benchmarks/output_audio_stream_openai/*

View file

@ -24,7 +24,7 @@ async def lifespan(app: FastAPI):
# Initialize the main model with warm-up # Initialize the main model with warm-up
voicepack_count = TTSModel.setup() voicepack_count = TTSModel.setup()
# boundary = "█████╗"*9 # boundary = "█████╗"*9
boundary = "" * 54 boundary = "" * 30
startup_msg =f""" startup_msg =f"""
{boundary} {boundary}

View file

@ -57,10 +57,8 @@ async def create_speech(
"pcm": "audio/pcm", "pcm": "audio/pcm",
}.get(request.response_format, f"audio/{request.response_format}") }.get(request.response_format, f"audio/{request.response_format}")
# Check if streaming is requested via header # Check if streaming is requested (default for OpenAI client)
is_streaming = x_raw_response == "stream" if request.stream:
if is_streaming:
# Stream audio chunks as they're generated # Stream audio chunks as they're generated
return StreamingResponse( return StreamingResponse(
stream_audio_chunks(tts_service, request), stream_audio_chunks(tts_service, request),

View file

@ -49,7 +49,7 @@ def handle_decimal(num: re.Match) -> str:
a, b = num.group().split(".") a, b = num.group().split(".")
return " point ".join([a, " ".join(b)]) return " point ".join([a, " ".join(b)])
@lru_cache(maxsize=1000) # Cache normalized text results # @lru_cache(maxsize=1000) # Cache normalized text results
def normalize_text(text: str) -> str: def normalize_text(text: str) -> str:
"""Normalize text for TTS processing """Normalize text for TTS processing

View file

@ -20,11 +20,40 @@ class TTSService:
def __init__(self, output_dir: str = None): def __init__(self, output_dir: str = None):
self.output_dir = output_dir self.output_dir = output_dir
def _split_text(self, text: str) -> List[str]: def _split_text(self, text: str):
"""Split text into sentences""" """Generate text chunks one at a time, splitting on natural pause points"""
if not isinstance(text, str): if not isinstance(text, str):
text = str(text) if text is not None else "" text = str(text) if text is not None else ""
return [s.strip() for s in re.split(r"(?<=[.!?])\s+", text) if s.strip()]
# First split into sentences
sentences = re.split(r"(?<=[.!?])\s+", text)
for sentence in sentences:
sentence = sentence.strip()
if not sentence:
continue
# For longer sentences, split on commas and semicolons
if len(sentence) > 300: # Only split long sentences
# Split on pause points while preserving the punctuation
chunks = re.split(r"((?<=[,;])\s+)", sentence)
# Reassemble chunks with their trailing punctuation
current_chunk = ""
for i, chunk in enumerate(chunks):
if i % 2 == 0: # Text chunk
current_chunk += chunk
else: # Punctuation/whitespace chunk
current_chunk += chunk
if current_chunk.strip():
yield current_chunk.strip()
current_chunk = ""
# Yield any remaining text
if current_chunk.strip():
yield current_chunk.strip()
else:
yield sentence
@staticmethod @staticmethod
@lru_cache(maxsize=20) # Cache up to 8 most recently used voices @lru_cache(maxsize=20) # Cache up to 8 most recently used voices
@ -69,11 +98,11 @@ class TTSService:
# Generate audio with or without stitching # Generate audio with or without stitching
if stitch_long_output: if stitch_long_output:
chunks = self._split_text(text)
audio_chunks = [] audio_chunks = []
chunk_count = 0
# Process all chunks # Process chunks as they're generated
for i, chunk in enumerate(chunks): for chunk in self._split_text(text):
try: try:
# Process text and generate audio # Process text and generate audio
phonemes, tokens = TTSModel.process_text(chunk, voice[0]) phonemes, tokens = TTSModel.process_text(chunk, voice[0])
@ -81,23 +110,21 @@ class TTSService:
if chunk_audio is not None: if chunk_audio is not None:
audio_chunks.append(chunk_audio) audio_chunks.append(chunk_audio)
chunk_count += 1
else: else:
logger.error(f"No audio generated for chunk {i + 1}/{len(chunks)}") logger.error(f"No audio generated for chunk {chunk_count + 1}")
except Exception as e: except Exception as e:
logger.error( logger.error(
f"Failed to generate audio for chunk {i + 1}/{len(chunks)}: '{chunk}'. Error: {str(e)}" f"Failed to generate audio for chunk {chunk_count + 1}: '{chunk}'. Error: {str(e)}"
) )
continue continue
if not audio_chunks: if not audio_chunks:
raise ValueError("No audio chunks were generated successfully") raise ValueError("No audio chunks were generated successfully")
audio = ( # Concatenate all chunks
np.concatenate(audio_chunks) audio = np.concatenate(audio_chunks) if len(audio_chunks) > 1 else audio_chunks[0]
if len(audio_chunks) > 1
else audio_chunks[0]
)
else: else:
# Process single chunk # Process single chunk
phonemes, tokens = TTSModel.process_text(text, voice[0]) phonemes, tokens = TTSModel.process_text(text, voice[0])
@ -132,11 +159,9 @@ class TTSService:
raise ValueError(f"Voice not found: {voice}") raise ValueError(f"Voice not found: {voice}")
voicepack = self._load_voice(voice_path) voicepack = self._load_voice(voice_path)
# Split text into sentences for natural boundaries # Process chunks as they're generated
chunks = self._split_text(text) is_first = True
for chunk in self._split_text(text):
# Process and stream chunks
for i, chunk in enumerate(chunks):
try: try:
# Process text and generate audio # Process text and generate audio
phonemes, tokens = TTSModel.process_text(chunk, voice[0]) phonemes, tokens = TTSModel.process_text(chunk, voice[0])
@ -148,17 +173,16 @@ class TTSService:
chunk_audio, chunk_audio,
24000, 24000,
output_format, output_format,
is_first_chunk=(i == 0), is_first_chunk=is_first,
normalizer=stream_normalizer normalizer=stream_normalizer
) )
yield chunk_bytes yield chunk_bytes
is_first = False
else: else:
logger.error(f"No audio generated for chunk {i + 1}/{len(chunks)}") logger.error(f"No audio generated for chunk: '{chunk}'")
except Exception as e: except Exception as e:
logger.error( logger.error(f"Failed to generate audio for chunk: '{chunk}'. Error: {str(e)}")
f"Failed to generate audio for chunk {i + 1}/{len(chunks)}: '{chunk}'. Error: {str(e)}"
)
continue continue
except Exception as e: except Exception as e:

View file

@ -31,6 +31,6 @@ class OpenAISpeechRequest(BaseModel):
description="The speed of the generated audio. Select a value from 0.25 to 4.0.", description="The speed of the generated audio. Select a value from 0.25 to 4.0.",
) )
stream: bool = Field( stream: bool = Field(
default=False, default=True, # Default to streaming for OpenAI compatibility
description="If true, audio will be streamed as it's generated. Each chunk will be a complete sentence.", description="If true (default), audio will be streamed as it's generated. Each chunk will be a complete sentence.",
) )

View file

@ -32,8 +32,10 @@ services:
start_period: 1s start_period: 1s
kokoro-tts: kokoro-tts:
build: image: ghcr.io/remsky/kokoro-fastapi:latest
context: . # Uncomment below to build from source instead of using the released image
# build:
# context: .
volumes: volumes:
- ./api/src:/app/api/src - ./api/src:/app/api/src
- ./Kokoro-82M:/app/Kokoro-82M - ./Kokoro-82M:/app/Kokoro-82M

View file

@ -31,7 +31,7 @@ def measure_first_token(text: str, output_dir: str, tokens: int, run_number: int
"model": "kokoro", "model": "kokoro",
"input": text, "input": text,
"voice": "af", "voice": "af",
"response_format": "wav", "response_format": "pcm",
"stream": True "stream": True
}, },
stream=True, stream=True,
@ -53,33 +53,19 @@ def measure_first_token(text: str, output_dir: str, tokens: int, run_number: int
results["time_to_first_chunk"] = first_chunk_time - start_time results["time_to_first_chunk"] = first_chunk_time - start_time
chunks.append(chunk) chunks.append(chunk)
# Extract WAV header and data separately # Concatenate all PCM chunks
# First chunk has header + data, subsequent chunks are raw PCM
if not chunks: if not chunks:
raise ValueError("No audio chunks received") raise ValueError("No audio chunks received")
first_chunk = chunks[0] all_audio_data = b''.join(chunks)
remaining_chunks = chunks[1:]
# Find end of WAV header (44 bytes for standard WAV) # Write as WAV file
header = first_chunk[:44] import wave
first_data = first_chunk[44:] with wave.open(audio_path, 'wb') as wav_file:
wav_file.setnchannels(1) # Mono
# Concatenate all PCM data wav_file.setsampwidth(2) # 2 bytes per sample (16-bit)
all_data = first_data + b''.join(remaining_chunks) wav_file.setframerate(24000) # Known sample rate for Kokoro
wav_file.writeframes(all_audio_data)
# Update WAV header with total data size
import struct
data_size = len(all_data)
# Update data size field (bytes 4-7)
header = header[:4] + struct.pack('<I', data_size + 36) + header[8:]
# Update subchunk2 size field (bytes 40-43)
header = header[:40] + struct.pack('<I', data_size) + header[44:]
# Write complete WAV file
complete_audio = header + all_data
with open(audio_path, 'wb') as f:
f.write(complete_audio)
# Calculate audio length using scipy # Calculate audio length using scipy
import scipy.io.wavfile as wavfile import scipy.io.wavfile as wavfile
@ -89,7 +75,7 @@ def measure_first_token(text: str, output_dir: str, tokens: int, run_number: int
results["total_time"] = time.time() - start_time results["total_time"] = time.time() - start_time
# Print debug info # Print debug info
print(f"Complete audio size: {len(complete_audio)} bytes") print(f"Complete audio size: {len(all_audio_data)} bytes")
print(f"Number of chunks received: {len(chunks)}") print(f"Number of chunks received: {len(chunks)}")
print(f"Audio length: {results['audio_length']:.3f}s") print(f"Audio length: {results['audio_length']:.3f}s")
@ -114,7 +100,7 @@ def main():
text = f.read() text = f.read()
# Test specific token counts # Test specific token counts
token_sizes = [50, 100, 200, 500] token_sizes = [50, 100, 200, 500, 1000, 2000, 5000, 10000]
all_results = [] all_results = []
for tokens in token_sizes: for tokens in token_sizes:
@ -124,7 +110,7 @@ def main():
print(f"Text preview: {test_text[:50]}...") print(f"Text preview: {test_text[:50]}...")
# Run test 3 times for each size to get average # Run test 3 times for each size to get average
for i in range(3): for i in range(5):
print(f"Run {i+1}/3...") print(f"Run {i+1}/3...")
result = measure_first_token(test_text, output_dir, tokens, i + 1) result = measure_first_token(test_text, output_dir, tokens, i + 1)
result["target_tokens"] = tokens result["target_tokens"] = tokens
@ -194,7 +180,7 @@ def main():
plot_timeline( plot_timeline(
df, df,
os.path.join(output_plots_dir, "first_token_timeline_stream.png") os.path.join(output_plots_dir, "first_token_timeline_stream.png", suffix="(Streaming)")
) )
print("\nResults and plots saved to:") print("\nResults and plots saved to:")

View file

@ -0,0 +1,184 @@
#!/usr/bin/env python3
import os
import time
import json
import numpy as np
import pandas as pd
from openai import OpenAI
from lib.shared_benchmark_utils import get_text_for_tokens, enc
from lib.shared_utils import save_json_results
from lib.shared_plotting import plot_correlation, plot_timeline
def measure_first_token(text: str, output_dir: str, tokens: int, run_number: int) -> dict:
    """Measure time to first audio chunk via the OpenAI client and save the output.

    Streams PCM audio from a local Kokoro server, records time-to-first-chunk and
    total time, and writes the concatenated PCM out as a WAV file.

    Args:
        text: Input text to synthesize.
        output_dir: Directory where the WAV output is written.
        tokens: Target token count (used only in the output filename).
        run_number: 1-based run index (used only in the output filename).

    Returns:
        dict of results; on failure "error" holds the message and any timing
        fields that were not reached remain None.
    """
    results = {
        "text_length": len(text),
        "token_count": len(enc.encode(text)),
        "total_time": None,
        "time_to_first_chunk": None,
        "error": None,
        "audio_path": None,
        "audio_length": None  # Length of output audio in seconds
    }

    try:
        start_time = time.time()

        # Initialize OpenAI client pointed at the local server
        openai = OpenAI(base_url="http://localhost:8880/v1", api_key="not-needed-for-local")

        # Save complete audio
        audio_filename = f"benchmark_tokens{tokens}_run{run_number}_stream_openai.wav"
        audio_path = os.path.join(output_dir, audio_filename)
        results["audio_path"] = audio_path

        first_chunk_time = None
        all_audio_data = bytearray()
        chunk_count = 0

        # Make streaming request using OpenAI client
        with openai.audio.speech.with_streaming_response.create(
            model="kokoro",
            voice="af",
            response_format="pcm",
            input=text,
        ) as response:
            for chunk in response.iter_bytes(chunk_size=1024):
                if chunk:
                    chunk_count += 1
                    if first_chunk_time is None:
                        first_chunk_time = time.time()
                        results["time_to_first_chunk"] = first_chunk_time - start_time
                    all_audio_data.extend(chunk)

        # Fail loudly if nothing streamed back, matching the raw-requests
        # benchmark script's behavior, instead of writing an empty WAV.
        if not all_audio_data:
            raise ValueError("No audio chunks received")

        # Write raw PCM as a WAV file (16-bit mono; 24kHz is the known
        # sample rate for Kokoro)
        import wave
        with wave.open(audio_path, 'wb') as wav_file:
            wav_file.setnchannels(1)  # Mono
            wav_file.setsampwidth(2)  # 2 bytes per sample (16-bit)
            wav_file.setframerate(24000)  # Known sample rate for Kokoro
            wav_file.writeframes(all_audio_data)

        # Calculate audio length using scipy
        import scipy.io.wavfile as wavfile
        sample_rate, audio_data = wavfile.read(audio_path)
        results["audio_length"] = len(audio_data) / sample_rate  # Length in seconds
        results["total_time"] = time.time() - start_time

        # Print debug info
        print(f"Complete audio size: {len(all_audio_data)} bytes")
        print(f"Number of chunks received: {chunk_count}")
        print(f"Audio length: {results['audio_length']:.3f}s")

        return results

    except Exception as e:
        results["error"] = str(e)
        return results
def main():
    """Run the streaming first-token benchmark against the OpenAI-compatible API.

    For each target token size, runs 5 measurements, prints per-run timings,
    saves aggregated JSON results, and renders correlation/timeline plots.
    """
    # Set up paths with _stream_openai suffix
    script_dir = os.path.dirname(os.path.abspath(__file__))
    output_dir = os.path.join(script_dir, "output_audio_stream_openai")
    output_data_dir = os.path.join(script_dir, "output_data")

    # Create output directories
    os.makedirs(output_dir, exist_ok=True)
    os.makedirs(output_data_dir, exist_ok=True)

    # Load sample text
    with open(os.path.join(script_dir, "the_time_machine_hg_wells.txt"), "r", encoding="utf-8") as f:
        text = f.read()

    # Test specific token counts
    token_sizes = [50, 100, 200, 500]
    all_results = []

    def fmt_secs(value):
        """Format a seconds value, tolerating None (set on the error path).

        BUG FIX: the previous f"{result.get(..., 'N/A'):.3f}s" raised TypeError
        whenever the stored value was None, because .get returns the stored
        None rather than the default — masking the actual error message.
        """
        return f"{value:.3f}s" if isinstance(value, (int, float)) else "N/A"

    for tokens in token_sizes:
        print(f"\nTesting {tokens} tokens (streaming)")
        test_text = get_text_for_tokens(text, tokens)
        actual_tokens = len(enc.encode(test_text))
        print(f"Text preview: {test_text[:50]}...")

        # Run test 5 times for each size to get average
        for i in range(5):
            print(f"Run {i+1}/5...")
            result = measure_first_token(test_text, output_dir, tokens, i + 1)
            result["target_tokens"] = tokens
            result["actual_tokens"] = actual_tokens
            result["run_number"] = i + 1

            print(f"Time to First Audio: {fmt_secs(result.get('time_to_first_chunk'))}")
            print(f"Time to Save Complete: {fmt_secs(result.get('total_time'))}")
            print(f"Audio length: {fmt_secs(result.get('audio_length'))}")
            # Guard against None on the error path (None - None raises TypeError)
            overhead = (result.get('total_time') or 0) - (result.get('time_to_first_chunk') or 0)
            print(f"Streaming overhead: {overhead:.3f}s")

            if result["error"]:
                print(f"Error: {result['error']}")

            all_results.append(result)

    # Calculate averages per token size
    summary = {}
    for tokens in token_sizes:
        matching_results = [r for r in all_results if r["target_tokens"] == tokens and not r["error"]]
        if matching_results:
            avg_first_chunk = sum(r["time_to_first_chunk"] for r in matching_results) / len(matching_results)
            avg_total = sum(r["total_time"] for r in matching_results) / len(matching_results)
            avg_audio_length = sum(r["audio_length"] for r in matching_results) / len(matching_results)
            summary[tokens] = {
                "avg_time_to_first_chunk": round(avg_first_chunk, 3),
                "avg_total_time": round(avg_total, 3),
                "avg_audio_length": round(avg_audio_length, 3),
                "num_successful_runs": len(matching_results)
            }

    # Save results with _stream_openai suffix
    results_data = {
        "individual_runs": all_results,
        "summary": summary,
        "timestamp": time.strftime("%Y-%m-%d %H:%M:%S")
    }
    save_json_results(
        results_data,
        os.path.join(output_data_dir, "first_token_benchmark_stream_openai.json")
    )

    # Create plot directory if it doesn't exist
    output_plots_dir = os.path.join(script_dir, "output_plots")
    os.makedirs(output_plots_dir, exist_ok=True)

    # Create DataFrame for plotting
    df = pd.DataFrame(all_results)

    # Create plots with _stream_openai suffix
    plot_correlation(
        df, "target_tokens", "time_to_first_chunk",
        "Time to First Audio vs Input Size (OpenAI Streaming)",
        "Number of Input Tokens",
        "Time to First Audio (seconds)",
        os.path.join(output_plots_dir, "first_token_latency_stream_openai.png")
    )

    plot_correlation(
        df, "target_tokens", "total_time",
        "Total Time vs Input Size (OpenAI Streaming)",
        "Number of Input Tokens",
        "Total Time (seconds)",
        os.path.join(output_plots_dir, "total_time_latency_stream_openai.png")
    )

    plot_timeline(
        df,
        os.path.join(output_plots_dir, "first_token_timeline_stream_openai.png")
    )

    print("\nResults and plots saved to:")
    print(f"- {os.path.join(output_data_dir, 'first_token_benchmark_stream_openai.json')}")
    print(f"- {os.path.join(output_plots_dir, 'first_token_latency_stream_openai.png')}")
    print(f"- {os.path.join(output_plots_dir, 'total_time_latency_stream_openai.png')}")
    print(f"- {os.path.join(output_plots_dir, 'first_token_timeline_stream_openai.png')}")
# Script entry point: run the full benchmark when executed directly.
if __name__ == "__main__":
    main()

View file

@ -138,7 +138,7 @@ def plot_system_metrics(metrics_data, output_path):
plt.savefig(output_path, dpi=300, bbox_inches="tight") plt.savefig(output_path, dpi=300, bbox_inches="tight")
plt.close() plt.close()
def plot_timeline(df, output_path): def plot_timeline(df, output_path, suffix=""):
"""Create timeline plot showing latency for each run. """Create timeline plot showing latency for each run.
Args: Args:
@ -255,7 +255,7 @@ def plot_timeline(df, output_path):
# Customize appearance # Customize appearance
setup_plot( setup_plot(
fig, ax, fig, ax,
"Time-To-Audio Latency", "Time-To-Audio Latency" + suffix,
xlabel="Time (seconds)", xlabel="Time (seconds)",
ylabel="Input Size" ylabel="Input Size"
) )

View file

@ -3,11 +3,11 @@
{ {
"text_length": 212, "text_length": 212,
"token_count": 50, "token_count": 50,
"total_time": 0.9603095054626465, "total_time": 0.7278211116790771,
"time_to_first_chunk": 0.5916037559509277, "time_to_first_chunk": 0.3613290786743164,
"error": null, "error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens50_run1_stream.wav", "audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens50_run1_stream.wav",
"audio_length": 15.45, "audio_length": 16.325,
"target_tokens": 50, "target_tokens": 50,
"actual_tokens": 50, "actual_tokens": 50,
"run_number": 1 "run_number": 1
@ -15,11 +15,11 @@
{ {
"text_length": 212, "text_length": 212,
"token_count": 50, "token_count": 50,
"total_time": 0.5130870342254639, "total_time": 0.4556088447570801,
"time_to_first_chunk": 0.27448558807373047, "time_to_first_chunk": 0.18642044067382812,
"error": null, "error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens50_run2_stream.wav", "audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens50_run2_stream.wav",
"audio_length": 15.45, "audio_length": 16.325,
"target_tokens": 50, "target_tokens": 50,
"actual_tokens": 50, "actual_tokens": 50,
"run_number": 2 "run_number": 2
@ -27,23 +27,47 @@
{ {
"text_length": 212, "text_length": 212,
"token_count": 50, "token_count": 50,
"total_time": 0.4667215347290039, "total_time": 0.5538768768310547,
"time_to_first_chunk": 0.22882533073425293, "time_to_first_chunk": 0.2720797061920166,
"error": null, "error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens50_run3_stream.wav", "audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens50_run3_stream.wav",
"audio_length": 15.45, "audio_length": 16.325,
"target_tokens": 50, "target_tokens": 50,
"actual_tokens": 50, "actual_tokens": 50,
"run_number": 3 "run_number": 3
}, },
{
"text_length": 212,
"token_count": 50,
"total_time": 0.4395604133605957,
"time_to_first_chunk": 0.15613913536071777,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens50_run4_stream.wav",
"audio_length": 16.325,
"target_tokens": 50,
"actual_tokens": 50,
"run_number": 4
},
{
"text_length": 212,
"token_count": 50,
"total_time": 0.45748305320739746,
"time_to_first_chunk": 0.18805718421936035,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens50_run5_stream.wav",
"audio_length": 16.325,
"target_tokens": 50,
"actual_tokens": 50,
"run_number": 5
},
{ {
"text_length": 448, "text_length": 448,
"token_count": 100, "token_count": 100,
"total_time": 0.9051008224487305, "total_time": 0.7347762584686279,
"time_to_first_chunk": 0.2526383399963379, "time_to_first_chunk": 0.16963744163513184,
"error": null, "error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens100_run1_stream.wav", "audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens100_run1_stream.wav",
"audio_length": 30.25, "audio_length": 31.1,
"target_tokens": 100, "target_tokens": 100,
"actual_tokens": 100, "actual_tokens": 100,
"run_number": 1 "run_number": 1
@ -51,11 +75,11 @@
{ {
"text_length": 448, "text_length": 448,
"token_count": 100, "token_count": 100,
"total_time": 0.8579132556915283, "total_time": 0.8288509845733643,
"time_to_first_chunk": 0.25691914558410645, "time_to_first_chunk": 0.20123004913330078,
"error": null, "error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens100_run2_stream.wav", "audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens100_run2_stream.wav",
"audio_length": 30.25, "audio_length": 31.1,
"target_tokens": 100, "target_tokens": 100,
"actual_tokens": 100, "actual_tokens": 100,
"run_number": 2 "run_number": 2
@ -63,23 +87,47 @@
{ {
"text_length": 448, "text_length": 448,
"token_count": 100, "token_count": 100,
"total_time": 0.9683890342712402, "total_time": 0.7503848075866699,
"time_to_first_chunk": 0.26229000091552734, "time_to_first_chunk": 0.21662068367004395,
"error": null, "error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens100_run3_stream.wav", "audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens100_run3_stream.wav",
"audio_length": 30.25, "audio_length": 31.1,
"target_tokens": 100, "target_tokens": 100,
"actual_tokens": 100, "actual_tokens": 100,
"run_number": 3 "run_number": 3
}, },
{
"text_length": 448,
"token_count": 100,
"total_time": 0.694899320602417,
"time_to_first_chunk": 0.1966841220855713,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens100_run4_stream.wav",
"audio_length": 31.1,
"target_tokens": 100,
"actual_tokens": 100,
"run_number": 4
},
{
"text_length": 448,
"token_count": 100,
"total_time": 0.68701171875,
"time_to_first_chunk": 0.19341063499450684,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens100_run5_stream.wav",
"audio_length": 31.1,
"target_tokens": 100,
"actual_tokens": 100,
"run_number": 5
},
{ {
"text_length": 906, "text_length": 906,
"token_count": 200, "token_count": 200,
"total_time": 1.8075971603393555, "total_time": 1.6845426559448242,
"time_to_first_chunk": 0.22536945343017578, "time_to_first_chunk": 0.21096158027648926,
"error": null, "error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens200_run1_stream.wav", "audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens200_run1_stream.wav",
"audio_length": 60.75, "audio_length": 62.625,
"target_tokens": 200, "target_tokens": 200,
"actual_tokens": 200, "actual_tokens": 200,
"run_number": 1 "run_number": 1
@ -87,11 +135,11 @@
{ {
"text_length": 906, "text_length": 906,
"token_count": 200, "token_count": 200,
"total_time": 1.493518590927124, "total_time": 1.3545098304748535,
"time_to_first_chunk": 0.21502947807312012, "time_to_first_chunk": 0.18648386001586914,
"error": null, "error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens200_run2_stream.wav", "audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens200_run2_stream.wav",
"audio_length": 60.75, "audio_length": 62.625,
"target_tokens": 200, "target_tokens": 200,
"actual_tokens": 200, "actual_tokens": 200,
"run_number": 2 "run_number": 2
@ -99,23 +147,47 @@
{ {
"text_length": 906, "text_length": 906,
"token_count": 200, "token_count": 200,
"total_time": 1.4910809993743896, "total_time": 1.426060676574707,
"time_to_first_chunk": 0.21600556373596191, "time_to_first_chunk": 0.20081472396850586,
"error": null, "error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens200_run3_stream.wav", "audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens200_run3_stream.wav",
"audio_length": 60.75, "audio_length": 62.625,
"target_tokens": 200, "target_tokens": 200,
"actual_tokens": 200, "actual_tokens": 200,
"run_number": 3 "run_number": 3
}, },
{
"text_length": 906,
"token_count": 200,
"total_time": 1.4084081649780273,
"time_to_first_chunk": 0.18551135063171387,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens200_run4_stream.wav",
"audio_length": 62.625,
"target_tokens": 200,
"actual_tokens": 200,
"run_number": 4
},
{
"text_length": 906,
"token_count": 200,
"total_time": 1.4703152179718018,
"time_to_first_chunk": 0.17750859260559082,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens200_run5_stream.wav",
"audio_length": 62.625,
"target_tokens": 200,
"actual_tokens": 200,
"run_number": 5
},
{ {
"text_length": 2232, "text_length": 2232,
"token_count": 500, "token_count": 500,
"total_time": 4.223623275756836, "total_time": 4.289574384689331,
"time_to_first_chunk": 0.20010590553283691, "time_to_first_chunk": 0.1997976303100586,
"error": null, "error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens500_run1_stream.wav", "audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens500_run1_stream.wav",
"audio_length": 147.775, "audio_length": 157.875,
"target_tokens": 500, "target_tokens": 500,
"actual_tokens": 500, "actual_tokens": 500,
"run_number": 1 "run_number": 1
@ -123,11 +195,11 @@
{ {
"text_length": 2232, "text_length": 2232,
"token_count": 500, "token_count": 500,
"total_time": 3.8811349868774414, "total_time": 3.7089381217956543,
"time_to_first_chunk": 0.24638962745666504, "time_to_first_chunk": 0.25969815254211426,
"error": null, "error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens500_run2_stream.wav", "audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens500_run2_stream.wav",
"audio_length": 147.775, "audio_length": 157.875,
"target_tokens": 500, "target_tokens": 500,
"actual_tokens": 500, "actual_tokens": 500,
"run_number": 2 "run_number": 2
@ -135,41 +207,65 @@
{ {
"text_length": 2232, "text_length": 2232,
"token_count": 500, "token_count": 500,
"total_time": 4.045536994934082, "total_time": 4.138366222381592,
"time_to_first_chunk": 0.2252039909362793, "time_to_first_chunk": 0.1831505298614502,
"error": null, "error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens500_run3_stream.wav", "audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens500_run3_stream.wav",
"audio_length": 147.775, "audio_length": 157.875,
"target_tokens": 500, "target_tokens": 500,
"actual_tokens": 500, "actual_tokens": 500,
"run_number": 3 "run_number": 3
},
{
"text_length": 2232,
"token_count": 500,
"total_time": 3.980635643005371,
"time_to_first_chunk": 0.20493030548095703,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens500_run4_stream.wav",
"audio_length": 157.875,
"target_tokens": 500,
"actual_tokens": 500,
"run_number": 4
},
{
"text_length": 2232,
"token_count": 500,
"total_time": 4.1370298862457275,
"time_to_first_chunk": 0.19150757789611816,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens500_run5_stream.wav",
"audio_length": 157.875,
"target_tokens": 500,
"actual_tokens": 500,
"run_number": 5
} }
], ],
"summary": { "summary": {
"50": { "50": {
"avg_time_to_first_chunk": 0.365, "avg_time_to_first_chunk": 0.233,
"avg_total_time": 0.647, "avg_total_time": 0.527,
"avg_audio_length": 15.45, "avg_audio_length": 16.325,
"num_successful_runs": 3 "num_successful_runs": 5
}, },
"100": { "100": {
"avg_time_to_first_chunk": 0.257, "avg_time_to_first_chunk": 0.196,
"avg_total_time": 0.91, "avg_total_time": 0.739,
"avg_audio_length": 30.25, "avg_audio_length": 31.1,
"num_successful_runs": 3 "num_successful_runs": 5
}, },
"200": { "200": {
"avg_time_to_first_chunk": 0.219, "avg_time_to_first_chunk": 0.192,
"avg_total_time": 1.597, "avg_total_time": 1.469,
"avg_audio_length": 60.75, "avg_audio_length": 62.625,
"num_successful_runs": 3 "num_successful_runs": 5
}, },
"500": { "500": {
"avg_time_to_first_chunk": 0.224, "avg_time_to_first_chunk": 0.208,
"avg_total_time": 4.05, "avg_total_time": 4.051,
"avg_audio_length": 147.775, "avg_audio_length": 157.875,
"num_successful_runs": 3 "num_successful_runs": 5
} }
}, },
"timestamp": "2025-01-04 14:59:28" "timestamp": "2025-01-04 22:16:30"
} }

View file

@ -0,0 +1,271 @@
{
"individual_runs": [
{
"text_length": 212,
"token_count": 50,
"total_time": 1.149611473083496,
"time_to_first_chunk": 0.8767304420471191,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream_openai\\benchmark_tokens50_run1_stream_openai.wav",
"audio_length": 16.325,
"target_tokens": 50,
"actual_tokens": 50,
"run_number": 1
},
{
"text_length": 212,
"token_count": 50,
"total_time": 0.9325947761535645,
"time_to_first_chunk": 0.5965914726257324,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream_openai\\benchmark_tokens50_run2_stream_openai.wav",
"audio_length": 16.325,
"target_tokens": 50,
"actual_tokens": 50,
"run_number": 2
},
{
"text_length": 212,
"token_count": 50,
"total_time": 0.9205234050750732,
"time_to_first_chunk": 0.5961906909942627,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream_openai\\benchmark_tokens50_run3_stream_openai.wav",
"audio_length": 16.325,
"target_tokens": 50,
"actual_tokens": 50,
"run_number": 3
},
{
"text_length": 212,
"token_count": 50,
"total_time": 1.1321916580200195,
"time_to_first_chunk": 0.6946916580200195,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream_openai\\benchmark_tokens50_run4_stream_openai.wav",
"audio_length": 16.325,
"target_tokens": 50,
"actual_tokens": 50,
"run_number": 4
},
{
"text_length": 212,
"token_count": 50,
"total_time": 1.1146185398101807,
"time_to_first_chunk": 0.6918885707855225,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream_openai\\benchmark_tokens50_run5_stream_openai.wav",
"audio_length": 16.325,
"target_tokens": 50,
"actual_tokens": 50,
"run_number": 5
},
{
"text_length": 448,
"token_count": 100,
"total_time": 1.3645410537719727,
"time_to_first_chunk": 0.6802399158477783,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream_openai\\benchmark_tokens100_run1_stream_openai.wav",
"audio_length": 31.1,
"target_tokens": 100,
"actual_tokens": 100,
"run_number": 1
},
{
"text_length": 448,
"token_count": 100,
"total_time": 1.4154777526855469,
"time_to_first_chunk": 0.7297353744506836,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream_openai\\benchmark_tokens100_run2_stream_openai.wav",
"audio_length": 31.1,
"target_tokens": 100,
"actual_tokens": 100,
"run_number": 2
},
{
"text_length": 448,
"token_count": 100,
"total_time": 1.3589520454406738,
"time_to_first_chunk": 0.698603630065918,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream_openai\\benchmark_tokens100_run3_stream_openai.wav",
"audio_length": 31.1,
"target_tokens": 100,
"actual_tokens": 100,
"run_number": 3
},
{
"text_length": 448,
"token_count": 100,
"total_time": 1.2276430130004883,
"time_to_first_chunk": 0.6705801486968994,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream_openai\\benchmark_tokens100_run4_stream_openai.wav",
"audio_length": 31.1,
"target_tokens": 100,
"actual_tokens": 100,
"run_number": 4
},
{
"text_length": 448,
"token_count": 100,
"total_time": 1.0949454307556152,
"time_to_first_chunk": 0.5698442459106445,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream_openai\\benchmark_tokens100_run5_stream_openai.wav",
"audio_length": 31.1,
"target_tokens": 100,
"actual_tokens": 100,
"run_number": 5
},
{
"text_length": 906,
"token_count": 200,
"total_time": 1.8211240768432617,
"time_to_first_chunk": 0.6070489883422852,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream_openai\\benchmark_tokens200_run1_stream_openai.wav",
"audio_length": 62.625,
"target_tokens": 200,
"actual_tokens": 200,
"run_number": 1
},
{
"text_length": 906,
"token_count": 200,
"total_time": 1.8376774787902832,
"time_to_first_chunk": 0.6538689136505127,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream_openai\\benchmark_tokens200_run2_stream_openai.wav",
"audio_length": 62.625,
"target_tokens": 200,
"actual_tokens": 200,
"run_number": 2
},
{
"text_length": 906,
"token_count": 200,
"total_time": 1.6953792572021484,
"time_to_first_chunk": 0.5554308891296387,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream_openai\\benchmark_tokens200_run3_stream_openai.wav",
"audio_length": 62.625,
"target_tokens": 200,
"actual_tokens": 200,
"run_number": 3
},
{
"text_length": 906,
"token_count": 200,
"total_time": 1.887030839920044,
"time_to_first_chunk": 0.5866930484771729,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream_openai\\benchmark_tokens200_run4_stream_openai.wav",
"audio_length": 62.625,
"target_tokens": 200,
"actual_tokens": 200,
"run_number": 4
},
{
"text_length": 906,
"token_count": 200,
"total_time": 1.7908406257629395,
"time_to_first_chunk": 0.5897490978240967,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream_openai\\benchmark_tokens200_run5_stream_openai.wav",
"audio_length": 62.625,
"target_tokens": 200,
"actual_tokens": 200,
"run_number": 5
},
{
"text_length": 2232,
"token_count": 500,
"total_time": 4.228837013244629,
"time_to_first_chunk": 0.5315976142883301,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream_openai\\benchmark_tokens500_run1_stream_openai.wav",
"audio_length": 157.875,
"target_tokens": 500,
"actual_tokens": 500,
"run_number": 1
},
{
"text_length": 2232,
"token_count": 500,
"total_time": 4.489210367202759,
"time_to_first_chunk": 0.5261838436126709,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream_openai\\benchmark_tokens500_run2_stream_openai.wav",
"audio_length": 157.875,
"target_tokens": 500,
"actual_tokens": 500,
"run_number": 2
},
{
"text_length": 2232,
"token_count": 500,
"total_time": 4.5290446281433105,
"time_to_first_chunk": 0.6186764240264893,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream_openai\\benchmark_tokens500_run3_stream_openai.wav",
"audio_length": 157.875,
"target_tokens": 500,
"actual_tokens": 500,
"run_number": 3
},
{
"text_length": 2232,
"token_count": 500,
"total_time": 4.209261178970337,
"time_to_first_chunk": 0.5990591049194336,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream_openai\\benchmark_tokens500_run4_stream_openai.wav",
"audio_length": 157.875,
"target_tokens": 500,
"actual_tokens": 500,
"run_number": 4
},
{
"text_length": 2232,
"token_count": 500,
"total_time": 4.218762636184692,
"time_to_first_chunk": 0.5466251373291016,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream_openai\\benchmark_tokens500_run5_stream_openai.wav",
"audio_length": 157.875,
"target_tokens": 500,
"actual_tokens": 500,
"run_number": 5
}
],
"summary": {
"50": {
"avg_time_to_first_chunk": 0.691,
"avg_total_time": 1.05,
"avg_audio_length": 16.325,
"num_successful_runs": 5
},
"100": {
"avg_time_to_first_chunk": 0.67,
"avg_total_time": 1.292,
"avg_audio_length": 31.1,
"num_successful_runs": 5
},
"200": {
"avg_time_to_first_chunk": 0.599,
"avg_total_time": 1.806,
"avg_audio_length": 62.625,
"num_successful_runs": 5
},
"500": {
"avg_time_to_first_chunk": 0.564,
"avg_total_time": 4.335,
"avg_audio_length": 157.875,
"num_successful_runs": 5
}
},
"timestamp": "2025-01-04 22:18:03"
}

Binary file not shown.

Before

Width:  |  Height:  |  Size: 214 KiB

After

Width:  |  Height:  |  Size: 210 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 268 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 189 KiB

After

Width:  |  Height:  |  Size: 193 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 196 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 243 KiB

After

Width:  |  Height:  |  Size: 252 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 258 KiB

View file

@ -0,0 +1,268 @@
import random
import re
import string
import time
from typing import Dict, List, Tuple
def create_test_cases() -> List[str]:
    """Build a varied set of normalization inputs.

    Returns five fixed samples that exercise specific normalizer patterns
    (titles, money, times, acronyms, CJK punctuation), followed by nine
    random texts (three each of 100/1,000/10,000 chars) with target
    patterns spliced in.
    """

    def random_text(length: int) -> str:
        # Random filler drawn from letters, digits, and light punctuation.
        alphabet = string.ascii_letters + string.digits + " .,!?"
        return "".join(random.choice(alphabet) for _ in range(length))

    base_cases = [
        "Dr. Smith and Mr. Jones discussed the $1,234.56 million investment.",
        "Yeah, they met at 10:30 and reviewed A.B.C. documentation with Mrs. Brown etc.",
        'The temperature was 72.5 degrees (quite normal) for "this time" of year.',
        "X's and Y's properties cost £50 million in the 1990s",
        "こんにちは。今日は!",
    ]

    cases = list(base_cases)

    # Three random variations at each size, each seeded with patterns the
    # normalizers are expected to hit.
    for length in (100, 1000, 10000):
        for _ in range(3):
            sample = random_text(length)
            sample = sample.replace(sample[10:20], "Dr. Smith")
            sample = sample.replace(sample[30:40], "$1,234.56")
            sample = sample.replace(sample[50:60], "A.B.C. xyz")
            cases.append(sample)

    return cases
class TextNormalizerInline:
    """Text normalizer using inline patterns"""

    def normalize(self, text: str) -> str:
        """Normalize *text* with uncompiled per-call ``re.sub`` passes."""
        # Quote/bracket canonicalization; order matters — the bracket step
        # deliberately reuses the guillemets freed by the previous steps.
        char_steps = (
            (chr(8216), "'"), (chr(8217), "'"),
            ("«", chr(8220)), ("»", chr(8221)),
            (chr(8220), '"'), (chr(8221), '"'),
            ("(", "«"), (")", "»"),
        )
        for old, new in char_steps:
            text = text.replace(old, new)

        # CJK punctuation -> ASCII equivalent plus a trailing space
        for cjk_mark, ascii_mark in zip("、。!,:;?", ",.!,:;?"):
            text = text.replace(cjk_mark, ascii_mark + " ")

        # Ordered regex passes — the sequence mirrors the reference
        # implementation exactly, so do not reorder entries.
        substitutions = (
            (r"[^\S \n]", " "),
            (r" +", " "),
            (r"(?<=\n) +(?=\n)", ""),
            (r"\bD[Rr]\.(?= [A-Z])", "Doctor"),
            (r"\b(?:Mr\.|MR\.(?= [A-Z]))", "Mister"),
            (r"\b(?:Ms\.|MS\.(?= [A-Z]))", "Miss"),
            (r"\b(?:Mrs\.|MRS\.(?= [A-Z]))", "Mrs"),
            (r"\betc\.(?! [A-Z])", "etc"),
            (r"(?i)\b(y)eah?\b", r"\1e'a"),
            (r"\d*\.\d+|\b\d{4}s?\b|(?<!:)\b(?:[1-9]|1[0-2]):[0-5]\d\b(?!:)", split_num),
            (r"(?<=\d),(?=\d)", ""),
            (r"(?i)[$£]\d+(?:\.\d+)?(?: hundred| thousand| (?:[bm]|tr)illion)*\b|[$£]\d+\.\d\d?\b", handle_money),
            (r"\d*\.\d+", handle_decimal),
            (r"(?<=\d)-(?=\d)", " to "),
            (r"(?<=\d)S", " S"),
            (r"(?<=[BCDFGHJ-NP-TV-Z])'?s\b", "'S"),
            (r"(?<=X')S\b", "s"),
            (r"(?:[A-Za-z]\.){2,} [a-z]", lambda m: m.group().replace(".", "-")),
            (r"(?i)(?<=[A-Z])\.(?=[A-Z])", "-"),
        )
        for pattern, repl in substitutions:
            text = re.sub(pattern, repl, text)

        return text.strip()
class TextNormalizerCompiled:
    """Text normalizer using all compiled patterns"""

    def __init__(self):
        # Every regex is compiled once up front; `normalize` only runs
        # the pre-built matchers.
        self.patterns = {
            'whitespace': re.compile(r"[^\S \n]"),
            'multi_space': re.compile(r" +"),
            'newline_space': re.compile(r"(?<=\n) +(?=\n)"),
            'doctor': re.compile(r"\bD[Rr]\.(?= [A-Z])"),
            'mister': re.compile(r"\b(?:Mr\.|MR\.(?= [A-Z]))"),
            'miss': re.compile(r"\b(?:Ms\.|MS\.(?= [A-Z]))"),
            'mrs': re.compile(r"\b(?:Mrs\.|MRS\.(?= [A-Z]))"),
            'etc': re.compile(r"\betc\.(?! [A-Z])"),
            'yeah': re.compile(r"(?i)\b(y)eah?\b"),
            'numbers': re.compile(r"\d*\.\d+|\b\d{4}s?\b|(?<!:)\b(?:[1-9]|1[0-2]):[0-5]\d\b(?!:)"),
            'comma_in_number': re.compile(r"(?<=\d),(?=\d)"),
            'money': re.compile(r"(?i)[$£]\d+(?:\.\d+)?(?: hundred| thousand| (?:[bm]|tr)illion)*\b|[$£]\d+\.\d\d?\b"),
            'decimal': re.compile(r"\d*\.\d+"),
            'range': re.compile(r"(?<=\d)-(?=\d)"),
            's_after_number': re.compile(r"(?<=\d)S"),
            'possessive_s': re.compile(r"(?<=[BCDFGHJ-NP-TV-Z])'?s\b"),
            'x_possessive': re.compile(r"(?<=X')S\b"),
            'initials': re.compile(r"(?:[A-Za-z]\.){2,} [a-z]"),
            'single_initial': re.compile(r"(?i)(?<=[A-Z])\.(?=[A-Z])")
        }

    def normalize(self, text: str) -> str:
        """Normalize *text* using the pre-compiled pattern table."""
        # Quote/bracket canonicalization — sequence is significant.
        char_steps = (
            (chr(8216), "'"), (chr(8217), "'"),
            ("«", chr(8220)), ("»", chr(8221)),
            (chr(8220), '"'), (chr(8221), '"'),
            ("(", "«"), (")", "»"),
        )
        for old, new in char_steps:
            text = text.replace(old, new)

        # CJK punctuation -> ASCII equivalent plus a trailing space
        for cjk_mark, ascii_mark in zip("、。!,:;?", ",.!,:;?"):
            text = text.replace(cjk_mark, ascii_mark + " ")

        # Apply the compiled patterns in the reference order — do not
        # reorder, the substitutions are not commutative.
        plan = (
            ('whitespace', " "),
            ('multi_space', " "),
            ('newline_space', ""),
            ('doctor', "Doctor"),
            ('mister', "Mister"),
            ('miss', "Miss"),
            ('mrs', "Mrs"),
            ('etc', "etc"),
            ('yeah', r"\1e'a"),
            ('numbers', split_num),
            ('comma_in_number', ""),
            ('money', handle_money),
            ('decimal', handle_decimal),
            ('range', " to "),
            ('s_after_number', " S"),
            ('possessive_s', "'S"),
            ('x_possessive', "s"),
            ('initials', lambda m: m.group().replace(".", "-")),
            ('single_initial', "-"),
        )
        for key, repl in plan:
            text = self.patterns[key].sub(repl, text)

        return text.strip()
class TextNormalizerHybrid:
    """Text normalizer using hybrid approach - compile only complex/frequent patterns"""

    def __init__(self):
        # Pre-compile only the heavyweight patterns; the simple ones stay
        # inline in `normalize`.
        self.patterns = {
            'whitespace': re.compile(r"[^\S \n]"),
            'numbers': re.compile(r"\d*\.\d+|\b\d{4}s?\b|(?<!:)\b(?:[1-9]|1[0-2]):[0-5]\d\b(?!:)"),
            'money': re.compile(r"(?i)[$£]\d+(?:\.\d+)?(?: hundred| thousand| (?:[bm]|tr)illion)*\b|[$£]\d+\.\d\d?\b"),
            'initials': re.compile(r"(?:[A-Za-z]\.){2,} [a-z]")
        }

    def normalize(self, text: str) -> str:
        """Normalize *text* with compiled passes first, then inline ones."""
        # Quote/bracket canonicalization — sequence is significant.
        char_steps = (
            (chr(8216), "'"), (chr(8217), "'"),
            ("«", chr(8220)), ("»", chr(8221)),
            (chr(8220), '"'), (chr(8221), '"'),
            ("(", "«"), (")", "»"),
        )
        for old, new in char_steps:
            text = text.replace(old, new)

        # CJK punctuation -> ASCII equivalent plus a trailing space
        for cjk_mark, ascii_mark in zip("、。!,:;?", ",.!,:;?"):
            text = text.replace(cjk_mark, ascii_mark + " ")

        # Compiled passes first — this matches the reference ordering,
        # which differs from the all-inline variant.
        text = self.patterns['whitespace'].sub(" ", text)
        text = self.patterns['numbers'].sub(split_num, text)
        text = self.patterns['money'].sub(handle_money, text)
        text = self.patterns['initials'].sub(lambda m: m.group().replace(".", "-"), text)

        # Remaining simple substitutions, applied inline in order.
        inline_steps = (
            (r" +", " "),
            (r"(?<=\n) +(?=\n)", ""),
            (r"\bD[Rr]\.(?= [A-Z])", "Doctor"),
            (r"\b(?:Mr\.|MR\.(?= [A-Z]))", "Mister"),
            (r"\b(?:Ms\.|MS\.(?= [A-Z]))", "Miss"),
            (r"\b(?:Mrs\.|MRS\.(?= [A-Z]))", "Mrs"),
            (r"\betc\.(?! [A-Z])", "etc"),
            (r"(?i)\b(y)eah?\b", r"\1e'a"),
            (r"(?<=\d),(?=\d)", ""),
            (r"\d*\.\d+", handle_decimal),
            (r"(?<=\d)-(?=\d)", " to "),
            (r"(?<=\d)S", " S"),
            (r"(?<=[BCDFGHJ-NP-TV-Z])'?s\b", "'S"),
            (r"(?<=X')S\b", "s"),
            (r"(?i)(?<=[A-Z])\.(?=[A-Z])", "-"),
        )
        for pattern, repl in inline_steps:
            text = re.sub(pattern, repl, text)

        return text.strip()
def split_num(match: re.Match) -> str:
    """Split numbers for TTS processing.

    Times ("10:30") become "10 30"; plural years ("1990s") become
    "1990 s"; anything else is returned unchanged.
    """
    token = match.group(0)
    if ":" in token:
        hours, _, minutes = token.partition(":")
        return f"{hours} {minutes}"
    if token.endswith("s"):
        return f"{token[:-1]} s"
    return token
def handle_money(match: re.Match) -> str:
    """Format money strings for TTS.

    Replaces each currency symbol with its spoken word, padded with
    spaces (the symbol's position relative to the digits is preserved).
    """
    spoken = match.group(0)
    for symbol, word in (("$", " dollars "), ("£", " pounds ")):
        spoken = spoken.replace(symbol, word)
    return spoken
def handle_decimal(match: re.Match) -> str:
    """Format decimal numbers for TTS: each '.' becomes ' point '."""
    return " point ".join(match.group(0).split("."))
def benchmark_normalizers(test_cases: List[str], iterations: int = 100) -> Dict[str, float]:
    """Benchmark all three normalizer implementations.

    Args:
        test_cases: Texts normalized on every iteration.
        iterations: Number of full passes over ``test_cases`` per
            implementation.

    Returns:
        Mapping of implementation name ('inline', 'compiled', 'hybrid')
        to total wall-clock seconds spent normalizing.

    Note: the original signature declared ``Tuple[float, float, float]``
    but the function has always returned this dict; the annotation is
    corrected to match the actual behavior.
    """
    normalizers = {
        'inline': TextNormalizerInline(),
        'compiled': TextNormalizerCompiled(),
        'hybrid': TextNormalizerHybrid()
    }

    results = {}

    # Time each normalizer over the identical workload.
    for name, normalizer in normalizers.items():
        start = time.perf_counter()

        for _ in range(iterations):
            for test in test_cases:
                normalizer.normalize(test)

        results[name] = time.perf_counter() - start

    return results
def verify_outputs(test_cases: List[str]) -> bool:
    """Return True when all three implementations agree on every input."""
    implementations = (
        TextNormalizerInline(),
        TextNormalizerCompiled(),
        TextNormalizerHybrid(),
    )
    for sample in test_cases:
        # Collapsing results into a set: more than one distinct output
        # means at least two implementations disagree.
        outputs = {impl.normalize(sample) for impl in implementations}
        if len(outputs) > 1:
            return False
    return True
def main() -> None:
    """Generate test cases, verify implementation parity, then benchmark.

    Aborts (after printing a warning) if the three implementations do not
    produce identical output, since timing divergent code is meaningless.
    """
    print("Generating test cases...")
    test_cases = create_test_cases()
    total_chars = sum(len(t) for t in test_cases)
    print(f"Created {len(test_cases)} test cases, total size: {total_chars:,} characters")

    print("\nVerifying output consistency...")
    if verify_outputs(test_cases):
        print("✓ All implementations produce identical output")
    else:
        print("✗ Warning: Implementations produce different outputs!")
        return

    print("\nRunning benchmarks...")
    iterations = 100
    results = benchmark_normalizers(test_cases, iterations)

    print(f"\nResults for {iterations} iterations: ")
    for name, time_taken in results.items():
        print(f"{name.capitalize()}: {time_taken:.3f}s")


# Guard the entry point: the original called main() unconditionally,
# which ran the full benchmark as a side effect of importing this module.
if __name__ == "__main__":
    main()

View file

@ -36,10 +36,7 @@ def stream_to_speakers() -> None:
model="kokoro", model="kokoro",
voice="af", voice="af",
response_format="pcm", # similar to WAV, but without a header chunk at the start. response_format="pcm", # similar to WAV, but without a header chunk at the start.
input="""I see skies of blue and clouds of white input="""My dear sir, that is just where you are wrong. That is just where the whole world has gone wrong. We are always getting away from the present moment. Our mental existences, which are immaterial and have no dimensions, are passing along the Time-Dimension with a uniform velocity from the cradle to the grave. Just as we should travel down if we began our existence fifty miles above the earths surface""",
The bright blessed days, the dark sacred nights
And I think to myself
What a wonderful world""",
) as response: ) as response:
print(f"Time to first byte: {int((time.time() - start_time) * 1000)}ms") print(f"Time to first byte: {int((time.time() - start_time) * 1000)}ms")
for chunk in response.iter_bytes(chunk_size=1024): for chunk in response.iter_bytes(chunk_size=1024):

BIN
examples/output.wav Normal file

Binary file not shown.

Binary file not shown.