- Update soundfile version
- Align with streaming standards
- Add audio processing config settings
- More comprehensive model warmup
- Minor model improvements
- Enhanced testing and benchmarking
- Cool ASCII logo
This commit is contained in:
remsky 2025-01-06 03:32:41 -07:00
parent 4c6cd83f85
commit 720c1fb97d
77 changed files with 2945 additions and 5522 deletions

BIN .coverage (binary file not shown)


@@ -129,7 +129,7 @@ response = requests.post(
)
```
<p align="center">
<img src="examples/benchmarks/analysis_comparison.png" width="80%" alt="Voice Analysis Comparison" style="border: 2px solid #333; padding: 10px;">
<img src="assets/voice_analysis.png" width="80%" alt="Voice Analysis Comparison" style="border: 2px solid #333; padding: 10px;">
</p>
</details>
@@ -144,7 +144,7 @@ response = requests.post(
- pcm
<p align="center">
<img src="examples/benchmarks/format_comparison.png" width="80%" alt="Audio Format Comparison" style="border: 2px solid #333; padding: 10px;">
<img src="assets/format_comparison.png" width="80%" alt="Audio Format Comparison" style="border: 2px solid #333; padding: 10px;">
</p>
</details>
@@ -175,8 +175,8 @@ Benchmarking was performed on generation via the local API using text lengths up
- H.G. Wells - The Time Machine (full text)
<p align="center">
<img src="examples/benchmarks/processing_time.png" width="45%" alt="Processing Time" style="border: 2px solid #333; padding: 10px; margin-right: 1%;">
<img src="examples/benchmarks/realtime_factor.png" width="45%" alt="Realtime Factor" style="border: 2px solid #333; padding: 10px;">
<img src="assets/gpu_processing_time.png" width="45%" alt="Processing Time" style="border: 2px solid #333; padding: 10px; margin-right: 1%;">
<img src="assets/gpu_realtime_factor.png" width="45%" alt="Realtime Factor" style="border: 2px solid #333; padding: 10px;">
</p>
Key Performance Metrics:


@@ -18,6 +18,8 @@ class Settings(BaseSettings):
onnx_model_path: str = "kokoro-v0_19.onnx"
voices_dir: str = "voices"
sample_rate: int = 24000
max_chunk_size: int = 300 # Maximum size of text chunks for processing
gap_trim_ms: int = 250 # Amount to trim from streaming chunk ends in milliseconds
# ONNX Optimization Settings
onnx_num_threads: int = 4 # Number of threads for intra-op parallelism
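A minimal sketch of how the new knobs can be tuned per deployment, assuming the standard pydantic `BaseSettings` environment-variable override (field names from the diff; env values hypothetical):

```python
# e.g. MAX_CHUNK_SIZE=200 GAP_TRIM_MS=100 uvicorn api.src.main:app
from api.src.core.config import settings  # assumed absolute path; the diff itself uses relative imports

print(settings.max_chunk_size)  # 300 unless MAX_CHUNK_SIZE is set in the env
print(settings.gap_trim_ms)     # 250 unless GAP_TRIM_MS is set in the env
```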


@@ -0,0 +1,9 @@
In a village of La Mancha, the name of which I have no desire to call
to mind, there lived not long since one of those gentlemen that keep a
lance in the lance-rack, an old buckler, a lean hack, and a greyhound
for coursing. An olla of rather more beef than mutton, a salad on most
nights, scraps on Saturdays, lentils on Fridays, and a pigeon or so
extra on Sundays, made away with three-quarters of his income. The rest
of it went in a doublet of fine cloth and velvet breeches and shoes to
match for holidays, while on week-days he made a brave figure in his
best homespun.


@@ -22,10 +22,11 @@ async def lifespan(app: FastAPI):
logger.info("Loading TTS model and voice packs...")
# Initialize the main model with warm-up
voicepack_count = TTSModel.setup()
voicepack_count = await TTSModel.setup()
# boundary = "█████╗"*9
boundary = "" * 30
boundary = "" * 24
startup_msg = f"""
{boundary}
@@ -37,8 +38,9 @@ async def lifespan(app: FastAPI):
{boundary}
"""
startup_msg += f"\nModel loaded and warmed up on {TTSModel.get_device()}"
startup_msg += f"\n{voicepack_count} voice packs loaded successfully\n"
# TODO: Improve CPU warmup, threads, memory, etc
startup_msg += f"\nModel warmed up on {TTSModel.get_device()}"
startup_msg += f"\n{voicepack_count} voice packs loaded\n"
startup_msg += f"\n{boundary}\n"
logger.info(startup_msg)


@@ -83,8 +83,8 @@ async def create_speech(
audio,
24000,
request.response_format,
is_first_chunk=True
)
is_first_chunk=True,
stream=False)
return Response(
content=content,


@@ -4,22 +4,30 @@ from io import BytesIO
import numpy as np
import soundfile as sf
import scipy.io.wavfile as wavfile
from loguru import logger
from ..core.config import settings
class AudioNormalizer:
"""Handles audio normalization state for a single stream"""
def __init__(self):
self.int16_max = np.iinfo(np.int16).max
self.chunk_trim_ms = settings.gap_trim_ms
self.sample_rate = 24000 # Sample rate of the audio
self.samples_to_trim = int(self.chunk_trim_ms * self.sample_rate / 1000)
def normalize(self, audio_data: np.ndarray) -> np.ndarray:
"""Normalize audio data to int16 range"""
def normalize(self, audio_data: np.ndarray, is_last_chunk: bool = False) -> np.ndarray:
"""Normalize audio data to int16 range and trim chunk boundaries"""
# Convert to float32 if not already
audio_float = audio_data.astype(np.float32)
# Normalize to [-1, 1] range first
if np.max(np.abs(audio_float)) > 0:
audio_float = audio_float / np.max(np.abs(audio_float))
# Trim end of non-final chunks to reduce gaps
if not is_last_chunk and len(audio_float) > self.samples_to_trim:
audio_float = audio_float[:-self.samples_to_trim]
# Scale to int16 range
return (audio_float * self.int16_max).astype(np.int16)
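A quick sanity sketch of the trim arithmetic above, assuming the defaults from this diff (gap_trim_ms=250 at 24 kHz, i.e. 6000 samples off each non-final chunk):

```python
import numpy as np

# AudioNormalizer as defined above; samples_to_trim = 250 * 24000 / 1000 = 6000
normalizer = AudioNormalizer()
chunk = np.random.uniform(-1.0, 1.0, 24000).astype(np.float32)  # 1 s of audio

mid = normalizer.normalize(chunk, is_last_chunk=False)
last = normalizer.normalize(chunk, is_last_chunk=True)
assert mid.dtype == np.int16 and len(mid) == 24000 - 6000  # tail trimmed
assert len(last) == 24000  # final chunk keeps its tail
```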
@@ -27,13 +35,30 @@ class AudioNormalizer:
class AudioService:
"""Service for audio format conversions"""
# Default audio format settings balanced for speed and compression
DEFAULT_SETTINGS = {
"mp3": {
"bitrate_mode": "CONSTANT", # Faster than variable bitrate
"compression_level": 0.0, # Balanced compression
},
"opus": {
"compression_level": 0.0, # Good balance for speech
},
"flac": {
"compression_level": 0.0, # Light compression, still fast
}
}
@staticmethod
def convert_audio(
audio_data: np.ndarray,
sample_rate: int,
output_format: str,
is_first_chunk: bool = True,
normalizer: AudioNormalizer = None
is_last_chunk: bool = False,
normalizer: AudioNormalizer = None,
format_settings: dict = None,
stream: bool = True
) -> bytes:
"""Convert audio data to specified format
@@ -42,6 +67,19 @@ class AudioService:
sample_rate: Sample rate of the audio
output_format: Target format (wav, mp3, opus, flac, pcm)
is_first_chunk: Whether this is the first chunk of a stream
normalizer: Optional AudioNormalizer instance for consistent normalization across chunks
format_settings: Optional dict of format-specific settings to override defaults
Example: {
"mp3": {
"bitrate_mode": "VARIABLE",
"compression_level": 0.8
}
}
Default settings balance speed and compression,
optimized for low-latency localhost use (compression level 0.0):
- MP3: constant bitrate, no compression (0.0)
- OPUS: no compression (0.0)
- FLAC: no compression (0.0)
Returns:
Bytes of the converted audio
@@ -50,31 +88,48 @@ class AudioService:
try:
# Always normalize audio to ensure proper amplitude scaling
if normalizer is None:
normalizer = AudioNormalizer()
normalized_audio = normalizer.normalize(audio_data)
if stream:
if normalizer is None:
normalizer = AudioNormalizer()
normalized_audio = normalizer.normalize(audio_data, is_last_chunk=is_last_chunk)
else:
normalized_audio = audio_data
if output_format == "pcm":
logger.info("Writing PCM data...")
# Raw 16-bit PCM samples, no header
buffer.write(normalized_audio.tobytes())
elif output_format == "wav":
logger.info("Writing to WAV format...")
# Always include WAV header for WAV format
sf.write(buffer, normalized_audio, sample_rate, format="WAV", subtype='PCM_16')
if stream:
# Use soundfile for streaming to ensure proper headers
sf.write(buffer, normalized_audio, sample_rate, format="WAV", subtype='PCM_16')
else:
# Trying scipy.io.wavfile for non-streaming WAV generation
# seems faster than soundfile
# avoids overhead from header generation and PCM encoding
wavfile.write(buffer, sample_rate, normalized_audio)
elif output_format == "mp3":
logger.info("Converting to MP3 format...")
# Use lower bitrate for streaming
sf.write(buffer, normalized_audio, sample_rate, format="MP3")
# Use format settings or defaults
settings = format_settings.get("mp3", {}) if format_settings else {}
settings = {**AudioService.DEFAULT_SETTINGS["mp3"], **settings}
sf.write(
buffer, normalized_audio,
sample_rate, format="MP3",
**settings
)
elif output_format == "opus":
logger.info("Converting to Opus format...")
# Use lower bitrate and smaller frame size for streaming
sf.write(buffer, normalized_audio, sample_rate, format="OGG", subtype="OPUS")
settings = format_settings.get("opus", {}) if format_settings else {}
settings = {**AudioService.DEFAULT_SETTINGS["opus"], **settings}
sf.write(buffer, normalized_audio, sample_rate, format="OGG",
subtype="OPUS", **settings)
elif output_format == "flac":
logger.info("Converting to FLAC format...")
# Use smaller block size for streaming
if is_first_chunk:
logger.info("Starting FLAC stream...")
settings = format_settings.get("flac", {}) if format_settings else {}
settings = {**AudioService.DEFAULT_SETTINGS["flac"], **settings}
sf.write(buffer, normalized_audio, sample_rate, format="FLAC",
subtype='PCM_16')
subtype='PCM_16', **settings)
else:
if output_format == "aac":
raise ValueError(
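Hedged usage sketch of the new convert_audio signature, merging a caller override into the defaults the same way the code above does; assumes a soundfile build with MP3 support (hence the version bump in this commit):

```python
import numpy as np

# AudioService as defined above; dummy 1 s clip at 24 kHz
audio = np.zeros(24000, dtype=np.int16)

mp3_bytes = AudioService.convert_audio(
    audio, 24000, "mp3",
    format_settings={"mp3": {"bitrate_mode": "VARIABLE", "compression_level": 0.8}},
    stream=False,  # one-shot conversion: skips per-chunk normalization/trimming
)
```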


@@ -0,0 +1,52 @@
"""Text chunking service"""
import re
from ...core.config import settings
def split_text(text: str, max_chunk=None):
"""Split text into chunks on natural pause points
Args:
text: Text to split into chunks
max_chunk: Maximum chunk size (defaults to settings.max_chunk_size)
"""
if max_chunk is None:
max_chunk = settings.max_chunk_size
if not isinstance(text, str):
text = str(text) if text is not None else ""
text = text.strip()
if not text:
return
# First split into sentences
sentences = re.split(r"(?<=[.!?])\s+", text)
for sentence in sentences:
sentence = sentence.strip()
if not sentence:
continue
# For sentences longer than max_chunk, split on intermediate punctuation
if len(sentence) > max_chunk:  # keeps chunk sizes more consistent
# First try splitting on semicolons and colons
parts = re.split(r"(?<=[;:])\s+", sentence)
for part in parts:
part = part.strip()
if not part:
continue
# If part is still long, split on commas
if len(part) > max_chunk:
subparts = re.split(r"(?<=,)\s+", part)
for subpart in subparts:
subpart = subpart.strip()
if subpart:
yield subpart
else:
yield part
else:
yield sentence
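For reference, a rough trace of the generator on a small input (split_text from the chunker above; output checked by hand against that logic):

```python
text = "One. Two, with a clause; and three!"
print(list(split_text(text, max_chunk=10)))
# ['One.', 'Two,', 'with a clause;', 'and three!']
# sentences split first, then ;/:, then commas once a piece exceeds max_chunk
```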


@@ -15,7 +15,7 @@ class TTSBaseModel(ABC):
VOICES_DIR = os.path.join(os.path.dirname(os.path.dirname(__file__)), "voices")
@classmethod
def setup(cls):
async def setup(cls):
"""Initialize model and setup voices"""
with cls._lock:
# Set device
@@ -59,19 +59,23 @@ class TTSBaseModel(ABC):
except Exception as e:
logger.error(f"Error copying voice {voice_name}: {str(e)}")
# Warm up with default voice
# Load warmup text
try:
dummy_text = "Hello"
voice_path = os.path.join(cls.VOICES_DIR, "af.pt")
dummy_voicepack = torch.load(voice_path, map_location=cls._device, weights_only=True)
# Process text and generate audio
phonemes, tokens = cls.process_text(dummy_text, "a")
cls.generate_from_tokens(tokens, dummy_voicepack, 1.0)
logger.info("Model warm-up complete")
with open(os.path.join(os.path.dirname(os.path.dirname(__file__)), "core", "don_quixote.txt")) as f:
warmup_text = f.read()
except Exception as e:
logger.warning(f"Model warm-up failed: {e}")
logger.warning(f"Failed to load warmup text: {e}")
warmup_text = "This is a warmup text that will be split into chunks for processing."
# Use warmup service
from .warmup import WarmupService
warmup = WarmupService()
# Load and warm up voices
loaded_voices = warmup.load_voices()
await warmup.warmup_voices(warmup_text, loaded_voices)
logger.info("Model warm-up complete")
# Count voices in directory
voice_count = len([f for f in os.listdir(cls.VOICES_DIR) if f.endswith(".pt")])


@@ -1,6 +1,7 @@
import os
import numpy as np
import torch
import time
from loguru import logger
from models import build_model
from .text_processing import phonemize, tokenize
@@ -8,42 +9,97 @@ from .text_processing import phonemize, tokenize
from .tts_base import TTSBaseModel
from ..core.config import settings
# @torch.no_grad()
# def forward(model, tokens, ref_s, speed):
# """Forward pass through the model"""
# device = ref_s.device
# tokens = torch.LongTensor([[0, *tokens, 0]]).to(device)
# input_lengths = torch.LongTensor([tokens.shape[-1]]).to(device)
# text_mask = length_to_mask(input_lengths).to(device)
# bert_dur = model.bert(tokens, attention_mask=(~text_mask).int())
# d_en = model.bert_encoder(bert_dur).transpose(-1, -2)
# s = ref_s[:, 128:]
# d = model.predictor.text_encoder(d_en, s, input_lengths, text_mask)
# x, _ = model.predictor.lstm(d)
# duration = model.predictor.duration_proj(x)
# duration = torch.sigmoid(duration).sum(axis=-1) / speed
# pred_dur = torch.round(duration).clamp(min=1).long()
# pred_aln_trg = torch.zeros(input_lengths, pred_dur.sum().item())
# c_frame = 0
# for i in range(pred_aln_trg.size(0)):
# pred_aln_trg[i, c_frame : c_frame + pred_dur[0, i].item()] = 1
# c_frame += pred_dur[0, i].item()
# en = d.transpose(-1, -2) @ pred_aln_trg.unsqueeze(0).to(device)
# F0_pred, N_pred = model.predictor.F0Ntrain(en, s)
# t_en = model.text_encoder(tokens, input_lengths, text_mask)
# asr = t_en @ pred_aln_trg.unsqueeze(0).to(device)
# return model.decoder(asr, F0_pred, N_pred, ref_s[:, :128]).squeeze().cpu().numpy()
@torch.no_grad()
def forward(model, tokens, ref_s, speed):
"""Forward pass through the model"""
"""Forward pass through the model with light optimizations that preserve output quality"""
device = ref_s.device
# Keep original token handling but optimize device placement
tokens = torch.LongTensor([[0, *tokens, 0]]).to(device)
input_lengths = torch.LongTensor([tokens.shape[-1]]).to(device)
text_mask = length_to_mask(input_lengths).to(device)
# BERT and encoder pass
bert_dur = model.bert(tokens, attention_mask=(~text_mask).int())
d_en = model.bert_encoder(bert_dur).transpose(-1, -2)
s = ref_s[:, 128:]
d = model.predictor.text_encoder(d_en, s, input_lengths, text_mask)
# Split reference signal once for efficiency
s_content = ref_s[:, 128:]
s_ref = ref_s[:, :128]
# Predictor forward pass
d = model.predictor.text_encoder(d_en, s_content, input_lengths, text_mask)
x, _ = model.predictor.lstm(d)
# Duration prediction - keeping original logic
duration = model.predictor.duration_proj(x)
duration = torch.sigmoid(duration).sum(axis=-1) / speed
pred_dur = torch.round(duration).clamp(min=1).long()
pred_aln_trg = torch.zeros(input_lengths, pred_dur.sum().item())
# Alignment matrix construction - keeping original approach for quality
pred_aln_trg = torch.zeros(input_lengths, pred_dur.sum().item(), device=device)
c_frame = 0
for i in range(pred_aln_trg.size(0)):
pred_aln_trg[i, c_frame : c_frame + pred_dur[0, i].item()] = 1
pred_aln_trg[i, c_frame:c_frame + pred_dur[0, i].item()] = 1
c_frame += pred_dur[0, i].item()
en = d.transpose(-1, -2) @ pred_aln_trg.unsqueeze(0).to(device)
F0_pred, N_pred = model.predictor.F0Ntrain(en, s)
# Matrix multiplications - reuse unsqueezed tensor
pred_aln_trg = pred_aln_trg.unsqueeze(0) # Do unsqueeze once
en = d.transpose(-1, -2) @ pred_aln_trg
F0_pred, N_pred = model.predictor.F0Ntrain(en, s_content)
# Text encoding and final decoding
t_en = model.text_encoder(tokens, input_lengths, text_mask)
asr = t_en @ pred_aln_trg.unsqueeze(0).to(device)
return model.decoder(asr, F0_pred, N_pred, ref_s[:, :128]).squeeze().cpu().numpy()
asr = t_en @ pred_aln_trg
return model.decoder(asr, F0_pred, N_pred, s_ref).squeeze().cpu().numpy()
# def length_to_mask(lengths):
# """Create attention mask from lengths"""
# mask = (
# torch.arange(lengths.max())
# .unsqueeze(0)
# .expand(lengths.shape[0], -1)
# .type_as(lengths)
# )
# mask = torch.gt(mask + 1, lengths.unsqueeze(1))
# return mask
def length_to_mask(lengths):
"""Create attention mask from lengths"""
mask = (
torch.arange(lengths.max())
.unsqueeze(0)
.expand(lengths.shape[0], -1)
.type_as(lengths)
)
mask = torch.gt(mask + 1, lengths.unsqueeze(1))
return mask
"""Create attention mask from lengths - possibly optimized version"""
max_len = lengths.max()
# Create mask directly on the same device as lengths
mask = torch.arange(max_len, device=lengths.device)[None, :].expand(lengths.shape[0], -1)
# Avoid type_as by using the correct dtype from the start
if lengths.dtype != mask.dtype:
mask = mask.to(dtype=lengths.dtype)
# Fuse operations using broadcasting
return mask + 1 > lengths[:, None]
class TTSGPUModel(TTSBaseModel):
_instance = None
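A small equivalence check between the commented-out mask construction and the rewritten one (both mark padding positions True); a sketch only:

```python
import torch

lengths = torch.tensor([3, 5])
max_len = lengths.max()

old = torch.gt(
    torch.arange(max_len).unsqueeze(0).expand(lengths.shape[0], -1).type_as(lengths) + 1,
    lengths.unsqueeze(1),
)
new = torch.arange(max_len, device=lengths.device)[None, :].expand(lengths.shape[0], -1) + 1 > lengths[:, None]

assert torch.equal(old, new)  # padding masks agree
```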


@@ -8,7 +8,7 @@ from functools import lru_cache
import numpy as np
import torch
import scipy.io.wavfile as wavfile
from .text_processing import normalize_text
from .text_processing import normalize_text, chunker
from loguru import logger
from ..core.config import settings
@@ -20,40 +20,6 @@ class TTSService:
def __init__(self, output_dir: str = None):
self.output_dir = output_dir
def _split_text(self, text: str):
"""Generate text chunks one at a time, splitting on natural pause points"""
if not isinstance(text, str):
text = str(text) if text is not None else ""
# First split into sentences
sentences = re.split(r"(?<=[.!?])\s+", text)
for sentence in sentences:
sentence = sentence.strip()
if not sentence:
continue
# For longer sentences, split on commas and semicolons
if len(sentence) > 300: # Only split long sentences
# Split on pause points while preserving the punctuation
chunks = re.split(r"((?<=[,;])\s+)", sentence)
# Reassemble chunks with their trailing punctuation
current_chunk = ""
for i, chunk in enumerate(chunks):
if i % 2 == 0: # Text chunk
current_chunk += chunk
else: # Punctuation/whitespace chunk
current_chunk += chunk
if current_chunk.strip():
yield current_chunk.strip()
current_chunk = ""
# Yield any remaining text
if current_chunk.strip():
yield current_chunk.strip()
else:
yield sentence
@staticmethod
@lru_cache(maxsize=20)  # Cache up to 20 most recently used voices
@@ -96,28 +62,32 @@ class TTSService:
# Load voice using cached loader
voicepack = self._load_voice(voice_path)
# Generate audio with or without stitching
# For non-streaming, preprocess all chunks first
if stitch_long_output:
audio_chunks = []
chunk_count = 0
# Process chunks as they're generated
for chunk in self._split_text(text):
# Preprocess all chunks to phonemes/tokens
chunks_data = []
for chunk in chunker.split_text(text):
try:
# Process text and generate audio
phonemes, tokens = TTSModel.process_text(chunk, voice[0])
chunks_data.append((chunk, tokens))
except Exception as e:
logger.error(f"Failed to process chunk: '{chunk}'. Error: {str(e)}")
continue
if not chunks_data:
raise ValueError("No chunks were processed successfully")
# Generate audio for all chunks
audio_chunks = []
for chunk, tokens in chunks_data:
try:
chunk_audio = TTSModel.generate_from_tokens(tokens, voicepack, speed)
if chunk_audio is not None:
audio_chunks.append(chunk_audio)
chunk_count += 1
else:
logger.error(f"No audio generated for chunk {chunk_count + 1}")
logger.error(f"No audio generated for chunk: '{chunk}'")
except Exception as e:
logger.error(
f"Failed to generate audio for chunk {chunk_count + 1}: '{chunk}'. Error: {str(e)}"
)
logger.error(f"Failed to generate audio for chunk: '{chunk}'. Error: {str(e)}")
continue
if not audio_chunks:
@@ -138,53 +108,93 @@ class TTSService:
raise
async def generate_audio_stream(
self, text: str, voice: str, speed: float, output_format: str = "wav"
self, text: str, voice: str, speed: float, output_format: str = "wav", silent=False
):
"""Generate and yield audio chunks as they're generated for real-time streaming"""
try:
stream_start = time.time()
# Create normalizer for consistent audio levels
stream_normalizer = AudioNormalizer()
# Input validation and preprocessing
if not text:
raise ValueError("Text is empty")
preprocess_start = time.time()
normalized = normalize_text(text)
if not normalized:
raise ValueError("Text is empty after preprocessing")
text = str(normalized)
logger.debug(f"Text preprocessing took: {(time.time() - preprocess_start)*1000:.1f}ms")
# Voice validation and loading
voice_start = time.time()
voice_path = self._get_voice_path(voice)
if not voice_path:
raise ValueError(f"Voice not found: {voice}")
voicepack = self._load_voice(voice_path)
logger.debug(f"Voice loading took: {(time.time() - voice_start)*1000:.1f}ms")
# Process chunks as they're generated
is_first = True
for chunk in self._split_text(text):
chunks_processed = 0
# last_chunk_end = time.time()
# Process chunks as they come from generator
chunk_gen = chunker.split_text(text)
current_chunk = next(chunk_gen, None)
while current_chunk is not None:
next_chunk = next(chunk_gen, None) # Peek at next chunk
# chunk_start = time.time()
chunks_processed += 1
try:
# Process text and generate audio
phonemes, tokens = TTSModel.process_text(chunk, voice[0])
# text_process_start = time.time()
phonemes, tokens = TTSModel.process_text(current_chunk, voice[0])
# text_process_time = time.time() - text_process_start
# audio_gen_start = time.time()
chunk_audio = TTSModel.generate_from_tokens(tokens, voicepack, speed)
# audio_gen_time = time.time() - audio_gen_start
if chunk_audio is not None:
# Convert chunk with proper header handling
convert_start = time.time()
chunk_bytes = AudioService.convert_audio(
chunk_audio,
24000,
output_format,
is_first_chunk=is_first,
normalizer=stream_normalizer
normalizer=stream_normalizer,
is_last_chunk=(next_chunk is None) # Last if no next chunk
)
# convert_time = time.time() - convert_start
# Calculate gap from last chunk
# gap_time = chunk_start - last_chunk_end
# Log timing details if not silent
# if not silent:
# logger.debug(
# f"\nChunk {chunks_processed} timing:"
# f"\n Gap from last chunk: {gap_time*1000:.1f}ms"
# f"\n Text processing: {text_process_time*1000:.1f}ms"
# f"\n Audio generation: {audio_gen_time*1000:.1f}ms"
# f"\n Audio conversion: {convert_time*1000:.1f}ms"
# f"\n Total chunk time: {(time.time() - chunk_start)*1000:.1f}ms"
# )
yield chunk_bytes
is_first = False
# last_chunk_end = time.time()
else:
logger.error(f"No audio generated for chunk: '{chunk}'")
logger.error(f"No audio generated for chunk: '{current_chunk}'")
except Exception as e:
logger.error(f"Failed to generate audio for chunk: '{chunk}'. Error: {str(e)}")
continue
logger.error(f"Failed to generate audio for chunk: '{current_chunk}'. Error: {str(e)}")
current_chunk = next_chunk # Move to next chunk
except Exception as e:
logger.error(f"Error in audio generation stream: {str(e)}")
raise
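The last-chunk detection above is a one-item lookahead over a generator; its minimal form (illustrative only):

```python
def with_last_flag(gen):
    """Yield (item, is_last) pairs by peeking one item ahead."""
    current = next(gen, None)
    while current is not None:
        nxt = next(gen, None)
        yield current, nxt is None
        current = nxt

assert list(with_last_flag(iter("abc"))) == [("a", False), ("b", False), ("c", True)]
```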


@@ -0,0 +1,52 @@
import os
from typing import List, Tuple
import torch
from loguru import logger
from .tts_service import TTSService
from .tts_model import TTSModel
class WarmupService:
"""Service for warming up TTS models and voice caches"""
def __init__(self):
self.tts_service = TTSService()
def load_voices(self) -> List[Tuple[str, torch.Tensor]]:
"""Load and cache voices up to LRU limit"""
# Get all voices sorted by filename length (shorter names first, usually base voices)
voice_files = sorted(
[f for f in os.listdir(TTSModel.VOICES_DIR) if f.endswith(".pt")],
key=len
)
# Load up to LRU cache limit (20)
loaded_voices = []
for voice_file in voice_files[:20]:
try:
voice_path = os.path.join(TTSModel.VOICES_DIR, voice_file)
voicepack = torch.load(voice_path, map_location=TTSModel.get_device(), weights_only=True)
loaded_voices.append((voice_file[:-3], voicepack)) # Store name and tensor
# logger.info(f"Loaded voice {voice_file[:-3]} into cache")
except Exception as e:
logger.error(f"Failed to load voice {voice_file}: {e}")
logger.info(f"Pre-loaded {len(loaded_voices)} voices into cache")
return loaded_voices
async def warmup_voices(self, warmup_text: str, loaded_voices: List[Tuple[str, torch.Tensor]]):
"""Warm up voice inference and streaming"""
n_warmups = 1
for voice_name, _ in loaded_voices[:n_warmups]:
try:
logger.info(f"Running warmup inference on voice {voice_name}")
async for _ in self.tts_service.generate_audio_stream(
warmup_text,
voice_name,
1.0,
"pcm"
):
pass # Process all chunks to properly warm up
logger.info(f"Completed warmup for voice {voice_name}")
except Exception as e:
logger.warning(f"Warmup failed for voice {voice_name}: {e}")

api/tests/test_chunker.py (new file, 35 lines)

@@ -0,0 +1,35 @@
"""Tests for text chunking service"""
import pytest
from api.src.services.text_processing import chunker
def test_split_text():
"""Test text splitting into sentences"""
text = "First sentence. Second sentence! Third sentence?"
sentences = list(chunker.split_text(text))
assert len(sentences) == 3
assert sentences[0] == "First sentence."
assert sentences[1] == "Second sentence!"
assert sentences[2] == "Third sentence?"
def test_split_text_empty():
"""Test splitting empty text"""
assert list(chunker.split_text("")) == []
def test_split_text_single_sentence():
"""Test splitting single sentence"""
text = "Just one sentence."
assert list(chunker.split_text(text)) == ["Just one sentence."]
def test_split_text_with_custom_chunk_size():
"""Test splitting with custom max chunk size"""
text = "First part, second part, third part."
chunks = list(chunker.split_text(text, max_chunk=15))
assert len(chunks) == 3
assert chunks[0] == "First part,"
assert chunks[1] == "second part,"
assert chunks[2] == "third part."


@@ -1,7 +1,8 @@
from unittest.mock import Mock
from unittest.mock import Mock, AsyncMock
import pytest
import pytest_asyncio
import asyncio
from fastapi.testclient import TestClient
from httpx import AsyncClient
@@ -22,6 +23,12 @@ async def async_client():
def mock_tts_service(monkeypatch):
mock_service = Mock()
mock_service._generate_audio.return_value = (bytes([0, 1, 2, 3]), 1.0)
# Create proper async generator mock
async def mock_stream(*args, **kwargs):
for chunk in [b"chunk1", b"chunk2"]:
yield chunk
mock_service.generate_audio_stream = mock_stream
mock_service.list_voices.return_value = [
"af",
"bm_lewis",
@@ -65,6 +72,7 @@ def test_openai_speech_endpoint(mock_tts_service, mock_audio_service):
"voice": "bm_lewis",
"response_format": "wav",
"speed": 1.0,
"stream": False # Explicitly disable streaming
}
response = client.post("/v1/audio/speech", json=test_request)
assert response.status_code == 200
@@ -84,6 +92,7 @@ def test_openai_speech_invalid_voice(mock_tts_service):
"voice": "invalid_voice",
"response_format": "wav",
"speed": 1.0,
"stream": False # Explicitly disable streaming
}
response = client.post("/v1/audio/speech", json=test_request)
assert response.status_code == 400 # Bad request
@@ -98,6 +107,7 @@ def test_openai_speech_invalid_speed(mock_tts_service):
"voice": "af",
"response_format": "wav",
"speed": -1.0, # Invalid speed
"stream": False # Explicitly disable streaming
}
response = client.post("/v1/audio/speech", json=test_request)
assert response.status_code == 422 # Validation error
@@ -112,6 +122,7 @@ def test_openai_speech_generation_error(mock_tts_service):
"voice": "af",
"response_format": "wav",
"speed": 1.0,
"stream": False # Explicitly disable streaming
}
response = client.post("/v1/audio/speech", json=test_request)
assert response.status_code == 500
@@ -171,13 +182,14 @@ async def test_openai_speech_pcm_streaming(mock_tts_service, async_client):
"input": "Hello world",
"voice": "af",
"response_format": "pcm",
"stream": True
}
# Mock streaming response
async def mock_stream():
yield b"chunk1"
yield b"chunk2"
mock_tts_service.generate_audio_stream.return_value = mock_stream()
# Create streaming mock for this test
async def mock_stream(*args, **kwargs):
for chunk in [b"chunk1", b"chunk2"]:
yield chunk
mock_tts_service.generate_audio_stream = mock_stream
# Add streaming header
headers = {"x-raw-response": "stream"}
@@ -198,13 +210,14 @@ async def test_openai_speech_streaming_mp3(mock_tts_service, async_client):
"input": "Hello world",
"voice": "af",
"response_format": "mp3",
"stream": True
}
# Mock streaming response
async def mock_stream():
yield b"mp3header"
yield b"mp3data"
mock_tts_service.generate_audio_stream.return_value = mock_stream()
# Create streaming mock for this test
async def mock_stream(*args, **kwargs):
for chunk in [b"mp3header", b"mp3data"]:
yield chunk
mock_tts_service.generate_audio_stream = mock_stream
# Add streaming header
headers = {"x-raw-response": "stream"}
@@ -227,14 +240,14 @@ async def test_openai_speech_streaming_generator(mock_tts_service, async_client)
"input": "Hello world",
"voice": "af",
"response_format": "pcm",
"stream": True
}
# Mock streaming response
async def mock_stream():
yield b"chunk1"
yield b"chunk2"
mock_tts_service.generate_audio_stream.return_value = mock_stream()
# Create streaming mock for this test
async def mock_stream(*args, **kwargs):
for chunk in [b"chunk1", b"chunk2"]:
yield chunk
mock_tts_service.generate_audio_stream = mock_stream
# Add streaming header
headers = {"x-raw-response": "stream"}


@@ -28,29 +28,34 @@ async def test_lifespan_successful_warmup(mock_logger, mock_tts_model):
"""Test successful model warmup in lifespan"""
# Mock file system for voice counting
mock_tts_model.VOICES_DIR = "/mock/voices"
with patch("os.listdir", return_value=["voice1.pt", "voice2.pt", "voice3.pt"]):
mock_tts_model.setup.return_value = 3 # 3 voice files
mock_tts_model.get_device.return_value = "cuda"
# Create an async generator from the lifespan context manager
async_gen = lifespan(MagicMock())
# Start the context manager
await async_gen.__aenter__()
# Verify the expected logging sequence
mock_logger.info.assert_any_call("Loading TTS model and voice packs...")
# Check for the startup message containing the required info
startup_calls = [call[0][0] for call in mock_logger.info.call_args_list]
startup_msg = next(msg for msg in startup_calls if "Model loaded and warmed up on" in msg)
assert "Model loaded and warmed up on cuda" in startup_msg
assert "3 voice packs loaded successfully" in startup_msg
# Create async mock
async def async_setup():
return 3
mock_tts_model.setup = MagicMock()
mock_tts_model.setup.side_effect = async_setup
mock_tts_model.get_device.return_value = "cuda"
with patch("os.listdir", return_value=["voice1.pt", "voice2.pt", "voice3.pt"]):
# Create an async generator from the lifespan context manager
async_gen = lifespan(MagicMock())
# Start the context manager
await async_gen.__aenter__()
# Verify model setup was called
mock_tts_model.setup.assert_called_once()
# Verify the expected logging sequence
mock_logger.info.assert_any_call("Loading TTS model and voice packs...")
# Check for the startup message containing the required info
startup_calls = [call[0][0] for call in mock_logger.info.call_args_list]
startup_msg = next(msg for msg in startup_calls if "Model warmed up on" in msg)
assert "Model warmed up on" in startup_msg
assert "3 voice packs loaded" in startup_msg
# Clean up
await async_gen.__aexit__(None, None, None)
# Verify model setup was called
mock_tts_model.setup.assert_called_once()
# Clean up
await async_gen.__aexit__(None, None, None)
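Side note on the mocking style: the async setup() is emulated above with a MagicMock whose side_effect is an async function; unittest.mock.AsyncMock (already imported in conftest per this diff) would be an equivalent, shorter sketch:

```python
from unittest.mock import AsyncMock

mock_setup = AsyncMock(return_value=3)  # awaiting it returns 3
# mock_tts_model.setup = mock_setup
# ...run the lifespan under test...
# mock_setup.assert_awaited_once()
```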
@pytest.mark.asyncio
@@ -81,39 +86,21 @@ async def test_lifespan_cuda_warmup(mock_tts_model):
"""Test model warmup specifically on CUDA"""
# Mock file system for voice counting
mock_tts_model.VOICES_DIR = "/mock/voices"
# Create async mock
async def async_setup():
return 2
mock_tts_model.setup = MagicMock()
mock_tts_model.setup.side_effect = async_setup
mock_tts_model.get_device.return_value = "cuda"
with patch("os.listdir", return_value=["voice1.pt", "voice2.pt"]):
mock_tts_model.setup.return_value = 2 # 2 voice files
mock_tts_model.get_device.return_value = "cuda"
# Create an async generator from the lifespan context manager
async_gen = lifespan(MagicMock())
await async_gen.__aenter__()
# Create an async generator from the lifespan context manager
async_gen = lifespan(MagicMock())
await async_gen.__aenter__()
# Verify model setup was called
mock_tts_model.setup.assert_called_once()
# Verify model setup was called
mock_tts_model.setup.assert_called_once()
# Clean up
await async_gen.__aexit__(None, None, None)
@pytest.mark.asyncio
@patch("api.src.main.TTSModel")
async def test_lifespan_cpu_fallback(mock_tts_model):
"""Test model warmup falling back to CPU"""
# Mock file system for voice counting
mock_tts_model.VOICES_DIR = "/mock/voices"
with patch(
"os.listdir", return_value=["voice1.pt", "voice2.pt", "voice3.pt", "voice4.pt"]
):
mock_tts_model.setup.return_value = 4 # 4 voice files
mock_tts_model.get_device.return_value = "cpu"
# Create an async generator from the lifespan context manager
async_gen = lifespan(MagicMock())
await async_gen.__aenter__()
# Verify model setup was called
mock_tts_model.setup.assert_called_once()
# Clean up
await async_gen.__aexit__(None, None, None)
# Clean up
await async_gen.__aexit__(None, None, None)


@@ -16,13 +16,14 @@ def test_get_device_error():
with pytest.raises(RuntimeError, match="Model not initialized"):
TTSBaseModel.get_device()
@pytest.mark.asyncio
@patch('torch.cuda.is_available')
@patch('os.path.exists')
@patch('os.path.join')
@patch('os.listdir')
@patch('torch.load')
@patch('torch.save')
def test_setup_cuda_available(mock_save, mock_load, mock_listdir, mock_join, mock_exists, mock_cuda_available):
async def test_setup_cuda_available(mock_save, mock_load, mock_listdir, mock_join, mock_exists, mock_cuda_available):
"""Test setup with CUDA available"""
TTSBaseModel._device = None
mock_cuda_available.return_value = True
@@ -36,17 +37,18 @@ def test_setup_cuda_available(mock_save, mock_load, mock_listdir, mock_join, moc
TTSBaseModel.process_text = MagicMock(return_value=("dummy", [1,2,3]))
TTSBaseModel.generate_from_tokens = MagicMock(return_value=np.zeros(1000))
voice_count = TTSBaseModel.setup()
voice_count = await TTSBaseModel.setup()
assert TTSBaseModel._device == "cuda"
assert voice_count == 2
@pytest.mark.asyncio
@patch('torch.cuda.is_available')
@patch('os.path.exists')
@patch('os.path.join')
@patch('os.listdir')
@patch('torch.load')
@patch('torch.save')
def test_setup_cuda_unavailable(mock_save, mock_load, mock_listdir, mock_join, mock_exists, mock_cuda_available):
async def test_setup_cuda_unavailable(mock_save, mock_load, mock_listdir, mock_join, mock_exists, mock_cuda_available):
"""Test setup with CUDA unavailable"""
TTSBaseModel._device = None
mock_cuda_available.return_value = False
@@ -60,7 +62,7 @@ def test_setup_cuda_unavailable(mock_save, mock_load, mock_listdir, mock_join, m
TTSBaseModel.process_text = MagicMock(return_value=("dummy", [1,2,3]))
TTSBaseModel.generate_from_tokens = MagicMock(return_value=np.zeros(1000))
voice_count = TTSBaseModel.setup()
voice_count = await TTSBaseModel.setup()
assert TTSBaseModel._device == "cpu"
assert voice_count == 2


@@ -31,27 +31,6 @@ def sample_audio():
return np.sin(2 * np.pi * frequency * t).astype(np.float32)
def test_split_text(tts_service):
"""Test text splitting into sentences"""
text = "First sentence. Second sentence! Third sentence?"
sentences = tts_service._split_text(text)
assert len(sentences) == 3
assert sentences[0] == "First sentence."
assert sentences[1] == "Second sentence!"
assert sentences[2] == "Third sentence?"
def test_split_text_empty(tts_service):
"""Test splitting empty text"""
assert tts_service._split_text("") == []
def test_split_text_single_sentence(tts_service):
"""Test splitting single sentence"""
text = "Just one sentence."
assert tts_service._split_text(text) == ["Just one sentence."]
def test_audio_to_bytes(tts_service, sample_audio):
"""Test converting audio tensor to bytes"""
audio_bytes = tts_service._audio_to_bytes(sample_audio)
@@ -152,7 +131,7 @@ def test_generate_audio_phonemize_error(
mock_torch_load.return_value = torch.zeros((10, 24000))
mock_generate.return_value = (None, None)
with pytest.raises(ValueError, match="No audio chunks were generated successfully"):
with pytest.raises(ValueError, match="No chunks were processed successfully"):
tts_service._generate_audio("Test text", "af", 1.0)
@@ -185,7 +164,7 @@ def test_generate_audio_error(
mock_exists.return_value = True
mock_torch_load.return_value = torch.zeros((10, 24000))
with pytest.raises(ValueError, match="No audio chunks were generated successfully"):
with pytest.raises(ValueError, match="No chunks were processed successfully"):
tts_service._generate_audio("Test text", "af", 1.0)

BIN: ten new image assets added (234 KiB to 958 KiB each), including assets/voice_analysis.png (958 KiB); binary contents not shown.


@@ -43,6 +43,7 @@ services:
- ONNX_OPTIMIZATION_LEVEL=all
- ONNX_MEMORY_PATTERN=true
- ONNX_ARENA_EXTEND_STRATEGY=kNextPowerOfTwo
depends_on:
model-fetcher:
condition: service_healthy


@@ -2,7 +2,7 @@ services:
model-fetcher:
image: datamachines/git-lfs:latest
environment:
- SKIP_MODEL_FETCH=${SKIP_MODEL_FETCH:-true}
- SKIP_MODEL_FETCH=${SKIP_MODEL_FETCH:-false}
volumes:
- ./Kokoro-82M:/app/Kokoro-82M
working_dir: /app/Kokoro-82M
@@ -32,10 +32,10 @@ services:
start_period: 1s
kokoro-tts:
image: ghcr.io/remsky/kokoro-fastapi:latest
# image: ghcr.io/remsky/kokoro-fastapi:latest
# Uncomment below to build from source instead of using the released image
# build:
# context: .
build:
context: .
volumes:
- ./api/src:/app/api/src
- ./Kokoro-82M:/app/Kokoro-82M
@@ -54,14 +54,14 @@ services:
model-fetcher:
condition: service_healthy
# # Gradio UI service [Comment out everything below if you don't need it]
# gradio-ui:
# build:
# context: ./ui
# ports:
# - "7860:7860"
# volumes:
# - ./ui/data:/app/ui/data
# - ./ui/app.py:/app/app.py # Mount app.py for hot reload
# environment:
# - GRADIO_WATCH=True # Enable hot reloading
# Gradio UI service [Comment out everything below if you don't need it]
gradio-ui:
build:
context: ./ui
ports:
- "7860:7860"
volumes:
- ./ui/data:/app/ui/data
- ./ui/app.py:/app/app.py # Mount app.py for hot reload
environment:
- GRADIO_WATCH=True # Enable hot reloading


@@ -1,15 +1,19 @@
#!/usr/bin/env python3
import os
import time
import json
import numpy as np
import requests
import pandas as pd
from lib.shared_benchmark_utils import get_text_for_tokens, enc
from lib.shared_utils import save_json_results
from lib.shared_plotting import plot_correlation, plot_timeline
import time
def measure_first_token(text: str, output_dir: str, tokens: int, run_number: int) -> dict:
import numpy as np
import pandas as pd
import requests
from lib.shared_utils import save_json_results
from lib.shared_plotting import plot_timeline, plot_correlation
from lib.shared_benchmark_utils import enc, get_text_for_tokens
def measure_first_token(
text: str, output_dir: str, tokens: int, run_number: int
) -> dict:
"""Measure time to audio via API calls and save the audio output"""
results = {
"text_length": len(text),
@@ -18,12 +22,12 @@ def measure_first_token(text: str, output_dir: str, tokens: int, run_number: int
"time_to_first_chunk": None,
"error": None,
"audio_path": None,
"audio_length": None # Length of output audio in seconds
"audio_length": None, # Length of output audio in seconds
}
try:
start_time = time.time()
# Make request without streaming
response = requests.post(
"http://localhost:8880/v1/audio/speech",
@@ -32,58 +36,62 @@ def measure_first_token(text: str, output_dir: str, tokens: int, run_number: int
"input": text,
"voice": "af",
"response_format": "wav",
"stream": False
"stream": False,
},
timeout=1800
timeout=1800,
)
response.raise_for_status()
# Save complete audio
audio_filename = f"benchmark_tokens{tokens}_run{run_number}.wav"
audio_path = os.path.join(output_dir, audio_filename)
results["audio_path"] = audio_path
content = response.content
with open(audio_path, 'wb') as f:
with open(audio_path, "wb") as f:
f.write(content)
# Calculate audio length using scipy
import scipy.io.wavfile as wavfile
sample_rate, audio_data = wavfile.read(audio_path)
results["audio_length"] = len(audio_data) / sample_rate # Length in seconds
results["time_to_first_chunk"] = time.time() - start_time
results["total_time"] = time.time() - start_time
return results
except Exception as e:
results["error"] = str(e)
return results
def main():
# Set up paths
script_dir = os.path.dirname(os.path.abspath(__file__))
output_dir = os.path.join(script_dir, "output_audio")
output_data_dir = os.path.join(script_dir, "output_data")
# Create output directories
os.makedirs(output_dir, exist_ok=True)
os.makedirs(output_data_dir, exist_ok=True)
# Load sample text
with open(os.path.join(script_dir, "the_time_machine_hg_wells.txt"), "r", encoding="utf-8") as f:
with open(
os.path.join(script_dir, "the_time_machine_hg_wells.txt"), "r", encoding="utf-8"
) as f:
text = f.read()
# Test specific token counts
token_sizes = [10, 25, 50, 100, 200, 500]
all_results = []
for tokens in token_sizes:
print(f"\nTesting {tokens} tokens")
test_text = get_text_for_tokens(text, tokens)
actual_tokens = len(enc.encode(test_text))
print(f"Text preview: {test_text[:50]}...")
# Run test 5 times for each size to get average
for i in range(5):
print(f"Run {i+1}/5...")
@@ -91,67 +99,74 @@ def main():
result["target_tokens"] = tokens
result["actual_tokens"] = actual_tokens
result["run_number"] = i + 1
print(f"Time to Audio: {result.get('time_to_first_chunk', 'N/A'):.3f}s")
print(f"Total time: {result.get('total_time', 'N/A'):.3f}s")
if result["error"]:
print(f"Error: {result['error']}")
all_results.append(result)
# Calculate averages per token size
summary = {}
for tokens in token_sizes:
matching_results = [r for r in all_results if r["target_tokens"] == tokens and not r["error"]]
matching_results = [
r for r in all_results if r["target_tokens"] == tokens and not r["error"]
]
if matching_results:
avg_first_chunk = sum(r["time_to_first_chunk"] for r in matching_results) / len(matching_results)
avg_total = sum(r["total_time"] for r in matching_results) / len(matching_results)
avg_audio_length = sum(r["audio_length"] for r in matching_results) / len(matching_results)
avg_first_chunk = sum(
r["time_to_first_chunk"] for r in matching_results
) / len(matching_results)
avg_total = sum(r["total_time"] for r in matching_results) / len(
matching_results
)
avg_audio_length = sum(r["audio_length"] for r in matching_results) / len(
matching_results
)
summary[tokens] = {
"avg_time_to_first_chunk": round(avg_first_chunk, 3),
"avg_total_time": round(avg_total, 3),
"avg_audio_length": round(avg_audio_length, 3),
"num_successful_runs": len(matching_results)
"num_successful_runs": len(matching_results),
}
# Save results
# Save results
results_data = {
"individual_runs": all_results,
"summary": summary,
"timestamp": time.strftime("%Y-%m-%d %H:%M:%S")
"timestamp": time.strftime("%Y-%m-%d %H:%M:%S"),
}
save_json_results(
results_data,
os.path.join(output_data_dir, "first_token_benchmark.json")
results_data, os.path.join(output_data_dir, "first_token_benchmark.json")
)
# Create plot directory if it doesn't exist
output_plots_dir = os.path.join(script_dir, "output_plots")
os.makedirs(output_plots_dir, exist_ok=True)
# Create DataFrame for plotting
df = pd.DataFrame(all_results)
# Create both plots
plot_correlation(
df, "target_tokens", "time_to_first_chunk",
df,
"target_tokens",
"time_to_first_chunk",
"Time to Audio vs Input Size",
"Number of Input Tokens",
"Time to Audio (seconds)",
os.path.join(output_plots_dir, "first_token_latency.png")
os.path.join(output_plots_dir, "first_token_latency.png"),
)
plot_timeline(
df,
os.path.join(output_plots_dir, "first_token_timeline.png")
)
plot_timeline(df, os.path.join(output_plots_dir, "first_token_timeline.png"))
print("\nResults and plots saved to:")
print(f"- {os.path.join(output_data_dir, 'first_token_benchmark.json')}")
print(f"- {os.path.join(output_plots_dir, 'first_token_latency.png')}")
print(f"- {os.path.join(output_plots_dir, 'first_token_timeline.png')}")
if __name__ == "__main__":
main()


@@ -1,193 +0,0 @@
#!/usr/bin/env python3
import os
import time
import json
import numpy as np
import requests
import pandas as pd
from lib.shared_benchmark_utils import get_text_for_tokens, enc
from lib.shared_utils import save_json_results
from lib.shared_plotting import plot_correlation, plot_timeline
def measure_first_token(text: str, output_dir: str, tokens: int, run_number: int) -> dict:
"""Measure time to audio via API calls and save the audio output"""
results = {
"text_length": len(text),
"token_count": len(enc.encode(text)),
"total_time": None,
"time_to_first_chunk": None,
"error": None,
"audio_path": None,
"audio_length": None # Length of output audio in seconds
}
try:
start_time = time.time()
# Make request with streaming enabled
response = requests.post(
"http://localhost:8880/v1/audio/speech",
json={
"model": "kokoro",
"input": text,
"voice": "af",
"response_format": "pcm",
"stream": True
},
stream=True,
timeout=1800
)
response.raise_for_status()
# Save complete audio
audio_filename = f"benchmark_tokens{tokens}_run{run_number}_stream.wav"
audio_path = os.path.join(output_dir, audio_filename)
results["audio_path"] = audio_path
first_chunk_time = None
chunks = []
for chunk in response.iter_content(chunk_size=1024):
if chunk:
if first_chunk_time is None:
first_chunk_time = time.time()
results["time_to_first_chunk"] = first_chunk_time - start_time
chunks.append(chunk)
# Concatenate all PCM chunks
if not chunks:
raise ValueError("No audio chunks received")
all_audio_data = b''.join(chunks)
# Write as WAV file
import wave
with wave.open(audio_path, 'wb') as wav_file:
wav_file.setnchannels(1) # Mono
wav_file.setsampwidth(2) # 2 bytes per sample (16-bit)
wav_file.setframerate(24000) # Known sample rate for Kokoro
wav_file.writeframes(all_audio_data)
# Calculate audio length using scipy
import scipy.io.wavfile as wavfile
sample_rate, audio_data = wavfile.read(audio_path)
results["audio_length"] = len(audio_data) / sample_rate # Length in seconds
results["total_time"] = time.time() - start_time
# Print debug info
print(f"Complete audio size: {len(all_audio_data)} bytes")
print(f"Number of chunks received: {len(chunks)}")
print(f"Audio length: {results['audio_length']:.3f}s")
return results
except Exception as e:
results["error"] = str(e)
return results
def main():
# Set up paths with _stream suffix
script_dir = os.path.dirname(os.path.abspath(__file__))
output_dir = os.path.join(script_dir, "output_audio_stream")
output_data_dir = os.path.join(script_dir, "output_data")
# Create output directories
os.makedirs(output_dir, exist_ok=True)
os.makedirs(output_data_dir, exist_ok=True)
# Load sample text
with open(os.path.join(script_dir, "the_time_machine_hg_wells.txt"), "r", encoding="utf-8") as f:
text = f.read()
# Test specific token counts
token_sizes = [50, 100, 200, 500, 1000, 2000, 5000, 10000]
all_results = []
for tokens in token_sizes:
print(f"\nTesting {tokens} tokens (streaming)")
test_text = get_text_for_tokens(text, tokens)
actual_tokens = len(enc.encode(test_text))
print(f"Text preview: {test_text[:50]}...")
# Run test 3 times for each size to get average
for i in range(5):
print(f"Run {i+1}/3...")
result = measure_first_token(test_text, output_dir, tokens, i + 1)
result["target_tokens"] = tokens
result["actual_tokens"] = actual_tokens
result["run_number"] = i + 1
print(f"Time to First Audio: {result.get('time_to_first_chunk', 'N/A'):.3f}s")
print(f"Time to Save Complete: {result.get('total_time', 'N/A'):.3f}s")
print(f"Audio length: {result.get('audio_length', 'N/A'):.3f}s")
print(f"Streaming overhead: {(result.get('total_time', 0) - result.get('time_to_first_chunk', 0)):.3f}s")
if result["error"]:
print(f"Error: {result['error']}")
all_results.append(result)
# Calculate averages per token size
summary = {}
for tokens in token_sizes:
matching_results = [r for r in all_results if r["target_tokens"] == tokens and not r["error"]]
if matching_results:
avg_first_chunk = sum(r["time_to_first_chunk"] for r in matching_results) / len(matching_results)
avg_total = sum(r["total_time"] for r in matching_results) / len(matching_results)
avg_audio_length = sum(r["audio_length"] for r in matching_results) / len(matching_results)
summary[tokens] = {
"avg_time_to_first_chunk": round(avg_first_chunk, 3),
"avg_total_time": round(avg_total, 3),
"avg_audio_length": round(avg_audio_length, 3),
"num_successful_runs": len(matching_results)
}
# Save results with _stream suffix
results_data = {
"individual_runs": all_results,
"summary": summary,
"timestamp": time.strftime("%Y-%m-%d %H:%M:%S")
}
save_json_results(
results_data,
os.path.join(output_data_dir, "first_token_benchmark_stream.json")
)
# Create plot directory if it doesn't exist
output_plots_dir = os.path.join(script_dir, "output_plots")
os.makedirs(output_plots_dir, exist_ok=True)
# Create DataFrame for plotting
df = pd.DataFrame(all_results)
# Create both plots with _stream suffix
# Plot correlation for both metrics
plot_correlation(
df, "target_tokens", "time_to_first_chunk",
"Time to First Audio vs Input Size (Streaming)",
"Number of Input Tokens",
"Time to First Audio (seconds)",
os.path.join(output_plots_dir, "first_token_latency_stream.png")
)
plot_correlation(
df, "target_tokens", "total_time",
"Total Time vs Input Size (Streaming)",
"Number of Input Tokens",
"Total Time (seconds)",
os.path.join(output_plots_dir, "total_time_latency_stream.png")
)
plot_timeline(
df,
os.path.join(output_plots_dir, "first_token_timeline_stream.png", suffix="(Streaming)")
)
print("\nResults and plots saved to:")
print(f"- {os.path.join(output_data_dir, 'first_token_benchmark_stream.json')}")
print(f"- {os.path.join(output_plots_dir, 'first_token_latency_stream.png')}")
print(f"- {os.path.join(output_plots_dir, 'total_time_latency_stream.png')}")
print(f"- {os.path.join(output_plots_dir, 'first_token_timeline_stream.png')}")
if __name__ == "__main__":
main()


@@ -1,184 +0,0 @@
#!/usr/bin/env python3
import os
import time
import json
import numpy as np
import pandas as pd
from openai import OpenAI
from lib.shared_benchmark_utils import get_text_for_tokens, enc
from lib.shared_utils import save_json_results
from lib.shared_plotting import plot_correlation, plot_timeline
def measure_first_token(text: str, output_dir: str, tokens: int, run_number: int) -> dict:
"""Measure time to audio via OpenAI API calls and save the audio output"""
results = {
"text_length": len(text),
"token_count": len(enc.encode(text)),
"total_time": None,
"time_to_first_chunk": None,
"error": None,
"audio_path": None,
"audio_length": None # Length of output audio in seconds
}
try:
start_time = time.time()
# Initialize OpenAI client
openai = OpenAI(base_url="http://localhost:8880/v1", api_key="not-needed-for-local")
# Save complete audio
audio_filename = f"benchmark_tokens{tokens}_run{run_number}_stream_openai.wav"
audio_path = os.path.join(output_dir, audio_filename)
results["audio_path"] = audio_path
first_chunk_time = None
all_audio_data = bytearray()
chunk_count = 0
# Make streaming request using OpenAI client
with openai.audio.speech.with_streaming_response.create(
model="kokoro",
voice="af",
response_format="pcm",
input=text,
) as response:
for chunk in response.iter_bytes(chunk_size=1024):
if chunk:
chunk_count += 1
if first_chunk_time is None:
first_chunk_time = time.time()
results["time_to_first_chunk"] = first_chunk_time - start_time
all_audio_data.extend(chunk)
# Write as WAV file
import wave
with wave.open(audio_path, 'wb') as wav_file:
wav_file.setnchannels(1) # Mono
wav_file.setsampwidth(2) # 2 bytes per sample (16-bit)
wav_file.setframerate(24000) # Known sample rate for Kokoro
wav_file.writeframes(all_audio_data)
# Calculate audio length using scipy
import scipy.io.wavfile as wavfile
sample_rate, audio_data = wavfile.read(audio_path)
results["audio_length"] = len(audio_data) / sample_rate # Length in seconds
results["total_time"] = time.time() - start_time
# Print debug info
print(f"Complete audio size: {len(all_audio_data)} bytes")
print(f"Number of chunks received: {chunk_count}")
print(f"Audio length: {results['audio_length']:.3f}s")
return results
except Exception as e:
results["error"] = str(e)
return results
def main():
# Set up paths with _stream_openai suffix
script_dir = os.path.dirname(os.path.abspath(__file__))
output_dir = os.path.join(script_dir, "output_audio_stream_openai")
output_data_dir = os.path.join(script_dir, "output_data")
# Create output directories
os.makedirs(output_dir, exist_ok=True)
os.makedirs(output_data_dir, exist_ok=True)
# Load sample text
with open(os.path.join(script_dir, "the_time_machine_hg_wells.txt"), "r", encoding="utf-8") as f:
text = f.read()
# Test specific token counts
token_sizes = [50, 100, 200, 500]
all_results = []
for tokens in token_sizes:
print(f"\nTesting {tokens} tokens (streaming)")
test_text = get_text_for_tokens(text, tokens)
actual_tokens = len(enc.encode(test_text))
print(f"Text preview: {test_text[:50]}...")
# Run test 5 times for each size to get average
for i in range(5):
print(f"Run {i+1}/5...")
result = measure_first_token(test_text, output_dir, tokens, i + 1)
result["target_tokens"] = tokens
result["actual_tokens"] = actual_tokens
result["run_number"] = i + 1
print(f"Time to First Audio: {result.get('time_to_first_chunk', 'N/A'):.3f}s")
print(f"Time to Save Complete: {result.get('total_time', 'N/A'):.3f}s")
print(f"Audio length: {result.get('audio_length', 'N/A'):.3f}s")
print(f"Streaming overhead: {(result.get('total_time', 0) - result.get('time_to_first_chunk', 0)):.3f}s")
if result["error"]:
print(f"Error: {result['error']}")
all_results.append(result)
# Calculate averages per token size
summary = {}
for tokens in token_sizes:
matching_results = [r for r in all_results if r["target_tokens"] == tokens and not r["error"]]
if matching_results:
avg_first_chunk = sum(r["time_to_first_chunk"] for r in matching_results) / len(matching_results)
avg_total = sum(r["total_time"] for r in matching_results) / len(matching_results)
avg_audio_length = sum(r["audio_length"] for r in matching_results) / len(matching_results)
summary[tokens] = {
"avg_time_to_first_chunk": round(avg_first_chunk, 3),
"avg_total_time": round(avg_total, 3),
"avg_audio_length": round(avg_audio_length, 3),
"num_successful_runs": len(matching_results)
}
# Save results with _stream_openai suffix
results_data = {
"individual_runs": all_results,
"summary": summary,
"timestamp": time.strftime("%Y-%m-%d %H:%M:%S")
}
save_json_results(
results_data,
os.path.join(output_data_dir, "first_token_benchmark_stream_openai.json")
)
# Create plot directory if it doesn't exist
output_plots_dir = os.path.join(script_dir, "output_plots")
os.makedirs(output_plots_dir, exist_ok=True)
# Create DataFrame for plotting
df = pd.DataFrame(all_results)
# Create plots with _stream_openai suffix
plot_correlation(
df, "target_tokens", "time_to_first_chunk",
"Time to First Audio vs Input Size (OpenAI Streaming)",
"Number of Input Tokens",
"Time to First Audio (seconds)",
os.path.join(output_plots_dir, "first_token_latency_stream_openai.png")
)
plot_correlation(
df, "target_tokens", "total_time",
"Total Time vs Input Size (OpenAI Streaming)",
"Number of Input Tokens",
"Total Time (seconds)",
os.path.join(output_plots_dir, "total_time_latency_stream_openai.png")
)
plot_timeline(
df,
os.path.join(output_plots_dir, "first_token_timeline_stream_openai.png")
)
print("\nResults and plots saved to:")
print(f"- {os.path.join(output_data_dir, 'first_token_benchmark_stream_openai.json')}")
print(f"- {os.path.join(output_plots_dir, 'first_token_latency_stream_openai.png')}")
print(f"- {os.path.join(output_plots_dir, 'total_time_latency_stream_openai.png')}")
print(f"- {os.path.join(output_plots_dir, 'first_token_timeline_stream_openai.png')}")
if __name__ == "__main__":
main()


@@ -0,0 +1,195 @@
#!/usr/bin/env python3
import os
import time
import requests
from openai import OpenAI
from lib.stream_utils import run_benchmark
OPENAI_CLIENT = OpenAI(
base_url="http://localhost:8880/v1", api_key="not-needed-for-local"
)
def measure_first_token_requests(
text: str, output_dir: str, tokens: int, run_number: int
) -> dict:
"""Measure time to audio via direct API calls and save the audio output"""
results = {
"text_length": len(text),
"token_count": None, # Will be set by run_benchmark
"total_time": None,
"time_to_first_chunk": None,
"error": None,
"audio_path": None,
"audio_length": None,
}
try:
start_time = time.time()
# Make request with streaming enabled
response = requests.post(
"http://localhost:8880/v1/audio/speech",
json={
"model": "kokoro",
"input": text,
"voice": "af",
"response_format": "pcm",
"stream": True,
},
stream=True,
timeout=1800,
)
response.raise_for_status()
# Save complete audio
audio_filename = f"benchmark_tokens{tokens}_run{run_number}_stream.wav"
audio_path = os.path.join(output_dir, audio_filename)
results["audio_path"] = audio_path
first_chunk_time = None
chunks = []
for chunk in response.iter_content(chunk_size=1024):
if chunk:
if first_chunk_time is None:
first_chunk_time = time.time()
results["time_to_first_chunk"] = first_chunk_time - start_time
chunks.append(chunk)
# Concatenate all PCM chunks
if not chunks:
raise ValueError("No audio chunks received")
all_audio_data = b"".join(chunks)
# Write as WAV file
with wave.open(audio_path, "wb") as wav_file:
wav_file.setnchannels(1) # Mono
wav_file.setsampwidth(2) # 2 bytes per sample (16-bit)
wav_file.setframerate(24000) # Known sample rate for Kokoro
wav_file.writeframes(all_audio_data)
# Calculate audio length using scipy
sample_rate, audio_data = wavfile.read(audio_path)
results["audio_length"] = len(audio_data) / sample_rate # Length in seconds
results["total_time"] = time.time() - start_time
# Print debug info
print(f"Complete audio size: {len(all_audio_data)} bytes")
print(f"Number of chunks received: {len(chunks)}")
print(f"Audio length: {results['audio_length']:.3f}s")
return results
except Exception as e:
results["error"] = str(e)
return results
def measure_first_token_openai(
text: str, output_dir: str, tokens: int, run_number: int
) -> dict:
"""Measure time to audio via OpenAI API calls and save the audio output"""
results = {
"text_length": len(text),
"token_count": None, # Will be set by run_benchmark
"total_time": None,
"time_to_first_chunk": None,
"error": None,
"audio_path": None,
"audio_length": None,
}
try:
start_time = time.time()
# Save complete audio (streams through the module-level OPENAI_CLIENT)
audio_filename = f"benchmark_tokens{tokens}_run{run_number}_stream_openai.wav"
audio_path = os.path.join(output_dir, audio_filename)
results["audio_path"] = audio_path
first_chunk_time = None
all_audio_data = bytearray()
chunk_count = 0
# Make streaming request using OpenAI client
with OPENAI_CLIENT.audio.speech.with_streaming_response.create(
model="kokoro",
voice="af",
response_format="pcm",
input=text,
) as response:
for chunk in response.iter_bytes(chunk_size=1024):
if chunk:
chunk_count += 1
if first_chunk_time is None:
first_chunk_time = time.time()
results["time_to_first_chunk"] = first_chunk_time - start_time
all_audio_data.extend(chunk)
# Write as WAV file
with wave.open(audio_path, "wb") as wav_file:
wav_file.setnchannels(1) # Mono
wav_file.setsampwidth(2) # 2 bytes per sample (16-bit)
wav_file.setframerate(24000) # Known sample rate for Kokoro
wav_file.writeframes(all_audio_data)
# Calculate audio length using scipy
sample_rate, audio_data = wavfile.read(audio_path)
results["audio_length"] = len(audio_data) / sample_rate # Length in seconds
results["total_time"] = time.time() - start_time
# Print debug info
print(f"Complete audio size: {len(all_audio_data)} bytes")
print(f"Number of chunks received: {chunk_count}")
print(f"Audio length: {results['audio_length']:.3f}s")
return results
except Exception as e:
results["error"] = str(e)
return results
def main():
script_dir = os.path.dirname(os.path.abspath(__file__))
prefix = "cpu"
# Run requests benchmark
print("\n=== Running Direct Requests Benchmark ===")
run_benchmark(
measure_first_token_requests,
output_dir=os.path.join(script_dir, "output_audio_stream"),
output_data_dir=os.path.join(script_dir, "output_data"),
output_plots_dir=os.path.join(script_dir, "output_plots"),
suffix="_stream",
plot_title_suffix="(Streaming)",
prefix=prefix
)
# Run OpenAI benchmark
print("\n=== Running OpenAI Library Benchmark ===")
run_benchmark(
measure_first_token_openai,
output_dir=os.path.join(script_dir, "output_audio_stream_openai"),
output_data_dir=os.path.join(script_dir, "output_data"),
output_plots_dir=os.path.join(script_dir, "output_plots"),
suffix="_stream_openai",
plot_title_suffix="(OpenAI Streaming)",
prefix=prefix
)
if __name__ == "__main__":
main()
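# Usage sketch (assumes the API from this repo is serving at localhost:8880,
# the base URL configured above): running this file directly executes both
# benchmarks back-to-back; each writes its WAVs under the configured
# output_audio* directory plus JSON data and PNG plots via
# lib.stream_utils.run_benchmark.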

View file

@ -1,30 +1,37 @@
#!/usr/bin/env python3
import os
import json
import time
import queue
import sys
import threading
from datetime import datetime
import pandas as pd
from lib.shared_utils import (
    real_time_factor,
    save_json_results,
    get_system_metrics,
    write_benchmark_stats,
)
from lib.shared_plotting import plot_correlation, plot_system_metrics
from lib.shared_benchmark_utils import (
    enc,
    make_tts_request,
    get_text_for_tokens,
    generate_token_sizes,
)
class SystemMonitor:
def __init__(self, interval=1.0):
    """Lightweight background system-metrics tracker; readings are approximate."""
self.interval = interval
self.metrics_queue = queue.Queue()
self.stop_event = threading.Event()
self.metrics_timeline = []
self.start_time = None
def _monitor_loop(self):
"""Background thread function to collect system metrics."""
while not self.stop_event.is_set():
@ -32,20 +39,20 @@ class SystemMonitor:
metrics["relative_time"] = time.time() - self.start_time
self.metrics_queue.put(metrics)
time.sleep(self.interval)
def start(self):
"""Start the monitoring thread."""
self.start_time = time.time()
self.monitor_thread = threading.Thread(target=self._monitor_loop)
self.monitor_thread.daemon = True
self.monitor_thread.start()
def stop(self):
"""Stop the monitoring thread and collect final metrics."""
self.stop_event.set()
if hasattr(self, "monitor_thread"):
self.monitor_thread.join(timeout=2)
# Collect all metrics from queue
while True:
try:
@ -53,23 +60,24 @@ class SystemMonitor:
self.metrics_timeline.append(metrics)
except queue.Empty:
break
return self.metrics_timeline
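# Usage sketch for the monitor (surrounding code is hypothetical): start it
# before the workload and collect the per-interval metric dicts afterwards.
#
#   monitor = SystemMonitor(interval=1.0)
#   monitor.start()
#   ...  # run the benchmark workload
#   timeline = monitor.stop()  # list of metric dicts incl. "relative_time"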
def main():
# Initialize system monitor
monitor = SystemMonitor(interval=1.0) # 1 second interval
# Set prefix for output files (e.g. "gpu", "cpu", "onnx", etc.)
prefix = "gpu"
prefix = "cpu"
# Generate token sizes
if "gpu" in prefix:
    token_sizes = generate_token_sizes(
        max_tokens=1000, dense_step=150, dense_max=1000, sparse_step=1000
    )
elif "cpu" in prefix:
    token_sizes = generate_token_sizes(
        max_tokens=1000, dense_step=100, dense_max=500, sparse_step=250
    )
else:
token_sizes = generate_token_sizes(max_tokens=3000)
@ -78,7 +86,7 @@ def main():
output_dir = os.path.join(script_dir, "output_audio")
output_data_dir = os.path.join(script_dir, "output_data")
output_plots_dir = os.path.join(script_dir, "output_plots")
# Create output directories
os.makedirs(output_dir, exist_ok=True)
os.makedirs(output_data_dir, exist_ok=True)
@ -90,7 +98,9 @@ def main():
filename = f"{prefix}_{filename}"
return os.path.join(path, filename)
with open(
    os.path.join(script_dir, "the_time_machine_hg_wells.txt"), "r", encoding="utf-8"
) as f:
text = f.read()
total_tokens = len(enc.encode(text))
@ -100,7 +110,7 @@ def main():
results = []
test_start_time = time.time()
# Start system monitoring
monitor.start()
@ -114,7 +124,8 @@ def main():
processing_time, audio_length = make_tts_request(
chunk,
output_dir=output_dir,
prefix=prefix,
stream=False, # Use non-streaming mode for RTF benchmarking
)
if processing_time is None or audio_length is None:
print("Breaking loop due to error")
@ -123,14 +134,16 @@ def main():
# Calculate RTF using the correct formula
rtf = real_time_factor(processing_time, audio_length)
print(f"Real-Time Factor: {rtf:.5f}")
results.append(
    {
        "tokens": actual_tokens,
        "processing_time": processing_time,
        "output_length": audio_length,
        "rtf": rtf,
        "elapsed_time": round(time.time() - test_start_time, 5),
    }
)
df = pd.DataFrame(results)
if df.empty:
@ -144,89 +157,101 @@ def main():
{
"title": "Benchmark Statistics (with correct RTF)",
"stats": {
"Total tokens processed": df["tokens"].sum(),
"Total audio generated (s)": df["output_length"].sum(),
"Total test duration (s)": df["elapsed_time"].max(),
"Average processing rate (tokens/s)": df["tokens_per_second"].mean(),
"Average RTF": df["rtf"].mean(),
"Average Real Time Speed": 1 / df["rtf"].mean(),
},
},
{
"title": "Per-chunk Stats",
"stats": {
"Average chunk size (tokens)": df["tokens"].mean(),
"Min chunk size (tokens)": df["tokens"].min(),
"Max chunk size (tokens)": df["tokens"].max(),
"Average processing time (s)": df["processing_time"].mean(),
"Average output length (s)": df["output_length"].mean(),
},
},
{
"title": "Performance Ranges",
"stats": {
"Processing rate range (tokens/s)": f"{df['tokens_per_second'].min():.2f} - {df['tokens_per_second'].max():.2f}",
"RTF range": f"{df['rtf'].min():.2f}x - {df['rtf'].max():.2f}x",
"Real Time Speed range": f"{1/df['rtf'].max():.2f}x - {1/df['rtf'].min():.2f}x",
},
},
]
write_benchmark_stats(
stats, prefix_path(output_data_dir, "benchmark_stats_rtf.txt")
)
# Plot Processing Time vs Token Count
plot_correlation(
df,
"tokens",
"processing_time",
"Processing Time vs Input Size",
"Number of Input Tokens",
"Processing Time (seconds)",
prefix_path(output_plots_dir, "processing_time_rtf.png"),
)
# Plot RTF vs Token Count
plot_correlation(
df,
"tokens",
"rtf",
"Real-Time Factor vs Input Size",
"Number of Input Tokens",
"Real-Time Factor (processing time / audio length)",
prefix_path(output_plots_dir, "realtime_factor_rtf.png"),
)
# Stop monitoring and get final metrics
final_metrics = monitor.stop()
# Convert metrics timeline to DataFrame for stats
metrics_df = pd.DataFrame(final_metrics)
# Add system usage stats
if not metrics_df.empty:
stats.append(
    {
        "title": "System Usage Statistics",
        "stats": {
            "Peak CPU Usage (%)": metrics_df["cpu_percent"].max(),
            "Avg CPU Usage (%)": metrics_df["cpu_percent"].mean(),
            "Peak RAM Usage (%)": metrics_df["ram_percent"].max(),
            "Avg RAM Usage (%)": metrics_df["ram_percent"].mean(),
            "Peak RAM Used (GB)": metrics_df["ram_used_gb"].max(),
            "Avg RAM Used (GB)": metrics_df["ram_used_gb"].mean(),
        },
    }
)
if "gpu_memory_used" in metrics_df:
    stats[-1]["stats"].update(
        {
            "Peak GPU Memory (MB)": metrics_df["gpu_memory_used"].max(),
            "Avg GPU Memory (MB)": metrics_df["gpu_memory_used"].mean(),
        }
    )
# Plot system metrics
plot_system_metrics(
final_metrics, prefix_path(output_plots_dir, "system_usage_rtf.png")
)
# Save final results
save_json_results(
{
"results": results,
"system_metrics": final_metrics,
"test_duration": time.time() - test_start_time,
},
prefix_path(output_data_dir, "benchmark_results_rtf.json"),
)
print("\nResults saved to:")

View file

@ -1,19 +1,30 @@
import os
import sys
import json
import time
import pandas as pd
from examples.assorted_checks.lib.shared_utils import (
    save_json_results,
    get_system_metrics,
    write_benchmark_stats,
)
from examples.assorted_checks.lib.shared_plotting import (
    plot_correlation,
    plot_system_metrics,
)
from examples.assorted_checks.lib.shared_benchmark_utils import (
    enc,
    make_tts_request,
    get_text_for_tokens,
    generate_token_sizes,
)
def main():
# Get optional prefix from first command line argument
prefix = sys.argv[1] if len(sys.argv) > 1 else ""
# Set up paths relative to this file
@ -21,7 +32,7 @@ def main():
output_dir = os.path.join(script_dir, "output_audio")
output_data_dir = os.path.join(script_dir, "output_data")
output_plots_dir = os.path.join(script_dir, "output_plots")
# Create output directories
os.makedirs(output_dir, exist_ok=True)
os.makedirs(output_data_dir, exist_ok=True)
@ -43,7 +54,6 @@ def main():
total_tokens = len(enc.encode(text))
print(f"Total tokens in file: {total_tokens}")
token_sizes = generate_token_sizes(total_tokens)
print(f"Testing sizes: {token_sizes}")
@ -85,7 +95,7 @@ def main():
# Save intermediate results
save_json_results(
{"results": results, "system_metrics": system_metrics},
prefix_path(output_data_dir, "benchmark_results.json"),
)
# Create DataFrame and calculate stats
@ -102,53 +112,59 @@ def main():
{
"title": "Benchmark Statistics",
"stats": {
"Total tokens processed": df["tokens"].sum(),
"Total audio generated (s)": df["output_length"].sum(),
"Total test duration (s)": df["elapsed_time"].max(),
"Average processing rate (tokens/s)": df["tokens_per_second"].mean(),
"Average realtime factor": df["realtime_factor"].mean(),
},
},
{
"title": "Per-chunk Stats",
"stats": {
"Average chunk size (tokens)": df["tokens"].mean(),
"Min chunk size (tokens)": df["tokens"].min(),
"Max chunk size (tokens)": df["tokens"].max(),
"Average processing time (s)": df["processing_time"].mean(),
"Average output length (s)": df["output_length"].mean(),
},
},
{
"title": "Performance Ranges",
"stats": {
"Processing rate range (tokens/s)": f"{df['tokens_per_second'].min():.2f} - {df['tokens_per_second'].max():.2f}",
"Realtime factor range": f"{df['realtime_factor'].min():.2f}x - {df['realtime_factor'].max():.2f}x",
},
},
]
write_benchmark_stats(stats, prefix_path(output_data_dir, "benchmark_stats.txt"))
# Plot Processing Time vs Token Count
plot_correlation(
df,
"tokens",
"processing_time",
"Processing Time vs Input Size",
"Number of Input Tokens",
"Processing Time (seconds)",
prefix_path(output_plots_dir, "processing_time.png"),
)
# Plot Realtime Factor vs Token Count
plot_correlation(
df,
"tokens",
"realtime_factor",
"Realtime Factor vs Input Size",
"Number of Input Tokens",
"Realtime Factor (output length / processing time)",
prefix_path(output_plots_dir, "realtime_factor.png"),
)
# Plot system metrics
plot_system_metrics(
system_metrics, prefix_path(output_plots_dir, "system_usage.png")
)
print("\nResults saved to:")
print(f"- {prefix_path(output_data_dir, 'benchmark_results.json')}")

View file

@ -1,11 +1,12 @@
"""Shared utilities specific to TTS benchmarking."""
import time
from typing import List, Tuple, Optional
import requests
import tiktoken
from .shared_utils import save_audio_file, get_audio_length
# Global tokenizer instance
enc = tiktoken.get_encoding("cl100k_base")
@ -13,11 +14,11 @@ enc = tiktoken.get_encoding("cl100k_base")
def get_text_for_tokens(text: str, num_tokens: int) -> str:
"""Get a slice of text that contains exactly num_tokens tokens.
Args:
text: Input text to slice
num_tokens: Desired number of tokens
Returns:
str: Text slice containing exactly num_tokens tokens
"""
@ -31,44 +32,69 @@ def make_tts_request(
text: str,
output_dir: str = None,
timeout: int = 1800,
prefix: str = "",
stream: bool = True,
) -> Tuple[Optional[float], Optional[float]]:
"""Make TTS request using OpenAI-compatible endpoint.
Args:
text: Input text to convert to speech
output_dir: Directory to save audio files. If None, audio won't be saved.
timeout: Request timeout in seconds
prefix: Optional prefix for output filenames
Returns:
tuple: (processing_time, audio_length) in seconds, or (None, None) on error
"""
try:
start_time = time.time()
if stream:
# For streaming, we need to collect all chunks
audio_chunks = []
response = requests.post(
"http://localhost:8880/v1/audio/speech",
json={
"model": "kokoro",
"input": text,
"voice": "af",
"response_format": "wav",
"stream": True,
},
timeout=timeout,
stream=True,
)
response.raise_for_status()
for chunk in response.iter_content(chunk_size=8192):
if chunk:
audio_chunks.append(chunk)
# Combine all chunks
audio_data = b"".join(audio_chunks)
else:
response = requests.post(
"http://localhost:8880/v1/audio/speech",
json={
"model": "kokoro",
"input": text,
"voice": "af",
"response_format": "wav",
"stream": False,
},
timeout=timeout,
)
response.raise_for_status()
audio_data = response.content
processing_time = round(time.time() - start_time, 2)
# Calculate audio length from audio data
audio_length = get_audio_length(audio_data)
# Save the audio file if output_dir is provided
if output_dir:
token_count = len(enc.encode(text))
output_file = save_audio_file(
audio_data, f"chunk_{token_count}_tokens", output_dir
)
print(f"Saved audio to {output_file}")
@ -86,26 +112,26 @@ def generate_token_sizes(
max_tokens: int,
dense_step: int = 100,
dense_max: int = 1000,
sparse_step: int = 1000,
) -> List[int]:
"""Generate token size ranges with dense sampling at start.
Args:
max_tokens: Maximum number of tokens to generate sizes up to
dense_step: Step size for dense sampling range
dense_max: Maximum value for dense sampling
sparse_step: Step size for sparse sampling range
Returns:
list: Sorted list of token sizes
"""
# Dense sampling at start
dense_range = list(range(dense_step, dense_max + 1, dense_step))
if max_tokens <= dense_max or sparse_step < dense_max:
return sorted(dense_range)
# Sparse sampling for larger sizes
sparse_range = list(range(dense_max + sparse_step, max_tokens + 1, sparse_step))
# Combine and deduplicate
return sorted(list(set(dense_range + sparse_range)))
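# Worked example (hypothetical arguments): generate_token_sizes(
#     max_tokens=3000, dense_step=100, dense_max=1000, sparse_step=1000)
# returns [100, 200, ..., 900, 1000] from the dense range plus [2000, 3000]
# from the sparse range: dense sampling up to 1000 tokens, then every 1000.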

View file

@ -1,7 +1,8 @@
"""Shared plotting utilities for benchmarks and tests."""
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.patches as patches
@ -12,66 +13,71 @@ STYLE_CONFIG = {
"secondary_color": "#05d9e8",
"grid_color": "#ffffff",
"text_color": "#ffffff",
"font_sizes": {"title": 16, "label": 14, "tick": 12, "text": 10},
}
def setup_plot(fig, ax, title, xlabel=None, ylabel=None):
"""Configure plot styling with consistent theme.
Args:
fig: matplotlib figure object
ax: matplotlib axis object
title: str, plot title
xlabel: str, optional x-axis label
ylabel: str, optional y-axis label
Returns:
tuple: (fig, ax) with applied styling
"""
# Grid styling
ax.grid(True, linestyle="--", alpha=0.3, color=STYLE_CONFIG["grid_color"])
# Title and labels
ax.set_title(
title,
pad=20,
fontsize=STYLE_CONFIG["font_sizes"]["title"],
fontweight="bold",
color=STYLE_CONFIG["text_color"],
)
if xlabel:
ax.set_xlabel(
xlabel,
fontsize=STYLE_CONFIG["font_sizes"]["label"],
fontweight="medium",
color=STYLE_CONFIG["text_color"],
)
if ylabel:
ax.set_ylabel(
ylabel,
fontsize=STYLE_CONFIG["font_sizes"]["label"],
fontweight="medium",
color=STYLE_CONFIG["text_color"],
)
# Tick styling
ax.tick_params(
labelsize=STYLE_CONFIG["font_sizes"]["tick"], colors=STYLE_CONFIG["text_color"]
)
# Spine styling
for spine in ax.spines.values():
spine.set_color(STYLE_CONFIG["text_color"])
spine.set_alpha(0.3)
spine.set_linewidth(0.5)
# Background colors
ax.set_facecolor(STYLE_CONFIG["background_color"])
fig.patch.set_facecolor(STYLE_CONFIG["background_color"])
return fig, ax
def plot_system_metrics(metrics_data, output_path):
"""Create plots for system metrics over time.
Args:
metrics_data: list of dicts containing system metrics
output_path: str, path to save the output plot
@ -79,68 +85,118 @@ def plot_system_metrics(metrics_data, output_path):
df = pd.DataFrame(metrics_data)
df["timestamp"] = pd.to_datetime(df["timestamp"])
elapsed_time = (df["timestamp"] - df["timestamp"].iloc[0]).dt.total_seconds()
# Get baseline values
baseline_cpu = df["cpu_percent"].iloc[0]
baseline_ram = df["ram_used_gb"].iloc[0]
baseline_gpu = (
df["gpu_memory_used"].iloc[0] / 1024
if "gpu_memory_used" in df.columns
else None
)
# Convert GPU memory to GB if present
if "gpu_memory_used" in df.columns:
df["gpu_memory_gb"] = df["gpu_memory_used"] / 1024
plt.style.use("dark_background")
# Create subplots based on available metrics
has_gpu = "gpu_memory_used" in df.columns
num_plots = 3 if has_gpu else 2
fig, axes = plt.subplots(num_plots, 1, figsize=(15, 5 * num_plots))
fig.patch.set_facecolor(STYLE_CONFIG["background_color"])
# Smoothing window
window = min(5, len(df) // 2)
# Plot CPU Usage
smoothed_cpu = df["cpu_percent"].rolling(window=window, center=True).mean()
sns.lineplot(
x=elapsed_time,
y=smoothed_cpu,
ax=axes[0],
color=STYLE_CONFIG["primary_color"],
linewidth=2,
)
axes[0].axhline(
y=baseline_cpu,
color=STYLE_CONFIG["secondary_color"],
linestyle="--",
alpha=0.5,
label="Baseline",
)
setup_plot(
fig,
axes[0],
"CPU Usage Over Time",
xlabel="Time (seconds)",
ylabel="CPU Usage (%)",
)
axes[0].set_ylim(0, max(df["cpu_percent"]) * 1.1)
axes[0].legend()
# Plot RAM Usage
smoothed_ram = df["ram_used_gb"].rolling(window=window, center=True).mean()
sns.lineplot(
x=elapsed_time,
y=smoothed_ram,
ax=axes[1],
color=STYLE_CONFIG["secondary_color"],
linewidth=2,
)
axes[1].axhline(
y=baseline_ram,
color=STYLE_CONFIG["primary_color"],
linestyle="--",
alpha=0.5,
label="Baseline",
)
setup_plot(
fig,
axes[1],
"RAM Usage Over Time",
xlabel="Time (seconds)",
ylabel="RAM Usage (GB)",
)
axes[1].set_ylim(0, max(df["ram_used_gb"]) * 1.1)
axes[1].legend()
# Plot GPU Memory if available
if has_gpu:
smoothed_gpu = df["gpu_memory_gb"].rolling(window=window, center=True).mean()
sns.lineplot(
x=elapsed_time,
y=smoothed_gpu,
ax=axes[2],
color=STYLE_CONFIG["primary_color"],
linewidth=2,
)
axes[2].axhline(
y=baseline_gpu,
color=STYLE_CONFIG["secondary_color"],
linestyle="--",
alpha=0.5,
label="Baseline",
)
setup_plot(
fig,
axes[2],
"GPU Memory Usage Over Time",
xlabel="Time (seconds)",
ylabel="GPU Memory (GB)",
)
axes[2].set_ylim(0, max(df["gpu_memory_gb"]) * 1.1)
axes[2].legend()
plt.tight_layout()
plt.savefig(output_path, dpi=300, bbox_inches="tight")
plt.close()
def plot_timeline(df, output_path, suffix="", prefix=""):
"""Create timeline plot showing latency for each run.
Args:
df: pandas DataFrame containing run data with columns:
- target_tokens: number of tokens
@ -149,124 +205,161 @@ def plot_timeline(df, output_path, suffix=""):
output_path: str, path to save the output plot
"""
plt.style.use("dark_background")
# Sort by tokens and run number
df = df.sort_values(["target_tokens", "run_number"])
# Create figure and axis
fig, ax = plt.subplots(figsize=(12, 6))
# Calculate y positions for each run with tighter grouping
unique_tokens = sorted(df["target_tokens"].unique())
y_positions = {}
current_y = 0
group_spacing = 0.8 # Space between groups
run_spacing = 0.2 # Space between runs in a group
for tokens in unique_tokens:
runs = df[df["target_tokens"] == tokens]
base_y = current_y
for i, (_, run) in enumerate(runs.iterrows()):
y_positions[(tokens, run["run_number"])] = base_y + (i * run_spacing)
current_y = base_y + (len(runs) * run_spacing) + group_spacing
# Plot bars and points with more transparency
bar_height = 0.15
for _, row in df.iterrows():
y = y_positions[(row["target_tokens"], row["run_number"])]
latency = row["time_to_first_chunk"]
# Latency bar
ax.add_patch(
patches.Rectangle(
(0, y - bar_height / 2),
latency,
bar_height,
facecolor=STYLE_CONFIG["primary_color"],
alpha=0.3,
)
)
# End point
ax.plot(
latency,
y,
"o",
color=STYLE_CONFIG["secondary_color"],
markersize=4,
alpha=0.5,
)
# Add mean lines and values for each token group
for tokens in unique_tokens:
token_runs = df[df["target_tokens"] == tokens]
mean_latency = token_runs["time_to_first_chunk"].mean()
y_positions_for_token = [
y_positions[(tokens, run["run_number"])] for _, run in token_runs.iterrows()
]
min_y = min(y_positions_for_token)
max_y = max(y_positions_for_token)
group_center = (min_y + max_y) / 2
# Plot mean line with gradient alpha
gradient = np.linspace(0.2, 0.8, 100)
for i in range(len(gradient) - 1):
y1 = (
min_y
- bar_height
+ (max_y - min_y + 2 * bar_height) * (i / len(gradient))
)
y2 = (
min_y
- bar_height
+ (max_y - min_y + 2 * bar_height) * ((i + 1) / len(gradient))
)
ax.plot(
[mean_latency, mean_latency],
[y1, y2],
"-",
color=STYLE_CONFIG["secondary_color"],
linewidth=3,
alpha=gradient[i],
)
# Add mean value label with background
label_text = f"Mean: {mean_latency:.3f}s"
bbox_props = dict(
facecolor=STYLE_CONFIG["background_color"],
edgecolor=STYLE_CONFIG["secondary_color"],
alpha=0.8,
pad=3,
linewidth=1,
)
ax.text(
mean_latency + 0.02,
group_center,
label_text,
color=STYLE_CONFIG["secondary_color"],
va="center",
fontsize=10,
fontweight="bold",
bbox=bbox_props,
)
# Customize plot
ax.set_ylim(-1, current_y)
ax.set_xlim(0, df["time_to_first_chunk"].max() * 1.3) # Extra space for labels
# Add labels for token groups with tighter spacing
group_positions = {}
for tokens in unique_tokens:
runs = df[df["target_tokens"] == tokens]
y_positions_for_token = [
y_positions[(tokens, run["run_number"])] for _, run in runs.iterrows()
]
group_positions[tokens] = sum(y_positions_for_token) / len(
y_positions_for_token
)
plt.axhline(
y=min(y_positions_for_token) - bar_height,
color="white",
alpha=0.1,
linestyle="-",
)
# Calculate mean audio length for each token group
audio_lengths = {}
for tokens in unique_tokens:
token_runs = df[df["target_tokens"] == tokens]
audio_lengths[tokens] = token_runs["audio_length"].mean()
# Set y-ticks at group centers with token counts and audio lengths
plt.yticks(
list(group_positions.values()),
[
f"{tokens} tokens\n({audio_lengths[tokens]:.1f}s)"
for tokens in group_positions.keys()
],
fontsize=10,
)
# Customize appearance
setup_plot(
fig,
ax,
prefix.upper() + " Time-To-Audio Latency " + suffix,
xlabel="Time (seconds)",
ylabel="Input Size",
)
plt.tight_layout()
plt.savefig(output_path, dpi=300, bbox_inches="tight")
plt.close()
def plot_correlation(df, x, y, title, xlabel, ylabel, output_path):
"""Create correlation plot with regression line and correlation coefficient.
Args:
df: pandas DataFrame containing the data
x: str, column name for x-axis
@ -277,28 +370,40 @@ def plot_correlation(df, x, y, title, xlabel, ylabel, output_path):
output_path: str, path to save the output plot
"""
plt.style.use("dark_background")
fig, ax = plt.subplots(figsize=(12, 8))
# Scatter plot
sns.scatterplot(
data=df, x=x, y=y, s=100, alpha=0.6, color=STYLE_CONFIG["primary_color"]
)
# Regression line
sns.regplot(
data=df,
x=x,
y=y,
scatter=False,
color=STYLE_CONFIG["secondary_color"],
line_kws={"linewidth": 2},
)
# Add correlation coefficient
corr = df[x].corr(df[y])
plt.text(
0.05,
0.95,
f"Correlation: {corr:.2f}",
transform=ax.transAxes,
fontsize=STYLE_CONFIG["font_sizes"]["text"],
color=STYLE_CONFIG["text_color"],
bbox=dict(
facecolor=STYLE_CONFIG["background_color"],
edgecolor=STYLE_CONFIG["text_color"],
alpha=0.7,
),
)
setup_plot(fig, ax, title, xlabel=xlabel, ylabel=ylabel)
plt.savefig(output_path, dpi=300, bbox_inches="tight")
plt.close()
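# Usage sketch (columns and path are illustrative): given a DataFrame with
# numeric "tokens" and "processing_time" columns, this draws the themed
# scatter with a regression line and annotates the Pearson correlation:
#
#   plot_correlation(df, "tokens", "processing_time",
#                    "Processing Time vs Input Size", "Tokens", "Seconds",
#                    "output_plots/processing_time.png")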

View file

@ -1,9 +1,10 @@
"""Shared utilities for benchmarks and tests."""
import os
import json
import subprocess
from datetime import datetime
from typing import Any, Dict, List, Optional, Union
import psutil
import scipy.io.wavfile as wavfile
@ -12,28 +13,46 @@ import scipy.io.wavfile as wavfile
TORCH_AVAILABLE = False
try:
import torch
TORCH_AVAILABLE = torch.cuda.is_available()
except ImportError:
pass
def check_audio_file_is_silent(audio_path: str, threshold: float = 0.01) -> bool:
"""Check if an audio file is silent by comparing peak amplitude to a threshold.
Args:
audio_path: Path to the audio file
threshold: Peak amplitude threshold for silence
Returns:
bool: True if audio is silent, False otherwise
"""
rate, data = wavfile.read(audio_path)
peak_amplitude = max(abs(data.min()), abs(data.max())) / 32768.0 # 16-bit audio
return peak_amplitude < threshold
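# Example with hypothetical numbers: an int16 file peaking at 164 counts gives
# peak_amplitude = 164 / 32768 ≈ 0.005, below the default 0.01 threshold, so
# the file is reported as silent.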
def get_audio_length(audio_data: bytes, temp_dir: str = None) -> float:
"""Get audio length in seconds from bytes data.
Args:
audio_data: Raw audio bytes
temp_dir: Directory for temporary file. If None, uses system temp directory.
Returns:
float: Audio length in seconds
"""
if temp_dir is None:
import tempfile
temp_dir = tempfile.gettempdir()
temp_path = os.path.join(temp_dir, "temp.wav")
os.makedirs(temp_dir, exist_ok=True)
with open(temp_path, "wb") as f:
f.write(audio_data)
@ -47,11 +66,11 @@ def get_audio_length(audio_data: bytes, temp_dir: str = None) -> float:
def get_gpu_memory(average: bool = True) -> Optional[Union[float, List[float]]]:
"""Get GPU memory usage using PyTorch if available, falling back to nvidia-smi.
Args:
average: If True and multiple GPUs present, returns average memory usage.
If False, returns list of memory usage per GPU.
Returns:
float or List[float] or None: GPU memory usage in MB. Returns None if no GPU available.
If average=False and multiple GPUs present, returns list of values.
@ -60,19 +79,23 @@ def get_gpu_memory(average: bool = True) -> Optional[Union[float, List[float]]]:
n_gpus = torch.cuda.device_count()
memory_used = []
for i in range(n_gpus):
memory_used.append(
torch.cuda.memory_allocated(i) / 1024**2
) # Convert to MB
if average and len(memory_used) > 0:
return sum(memory_used) / len(memory_used)
return memory_used if len(memory_used) > 1 else memory_used[0]
# Fall back to nvidia-smi
try:
result = subprocess.check_output(
["nvidia-smi", "--query-gpu=memory.used", "--format=csv,nounits,noheader"]
)
memory_values = [
float(x.strip()) for x in result.decode("utf-8").split("\n") if x.strip()
]
if average and len(memory_values) > 0:
return sum(memory_values) / len(memory_values)
return memory_values if len(memory_values) > 1 else memory_values[0]
@ -82,14 +105,14 @@ def get_gpu_memory(average: bool = True) -> Optional[Union[float, List[float]]]:
def get_system_metrics() -> Dict[str, Union[str, float]]:
"""Get current system metrics including CPU, RAM, and GPU if available.
Returns:
dict: System metrics including timestamp, CPU%, RAM%, RAM GB, and GPU MB if available
"""
# Get per-CPU percentages and calculate average
cpu_percentages = psutil.cpu_percent(percpu=True)
avg_cpu = sum(cpu_percentages) / len(cpu_percentages)
metrics = {
"timestamp": datetime.now().isoformat(),
"cpu_percent": round(avg_cpu, 2),
@ -106,40 +129,40 @@ def get_system_metrics() -> Dict[str, Union[str, float]]:
def save_audio_file(audio_data: bytes, identifier: str, output_dir: str) -> str:
"""Save audio data to a file with proper naming and directory creation.
Args:
audio_data: Raw audio bytes
identifier: String to identify this audio file (e.g. token count, test name)
output_dir: Directory to save the file
Returns:
str: Path to the saved audio file
"""
os.makedirs(output_dir, exist_ok=True)
output_file = os.path.join(output_dir, f"{identifier}.wav")
with open(output_file, "wb") as f:
f.write(audio_data)
return output_file
def write_benchmark_stats(stats: List[Dict[str, Any]], output_file: str) -> None:
"""Write benchmark statistics to a file in a clean, organized format.
Args:
stats: List of dictionaries containing stat name/value pairs
output_file: Path to output file
"""
os.makedirs(os.path.dirname(output_file), exist_ok=True)
with open(output_file, "w") as f:
for section in stats:
# Write section header
f.write(f"=== {section['title']} ===\n\n")
# Write stats
for label, value in section["stats"].items():
if isinstance(value, float):
f.write(f"{label}: {value:.2f}\n")
else:
@ -149,7 +172,7 @@ def write_benchmark_stats(stats: List[Dict[str, Any]], output_file: str) -> None
def save_json_results(results: Dict[str, Any], output_file: str) -> None:
"""Save benchmark results to a JSON file with proper formatting.
Args:
results: Dictionary of results to save
output_file: Path to output file
@ -159,14 +182,16 @@ def save_json_results(results: Dict[str, Any], output_file: str) -> None:
json.dump(results, f, indent=2)
def real_time_factor(
processing_time: float, audio_length: float, decimals: int = 2
) -> float:
"""Calculate Real-Time Factor (RTF) as processing-time / length-of-audio.
Args:
processing_time: Time taken to process/generate audio
audio_length: Length of the generated audio
decimals: Number of decimal places to round to
Returns:
float: RTF value
"""

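# Worked example (hypothetical numbers): 15 s of processing time for 60 s of
# generated audio gives RTF = 15 / 60 = 0.25, i.e. about 4x faster than real
# time; an RTF above 1.0 means generation is slower than playback.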
View file

@ -0,0 +1,205 @@
#!/usr/bin/env python3
import os
import time
from typing import Any, Dict, List, Callable
import pandas as pd
import scipy.io.wavfile as wavfile
from .shared_utils import save_json_results
from .shared_plotting import plot_timeline, plot_correlation
from .shared_benchmark_utils import enc, get_text_for_tokens
def check_audio_silence(audio_path: str) -> bool:
"""Check if audio file contains only silence"""
sample_rate, audio_data = wavfile.read(audio_path)
# Convert to float for RMS calculation
audio_float = audio_data.astype(float)
# Calculate RMS value
rms = (audio_float**2).mean() ** 0.5
# Define silence threshold (adjust if needed)
SILENCE_THRESHOLD = 50.0
return rms < SILENCE_THRESHOLD
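# Note: RMS is computed on raw int16 sample values, so the 50.0 threshold is
# roughly 0.15% of full scale (50 / 32768); true digital silence has RMS 0.0.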
def process_benchmark_results(
all_results: List[Dict[str, Any]], token_sizes: List[int]
) -> Dict[str, Any]:
"""Process benchmark results and generate summary"""
summary = {}
for tokens in token_sizes:
matching_results = [
r for r in all_results if r["target_tokens"] == tokens and not r["error"]
]
if matching_results:
avg_first_chunk = sum(
r["time_to_first_chunk"] for r in matching_results
) / len(matching_results)
avg_total = sum(r["total_time"] for r in matching_results) / len(
matching_results
)
avg_audio_length = sum(r["audio_length"] for r in matching_results) / len(
matching_results
)
summary[tokens] = {
"avg_time_to_first_chunk": round(avg_first_chunk, 3),
"avg_total_time": round(avg_total, 3),
"avg_audio_length": round(avg_audio_length, 3),
"num_successful_runs": len(matching_results),
}
return summary
def save_benchmark_results(
all_results: List[Dict[str, Any]],
summary: Dict[str, Any],
output_data_dir: str,
output_plots_dir: str,
suffix: str,
plot_title_suffix: str,
prefix: str = "",
):
"""Save benchmark results and generate plots"""
# Save results
results_data = {
"individual_runs": all_results,
"summary": summary,
"timestamp": time.strftime("%Y-%m-%d %H:%M:%S"),
}
save_json_results(
results_data,
os.path.join(output_data_dir, f"{prefix}first_token_benchmark{suffix}.json"),
)
# Create DataFrame for plotting
df = pd.DataFrame(all_results)
# Create plots
plot_correlation(
df,
"target_tokens",
"time_to_first_chunk",
f"Time to First Audio vs Input Size {plot_title_suffix}",
"Number of Input Tokens",
"Time to First Audio (seconds)",
os.path.join(output_plots_dir, f"{prefix}first_token_latency{suffix}.png"),
)
plot_correlation(
df,
"target_tokens",
"total_time",
f"Total Time vs Input Size {plot_title_suffix}",
"Number of Input Tokens",
"Total Time (seconds)",
os.path.join(output_plots_dir, f"{prefix}total_time_latency{suffix}.png"),
)
plot_timeline(
df,
os.path.join(output_plots_dir, f"{prefix}first_token_timeline{suffix}.png"),
suffix=plot_title_suffix,
)
def run_benchmark(
measure_func: Callable,
output_dir: str,
output_data_dir: str,
output_plots_dir: str,
suffix: str = "",
plot_title_suffix: str = "",
num_runs: int = 5,
client=None,
prefix="",
):
"""Run benchmark with the given measurement function"""
# Create output directories
os.makedirs(output_dir, exist_ok=True)
os.makedirs(output_data_dir, exist_ok=True)
os.makedirs(output_plots_dir, exist_ok=True)
# Load sample text
script_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
with open(
os.path.join(script_dir, "the_time_machine_hg_wells.txt"), "r", encoding="utf-8"
) as f:
text = f.read()
# Test specific token counts
token_sizes = [10, 50, 100, 250, 500]
all_results = []
silent_files = []
for tokens in token_sizes:
print(
f"\nTesting {tokens} tokens{' ' + plot_title_suffix if plot_title_suffix else ''}"
)
test_text = get_text_for_tokens(text, tokens)
actual_tokens = len(enc.encode(test_text))
print(f"Text preview: {test_text[:50]}...")
for i in range(num_runs):
print(f"Run {i+1}/{num_runs}...")
result = measure_func(test_text, output_dir, tokens, i + 1)
result["target_tokens"] = tokens
result["actual_tokens"] = actual_tokens
result["run_number"] = i + 1
# Handle time to first audio
first_chunk = result.get('time_to_first_chunk')
print(
f"Time to First Audio: {f'{first_chunk:.3f}s' if first_chunk is not None else 'N/A'}"
)
# Handle total time
total_time = result.get('total_time')
print(
f"Time to Save Complete: {f'{total_time:.3f}s' if total_time is not None else 'N/A'}"
)
# Handle audio length
audio_length = result.get('audio_length')
print(
f"Audio length: {f'{audio_length:.3f}s' if audio_length is not None else 'N/A'}"
)
# Calculate streaming overhead only if both values exist
if total_time is not None and first_chunk is not None:
print(f"Streaming overhead: {(total_time - first_chunk):.3f}s")
else:
print("Streaming overhead: N/A")
if result["error"]:
print(f"Error: {result['error']}")
elif result["audio_path"] and check_audio_silence(result["audio_path"]):
silent_files.append(result["audio_path"])
all_results.append(result)
# Process and save results
summary = process_benchmark_results(all_results, token_sizes)
save_benchmark_results(
    all_results,
    summary,
    output_data_dir,
    output_plots_dir,
    suffix,
    plot_title_suffix,
    prefix=prefix,  # keep saved filenames in sync with the paths printed below
)
# Print paths
print("\nResults and plots saved to:")
print(f"- {os.path.join(output_data_dir, f'{prefix}first_token_benchmark{suffix}.json')}")
print(f"- {os.path.join(output_plots_dir, f'{prefix}first_token_latency{suffix}.png')}")
print(f"- {os.path.join(output_plots_dir, f'{prefix}total_time_latency{suffix}.png')}")
print(f"- {os.path.join(output_plots_dir, f'{prefix}first_token_timeline{suffix}.png')}")
# Print silence check summary
if silent_files:
print("\nWARNING: The following files contain only silence:")
for file in silent_files:
print(f"- {file}")
else:
print("\nAll generated audio files contain valid audio content.")
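# Wiring sketch (arguments are illustrative): any measurement function with
# the (text, output_dir, tokens, run_number) -> dict signature used above can
# be plugged in, e.g.:
#
#   run_benchmark(
#       measure_first_token_requests,
#       output_dir="output_audio_stream",
#       output_data_dir="output_data",
#       output_plots_dir="output_plots",
#       suffix="_stream",
#       plot_title_suffix="(Streaming)",
#       prefix="cpu",
#   )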

View file

@ -1,111 +0,0 @@
{
"results": [
{
"tokens": 100,
"processing_time": 18.833295583724976,
"output_length": 31.15,
"realtime_factor": 1.6539856161403135,
"elapsed_time": 19.024322748184204
},
{
"tokens": 200,
"processing_time": 38.95506024360657,
"output_length": 62.6,
"realtime_factor": 1.6069799304257042,
"elapsed_time": 58.21527123451233
},
{
"tokens": 300,
"processing_time": 49.74252939224243,
"output_length": 96.325,
"realtime_factor": 1.9364716908630366,
"elapsed_time": 108.19673728942871
},
{
"tokens": 400,
"processing_time": 61.349056243896484,
"output_length": 128.575,
"realtime_factor": 2.095794261102292,
"elapsed_time": 169.733656167984
},
{
"tokens": 500,
"processing_time": 82.86568236351013,
"output_length": 158.575,
"realtime_factor": 1.9136389815071193,
"elapsed_time": 252.7968451976776
}
],
"system_metrics": [
{
"timestamp": "2025-01-03T00:13:49.865330",
"cpu_percent": 8.0,
"ram_percent": 39.4,
"ram_used_gb": 25.03811264038086,
"gpu_memory_used": 1204.0
},
{
"timestamp": "2025-01-03T00:14:08.781551",
"cpu_percent": 26.8,
"ram_percent": 42.6,
"ram_used_gb": 27.090862274169922,
"gpu_memory_used": 1225.0
},
{
"timestamp": "2025-01-03T00:14:08.916973",
"cpu_percent": 16.1,
"ram_percent": 42.6,
"ram_used_gb": 27.089553833007812,
"gpu_memory_used": 1225.0
},
{
"timestamp": "2025-01-03T00:14:47.979053",
"cpu_percent": 31.5,
"ram_percent": 43.6,
"ram_used_gb": 27.714427947998047,
"gpu_memory_used": 1225.0
},
{
"timestamp": "2025-01-03T00:14:48.098976",
"cpu_percent": 20.0,
"ram_percent": 43.6,
"ram_used_gb": 27.704315185546875,
"gpu_memory_used": 1211.0
},
{
"timestamp": "2025-01-03T00:15:37.944729",
"cpu_percent": 29.7,
"ram_percent": 38.6,
"ram_used_gb": 24.53925323486328,
"gpu_memory_used": 1217.0
},
{
"timestamp": "2025-01-03T00:15:38.071915",
"cpu_percent": 8.6,
"ram_percent": 38.5,
"ram_used_gb": 24.51690673828125,
"gpu_memory_used": 1208.0
},
{
"timestamp": "2025-01-03T00:16:39.525449",
"cpu_percent": 23.4,
"ram_percent": 38.8,
"ram_used_gb": 24.71230697631836,
"gpu_memory_used": 1221.0
},
{
"timestamp": "2025-01-03T00:16:39.612442",
"cpu_percent": 5.5,
"ram_percent": 38.9,
"ram_used_gb": 24.72066879272461,
"gpu_memory_used": 1221.0
},
{
"timestamp": "2025-01-03T00:18:02.569076",
"cpu_percent": 27.4,
"ram_percent": 39.1,
"ram_used_gb": 24.868202209472656,
"gpu_memory_used": 1264.0
}
]
}

View file

@ -1,216 +0,0 @@
{
"results": [
{
"tokens": 100,
"processing_time": 14.349808931350708,
"output_length": 31.15,
"rtf": 0.46,
"elapsed_time": 14.716031074523926
},
{
"tokens": 200,
"processing_time": 28.341803312301636,
"output_length": 62.6,
"rtf": 0.45,
"elapsed_time": 43.44207406044006
},
{
"tokens": 300,
"processing_time": 43.352553606033325,
"output_length": 96.325,
"rtf": 0.45,
"elapsed_time": 87.26906609535217
},
{
"tokens": 400,
"processing_time": 71.02449822425842,
"output_length": 128.575,
"rtf": 0.55,
"elapsed_time": 158.7198133468628
},
{
"tokens": 500,
"processing_time": 70.92521691322327,
"output_length": 158.575,
"rtf": 0.45,
"elapsed_time": 230.01379895210266
},
{
"tokens": 600,
"processing_time": 83.6328592300415,
"output_length": 189.25,
"rtf": 0.44,
"elapsed_time": 314.02610969543457
},
{
"tokens": 700,
"processing_time": 103.0810194015503,
"output_length": 222.075,
"rtf": 0.46,
"elapsed_time": 417.5678551197052
},
{
"tokens": 800,
"processing_time": 127.02162909507751,
"output_length": 253.85,
"rtf": 0.5,
"elapsed_time": 545.0128681659698
},
{
"tokens": 900,
"processing_time": 130.49781227111816,
"output_length": 283.775,
"rtf": 0.46,
"elapsed_time": 675.8943417072296
},
{
"tokens": 1000,
"processing_time": 154.76425909996033,
"output_length": 315.475,
"rtf": 0.49,
"elapsed_time": 831.0677945613861
}
],
"system_metrics": [
{
"timestamp": "2025-01-03T00:23:52.896889",
"cpu_percent": 4.5,
"ram_percent": 39.1,
"ram_used_gb": 24.86032485961914,
"gpu_memory_used": 1281.0
},
{
"timestamp": "2025-01-03T00:24:07.429461",
"cpu_percent": 4.5,
"ram_percent": 39.1,
"ram_used_gb": 24.847564697265625,
"gpu_memory_used": 1285.0
},
{
"timestamp": "2025-01-03T00:24:07.620587",
"cpu_percent": 2.7,
"ram_percent": 39.1,
"ram_used_gb": 24.846607208251953,
"gpu_memory_used": 1275.0
},
{
"timestamp": "2025-01-03T00:24:36.140754",
"cpu_percent": 5.4,
"ram_percent": 39.1,
"ram_used_gb": 24.857810974121094,
"gpu_memory_used": 1267.0
},
{
"timestamp": "2025-01-03T00:24:36.340675",
"cpu_percent": 6.2,
"ram_percent": 39.1,
"ram_used_gb": 24.85773468017578,
"gpu_memory_used": 1267.0
},
{
"timestamp": "2025-01-03T00:25:19.905634",
"cpu_percent": 29.1,
"ram_percent": 39.2,
"ram_used_gb": 24.920318603515625,
"gpu_memory_used": 1256.0
},
{
"timestamp": "2025-01-03T00:25:20.182219",
"cpu_percent": 20.0,
"ram_percent": 39.2,
"ram_used_gb": 24.930198669433594,
"gpu_memory_used": 1256.0
},
{
"timestamp": "2025-01-03T00:26:31.414760",
"cpu_percent": 5.3,
"ram_percent": 39.5,
"ram_used_gb": 25.127891540527344,
"gpu_memory_used": 1259.0
},
{
"timestamp": "2025-01-03T00:26:31.617256",
"cpu_percent": 3.6,
"ram_percent": 39.5,
"ram_used_gb": 25.126346588134766,
"gpu_memory_used": 1252.0
},
{
"timestamp": "2025-01-03T00:27:42.736097",
"cpu_percent": 10.5,
"ram_percent": 39.5,
"ram_used_gb": 25.100231170654297,
"gpu_memory_used": 1249.0
},
{
"timestamp": "2025-01-03T00:27:42.912870",
"cpu_percent": 5.3,
"ram_percent": 39.5,
"ram_used_gb": 25.098285675048828,
"gpu_memory_used": 1249.0
},
{
"timestamp": "2025-01-03T00:29:06.725264",
"cpu_percent": 8.9,
"ram_percent": 39.5,
"ram_used_gb": 25.123123168945312,
"gpu_memory_used": 1239.0
},
{
"timestamp": "2025-01-03T00:29:06.928826",
"cpu_percent": 5.5,
"ram_percent": 39.5,
"ram_used_gb": 25.128646850585938,
"gpu_memory_used": 1239.0
},
{
"timestamp": "2025-01-03T00:30:50.206349",
"cpu_percent": 49.6,
"ram_percent": 39.6,
"ram_used_gb": 25.162948608398438,
"gpu_memory_used": 1245.0
},
{
"timestamp": "2025-01-03T00:30:50.491837",
"cpu_percent": 14.8,
"ram_percent": 39.5,
"ram_used_gb": 25.13379669189453,
"gpu_memory_used": 1245.0
},
{
"timestamp": "2025-01-03T00:32:57.721467",
"cpu_percent": 6.2,
"ram_percent": 39.6,
"ram_used_gb": 25.187721252441406,
"gpu_memory_used": 1384.0
},
{
"timestamp": "2025-01-03T00:32:57.913350",
"cpu_percent": 3.6,
"ram_percent": 39.6,
"ram_used_gb": 25.199390411376953,
"gpu_memory_used": 1384.0
},
{
"timestamp": "2025-01-03T00:35:08.608730",
"cpu_percent": 6.3,
"ram_percent": 39.8,
"ram_used_gb": 25.311710357666016,
"gpu_memory_used": 1330.0
},
{
"timestamp": "2025-01-03T00:35:08.791851",
"cpu_percent": 5.3,
"ram_percent": 39.8,
"ram_used_gb": 25.326683044433594,
"gpu_memory_used": 1333.0
},
{
"timestamp": "2025-01-03T00:37:43.782406",
"cpu_percent": 6.8,
"ram_percent": 40.6,
"ram_used_gb": 25.803058624267578,
"gpu_memory_used": 1409.0
}
]
}

View file

@ -1,300 +0,0 @@
{
"results": [
{
"tokens": 100,
"processing_time": 0.96,
"output_length": 31.1,
"rtf": 0.03,
"elapsed_time": 1.11
},
{
"tokens": 250,
"processing_time": 2.23,
"output_length": 77.17,
"rtf": 0.03,
"elapsed_time": 3.49
},
{
"tokens": 400,
"processing_time": 4.05,
"output_length": 128.05,
"rtf": 0.03,
"elapsed_time": 7.77
},
{
"tokens": 550,
"processing_time": 4.06,
"output_length": 171.45,
"rtf": 0.02,
"elapsed_time": 12.0
},
{
"tokens": 700,
"processing_time": 6.01,
"output_length": 221.6,
"rtf": 0.03,
"elapsed_time": 18.16
},
{
"tokens": 850,
"processing_time": 6.9,
"output_length": 269.1,
"rtf": 0.03,
"elapsed_time": 25.21
},
{
"tokens": 1000,
"processing_time": 7.65,
"output_length": 315.05,
"rtf": 0.02,
"elapsed_time": 33.03
},
{
"tokens": 6000,
"processing_time": 48.7,
"output_length": 1837.1,
"rtf": 0.03,
"elapsed_time": 82.21
},
{
"tokens": 11000,
"processing_time": 92.44,
"output_length": 3388.57,
"rtf": 0.03,
"elapsed_time": 175.46
},
{
"tokens": 16000,
"processing_time": 163.61,
"output_length": 4977.32,
"rtf": 0.03,
"elapsed_time": 340.46
},
{
"tokens": 21000,
"processing_time": 209.72,
"output_length": 6533.3,
"rtf": 0.03,
"elapsed_time": 551.92
},
{
"tokens": 26000,
"processing_time": 329.35,
"output_length": 8068.15,
"rtf": 0.04,
"elapsed_time": 883.37
},
{
"tokens": 31000,
"processing_time": 473.52,
"output_length": 9611.48,
"rtf": 0.05,
"elapsed_time": 1359.28
},
{
"tokens": 36000,
"processing_time": 650.98,
"output_length": 11157.15,
"rtf": 0.06,
"elapsed_time": 2012.9
}
],
"system_metrics": [
{
"timestamp": "2025-01-03T14:41:01.331735",
"cpu_percent": 7.5,
"ram_percent": 50.2,
"ram_used_gb": 31.960269927978516,
"gpu_memory_used": 3191.0
},
{
"timestamp": "2025-01-03T14:41:02.357116",
"cpu_percent": 17.01,
"ram_percent": 50.2,
"ram_used_gb": 31.96163558959961,
"gpu_memory_used": 3426.0
},
{
"timestamp": "2025-01-03T14:41:02.445009",
"cpu_percent": 9.5,
"ram_percent": 50.3,
"ram_used_gb": 31.966781616210938,
"gpu_memory_used": 3426.0
},
{
"timestamp": "2025-01-03T14:41:04.742152",
"cpu_percent": 18.27,
"ram_percent": 50.4,
"ram_used_gb": 32.08788299560547,
"gpu_memory_used": 3642.0
},
{
"timestamp": "2025-01-03T14:41:04.847795",
"cpu_percent": 16.27,
"ram_percent": 50.5,
"ram_used_gb": 32.094364166259766,
"gpu_memory_used": 3640.0
},
{
"timestamp": "2025-01-03T14:41:09.019590",
"cpu_percent": 15.97,
"ram_percent": 50.7,
"ram_used_gb": 32.23244094848633,
"gpu_memory_used": 3640.0
},
{
"timestamp": "2025-01-03T14:41:09.110324",
"cpu_percent": 3.54,
"ram_percent": 50.7,
"ram_used_gb": 32.234458923339844,
"gpu_memory_used": 3640.0
},
{
"timestamp": "2025-01-03T14:41:13.252607",
"cpu_percent": 13.4,
"ram_percent": 50.6,
"ram_used_gb": 32.194271087646484,
"gpu_memory_used": 3935.0
},
{
"timestamp": "2025-01-03T14:41:13.327557",
"cpu_percent": 4.69,
"ram_percent": 50.6,
"ram_used_gb": 32.191776275634766,
"gpu_memory_used": 3935.0
},
{
"timestamp": "2025-01-03T14:41:19.413633",
"cpu_percent": 12.92,
"ram_percent": 50.9,
"ram_used_gb": 32.3467903137207,
"gpu_memory_used": 4250.0
},
{
"timestamp": "2025-01-03T14:41:19.492758",
"cpu_percent": 7.5,
"ram_percent": 50.8,
"ram_used_gb": 32.34375,
"gpu_memory_used": 4250.0
},
{
"timestamp": "2025-01-03T14:41:26.467284",
"cpu_percent": 13.09,
"ram_percent": 51.2,
"ram_used_gb": 32.56281280517578,
"gpu_memory_used": 4249.0
},
{
"timestamp": "2025-01-03T14:41:26.553559",
"cpu_percent": 8.39,
"ram_percent": 51.2,
"ram_used_gb": 32.56183624267578,
"gpu_memory_used": 4249.0
},
{
"timestamp": "2025-01-03T14:41:34.284362",
"cpu_percent": 12.61,
"ram_percent": 51.7,
"ram_used_gb": 32.874778747558594,
"gpu_memory_used": 4250.0
},
{
"timestamp": "2025-01-03T14:41:34.362353",
"cpu_percent": 1.25,
"ram_percent": 51.7,
"ram_used_gb": 32.87461471557617,
"gpu_memory_used": 4250.0
},
{
"timestamp": "2025-01-03T14:42:23.471312",
"cpu_percent": 11.64,
"ram_percent": 54.9,
"ram_used_gb": 34.90264129638672,
"gpu_memory_used": 4647.0
},
{
"timestamp": "2025-01-03T14:42:23.547203",
"cpu_percent": 5.31,
"ram_percent": 54.9,
"ram_used_gb": 34.91563415527344,
"gpu_memory_used": 4647.0
},
{
"timestamp": "2025-01-03T14:43:56.724933",
"cpu_percent": 12.97,
"ram_percent": 59.5,
"ram_used_gb": 37.84241485595703,
"gpu_memory_used": 4655.0
},
{
"timestamp": "2025-01-03T14:43:56.815453",
"cpu_percent": 11.75,
"ram_percent": 59.5,
"ram_used_gb": 37.832679748535156,
"gpu_memory_used": 4655.0
},
{
"timestamp": "2025-01-03T14:46:41.705155",
"cpu_percent": 12.94,
"ram_percent": 66.3,
"ram_used_gb": 42.1534538269043,
"gpu_memory_used": 4729.0
},
{
"timestamp": "2025-01-03T14:46:41.835177",
"cpu_percent": 7.73,
"ram_percent": 66.2,
"ram_used_gb": 42.13554000854492,
"gpu_memory_used": 4729.0
},
{
"timestamp": "2025-01-03T14:50:13.166236",
"cpu_percent": 11.62,
"ram_percent": 73.4,
"ram_used_gb": 46.71288299560547,
"gpu_memory_used": 4676.0
},
{
"timestamp": "2025-01-03T14:50:13.261611",
"cpu_percent": 8.16,
"ram_percent": 73.4,
"ram_used_gb": 46.71356201171875,
"gpu_memory_used": 4676.0
},
{
"timestamp": "2025-01-03T14:55:44.623607",
"cpu_percent": 12.92,
"ram_percent": 82.8,
"ram_used_gb": 52.65533447265625,
"gpu_memory_used": 4636.0
},
{
"timestamp": "2025-01-03T14:55:44.735410",
"cpu_percent": 15.29,
"ram_percent": 82.7,
"ram_used_gb": 52.63290786743164,
"gpu_memory_used": 4636.0
},
{
"timestamp": "2025-01-03T15:03:40.534449",
"cpu_percent": 13.88,
"ram_percent": 85.0,
"ram_used_gb": 54.050071716308594,
"gpu_memory_used": 4771.0
},
{
"timestamp": "2025-01-03T15:03:40.638708",
"cpu_percent": 12.21,
"ram_percent": 85.0,
"ram_used_gb": 54.053733825683594,
"gpu_memory_used": 4771.0
},
{
"timestamp": "2025-01-03T15:14:34.159142",
"cpu_percent": 14.51,
"ram_percent": 78.1,
"ram_used_gb": 49.70396423339844,
"gpu_memory_used": 4739.0
}
]
}
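For reference, each record in the trace above pairs a timestamp with CPU, RAM, and GPU readings. Below is a minimal sketch of a sampler that produces records in this shape; it assumes `psutil` for CPU/RAM and `pynvml` for GPU memory, neither of which is confirmed by this commit.

```python
from datetime import datetime

import psutil   # assumed dependency for CPU/RAM sampling
import pynvml   # assumed dependency for GPU memory sampling

pynvml.nvmlInit()
_gpu = pynvml.nvmlDeviceGetHandleByIndex(0)


def sample_system() -> dict:
    """Collect one sample in the same shape as the records above."""
    mem = psutil.virtual_memory()
    gpu_mem = pynvml.nvmlDeviceGetMemoryInfo(_gpu)
    return {
        "timestamp": datetime.now().isoformat(),
        "cpu_percent": psutil.cpu_percent(),
        "ram_percent": mem.percent,
        "ram_used_gb": mem.used / 1024**3,
        "gpu_memory_used": gpu_mem.used / 1024**2,  # MiB, matching the magnitudes above
    }
```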

View file

@@ -1,19 +0,0 @@
=== Benchmark Statistics (with correct RTF) ===
Overall Stats:
Total tokens processed: 5500
Total audio generated: 1741.65s
Total test duration: 831.07s
Average processing rate: 6.72 tokens/second
Average RTF: 0.47x
Per-chunk Stats:
Average chunk size: 550.00 tokens
Min chunk size: 100.00 tokens
Max chunk size: 1000.00 tokens
Average processing time: 82.70s
Average output length: 174.17s
Performance Ranges:
Processing rate range: 5.63 - 7.17 tokens/second
RTF range: 0.44x - 0.55x

View file

@@ -1,9 +0,0 @@
=== Benchmark Statistics (with correct RTF) ===
Overall Stats:
Total tokens processed: 150850
Total audio generated: 46786.59s
Total test duration: 2012.90s
Average processing rate: 104.34 tokens/second
Average RTF: 0.03x

View file

@@ -1,23 +0,0 @@
=== Benchmark Statistics (with correct RTF) ===
Total tokens processed: 1800
Total audio generated (s): 568.53
Total test duration (s): 244.10
Average processing rate (tokens/s): 7.34
Average RTF: 0.43
Average Real Time Speed: 2.33
=== Per-chunk Stats ===
Average chunk size (tokens): 600.00
Min chunk size (tokens): 300
Max chunk size (tokens): 900
Average processing time (s): 81.30
Average output length (s): 189.51
=== Performance Ranges ===
Processing rate range (tokens/s): 7.21 - 7.47
RTF range: 0.43x - 0.43x
Real Time Speed range: 2.33x - 2.33x
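The three headline metrics in these reports are tied together: processing rate is tokens divided by wall-clock time, RTF is wall-clock time divided by audio duration, and Real Time Speed is the reciprocal of RTF. A quick check against the totals above (the reported averages appear to be computed per chunk, so they can differ slightly from these totals-based values):

```python
# Totals taken from the report above
total_tokens = 1800
total_audio_s = 568.53  # seconds of audio generated
total_time_s = 244.10   # seconds of wall-clock processing

processing_rate = total_tokens / total_time_s  # ~7.37 tokens/s (report: 7.34)
rtf = total_time_s / total_audio_s             # ~0.43 (< 1.0 means faster than real time)
real_time_speed = 1 / rtf                      # ~2.33x (report: 2.33)
```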

View file

@@ -1,403 +0,0 @@
{
"individual_runs": [
{
"text_length": 37,
"token_count": 10,
"total_time": 0.16574740409851074,
"time_to_first_chunk": 0.16574740409851074,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio\\benchmark_tokens10_run1.wav",
"audio_length": 3.45,
"target_tokens": 10,
"actual_tokens": 10,
"run_number": 1
},
{
"text_length": 37,
"token_count": 10,
"total_time": 0.18812799453735352,
"time_to_first_chunk": 0.18812799453735352,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio\\benchmark_tokens10_run2.wav",
"audio_length": 3.45,
"target_tokens": 10,
"actual_tokens": 10,
"run_number": 2
},
{
"text_length": 37,
"token_count": 10,
"total_time": 0.18645429611206055,
"time_to_first_chunk": 0.18645429611206055,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio\\benchmark_tokens10_run3.wav",
"audio_length": 3.45,
"target_tokens": 10,
"actual_tokens": 10,
"run_number": 3
},
{
"text_length": 37,
"token_count": 10,
"total_time": 0.17632031440734863,
"time_to_first_chunk": 0.17632031440734863,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio\\benchmark_tokens10_run4.wav",
"audio_length": 3.45,
"target_tokens": 10,
"actual_tokens": 10,
"run_number": 4
},
{
"text_length": 37,
"token_count": 10,
"total_time": 0.13381195068359375,
"time_to_first_chunk": 0.13381195068359375,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio\\benchmark_tokens10_run5.wav",
"audio_length": 3.45,
"target_tokens": 10,
"actual_tokens": 10,
"run_number": 5
},
{
"text_length": 102,
"token_count": 25,
"total_time": 0.2086498737335205,
"time_to_first_chunk": 0.2086498737335205,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio\\benchmark_tokens25_run1.wav",
"audio_length": 7.225,
"target_tokens": 25,
"actual_tokens": 25,
"run_number": 1
},
{
"text_length": 102,
"token_count": 25,
"total_time": 0.2727653980255127,
"time_to_first_chunk": 0.2727653980255127,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio\\benchmark_tokens25_run2.wav",
"audio_length": 7.225,
"target_tokens": 25,
"actual_tokens": 25,
"run_number": 2
},
{
"text_length": 102,
"token_count": 25,
"total_time": 0.2096250057220459,
"time_to_first_chunk": 0.2096250057220459,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio\\benchmark_tokens25_run3.wav",
"audio_length": 7.225,
"target_tokens": 25,
"actual_tokens": 25,
"run_number": 3
},
{
"text_length": 102,
"token_count": 25,
"total_time": 0.2256758213043213,
"time_to_first_chunk": 0.2256758213043213,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio\\benchmark_tokens25_run4.wav",
"audio_length": 7.225,
"target_tokens": 25,
"actual_tokens": 25,
"run_number": 4
},
{
"text_length": 102,
"token_count": 25,
"total_time": 0.1945042610168457,
"time_to_first_chunk": 0.1945042610168457,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio\\benchmark_tokens25_run5.wav",
"audio_length": 7.225,
"target_tokens": 25,
"actual_tokens": 25,
"run_number": 5
},
{
"text_length": 212,
"token_count": 50,
"total_time": 0.4975121021270752,
"time_to_first_chunk": 0.4975121021270752,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio\\benchmark_tokens50_run1.wav",
"audio_length": 16.325,
"target_tokens": 50,
"actual_tokens": 50,
"run_number": 1
},
{
"text_length": 212,
"token_count": 50,
"total_time": 0.4518404006958008,
"time_to_first_chunk": 0.4518404006958008,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio\\benchmark_tokens50_run2.wav",
"audio_length": 16.325,
"target_tokens": 50,
"actual_tokens": 50,
"run_number": 2
},
{
"text_length": 212,
"token_count": 50,
"total_time": 0.5640325546264648,
"time_to_first_chunk": 0.5640325546264648,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio\\benchmark_tokens50_run3.wav",
"audio_length": 16.325,
"target_tokens": 50,
"actual_tokens": 50,
"run_number": 3
},
{
"text_length": 212,
"token_count": 50,
"total_time": 0.5305957794189453,
"time_to_first_chunk": 0.5305957794189453,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio\\benchmark_tokens50_run4.wav",
"audio_length": 16.325,
"target_tokens": 50,
"actual_tokens": 50,
"run_number": 4
},
{
"text_length": 212,
"token_count": 50,
"total_time": 0.5540030002593994,
"time_to_first_chunk": 0.5540030002593994,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio\\benchmark_tokens50_run5.wav",
"audio_length": 16.325,
"target_tokens": 50,
"actual_tokens": 50,
"run_number": 5
},
{
"text_length": 448,
"token_count": 100,
"total_time": 0.7963137626647949,
"time_to_first_chunk": 0.7963137626647949,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio\\benchmark_tokens100_run1.wav",
"audio_length": 31.1,
"target_tokens": 100,
"actual_tokens": 100,
"run_number": 1
},
{
"text_length": 448,
"token_count": 100,
"total_time": 0.9320805072784424,
"time_to_first_chunk": 0.9320805072784424,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio\\benchmark_tokens100_run2.wav",
"audio_length": 31.1,
"target_tokens": 100,
"actual_tokens": 100,
"run_number": 2
},
{
"text_length": 448,
"token_count": 100,
"total_time": 0.824256181716919,
"time_to_first_chunk": 0.824256181716919,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio\\benchmark_tokens100_run3.wav",
"audio_length": 31.1,
"target_tokens": 100,
"actual_tokens": 100,
"run_number": 3
},
{
"text_length": 448,
"token_count": 100,
"total_time": 0.9034836292266846,
"time_to_first_chunk": 0.9034836292266846,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio\\benchmark_tokens100_run4.wav",
"audio_length": 31.1,
"target_tokens": 100,
"actual_tokens": 100,
"run_number": 4
},
{
"text_length": 448,
"token_count": 100,
"total_time": 0.8364357948303223,
"time_to_first_chunk": 0.8364357948303223,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio\\benchmark_tokens100_run5.wav",
"audio_length": 31.1,
"target_tokens": 100,
"actual_tokens": 100,
"run_number": 5
},
{
"text_length": 906,
"token_count": 200,
"total_time": 1.8122682571411133,
"time_to_first_chunk": 1.8122682571411133,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio\\benchmark_tokens200_run1.wav",
"audio_length": 62.625,
"target_tokens": 200,
"actual_tokens": 200,
"run_number": 1
},
{
"text_length": 906,
"token_count": 200,
"total_time": 1.7290427684783936,
"time_to_first_chunk": 1.7290427684783936,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio\\benchmark_tokens200_run2.wav",
"audio_length": 62.625,
"target_tokens": 200,
"actual_tokens": 200,
"run_number": 2
},
{
"text_length": 906,
"token_count": 200,
"total_time": 2.141728401184082,
"time_to_first_chunk": 2.141728401184082,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio\\benchmark_tokens200_run3.wav",
"audio_length": 62.625,
"target_tokens": 200,
"actual_tokens": 200,
"run_number": 3
},
{
"text_length": 906,
"token_count": 200,
"total_time": 2.0155680179595947,
"time_to_first_chunk": 2.0155680179595947,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio\\benchmark_tokens200_run4.wav",
"audio_length": 62.625,
"target_tokens": 200,
"actual_tokens": 200,
"run_number": 4
},
{
"text_length": 906,
"token_count": 200,
"total_time": 1.8707575798034668,
"time_to_first_chunk": 1.8707575798034668,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio\\benchmark_tokens200_run5.wav",
"audio_length": 62.625,
"target_tokens": 200,
"actual_tokens": 200,
"run_number": 5
},
{
"text_length": 2232,
"token_count": 500,
"total_time": 4.822713851928711,
"time_to_first_chunk": 4.822713851928711,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio\\benchmark_tokens500_run1.wav",
"audio_length": 157.875,
"target_tokens": 500,
"actual_tokens": 500,
"run_number": 1
},
{
"text_length": 2232,
"token_count": 500,
"total_time": 4.227782726287842,
"time_to_first_chunk": 4.227782726287842,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio\\benchmark_tokens500_run2.wav",
"audio_length": 157.875,
"target_tokens": 500,
"actual_tokens": 500,
"run_number": 2
},
{
"text_length": 2232,
"token_count": 500,
"total_time": 4.414916276931763,
"time_to_first_chunk": 4.414916276931763,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio\\benchmark_tokens500_run3.wav",
"audio_length": 157.875,
"target_tokens": 500,
"actual_tokens": 500,
"run_number": 3
},
{
"text_length": 2232,
"token_count": 500,
"total_time": 4.579505681991577,
"time_to_first_chunk": 4.579505681991577,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio\\benchmark_tokens500_run4.wav",
"audio_length": 157.875,
"target_tokens": 500,
"actual_tokens": 500,
"run_number": 4
},
{
"text_length": 2232,
"token_count": 500,
"total_time": 4.332529067993164,
"time_to_first_chunk": 4.332529067993164,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio\\benchmark_tokens500_run5.wav",
"audio_length": 157.875,
"target_tokens": 500,
"actual_tokens": 500,
"run_number": 5
}
],
"summary": {
"10": {
"avg_time_to_first_chunk": 0.17,
"avg_total_time": 0.17,
"avg_audio_length": 3.45,
"num_successful_runs": 5
},
"25": {
"avg_time_to_first_chunk": 0.222,
"avg_total_time": 0.222,
"avg_audio_length": 7.225,
"num_successful_runs": 5
},
"50": {
"avg_time_to_first_chunk": 0.52,
"avg_total_time": 0.52,
"avg_audio_length": 16.325,
"num_successful_runs": 5
},
"100": {
"avg_time_to_first_chunk": 0.859,
"avg_total_time": 0.859,
"avg_audio_length": 31.1,
"num_successful_runs": 5
},
"200": {
"avg_time_to_first_chunk": 1.914,
"avg_total_time": 1.914,
"avg_audio_length": 62.625,
"num_successful_runs": 5
},
"500": {
"avg_time_to_first_chunk": 4.475,
"avg_total_time": 4.475,
"avg_audio_length": 157.875,
"num_successful_runs": 5
}
},
"timestamp": "2025-01-04 13:52:28"
}
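The `summary` block is a per-token-count rollup of `individual_runs`. A minimal sketch of that aggregation, with field names taken from the JSON above and a hypothetical file name:

```python
import json
from collections import defaultdict

with open("benchmark_results.json") as f:  # hypothetical file name
    results = json.load(f)

groups = defaultdict(list)
for run in results["individual_runs"]:
    if run["error"] is None:
        groups[run["target_tokens"]].append(run)


def avg(runs, key):
    """Average one numeric field over a group of runs, rounded like the report."""
    return round(sum(r[key] for r in runs) / len(runs), 3)


summary = {
    str(tokens): {
        "avg_time_to_first_chunk": avg(runs, "time_to_first_chunk"),
        "avg_total_time": avg(runs, "total_time"),
        "avg_audio_length": avg(runs, "audio_length"),
        "num_successful_runs": len(runs),
    }
    for tokens, runs in groups.items()
}
```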

View file

@@ -1,271 +1,337 @@
{
"individual_runs": [
{
"text_length": 212,
"token_count": 50,
"total_time": 0.7278211116790771,
"time_to_first_chunk": 0.3613290786743164,
"text_length": 37,
"token_count": null,
"total_time": 0.4376556873321533,
"time_to_first_chunk": 0.4189143180847168,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens50_run1_stream.wav",
"audio_length": 16.325,
"audio_path": "C:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens10_run1_stream.wav",
"audio_length": 3.45,
"target_tokens": 10,
"actual_tokens": 10,
"run_number": 1
},
{
"text_length": 37,
"token_count": null,
"total_time": 0.37163758277893066,
"time_to_first_chunk": 0.34892702102661133,
"error": null,
"audio_path": "C:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens10_run2_stream.wav",
"audio_length": 3.45,
"target_tokens": 10,
"actual_tokens": 10,
"run_number": 2
},
{
"text_length": 37,
"token_count": null,
"total_time": 0.2654602527618408,
"time_to_first_chunk": 0.2409076690673828,
"error": null,
"audio_path": "C:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens10_run3_stream.wav",
"audio_length": 3.45,
"target_tokens": 10,
"actual_tokens": 10,
"run_number": 3
},
{
"text_length": 37,
"token_count": null,
"total_time": 0.24376440048217773,
"time_to_first_chunk": 0.23003816604614258,
"error": null,
"audio_path": "C:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens10_run4_stream.wav",
"audio_length": 3.45,
"target_tokens": 10,
"actual_tokens": 10,
"run_number": 4
},
{
"text_length": 37,
"token_count": null,
"total_time": 0.25968003273010254,
"time_to_first_chunk": 0.24081206321716309,
"error": null,
"audio_path": "C:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens10_run5_stream.wav",
"audio_length": 3.45,
"target_tokens": 10,
"actual_tokens": 10,
"run_number": 5
},
{
"text_length": 212,
"token_count": null,
"total_time": 1.049060344696045,
"time_to_first_chunk": 0.3336215019226074,
"error": null,
"audio_path": "C:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens50_run1_stream.wav",
"audio_length": 15.925,
"target_tokens": 50,
"actual_tokens": 50,
"run_number": 1
},
{
"text_length": 212,
"token_count": 50,
"total_time": 0.4556088447570801,
"time_to_first_chunk": 0.18642044067382812,
"token_count": null,
"total_time": 0.8934676647186279,
"time_to_first_chunk": 0.3011031150817871,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens50_run2_stream.wav",
"audio_length": 16.325,
"audio_path": "C:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens50_run2_stream.wav",
"audio_length": 15.925,
"target_tokens": 50,
"actual_tokens": 50,
"run_number": 2
},
{
"text_length": 212,
"token_count": 50,
"total_time": 0.5538768768310547,
"time_to_first_chunk": 0.2720797061920166,
"token_count": null,
"total_time": 0.9444286823272705,
"time_to_first_chunk": 0.3198091983795166,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens50_run3_stream.wav",
"audio_length": 16.325,
"audio_path": "C:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens50_run3_stream.wav",
"audio_length": 15.925,
"target_tokens": 50,
"actual_tokens": 50,
"run_number": 3
},
{
"text_length": 212,
"token_count": 50,
"total_time": 0.4395604133605957,
"time_to_first_chunk": 0.15613913536071777,
"token_count": null,
"total_time": 0.9735183715820312,
"time_to_first_chunk": 0.369948148727417,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens50_run4_stream.wav",
"audio_length": 16.325,
"audio_path": "C:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens50_run4_stream.wav",
"audio_length": 15.925,
"target_tokens": 50,
"actual_tokens": 50,
"run_number": 4
},
{
"text_length": 212,
"token_count": 50,
"total_time": 0.45748305320739746,
"time_to_first_chunk": 0.18805718421936035,
"token_count": null,
"total_time": 0.8089118003845215,
"time_to_first_chunk": 0.30179858207702637,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens50_run5_stream.wav",
"audio_length": 16.325,
"audio_path": "C:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens50_run5_stream.wav",
"audio_length": 15.925,
"target_tokens": 50,
"actual_tokens": 50,
"run_number": 5
},
{
"text_length": 448,
"token_count": 100,
"total_time": 0.7347762584686279,
"time_to_first_chunk": 0.16963744163513184,
"token_count": null,
"total_time": 1.641003131866455,
"time_to_first_chunk": 0.2979745864868164,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens100_run1_stream.wav",
"audio_length": 31.1,
"audio_path": "C:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens100_run1_stream.wav",
"audio_length": 30.5,
"target_tokens": 100,
"actual_tokens": 100,
"run_number": 1
},
{
"text_length": 448,
"token_count": 100,
"total_time": 0.8288509845733643,
"time_to_first_chunk": 0.20123004913330078,
"token_count": null,
"total_time": 1.3709619045257568,
"time_to_first_chunk": 0.4272146224975586,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens100_run2_stream.wav",
"audio_length": 31.1,
"audio_path": "C:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens100_run2_stream.wav",
"audio_length": 30.5,
"target_tokens": 100,
"actual_tokens": 100,
"run_number": 2
},
{
"text_length": 448,
"token_count": 100,
"total_time": 0.7503848075866699,
"time_to_first_chunk": 0.21662068367004395,
"token_count": null,
"total_time": 1.2554471492767334,
"time_to_first_chunk": 0.29790568351745605,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens100_run3_stream.wav",
"audio_length": 31.1,
"audio_path": "C:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens100_run3_stream.wav",
"audio_length": 30.5,
"target_tokens": 100,
"actual_tokens": 100,
"run_number": 3
},
{
"text_length": 448,
"token_count": 100,
"total_time": 0.694899320602417,
"time_to_first_chunk": 0.1966841220855713,
"token_count": null,
"total_time": 1.3761844635009766,
"time_to_first_chunk": 0.32633328437805176,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens100_run4_stream.wav",
"audio_length": 31.1,
"audio_path": "C:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens100_run4_stream.wav",
"audio_length": 30.5,
"target_tokens": 100,
"actual_tokens": 100,
"run_number": 4
},
{
"text_length": 448,
"token_count": 100,
"total_time": 0.68701171875,
"time_to_first_chunk": 0.19341063499450684,
"token_count": null,
"total_time": 1.56705904006958,
"time_to_first_chunk": 0.32801246643066406,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens100_run5_stream.wav",
"audio_length": 31.1,
"audio_path": "C:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens100_run5_stream.wav",
"audio_length": 30.5,
"target_tokens": 100,
"actual_tokens": 100,
"run_number": 5
},
{
"text_length": 906,
"token_count": 200,
"total_time": 1.6845426559448242,
"time_to_first_chunk": 0.21096158027648926,
"text_length": 1140,
"token_count": null,
"total_time": 5.086699962615967,
"time_to_first_chunk": 0.33925390243530273,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens200_run1_stream.wav",
"audio_length": 62.625,
"target_tokens": 200,
"actual_tokens": 200,
"audio_path": "C:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens250_run1_stream.wav",
"audio_length": 78.775,
"target_tokens": 250,
"actual_tokens": 250,
"run_number": 1
},
{
"text_length": 906,
"token_count": 200,
"total_time": 1.3545098304748535,
"time_to_first_chunk": 0.18648386001586914,
"text_length": 1140,
"token_count": null,
"total_time": 3.827953338623047,
"time_to_first_chunk": 0.39266157150268555,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens200_run2_stream.wav",
"audio_length": 62.625,
"target_tokens": 200,
"actual_tokens": 200,
"audio_path": "C:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens250_run2_stream.wav",
"audio_length": 78.775,
"target_tokens": 250,
"actual_tokens": 250,
"run_number": 2
},
{
"text_length": 906,
"token_count": 200,
"total_time": 1.426060676574707,
"time_to_first_chunk": 0.20081472396850586,
"text_length": 1140,
"token_count": null,
"total_time": 3.9389824867248535,
"time_to_first_chunk": 0.3231511116027832,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens200_run3_stream.wav",
"audio_length": 62.625,
"target_tokens": 200,
"actual_tokens": 200,
"audio_path": "C:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens250_run3_stream.wav",
"audio_length": 78.775,
"target_tokens": 250,
"actual_tokens": 250,
"run_number": 3
},
{
"text_length": 906,
"token_count": 200,
"total_time": 1.4084081649780273,
"time_to_first_chunk": 0.18551135063171387,
"text_length": 1140,
"token_count": null,
"total_time": 3.942399740219116,
"time_to_first_chunk": 0.34731340408325195,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens200_run4_stream.wav",
"audio_length": 62.625,
"target_tokens": 200,
"actual_tokens": 200,
"audio_path": "C:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens250_run4_stream.wav",
"audio_length": 78.775,
"target_tokens": 250,
"actual_tokens": 250,
"run_number": 4
},
{
"text_length": 906,
"token_count": 200,
"total_time": 1.4703152179718018,
"time_to_first_chunk": 0.17750859260559082,
"text_length": 1140,
"token_count": null,
"total_time": 3.7748308181762695,
"time_to_first_chunk": 0.40787601470947266,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens200_run5_stream.wav",
"audio_length": 62.625,
"target_tokens": 200,
"actual_tokens": 200,
"audio_path": "C:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens250_run5_stream.wav",
"audio_length": 78.775,
"target_tokens": 250,
"actual_tokens": 250,
"run_number": 5
},
{
"text_length": 2232,
"token_count": 500,
"total_time": 4.289574384689331,
"time_to_first_chunk": 0.1997976303100586,
"token_count": null,
"total_time": 9.003147840499878,
"time_to_first_chunk": 0.5455703735351562,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens500_run1_stream.wav",
"audio_length": 157.875,
"audio_path": "C:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens500_run1_stream.wav",
"audio_length": 156.475,
"target_tokens": 500,
"actual_tokens": 500,
"run_number": 1
},
{
"text_length": 2232,
"token_count": 500,
"total_time": 3.7089381217956543,
"time_to_first_chunk": 0.25969815254211426,
"token_count": null,
"total_time": 10.081491231918335,
"time_to_first_chunk": 0.4591703414916992,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens500_run2_stream.wav",
"audio_length": 157.875,
"audio_path": "C:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens500_run2_stream.wav",
"audio_length": 156.475,
"target_tokens": 500,
"actual_tokens": 500,
"run_number": 2
},
{
"text_length": 2232,
"token_count": 500,
"total_time": 4.138366222381592,
"time_to_first_chunk": 0.1831505298614502,
"token_count": null,
"total_time": 9.767668962478638,
"time_to_first_chunk": 0.31237053871154785,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens500_run3_stream.wav",
"audio_length": 157.875,
"audio_path": "C:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens500_run3_stream.wav",
"audio_length": 156.475,
"target_tokens": 500,
"actual_tokens": 500,
"run_number": 3
},
{
"text_length": 2232,
"token_count": 500,
"total_time": 3.980635643005371,
"time_to_first_chunk": 0.20493030548095703,
"token_count": null,
"total_time": 9.090342998504639,
"time_to_first_chunk": 0.41753244400024414,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens500_run4_stream.wav",
"audio_length": 157.875,
"audio_path": "C:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens500_run4_stream.wav",
"audio_length": 156.475,
"target_tokens": 500,
"actual_tokens": 500,
"run_number": 4
},
{
"text_length": 2232,
"token_count": 500,
"total_time": 4.1370298862457275,
"time_to_first_chunk": 0.19150757789611816,
"token_count": null,
"total_time": 9.876578330993652,
"time_to_first_chunk": 0.3965120315551758,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens500_run5_stream.wav",
"audio_length": 157.875,
"audio_path": "C:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens500_run5_stream.wav",
"audio_length": 156.475,
"target_tokens": 500,
"actual_tokens": 500,
"run_number": 5
}
],
"summary": {
"10": {
"avg_time_to_first_chunk": 0.296,
"avg_total_time": 0.316,
"avg_audio_length": 3.45,
"num_successful_runs": 5
},
"50": {
"avg_time_to_first_chunk": 0.233,
"avg_total_time": 0.527,
"avg_audio_length": 16.325,
"avg_time_to_first_chunk": 0.325,
"avg_total_time": 0.934,
"avg_audio_length": 15.925,
"num_successful_runs": 5
},
"100": {
"avg_time_to_first_chunk": 0.196,
"avg_total_time": 0.739,
"avg_audio_length": 31.1,
"avg_time_to_first_chunk": 0.335,
"avg_total_time": 1.442,
"avg_audio_length": 30.5,
"num_successful_runs": 5
},
"200": {
"avg_time_to_first_chunk": 0.192,
"avg_total_time": 1.469,
"avg_audio_length": 62.625,
"250": {
"avg_time_to_first_chunk": 0.362,
"avg_total_time": 4.114,
"avg_audio_length": 78.775,
"num_successful_runs": 5
},
"500": {
"avg_time_to_first_chunk": 0.208,
"avg_total_time": 4.051,
"avg_audio_length": 157.875,
"avg_time_to_first_chunk": 0.426,
"avg_total_time": 9.564,
"avg_audio_length": 156.475,
"num_successful_runs": 5
}
},
"timestamp": "2025-01-04 22:16:30"
"timestamp": "2025-01-06 00:00:43"
}
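In the streaming runs above, `time_to_first_chunk` is the wall-clock delay until the first audio bytes arrive, and `total_time` covers the whole response. A minimal way to measure both against the local server could look like the following; the endpoint path and payload fields are assumptions for illustration, not taken verbatim from this commit.

```python
import time

import requests


def measure_stream(text: str, url: str = "http://localhost:8880/v1/audio/speech"):
    """Return (time_to_first_chunk, total_time) for one streaming request."""
    start = time.time()
    first_chunk = None
    payload = {  # hypothetical payload; mirror the server's actual schema
        "input": text,
        "voice": "af",
        "response_format": "wav",
        "stream": True,
    }
    with requests.post(url, json=payload, stream=True) as resp:
        resp.raise_for_status()
        for chunk in resp.iter_content(chunk_size=1024):
            if chunk and first_chunk is None:
                first_chunk = time.time() - start
    return first_chunk, time.time() - start
```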

View file

@@ -1,271 +1,337 @@
{
"individual_runs": [
{
"text_length": 212,
"token_count": 50,
"total_time": 1.149611473083496,
"time_to_first_chunk": 0.8767304420471191,
"text_length": 37,
"token_count": null,
"total_time": 0.7105245590209961,
"time_to_first_chunk": 0.6905441284179688,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream_openai\\benchmark_tokens50_run1_stream_openai.wav",
"audio_length": 16.325,
"audio_path": "C:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream_openai\\benchmark_tokens10_run1_stream_openai.wav",
"audio_length": 3.45,
"target_tokens": 10,
"actual_tokens": 10,
"run_number": 1
},
{
"text_length": 37,
"token_count": null,
"total_time": 0.35063982009887695,
"time_to_first_chunk": 0.32647228240966797,
"error": null,
"audio_path": "C:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream_openai\\benchmark_tokens10_run2_stream_openai.wav",
"audio_length": 3.45,
"target_tokens": 10,
"actual_tokens": 10,
"run_number": 2
},
{
"text_length": 37,
"token_count": null,
"total_time": 0.43519043922424316,
"time_to_first_chunk": 0.41011548042297363,
"error": null,
"audio_path": "C:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream_openai\\benchmark_tokens10_run3_stream_openai.wav",
"audio_length": 3.45,
"target_tokens": 10,
"actual_tokens": 10,
"run_number": 3
},
{
"text_length": 37,
"token_count": null,
"total_time": 0.33886170387268066,
"time_to_first_chunk": 0.32068943977355957,
"error": null,
"audio_path": "C:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream_openai\\benchmark_tokens10_run4_stream_openai.wav",
"audio_length": 3.45,
"target_tokens": 10,
"actual_tokens": 10,
"run_number": 4
},
{
"text_length": 37,
"token_count": null,
"total_time": 0.31725525856018066,
"time_to_first_chunk": 0.29624342918395996,
"error": null,
"audio_path": "C:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream_openai\\benchmark_tokens10_run5_stream_openai.wav",
"audio_length": 3.45,
"target_tokens": 10,
"actual_tokens": 10,
"run_number": 5
},
{
"text_length": 212,
"token_count": null,
"total_time": 1.0215234756469727,
"time_to_first_chunk": 0.38323354721069336,
"error": null,
"audio_path": "C:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream_openai\\benchmark_tokens50_run1_stream_openai.wav",
"audio_length": 15.925,
"target_tokens": 50,
"actual_tokens": 50,
"run_number": 1
},
{
"text_length": 212,
"token_count": 50,
"total_time": 0.9325947761535645,
"time_to_first_chunk": 0.5965914726257324,
"token_count": null,
"total_time": 1.38511061668396,
"time_to_first_chunk": 0.47052764892578125,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream_openai\\benchmark_tokens50_run2_stream_openai.wav",
"audio_length": 16.325,
"audio_path": "C:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream_openai\\benchmark_tokens50_run2_stream_openai.wav",
"audio_length": 15.925,
"target_tokens": 50,
"actual_tokens": 50,
"run_number": 2
},
{
"text_length": 212,
"token_count": 50,
"total_time": 0.9205234050750732,
"time_to_first_chunk": 0.5961906909942627,
"token_count": null,
"total_time": 1.0185234546661377,
"time_to_first_chunk": 0.3535764217376709,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream_openai\\benchmark_tokens50_run3_stream_openai.wav",
"audio_length": 16.325,
"audio_path": "C:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream_openai\\benchmark_tokens50_run3_stream_openai.wav",
"audio_length": 15.925,
"target_tokens": 50,
"actual_tokens": 50,
"run_number": 3
},
{
"text_length": 212,
"token_count": 50,
"total_time": 1.1321916580200195,
"time_to_first_chunk": 0.6946916580200195,
"token_count": null,
"total_time": 0.8875925540924072,
"time_to_first_chunk": 0.3373105525970459,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream_openai\\benchmark_tokens50_run4_stream_openai.wav",
"audio_length": 16.325,
"audio_path": "C:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream_openai\\benchmark_tokens50_run4_stream_openai.wav",
"audio_length": 15.925,
"target_tokens": 50,
"actual_tokens": 50,
"run_number": 4
},
{
"text_length": 212,
"token_count": 50,
"total_time": 1.1146185398101807,
"time_to_first_chunk": 0.6918885707855225,
"token_count": null,
"total_time": 0.9557526111602783,
"time_to_first_chunk": 0.3364882469177246,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream_openai\\benchmark_tokens50_run5_stream_openai.wav",
"audio_length": 16.325,
"audio_path": "C:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream_openai\\benchmark_tokens50_run5_stream_openai.wav",
"audio_length": 15.925,
"target_tokens": 50,
"actual_tokens": 50,
"run_number": 5
},
{
"text_length": 448,
"token_count": 100,
"total_time": 1.3645410537719727,
"time_to_first_chunk": 0.6802399158477783,
"token_count": null,
"total_time": 1.569596767425537,
"time_to_first_chunk": 0.42070746421813965,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream_openai\\benchmark_tokens100_run1_stream_openai.wav",
"audio_length": 31.1,
"audio_path": "C:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream_openai\\benchmark_tokens100_run1_stream_openai.wav",
"audio_length": 30.5,
"target_tokens": 100,
"actual_tokens": 100,
"run_number": 1
},
{
"text_length": 448,
"token_count": 100,
"total_time": 1.4154777526855469,
"time_to_first_chunk": 0.7297353744506836,
"token_count": null,
"total_time": 1.5172030925750732,
"time_to_first_chunk": 0.3982264995574951,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream_openai\\benchmark_tokens100_run2_stream_openai.wav",
"audio_length": 31.1,
"audio_path": "C:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream_openai\\benchmark_tokens100_run2_stream_openai.wav",
"audio_length": 30.5,
"target_tokens": 100,
"actual_tokens": 100,
"run_number": 2
},
{
"text_length": 448,
"token_count": 100,
"total_time": 1.3589520454406738,
"time_to_first_chunk": 0.698603630065918,
"token_count": null,
"total_time": 1.5318474769592285,
"time_to_first_chunk": 0.3533785343170166,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream_openai\\benchmark_tokens100_run3_stream_openai.wav",
"audio_length": 31.1,
"audio_path": "C:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream_openai\\benchmark_tokens100_run3_stream_openai.wav",
"audio_length": 30.5,
"target_tokens": 100,
"actual_tokens": 100,
"run_number": 3
},
{
"text_length": 448,
"token_count": 100,
"total_time": 1.2276430130004883,
"time_to_first_chunk": 0.6705801486968994,
"token_count": null,
"total_time": 1.3858752250671387,
"time_to_first_chunk": 0.3360786437988281,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream_openai\\benchmark_tokens100_run4_stream_openai.wav",
"audio_length": 31.1,
"audio_path": "C:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream_openai\\benchmark_tokens100_run4_stream_openai.wav",
"audio_length": 30.5,
"target_tokens": 100,
"actual_tokens": 100,
"run_number": 4
},
{
"text_length": 448,
"token_count": 100,
"total_time": 1.0949454307556152,
"time_to_first_chunk": 0.5698442459106445,
"token_count": null,
"total_time": 1.7841475009918213,
"time_to_first_chunk": 0.34446048736572266,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream_openai\\benchmark_tokens100_run5_stream_openai.wav",
"audio_length": 31.1,
"audio_path": "C:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream_openai\\benchmark_tokens100_run5_stream_openai.wav",
"audio_length": 30.5,
"target_tokens": 100,
"actual_tokens": 100,
"run_number": 5
},
{
"text_length": 906,
"token_count": 200,
"total_time": 1.8211240768432617,
"time_to_first_chunk": 0.6070489883422852,
"text_length": 1140,
"token_count": null,
"total_time": 4.334965467453003,
"time_to_first_chunk": 0.4336512088775635,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream_openai\\benchmark_tokens200_run1_stream_openai.wav",
"audio_length": 62.625,
"target_tokens": 200,
"actual_tokens": 200,
"audio_path": "C:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream_openai\\benchmark_tokens250_run1_stream_openai.wav",
"audio_length": 78.775,
"target_tokens": 250,
"actual_tokens": 250,
"run_number": 1
},
{
"text_length": 906,
"token_count": 200,
"total_time": 1.8376774787902832,
"time_to_first_chunk": 0.6538689136505127,
"text_length": 1140,
"token_count": null,
"total_time": 5.265941858291626,
"time_to_first_chunk": 0.5461773872375488,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream_openai\\benchmark_tokens200_run2_stream_openai.wav",
"audio_length": 62.625,
"target_tokens": 200,
"actual_tokens": 200,
"audio_path": "C:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream_openai\\benchmark_tokens250_run2_stream_openai.wav",
"audio_length": 78.775,
"target_tokens": 250,
"actual_tokens": 250,
"run_number": 2
},
{
"text_length": 906,
"token_count": 200,
"total_time": 1.6953792572021484,
"time_to_first_chunk": 0.5554308891296387,
"text_length": 1140,
"token_count": null,
"total_time": 5.66066575050354,
"time_to_first_chunk": 0.4757547378540039,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream_openai\\benchmark_tokens200_run3_stream_openai.wav",
"audio_length": 62.625,
"target_tokens": 200,
"actual_tokens": 200,
"audio_path": "C:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream_openai\\benchmark_tokens250_run3_stream_openai.wav",
"audio_length": 78.775,
"target_tokens": 250,
"actual_tokens": 250,
"run_number": 3
},
{
"text_length": 906,
"token_count": 200,
"total_time": 1.887030839920044,
"time_to_first_chunk": 0.5866930484771729,
"text_length": 1140,
"token_count": null,
"total_time": 9.289174318313599,
"time_to_first_chunk": 0.40159058570861816,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream_openai\\benchmark_tokens200_run4_stream_openai.wav",
"audio_length": 62.625,
"target_tokens": 200,
"actual_tokens": 200,
"audio_path": "C:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream_openai\\benchmark_tokens250_run4_stream_openai.wav",
"audio_length": 78.775,
"target_tokens": 250,
"actual_tokens": 250,
"run_number": 4
},
{
"text_length": 906,
"token_count": 200,
"total_time": 1.7908406257629395,
"time_to_first_chunk": 0.5897490978240967,
"text_length": 1140,
"token_count": null,
"total_time": 4.425869703292847,
"time_to_first_chunk": 0.40808558464050293,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream_openai\\benchmark_tokens200_run5_stream_openai.wav",
"audio_length": 62.625,
"target_tokens": 200,
"actual_tokens": 200,
"audio_path": "C:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream_openai\\benchmark_tokens250_run5_stream_openai.wav",
"audio_length": 78.775,
"target_tokens": 250,
"actual_tokens": 250,
"run_number": 5
},
{
"text_length": 2232,
"token_count": 500,
"total_time": 4.228837013244629,
"time_to_first_chunk": 0.5315976142883301,
"token_count": null,
"total_time": 9.600461483001709,
"time_to_first_chunk": 0.3966805934906006,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream_openai\\benchmark_tokens500_run1_stream_openai.wav",
"audio_length": 157.875,
"audio_path": "C:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream_openai\\benchmark_tokens500_run1_stream_openai.wav",
"audio_length": 156.475,
"target_tokens": 500,
"actual_tokens": 500,
"run_number": 1
},
{
"text_length": 2232,
"token_count": 500,
"total_time": 4.489210367202759,
"time_to_first_chunk": 0.5261838436126709,
"token_count": null,
"total_time": 8.82239580154419,
"time_to_first_chunk": 0.3900904655456543,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream_openai\\benchmark_tokens500_run2_stream_openai.wav",
"audio_length": 157.875,
"audio_path": "C:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream_openai\\benchmark_tokens500_run2_stream_openai.wav",
"audio_length": 156.475,
"target_tokens": 500,
"actual_tokens": 500,
"run_number": 2
},
{
"text_length": 2232,
"token_count": 500,
"total_time": 4.5290446281433105,
"time_to_first_chunk": 0.6186764240264893,
"token_count": null,
"total_time": 10.99152159690857,
"time_to_first_chunk": 0.4041757583618164,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream_openai\\benchmark_tokens500_run3_stream_openai.wav",
"audio_length": 157.875,
"audio_path": "C:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream_openai\\benchmark_tokens500_run3_stream_openai.wav",
"audio_length": 156.475,
"target_tokens": 500,
"actual_tokens": 500,
"run_number": 3
},
{
"text_length": 2232,
"token_count": 500,
"total_time": 4.209261178970337,
"time_to_first_chunk": 0.5990591049194336,
"token_count": null,
"total_time": 9.12995958328247,
"time_to_first_chunk": 0.43430614471435547,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream_openai\\benchmark_tokens500_run4_stream_openai.wav",
"audio_length": 157.875,
"audio_path": "C:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream_openai\\benchmark_tokens500_run4_stream_openai.wav",
"audio_length": 156.475,
"target_tokens": 500,
"actual_tokens": 500,
"run_number": 4
},
{
"text_length": 2232,
"token_count": 500,
"total_time": 4.218762636184692,
"time_to_first_chunk": 0.5466251373291016,
"token_count": null,
"total_time": 10.043727159500122,
"time_to_first_chunk": 0.41181445121765137,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream_openai\\benchmark_tokens500_run5_stream_openai.wav",
"audio_length": 157.875,
"audio_path": "C:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream_openai\\benchmark_tokens500_run5_stream_openai.wav",
"audio_length": 156.475,
"target_tokens": 500,
"actual_tokens": 500,
"run_number": 5
}
],
"summary": {
"10": {
"avg_time_to_first_chunk": 0.409,
"avg_total_time": 0.43,
"avg_audio_length": 3.45,
"num_successful_runs": 5
},
"50": {
"avg_time_to_first_chunk": 0.691,
"avg_total_time": 1.05,
"avg_audio_length": 16.325,
"avg_time_to_first_chunk": 0.376,
"avg_total_time": 1.054,
"avg_audio_length": 15.925,
"num_successful_runs": 5
},
"100": {
"avg_time_to_first_chunk": 0.67,
"avg_total_time": 1.292,
"avg_audio_length": 31.1,
"avg_time_to_first_chunk": 0.371,
"avg_total_time": 1.558,
"avg_audio_length": 30.5,
"num_successful_runs": 5
},
"200": {
"avg_time_to_first_chunk": 0.599,
"avg_total_time": 1.806,
"avg_audio_length": 62.625,
"250": {
"avg_time_to_first_chunk": 0.453,
"avg_total_time": 5.795,
"avg_audio_length": 78.775,
"num_successful_runs": 5
},
"500": {
"avg_time_to_first_chunk": 0.564,
"avg_total_time": 4.335,
"avg_audio_length": 157.875,
"avg_time_to_first_chunk": 0.407,
"avg_total_time": 9.718,
"avg_audio_length": 156.475,
"num_successful_runs": 5
}
},
"timestamp": "2025-01-04 22:18:03"
"timestamp": "2025-01-06 00:02:21"
}
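The `_openai` runs exercise the same server through the OpenAI Python client. A sketch of that path, assuming an `openai>=1.x` client pointed at the local server; the model and voice names are placeholders, not confirmed by this commit:

```python
import time

from openai import OpenAI

client = OpenAI(base_url="http://localhost:8880/v1", api_key="not-needed")

start = time.time()
first_chunk = None
# Stream audio and note when the first bytes arrive (mirrors time_to_first_chunk above).
with client.audio.speech.with_streaming_response.create(
    model="kokoro",   # placeholder model name
    voice="af",       # placeholder voice name
    input="Hello world",
    response_format="wav",
) as response:
    for chunk in response.iter_bytes(chunk_size=1024):
        if chunk and first_chunk is None:
            first_chunk = time.time() - start
print(f"time_to_first_chunk: {first_chunk:.3f}s, total: {time.time() - start:.3f}s")
```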

View file

@@ -1,23 +1,23 @@
=== Benchmark Statistics (with correct RTF) ===
Total tokens processed: 17150
Total audio generated (s): 5296.38
Total test duration (s): 155.23
Average processing rate (tokens/s): 102.86
Average RTF: 0.03
Average Real Time Speed: 31.25
Total tokens processed: 3150
Total audio generated (s): 1056.03
Total test duration (s): 70.20
Average processing rate (tokens/s): 46.46
Average RTF: 0.07
Average Real Time Speed: 15.00
=== Per-chunk Stats ===
Average chunk size (tokens): 1715.00
Average chunk size (tokens): 525.00
Min chunk size (tokens): 150
Max chunk size (tokens): 5000
Average processing time (s): 15.39
Average output length (s): 529.64
Max chunk size (tokens): 900
Average processing time (s): 11.57
Average output length (s): 176.00
=== Performance Ranges ===
Processing rate range (tokens/s): 80.65 - 125.10
RTF range: 0.03x - 0.04x
Real Time Speed range: 25.00x - 33.33x
Processing rate range (tokens/s): 40.07 - 53.57
RTF range: 0.06x - 0.08x
Real Time Speed range: 12.50x - 16.67x

Binary file not shown. (Before: 231 KiB, After: 230 KiB)

Binary file not shown. (Before: 181 KiB, After: 206 KiB)

Binary file not shown. (Before: 454 KiB, After: 491 KiB)

Binary file not shown. (Before: 246 KiB)

Binary file not shown. (Before: 210 KiB, After: 238 KiB)

Binary file not shown. (Before: 268 KiB, After: 236 KiB)

Binary file not shown. (Before: 233 KiB)

Binary file not shown. (Before: 193 KiB, After: 226 KiB)

Binary file not shown. (Before: 196 KiB, After: 236 KiB)

Binary file not shown. (Before: 764 KiB)

Binary file not shown. (Before: 238 KiB, After: 224 KiB)

Binary file not shown. (Before: 250 KiB, After: 221 KiB)

Binary file not shown. (Before: 459 KiB, After: 463 KiB)

Binary file not shown. (Before: 198 KiB)

Binary file not shown. (Before: 252 KiB, After: 238 KiB)

Binary file not shown. (Before: 258 KiB, After: 263 KiB)

View file

@@ -0,0 +1,198 @@
#!/usr/bin/env python3
"""Script to generate all plots needed for the README."""
import os
import sys
import shutil
from pathlib import Path

from validate_wav import validate_tts

# Get absolute paths
script_dir = Path(__file__).parent.resolve()
project_root = script_dir.parent.parent

# Add directories to Python path for imports
sys.path.append(str(script_dir))
sys.path.append(str(script_dir / "benchmarks"))

# Import test scripts
from benchmark_tts_rtf import main as benchmark_rtf
from test_formats.test_audio_formats import main as test_formats
from benchmark_first_token_stream_unified import main as benchmark_stream
from test_combinations.test_analyze_combined_voices import main as test_voice_analysis

# Remove directories from path after imports
sys.path.remove(str(script_dir))
sys.path.remove(str(script_dir / "benchmarks"))


def ensure_assets_dir():
    """Create assets directory if it doesn't exist."""
    assets_dir = project_root / "assets"
    assets_dir.mkdir(exist_ok=True)
    return assets_dir


def copy_plot(src_path: str, dest_name: str, assets_dir: Path):
    """Copy a plot to the assets directory with a new name."""
    if os.path.exists(src_path):
        shutil.copy2(src_path, assets_dir / dest_name)
        print(f"Copied {src_path} to {assets_dir / dest_name}")
    else:
        print(f"Warning: Source plot not found at {src_path}")


def validate_and_print(wav_path: str, category: str):
    """Validate a WAV file and print results."""
    if not os.path.exists(wav_path):
        print(f"Warning: WAV file not found at {wav_path}")
        return

    print(f"\n=== Validating {category} Audio ===")
    result = validate_tts(wav_path)
    if "error" in result:
        print(f"Error: {result['error']}")
    else:
        print(f"Duration: {result['duration']}")
        print(f"Sample Rate: {result['sample_rate']} Hz")
        print(f"Peak Amplitude: {result['peak_amplitude']}")
        print(f"RMS Level: {result['rms_level']}")
        if result["issues"]:
            print("\nIssues Found:")
            for issue in result["issues"]:
                print(f"- {issue}")
        else:
            print("\nNo issues found")


def main():
    """Generate all plots needed for the README."""
    # Ensure assets directory exists
    prefix = "gpu"
    assets_dir = ensure_assets_dir()

    print("\n=== Generating Format Comparison Plot ===")
    test_formats()
    copy_plot(
        str(script_dir / "test_formats/output/test_formats/format_comparison.png"),
        "format_comparison.png",
        assets_dir,
    )
    # Validate WAV output from format test
    validate_and_print(
        str(script_dir / "test_formats/output/test_formats/speech.wav"),
        "Format Test WAV",
    )

    print("\n=== Generating Voice Analysis Plot ===")
    test_voice_analysis()
    copy_plot(
        str(script_dir / "test_combinations/output/analysis_comparison.png"),
        "voice_analysis.png",
        assets_dir,
    )
    # Validate combined voice output
    validate_and_print(
        str(
            script_dir
            / "test_combinations/output/analysis_combined_af_bella_af_nicole.wav"
        ),
        "Combined Voice",
    )

    print("\n=== Generating Performance Benchmark Plots ===")
    benchmark_rtf()
    copy_plot(
        str(script_dir / f"benchmarks/output_plots/{prefix}_processing_time_rtf.png"),
        f"{prefix}_processing_time.png",
        assets_dir,
    )
    copy_plot(
        str(script_dir / f"benchmarks/output_plots/{prefix}_realtime_factor_rtf.png"),
        f"{prefix}_realtime_factor.png",
        assets_dir,
    )
    # Validate RTF benchmark output (~500 tokens)
    validate_and_print(
        str(script_dir / "benchmarks/output_audio/chunk_450_tokens.wav"),
        "RTF Benchmark",
    )

    print("\n=== Generating Streaming Benchmark Plots ===")
    benchmark_stream()
    # Copy direct streaming plots
    copy_plot(
        str(script_dir / "benchmarks/output_plots/first_token_latency_stream.png"),
        f"{prefix}_first_token_latency_direct.png",
        assets_dir,
    )
    copy_plot(
        str(script_dir / "benchmarks/output_plots/first_token_timeline_stream.png"),
        f"{prefix}_first_token_timeline_direct.png",
        assets_dir,
    )
    copy_plot(
        str(script_dir / "benchmarks/output_plots/total_time_latency_stream.png"),
        f"{prefix}_total_time_latency_direct.png",
        assets_dir,
    )
    # Copy OpenAI streaming plots
    copy_plot(
        str(
            script_dir / "benchmarks/output_plots/first_token_latency_stream_openai.png"
        ),
        f"{prefix}_first_token_latency_openai.png",
        assets_dir,
    )
    copy_plot(
        str(
            script_dir
            / "benchmarks/output_plots/first_token_timeline_stream_openai.png"
        ),
        f"{prefix}_first_token_timeline_openai.png",
        assets_dir,
    )
    copy_plot(
        str(
            script_dir / "benchmarks/output_plots/total_time_latency_stream_openai.png"
        ),
        f"{prefix}_total_time_latency_openai.png",
        assets_dir,
    )

    # Wait a moment for files to be generated
    import time

    time.sleep(2)

    # Validate streaming outputs (~500 tokens)
    validate_and_print(
        str(
            script_dir
            / "benchmarks/output_audio_stream/benchmark_tokens500_run1_stream.wav"
        ),
        "Direct Streaming",
    )
    validate_and_print(
        str(
            script_dir
            / "benchmarks/output_audio_stream_openai/benchmark_tokens500_run1_stream_openai.wav"
        ),
        "OpenAI Streaming",
    )
    validate_and_print(
        str(script_dir / "test_formats/output/test_formats/test_audio.wav"),
        "Format Test WAV",
    )

    print("\nAll plots have been generated and copied to the assets directory")


if __name__ == "__main__":
    main()
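Judging by the audio paths elsewhere in this commit, the script appears to live under `examples/assorted_checks/`; it would then be run from that directory (with the API already up) as `python generate_readme_plots.py`, regenerating each plot and copying it into `assets/`.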

View file

@@ -73,6 +73,7 @@ def generate_speech(
"voice": voice,
"speed": 1.0,
"response_format": "wav", # Use WAV for analysis
"stream": False,
},
)
@@ -193,9 +194,10 @@ def plot_analysis(audio_files: Dict[str, str], output_dir: str):
fig.patch.set_facecolor("#1a1a2e")
num_files = len(audio_files)
# Create subplot grid with proper spacing
# Create subplot grid with proper spacing for waveforms and metrics
total_rows = num_files + 2 # Add one more row for metrics
gs = plt.GridSpec(
num_files + 1, 2, height_ratios=[1.5] * num_files + [1], hspace=0.4, wspace=0.3
total_rows, 2, height_ratios=[1.5] * num_files + [1, 1], hspace=0.4, wspace=0.3
)
# Analyze all files first
@@ -216,48 +218,74 @@ def plot_analysis(audio_files: Dict[str, str], output_dir: str):
# Colors for voices
colors = ["#ff2a6d", "#05d9e8", "#d1f7ff"]
# Create two subplots for metrics with similar scales
# Left subplot: Brightness and Volume
ax1 = plt.subplot(gs[num_files, 0])
metrics1 = [
# Create metrics for each subplot
metrics = [
(
"Brightness",
[chars["spectral_centroid"] / 1000 for chars in all_chars.values()],
"kHz",
),
("Volume", [chars["rms"] * 100 for chars in all_chars.values()], "RMS×100"),
]
# Right subplot: Voice Pitch and Texture
ax2 = plt.subplot(gs[num_files, 1])
metrics2 = [
(
"Voice Pitch",
[min(chars["dominant_frequencies"]) for chars in all_chars.values()],
"Hz",
plt.subplot(gs[num_files, 0]),
[
(
"Volume",
[chars["rms"] * 100 for chars in all_chars.values()],
"RMS×100",
)
],
),
(
"Texture",
[chars["zero_crossing_rate"] * 1000 for chars in all_chars.values()],
"ZCR×1000",
plt.subplot(gs[num_files, 1]),
[
(
"Brightness",
[chars["spectral_centroid"] / 1000 for chars in all_chars.values()],
"kHz",
)
],
),
(
plt.subplot(gs[num_files + 1, 0]),
[
(
"Voice Pitch",
[
min(chars["dominant_frequencies"])
for chars in all_chars.values()
],
"Hz",
)
],
),
(
plt.subplot(gs[num_files + 1, 1]),
[
(
"Texture",
[
chars["zero_crossing_rate"] * 1000
for chars in all_chars.values()
],
"ZCR×1000",
)
],
),
]
def plot_grouped_bars(ax, metrics, show_legend=True):
n_groups = len(metrics)
# Plot each metric
for i, (ax, metric_data) in enumerate(metrics):
n_voices = len(audio_files)
bar_width = 0.25
indices = np.array([0])
indices = np.arange(n_groups)
values = metric_data[0][1]
max_val = max(values)
# Get max value for y-axis scaling
max_val = max(max(m[1]) for m in metrics)
for i, (voice, color) in enumerate(zip(audio_files.keys(), colors)):
values = [m[1][i] for m in metrics]
offset = (i - n_voices / 2 + 0.5) * bar_width
for j, (voice, color) in enumerate(zip(audio_files.keys(), colors)):
offset = (j - n_voices / 2 + 0.5) * bar_width
bars = ax.bar(
indices + offset, values, bar_width, label=voice, color=color, alpha=0.8
indices + offset,
[values[j]],
bar_width,
label=voice,
color=color,
alpha=0.8,
)
# Add value labels on top of bars
@ -274,12 +302,12 @@ def plot_analysis(audio_files: Dict[str, str], output_dir: str):
)
ax.set_xticks(indices)
ax.set_xticklabels([f"{m[0]}\n({m[2]})" for m in metrics])
# Set y-axis limits with some padding
ax.set_xticklabels([f"{metric_data[0][0]}\n({metric_data[0][2]})"])
ax.set_ylim(0, max_val * 1.2)
ax.set_ylabel("Value")
if show_legend:
# Only show legend on first metric plot
if i == 0:
ax.legend(
bbox_to_anchor=(1.05, 1),
loc="upper left",
@ -287,22 +315,11 @@ def plot_analysis(audio_files: Dict[str, str], output_dir: str):
edgecolor="#ffffff",
)
# Plot both subplots
plot_grouped_bars(ax1, metrics1, show_legend=True)
plot_grouped_bars(ax2, metrics2, show_legend=False)
# Style the subplot
setup_plot(fig, ax, metric_data[0][0])
# Style both subplots
setup_plot(fig, ax1, "Brightness and Volume")
setup_plot(fig, ax2, "Voice Pitch and Texture")
# Add y-axis labels
ax1.set_ylabel("Value")
ax2.set_ylabel("Value")
# Adjust the figure size to accommodate the legend
fig.set_size_inches(15, 15)
# Add padding around the entire figure
# Adjust the figure size and padding
fig.set_size_inches(15, 20)
plt.subplots_adjust(right=0.85, top=0.95, bottom=0.05, left=0.1)
plt.savefig(os.path.join(output_dir, "analysis_comparison.png"), dpi=300)
print(f"Saved analysis comparison to {output_dir}/analysis_comparison.png")
@ -332,7 +349,7 @@ def main():
)
parser.add_argument("--url", default="http://localhost:8880", help="API base URL")
parser.add_argument(
"--output-dir",
"--output-dir",
default="examples/assorted_checks/test_combinations/output",
help="Output directory for audio files",
)

View file

@ -66,26 +66,27 @@ def plot_format_comparison(stats: list, output_dir: str):
for i, stat in enumerate(stats):
format_name = stat["format"].upper()
try:
# Handle PCM format differently
if stat["format"] == "pcm":
# Read raw PCM data (16-bit mono)
with open(
os.path.join(output_dir, f"test_audio.{stat['format']}"), "rb"
) as f:
raw_data = f.read()
data = np.frombuffer(raw_data, dtype=np.int16)
data = data.astype(np.float32) / 32768.0 # Convert to float [-1, 1]
sr = 24000
else:
# Read other formats with soundfile
data, sr = sf.read(
os.path.join(output_dir, f"test_audio.{stat['format']}")
)
file_path = os.path.join(output_dir, f"test_audio.{stat['format']}")
# Plot waveform
if stat["format"] == "wav":
# Use scipy.io.wavfile for WAV files
sr, data = wavfile.read(file_path)
data = data.astype(np.float32) / 32768.0 # Convert to float [-1, 1]
elif stat["format"] == "pcm":
# Read raw 16-bit signed little-endian PCM data at 24kHz
data = np.frombuffer(
open(file_path, "rb").read(), dtype="<i2"
) # '<i2' means little-endian 16-bit signed int
data = data.astype(np.float32) / 32768.0 # Convert to float [-1, 1]
sr = 24000 # Known sample rate for our endpoint
else:
# Use soundfile for other formats (mp3, opus, flac)
data, sr = sf.read(file_path)
# Plot waveform with consistent normalization
ax = plt.subplot(gs_waves[i])
time = np.arange(len(data)) / sr
plt.plot(time, data / np.max(np.abs(data)), linewidth=0.5, color="#ff2a6d")
plt.plot(time, data, linewidth=0.5, color="#ff2a6d")
ax.set_xlabel("Time (seconds)")
ax.set_ylabel("")
ax.set_ylim(-1.1, 1.1)
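The raw-PCM branch above can be exercised on its own; a minimal decode sketch, assuming mono S16LE at the endpoint's fixed 24 kHz rate:

```python
import numpy as np

def read_pcm_s16le(path: str, sample_rate: int = 24000):
    """Decode raw mono 16-bit signed little-endian PCM to float32 in [-1, 1]."""
    with open(path, "rb") as f:
        samples = np.frombuffer(f.read(), dtype="<i2")  # little-endian int16
    return samples.astype(np.float32) / 32768.0, sample_rate
```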
@ -200,41 +201,42 @@ def get_audio_stats(file_path: str) -> dict:
"""Get audio file statistics"""
file_size = os.path.getsize(file_path)
file_size_kb = file_size / 1024 # Convert to KB
format_name = Path(file_path).suffix[1:]
try:
# Try reading with soundfile first
if format_name == "wav":
# Use scipy.io.wavfile for WAV files
sample_rate, data = wavfile.read(file_path)
data = data.astype(np.float32) / 32768.0 # Convert to float [-1, 1]
duration = len(data) / sample_rate
channels = 1 if len(data.shape) == 1 else data.shape[1]
elif format_name == "pcm":
# For PCM, read raw 16-bit signed little-endian PCM data at 24kHz
data = np.frombuffer(
open(file_path, "rb").read(), dtype="<i2"
) # '<i2' means little-endian 16-bit signed int
data = data.astype(np.float32) / 32768.0 # Normalize to [-1, 1]
sample_rate = 24000 # Known sample rate for our endpoint
duration = len(data) / sample_rate
channels = 1
else:
# Use soundfile for other formats (mp3, opus, flac)
data, sample_rate = sf.read(file_path)
duration = len(data) / sample_rate
channels = 1 if len(data.shape) == 1 else data.shape[1]
# Calculate audio statistics
stats = {
"format": Path(file_path).suffix[1:],
"file_size_kb": round(file_size_kb, 2),
"duration_seconds": round(duration, 2),
"sample_rate": sample_rate,
"channels": channels,
"min_amplitude": float(np.min(data)),
"max_amplitude": float(np.max(data)),
"mean_amplitude": float(np.mean(np.abs(data))),
"rms_amplitude": float(np.sqrt(np.mean(np.square(data)))),
}
return stats
except:
# For PCM, read raw bytes and estimate duration
with open(file_path, "rb") as f:
data = f.read()
# Assuming 16-bit PCM mono at 24kHz
samples = len(data) // 2 # 2 bytes per sample
duration = samples / 24000
return {
"format": "pcm",
"file_size_kb": round(file_size_kb, 2),
"duration_seconds": round(duration, 2),
"sample_rate": 24000,
"channels": 1,
"note": "PCM stats are estimated from raw bytes",
}
# Calculate audio statistics
stats = {
"format": format_name,
"file_size_kb": round(file_size_kb, 2),
"duration_seconds": round(duration, 2),
"sample_rate": sample_rate,
"channels": channels,
"min_amplitude": float(np.min(data)),
"max_amplitude": float(np.max(data)),
"mean_amplitude": float(np.mean(np.abs(data))),
"rms_amplitude": float(np.sqrt(np.mean(np.square(data)))),
}
return stats
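A quick usage sketch for `get_audio_stats` (the path is hypothetical; the format is inferred from the file suffix):

```python
stats = get_audio_stats("output/test_audio.wav")
print(stats["duration_seconds"], stats["sample_rate"], stats["rms_amplitude"])
```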
def main():
@ -254,13 +256,49 @@ def main():
# Generate and save
start_time = time.time()
response = client.audio.speech.create(
model="kokoro", voice=voice, input=SAMPLE_TEXT, response_format=fmt
# Use requests with stream=False for consistent data handling
response = requests.post(
"http://localhost:8880/v1/audio/speech",
json={
"model": "kokoro",
"voice": voice,
"input": SAMPLE_TEXT,
"response_format": fmt,
"stream": False, # Explicitly disable streaming to get single complete chunk
},
stream=False,
headers={"Accept": f"audio/{fmt}"}, # Explicitly request audio format
)
generation_time = time.time() - start_time
with open(output_path, "wb") as f:
f.write(response.content)
print(f"\nResponse headers for {fmt}:")
for header, value in response.headers.items():
print(f"{header}: {value}")
print(f"Content length: {len(response.content)} bytes")
print(f"First few bytes: {response.content[:20].hex()}")
# Write the file and verify it was written correctly
try:
with open(output_path, "wb") as f:
f.write(response.content)
# Verify file was written
if not output_path.exists():
raise Exception(f"Failed to write {fmt} file")
# Check file size matches content length
written_size = output_path.stat().st_size
if written_size != len(response.content):
raise Exception(
f"File size mismatch: expected {len(response.content)} bytes, got {written_size}"
)
print(f"Successfully wrote {fmt} file")
except Exception as e:
print(f"Error writing {fmt} file: {e}")
continue
# Get stats
file_stats = get_audio_stats(str(output_path))
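For contrast with the `stream=False` request above, a streaming variant would consume chunks as they arrive. A sketch; `"stream": True` and the voice name `"af"` are assumptions for illustration, while the endpoint URL and payload shape mirror the non-streaming call:

```python
import requests

with requests.post(
    "http://localhost:8880/v1/audio/speech",
    json={
        "model": "kokoro",
        "voice": "af",            # hypothetical voice name
        "input": "Hello world.",
        "response_format": "pcm",
        "stream": True,           # assumed flag, mirroring "stream": False above
    },
    stream=True,
) as response:
    with open("test_audio_stream.pcm", "wb") as f:
        for chunk in response.iter_content(chunk_size=4096):
            f.write(chunk)  # write audio as it arrives
```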

View file

@ -4,15 +4,19 @@ import random
import string
from typing import List, Tuple
def create_test_cases() -> List[str]:
"""Create a variety of test cases with different characteristics"""
# Helper to create random text with specific patterns
def random_text(length: int) -> str:
return ''.join(random.choice(string.ascii_letters + string.digits + " .,!?") for _ in range(length))
return "".join(
random.choice(string.ascii_letters + string.digits + " .,!?")
for _ in range(length)
)
test_cases = []
# Base test cases that hit specific patterns
base_cases = [
"Dr. Smith and Mr. Jones discussed the $1,234.56 million investment.",
@ -21,10 +25,10 @@ def create_test_cases() -> List[str]:
"X's and Y's properties cost £50 million in the 1990s",
"こんにちは。今日は!",
]
# Add base cases
test_cases.extend(base_cases)
# Add variations with random content
for length in [100, 1000, 10000]:
# Create 3 variations of each length
@ -35,23 +39,24 @@ def create_test_cases() -> List[str]:
text = text.replace(text[30:40], "$1,234.56")
text = text.replace(text[50:60], "A.B.C. xyz")
test_cases.append(text)
return test_cases
class TextNormalizerInline:
"""Text normalizer using inline patterns"""
def normalize(self, text: str) -> str:
# Replace quotes and brackets
text = text.replace(chr(8216), "'").replace(chr(8217), "'")
text = text.replace("«", chr(8220)).replace("»", chr(8221))
text = text.replace(chr(8220), '"').replace(chr(8221), '"')
text = text.replace("(", "«").replace(")", "»")
# Handle CJK punctuation
for a, b in zip("、。!,:;?", ",.!,:;?"):
text = text.replace(a, b + " ")
text = re.sub(r"[^\S \n]", " ", text)
text = re.sub(r" +", " ", text)
text = re.sub(r"(?<=\n) +(?=\n)", "", text)
@ -61,108 +66,132 @@ class TextNormalizerInline:
text = re.sub(r"\b(?:Mrs\.|MRS\.(?= [A-Z]))", "Mrs", text)
text = re.sub(r"\betc\.(?! [A-Z])", "etc", text)
text = re.sub(r"(?i)\b(y)eah?\b", r"\1e'a", text)
text = re.sub(r"\d*\.\d+|\b\d{4}s?\b|(?<!:)\b(?:[1-9]|1[0-2]):[0-5]\d\b(?!:)", split_num, text)
text = re.sub(
r"\d*\.\d+|\b\d{4}s?\b|(?<!:)\b(?:[1-9]|1[0-2]):[0-5]\d\b(?!:)",
split_num,
text,
)
text = re.sub(r"(?<=\d),(?=\d)", "", text)
text = re.sub(r"(?i)[$£]\d+(?:\.\d+)?(?: hundred| thousand| (?:[bm]|tr)illion)*\b|[$£]\d+\.\d\d?\b", handle_money, text)
text = re.sub(
r"(?i)[$£]\d+(?:\.\d+)?(?: hundred| thousand| (?:[bm]|tr)illion)*\b|[$£]\d+\.\d\d?\b",
handle_money,
text,
)
text = re.sub(r"\d*\.\d+", handle_decimal, text)
text = re.sub(r"(?<=\d)-(?=\d)", " to ", text)
text = re.sub(r"(?<=\d)S", " S", text)
text = re.sub(r"(?<=[BCDFGHJ-NP-TV-Z])'?s\b", "'S", text)
text = re.sub(r"(?<=X')S\b", "s", text)
text = re.sub(r"(?:[A-Za-z]\.){2,} [a-z]", lambda m: m.group().replace(".", "-"), text)
text = re.sub(
r"(?:[A-Za-z]\.){2,} [a-z]", lambda m: m.group().replace(".", "-"), text
)
text = re.sub(r"(?i)(?<=[A-Z])\.(?=[A-Z])", "-", text)
return text.strip()
class TextNormalizerCompiled:
"""Text normalizer using all compiled patterns"""
def __init__(self):
self.patterns = {
'whitespace': re.compile(r"[^\S \n]"),
'multi_space': re.compile(r" +"),
'newline_space': re.compile(r"(?<=\n) +(?=\n)"),
'doctor': re.compile(r"\bD[Rr]\.(?= [A-Z])"),
'mister': re.compile(r"\b(?:Mr\.|MR\.(?= [A-Z]))"),
'miss': re.compile(r"\b(?:Ms\.|MS\.(?= [A-Z]))"),
'mrs': re.compile(r"\b(?:Mrs\.|MRS\.(?= [A-Z]))"),
'etc': re.compile(r"\betc\.(?! [A-Z])"),
'yeah': re.compile(r"(?i)\b(y)eah?\b"),
'numbers': re.compile(r"\d*\.\d+|\b\d{4}s?\b|(?<!:)\b(?:[1-9]|1[0-2]):[0-5]\d\b(?!:)"),
'comma_in_number': re.compile(r"(?<=\d),(?=\d)"),
'money': re.compile(r"(?i)[$£]\d+(?:\.\d+)?(?: hundred| thousand| (?:[bm]|tr)illion)*\b|[$£]\d+\.\d\d?\b"),
'decimal': re.compile(r"\d*\.\d+"),
'range': re.compile(r"(?<=\d)-(?=\d)"),
's_after_number': re.compile(r"(?<=\d)S"),
'possessive_s': re.compile(r"(?<=[BCDFGHJ-NP-TV-Z])'?s\b"),
'x_possessive': re.compile(r"(?<=X')S\b"),
'initials': re.compile(r"(?:[A-Za-z]\.){2,} [a-z]"),
'single_initial': re.compile(r"(?i)(?<=[A-Z])\.(?=[A-Z])")
"whitespace": re.compile(r"[^\S \n]"),
"multi_space": re.compile(r" +"),
"newline_space": re.compile(r"(?<=\n) +(?=\n)"),
"doctor": re.compile(r"\bD[Rr]\.(?= [A-Z])"),
"mister": re.compile(r"\b(?:Mr\.|MR\.(?= [A-Z]))"),
"miss": re.compile(r"\b(?:Ms\.|MS\.(?= [A-Z]))"),
"mrs": re.compile(r"\b(?:Mrs\.|MRS\.(?= [A-Z]))"),
"etc": re.compile(r"\betc\.(?! [A-Z])"),
"yeah": re.compile(r"(?i)\b(y)eah?\b"),
"numbers": re.compile(
r"\d*\.\d+|\b\d{4}s?\b|(?<!:)\b(?:[1-9]|1[0-2]):[0-5]\d\b(?!:)"
),
"comma_in_number": re.compile(r"(?<=\d),(?=\d)"),
"money": re.compile(
r"(?i)[$£]\d+(?:\.\d+)?(?: hundred| thousand| (?:[bm]|tr)illion)*\b|[$£]\d+\.\d\d?\b"
),
"decimal": re.compile(r"\d*\.\d+"),
"range": re.compile(r"(?<=\d)-(?=\d)"),
"s_after_number": re.compile(r"(?<=\d)S"),
"possessive_s": re.compile(r"(?<=[BCDFGHJ-NP-TV-Z])'?s\b"),
"x_possessive": re.compile(r"(?<=X')S\b"),
"initials": re.compile(r"(?:[A-Za-z]\.){2,} [a-z]"),
"single_initial": re.compile(r"(?i)(?<=[A-Z])\.(?=[A-Z])"),
}
def normalize(self, text: str) -> str:
# Replace quotes and brackets
text = text.replace(chr(8216), "'").replace(chr(8217), "'")
text = text.replace("«", chr(8220)).replace("»", chr(8221))
text = text.replace(chr(8220), '"').replace(chr(8221), '"')
text = text.replace("(", "«").replace(")", "»")
# Handle CJK punctuation
for a, b in zip("、。!,:;?", ",.!,:;?"):
text = text.replace(a, b + " ")
# Use compiled patterns
text = self.patterns['whitespace'].sub(" ", text)
text = self.patterns['multi_space'].sub(" ", text)
text = self.patterns['newline_space'].sub("", text)
text = self.patterns['doctor'].sub("Doctor", text)
text = self.patterns['mister'].sub("Mister", text)
text = self.patterns['miss'].sub("Miss", text)
text = self.patterns['mrs'].sub("Mrs", text)
text = self.patterns['etc'].sub("etc", text)
text = self.patterns['yeah'].sub(r"\1e'a", text)
text = self.patterns['numbers'].sub(split_num, text)
text = self.patterns['comma_in_number'].sub("", text)
text = self.patterns['money'].sub(handle_money, text)
text = self.patterns['decimal'].sub(handle_decimal, text)
text = self.patterns['range'].sub(" to ", text)
text = self.patterns['s_after_number'].sub(" S", text)
text = self.patterns['possessive_s'].sub("'S", text)
text = self.patterns['x_possessive'].sub("s", text)
text = self.patterns['initials'].sub(lambda m: m.group().replace(".", "-"), text)
text = self.patterns['single_initial'].sub("-", text)
text = self.patterns["whitespace"].sub(" ", text)
text = self.patterns["multi_space"].sub(" ", text)
text = self.patterns["newline_space"].sub("", text)
text = self.patterns["doctor"].sub("Doctor", text)
text = self.patterns["mister"].sub("Mister", text)
text = self.patterns["miss"].sub("Miss", text)
text = self.patterns["mrs"].sub("Mrs", text)
text = self.patterns["etc"].sub("etc", text)
text = self.patterns["yeah"].sub(r"\1e'a", text)
text = self.patterns["numbers"].sub(split_num, text)
text = self.patterns["comma_in_number"].sub("", text)
text = self.patterns["money"].sub(handle_money, text)
text = self.patterns["decimal"].sub(handle_decimal, text)
text = self.patterns["range"].sub(" to ", text)
text = self.patterns["s_after_number"].sub(" S", text)
text = self.patterns["possessive_s"].sub("'S", text)
text = self.patterns["x_possessive"].sub("s", text)
text = self.patterns["initials"].sub(
lambda m: m.group().replace(".", "-"), text
)
text = self.patterns["single_initial"].sub("-", text)
return text.strip()
class TextNormalizerHybrid:
"""Text normalizer using hybrid approach - compile only complex/frequent patterns"""
def __init__(self):
# Only compile patterns that are complex or frequently used
self.patterns = {
'whitespace': re.compile(r"[^\S \n]"),
'numbers': re.compile(r"\d*\.\d+|\b\d{4}s?\b|(?<!:)\b(?:[1-9]|1[0-2]):[0-5]\d\b(?!:)"),
'money': re.compile(r"(?i)[$£]\d+(?:\.\d+)?(?: hundred| thousand| (?:[bm]|tr)illion)*\b|[$£]\d+\.\d\d?\b"),
'initials': re.compile(r"(?:[A-Za-z]\.){2,} [a-z]")
"whitespace": re.compile(r"[^\S \n]"),
"numbers": re.compile(
r"\d*\.\d+|\b\d{4}s?\b|(?<!:)\b(?:[1-9]|1[0-2]):[0-5]\d\b(?!:)"
),
"money": re.compile(
r"(?i)[$£]\d+(?:\.\d+)?(?: hundred| thousand| (?:[bm]|tr)illion)*\b|[$£]\d+\.\d\d?\b"
),
"initials": re.compile(r"(?:[A-Za-z]\.){2,} [a-z]"),
}
def normalize(self, text: str) -> str:
# Replace quotes and brackets
text = text.replace(chr(8216), "'").replace(chr(8217), "'")
text = text.replace("«", chr(8220)).replace("»", chr(8221))
text = text.replace(chr(8220), '"').replace(chr(8221), '"')
text = text.replace("(", "«").replace(")", "»")
# Handle CJK punctuation
for a, b in zip("、。!,:;?", ",.!,:;?"):
text = text.replace(a, b + " ")
# Use compiled patterns for complex operations
text = self.patterns['whitespace'].sub(" ", text)
text = self.patterns['numbers'].sub(split_num, text)
text = self.patterns['money'].sub(handle_money, text)
text = self.patterns['initials'].sub(lambda m: m.group().replace(".", "-"), text)
text = self.patterns["whitespace"].sub(" ", text)
text = self.patterns["numbers"].sub(split_num, text)
text = self.patterns["money"].sub(handle_money, text)
text = self.patterns["initials"].sub(
lambda m: m.group().replace(".", "-"), text
)
# Use inline patterns for simpler operations
text = re.sub(r" +", " ", text)
text = re.sub(r"(?<=\n) +(?=\n)", "", text)
@ -179,9 +208,10 @@ class TextNormalizerHybrid:
text = re.sub(r"(?<=[BCDFGHJ-NP-TV-Z])'?s\b", "'S", text)
text = re.sub(r"(?<=X')S\b", "s", text)
text = re.sub(r"(?i)(?<=[A-Z])\.(?=[A-Z])", "-", text)
return text.strip()
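A quick demonstration of any of the three normalizers on one of the base test cases (output depends on the `split_num`/`handle_money`/`handle_decimal` handlers defined below):

```python
norm = TextNormalizerCompiled()
sample = "Dr. Smith and Mr. Jones discussed the $1,234.56 million investment."
print(norm.normalize(sample))  # titles expanded, number/money handlers applied
```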
def split_num(match: re.Match) -> str:
"""Split numbers for TTS processing"""
num = match.group(0)
@ -192,61 +222,70 @@ def split_num(match: re.Match) -> str:
return f"{num[:-1]} s"
return num
def handle_money(match: re.Match) -> str:
"""Format money strings for TTS"""
text = match.group(0)
return text.replace("$", " dollars ").replace("£", " pounds ")
def handle_decimal(match: re.Match) -> str:
"""Format decimal numbers for TTS"""
num = match.group(0)
return num.replace(".", " point ")
def benchmark_normalizers(test_cases: List[str], iterations: int = 100) -> Tuple[float, float, float]:
def benchmark_normalizers(
test_cases: List[str], iterations: int = 100
) -> Tuple[float, float, float]:
"""Benchmark all three implementations"""
normalizers = {
'inline': TextNormalizerInline(),
'compiled': TextNormalizerCompiled(),
'hybrid': TextNormalizerHybrid()
"inline": TextNormalizerInline(),
"compiled": TextNormalizerCompiled(),
"hybrid": TextNormalizerHybrid(),
}
results = {}
# Test each normalizer
for name, normalizer in normalizers.items():
start = time.perf_counter()
# Run normalizations
for _ in range(iterations):
for test in test_cases:
normalizer.normalize(test)
results[name] = time.perf_counter() - start
return results
def verify_outputs(test_cases: List[str]) -> bool:
"""Verify that all implementations produce identical output"""
normalizers = {
'inline': TextNormalizerInline(),
'compiled': TextNormalizerCompiled(),
'hybrid': TextNormalizerHybrid()
"inline": TextNormalizerInline(),
"compiled": TextNormalizerCompiled(),
"hybrid": TextNormalizerHybrid(),
}
for test in test_cases:
results = [norm.normalize(test) for norm in normalizers.values()]
if not all(r == results[0] for r in results):
return False
return True
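The premise being benchmarked can be checked in isolation. Note that `re` caches compiled patterns internally, so gains from precompiling are usually modest, mainly skipping the cache lookup; a micro-check sketch using one of the patterns above:

```python
import re
import timeit

pat = re.compile(r"(?<=\d),(?=\d)")
s = "1,234,567 and 8,910"
t_inline = timeit.timeit(lambda: re.sub(r"(?<=\d),(?=\d)", "", s), number=100_000)
t_compiled = timeit.timeit(lambda: pat.sub("", s), number=100_000)
print(f"inline: {t_inline:.3f}s  compiled: {t_compiled:.3f}s")
```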
def main():
# Create test cases
print("Generating test cases...")
test_cases = create_test_cases()
total_chars = sum(len(t) for t in test_cases)
print(f"Created {len(test_cases)} test cases, total size: {total_chars:,} characters")
print(
f"Created {len(test_cases)} test cases, total size: {total_chars:,} characters"
)
# Verify output consistency
print("\nVerifying output consistency...")
if verify_outputs(test_cases):
@ -254,15 +293,16 @@ def main():
else:
print("✗ Warning: Implementations produce different outputs!")
return
# Run benchmarks
print("\nRunning benchmarks...")
iterations = 100
results = benchmark_normalizers(test_cases, iterations)
# Print results
print(f"\nResults for {iterations} iterations: ")
for name, time_taken in results.items():
print(f"{name.capitalize()}: {time_taken:.3f}s")
main()

View file

@ -1,8 +1,11 @@
import argparse
from typing import Any, Dict
from pathlib import Path
import numpy as np
import soundfile as sf
import argparse
from pathlib import Path
from typing import Dict, Any
from tqdm import tqdm
def validate_tts(wav_path: str) -> dict:
"""
@ -13,34 +16,40 @@ def validate_tts(wav_path: str) -> dict:
audio, sr = sf.read(wav_path)
if len(audio.shape) > 1:
audio = np.mean(audio, axis=1)
duration = len(audio) / sr
issues = []
# Basic quality checks
abs_audio = np.abs(audio)
stats = {
'rms': float(np.sqrt(np.mean(audio**2))),
'peak': float(np.max(abs_audio)),
'dc_offset': float(np.mean(audio))
"rms": float(np.sqrt(np.mean(audio**2))),
"peak": float(np.max(abs_audio)),
"dc_offset": float(np.mean(audio)),
}
clip_count = np.sum(abs_audio >= 0.99)
clip_percent = (clip_count / len(audio)) * 100
if duration < 0.1:
issues.append("WARNING: Audio is suspiciously short - possible failed generation")
if stats['peak'] >= 1.0:
issues.append(
"WARNING: Audio is suspiciously short - possible failed generation"
)
if stats["peak"] >= 1.0:
if clip_percent > 1.0:
issues.append(f"WARNING: Significant clipping detected ({clip_percent:.2e}% of samples)")
issues.append(
f"WARNING: Significant clipping detected ({clip_percent:.2e}% of samples)"
)
elif clip_percent > 0.01:
issues.append(f"INFO: Minor peak limiting detected ({clip_percent:.2e}% of samples)")
if stats['rms'] < 0.01:
issues.append(
f"INFO: Minor peak limiting detected ({clip_percent:.2e}% of samples)"
)
if stats["rms"] < 0.01:
issues.append("WARNING: Audio is very quiet - possible failed generation")
if abs(stats['dc_offset']) > 0.1:
if abs(stats["dc_offset"]) > 0.1:
issues.append(f"WARNING: High DC offset ({stats['dc_offset']:.3f})")
# Check for long silence gaps
@ -51,66 +60,79 @@ def validate_tts(wav_path: str) -> dict:
window_size = int(min_silence * sr)
silence_count = 0
last_silence = -1
start_idx = int(0.2 * sr) # Skip first 0.2s
for i in range(start_idx, len(db) - window_size, window_size):
window = db[i:i+window_size]
for i in tqdm(
range(start_idx, len(db) - window_size, window_size),
desc="Checking for silence",
):
window = db[i : i + window_size]
if np.mean(window) < silence_threshold:
silent_ratio = np.mean(window < silence_threshold)
if silent_ratio > 0.9:
if last_silence == -1 or (i/sr - last_silence) > 2.0:
if last_silence == -1 or (i / sr - last_silence) > 2.0:
silence_count += 1
last_silence = i/sr
issues.append(f"WARNING: Long silence detected at {i/sr:.2f}s (duration: {min_silence:.1f}s)")
last_silence = i / sr
issues.append(
f"WARNING: Long silence detected at {i/sr:.2f}s (duration: {min_silence:.1f}s)"
)
if silence_count > 2:
issues.append(f"WARNING: Multiple long silences found ({silence_count} total)")
issues.append(
f"WARNING: Multiple long silences found ({silence_count} total)"
)
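The `db` array scanned above is computed earlier in the function and not shown in this hunk; presumably a standard log-magnitude envelope along these lines (the threshold and gap values here are hypothetical):

```python
db = 20 * np.log10(np.abs(audio) + 1e-10)  # per-sample log magnitude
silence_threshold = -45.0                   # hypothetical dB floor
min_silence = 1.0                           # hypothetical gap length, seconds
```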
# Detect audio artifacts
diff = np.diff(audio)
abs_diff = np.abs(diff)
window_size = min(int(0.005 * sr), 256)
window = np.ones(window_size)/window_size
local_avg_diff = np.convolve(abs_diff, window, mode='same')
window = np.ones(window_size) / window_size
local_avg_diff = np.convolve(abs_diff, window, mode="same")
spikes = (abs_diff > (10 * local_avg_diff)) & (abs_diff > 0.1)
artifact_indices = np.nonzero(spikes)[0]
artifacts = []
if len(artifact_indices) > 0:
gaps = np.diff(artifact_indices)
min_gap = int(0.005 * sr)
break_points = np.nonzero(gaps > min_gap)[0] + 1
groups = np.split(artifact_indices, break_points)
for group in groups:
if len(group) >= 5:
severity = np.max(abs_diff[group])
if severity > 0.2:
center_idx = group[len(group)//2]
artifacts.append({
'time': float(center_idx/sr), # Ensure float for consistent timing
'severity': float(severity)
})
center_idx = group[len(group) // 2]
artifacts.append(
{
"time": float(
center_idx / sr
), # Ensure float for consistent timing
"severity": float(severity),
}
)
issues.append(
f"WARNING: Audio discontinuity at {center_idx/sr:.3f}s "
f"(severity: {severity:.3f})"
)
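The spike test above (first difference exceeding 10× its local moving average, plus an absolute floor of 0.1) can be sanity-checked on a synthetic click:

```python
import numpy as np

x = np.sin(np.linspace(0, 2 * np.pi, 1000)).astype(np.float32)
x[500] += 0.8                        # inject a discontinuity
d = np.abs(np.diff(x))
w = np.ones(64) / 64
local = np.convolve(d, w, mode="same")
spikes = (d > 10 * local) & (d > 0.1)
print(np.nonzero(spikes)[0])         # -> [499 500], the injected click
```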
# Check for repeated speech segments
for chunk_duration in [5.0, 10.0]:
for chunk_duration in tqdm(
[0.5, 2.5, 5.0, 10.0], desc="Checking for repeated speech"
):
chunk_size = int(chunk_duration * sr)
overlap = int(0.2 * chunk_size)
for i in range(0, len(audio) - 2*chunk_size, overlap):
chunk1 = audio[i:i+chunk_size]
chunk2 = audio[i+chunk_size:i+2*chunk_size]
for i in range(0, len(audio) - 2 * chunk_size, overlap):
chunk1 = audio[i : i + chunk_size]
chunk2 = audio[i + chunk_size : i + 2 * chunk_size]
if np.mean(np.abs(chunk1)) < 0.01 or np.mean(np.abs(chunk2)) < 0.01:
continue
try:
correlation = np.corrcoef(chunk1, chunk2)[0,1]
correlation = np.corrcoef(chunk1, chunk2)[0, 1]
if not np.isnan(correlation) and correlation > 0.92:
issues.append(
f"WARNING: Possible repeated speech at {i/sr:.1f}s "
@ -128,92 +150,113 @@ def validate_tts(wav_path: str) -> dict:
"rms_level": f"{stats['rms']:.3f}",
"dc_offset": f"{stats['dc_offset']:.3f}",
"artifact_count": len(artifacts),
"artifact_locations": [a['time'] for a in artifacts],
"artifact_severities": [a['severity'] for a in artifacts],
"artifact_locations": [a["time"] for a in artifacts],
"artifact_severities": [a["severity"] for a in artifacts],
"issues": issues,
"valid": len(issues) == 0
}
except Exception as e:
return {
"file": wav_path,
"error": str(e),
"valid": False
"valid": len(issues) == 0,
}
def generate_analysis_plots(wav_path: str, output_dir: str, validation_result: Dict[str, Any]):
except Exception as e:
return {"file": wav_path, "error": str(e), "valid": False}
def generate_analysis_plots(
wav_path: str, output_dir: str, validation_result: Dict[str, Any]
):
"""
Generate analysis plots for audio file with time-aligned visualizations.
"""
import matplotlib.pyplot as plt
from scipy.signal import spectrogram
# Load audio
audio, sr = sf.read(wav_path)
if len(audio.shape) > 1:
audio = np.mean(audio, axis=1)
# Create figure with shared x-axis
fig = plt.figure(figsize=(15, 8))
gs = plt.GridSpec(2, 1, height_ratios=[1.2, 0.8], hspace=0.1)
ax1 = fig.add_subplot(gs[0])
ax2 = fig.add_subplot(gs[1], sharex=ax1)
# Calculate spectrogram
nperseg = 2048
noverlap = 1536
f, t, Sxx = spectrogram(audio, sr, nperseg=nperseg, noverlap=noverlap,
window='hann', scaling='spectrum')
f, t, Sxx = spectrogram(
audio, sr, nperseg=nperseg, noverlap=noverlap, window="hann", scaling="spectrum"
)
# Plot spectrogram
im = ax1.pcolormesh(t, f, 10 * np.log10(Sxx + 1e-10),
shading='gouraud', cmap='viridis',
vmin=-100, vmax=-20)
ax1.set_ylabel('Frequency [Hz]', fontsize=10)
cbar = plt.colorbar(im, ax=ax1, label='dB')
ax1.set_title('Spectrogram', pad=10, fontsize=12)
im = ax1.pcolormesh(
t,
f,
10 * np.log10(Sxx + 1e-10),
shading="gouraud",
cmap="viridis",
vmin=-100,
vmax=-20,
)
ax1.set_ylabel("Frequency [Hz]", fontsize=10)
cbar = plt.colorbar(im, ax=ax1, label="dB")
ax1.set_title("Spectrogram", pad=10, fontsize=12)
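Worked numbers for the STFT window above, at the service's 24 kHz output rate:

```python
nperseg, noverlap, sr = 2048, 1536, 24000
hop = nperseg - noverlap      # 512 samples between frames
print(hop / sr)               # ~0.0213 s time resolution
print(sr / nperseg)           # ~11.72 Hz per frequency bin
```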
# Plot waveform with exact time alignment
times = np.arange(len(audio)) / sr
ax2.plot(times, audio, color='#2E5596', alpha=0.7, linewidth=0.5, label='Audio')
ax2.set_ylabel('Amplitude', fontsize=10)
ax2.set_xlabel('Time [sec]', fontsize=10)
ax2.plot(times, audio, color="#2E5596", alpha=0.7, linewidth=0.5, label="Audio")
ax2.set_ylabel("Amplitude", fontsize=10)
ax2.set_xlabel("Time [sec]", fontsize=10)
ax2.grid(True, alpha=0.2)
# Add artifact markers
if 'artifact_locations' in validation_result and validation_result['artifact_locations']:
for loc in validation_result['artifact_locations']:
ax1.axvline(x=loc, color='red', alpha=0.7, linewidth=2)
ax2.axvline(x=loc, color='red', alpha=0.7, linewidth=2, label='Detected Artifacts')
if (
"artifact_locations" in validation_result
and validation_result["artifact_locations"]
):
for loc in validation_result["artifact_locations"]:
ax1.axvline(x=loc, color="red", alpha=0.7, linewidth=2)
ax2.axvline(
x=loc, color="red", alpha=0.7, linewidth=2, label="Detected Artifacts"
)
# Add legend to both plots
if len(validation_result['artifact_locations']) > 0:
ax1.plot([], [], color='red', linewidth=2, label='Detected Artifacts')
ax1.legend(loc='upper right', fontsize=8)
if len(validation_result["artifact_locations"]) > 0:
ax1.plot([], [], color="red", linewidth=2, label="Detected Artifacts")
ax1.legend(loc="upper right", fontsize=8)
# Only add unique labels to legend
handles, labels = ax2.get_legend_handles_labels()
unique_labels = dict(zip(labels, handles))
ax2.legend(unique_labels.values(), unique_labels.keys(),
loc='upper right', fontsize=8)
ax2.legend(
unique_labels.values(),
unique_labels.keys(),
loc="upper right",
fontsize=8,
)
# Set common x limits
xlim = (0, len(audio)/sr)
xlim = (0, len(audio) / sr)
ax1.set_xlim(xlim)
ax2.set_xlim(xlim)
og_filename = Path(wav_path).name.split(".")[0]
# Save plot
plt.savefig(Path(output_dir) / f"{og_filename}_audio_analysis.png", dpi=300, bbox_inches='tight')
plt.savefig(
Path(output_dir) / f"{og_filename}_audio_analysis.png",
dpi=300,
bbox_inches="tight",
)
plt.close()
if __name__ == "__main__":
wav_file = r"C:\Users\jerem\Desktop\Kokoro-FastAPI\examples\output.wav"
silent=False
if __name__ == "__main__":
wav_file = r"C:\Users\jerem\Desktop\Kokoro-FastAPI\examples\assorted_checks\benchmarks\output_audio\chunk_600_tokens.wav"
silent = False
print(f"\n\n Processing:\n\t{wav_file}")
result = validate_tts(wav_file)
if not silent:
wav_root_dir = Path(wav_file).parent
generate_analysis_plots(wav_file, wav_root_dir, result)
print(f"\nValidating: {result['file']}")
if "error" in result:
print(f"Error: {result['error']}")
@ -224,10 +267,10 @@ if __name__ == "__main__":
print(f"RMS Level: {result['rms_level']}")
print(f"DC Offset: {result['dc_offset']}")
print(f"Detected Artifacts: {result['artifact_count']}")
if result["issues"]:
print("\nIssues Found:")
for issue in result["issues"]:
print(f"- {issue}")
else:
print("\nNo issues found")
print("\nNo issues found")

View file

@ -1,7 +1,9 @@
import argparse
from pathlib import Path
from validate_wav import validate_tts
def print_validation_result(result: dict, rel_path: Path):
"""Print full validation details for a single file."""
print(f"\nValidating: {rel_path}")
@ -13,7 +15,7 @@ def print_validation_result(result: dict, rel_path: Path):
print(f"Peak Amplitude: {result['peak_amplitude']}")
print(f"RMS Level: {result['rms_level']}")
print(f"DC Offset: {result['dc_offset']}")
if result["issues"]:
print("\nIssues Found:")
for issue in result["issues"]:
@ -21,25 +23,26 @@ def print_validation_result(result: dict, rel_path: Path):
else:
print("\nNo issues found")
def validate_directory(directory: str):
"""Validate all wav files in a directory with detailed output and summary."""
dir_path = Path(directory)
# Find all wav files (including nested directories)
wav_files = list(dir_path.rglob("*.wav"))
wav_files.extend(dir_path.rglob("*.mp3")) # Also check mp3s
wav_files = sorted(wav_files)
if not wav_files:
print(f"No .wav or .mp3 files found in {directory}")
return
print(f"Found {len(wav_files)} files in {directory}")
print("=" * 80)
# Store results for summary
results = []
# Detailed validation output
for wav_file in wav_files:
result = validate_tts(str(wav_file))
@ -47,7 +50,7 @@ def validate_directory(directory: str):
print_validation_result(result, rel_path)
results.append((rel_path, result))
print("=" * 80)
# Summary with detailed issues
print("\nSUMMARY:")
for rel_path, result in results:
@ -58,15 +61,18 @@ def validate_directory(directory: str):
issues = result["issues"]
first_issue = issues[0].replace("WARNING: ", "")
if len(issues) > 1:
print(f"{rel_path}: FAIL - {first_issue} (+{len(issues)-1} more issues)")
print(
f"{rel_path}: FAIL - {first_issue} (+{len(issues)-1} more issues)"
)
else:
print(f"{rel_path}: FAIL - {first_issue}")
else:
print(f"{rel_path}: PASS")
if __name__ == "__main__":
parser = argparse.ArgumentParser(description="Batch validate TTS wav files")
parser.add_argument("directory", help="Directory containing wav files to validate")
args = parser.parse_args()
validate_directory(args.directory)
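Typical invocation; the filename is hypothetical, and the script takes one positional directory argument:

```python
#   python validate_wavs.py examples/assorted_checks/benchmarks/output_audio
validate_directory("examples/assorted_checks/benchmarks/output_audio")
```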

Binary file not shown.

Binary file not shown.

Binary file not shown.

View file

@ -13,7 +13,7 @@ numpy==2.2.1
scipy==1.14.1
# Audio processing
soundfile==0.12.1
soundfile==0.13.0
# Text processing
phonemizer==3.3.0
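A quick post-upgrade sanity check (`soundfile` exposes `__version__`):

```python
import soundfile as sf
print(sf.__version__)  # expected: 0.13.0 after this bump
```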