-update soundfile version
-align with streaming standards
-add audio processing config settings
-more comprehensive model warmup
-minor model improvements
-enhance testing and benchmarking
-add cool ascii logo
remsky 2025-01-06 03:32:41 -07:00
parent 4c6cd83f85
commit 720c1fb97d
77 changed files with 2945 additions and 5522 deletions

.coverage (binary file, not shown)

View file

@@ -129,7 +129,7 @@ response = requests.post(
)
```
<p align="center">
-  <img src="examples/benchmarks/analysis_comparison.png" width="80%" alt="Voice Analysis Comparison" style="border: 2px solid #333; padding: 10px;">
+  <img src="assets/voice_analysis.png" width="80%" alt="Voice Analysis Comparison" style="border: 2px solid #333; padding: 10px;">
</p>
</details>
@@ -144,7 +144,7 @@ response = requests.post(
- pcm
<p align="center">
-  <img src="examples/benchmarks/format_comparison.png" width="80%" alt="Audio Format Comparison" style="border: 2px solid #333; padding: 10px;">
+  <img src="assets/format_comparison.png" width="80%" alt="Audio Format Comparison" style="border: 2px solid #333; padding: 10px;">
</p>
</details>
@@ -175,8 +175,8 @@ Benchmarking was performed on generation via the local API using text lengths up
- H.G. Wells - The Time Machine (full text)
<p align="center">
-  <img src="examples/benchmarks/processing_time.png" width="45%" alt="Processing Time" style="border: 2px solid #333; padding: 10px; margin-right: 1%;">
+  <img src="assets/gpu_processing_time.png" width="45%" alt="Processing Time" style="border: 2px solid #333; padding: 10px; margin-right: 1%;">
-  <img src="examples/benchmarks/realtime_factor.png" width="45%" alt="Realtime Factor" style="border: 2px solid #333; padding: 10px;">
+  <img src="assets/gpu_realtime_factor.png" width="45%" alt="Realtime Factor" style="border: 2px solid #333; padding: 10px;">
</p>
Key Performance Metrics:

View file

@@ -18,6 +18,8 @@ class Settings(BaseSettings):
    onnx_model_path: str = "kokoro-v0_19.onnx"
    voices_dir: str = "voices"
    sample_rate: int = 24000
+   max_chunk_size: int = 300  # Maximum size of text chunks for processing
+   gap_trim_ms: int = 250  # Amount to trim from streaming chunk ends in milliseconds

    # ONNX Optimization Settings
    onnx_num_threads: int = 4  # Number of threads for intra-op parallelism
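
For orientation, the new `gap_trim_ms` setting works out to a fixed sample count at the configured 24 kHz rate; a quick sketch of the arithmetic that `AudioNormalizer` (further down in this commit) performs:

```python
sample_rate = 24000  # Settings.sample_rate
gap_trim_ms = 250    # Settings.gap_trim_ms default

samples_to_trim = int(gap_trim_ms * sample_rate / 1000)
print(samples_to_trim)  # 6000 samples trimmed from the end of each non-final chunk
```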

View file

@@ -0,0 +1,9 @@
In a village of La Mancha, the name of which I have no desire to call
to mind, there lived not long since one of those gentlemen that keep a
lance in the lance-rack, an old buckler, a lean hack, and a greyhound
for coursing. An olla of rather more beef than mutton, a salad on most
nights, scraps on Saturdays, lentils on Fridays, and a pigeon or so
extra on Sundays, made away with three-quarters of his income. The rest
of it went in a doublet of fine cloth and velvet breeches and shoes to
match for holidays, while on week-days he made a brave figure in his
best homespun.

View file

@@ -22,10 +22,11 @@ async def lifespan(app: FastAPI):
    logger.info("Loading TTS model and voice packs...")

    # Initialize the main model with warm-up
-   voicepack_count = TTSModel.setup()
+   voicepack_count = await TTSModel.setup()
    # boundary = "█████╗"*9
-   boundary = "" * 30
+   boundary = "" * 24
    startup_msg =f"""
{boundary}

@@ -37,8 +38,9 @@ async def lifespan(app: FastAPI):
{boundary}
"""
-   startup_msg += f"\nModel loaded and warmed up on {TTSModel.get_device()}"
-   startup_msg += f"\n{voicepack_count} voice packs loaded successfully\n"
+   # TODO: Improve CPU warmup, threads, memory, etc
+   startup_msg += f"\nModel warmed up on {TTSModel.get_device()}"
+   startup_msg += f"\n{voicepack_count} voice packs loaded\n"
    startup_msg += f"\n{boundary}\n"
    logger.info(startup_msg)

View file

@@ -83,8 +83,8 @@ async def create_speech(
            audio,
            24000,
            request.response_format,
-           is_first_chunk=True
-       )
+           is_first_chunk=True,
+           stream=False)
        return Response(
            content=content,

View file

@@ -4,22 +4,30 @@ from io import BytesIO
import numpy as np
import soundfile as sf
+import scipy.io.wavfile as wavfile
from loguru import logger
+from ..core.config import settings


class AudioNormalizer:
    """Handles audio normalization state for a single stream"""

    def __init__(self):
        self.int16_max = np.iinfo(np.int16).max
+       self.chunk_trim_ms = settings.gap_trim_ms
+       self.sample_rate = 24000  # Sample rate of the audio
+       self.samples_to_trim = int(self.chunk_trim_ms * self.sample_rate / 1000)

-   def normalize(self, audio_data: np.ndarray) -> np.ndarray:
-       """Normalize audio data to int16 range"""
+   def normalize(self, audio_data: np.ndarray, is_last_chunk: bool = False) -> np.ndarray:
+       """Normalize audio data to int16 range and trim chunk boundaries"""
        # Convert to float32 if not already
        audio_float = audio_data.astype(np.float32)
        # Normalize to [-1, 1] range first
        if np.max(np.abs(audio_float)) > 0:
            audio_float = audio_float / np.max(np.abs(audio_float))
+       # Trim end of non-final chunks to reduce gaps
+       if not is_last_chunk and len(audio_float) > self.samples_to_trim:
+           audio_float = audio_float[:-self.samples_to_trim]
        # Scale to int16 range
        return (audio_float * self.int16_max).astype(np.int16)
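
The normalize-and-trim behavior above is easy to sanity-check in isolation; a minimal self-contained sketch, assuming the 24 kHz rate and 250 ms default from the config:

```python
import numpy as np

SAMPLE_RATE = 24000                               # Settings.sample_rate
SAMPLES_TO_TRIM = int(250 * SAMPLE_RATE / 1000)   # default gap_trim_ms

def normalize(audio: np.ndarray, is_last_chunk: bool = False) -> np.ndarray:
    audio = audio.astype(np.float32)
    peak = np.max(np.abs(audio))
    if peak > 0:
        audio = audio / peak                      # scale into [-1, 1]
    if not is_last_chunk and len(audio) > SAMPLES_TO_TRIM:
        audio = audio[:-SAMPLES_TO_TRIM]          # drop the trailing gap
    return (audio * np.iinfo(np.int16).max).astype(np.int16)

chunk = np.sin(np.linspace(0, 100, SAMPLE_RATE))  # 1 s test tone
print(len(normalize(chunk)))                      # 18000 (6000 samples trimmed)
print(len(normalize(chunk, is_last_chunk=True)))  # 24000 (final chunk untouched)
```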
@@ -27,13 +35,30 @@ class AudioNormalizer:
class AudioService:
    """Service for audio format conversions"""

+   # Default audio format settings balanced for speed and compression
+   DEFAULT_SETTINGS = {
+       "mp3": {
+           "bitrate_mode": "CONSTANT",  # Faster than variable bitrate
+           "compression_level": 0.0,  # Balanced compression
+       },
+       "opus": {
+           "compression_level": 0.0,  # Good balance for speech
+       },
+       "flac": {
+           "compression_level": 0.0,  # Light compression, still fast
+       }
+   }

    @staticmethod
    def convert_audio(
        audio_data: np.ndarray,
        sample_rate: int,
        output_format: str,
        is_first_chunk: bool = True,
-       normalizer: AudioNormalizer = None
+       is_last_chunk: bool = False,
+       normalizer: AudioNormalizer = None,
+       format_settings: dict = None,
+       stream: bool = True
    ) -> bytes:
        """Convert audio data to specified format

@@ -42,6 +67,19 @@ class AudioService:
            sample_rate: Sample rate of the audio
            output_format: Target format (wav, mp3, opus, flac, pcm)
            is_first_chunk: Whether this is the first chunk of a stream
+           normalizer: Optional AudioNormalizer instance for consistent normalization across chunks
+           format_settings: Optional dict of format-specific settings to override defaults
+               Example: {
+                   "mp3": {
+                       "bitrate_mode": "VARIABLE",
+                       "compression_level": 0.8
+                   }
+               }
+               Default settings balance speed and compression:
+               optimized for localhost @ 0.0
+               - MP3: constant bitrate, no compression (0.0)
+               - OPUS: no compression (0.0)
+               - FLAC: no compression (0.0)

        Returns:
            Bytes of the converted audio

@@ -50,31 +88,48 @@ class AudioService:
        try:
            # Always normalize audio to ensure proper amplitude scaling
-           if normalizer is None:
-               normalizer = AudioNormalizer()
-           normalized_audio = normalizer.normalize(audio_data)
+           if stream:
+               if normalizer is None:
+                   normalizer = AudioNormalizer()
+               normalized_audio = normalizer.normalize(audio_data, is_last_chunk=is_last_chunk)
+           else:
+               normalized_audio = audio_data

            if output_format == "pcm":
-               logger.info("Writing PCM data...")
                # Raw 16-bit PCM samples, no header
                buffer.write(normalized_audio.tobytes())
            elif output_format == "wav":
-               logger.info("Writing to WAV format...")
-               # Always include WAV header for WAV format
-               sf.write(buffer, normalized_audio, sample_rate, format="WAV", subtype='PCM_16')
+               if stream:
+                   # Use soundfile for streaming to ensure proper headers
+                   sf.write(buffer, normalized_audio, sample_rate, format="WAV", subtype='PCM_16')
+               else:
+                   # Trying scipy.io.wavfile for non-streaming WAV generation
+                   # seems faster than soundfile
+                   # avoids overhead from header generation and PCM encoding
+                   wavfile.write(buffer, sample_rate, normalized_audio)
            elif output_format == "mp3":
-               logger.info("Converting to MP3 format...")
-               # Use lower bitrate for streaming
-               sf.write(buffer, normalized_audio, sample_rate, format="MP3")
+               # Use format settings or defaults
+               settings = format_settings.get("mp3", {}) if format_settings else {}
+               settings = {**AudioService.DEFAULT_SETTINGS["mp3"], **settings}
+               sf.write(
+                   buffer, normalized_audio,
+                   sample_rate, format="MP3",
+                   **settings
+               )
            elif output_format == "opus":
-               logger.info("Converting to Opus format...")
-               # Use lower bitrate and smaller frame size for streaming
-               sf.write(buffer, normalized_audio, sample_rate, format="OGG", subtype="OPUS")
+               settings = format_settings.get("opus", {}) if format_settings else {}
+               settings = {**AudioService.DEFAULT_SETTINGS["opus"], **settings}
+               sf.write(buffer, normalized_audio, sample_rate, format="OGG",
+                        subtype="OPUS", **settings)
            elif output_format == "flac":
-               logger.info("Converting to FLAC format...")
-               # Use smaller block size for streaming
+               if is_first_chunk:
+                   logger.info("Starting FLAC stream...")
+               settings = format_settings.get("flac", {}) if format_settings else {}
+               settings = {**AudioService.DEFAULT_SETTINGS["flac"], **settings}
                sf.write(buffer, normalized_audio, sample_rate, format="FLAC",
-                        subtype='PCM_16')
+                        subtype='PCM_16', **settings)
            else:
                if output_format == "aac":
                    raise ValueError(
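
With the new `format_settings` parameter, callers can override the per-format defaults at conversion time. A usage sketch; the import path is an assumption based on the repo layout, and the override keys mirror the soundfile settings shown above:

```python
import numpy as np
# Hypothetical import path; adjust to wherever AudioService lives in this repo.
from api.src.services.audio import AudioService

audio = np.zeros(24000, dtype=np.float32)  # one second of silence at 24 kHz
mp3_bytes = AudioService.convert_audio(
    audio,
    24000,
    "mp3",
    format_settings={"mp3": {"bitrate_mode": "VARIABLE", "compression_level": 0.8}},
    stream=False,  # one-shot conversion: skip chunk normalization and trimming
)
```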

View file

@@ -0,0 +1,52 @@
"""Text chunking service"""

import re

from ...core.config import settings


def split_text(text: str, max_chunk=None):
    """Split text into chunks on natural pause points

    Args:
        text: Text to split into chunks
        max_chunk: Maximum chunk size (defaults to settings.max_chunk_size)
    """
    if max_chunk is None:
        max_chunk = settings.max_chunk_size

    if not isinstance(text, str):
        text = str(text) if text is not None else ""

    text = text.strip()
    if not text:
        return

    # First split into sentences
    sentences = re.split(r"(?<=[.!?])\s+", text)
    for sentence in sentences:
        sentence = sentence.strip()
        if not sentence:
            continue

        # For medium-length sentences, split on punctuation
        if len(sentence) > max_chunk:  # Lower threshold for more consistent sizes
            # First try splitting on semicolons and colons
            parts = re.split(r"(?<=[;:])\s+", sentence)
            for part in parts:
                part = part.strip()
                if not part:
                    continue

                # If part is still long, split on commas
                if len(part) > max_chunk:
                    subparts = re.split(r"(?<=,)\s+", part)
                    for subpart in subparts:
                        subpart = subpart.strip()
                        if subpart:
                            yield subpart
                else:
                    yield part
        else:
            yield sentence
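
A quick usage sketch of the generator (this mirrors the new test_chunker tests further down; passing `max_chunk` explicitly sidesteps the settings import):

```python
chunks = list(split_text("First part, second part, third part.", max_chunk=15))
print(chunks)  # ['First part,', 'second part,', 'third part.']
```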

View file

@@ -15,7 +15,7 @@ class TTSBaseModel(ABC):
    VOICES_DIR = os.path.join(os.path.dirname(os.path.dirname(__file__)), "voices")

    @classmethod
-   def setup(cls):
+   async def setup(cls):
        """Initialize model and setup voices"""
        with cls._lock:
            # Set device

@@ -59,19 +59,23 @@ class TTSBaseModel(ABC):
            except Exception as e:
                logger.error(f"Error copying voice {voice_name}: {str(e)}")

-           # Warm up with default voice
+           # Load warmup text
            try:
-               dummy_text = "Hello"
-               voice_path = os.path.join(cls.VOICES_DIR, "af.pt")
-               dummy_voicepack = torch.load(voice_path, map_location=cls._device, weights_only=True)
-               # Process text and generate audio
-               phonemes, tokens = cls.process_text(dummy_text, "a")
-               cls.generate_from_tokens(tokens, dummy_voicepack, 1.0)
-               logger.info("Model warm-up complete")
+               with open(os.path.join(os.path.dirname(os.path.dirname(__file__)), "core", "don_quixote.txt")) as f:
+                   warmup_text = f.read()
            except Exception as e:
-               logger.warning(f"Model warm-up failed: {e}")
+               logger.warning(f"Failed to load warmup text: {e}")
+               warmup_text = "This is a warmup text that will be split into chunks for processing."

+           # Use warmup service
+           from .warmup import WarmupService
+           warmup = WarmupService()

+           # Load and warm up voices
+           loaded_voices = warmup.load_voices()
+           await warmup.warmup_voices(warmup_text, loaded_voices)

+           logger.info("Model warm-up complete")

            # Count voices in directory
            voice_count = len([f for f in os.listdir(cls.VOICES_DIR) if f.endswith(".pt")])
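
Since `setup` is now a coroutine, callers such as the FastAPI lifespan above must await it. A minimal sketch of the pattern, with a hypothetical stand-in class rather than the real model:

```python
import asyncio

class FakeTTSModel:
    """Hypothetical stand-in for TTSBaseModel."""

    @classmethod
    async def setup(cls) -> int:
        await asyncio.sleep(0)  # placeholder for async warmup work
        return 3                # voice pack count

print(asyncio.run(FakeTTSModel.setup()))  # 3
```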

View file

@@ -1,6 +1,7 @@
import os
import numpy as np
import torch
+import time
from loguru import logger
from models import build_model
from .text_processing import phonemize, tokenize

@@ -8,42 +9,97 @@ from .text_processing import phonemize, tokenize
from .tts_base import TTSBaseModel
from ..core.config import settings

+# @torch.no_grad()
+# def forward(model, tokens, ref_s, speed):
+#     """Forward pass through the model"""
+#     device = ref_s.device
+#     tokens = torch.LongTensor([[0, *tokens, 0]]).to(device)
+#     input_lengths = torch.LongTensor([tokens.shape[-1]]).to(device)
+#     text_mask = length_to_mask(input_lengths).to(device)
+#     bert_dur = model.bert(tokens, attention_mask=(~text_mask).int())
+#     d_en = model.bert_encoder(bert_dur).transpose(-1, -2)
+#     s = ref_s[:, 128:]
+#     d = model.predictor.text_encoder(d_en, s, input_lengths, text_mask)
+#     x, _ = model.predictor.lstm(d)
+#     duration = model.predictor.duration_proj(x)
+#     duration = torch.sigmoid(duration).sum(axis=-1) / speed
+#     pred_dur = torch.round(duration).clamp(min=1).long()
+#     pred_aln_trg = torch.zeros(input_lengths, pred_dur.sum().item())
+#     c_frame = 0
+#     for i in range(pred_aln_trg.size(0)):
+#         pred_aln_trg[i, c_frame : c_frame + pred_dur[0, i].item()] = 1
+#         c_frame += pred_dur[0, i].item()
+#     en = d.transpose(-1, -2) @ pred_aln_trg.unsqueeze(0).to(device)
+#     F0_pred, N_pred = model.predictor.F0Ntrain(en, s)
+#     t_en = model.text_encoder(tokens, input_lengths, text_mask)
+#     asr = t_en @ pred_aln_trg.unsqueeze(0).to(device)
+#     return model.decoder(asr, F0_pred, N_pred, ref_s[:, :128]).squeeze().cpu().numpy()

@torch.no_grad()
def forward(model, tokens, ref_s, speed):
-   """Forward pass through the model"""
+   """Forward pass through the model with light optimizations that preserve output quality"""
    device = ref_s.device

+   # Keep original token handling but optimize device placement
    tokens = torch.LongTensor([[0, *tokens, 0]]).to(device)
    input_lengths = torch.LongTensor([tokens.shape[-1]]).to(device)
    text_mask = length_to_mask(input_lengths).to(device)

+   # BERT and encoder pass
    bert_dur = model.bert(tokens, attention_mask=(~text_mask).int())
    d_en = model.bert_encoder(bert_dur).transpose(-1, -2)
-   s = ref_s[:, 128:]
-   d = model.predictor.text_encoder(d_en, s, input_lengths, text_mask)
+
+   # Split reference signal once for efficiency
+   s_content = ref_s[:, 128:]
+   s_ref = ref_s[:, :128]
+
+   # Predictor forward pass
+   d = model.predictor.text_encoder(d_en, s_content, input_lengths, text_mask)
    x, _ = model.predictor.lstm(d)
+
+   # Duration prediction - keeping original logic
    duration = model.predictor.duration_proj(x)
    duration = torch.sigmoid(duration).sum(axis=-1) / speed
    pred_dur = torch.round(duration).clamp(min=1).long()
-   pred_aln_trg = torch.zeros(input_lengths, pred_dur.sum().item())
+
+   # Alignment matrix construction - keeping original approach for quality
+   pred_aln_trg = torch.zeros(input_lengths, pred_dur.sum().item(), device=device)
    c_frame = 0
    for i in range(pred_aln_trg.size(0)):
-       pred_aln_trg[i, c_frame : c_frame + pred_dur[0, i].item()] = 1
+       pred_aln_trg[i, c_frame:c_frame + pred_dur[0, i].item()] = 1
        c_frame += pred_dur[0, i].item()
-   en = d.transpose(-1, -2) @ pred_aln_trg.unsqueeze(0).to(device)
-   F0_pred, N_pred = model.predictor.F0Ntrain(en, s)
+
+   # Matrix multiplications - reuse unsqueezed tensor
+   pred_aln_trg = pred_aln_trg.unsqueeze(0)  # Do unsqueeze once
+   en = d.transpose(-1, -2) @ pred_aln_trg
+   F0_pred, N_pred = model.predictor.F0Ntrain(en, s_content)
+
+   # Text encoding and final decoding
    t_en = model.text_encoder(tokens, input_lengths, text_mask)
-   asr = t_en @ pred_aln_trg.unsqueeze(0).to(device)
-   return model.decoder(asr, F0_pred, N_pred, ref_s[:, :128]).squeeze().cpu().numpy()
+   asr = t_en @ pred_aln_trg
+
+   return model.decoder(asr, F0_pred, N_pred, s_ref).squeeze().cpu().numpy()
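
The alignment-matrix construction is the one step deliberately kept in its original form; a standalone sketch with dummy durations shows the hard monotonic alignment it builds:

```python
import torch

pred_dur = torch.tensor([[2, 3, 1]])  # hypothetical frames-per-token durations
n_tokens, n_frames = pred_dur.shape[-1], pred_dur.sum().item()

pred_aln_trg = torch.zeros(n_tokens, n_frames)
c_frame = 0
for i in range(n_tokens):
    pred_aln_trg[i, c_frame:c_frame + pred_dur[0, i].item()] = 1  # token i owns its frames
    c_frame += pred_dur[0, i].item()

print(pred_aln_trg)
# tensor([[1., 1., 0., 0., 0., 0.],
#         [0., 0., 1., 1., 1., 0.],
#         [0., 0., 0., 0., 0., 1.]])
```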
+# def length_to_mask(lengths):
+#     """Create attention mask from lengths"""
+#     mask = (
+#         torch.arange(lengths.max())
+#         .unsqueeze(0)
+#         .expand(lengths.shape[0], -1)
+#         .type_as(lengths)
+#     )
+#     mask = torch.gt(mask + 1, lengths.unsqueeze(1))
+#     return mask


def length_to_mask(lengths):
-   """Create attention mask from lengths"""
-   mask = (
-       torch.arange(lengths.max())
-       .unsqueeze(0)
-       .expand(lengths.shape[0], -1)
-       .type_as(lengths)
-   )
-   mask = torch.gt(mask + 1, lengths.unsqueeze(1))
-   return mask
+   """Create attention mask from lengths - possibly optimized version"""
+   max_len = lengths.max()
+   # Create mask directly on the same device as lengths
+   mask = torch.arange(max_len, device=lengths.device)[None, :].expand(lengths.shape[0], -1)
+   # Avoid type_as by using the correct dtype from the start
+   if lengths.dtype != mask.dtype:
+       mask = mask.to(dtype=lengths.dtype)
+   # Fuse operations using broadcasting
+   return mask + 1 > lengths[:, None]
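
Both versions produce the same boolean mask; a quick self-contained check (runnable on CPU):

```python
import torch

def length_to_mask(lengths):
    max_len = lengths.max()
    mask = torch.arange(max_len, device=lengths.device)[None, :].expand(lengths.shape[0], -1)
    if lengths.dtype != mask.dtype:
        mask = mask.to(dtype=lengths.dtype)
    return mask + 1 > lengths[:, None]

print(length_to_mask(torch.tensor([2, 4])))
# tensor([[False, False,  True,  True],
#         [False, False, False, False]])
```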

class TTSGPUModel(TTSBaseModel):
    _instance = None

View file

@@ -8,7 +8,7 @@ from functools import lru_cache
import numpy as np
import torch
import scipy.io.wavfile as wavfile
-from .text_processing import normalize_text
+from .text_processing import normalize_text, chunker
from loguru import logger
from ..core.config import settings
@@ -20,40 +20,6 @@ class TTSService:
    def __init__(self, output_dir: str = None):
        self.output_dir = output_dir

-   def _split_text(self, text: str):
-       """Generate text chunks one at a time, splitting on natural pause points"""
-       if not isinstance(text, str):
-           text = str(text) if text is not None else ""
-       # First split into sentences
-       sentences = re.split(r"(?<=[.!?])\s+", text)
-       for sentence in sentences:
-           sentence = sentence.strip()
-           if not sentence:
-               continue
-           # For longer sentences, split on commas and semicolons
-           if len(sentence) > 300:  # Only split long sentences
-               # Split on pause points while preserving the punctuation
-               chunks = re.split(r"((?<=[,;])\s+)", sentence)
-               # Reassemble chunks with their trailing punctuation
-               current_chunk = ""
-               for i, chunk in enumerate(chunks):
-                   if i % 2 == 0:  # Text chunk
-                       current_chunk += chunk
-                   else:  # Punctuation/whitespace chunk
-                       current_chunk += chunk
-                       if current_chunk.strip():
-                           yield current_chunk.strip()
-                           current_chunk = ""
-               # Yield any remaining text
-               if current_chunk.strip():
-                   yield current_chunk.strip()
-           else:
-               yield sentence

    @staticmethod
    @lru_cache(maxsize=20)  # Cache up to 8 most recently used voices
@@ -96,28 +62,32 @@ class TTSService:
            # Load voice using cached loader
            voicepack = self._load_voice(voice_path)

-           # Generate audio with or without stitching
+           # For non-streaming, preprocess all chunks first
            if stitch_long_output:
-               audio_chunks = []
-               chunk_count = 0
-
-               # Process chunks as they're generated
-               for chunk in self._split_text(text):
+               # Preprocess all chunks to phonemes/tokens
+               chunks_data = []
+               for chunk in chunker.split_text(text):
                    try:
-                       # Process text and generate audio
                        phonemes, tokens = TTSModel.process_text(chunk, voice[0])
+                       chunks_data.append((chunk, tokens))
+                   except Exception as e:
+                       logger.error(f"Failed to process chunk: '{chunk}'. Error: {str(e)}")
+                       continue
+
+               if not chunks_data:
+                   raise ValueError("No chunks were processed successfully")
+
+               # Generate audio for all chunks
+               audio_chunks = []
+               for chunk, tokens in chunks_data:
+                   try:
                        chunk_audio = TTSModel.generate_from_tokens(tokens, voicepack, speed)
                        if chunk_audio is not None:
                            audio_chunks.append(chunk_audio)
-                           chunk_count += 1
                        else:
-                           logger.error(f"No audio generated for chunk {chunk_count + 1}")
+                           logger.error(f"No audio generated for chunk: '{chunk}'")
                    except Exception as e:
-                       logger.error(
-                           f"Failed to generate audio for chunk {chunk_count + 1}: '{chunk}'. Error: {str(e)}"
-                       )
+                       logger.error(f"Failed to generate audio for chunk: '{chunk}'. Error: {str(e)}")
                        continue

            if not audio_chunks:
@@ -138,53 +108,93 @@ class TTSService:
            raise

    async def generate_audio_stream(
-       self, text: str, voice: str, speed: float, output_format: str = "wav"
+       self, text: str, voice: str, speed: float, output_format: str = "wav", silent=False
    ):
        """Generate and yield audio chunks as they're generated for real-time streaming"""
        try:
+           stream_start = time.time()
            # Create normalizer for consistent audio levels
            stream_normalizer = AudioNormalizer()

            # Input validation and preprocessing
            if not text:
                raise ValueError("Text is empty")
+           preprocess_start = time.time()
            normalized = normalize_text(text)
            if not normalized:
                raise ValueError("Text is empty after preprocessing")
            text = str(normalized)
+           logger.debug(f"Text preprocessing took: {(time.time() - preprocess_start)*1000:.1f}ms")

            # Voice validation and loading
+           voice_start = time.time()
            voice_path = self._get_voice_path(voice)
            if not voice_path:
                raise ValueError(f"Voice not found: {voice}")
            voicepack = self._load_voice(voice_path)
+           logger.debug(f"Voice loading took: {(time.time() - voice_start)*1000:.1f}ms")

            # Process chunks as they're generated
            is_first = True
-           for chunk in self._split_text(text):
+           chunks_processed = 0
+           # last_chunk_end = time.time()
+
+           # Process chunks as they come from generator
+           chunk_gen = chunker.split_text(text)
+           current_chunk = next(chunk_gen, None)
+
+           while current_chunk is not None:
+               next_chunk = next(chunk_gen, None)  # Peek at next chunk
+               # chunk_start = time.time()
+               chunks_processed += 1
                try:
                    # Process text and generate audio
-                   phonemes, tokens = TTSModel.process_text(chunk, voice[0])
+                   # text_process_start = time.time()
+                   phonemes, tokens = TTSModel.process_text(current_chunk, voice[0])
+                   # text_process_time = time.time() - text_process_start
+
+                   # audio_gen_start = time.time()
                    chunk_audio = TTSModel.generate_from_tokens(tokens, voicepack, speed)
+                   # audio_gen_time = time.time() - audio_gen_start

                    if chunk_audio is not None:
                        # Convert chunk with proper header handling
+                       convert_start = time.time()
                        chunk_bytes = AudioService.convert_audio(
                            chunk_audio,
                            24000,
                            output_format,
                            is_first_chunk=is_first,
-                           normalizer=stream_normalizer
+                           normalizer=stream_normalizer,
+                           is_last_chunk=(next_chunk is None)  # Last if no next chunk
                        )
+                       # convert_time = time.time() - convert_start
+
+                       # Calculate gap from last chunk
+                       # gap_time = chunk_start - last_chunk_end
+
+                       # Log timing details if not silent
+                       # if not silent:
+                       #     logger.debug(
+                       #         f"\nChunk {chunks_processed} timing:"
+                       #         f"\n  Gap from last chunk: {gap_time*1000:.1f}ms"
+                       #         f"\n  Text processing: {text_process_time*1000:.1f}ms"
+                       #         f"\n  Audio generation: {audio_gen_time*1000:.1f}ms"
+                       #         f"\n  Audio conversion: {convert_time*1000:.1f}ms"
+                       #         f"\n  Total chunk time: {(time.time() - chunk_start)*1000:.1f}ms"
+                       #     )

                        yield chunk_bytes
                        is_first = False
+                       # last_chunk_end = time.time()
                    else:
-                       logger.error(f"No audio generated for chunk: '{chunk}'")
+                       logger.error(f"No audio generated for chunk: '{current_chunk}'")
                except Exception as e:
-                   logger.error(f"Failed to generate audio for chunk: '{chunk}'. Error: {str(e)}")
-                   continue
+                   logger.error(f"Failed to generate audio for chunk: '{current_chunk}'. Error: {str(e)}")
+
+               current_chunk = next_chunk  # Move to next chunk

        except Exception as e:
            logger.error(f"Error in audio generation stream: {str(e)}")
            raise
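
The look-ahead pattern above (fetching the next chunk before converting the current one, so the final chunk can be flagged and skip end-trimming) works with any generator; a minimal standalone sketch:

```python
def chunks():
    yield from ["first.", "second.", "third."]

gen = chunks()
current = next(gen, None)
while current is not None:
    nxt = next(gen, None)       # peek ahead without losing the value
    is_last = nxt is None       # only the final chunk keeps its trailing audio
    print(current, "(last)" if is_last else "")
    current = nxt
```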

View file

@@ -0,0 +1,52 @@
import os
from typing import List, Tuple

import torch
from loguru import logger

from .tts_service import TTSService
from .tts_model import TTSModel


class WarmupService:
    """Service for warming up TTS models and voice caches"""

    def __init__(self):
        self.tts_service = TTSService()

    def load_voices(self) -> List[Tuple[str, torch.Tensor]]:
        """Load and cache voices up to LRU limit"""
        # Get all voices sorted by filename length (shorter names first, usually base voices)
        voice_files = sorted(
            [f for f in os.listdir(TTSModel.VOICES_DIR) if f.endswith(".pt")],
            key=len
        )

        # Load up to LRU cache limit (20)
        loaded_voices = []
        for voice_file in voice_files[:20]:
            try:
                voice_path = os.path.join(TTSModel.VOICES_DIR, voice_file)
                voicepack = torch.load(voice_path, map_location=TTSModel.get_device(), weights_only=True)
                loaded_voices.append((voice_file[:-3], voicepack))  # Store name and tensor
                # logger.info(f"Loaded voice {voice_file[:-3]} into cache")
            except Exception as e:
                logger.error(f"Failed to load voice {voice_file}: {e}")
        logger.info(f"Pre-loaded {len(loaded_voices)} voices into cache")
        return loaded_voices

    async def warmup_voices(self, warmup_text: str, loaded_voices: List[Tuple[str, torch.Tensor]]):
        """Warm up voice inference and streaming"""
        n_warmups = 1
        for voice_name, _ in loaded_voices[:n_warmups]:
            try:
                logger.info(f"Running warmup inference on voice {voice_name}")
                async for _ in self.tts_service.generate_audio_stream(
                    warmup_text,
                    voice_name,
                    1.0,
                    "pcm"
                ):
                    pass  # Process all chunks to properly warm up
                logger.info(f"Completed warmup for voice {voice_name}")
            except Exception as e:
                logger.warning(f"Warmup failed for voice {voice_name}: {e}")

api/tests/test_chunker.py (new file, 35 lines)
View file

@@ -0,0 +1,35 @@
"""Tests for text chunking service"""

import pytest
from api.src.services.text_processing import chunker


def test_split_text():
    """Test text splitting into sentences"""
    text = "First sentence. Second sentence! Third sentence?"
    sentences = list(chunker.split_text(text))
    assert len(sentences) == 3
    assert sentences[0] == "First sentence."
    assert sentences[1] == "Second sentence!"
    assert sentences[2] == "Third sentence?"


def test_split_text_empty():
    """Test splitting empty text"""
    assert list(chunker.split_text("")) == []


def test_split_text_single_sentence():
    """Test splitting single sentence"""
    text = "Just one sentence."
    assert list(chunker.split_text(text)) == ["Just one sentence."]


def test_split_text_with_custom_chunk_size():
    """Test splitting with custom max chunk size"""
    text = "First part, second part, third part."
    chunks = list(chunker.split_text(text, max_chunk=15))
    assert len(chunks) == 3
    assert chunks[0] == "First part,"
    assert chunks[1] == "second part,"
    assert chunks[2] == "third part."

View file

@@ -1,7 +1,8 @@
-from unittest.mock import Mock
+from unittest.mock import Mock, AsyncMock
import pytest
import pytest_asyncio
+import asyncio
from fastapi.testclient import TestClient
from httpx import AsyncClient

@@ -22,6 +23,12 @@ async def async_client():
def mock_tts_service(monkeypatch):
    mock_service = Mock()
    mock_service._generate_audio.return_value = (bytes([0, 1, 2, 3]), 1.0)
+
+   # Create proper async generator mock
+   async def mock_stream(*args, **kwargs):
+       for chunk in [b"chunk1", b"chunk2"]:
+           yield chunk
+   mock_service.generate_audio_stream = mock_stream
    mock_service.list_voices.return_value = [
        "af",
        "bm_lewis",
@@ -65,6 +72,7 @@ def test_openai_speech_endpoint(mock_tts_service, mock_audio_service):
        "voice": "bm_lewis",
        "response_format": "wav",
        "speed": 1.0,
+       "stream": False  # Explicitly disable streaming
    }
    response = client.post("/v1/audio/speech", json=test_request)
    assert response.status_code == 200

@@ -84,6 +92,7 @@ def test_openai_speech_invalid_voice(mock_tts_service):
        "voice": "invalid_voice",
        "response_format": "wav",
        "speed": 1.0,
+       "stream": False  # Explicitly disable streaming
    }
    response = client.post("/v1/audio/speech", json=test_request)
    assert response.status_code == 400  # Bad request

@@ -98,6 +107,7 @@ def test_openai_speech_invalid_speed(mock_tts_service):
        "voice": "af",
        "response_format": "wav",
        "speed": -1.0,  # Invalid speed
+       "stream": False  # Explicitly disable streaming
    }
    response = client.post("/v1/audio/speech", json=test_request)
    assert response.status_code == 422  # Validation error

@@ -112,6 +122,7 @@ def test_openai_speech_generation_error(mock_tts_service):
        "voice": "af",
        "response_format": "wav",
        "speed": 1.0,
+       "stream": False  # Explicitly disable streaming
    }
    response = client.post("/v1/audio/speech", json=test_request)
    assert response.status_code == 500
@@ -171,13 +182,14 @@ async def test_openai_speech_pcm_streaming(mock_tts_service, async_client):
        "input": "Hello world",
        "voice": "af",
        "response_format": "pcm",
+       "stream": True
    }

-   # Mock streaming response
-   async def mock_stream():
-       yield b"chunk1"
-       yield b"chunk2"
-   mock_tts_service.generate_audio_stream.return_value = mock_stream()
+   # Create streaming mock for this test
+   async def mock_stream(*args, **kwargs):
+       for chunk in [b"chunk1", b"chunk2"]:
+           yield chunk
+   mock_tts_service.generate_audio_stream = mock_stream

    # Add streaming header
    headers = {"x-raw-response": "stream"}

@@ -198,13 +210,14 @@ async def test_openai_speech_streaming_mp3(mock_tts_service, async_client):
        "input": "Hello world",
        "voice": "af",
        "response_format": "mp3",
+       "stream": True
    }

-   # Mock streaming response
-   async def mock_stream():
-       yield b"mp3header"
-       yield b"mp3data"
-   mock_tts_service.generate_audio_stream.return_value = mock_stream()
+   # Create streaming mock for this test
+   async def mock_stream(*args, **kwargs):
+       for chunk in [b"mp3header", b"mp3data"]:
+           yield chunk
+   mock_tts_service.generate_audio_stream = mock_stream

    # Add streaming header
    headers = {"x-raw-response": "stream"}

@@ -227,14 +240,14 @@ async def test_openai_speech_streaming_generator(mock_tts_service, async_client):
        "input": "Hello world",
        "voice": "af",
        "response_format": "pcm",
+       "stream": True
    }

-   # Mock streaming response
-   async def mock_stream():
-       yield b"chunk1"
-       yield b"chunk2"
-   mock_tts_service.generate_audio_stream.return_value = mock_stream()
+   # Create streaming mock for this test
+   async def mock_stream(*args, **kwargs):
+       for chunk in [b"chunk1", b"chunk2"]:
+           yield chunk
+   mock_tts_service.generate_audio_stream = mock_stream

    # Add streaming header
    headers = {"x-raw-response": "stream"}

View file

@@ -28,29 +28,34 @@ async def test_lifespan_successful_warmup(mock_logger, mock_tts_model):
    """Test successful model warmup in lifespan"""
    # Mock file system for voice counting
    mock_tts_model.VOICES_DIR = "/mock/voices"
-   with patch("os.listdir", return_value=["voice1.pt", "voice2.pt", "voice3.pt"]):
-       mock_tts_model.setup.return_value = 3  # 3 voice files
-       mock_tts_model.get_device.return_value = "cuda"
-
-       # Create an async generator from the lifespan context manager
-       async_gen = lifespan(MagicMock())
-       # Start the context manager
-       await async_gen.__aenter__()
-
-       # Verify the expected logging sequence
-       mock_logger.info.assert_any_call("Loading TTS model and voice packs...")
-
-       # Check for the startup message containing the required info
-       startup_calls = [call[0][0] for call in mock_logger.info.call_args_list]
-       startup_msg = next(msg for msg in startup_calls if "Model loaded and warmed up on" in msg)
-       assert "Model loaded and warmed up on cuda" in startup_msg
-       assert "3 voice packs loaded successfully" in startup_msg
-
-       # Verify model setup was called
-       mock_tts_model.setup.assert_called_once()
-
-   # Clean up
-   await async_gen.__aexit__(None, None, None)
+
+   # Create async mock
+   async def async_setup():
+       return 3
+   mock_tts_model.setup = MagicMock()
+   mock_tts_model.setup.side_effect = async_setup
+   mock_tts_model.get_device.return_value = "cuda"
+
+   with patch("os.listdir", return_value=["voice1.pt", "voice2.pt", "voice3.pt"]):
+       # Create an async generator from the lifespan context manager
+       async_gen = lifespan(MagicMock())
+       # Start the context manager
+       await async_gen.__aenter__()
+
+       # Verify the expected logging sequence
+       mock_logger.info.assert_any_call("Loading TTS model and voice packs...")
+
+       # Check for the startup message containing the required info
+       startup_calls = [call[0][0] for call in mock_logger.info.call_args_list]
+       startup_msg = next(msg for msg in startup_calls if "Model warmed up on" in msg)
+       assert "Model warmed up on" in startup_msg
+       assert "3 voice packs loaded" in startup_msg
+
+       # Verify model setup was called
+       mock_tts_model.setup.assert_called_once()
+
+       # Clean up
+       await async_gen.__aexit__(None, None, None)
@pytest.mark.asyncio @pytest.mark.asyncio
@@ -81,39 +86,21 @@ async def test_lifespan_cuda_warmup(mock_tts_model):
    """Test model warmup specifically on CUDA"""
    # Mock file system for voice counting
    mock_tts_model.VOICES_DIR = "/mock/voices"
+
+   # Create async mock
+   async def async_setup():
+       return 2
+   mock_tts_model.setup = MagicMock()
+   mock_tts_model.setup.side_effect = async_setup
+   mock_tts_model.get_device.return_value = "cuda"
+
    with patch("os.listdir", return_value=["voice1.pt", "voice2.pt"]):
-       mock_tts_model.setup.return_value = 2  # 2 voice files
-       mock_tts_model.get_device.return_value = "cuda"
-
        # Create an async generator from the lifespan context manager
        async_gen = lifespan(MagicMock())
        await async_gen.__aenter__()

        # Verify model setup was called
        mock_tts_model.setup.assert_called_once()

        # Clean up
        await async_gen.__aexit__(None, None, None)

-
-@pytest.mark.asyncio
-@patch("api.src.main.TTSModel")
-async def test_lifespan_cpu_fallback(mock_tts_model):
-   """Test model warmup falling back to CPU"""
-   # Mock file system for voice counting
-   mock_tts_model.VOICES_DIR = "/mock/voices"
-   with patch(
-       "os.listdir", return_value=["voice1.pt", "voice2.pt", "voice3.pt", "voice4.pt"]
-   ):
-       mock_tts_model.setup.return_value = 4  # 4 voice files
-       mock_tts_model.get_device.return_value = "cpu"
-       # Create an async generator from the lifespan context manager
-       async_gen = lifespan(MagicMock())
-       await async_gen.__aenter__()
-       # Verify model setup was called
-       mock_tts_model.setup.assert_called_once()
-       # Clean up
-       await async_gen.__aexit__(None, None, None)

View file

@@ -16,13 +16,14 @@ def test_get_device_error():
    with pytest.raises(RuntimeError, match="Model not initialized"):
        TTSBaseModel.get_device()

+@pytest.mark.asyncio
@patch('torch.cuda.is_available')
@patch('os.path.exists')
@patch('os.path.join')
@patch('os.listdir')
@patch('torch.load')
@patch('torch.save')
-def test_setup_cuda_available(mock_save, mock_load, mock_listdir, mock_join, mock_exists, mock_cuda_available):
+async def test_setup_cuda_available(mock_save, mock_load, mock_listdir, mock_join, mock_exists, mock_cuda_available):
    """Test setup with CUDA available"""
    TTSBaseModel._device = None
    mock_cuda_available.return_value = True

@@ -36,17 +37,18 @@ def test_setup_cuda_available(mock_save, mock_load, mock_listdir, mock_join, mock_exists, mock_cuda_available):
    TTSBaseModel.process_text = MagicMock(return_value=("dummy", [1,2,3]))
    TTSBaseModel.generate_from_tokens = MagicMock(return_value=np.zeros(1000))

-   voice_count = TTSBaseModel.setup()
+   voice_count = await TTSBaseModel.setup()
    assert TTSBaseModel._device == "cuda"
    assert voice_count == 2

+@pytest.mark.asyncio
@patch('torch.cuda.is_available')
@patch('os.path.exists')
@patch('os.path.join')
@patch('os.listdir')
@patch('torch.load')
@patch('torch.save')
-def test_setup_cuda_unavailable(mock_save, mock_load, mock_listdir, mock_join, mock_exists, mock_cuda_available):
+async def test_setup_cuda_unavailable(mock_save, mock_load, mock_listdir, mock_join, mock_exists, mock_cuda_available):
    """Test setup with CUDA unavailable"""
    TTSBaseModel._device = None
    mock_cuda_available.return_value = False

@@ -60,7 +62,7 @@ def test_setup_cuda_unavailable(mock_save, mock_load, mock_listdir, mock_join, mock_exists, mock_cuda_available):
    TTSBaseModel.process_text = MagicMock(return_value=("dummy", [1,2,3]))
    TTSBaseModel.generate_from_tokens = MagicMock(return_value=np.zeros(1000))

-   voice_count = TTSBaseModel.setup()
+   voice_count = await TTSBaseModel.setup()
    assert TTSBaseModel._device == "cpu"
    assert voice_count == 2

View file

@@ -31,27 +31,6 @@ def sample_audio():
    return np.sin(2 * np.pi * frequency * t).astype(np.float32)

-def test_split_text(tts_service):
-   """Test text splitting into sentences"""
-   text = "First sentence. Second sentence! Third sentence?"
-   sentences = tts_service._split_text(text)
-   assert len(sentences) == 3
-   assert sentences[0] == "First sentence."
-   assert sentences[1] == "Second sentence!"
-   assert sentences[2] == "Third sentence?"

-def test_split_text_empty(tts_service):
-   """Test splitting empty text"""
-   assert tts_service._split_text("") == []

-def test_split_text_single_sentence(tts_service):
-   """Test splitting single sentence"""
-   text = "Just one sentence."
-   assert tts_service._split_text(text) == ["Just one sentence."]

def test_audio_to_bytes(tts_service, sample_audio):
    """Test converting audio tensor to bytes"""
    audio_bytes = tts_service._audio_to_bytes(sample_audio)

@@ -152,7 +131,7 @@ def test_generate_audio_phonemize_error(
    mock_torch_load.return_value = torch.zeros((10, 24000))
    mock_generate.return_value = (None, None)

-   with pytest.raises(ValueError, match="No audio chunks were generated successfully"):
+   with pytest.raises(ValueError, match="No chunks were processed successfully"):
        tts_service._generate_audio("Test text", "af", 1.0)

@@ -185,7 +164,7 @@ def test_generate_audio_error(
    mock_exists.return_value = True
    mock_torch_load.return_value = torch.zeros((10, 24000))

-   with pytest.raises(ValueError, match="No audio chunks were generated successfully"):
+   with pytest.raises(ValueError, match="No chunks were processed successfully"):
        tts_service._generate_audio("Test text", "af", 1.0)

[Ten new binary image assets added, including assets/voice_analysis.png (958 KiB) and nine benchmark plot images (~234-775 KiB each); binary files not shown.]

View file

@@ -43,6 +43,7 @@ services:
      - ONNX_OPTIMIZATION_LEVEL=all
      - ONNX_MEMORY_PATTERN=true
      - ONNX_ARENA_EXTEND_STRATEGY=kNextPowerOfTwo
    depends_on:
      model-fetcher:
        condition: service_healthy

View file

@@ -2,7 +2,7 @@ services:
  model-fetcher:
    image: datamachines/git-lfs:latest
    environment:
-     - SKIP_MODEL_FETCH=${SKIP_MODEL_FETCH:-true}
+     - SKIP_MODEL_FETCH=${SKIP_MODEL_FETCH:-false}
    volumes:
      - ./Kokoro-82M:/app/Kokoro-82M
    working_dir: /app/Kokoro-82M

@@ -32,10 +32,10 @@ services:
        start_period: 1s

  kokoro-tts:
-   image: ghcr.io/remsky/kokoro-fastapi:latest
+   # image: ghcr.io/remsky/kokoro-fastapi:latest
    # Uncomment below to build from source instead of using the released image
-   # build:
-   #   context: .
+   build:
+     context: .
    volumes:
      - ./api/src:/app/api/src
      - ./Kokoro-82M:/app/Kokoro-82M

@@ -54,14 +54,14 @@ services:
      model-fetcher:
        condition: service_healthy

-  # # Gradio UI service [Comment out everything below if you don't need it]
-  # gradio-ui:
-  #   build:
-  #     context: ./ui
-  #   ports:
-  #     - "7860:7860"
-  #   volumes:
-  #     - ./ui/data:/app/ui/data
-  #     - ./ui/app.py:/app/app.py  # Mount app.py for hot reload
-  #   environment:
-  #     - GRADIO_WATCH=True  # Enable hot reloading
+  # Gradio UI service [Comment out everything below if you don't need it]
+  gradio-ui:
+    build:
+      context: ./ui
+    ports:
+      - "7860:7860"
+    volumes:
+      - ./ui/data:/app/ui/data
+      - ./ui/app.py:/app/app.py  # Mount app.py for hot reload
+    environment:
+      - GRADIO_WATCH=True  # Enable hot reloading

View file

@@ -1,15 +1,19 @@
#!/usr/bin/env python3
import os
-import time
import json
-import numpy as np
-import requests
-import pandas as pd
-from lib.shared_benchmark_utils import get_text_for_tokens, enc
-from lib.shared_utils import save_json_results
-from lib.shared_plotting import plot_correlation, plot_timeline
-def measure_first_token(text: str, output_dir: str, tokens: int, run_number: int) -> dict:
+import time
+
+import numpy as np
+import pandas as pd
+import requests
+from lib.shared_utils import save_json_results
+from lib.shared_plotting import plot_timeline, plot_correlation
+from lib.shared_benchmark_utils import enc, get_text_for_tokens
+
+
+def measure_first_token(
+    text: str, output_dir: str, tokens: int, run_number: int
+) -> dict:
    """Measure time to audio via API calls and save the audio output"""
    results = {
        "text_length": len(text),
@@ -18,12 +22,12 @@ def measure_first_token(
        "time_to_first_chunk": None,
        "error": None,
        "audio_path": None,
-       "audio_length": None  # Length of output audio in seconds
+       "audio_length": None,  # Length of output audio in seconds
    }

    try:
        start_time = time.time()

        # Make request without streaming
        response = requests.post(
            "http://localhost:8880/v1/audio/speech",
@@ -32,58 +36,62 @@ def measure_first_token(
                "input": text,
                "voice": "af",
                "response_format": "wav",
-               "stream": False
+               "stream": False,
            },
-           timeout=1800
+           timeout=1800,
        )
        response.raise_for_status()

        # Save complete audio
        audio_filename = f"benchmark_tokens{tokens}_run{run_number}.wav"
        audio_path = os.path.join(output_dir, audio_filename)
        results["audio_path"] = audio_path

        content = response.content
-       with open(audio_path, 'wb') as f:
+       with open(audio_path, "wb") as f:
            f.write(content)

        # Calculate audio length using scipy
        import scipy.io.wavfile as wavfile
        sample_rate, audio_data = wavfile.read(audio_path)
        results["audio_length"] = len(audio_data) / sample_rate  # Length in seconds
        results["time_to_first_chunk"] = time.time() - start_time

        results["total_time"] = time.time() - start_time
        return results

    except Exception as e:
        results["error"] = str(e)
        return results


def main():
    # Set up paths
    script_dir = os.path.dirname(os.path.abspath(__file__))
    output_dir = os.path.join(script_dir, "output_audio")
    output_data_dir = os.path.join(script_dir, "output_data")

    # Create output directories
    os.makedirs(output_dir, exist_ok=True)
    os.makedirs(output_data_dir, exist_ok=True)

    # Load sample text
-   with open(os.path.join(script_dir, "the_time_machine_hg_wells.txt"), "r", encoding="utf-8") as f:
+   with open(
+       os.path.join(script_dir, "the_time_machine_hg_wells.txt"), "r", encoding="utf-8"
+   ) as f:
        text = f.read()

    # Test specific token counts
    token_sizes = [10, 25, 50, 100, 200, 500]
    all_results = []

    for tokens in token_sizes:
        print(f"\nTesting {tokens} tokens")
        test_text = get_text_for_tokens(text, tokens)
        actual_tokens = len(enc.encode(test_text))
        print(f"Text preview: {test_text[:50]}...")

        # Run test 3 times for each size to get average
        for i in range(5):
            print(f"Run {i+1}/3...")
@@ -91,67 +99,74 @@ def main():
            result["target_tokens"] = tokens
            result["actual_tokens"] = actual_tokens
            result["run_number"] = i + 1

            print(f"Time to Audio: {result.get('time_to_first_chunk', 'N/A'):.3f}s")
            print(f"Total time: {result.get('total_time', 'N/A'):.3f}s")

            if result["error"]:
                print(f"Error: {result['error']}")

            all_results.append(result)

    # Calculate averages per token size
    summary = {}
    for tokens in token_sizes:
-       matching_results = [r for r in all_results if r["target_tokens"] == tokens and not r["error"]]
+       matching_results = [
+           r for r in all_results if r["target_tokens"] == tokens and not r["error"]
+       ]
        if matching_results:
-           avg_first_chunk = sum(r["time_to_first_chunk"] for r in matching_results) / len(matching_results)
-           avg_total = sum(r["total_time"] for r in matching_results) / len(matching_results)
-           avg_audio_length = sum(r["audio_length"] for r in matching_results) / len(matching_results)
+           avg_first_chunk = sum(
+               r["time_to_first_chunk"] for r in matching_results
+           ) / len(matching_results)
+           avg_total = sum(r["total_time"] for r in matching_results) / len(
+               matching_results
+           )
+           avg_audio_length = sum(r["audio_length"] for r in matching_results) / len(
+               matching_results
+           )
            summary[tokens] = {
                "avg_time_to_first_chunk": round(avg_first_chunk, 3),
                "avg_total_time": round(avg_total, 3),
                "avg_audio_length": round(avg_audio_length, 3),
-               "num_successful_runs": len(matching_results)
+               "num_successful_runs": len(matching_results),
            }

    # Save results
    # Save results
    results_data = {
        "individual_runs": all_results,
        "summary": summary,
-       "timestamp": time.strftime("%Y-%m-%d %H:%M:%S")
+       "timestamp": time.strftime("%Y-%m-%d %H:%M:%S"),
    }
    save_json_results(
-       results_data,
-       os.path.join(output_data_dir, "first_token_benchmark.json")
+       results_data, os.path.join(output_data_dir, "first_token_benchmark.json")
    )

    # Create plot directory if it doesn't exist
    output_plots_dir = os.path.join(script_dir, "output_plots")
    os.makedirs(output_plots_dir, exist_ok=True)

    # Create DataFrame for plotting
    df = pd.DataFrame(all_results)

    # Create both plots
    plot_correlation(
-       df, "target_tokens", "time_to_first_chunk",
+       df,
+       "target_tokens",
+       "time_to_first_chunk",
        "Time to Audio vs Input Size",
        "Number of Input Tokens",
        "Time to Audio (seconds)",
-       os.path.join(output_plots_dir, "first_token_latency.png")
+       os.path.join(output_plots_dir, "first_token_latency.png"),
    )

-   plot_timeline(
-       df,
-       os.path.join(output_plots_dir, "first_token_timeline.png")
-   )
+   plot_timeline(df, os.path.join(output_plots_dir, "first_token_timeline.png"))

    print("\nResults and plots saved to:")
    print(f"- {os.path.join(output_data_dir, 'first_token_benchmark.json')}")
    print(f"- {os.path.join(output_plots_dir, 'first_token_latency.png')}")
    print(f"- {os.path.join(output_plots_dir, 'first_token_timeline.png')}")


if __name__ == "__main__":
    main()

View file

@ -1,193 +0,0 @@
#!/usr/bin/env python3
import os
import time
import json
import numpy as np
import requests
import pandas as pd
from lib.shared_benchmark_utils import get_text_for_tokens, enc
from lib.shared_utils import save_json_results
from lib.shared_plotting import plot_correlation, plot_timeline
def measure_first_token(text: str, output_dir: str, tokens: int, run_number: int) -> dict:
"""Measure time to audio via API calls and save the audio output"""
results = {
"text_length": len(text),
"token_count": len(enc.encode(text)),
"total_time": None,
"time_to_first_chunk": None,
"error": None,
"audio_path": None,
"audio_length": None # Length of output audio in seconds
}
try:
start_time = time.time()
# Make request with streaming enabled
response = requests.post(
"http://localhost:8880/v1/audio/speech",
json={
"model": "kokoro",
"input": text,
"voice": "af",
"response_format": "pcm",
"stream": True
},
stream=True,
timeout=1800
)
response.raise_for_status()
# Save complete audio
audio_filename = f"benchmark_tokens{tokens}_run{run_number}_stream.wav"
audio_path = os.path.join(output_dir, audio_filename)
results["audio_path"] = audio_path
first_chunk_time = None
chunks = []
for chunk in response.iter_content(chunk_size=1024):
if chunk:
if first_chunk_time is None:
first_chunk_time = time.time()
results["time_to_first_chunk"] = first_chunk_time - start_time
chunks.append(chunk)
# Concatenate all PCM chunks
if not chunks:
raise ValueError("No audio chunks received")
all_audio_data = b''.join(chunks)
# Write as WAV file
import wave
with wave.open(audio_path, 'wb') as wav_file:
wav_file.setnchannels(1) # Mono
wav_file.setsampwidth(2) # 2 bytes per sample (16-bit)
wav_file.setframerate(24000) # Known sample rate for Kokoro
wav_file.writeframes(all_audio_data)
# Calculate audio length using scipy
import scipy.io.wavfile as wavfile
sample_rate, audio_data = wavfile.read(audio_path)
results["audio_length"] = len(audio_data) / sample_rate # Length in seconds
results["total_time"] = time.time() - start_time
# Print debug info
print(f"Complete audio size: {len(all_audio_data)} bytes")
print(f"Number of chunks received: {len(chunks)}")
print(f"Audio length: {results['audio_length']:.3f}s")
return results
except Exception as e:
results["error"] = str(e)
return results
def main():
# Set up paths with _stream suffix
script_dir = os.path.dirname(os.path.abspath(__file__))
output_dir = os.path.join(script_dir, "output_audio_stream")
output_data_dir = os.path.join(script_dir, "output_data")
# Create output directories
os.makedirs(output_dir, exist_ok=True)
os.makedirs(output_data_dir, exist_ok=True)
# Load sample text
with open(os.path.join(script_dir, "the_time_machine_hg_wells.txt"), "r", encoding="utf-8") as f:
text = f.read()
# Test specific token counts
token_sizes = [50, 100, 200, 500, 1000, 2000, 5000, 10000]
all_results = []
for tokens in token_sizes:
print(f"\nTesting {tokens} tokens (streaming)")
test_text = get_text_for_tokens(text, tokens)
actual_tokens = len(enc.encode(test_text))
print(f"Text preview: {test_text[:50]}...")
# Run test 5 times for each size to get average
for i in range(5):
    print(f"Run {i+1}/5...")
result = measure_first_token(test_text, output_dir, tokens, i + 1)
result["target_tokens"] = tokens
result["actual_tokens"] = actual_tokens
result["run_number"] = i + 1
print(f"Time to First Audio: {result.get('time_to_first_chunk', 'N/A'):.3f}s")
print(f"Time to Save Complete: {result.get('total_time', 'N/A'):.3f}s")
print(f"Audio length: {result.get('audio_length', 'N/A'):.3f}s")
print(f"Streaming overhead: {(result.get('total_time', 0) - result.get('time_to_first_chunk', 0)):.3f}s")
if result["error"]:
print(f"Error: {result['error']}")
all_results.append(result)
# Calculate averages per token size
summary = {}
for tokens in token_sizes:
matching_results = [r for r in all_results if r["target_tokens"] == tokens and not r["error"]]
if matching_results:
avg_first_chunk = sum(r["time_to_first_chunk"] for r in matching_results) / len(matching_results)
avg_total = sum(r["total_time"] for r in matching_results) / len(matching_results)
avg_audio_length = sum(r["audio_length"] for r in matching_results) / len(matching_results)
summary[tokens] = {
"avg_time_to_first_chunk": round(avg_first_chunk, 3),
"avg_total_time": round(avg_total, 3),
"avg_audio_length": round(avg_audio_length, 3),
"num_successful_runs": len(matching_results)
}
# Save results with _stream suffix
results_data = {
"individual_runs": all_results,
"summary": summary,
"timestamp": time.strftime("%Y-%m-%d %H:%M:%S")
}
save_json_results(
results_data,
os.path.join(output_data_dir, "first_token_benchmark_stream.json")
)
# Create plot directory if it doesn't exist
output_plots_dir = os.path.join(script_dir, "output_plots")
os.makedirs(output_plots_dir, exist_ok=True)
# Create DataFrame for plotting
df = pd.DataFrame(all_results)
# Create both plots with _stream suffix
# Plot correlation for both metrics
plot_correlation(
df, "target_tokens", "time_to_first_chunk",
"Time to First Audio vs Input Size (Streaming)",
"Number of Input Tokens",
"Time to First Audio (seconds)",
os.path.join(output_plots_dir, "first_token_latency_stream.png")
)
plot_correlation(
df, "target_tokens", "total_time",
"Total Time vs Input Size (Streaming)",
"Number of Input Tokens",
"Total Time (seconds)",
os.path.join(output_plots_dir, "total_time_latency_stream.png")
)
plot_timeline(
    df,
    os.path.join(output_plots_dir, "first_token_timeline_stream.png"),
    suffix="(Streaming)"
)
print("\nResults and plots saved to:")
print(f"- {os.path.join(output_data_dir, 'first_token_benchmark_stream.json')}")
print(f"- {os.path.join(output_plots_dir, 'first_token_latency_stream.png')}")
print(f"- {os.path.join(output_plots_dir, 'total_time_latency_stream.png')}")
print(f"- {os.path.join(output_plots_dir, 'first_token_timeline_stream.png')}")
if __name__ == "__main__":
main()

View file

@@ -1,184 +0,0 @@
#!/usr/bin/env python3
import os
import time
import json
import numpy as np
import pandas as pd
from openai import OpenAI
from lib.shared_benchmark_utils import get_text_for_tokens, enc
from lib.shared_utils import save_json_results
from lib.shared_plotting import plot_correlation, plot_timeline
def measure_first_token(text: str, output_dir: str, tokens: int, run_number: int) -> dict:
"""Measure time to audio via OpenAI API calls and save the audio output"""
results = {
"text_length": len(text),
"token_count": len(enc.encode(text)),
"total_time": None,
"time_to_first_chunk": None,
"error": None,
"audio_path": None,
"audio_length": None # Length of output audio in seconds
}
try:
start_time = time.time()
# Initialize OpenAI client
openai = OpenAI(base_url="http://localhost:8880/v1", api_key="not-needed-for-local")
# Save complete audio
audio_filename = f"benchmark_tokens{tokens}_run{run_number}_stream_openai.wav"
audio_path = os.path.join(output_dir, audio_filename)
results["audio_path"] = audio_path
first_chunk_time = None
all_audio_data = bytearray()
chunk_count = 0
# Make streaming request using OpenAI client
with openai.audio.speech.with_streaming_response.create(
model="kokoro",
voice="af",
response_format="pcm",
input=text,
) as response:
for chunk in response.iter_bytes(chunk_size=1024):
if chunk:
chunk_count += 1
if first_chunk_time is None:
first_chunk_time = time.time()
results["time_to_first_chunk"] = first_chunk_time - start_time
all_audio_data.extend(chunk)
# Write as WAV file
import wave
with wave.open(audio_path, 'wb') as wav_file:
wav_file.setnchannels(1) # Mono
wav_file.setsampwidth(2) # 2 bytes per sample (16-bit)
wav_file.setframerate(24000) # Known sample rate for Kokoro
wav_file.writeframes(all_audio_data)
# Calculate audio length using scipy
import scipy.io.wavfile as wavfile
sample_rate, audio_data = wavfile.read(audio_path)
results["audio_length"] = len(audio_data) / sample_rate # Length in seconds
results["total_time"] = time.time() - start_time
# Print debug info
print(f"Complete audio size: {len(all_audio_data)} bytes")
print(f"Number of chunks received: {chunk_count}")
print(f"Audio length: {results['audio_length']:.3f}s")
return results
except Exception as e:
results["error"] = str(e)
return results
def main():
# Set up paths with _stream_openai suffix
script_dir = os.path.dirname(os.path.abspath(__file__))
output_dir = os.path.join(script_dir, "output_audio_stream_openai")
output_data_dir = os.path.join(script_dir, "output_data")
# Create output directories
os.makedirs(output_dir, exist_ok=True)
os.makedirs(output_data_dir, exist_ok=True)
# Load sample text
with open(os.path.join(script_dir, "the_time_machine_hg_wells.txt"), "r", encoding="utf-8") as f:
text = f.read()
# Test specific token counts
token_sizes = [50, 100, 200, 500]
all_results = []
for tokens in token_sizes:
print(f"\nTesting {tokens} tokens (streaming)")
test_text = get_text_for_tokens(text, tokens)
actual_tokens = len(enc.encode(test_text))
print(f"Text preview: {test_text[:50]}...")
# Run test 5 times for each size to get average
for i in range(5):
print(f"Run {i+1}/5...")
result = measure_first_token(test_text, output_dir, tokens, i + 1)
result["target_tokens"] = tokens
result["actual_tokens"] = actual_tokens
result["run_number"] = i + 1
print(f"Time to First Audio: {result.get('time_to_first_chunk', 'N/A'):.3f}s")
print(f"Time to Save Complete: {result.get('total_time', 'N/A'):.3f}s")
print(f"Audio length: {result.get('audio_length', 'N/A'):.3f}s")
print(f"Streaming overhead: {(result.get('total_time', 0) - result.get('time_to_first_chunk', 0)):.3f}s")
if result["error"]:
print(f"Error: {result['error']}")
all_results.append(result)
# Calculate averages per token size
summary = {}
for tokens in token_sizes:
matching_results = [r for r in all_results if r["target_tokens"] == tokens and not r["error"]]
if matching_results:
avg_first_chunk = sum(r["time_to_first_chunk"] for r in matching_results) / len(matching_results)
avg_total = sum(r["total_time"] for r in matching_results) / len(matching_results)
avg_audio_length = sum(r["audio_length"] for r in matching_results) / len(matching_results)
summary[tokens] = {
"avg_time_to_first_chunk": round(avg_first_chunk, 3),
"avg_total_time": round(avg_total, 3),
"avg_audio_length": round(avg_audio_length, 3),
"num_successful_runs": len(matching_results)
}
# Save results with _stream_openai suffix
results_data = {
"individual_runs": all_results,
"summary": summary,
"timestamp": time.strftime("%Y-%m-%d %H:%M:%S")
}
save_json_results(
results_data,
os.path.join(output_data_dir, "first_token_benchmark_stream_openai.json")
)
# Create plot directory if it doesn't exist
output_plots_dir = os.path.join(script_dir, "output_plots")
os.makedirs(output_plots_dir, exist_ok=True)
# Create DataFrame for plotting
df = pd.DataFrame(all_results)
# Create plots with _stream_openai suffix
plot_correlation(
df, "target_tokens", "time_to_first_chunk",
"Time to First Audio vs Input Size (OpenAI Streaming)",
"Number of Input Tokens",
"Time to First Audio (seconds)",
os.path.join(output_plots_dir, "first_token_latency_stream_openai.png")
)
plot_correlation(
df, "target_tokens", "total_time",
"Total Time vs Input Size (OpenAI Streaming)",
"Number of Input Tokens",
"Total Time (seconds)",
os.path.join(output_plots_dir, "total_time_latency_stream_openai.png")
)
plot_timeline(
df,
os.path.join(output_plots_dir, "first_token_timeline_stream_openai.png")
)
print("\nResults and plots saved to:")
print(f"- {os.path.join(output_data_dir, 'first_token_benchmark_stream_openai.json')}")
print(f"- {os.path.join(output_plots_dir, 'first_token_latency_stream_openai.png')}")
print(f"- {os.path.join(output_plots_dir, 'total_time_latency_stream_openai.png')}")
print(f"- {os.path.join(output_plots_dir, 'first_token_timeline_stream_openai.png')}")
if __name__ == "__main__":
main()

View file

@@ -0,0 +1,195 @@
#!/usr/bin/env python3
import os
import time
import requests
from openai import OpenAI
from lib.stream_utils import run_benchmark
OPENAI_CLIENT = OpenAI(
base_url="http://localhost:8880/v1", api_key="not-needed-for-local"
)
def measure_first_token_requests(
text: str, output_dir: str, tokens: int, run_number: int
) -> dict:
"""Measure time to audio via direct API calls and save the audio output"""
results = {
"text_length": len(text),
"token_count": None, # Will be set by run_benchmark
"total_time": None,
"time_to_first_chunk": None,
"error": None,
"audio_path": None,
"audio_length": None,
}
try:
start_time = time.time()
# Make request with streaming enabled
response = requests.post(
"http://localhost:8880/v1/audio/speech",
json={
"model": "kokoro",
"input": text,
"voice": "af",
"response_format": "pcm",
"stream": True,
},
stream=True,
timeout=1800,
)
response.raise_for_status()
# Save complete audio
audio_filename = f"benchmark_tokens{tokens}_run{run_number}_stream.wav"
audio_path = os.path.join(output_dir, audio_filename)
results["audio_path"] = audio_path
first_chunk_time = None
chunks = []
for chunk in response.iter_content(chunk_size=1024):
if chunk:
if first_chunk_time is None:
first_chunk_time = time.time()
results["time_to_first_chunk"] = first_chunk_time - start_time
chunks.append(chunk)
# Concatenate all PCM chunks
if not chunks:
raise ValueError("No audio chunks received")
all_audio_data = b"".join(chunks)
# Write as WAV file
import wave
with wave.open(audio_path, "wb") as wav_file:
wav_file.setnchannels(1) # Mono
wav_file.setsampwidth(2) # 2 bytes per sample (16-bit)
wav_file.setframerate(24000) # Known sample rate for Kokoro
wav_file.writeframes(all_audio_data)
# Calculate audio length using scipy
import scipy.io.wavfile as wavfile
sample_rate, audio_data = wavfile.read(audio_path)
results["audio_length"] = len(audio_data) / sample_rate # Length in seconds
results["total_time"] = time.time() - start_time
# Print debug info
print(f"Complete audio size: {len(all_audio_data)} bytes")
print(f"Number of chunks received: {len(chunks)}")
print(f"Audio length: {results['audio_length']:.3f}s")
return results
except Exception as e:
results["error"] = str(e)
return results
def measure_first_token_openai(
text: str, output_dir: str, tokens: int, run_number: int
) -> dict:
"""Measure time to audio via OpenAI API calls and save the audio output"""
results = {
"text_length": len(text),
"token_count": None, # Will be set by run_benchmark
"total_time": None,
"time_to_first_chunk": None,
"error": None,
"audio_path": None,
"audio_length": None,
}
try:
start_time = time.time()
# Save complete audio
audio_filename = f"benchmark_tokens{tokens}_run{run_number}_stream_openai.wav"
audio_path = os.path.join(output_dir, audio_filename)
results["audio_path"] = audio_path
first_chunk_time = None
all_audio_data = bytearray()
chunk_count = 0
# Make streaming request using OpenAI client
with OPENAI_CLIENT.audio.speech.with_streaming_response.create(
model="kokoro",
voice="af",
response_format="pcm",
input=text,
) as response:
for chunk in response.iter_bytes(chunk_size=1024):
if chunk:
chunk_count += 1
if first_chunk_time is None:
first_chunk_time = time.time()
results["time_to_first_chunk"] = first_chunk_time - start_time
all_audio_data.extend(chunk)
# Write as WAV file
import wave
with wave.open(audio_path, "wb") as wav_file:
wav_file.setnchannels(1) # Mono
wav_file.setsampwidth(2) # 2 bytes per sample (16-bit)
wav_file.setframerate(24000) # Known sample rate for Kokoro
wav_file.writeframes(all_audio_data)
# Calculate audio length using scipy
import scipy.io.wavfile as wavfile
sample_rate, audio_data = wavfile.read(audio_path)
results["audio_length"] = len(audio_data) / sample_rate # Length in seconds
results["total_time"] = time.time() - start_time
# Print debug info
print(f"Complete audio size: {len(all_audio_data)} bytes")
print(f"Number of chunks received: {chunk_count}")
print(f"Audio length: {results['audio_length']:.3f}s")
return results
except Exception as e:
results["error"] = str(e)
return results
def main():
script_dir = os.path.dirname(os.path.abspath(__file__))
prefix = "cpu"
# Run requests benchmark
print("\n=== Running Direct Requests Benchmark ===")
run_benchmark(
measure_first_token_requests,
output_dir=os.path.join(script_dir, "output_audio_stream"),
output_data_dir=os.path.join(script_dir, "output_data"),
output_plots_dir=os.path.join(script_dir, "output_plots"),
suffix="_stream",
plot_title_suffix="(Streaming)",
prefix=prefix
)
# Run OpenAI benchmark
print("\n=== Running OpenAI Library Benchmark ===")
run_benchmark(
measure_first_token_openai,
output_dir=os.path.join(script_dir, "output_audio_stream_openai"),
output_data_dir=os.path.join(script_dir, "output_data"),
output_plots_dir=os.path.join(script_dir, "output_plots"),
suffix="_stream_openai",
plot_title_suffix="(OpenAI Streaming)",
prefix=prefix
)
if __name__ == "__main__":
main()
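The consolidated script delegates the measurement loop, aggregation, and plotting to `lib.stream_utils.run_benchmark`, which is not included in this diff. Judging from the call sites above and the two standalone scripts it replaces, its core loop is presumably along these lines (a sketch; the names, defaults, and text source here are assumptions, not the actual implementation):

```
import os

def run_benchmark(
    measure_func,
    output_dir: str,
    output_data_dir: str,
    output_plots_dir: str,
    suffix: str = "",
    plot_title_suffix: str = "",
    prefix: str = "",
    token_sizes=(50, 100, 200, 500),  # assumed defaults
    runs_per_size: int = 5,           # assumed default
):
    for path in (output_dir, output_data_dir, output_plots_dir):
        os.makedirs(path, exist_ok=True)

    all_results = []
    for tokens in token_sizes:
        # The real helper slices a shared corpus to the target token count;
        # a constant string stands in here.
        text = "This is a placeholder sentence. " * tokens
        for run in range(1, runs_per_size + 1):
            result = measure_func(text, output_dir, tokens, run)
            result["target_tokens"] = tokens
            result["run_number"] = run
            all_results.append(result)

    # The real helper then averages per token size, saves the JSON results
    # with the given suffix, and renders the latency/timeline plots using
    # prefix and plot_title_suffix.
    return all_results
```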

View file

@@ -1,30 +1,37 @@
#!/usr/bin/env python3
import os
import json
import time
import queue
import threading
import sys
from datetime import datetime

import pandas as pd

from lib.shared_utils import (
    real_time_factor,
    save_json_results,
    get_system_metrics,
    write_benchmark_stats,
)
from lib.shared_plotting import plot_correlation, plot_system_metrics
from lib.shared_benchmark_utils import (
    enc,
    make_tts_request,
    get_text_for_tokens,
    generate_token_sizes,
)


class SystemMonitor:
    def __init__(self, interval=1.0):
        """Rough system tracker: Not always accurate"""
        self.interval = interval
        self.metrics_queue = queue.Queue()
        self.stop_event = threading.Event()
        self.metrics_timeline = []
        self.start_time = None

    def _monitor_loop(self):
        """Background thread function to collect system metrics."""
        while not self.stop_event.is_set():
@@ -32,20 +39,20 @@ class SystemMonitor:
            metrics["relative_time"] = time.time() - self.start_time
            self.metrics_queue.put(metrics)
            time.sleep(self.interval)

    def start(self):
        """Start the monitoring thread."""
        self.start_time = time.time()
        self.monitor_thread = threading.Thread(target=self._monitor_loop)
        self.monitor_thread.daemon = True
        self.monitor_thread.start()

    def stop(self):
        """Stop the monitoring thread and collect final metrics."""
        self.stop_event.set()
        if hasattr(self, "monitor_thread"):
            self.monitor_thread.join(timeout=2)

        # Collect all metrics from queue
        while True:
            try:
@@ -53,23 +60,24 @@ class SystemMonitor:
                self.metrics_timeline.append(metrics)
            except queue.Empty:
                break

        return self.metrics_timeline
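The monitor is driven by the start/stop pair; a typical wrapping of a benchmark run looks like this (a sketch using the class exactly as defined above):

```
# Sample the system once per second for the duration of a benchmark run.
monitor = SystemMonitor(interval=1.0)
monitor.start()
# ... issue TTS requests here ...
metrics_timeline = monitor.stop()  # drains the queue; one dict per sample
```

Because the worker is a daemon thread, an aborted benchmark cannot keep the process alive; `stop()` joins with a two-second timeout before draining whatever samples made it into the queue.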
def main():
    # Initialize system monitor
    monitor = SystemMonitor(interval=1.0)  # 1 second interval

    # Set prefix for output files (e.g. "gpu", "cpu", "onnx", etc.)
    prefix = "cpu"
    # Generate token sizes
    if "gpu" in prefix:
        token_sizes = generate_token_sizes(
            max_tokens=1000, dense_step=150, dense_max=1000, sparse_step=1000
        )
    elif "cpu" in prefix:
        token_sizes = generate_token_sizes(
            max_tokens=1000, dense_step=100, dense_max=500, sparse_step=250
        )
    else:
        token_sizes = generate_token_sizes(max_tokens=3000)
@@ -78,7 +86,7 @@ def main():
    output_dir = os.path.join(script_dir, "output_audio")
    output_data_dir = os.path.join(script_dir, "output_data")
    output_plots_dir = os.path.join(script_dir, "output_plots")

    # Create output directories
    os.makedirs(output_dir, exist_ok=True)
    os.makedirs(output_data_dir, exist_ok=True)
@@ -90,7 +98,9 @@ def main():
            filename = f"{prefix}_{filename}"
        return os.path.join(path, filename)

    with open(
        os.path.join(script_dir, "the_time_machine_hg_wells.txt"), "r", encoding="utf-8"
    ) as f:
        text = f.read()
    total_tokens = len(enc.encode(text))
@@ -100,7 +110,7 @@ def main():
    results = []
    test_start_time = time.time()

    # Start system monitoring
    monitor.start()
@@ -114,7 +124,8 @@ def main():
        processing_time, audio_length = make_tts_request(
            chunk,
            output_dir=output_dir,
            prefix=prefix,
            stream=False,  # Use non-streaming mode for RTF benchmarking
        )
        if processing_time is None or audio_length is None:
            print("Breaking loop due to error")
@@ -123,14 +134,16 @@ def main():
        # Calculate RTF using the correct formula
        rtf = real_time_factor(processing_time, audio_length)
        print(f"Real-Time Factor: {rtf:.5f}")

        results.append(
            {
                "tokens": actual_tokens,
                "processing_time": processing_time,
                "output_length": audio_length,
                "rtf": rtf,
                "elapsed_time": round(time.time() - test_start_time, 5),
            }
        )

    df = pd.DataFrame(results)
    if df.empty:
@@ -144,89 +157,101 @@ def main():
        {
            "title": "Benchmark Statistics (with correct RTF)",
            "stats": {
                "Total tokens processed": df["tokens"].sum(),
                "Total audio generated (s)": df["output_length"].sum(),
                "Total test duration (s)": df["elapsed_time"].max(),
                "Average processing rate (tokens/s)": df["tokens_per_second"].mean(),
                "Average RTF": df["rtf"].mean(),
                "Average Real Time Speed": 1 / df["rtf"].mean(),
            },
        },
        {
            "title": "Per-chunk Stats",
            "stats": {
                "Average chunk size (tokens)": df["tokens"].mean(),
                "Min chunk size (tokens)": df["tokens"].min(),
                "Max chunk size (tokens)": df["tokens"].max(),
                "Average processing time (s)": df["processing_time"].mean(),
                "Average output length (s)": df["output_length"].mean(),
            },
        },
        {
            "title": "Performance Ranges",
            "stats": {
                "Processing rate range (tokens/s)": f"{df['tokens_per_second'].min():.2f} - {df['tokens_per_second'].max():.2f}",
                "RTF range": f"{df['rtf'].min():.2f}x - {df['rtf'].max():.2f}x",
                "Real Time Speed range": f"{1/df['rtf'].max():.2f}x - {1/df['rtf'].min():.2f}x",
            },
        },
    ]
    write_benchmark_stats(
        stats, prefix_path(output_data_dir, "benchmark_stats_rtf.txt")
    )

    # Plot Processing Time vs Token Count
    plot_correlation(
        df,
        "tokens",
        "processing_time",
        "Processing Time vs Input Size",
        "Number of Input Tokens",
        "Processing Time (seconds)",
        prefix_path(output_plots_dir, "processing_time_rtf.png"),
    )

    # Plot RTF vs Token Count
    plot_correlation(
        df,
        "tokens",
        "rtf",
        "Real-Time Factor vs Input Size",
        "Number of Input Tokens",
        "Real-Time Factor (processing time / audio length)",
        prefix_path(output_plots_dir, "realtime_factor_rtf.png"),
    )

    # Stop monitoring and get final metrics
    final_metrics = monitor.stop()

    # Convert metrics timeline to DataFrame for stats
    metrics_df = pd.DataFrame(final_metrics)

    # Add system usage stats
    if not metrics_df.empty:
        stats.append(
            {
                "title": "System Usage Statistics",
                "stats": {
                    "Peak CPU Usage (%)": metrics_df["cpu_percent"].max(),
                    "Avg CPU Usage (%)": metrics_df["cpu_percent"].mean(),
                    "Peak RAM Usage (%)": metrics_df["ram_percent"].max(),
                    "Avg RAM Usage (%)": metrics_df["ram_percent"].mean(),
                    "Peak RAM Used (GB)": metrics_df["ram_used_gb"].max(),
                    "Avg RAM Used (GB)": metrics_df["ram_used_gb"].mean(),
                },
            }
        )
        if "gpu_memory_used" in metrics_df:
            stats[-1]["stats"].update(
                {
                    "Peak GPU Memory (MB)": metrics_df["gpu_memory_used"].max(),
                    "Avg GPU Memory (MB)": metrics_df["gpu_memory_used"].mean(),
                }
            )

    # Plot system metrics
    plot_system_metrics(
        final_metrics, prefix_path(output_plots_dir, "system_usage_rtf.png")
    )

    # Save final results
    save_json_results(
        {
            "results": results,
            "system_metrics": final_metrics,
            "test_duration": time.time() - test_start_time,
        },
        prefix_path(output_data_dir, "benchmark_results_rtf.json"),
    )

    print("\nResults saved to:")

View file

@@ -1,19 +1,30 @@
import os
import json
import time

import pandas as pd

from examples.assorted_checks.lib.shared_utils import (
    save_json_results,
    get_system_metrics,
    write_benchmark_stats,
)
from examples.assorted_checks.lib.shared_plotting import (
    plot_correlation,
    plot_system_metrics,
)
from examples.assorted_checks.lib.shared_benchmark_utils import (
    enc,
    make_tts_request,
    get_text_for_tokens,
    generate_token_sizes,
)


def main():
    # Get optional prefix from first command line argument
    import sys

    prefix = sys.argv[1] if len(sys.argv) > 1 else ""

    # Set up paths relative to this file
@@ -21,7 +32,7 @@ def main():
    output_dir = os.path.join(script_dir, "output_audio")
    output_data_dir = os.path.join(script_dir, "output_data")
    output_plots_dir = os.path.join(script_dir, "output_plots")

    # Create output directories
    os.makedirs(output_dir, exist_ok=True)
    os.makedirs(output_data_dir, exist_ok=True)
@@ -43,7 +54,6 @@ def main():
    total_tokens = len(enc.encode(text))
    print(f"Total tokens in file: {total_tokens}")

    token_sizes = generate_token_sizes(total_tokens)
    print(f"Testing sizes: {token_sizes}")
@@ -85,7 +95,7 @@ def main():
        # Save intermediate results
        save_json_results(
            {"results": results, "system_metrics": system_metrics},
            prefix_path(output_data_dir, "benchmark_results.json"),
        )

    # Create DataFrame and calculate stats
@@ -102,53 +112,59 @@ def main():
        {
            "title": "Benchmark Statistics",
            "stats": {
                "Total tokens processed": df["tokens"].sum(),
                "Total audio generated (s)": df["output_length"].sum(),
                "Total test duration (s)": df["elapsed_time"].max(),
                "Average processing rate (tokens/s)": df["tokens_per_second"].mean(),
                "Average realtime factor": df["realtime_factor"].mean(),
            },
        },
        {
            "title": "Per-chunk Stats",
            "stats": {
                "Average chunk size (tokens)": df["tokens"].mean(),
                "Min chunk size (tokens)": df["tokens"].min(),
                "Max chunk size (tokens)": df["tokens"].max(),
                "Average processing time (s)": df["processing_time"].mean(),
                "Average output length (s)": df["output_length"].mean(),
            },
        },
        {
            "title": "Performance Ranges",
            "stats": {
                "Processing rate range (tokens/s)": f"{df['tokens_per_second'].min():.2f} - {df['tokens_per_second'].max():.2f}",
                "Realtime factor range": f"{df['realtime_factor'].min():.2f}x - {df['realtime_factor'].max():.2f}x",
            },
        },
    ]
    write_benchmark_stats(stats, prefix_path(output_data_dir, "benchmark_stats.txt"))

    # Plot Processing Time vs Token Count
    plot_correlation(
        df,
        "tokens",
        "processing_time",
        "Processing Time vs Input Size",
        "Number of Input Tokens",
        "Processing Time (seconds)",
        prefix_path(output_plots_dir, "processing_time.png"),
    )

    # Plot Realtime Factor vs Token Count
    plot_correlation(
        df,
        "tokens",
        "realtime_factor",
        "Realtime Factor vs Input Size",
        "Number of Input Tokens",
        "Realtime Factor (output length / processing time)",
        prefix_path(output_plots_dir, "realtime_factor.png"),
    )

    # Plot system metrics
    plot_system_metrics(
        system_metrics, prefix_path(output_plots_dir, "system_usage.png")
    )

    print("\nResults saved to:")
    print(f"- {prefix_path(output_data_dir, 'benchmark_results.json')}")

View file

@@ -1,11 +1,12 @@
"""Shared utilities specific to TTS benchmarking."""

import time
from typing import List, Tuple, Optional

import requests
import tiktoken

from .shared_utils import save_audio_file, get_audio_length

# Global tokenizer instance
enc = tiktoken.get_encoding("cl100k_base")
@@ -13,11 +14,11 @@ enc = tiktoken.get_encoding("cl100k_base")
def get_text_for_tokens(text: str, num_tokens: int) -> str:
    """Get a slice of text that contains exactly num_tokens tokens.

    Args:
        text: Input text to slice
        num_tokens: Desired number of tokens

    Returns:
        str: Text slice containing exactly num_tokens tokens
    """
@@ -31,44 +32,69 @@ def make_tts_request(
    text: str,
    output_dir: str = None,
    timeout: int = 1800,
    prefix: str = "",
    stream: bool = True,
) -> Tuple[Optional[float], Optional[float]]:
    """Make TTS request using OpenAI-compatible endpoint.

    Args:
        text: Input text to convert to speech
        output_dir: Directory to save audio files. If None, audio won't be saved.
        timeout: Request timeout in seconds
        prefix: Optional prefix for output filenames
        stream: If True, use the streaming endpoint and collect the chunks

    Returns:
        tuple: (processing_time, audio_length) in seconds, or (None, None) on error
    """
    try:
        start_time = time.time()

        if stream:
            # For streaming, we need to collect all chunks
            audio_chunks = []
            response = requests.post(
                "http://localhost:8880/v1/audio/speech",
                json={
                    "model": "kokoro",
                    "input": text,
                    "voice": "af",
                    "response_format": "wav",
                    "stream": True,
                },
                timeout=timeout,
                stream=True,
            )
            response.raise_for_status()

            for chunk in response.iter_content(chunk_size=8192):
                if chunk:
                    audio_chunks.append(chunk)

            # Combine all chunks
            audio_data = b"".join(audio_chunks)
        else:
            response = requests.post(
                "http://localhost:8880/v1/audio/speech",
                json={
                    "model": "kokoro",
                    "input": text,
                    "voice": "af",
                    "response_format": "wav",
                    "stream": False,
                },
                timeout=timeout,
            )
            response.raise_for_status()
            audio_data = response.content

        processing_time = round(time.time() - start_time, 2)
        # Calculate audio length from audio data
        audio_length = get_audio_length(audio_data)

        # Save the audio file if output_dir is provided
        if output_dir:
            token_count = len(enc.encode(text))
            output_file = save_audio_file(
                audio_data, f"chunk_{token_count}_tokens", output_dir
            )
            print(f"Saved audio to {output_file}")
@@ -86,26 +112,26 @@ def generate_token_sizes(
    max_tokens: int,
    dense_step: int = 100,
    dense_max: int = 1000,
    sparse_step: int = 1000,
) -> List[int]:
    """Generate token size ranges with dense sampling at start.

    Args:
        max_tokens: Maximum number of tokens to generate sizes up to
        dense_step: Step size for dense sampling range
        dense_max: Maximum value for dense sampling
        sparse_step: Step size for sparse sampling range

    Returns:
        list: Sorted list of token sizes
    """
    # Dense sampling at start
    dense_range = list(range(dense_step, dense_max + 1, dense_step))
    if max_tokens <= dense_max or sparse_step < dense_max:
        return sorted(dense_range)

    # Sparse sampling for larger sizes
    sparse_range = list(range(dense_max + sparse_step, max_tokens + 1, sparse_step))

    # Combine and deduplicate
    return sorted(list(set(dense_range + sparse_range)))
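Two worked examples of the sampling behavior, using the function exactly as defined above:

```
# Dense sampling below 1000 tokens, sparse thousands above:
generate_token_sizes(max_tokens=3000)
# -> [100, 200, 300, 400, 500, 600, 700, 800, 900, 1000, 2000, 3000]

# With the CPU settings used in the RTF benchmark, sparse_step (250) is below
# dense_max (500), so only the dense range is returned:
generate_token_sizes(max_tokens=1000, dense_step=100, dense_max=500, sparse_step=250)
# -> [100, 200, 300, 400, 500]
```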

View file

@@ -1,7 +1,8 @@
"""Shared plotting utilities for benchmarks and tests."""

import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.patches as patches
@@ -12,66 +13,71 @@ STYLE_CONFIG = {
    "secondary_color": "#05d9e8",
    "grid_color": "#ffffff",
    "text_color": "#ffffff",
    "font_sizes": {"title": 16, "label": 14, "tick": 12, "text": 10},
}


def setup_plot(fig, ax, title, xlabel=None, ylabel=None):
    """Configure plot styling with consistent theme.

    Args:
        fig: matplotlib figure object
        ax: matplotlib axis object
        title: str, plot title
        xlabel: str, optional x-axis label
        ylabel: str, optional y-axis label

    Returns:
        tuple: (fig, ax) with applied styling
    """
    # Grid styling
    ax.grid(True, linestyle="--", alpha=0.3, color=STYLE_CONFIG["grid_color"])

    # Title and labels
    ax.set_title(
        title,
        pad=20,
        fontsize=STYLE_CONFIG["font_sizes"]["title"],
        fontweight="bold",
        color=STYLE_CONFIG["text_color"],
    )
    if xlabel:
        ax.set_xlabel(
            xlabel,
            fontsize=STYLE_CONFIG["font_sizes"]["label"],
            fontweight="medium",
            color=STYLE_CONFIG["text_color"],
        )
    if ylabel:
        ax.set_ylabel(
            ylabel,
            fontsize=STYLE_CONFIG["font_sizes"]["label"],
            fontweight="medium",
            color=STYLE_CONFIG["text_color"],
        )

    # Tick styling
    ax.tick_params(
        labelsize=STYLE_CONFIG["font_sizes"]["tick"], colors=STYLE_CONFIG["text_color"]
    )

    # Spine styling
    for spine in ax.spines.values():
        spine.set_color(STYLE_CONFIG["text_color"])
        spine.set_alpha(0.3)
        spine.set_linewidth(0.5)

    # Background colors
    ax.set_facecolor(STYLE_CONFIG["background_color"])
    fig.patch.set_facecolor(STYLE_CONFIG["background_color"])

    return fig, ax


def plot_system_metrics(metrics_data, output_path):
    """Create plots for system metrics over time.

    Args:
        metrics_data: list of dicts containing system metrics
        output_path: str, path to save the output plot
@@ -79,68 +85,118 @@ def plot_system_metrics(metrics_data, output_path):
    df = pd.DataFrame(metrics_data)
    df["timestamp"] = pd.to_datetime(df["timestamp"])
    elapsed_time = (df["timestamp"] - df["timestamp"].iloc[0]).dt.total_seconds()

    # Get baseline values
    baseline_cpu = df["cpu_percent"].iloc[0]
    baseline_ram = df["ram_used_gb"].iloc[0]
    baseline_gpu = (
        df["gpu_memory_used"].iloc[0] / 1024
        if "gpu_memory_used" in df.columns
        else None
    )

    # Convert GPU memory to GB if present
    if "gpu_memory_used" in df.columns:
        df["gpu_memory_gb"] = df["gpu_memory_used"] / 1024

    plt.style.use("dark_background")

    # Create subplots based on available metrics
    has_gpu = "gpu_memory_used" in df.columns
    num_plots = 3 if has_gpu else 2
    fig, axes = plt.subplots(num_plots, 1, figsize=(15, 5 * num_plots))
    fig.patch.set_facecolor(STYLE_CONFIG["background_color"])

    # Smoothing window
    window = min(5, len(df) // 2)

    # Plot CPU Usage
    smoothed_cpu = df["cpu_percent"].rolling(window=window, center=True).mean()
    sns.lineplot(
        x=elapsed_time,
        y=smoothed_cpu,
        ax=axes[0],
        color=STYLE_CONFIG["primary_color"],
        linewidth=2,
    )
    axes[0].axhline(
        y=baseline_cpu,
        color=STYLE_CONFIG["secondary_color"],
        linestyle="--",
        alpha=0.5,
        label="Baseline",
    )
    setup_plot(
        fig,
        axes[0],
        "CPU Usage Over Time",
        xlabel="Time (seconds)",
        ylabel="CPU Usage (%)",
    )
    axes[0].set_ylim(0, max(df["cpu_percent"]) * 1.1)
    axes[0].legend()

    # Plot RAM Usage
    smoothed_ram = df["ram_used_gb"].rolling(window=window, center=True).mean()
    sns.lineplot(
        x=elapsed_time,
        y=smoothed_ram,
        ax=axes[1],
        color=STYLE_CONFIG["secondary_color"],
        linewidth=2,
    )
    axes[1].axhline(
        y=baseline_ram,
        color=STYLE_CONFIG["primary_color"],
        linestyle="--",
        alpha=0.5,
        label="Baseline",
    )
    setup_plot(
        fig,
        axes[1],
        "RAM Usage Over Time",
        xlabel="Time (seconds)",
        ylabel="RAM Usage (GB)",
    )
    axes[1].set_ylim(0, max(df["ram_used_gb"]) * 1.1)
    axes[1].legend()

    # Plot GPU Memory if available
    if has_gpu:
        smoothed_gpu = df["gpu_memory_gb"].rolling(window=window, center=True).mean()
        sns.lineplot(
            x=elapsed_time,
            y=smoothed_gpu,
            ax=axes[2],
            color=STYLE_CONFIG["primary_color"],
            linewidth=2,
        )
        axes[2].axhline(
            y=baseline_gpu,
            color=STYLE_CONFIG["secondary_color"],
            linestyle="--",
            alpha=0.5,
            label="Baseline",
        )
        setup_plot(
            fig,
            axes[2],
            "GPU Memory Usage Over Time",
            xlabel="Time (seconds)",
            ylabel="GPU Memory (GB)",
        )
        axes[2].set_ylim(0, max(df["gpu_memory_gb"]) * 1.1)
        axes[2].legend()

    plt.tight_layout()
    plt.savefig(output_path, dpi=300, bbox_inches="tight")
    plt.close()


def plot_timeline(df, output_path, suffix="", prefix=""):
    """Create timeline plot showing latency for each run.

    Args:
        df: pandas DataFrame containing run data with columns:
            - target_tokens: number of tokens
@@ -149,124 +205,161 @@ def plot_timeline(df, output_path, suffix=""):
        output_path: str, path to save the output plot
    """
    plt.style.use("dark_background")

    # Sort by tokens and run number
    df = df.sort_values(["target_tokens", "run_number"])

    # Create figure and axis
    fig, ax = plt.subplots(figsize=(12, 6))

    # Calculate y positions for each run with tighter grouping
    unique_tokens = sorted(df["target_tokens"].unique())
    y_positions = {}
    current_y = 0
    group_spacing = 0.8  # Space between groups
    run_spacing = 0.2  # Space between runs in a group

    for tokens in unique_tokens:
        runs = df[df["target_tokens"] == tokens]
        base_y = current_y
        for i, (_, run) in enumerate(runs.iterrows()):
            y_positions[(tokens, run["run_number"])] = base_y + (i * run_spacing)
        current_y = base_y + (len(runs) * run_spacing) + group_spacing

    # Plot bars and points with more transparency
    bar_height = 0.15
    for _, row in df.iterrows():
        y = y_positions[(row["target_tokens"], row["run_number"])]
        latency = row["time_to_first_chunk"]

        # Latency bar
        ax.add_patch(
            patches.Rectangle(
                (0, y - bar_height / 2),
                latency,
                bar_height,
                facecolor=STYLE_CONFIG["primary_color"],
                alpha=0.3,
            )
        )

        # End point
        ax.plot(
            latency,
            y,
            "o",
            color=STYLE_CONFIG["secondary_color"],
            markersize=4,
            alpha=0.5,
        )

    # Add mean lines and values for each token group
    for tokens in unique_tokens:
        token_runs = df[df["target_tokens"] == tokens]
        mean_latency = token_runs["time_to_first_chunk"].mean()
        y_positions_for_token = [
            y_positions[(tokens, run["run_number"])] for _, run in token_runs.iterrows()
        ]
        min_y = min(y_positions_for_token)
        max_y = max(y_positions_for_token)
        group_center = (min_y + max_y) / 2

        # Plot mean line with gradient alpha
        gradient = np.linspace(0.2, 0.8, 100)
        for i in range(len(gradient) - 1):
            y1 = (
                min_y
                - bar_height
                + (max_y - min_y + 2 * bar_height) * (i / len(gradient))
            )
            y2 = (
                min_y
                - bar_height
                + (max_y - min_y + 2 * bar_height) * ((i + 1) / len(gradient))
            )
            ax.plot(
                [mean_latency, mean_latency],
                [y1, y2],
                "-",
                color=STYLE_CONFIG["secondary_color"],
                linewidth=3,
                alpha=gradient[i],
            )

        # Add mean value label with background
        label_text = f"Mean: {mean_latency:.3f}s"
        bbox_props = dict(
            facecolor=STYLE_CONFIG["background_color"],
            edgecolor=STYLE_CONFIG["secondary_color"],
            alpha=0.8,
            pad=3,
            linewidth=1,
        )
        ax.text(
            mean_latency + 0.02,
            group_center,
            label_text,
            color=STYLE_CONFIG["secondary_color"],
            va="center",
            fontsize=10,
            fontweight="bold",
            bbox=bbox_props,
        )

    # Customize plot
    ax.set_ylim(-1, current_y)
    ax.set_xlim(0, df["time_to_first_chunk"].max() * 1.3)  # Extra space for labels

    # Add labels for token groups with tighter spacing
    group_positions = {}
    for tokens in unique_tokens:
        runs = df[df["target_tokens"] == tokens]
        y_positions_for_token = [
            y_positions[(tokens, run["run_number"])] for _, run in runs.iterrows()
        ]
        group_positions[tokens] = sum(y_positions_for_token) / len(
            y_positions_for_token
        )
        plt.axhline(
            y=min(y_positions_for_token) - bar_height,
            color="white",
            alpha=0.1,
            linestyle="-",
        )

    # Calculate mean audio length for each token group
    audio_lengths = {}
    for tokens in unique_tokens:
        token_runs = df[df["target_tokens"] == tokens]
        audio_lengths[tokens] = token_runs["audio_length"].mean()

    # Set y-ticks at group centers with token counts and audio lengths
    plt.yticks(
        list(group_positions.values()),
        [
            f"{tokens} tokens\n({audio_lengths[tokens]:.1f}s)"
            for tokens in group_positions.keys()
        ],
        fontsize=10,
    )

    # Customize appearance
    setup_plot(
        fig,
        ax,
        prefix.upper() + " Time-To-Audio Latency " + suffix,
        xlabel="Time (seconds)",
        ylabel="Input Size",
    )

    plt.tight_layout()
    plt.savefig(output_path, dpi=300, bbox_inches="tight")
    plt.close()


def plot_correlation(df, x, y, title, xlabel, ylabel, output_path):
    """Create correlation plot with regression line and correlation coefficient.

    Args:
        df: pandas DataFrame containing the data
        x: str, column name for x-axis
@@ -277,28 +370,40 @@ def plot_correlation(df, x, y, title, xlabel, ylabel, output_path):
        output_path: str, path to save the output plot
    """
    plt.style.use("dark_background")

    fig, ax = plt.subplots(figsize=(12, 8))

    # Scatter plot
    sns.scatterplot(
        data=df, x=x, y=y, s=100, alpha=0.6, color=STYLE_CONFIG["primary_color"]
    )

    # Regression line
    sns.regplot(
        data=df,
        x=x,
        y=y,
        scatter=False,
        color=STYLE_CONFIG["secondary_color"],
        line_kws={"linewidth": 2},
    )

    # Add correlation coefficient
    corr = df[x].corr(df[y])
    plt.text(
        0.05,
        0.95,
        f"Correlation: {corr:.2f}",
        transform=ax.transAxes,
        fontsize=STYLE_CONFIG["font_sizes"]["text"],
        color=STYLE_CONFIG["text_color"],
        bbox=dict(
            facecolor=STYLE_CONFIG["background_color"],
            edgecolor=STYLE_CONFIG["text_color"],
            alpha=0.7,
        ),
    )

    setup_plot(fig, ax, title, xlabel=xlabel, ylabel=ylabel)
    plt.savefig(output_path, dpi=300, bbox_inches="tight")
    plt.close()

View file

@@ -1,9 +1,10 @@

"""Shared utilities for benchmarks and tests."""

import os
import json
import subprocess
from datetime import datetime
from typing import Any, Dict, List, Optional, Union

import psutil
import scipy.io.wavfile as wavfile

@@ -12,28 +13,46 @@ import scipy.io.wavfile as wavfile

TORCH_AVAILABLE = False
try:
    import torch

    TORCH_AVAILABLE = torch.cuda.is_available()
except ImportError:
    pass


def check_audio_file_is_silent(audio_path: str, threshold: float = 0.01) -> bool:
    """Check if an audio file is silent by comparing peak amplitude to a threshold.

    Args:
        audio_path: Path to the audio file
        threshold: Peak amplitude threshold for silence

    Returns:
        bool: True if audio is silent, False otherwise
    """
    rate, data = wavfile.read(audio_path)
    peak_amplitude = max(abs(data.min()), abs(data.max())) / 32768.0  # 16-bit audio
    return peak_amplitude < threshold
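
# Usage sketch (added for illustration; the path is hypothetical). With the
# default threshold of 0.01 (~1% of int16 full scale, i.e. ~328), a file whose
# loudest sample is quieter than that is reported as silent:
#
#   if check_audio_file_is_silent("output_audio/benchmark_tokens10_run1.wav"):
#       print("WARNING: silent output")
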
def get_audio_length(audio_data: bytes, temp_dir: str = None) -> float:
    """Get audio length in seconds from bytes data.

    Args:
        audio_data: Raw audio bytes
        temp_dir: Directory for temporary file. If None, uses system temp directory.

    Returns:
        float: Audio length in seconds
    """
    if temp_dir is None:
        import tempfile

        temp_dir = tempfile.gettempdir()

    temp_path = os.path.join(temp_dir, "temp.wav")
    os.makedirs(temp_dir, exist_ok=True)

    with open(temp_path, "wb") as f:
        f.write(audio_data)

@@ -47,11 +66,11 @@ def get_audio_length(audio_data: bytes, temp_dir: str = None) -> float:

def get_gpu_memory(average: bool = True) -> Optional[Union[float, List[float]]]:
    """Get GPU memory usage using PyTorch if available, falling back to nvidia-smi.

    Args:
        average: If True and multiple GPUs present, returns average memory usage.
                If False, returns list of memory usage per GPU.

    Returns:
        float or List[float] or None: GPU memory usage in MB. Returns None if no GPU available.
            If average=False and multiple GPUs present, returns list of values.

@@ -60,19 +79,23 @@ def get_gpu_memory(average: bool = True) -> Optional[Union[float, List[float]]]:

        n_gpus = torch.cuda.device_count()
        memory_used = []
        for i in range(n_gpus):
            memory_used.append(
                torch.cuda.memory_allocated(i) / 1024**2
            )  # Convert to MB

        if average and len(memory_used) > 0:
            return sum(memory_used) / len(memory_used)
        return memory_used if len(memory_used) > 1 else memory_used[0]

    # Fall back to nvidia-smi
    try:
        result = subprocess.check_output(
            ["nvidia-smi", "--query-gpu=memory.used", "--format=csv,nounits,noheader"]
        )
        memory_values = [
            float(x.strip()) for x in result.decode("utf-8").split("\n") if x.strip()
        ]
        if average and len(memory_values) > 0:
            return sum(memory_values) / len(memory_values)
        return memory_values if len(memory_values) > 1 else memory_values[0]

@@ -82,14 +105,14 @@ def get_gpu_memory(average: bool = True) -> Optional[Union[float, List[float]]]:

def get_system_metrics() -> Dict[str, Union[str, float]]:
    """Get current system metrics including CPU, RAM, and GPU if available.

    Returns:
        dict: System metrics including timestamp, CPU%, RAM%, RAM GB, and GPU MB if available
    """
    # Get per-CPU percentages and calculate average
    cpu_percentages = psutil.cpu_percent(percpu=True)
    avg_cpu = sum(cpu_percentages) / len(cpu_percentages)

    metrics = {
        "timestamp": datetime.now().isoformat(),
        "cpu_percent": round(avg_cpu, 2),

@@ -106,40 +129,40 @@ def get_system_metrics() -> Dict[str, Union[str, float]]:

def save_audio_file(audio_data: bytes, identifier: str, output_dir: str) -> str:
    """Save audio data to a file with proper naming and directory creation.

    Args:
        audio_data: Raw audio bytes
        identifier: String to identify this audio file (e.g. token count, test name)
        output_dir: Directory to save the file

    Returns:
        str: Path to the saved audio file
    """
    os.makedirs(output_dir, exist_ok=True)
    output_file = os.path.join(output_dir, f"{identifier}.wav")

    with open(output_file, "wb") as f:
        f.write(audio_data)

    return output_file


def write_benchmark_stats(stats: List[Dict[str, Any]], output_file: str) -> None:
    """Write benchmark statistics to a file in a clean, organized format.

    Args:
        stats: List of dictionaries containing stat name/value pairs
        output_file: Path to output file
    """
    os.makedirs(os.path.dirname(output_file), exist_ok=True)

    with open(output_file, "w") as f:
        for section in stats:
            # Write section header
            f.write(f"=== {section['title']} ===\n\n")

            # Write stats
            for label, value in section["stats"].items():
                if isinstance(value, float):
                    f.write(f"{label}: {value:.2f}\n")
                else:

@@ -149,7 +172,7 @@ def write_benchmark_stats(stats: List[Dict[str, Any]], output_file: str) -> None

def save_json_results(results: Dict[str, Any], output_file: str) -> None:
    """Save benchmark results to a JSON file with proper formatting.

    Args:
        results: Dictionary of results to save
        output_file: Path to output file

@@ -159,14 +182,16 @@ def save_json_results(results: Dict[str, Any], output_file: str) -> None:

        json.dump(results, f, indent=2)


def real_time_factor(
    processing_time: float, audio_length: float, decimals: int = 2
) -> float:
    """Calculate Real-Time Factor (RTF) as processing-time / length-of-audio.

    Args:
        processing_time: Time taken to process/generate audio
        audio_length: Length of the generated audio
        decimals: Number of decimal places to round to

    Returns:
        float: RTF value
    """

View file

@@ -0,0 +1,205 @@
#!/usr/bin/env python3
import os
import time
import wave
from typing import Any, Dict, List, Callable, Optional
import pandas as pd
import scipy.io.wavfile as wavfile
from .shared_utils import save_json_results
from .shared_plotting import plot_timeline, plot_correlation
from .shared_benchmark_utils import enc, get_text_for_tokens
def check_audio_silence(audio_path: str) -> bool:
"""Check if audio file contains only silence"""
sample_rate, audio_data = wavfile.read(audio_path)
# Convert to float for RMS calculation
audio_float = audio_data.astype(float)
# Calculate RMS value
rms = (audio_float**2).mean() ** 0.5
# Define silence threshold (adjust if needed)
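    # NOTE: RMS here is computed on raw samples (int16 full scale 32767,
    # assuming 16-bit PCM output), so 50.0 is roughly 0.15% of full scale.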
SILENCE_THRESHOLD = 50.0
return rms < SILENCE_THRESHOLD
def process_benchmark_results(
all_results: List[Dict[str, Any]], token_sizes: List[int]
) -> Dict[str, Any]:
"""Process benchmark results and generate summary"""
summary = {}
for tokens in token_sizes:
matching_results = [
r for r in all_results if r["target_tokens"] == tokens and not r["error"]
]
if matching_results:
avg_first_chunk = sum(
r["time_to_first_chunk"] for r in matching_results
) / len(matching_results)
avg_total = sum(r["total_time"] for r in matching_results) / len(
matching_results
)
avg_audio_length = sum(r["audio_length"] for r in matching_results) / len(
matching_results
)
summary[tokens] = {
"avg_time_to_first_chunk": round(avg_first_chunk, 3),
"avg_total_time": round(avg_total, 3),
"avg_audio_length": round(avg_audio_length, 3),
"num_successful_runs": len(matching_results),
}
return summary
def save_benchmark_results(
all_results: List[Dict[str, Any]],
summary: Dict[str, Any],
output_data_dir: str,
output_plots_dir: str,
suffix: str,
plot_title_suffix: str,
prefix: str = "",
):
"""Save benchmark results and generate plots"""
# Save results
results_data = {
"individual_runs": all_results,
"summary": summary,
"timestamp": time.strftime("%Y-%m-%d %H:%M:%S"),
}
save_json_results(
results_data,
os.path.join(output_data_dir, f"{prefix}first_token_benchmark{suffix}.json"),
)
# Create DataFrame for plotting
df = pd.DataFrame(all_results)
# Create plots
plot_correlation(
df,
"target_tokens",
"time_to_first_chunk",
f"Time to First Audio vs Input Size {plot_title_suffix}",
"Number of Input Tokens",
"Time to First Audio (seconds)",
os.path.join(output_plots_dir, f"{prefix}first_token_latency{suffix}.png"),
)
plot_correlation(
df,
"target_tokens",
"total_time",
f"Total Time vs Input Size {plot_title_suffix}",
"Number of Input Tokens",
"Total Time (seconds)",
os.path.join(output_plots_dir, f"{prefix}total_time_latency{suffix}.png"),
)
plot_timeline(
df,
os.path.join(output_plots_dir, f"{prefix}first_token_timeline{suffix}.png"),
suffix=plot_title_suffix,
)
def run_benchmark(
measure_func: Callable,
output_dir: str,
output_data_dir: str,
output_plots_dir: str,
suffix: str = "",
plot_title_suffix: str = "",
num_runs: int = 5,
client=None,
prefix="",
):
"""Run benchmark with the given measurement function"""
# Create output directories
os.makedirs(output_dir, exist_ok=True)
os.makedirs(output_data_dir, exist_ok=True)
os.makedirs(output_plots_dir, exist_ok=True)
# Load sample text
script_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
with open(
os.path.join(script_dir, "the_time_machine_hg_wells.txt"), "r", encoding="utf-8"
) as f:
text = f.read()
# Test specific token counts
token_sizes = [10, 50, 100, 250, 500]
all_results = []
silent_files = []
for tokens in token_sizes:
print(
f"\nTesting {tokens} tokens{' ' + plot_title_suffix if plot_title_suffix else ''}"
)
test_text = get_text_for_tokens(text, tokens)
actual_tokens = len(enc.encode(test_text))
print(f"Text preview: {test_text[:50]}...")
for i in range(num_runs):
print(f"Run {i+1}/{num_runs}...")
result = measure_func(test_text, output_dir, tokens, i + 1)
result["target_tokens"] = tokens
result["actual_tokens"] = actual_tokens
result["run_number"] = i + 1
            # Handle time to first audio
            first_chunk = result.get("time_to_first_chunk")
            print(
                f"Time to First Audio: {f'{first_chunk:.3f}s' if first_chunk is not None else 'N/A'}"
            )

            # Handle total time
            total_time = result.get("total_time")
            print(
                f"Time to Save Complete: {f'{total_time:.3f}s' if total_time is not None else 'N/A'}"
            )

            # Handle audio length
            audio_length = result.get("audio_length")
            print(
                f"Audio length: {f'{audio_length:.3f}s' if audio_length is not None else 'N/A'}"
            )
# Calculate streaming overhead only if both values exist
if total_time is not None and first_chunk is not None:
print(f"Streaming overhead: {(total_time - first_chunk):.3f}s")
else:
print("Streaming overhead: N/A")
if result["error"]:
print(f"Error: {result['error']}")
elif result["audio_path"] and check_audio_silence(result["audio_path"]):
silent_files.append(result["audio_path"])
all_results.append(result)
# Process and save results
summary = process_benchmark_results(all_results, token_sizes)
save_benchmark_results(
all_results,
summary,
output_data_dir,
output_plots_dir,
suffix,
plot_title_suffix,
)
# Print paths
print("\nResults and plots saved to:")
print(f"- {os.path.join(output_data_dir, f'{prefix}first_token_benchmark{suffix}.json')}")
print(f"- {os.path.join(output_plots_dir, f'{prefix}first_token_latency{suffix}.png')}")
print(f"- {os.path.join(output_plots_dir, f'{prefix}total_time_latency{suffix}.png')}")
print(f"- {os.path.join(output_plots_dir, f'{prefix}first_token_timeline{suffix}.png')}")
# Print silence check summary
if silent_files:
print("\nWARNING: The following files contain only silence:")
for file in silent_files:
print(f"- {file}")
else:
print("\nAll generated audio files contain valid audio content.")

View file

@@ -1,111 +0,0 @@
{
"results": [
{
"tokens": 100,
"processing_time": 18.833295583724976,
"output_length": 31.15,
"realtime_factor": 1.6539856161403135,
"elapsed_time": 19.024322748184204
},
{
"tokens": 200,
"processing_time": 38.95506024360657,
"output_length": 62.6,
"realtime_factor": 1.6069799304257042,
"elapsed_time": 58.21527123451233
},
{
"tokens": 300,
"processing_time": 49.74252939224243,
"output_length": 96.325,
"realtime_factor": 1.9364716908630366,
"elapsed_time": 108.19673728942871
},
{
"tokens": 400,
"processing_time": 61.349056243896484,
"output_length": 128.575,
"realtime_factor": 2.095794261102292,
"elapsed_time": 169.733656167984
},
{
"tokens": 500,
"processing_time": 82.86568236351013,
"output_length": 158.575,
"realtime_factor": 1.9136389815071193,
"elapsed_time": 252.7968451976776
}
],
"system_metrics": [
{
"timestamp": "2025-01-03T00:13:49.865330",
"cpu_percent": 8.0,
"ram_percent": 39.4,
"ram_used_gb": 25.03811264038086,
"gpu_memory_used": 1204.0
},
{
"timestamp": "2025-01-03T00:14:08.781551",
"cpu_percent": 26.8,
"ram_percent": 42.6,
"ram_used_gb": 27.090862274169922,
"gpu_memory_used": 1225.0
},
{
"timestamp": "2025-01-03T00:14:08.916973",
"cpu_percent": 16.1,
"ram_percent": 42.6,
"ram_used_gb": 27.089553833007812,
"gpu_memory_used": 1225.0
},
{
"timestamp": "2025-01-03T00:14:47.979053",
"cpu_percent": 31.5,
"ram_percent": 43.6,
"ram_used_gb": 27.714427947998047,
"gpu_memory_used": 1225.0
},
{
"timestamp": "2025-01-03T00:14:48.098976",
"cpu_percent": 20.0,
"ram_percent": 43.6,
"ram_used_gb": 27.704315185546875,
"gpu_memory_used": 1211.0
},
{
"timestamp": "2025-01-03T00:15:37.944729",
"cpu_percent": 29.7,
"ram_percent": 38.6,
"ram_used_gb": 24.53925323486328,
"gpu_memory_used": 1217.0
},
{
"timestamp": "2025-01-03T00:15:38.071915",
"cpu_percent": 8.6,
"ram_percent": 38.5,
"ram_used_gb": 24.51690673828125,
"gpu_memory_used": 1208.0
},
{
"timestamp": "2025-01-03T00:16:39.525449",
"cpu_percent": 23.4,
"ram_percent": 38.8,
"ram_used_gb": 24.71230697631836,
"gpu_memory_used": 1221.0
},
{
"timestamp": "2025-01-03T00:16:39.612442",
"cpu_percent": 5.5,
"ram_percent": 38.9,
"ram_used_gb": 24.72066879272461,
"gpu_memory_used": 1221.0
},
{
"timestamp": "2025-01-03T00:18:02.569076",
"cpu_percent": 27.4,
"ram_percent": 39.1,
"ram_used_gb": 24.868202209472656,
"gpu_memory_used": 1264.0
}
]
}

View file

@@ -1,216 +0,0 @@
{
"results": [
{
"tokens": 100,
"processing_time": 14.349808931350708,
"output_length": 31.15,
"rtf": 0.46,
"elapsed_time": 14.716031074523926
},
{
"tokens": 200,
"processing_time": 28.341803312301636,
"output_length": 62.6,
"rtf": 0.45,
"elapsed_time": 43.44207406044006
},
{
"tokens": 300,
"processing_time": 43.352553606033325,
"output_length": 96.325,
"rtf": 0.45,
"elapsed_time": 87.26906609535217
},
{
"tokens": 400,
"processing_time": 71.02449822425842,
"output_length": 128.575,
"rtf": 0.55,
"elapsed_time": 158.7198133468628
},
{
"tokens": 500,
"processing_time": 70.92521691322327,
"output_length": 158.575,
"rtf": 0.45,
"elapsed_time": 230.01379895210266
},
{
"tokens": 600,
"processing_time": 83.6328592300415,
"output_length": 189.25,
"rtf": 0.44,
"elapsed_time": 314.02610969543457
},
{
"tokens": 700,
"processing_time": 103.0810194015503,
"output_length": 222.075,
"rtf": 0.46,
"elapsed_time": 417.5678551197052
},
{
"tokens": 800,
"processing_time": 127.02162909507751,
"output_length": 253.85,
"rtf": 0.5,
"elapsed_time": 545.0128681659698
},
{
"tokens": 900,
"processing_time": 130.49781227111816,
"output_length": 283.775,
"rtf": 0.46,
"elapsed_time": 675.8943417072296
},
{
"tokens": 1000,
"processing_time": 154.76425909996033,
"output_length": 315.475,
"rtf": 0.49,
"elapsed_time": 831.0677945613861
}
],
"system_metrics": [
{
"timestamp": "2025-01-03T00:23:52.896889",
"cpu_percent": 4.5,
"ram_percent": 39.1,
"ram_used_gb": 24.86032485961914,
"gpu_memory_used": 1281.0
},
{
"timestamp": "2025-01-03T00:24:07.429461",
"cpu_percent": 4.5,
"ram_percent": 39.1,
"ram_used_gb": 24.847564697265625,
"gpu_memory_used": 1285.0
},
{
"timestamp": "2025-01-03T00:24:07.620587",
"cpu_percent": 2.7,
"ram_percent": 39.1,
"ram_used_gb": 24.846607208251953,
"gpu_memory_used": 1275.0
},
{
"timestamp": "2025-01-03T00:24:36.140754",
"cpu_percent": 5.4,
"ram_percent": 39.1,
"ram_used_gb": 24.857810974121094,
"gpu_memory_used": 1267.0
},
{
"timestamp": "2025-01-03T00:24:36.340675",
"cpu_percent": 6.2,
"ram_percent": 39.1,
"ram_used_gb": 24.85773468017578,
"gpu_memory_used": 1267.0
},
{
"timestamp": "2025-01-03T00:25:19.905634",
"cpu_percent": 29.1,
"ram_percent": 39.2,
"ram_used_gb": 24.920318603515625,
"gpu_memory_used": 1256.0
},
{
"timestamp": "2025-01-03T00:25:20.182219",
"cpu_percent": 20.0,
"ram_percent": 39.2,
"ram_used_gb": 24.930198669433594,
"gpu_memory_used": 1256.0
},
{
"timestamp": "2025-01-03T00:26:31.414760",
"cpu_percent": 5.3,
"ram_percent": 39.5,
"ram_used_gb": 25.127891540527344,
"gpu_memory_used": 1259.0
},
{
"timestamp": "2025-01-03T00:26:31.617256",
"cpu_percent": 3.6,
"ram_percent": 39.5,
"ram_used_gb": 25.126346588134766,
"gpu_memory_used": 1252.0
},
{
"timestamp": "2025-01-03T00:27:42.736097",
"cpu_percent": 10.5,
"ram_percent": 39.5,
"ram_used_gb": 25.100231170654297,
"gpu_memory_used": 1249.0
},
{
"timestamp": "2025-01-03T00:27:42.912870",
"cpu_percent": 5.3,
"ram_percent": 39.5,
"ram_used_gb": 25.098285675048828,
"gpu_memory_used": 1249.0
},
{
"timestamp": "2025-01-03T00:29:06.725264",
"cpu_percent": 8.9,
"ram_percent": 39.5,
"ram_used_gb": 25.123123168945312,
"gpu_memory_used": 1239.0
},
{
"timestamp": "2025-01-03T00:29:06.928826",
"cpu_percent": 5.5,
"ram_percent": 39.5,
"ram_used_gb": 25.128646850585938,
"gpu_memory_used": 1239.0
},
{
"timestamp": "2025-01-03T00:30:50.206349",
"cpu_percent": 49.6,
"ram_percent": 39.6,
"ram_used_gb": 25.162948608398438,
"gpu_memory_used": 1245.0
},
{
"timestamp": "2025-01-03T00:30:50.491837",
"cpu_percent": 14.8,
"ram_percent": 39.5,
"ram_used_gb": 25.13379669189453,
"gpu_memory_used": 1245.0
},
{
"timestamp": "2025-01-03T00:32:57.721467",
"cpu_percent": 6.2,
"ram_percent": 39.6,
"ram_used_gb": 25.187721252441406,
"gpu_memory_used": 1384.0
},
{
"timestamp": "2025-01-03T00:32:57.913350",
"cpu_percent": 3.6,
"ram_percent": 39.6,
"ram_used_gb": 25.199390411376953,
"gpu_memory_used": 1384.0
},
{
"timestamp": "2025-01-03T00:35:08.608730",
"cpu_percent": 6.3,
"ram_percent": 39.8,
"ram_used_gb": 25.311710357666016,
"gpu_memory_used": 1330.0
},
{
"timestamp": "2025-01-03T00:35:08.791851",
"cpu_percent": 5.3,
"ram_percent": 39.8,
"ram_used_gb": 25.326683044433594,
"gpu_memory_used": 1333.0
},
{
"timestamp": "2025-01-03T00:37:43.782406",
"cpu_percent": 6.8,
"ram_percent": 40.6,
"ram_used_gb": 25.803058624267578,
"gpu_memory_used": 1409.0
}
]
}

View file

@@ -1,300 +0,0 @@
{
"results": [
{
"tokens": 100,
"processing_time": 0.96,
"output_length": 31.1,
"rtf": 0.03,
"elapsed_time": 1.11
},
{
"tokens": 250,
"processing_time": 2.23,
"output_length": 77.17,
"rtf": 0.03,
"elapsed_time": 3.49
},
{
"tokens": 400,
"processing_time": 4.05,
"output_length": 128.05,
"rtf": 0.03,
"elapsed_time": 7.77
},
{
"tokens": 550,
"processing_time": 4.06,
"output_length": 171.45,
"rtf": 0.02,
"elapsed_time": 12.0
},
{
"tokens": 700,
"processing_time": 6.01,
"output_length": 221.6,
"rtf": 0.03,
"elapsed_time": 18.16
},
{
"tokens": 850,
"processing_time": 6.9,
"output_length": 269.1,
"rtf": 0.03,
"elapsed_time": 25.21
},
{
"tokens": 1000,
"processing_time": 7.65,
"output_length": 315.05,
"rtf": 0.02,
"elapsed_time": 33.03
},
{
"tokens": 6000,
"processing_time": 48.7,
"output_length": 1837.1,
"rtf": 0.03,
"elapsed_time": 82.21
},
{
"tokens": 11000,
"processing_time": 92.44,
"output_length": 3388.57,
"rtf": 0.03,
"elapsed_time": 175.46
},
{
"tokens": 16000,
"processing_time": 163.61,
"output_length": 4977.32,
"rtf": 0.03,
"elapsed_time": 340.46
},
{
"tokens": 21000,
"processing_time": 209.72,
"output_length": 6533.3,
"rtf": 0.03,
"elapsed_time": 551.92
},
{
"tokens": 26000,
"processing_time": 329.35,
"output_length": 8068.15,
"rtf": 0.04,
"elapsed_time": 883.37
},
{
"tokens": 31000,
"processing_time": 473.52,
"output_length": 9611.48,
"rtf": 0.05,
"elapsed_time": 1359.28
},
{
"tokens": 36000,
"processing_time": 650.98,
"output_length": 11157.15,
"rtf": 0.06,
"elapsed_time": 2012.9
}
],
"system_metrics": [
{
"timestamp": "2025-01-03T14:41:01.331735",
"cpu_percent": 7.5,
"ram_percent": 50.2,
"ram_used_gb": 31.960269927978516,
"gpu_memory_used": 3191.0
},
{
"timestamp": "2025-01-03T14:41:02.357116",
"cpu_percent": 17.01,
"ram_percent": 50.2,
"ram_used_gb": 31.96163558959961,
"gpu_memory_used": 3426.0
},
{
"timestamp": "2025-01-03T14:41:02.445009",
"cpu_percent": 9.5,
"ram_percent": 50.3,
"ram_used_gb": 31.966781616210938,
"gpu_memory_used": 3426.0
},
{
"timestamp": "2025-01-03T14:41:04.742152",
"cpu_percent": 18.27,
"ram_percent": 50.4,
"ram_used_gb": 32.08788299560547,
"gpu_memory_used": 3642.0
},
{
"timestamp": "2025-01-03T14:41:04.847795",
"cpu_percent": 16.27,
"ram_percent": 50.5,
"ram_used_gb": 32.094364166259766,
"gpu_memory_used": 3640.0
},
{
"timestamp": "2025-01-03T14:41:09.019590",
"cpu_percent": 15.97,
"ram_percent": 50.7,
"ram_used_gb": 32.23244094848633,
"gpu_memory_used": 3640.0
},
{
"timestamp": "2025-01-03T14:41:09.110324",
"cpu_percent": 3.54,
"ram_percent": 50.7,
"ram_used_gb": 32.234458923339844,
"gpu_memory_used": 3640.0
},
{
"timestamp": "2025-01-03T14:41:13.252607",
"cpu_percent": 13.4,
"ram_percent": 50.6,
"ram_used_gb": 32.194271087646484,
"gpu_memory_used": 3935.0
},
{
"timestamp": "2025-01-03T14:41:13.327557",
"cpu_percent": 4.69,
"ram_percent": 50.6,
"ram_used_gb": 32.191776275634766,
"gpu_memory_used": 3935.0
},
{
"timestamp": "2025-01-03T14:41:19.413633",
"cpu_percent": 12.92,
"ram_percent": 50.9,
"ram_used_gb": 32.3467903137207,
"gpu_memory_used": 4250.0
},
{
"timestamp": "2025-01-03T14:41:19.492758",
"cpu_percent": 7.5,
"ram_percent": 50.8,
"ram_used_gb": 32.34375,
"gpu_memory_used": 4250.0
},
{
"timestamp": "2025-01-03T14:41:26.467284",
"cpu_percent": 13.09,
"ram_percent": 51.2,
"ram_used_gb": 32.56281280517578,
"gpu_memory_used": 4249.0
},
{
"timestamp": "2025-01-03T14:41:26.553559",
"cpu_percent": 8.39,
"ram_percent": 51.2,
"ram_used_gb": 32.56183624267578,
"gpu_memory_used": 4249.0
},
{
"timestamp": "2025-01-03T14:41:34.284362",
"cpu_percent": 12.61,
"ram_percent": 51.7,
"ram_used_gb": 32.874778747558594,
"gpu_memory_used": 4250.0
},
{
"timestamp": "2025-01-03T14:41:34.362353",
"cpu_percent": 1.25,
"ram_percent": 51.7,
"ram_used_gb": 32.87461471557617,
"gpu_memory_used": 4250.0
},
{
"timestamp": "2025-01-03T14:42:23.471312",
"cpu_percent": 11.64,
"ram_percent": 54.9,
"ram_used_gb": 34.90264129638672,
"gpu_memory_used": 4647.0
},
{
"timestamp": "2025-01-03T14:42:23.547203",
"cpu_percent": 5.31,
"ram_percent": 54.9,
"ram_used_gb": 34.91563415527344,
"gpu_memory_used": 4647.0
},
{
"timestamp": "2025-01-03T14:43:56.724933",
"cpu_percent": 12.97,
"ram_percent": 59.5,
"ram_used_gb": 37.84241485595703,
"gpu_memory_used": 4655.0
},
{
"timestamp": "2025-01-03T14:43:56.815453",
"cpu_percent": 11.75,
"ram_percent": 59.5,
"ram_used_gb": 37.832679748535156,
"gpu_memory_used": 4655.0
},
{
"timestamp": "2025-01-03T14:46:41.705155",
"cpu_percent": 12.94,
"ram_percent": 66.3,
"ram_used_gb": 42.1534538269043,
"gpu_memory_used": 4729.0
},
{
"timestamp": "2025-01-03T14:46:41.835177",
"cpu_percent": 7.73,
"ram_percent": 66.2,
"ram_used_gb": 42.13554000854492,
"gpu_memory_used": 4729.0
},
{
"timestamp": "2025-01-03T14:50:13.166236",
"cpu_percent": 11.62,
"ram_percent": 73.4,
"ram_used_gb": 46.71288299560547,
"gpu_memory_used": 4676.0
},
{
"timestamp": "2025-01-03T14:50:13.261611",
"cpu_percent": 8.16,
"ram_percent": 73.4,
"ram_used_gb": 46.71356201171875,
"gpu_memory_used": 4676.0
},
{
"timestamp": "2025-01-03T14:55:44.623607",
"cpu_percent": 12.92,
"ram_percent": 82.8,
"ram_used_gb": 52.65533447265625,
"gpu_memory_used": 4636.0
},
{
"timestamp": "2025-01-03T14:55:44.735410",
"cpu_percent": 15.29,
"ram_percent": 82.7,
"ram_used_gb": 52.63290786743164,
"gpu_memory_used": 4636.0
},
{
"timestamp": "2025-01-03T15:03:40.534449",
"cpu_percent": 13.88,
"ram_percent": 85.0,
"ram_used_gb": 54.050071716308594,
"gpu_memory_used": 4771.0
},
{
"timestamp": "2025-01-03T15:03:40.638708",
"cpu_percent": 12.21,
"ram_percent": 85.0,
"ram_used_gb": 54.053733825683594,
"gpu_memory_used": 4771.0
},
{
"timestamp": "2025-01-03T15:14:34.159142",
"cpu_percent": 14.51,
"ram_percent": 78.1,
"ram_used_gb": 49.70396423339844,
"gpu_memory_used": 4739.0
}
]
}

View file

@@ -1,19 +0,0 @@
=== Benchmark Statistics (with correct RTF) ===
Overall Stats:
Total tokens processed: 5500
Total audio generated: 1741.65s
Total test duration: 831.07s
Average processing rate: 6.72 tokens/second
Average RTF: 0.47x
Per-chunk Stats:
Average chunk size: 550.00 tokens
Min chunk size: 100.00 tokens
Max chunk size: 1000.00 tokens
Average processing time: 82.70s
Average output length: 174.17s
Performance Ranges:
Processing rate range: 5.63 - 7.17 tokens/second
RTF range: 0.44x - 0.55x

View file

@@ -1,9 +0,0 @@
=== Benchmark Statistics (with correct RTF) ===
Overall Stats:
Total tokens processed: 150850
Total audio generated: 46786.59s
Total test duration: 2012.90s
Average processing rate: 104.34 tokens/second
Average RTF: 0.03x

View file

@@ -1,23 +0,0 @@
=== Benchmark Statistics (with correct RTF) ===
Total tokens processed: 1800
Total audio generated (s): 568.53
Total test duration (s): 244.10
Average processing rate (tokens/s): 7.34
Average RTF: 0.43
Average Real Time Speed: 2.33
=== Per-chunk Stats ===
Average chunk size (tokens): 600.00
Min chunk size (tokens): 300
Max chunk size (tokens): 900
Average processing time (s): 81.30
Average output length (s): 189.51
=== Performance Ranges ===
Processing rate range (tokens/s): 7.21 - 7.47
RTF range: 0.43x - 0.43x
Real Time Speed range: 2.33x - 2.33x

View file

@@ -1,403 +0,0 @@
{
"individual_runs": [
{
"text_length": 37,
"token_count": 10,
"total_time": 0.16574740409851074,
"time_to_first_chunk": 0.16574740409851074,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio\\benchmark_tokens10_run1.wav",
"audio_length": 3.45,
"target_tokens": 10,
"actual_tokens": 10,
"run_number": 1
},
{
"text_length": 37,
"token_count": 10,
"total_time": 0.18812799453735352,
"time_to_first_chunk": 0.18812799453735352,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio\\benchmark_tokens10_run2.wav",
"audio_length": 3.45,
"target_tokens": 10,
"actual_tokens": 10,
"run_number": 2
},
{
"text_length": 37,
"token_count": 10,
"total_time": 0.18645429611206055,
"time_to_first_chunk": 0.18645429611206055,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio\\benchmark_tokens10_run3.wav",
"audio_length": 3.45,
"target_tokens": 10,
"actual_tokens": 10,
"run_number": 3
},
{
"text_length": 37,
"token_count": 10,
"total_time": 0.17632031440734863,
"time_to_first_chunk": 0.17632031440734863,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio\\benchmark_tokens10_run4.wav",
"audio_length": 3.45,
"target_tokens": 10,
"actual_tokens": 10,
"run_number": 4
},
{
"text_length": 37,
"token_count": 10,
"total_time": 0.13381195068359375,
"time_to_first_chunk": 0.13381195068359375,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio\\benchmark_tokens10_run5.wav",
"audio_length": 3.45,
"target_tokens": 10,
"actual_tokens": 10,
"run_number": 5
},
{
"text_length": 102,
"token_count": 25,
"total_time": 0.2086498737335205,
"time_to_first_chunk": 0.2086498737335205,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio\\benchmark_tokens25_run1.wav",
"audio_length": 7.225,
"target_tokens": 25,
"actual_tokens": 25,
"run_number": 1
},
{
"text_length": 102,
"token_count": 25,
"total_time": 0.2727653980255127,
"time_to_first_chunk": 0.2727653980255127,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio\\benchmark_tokens25_run2.wav",
"audio_length": 7.225,
"target_tokens": 25,
"actual_tokens": 25,
"run_number": 2
},
{
"text_length": 102,
"token_count": 25,
"total_time": 0.2096250057220459,
"time_to_first_chunk": 0.2096250057220459,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio\\benchmark_tokens25_run3.wav",
"audio_length": 7.225,
"target_tokens": 25,
"actual_tokens": 25,
"run_number": 3
},
{
"text_length": 102,
"token_count": 25,
"total_time": 0.2256758213043213,
"time_to_first_chunk": 0.2256758213043213,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio\\benchmark_tokens25_run4.wav",
"audio_length": 7.225,
"target_tokens": 25,
"actual_tokens": 25,
"run_number": 4
},
{
"text_length": 102,
"token_count": 25,
"total_time": 0.1945042610168457,
"time_to_first_chunk": 0.1945042610168457,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio\\benchmark_tokens25_run5.wav",
"audio_length": 7.225,
"target_tokens": 25,
"actual_tokens": 25,
"run_number": 5
},
{
"text_length": 212,
"token_count": 50,
"total_time": 0.4975121021270752,
"time_to_first_chunk": 0.4975121021270752,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio\\benchmark_tokens50_run1.wav",
"audio_length": 16.325,
"target_tokens": 50,
"actual_tokens": 50,
"run_number": 1
},
{
"text_length": 212,
"token_count": 50,
"total_time": 0.4518404006958008,
"time_to_first_chunk": 0.4518404006958008,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio\\benchmark_tokens50_run2.wav",
"audio_length": 16.325,
"target_tokens": 50,
"actual_tokens": 50,
"run_number": 2
},
{
"text_length": 212,
"token_count": 50,
"total_time": 0.5640325546264648,
"time_to_first_chunk": 0.5640325546264648,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio\\benchmark_tokens50_run3.wav",
"audio_length": 16.325,
"target_tokens": 50,
"actual_tokens": 50,
"run_number": 3
},
{
"text_length": 212,
"token_count": 50,
"total_time": 0.5305957794189453,
"time_to_first_chunk": 0.5305957794189453,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio\\benchmark_tokens50_run4.wav",
"audio_length": 16.325,
"target_tokens": 50,
"actual_tokens": 50,
"run_number": 4
},
{
"text_length": 212,
"token_count": 50,
"total_time": 0.5540030002593994,
"time_to_first_chunk": 0.5540030002593994,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio\\benchmark_tokens50_run5.wav",
"audio_length": 16.325,
"target_tokens": 50,
"actual_tokens": 50,
"run_number": 5
},
{
"text_length": 448,
"token_count": 100,
"total_time": 0.7963137626647949,
"time_to_first_chunk": 0.7963137626647949,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio\\benchmark_tokens100_run1.wav",
"audio_length": 31.1,
"target_tokens": 100,
"actual_tokens": 100,
"run_number": 1
},
{
"text_length": 448,
"token_count": 100,
"total_time": 0.9320805072784424,
"time_to_first_chunk": 0.9320805072784424,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio\\benchmark_tokens100_run2.wav",
"audio_length": 31.1,
"target_tokens": 100,
"actual_tokens": 100,
"run_number": 2
},
{
"text_length": 448,
"token_count": 100,
"total_time": 0.824256181716919,
"time_to_first_chunk": 0.824256181716919,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio\\benchmark_tokens100_run3.wav",
"audio_length": 31.1,
"target_tokens": 100,
"actual_tokens": 100,
"run_number": 3
},
{
"text_length": 448,
"token_count": 100,
"total_time": 0.9034836292266846,
"time_to_first_chunk": 0.9034836292266846,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio\\benchmark_tokens100_run4.wav",
"audio_length": 31.1,
"target_tokens": 100,
"actual_tokens": 100,
"run_number": 4
},
{
"text_length": 448,
"token_count": 100,
"total_time": 0.8364357948303223,
"time_to_first_chunk": 0.8364357948303223,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio\\benchmark_tokens100_run5.wav",
"audio_length": 31.1,
"target_tokens": 100,
"actual_tokens": 100,
"run_number": 5
},
{
"text_length": 906,
"token_count": 200,
"total_time": 1.8122682571411133,
"time_to_first_chunk": 1.8122682571411133,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio\\benchmark_tokens200_run1.wav",
"audio_length": 62.625,
"target_tokens": 200,
"actual_tokens": 200,
"run_number": 1
},
{
"text_length": 906,
"token_count": 200,
"total_time": 1.7290427684783936,
"time_to_first_chunk": 1.7290427684783936,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio\\benchmark_tokens200_run2.wav",
"audio_length": 62.625,
"target_tokens": 200,
"actual_tokens": 200,
"run_number": 2
},
{
"text_length": 906,
"token_count": 200,
"total_time": 2.141728401184082,
"time_to_first_chunk": 2.141728401184082,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio\\benchmark_tokens200_run3.wav",
"audio_length": 62.625,
"target_tokens": 200,
"actual_tokens": 200,
"run_number": 3
},
{
"text_length": 906,
"token_count": 200,
"total_time": 2.0155680179595947,
"time_to_first_chunk": 2.0155680179595947,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio\\benchmark_tokens200_run4.wav",
"audio_length": 62.625,
"target_tokens": 200,
"actual_tokens": 200,
"run_number": 4
},
{
"text_length": 906,
"token_count": 200,
"total_time": 1.8707575798034668,
"time_to_first_chunk": 1.8707575798034668,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio\\benchmark_tokens200_run5.wav",
"audio_length": 62.625,
"target_tokens": 200,
"actual_tokens": 200,
"run_number": 5
},
{
"text_length": 2232,
"token_count": 500,
"total_time": 4.822713851928711,
"time_to_first_chunk": 4.822713851928711,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio\\benchmark_tokens500_run1.wav",
"audio_length": 157.875,
"target_tokens": 500,
"actual_tokens": 500,
"run_number": 1
},
{
"text_length": 2232,
"token_count": 500,
"total_time": 4.227782726287842,
"time_to_first_chunk": 4.227782726287842,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio\\benchmark_tokens500_run2.wav",
"audio_length": 157.875,
"target_tokens": 500,
"actual_tokens": 500,
"run_number": 2
},
{
"text_length": 2232,
"token_count": 500,
"total_time": 4.414916276931763,
"time_to_first_chunk": 4.414916276931763,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio\\benchmark_tokens500_run3.wav",
"audio_length": 157.875,
"target_tokens": 500,
"actual_tokens": 500,
"run_number": 3
},
{
"text_length": 2232,
"token_count": 500,
"total_time": 4.579505681991577,
"time_to_first_chunk": 4.579505681991577,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio\\benchmark_tokens500_run4.wav",
"audio_length": 157.875,
"target_tokens": 500,
"actual_tokens": 500,
"run_number": 4
},
{
"text_length": 2232,
"token_count": 500,
"total_time": 4.332529067993164,
"time_to_first_chunk": 4.332529067993164,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio\\benchmark_tokens500_run5.wav",
"audio_length": 157.875,
"target_tokens": 500,
"actual_tokens": 500,
"run_number": 5
}
],
"summary": {
"10": {
"avg_time_to_first_chunk": 0.17,
"avg_total_time": 0.17,
"avg_audio_length": 3.45,
"num_successful_runs": 5
},
"25": {
"avg_time_to_first_chunk": 0.222,
"avg_total_time": 0.222,
"avg_audio_length": 7.225,
"num_successful_runs": 5
},
"50": {
"avg_time_to_first_chunk": 0.52,
"avg_total_time": 0.52,
"avg_audio_length": 16.325,
"num_successful_runs": 5
},
"100": {
"avg_time_to_first_chunk": 0.859,
"avg_total_time": 0.859,
"avg_audio_length": 31.1,
"num_successful_runs": 5
},
"200": {
"avg_time_to_first_chunk": 1.914,
"avg_total_time": 1.914,
"avg_audio_length": 62.625,
"num_successful_runs": 5
},
"500": {
"avg_time_to_first_chunk": 4.475,
"avg_total_time": 4.475,
"avg_audio_length": 157.875,
"num_successful_runs": 5
}
},
"timestamp": "2025-01-04 13:52:28"
}

View file

@ -1,271 +1,337 @@
{ {
"individual_runs": [ "individual_runs": [
{ {
"text_length": 212, "text_length": 37,
"token_count": 50, "token_count": null,
"total_time": 0.7278211116790771, "total_time": 0.4376556873321533,
"time_to_first_chunk": 0.3613290786743164, "time_to_first_chunk": 0.4189143180847168,
"error": null, "error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens50_run1_stream.wav", "audio_path": "C:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens10_run1_stream.wav",
"audio_length": 16.325, "audio_length": 3.45,
"target_tokens": 10,
"actual_tokens": 10,
"run_number": 1
},
{
"text_length": 37,
"token_count": null,
"total_time": 0.37163758277893066,
"time_to_first_chunk": 0.34892702102661133,
"error": null,
"audio_path": "C:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens10_run2_stream.wav",
"audio_length": 3.45,
"target_tokens": 10,
"actual_tokens": 10,
"run_number": 2
},
{
"text_length": 37,
"token_count": null,
"total_time": 0.2654602527618408,
"time_to_first_chunk": 0.2409076690673828,
"error": null,
"audio_path": "C:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens10_run3_stream.wav",
"audio_length": 3.45,
"target_tokens": 10,
"actual_tokens": 10,
"run_number": 3
},
{
"text_length": 37,
"token_count": null,
"total_time": 0.24376440048217773,
"time_to_first_chunk": 0.23003816604614258,
"error": null,
"audio_path": "C:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens10_run4_stream.wav",
"audio_length": 3.45,
"target_tokens": 10,
"actual_tokens": 10,
"run_number": 4
},
{
"text_length": 37,
"token_count": null,
"total_time": 0.25968003273010254,
"time_to_first_chunk": 0.24081206321716309,
"error": null,
"audio_path": "C:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens10_run5_stream.wav",
"audio_length": 3.45,
"target_tokens": 10,
"actual_tokens": 10,
"run_number": 5
},
{
"text_length": 212,
"token_count": null,
"total_time": 1.049060344696045,
"time_to_first_chunk": 0.3336215019226074,
"error": null,
"audio_path": "C:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens50_run1_stream.wav",
"audio_length": 15.925,
"target_tokens": 50, "target_tokens": 50,
"actual_tokens": 50, "actual_tokens": 50,
"run_number": 1 "run_number": 1
}, },
{ {
"text_length": 212, "text_length": 212,
"token_count": 50, "token_count": null,
"total_time": 0.4556088447570801, "total_time": 0.8934676647186279,
"time_to_first_chunk": 0.18642044067382812, "time_to_first_chunk": 0.3011031150817871,
"error": null, "error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens50_run2_stream.wav", "audio_path": "C:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens50_run2_stream.wav",
"audio_length": 16.325, "audio_length": 15.925,
"target_tokens": 50, "target_tokens": 50,
"actual_tokens": 50, "actual_tokens": 50,
"run_number": 2 "run_number": 2
}, },
{ {
"text_length": 212, "text_length": 212,
"token_count": 50, "token_count": null,
"total_time": 0.5538768768310547, "total_time": 0.9444286823272705,
"time_to_first_chunk": 0.2720797061920166, "time_to_first_chunk": 0.3198091983795166,
"error": null, "error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens50_run3_stream.wav", "audio_path": "C:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens50_run3_stream.wav",
"audio_length": 16.325, "audio_length": 15.925,
"target_tokens": 50, "target_tokens": 50,
"actual_tokens": 50, "actual_tokens": 50,
"run_number": 3 "run_number": 3
}, },
{ {
"text_length": 212, "text_length": 212,
"token_count": 50, "token_count": null,
"total_time": 0.4395604133605957, "total_time": 0.9735183715820312,
"time_to_first_chunk": 0.15613913536071777, "time_to_first_chunk": 0.369948148727417,
"error": null, "error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens50_run4_stream.wav", "audio_path": "C:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens50_run4_stream.wav",
"audio_length": 16.325, "audio_length": 15.925,
"target_tokens": 50, "target_tokens": 50,
"actual_tokens": 50, "actual_tokens": 50,
"run_number": 4 "run_number": 4
}, },
{ {
"text_length": 212, "text_length": 212,
"token_count": 50, "token_count": null,
"total_time": 0.45748305320739746, "total_time": 0.8089118003845215,
"time_to_first_chunk": 0.18805718421936035, "time_to_first_chunk": 0.30179858207702637,
"error": null, "error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens50_run5_stream.wav", "audio_path": "C:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens50_run5_stream.wav",
"audio_length": 16.325, "audio_length": 15.925,
"target_tokens": 50, "target_tokens": 50,
"actual_tokens": 50, "actual_tokens": 50,
"run_number": 5 "run_number": 5
}, },
{ {
"text_length": 448, "text_length": 448,
"token_count": 100, "token_count": null,
"total_time": 0.7347762584686279, "total_time": 1.641003131866455,
"time_to_first_chunk": 0.16963744163513184, "time_to_first_chunk": 0.2979745864868164,
"error": null, "error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens100_run1_stream.wav", "audio_path": "C:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens100_run1_stream.wav",
"audio_length": 31.1, "audio_length": 30.5,
"target_tokens": 100, "target_tokens": 100,
"actual_tokens": 100, "actual_tokens": 100,
"run_number": 1 "run_number": 1
}, },
{ {
"text_length": 448, "text_length": 448,
"token_count": 100, "token_count": null,
"total_time": 0.8288509845733643, "total_time": 1.3709619045257568,
"time_to_first_chunk": 0.20123004913330078, "time_to_first_chunk": 0.4272146224975586,
"error": null, "error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens100_run2_stream.wav", "audio_path": "C:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens100_run2_stream.wav",
"audio_length": 31.1, "audio_length": 30.5,
"target_tokens": 100, "target_tokens": 100,
"actual_tokens": 100, "actual_tokens": 100,
"run_number": 2 "run_number": 2
}, },
{ {
"text_length": 448, "text_length": 448,
"token_count": 100, "token_count": null,
"total_time": 0.7503848075866699, "total_time": 1.2554471492767334,
"time_to_first_chunk": 0.21662068367004395, "time_to_first_chunk": 0.29790568351745605,
"error": null, "error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens100_run3_stream.wav", "audio_path": "C:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens100_run3_stream.wav",
"audio_length": 31.1, "audio_length": 30.5,
"target_tokens": 100, "target_tokens": 100,
"actual_tokens": 100, "actual_tokens": 100,
"run_number": 3 "run_number": 3
}, },
{ {
"text_length": 448, "text_length": 448,
"token_count": 100, "token_count": null,
"total_time": 0.694899320602417, "total_time": 1.3761844635009766,
"time_to_first_chunk": 0.1966841220855713, "time_to_first_chunk": 0.32633328437805176,
"error": null, "error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens100_run4_stream.wav", "audio_path": "C:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens100_run4_stream.wav",
"audio_length": 31.1, "audio_length": 30.5,
"target_tokens": 100, "target_tokens": 100,
"actual_tokens": 100, "actual_tokens": 100,
"run_number": 4 "run_number": 4
}, },
{ {
"text_length": 448, "text_length": 448,
"token_count": 100, "token_count": null,
"total_time": 0.68701171875, "total_time": 1.56705904006958,
"time_to_first_chunk": 0.19341063499450684, "time_to_first_chunk": 0.32801246643066406,
"error": null, "error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens100_run5_stream.wav", "audio_path": "C:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens100_run5_stream.wav",
"audio_length": 31.1, "audio_length": 30.5,
"target_tokens": 100, "target_tokens": 100,
"actual_tokens": 100, "actual_tokens": 100,
"run_number": 5 "run_number": 5
}, },
{ {
"text_length": 906, "text_length": 1140,
"token_count": 200, "token_count": null,
"total_time": 1.6845426559448242, "total_time": 5.086699962615967,
"time_to_first_chunk": 0.21096158027648926, "time_to_first_chunk": 0.33925390243530273,
"error": null, "error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens200_run1_stream.wav", "audio_path": "C:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens250_run1_stream.wav",
"audio_length": 62.625, "audio_length": 78.775,
"target_tokens": 200, "target_tokens": 250,
"actual_tokens": 200, "actual_tokens": 250,
"run_number": 1 "run_number": 1
}, },
{ {
"text_length": 906, "text_length": 1140,
"token_count": 200, "token_count": null,
"total_time": 1.3545098304748535, "total_time": 3.827953338623047,
"time_to_first_chunk": 0.18648386001586914, "time_to_first_chunk": 0.39266157150268555,
"error": null, "error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens200_run2_stream.wav", "audio_path": "C:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens250_run2_stream.wav",
"audio_length": 62.625, "audio_length": 78.775,
"target_tokens": 200, "target_tokens": 250,
"actual_tokens": 200, "actual_tokens": 250,
"run_number": 2 "run_number": 2
}, },
{ {
"text_length": 906, "text_length": 1140,
"token_count": 200, "token_count": null,
"total_time": 1.426060676574707, "total_time": 3.9389824867248535,
"time_to_first_chunk": 0.20081472396850586, "time_to_first_chunk": 0.3231511116027832,
"error": null, "error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens200_run3_stream.wav", "audio_path": "C:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens250_run3_stream.wav",
"audio_length": 62.625, "audio_length": 78.775,
"target_tokens": 200, "target_tokens": 250,
"actual_tokens": 200, "actual_tokens": 250,
"run_number": 3 "run_number": 3
}, },
{ {
"text_length": 906, "text_length": 1140,
"token_count": 200, "token_count": null,
"total_time": 1.4084081649780273, "total_time": 3.942399740219116,
"time_to_first_chunk": 0.18551135063171387, "time_to_first_chunk": 0.34731340408325195,
"error": null, "error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens200_run4_stream.wav", "audio_path": "C:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens250_run4_stream.wav",
"audio_length": 62.625, "audio_length": 78.775,
"target_tokens": 200, "target_tokens": 250,
"actual_tokens": 200, "actual_tokens": 250,
"run_number": 4 "run_number": 4
}, },
{ {
"text_length": 906, "text_length": 1140,
"token_count": 200, "token_count": null,
"total_time": 1.4703152179718018, "total_time": 3.7748308181762695,
"time_to_first_chunk": 0.17750859260559082, "time_to_first_chunk": 0.40787601470947266,
"error": null, "error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens200_run5_stream.wav", "audio_path": "C:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens250_run5_stream.wav",
"audio_length": 62.625, "audio_length": 78.775,
"target_tokens": 200, "target_tokens": 250,
"actual_tokens": 200, "actual_tokens": 250,
"run_number": 5 "run_number": 5
}, },
{ {
"text_length": 2232, "text_length": 2232,
"token_count": 500, "token_count": null,
"total_time": 4.289574384689331, "total_time": 9.003147840499878,
"time_to_first_chunk": 0.1997976303100586, "time_to_first_chunk": 0.5455703735351562,
"error": null, "error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens500_run1_stream.wav", "audio_path": "C:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens500_run1_stream.wav",
"audio_length": 157.875, "audio_length": 156.475,
"target_tokens": 500, "target_tokens": 500,
"actual_tokens": 500, "actual_tokens": 500,
"run_number": 1 "run_number": 1
}, },
{ {
"text_length": 2232, "text_length": 2232,
"token_count": 500, "token_count": null,
"total_time": 3.7089381217956543, "total_time": 10.081491231918335,
"time_to_first_chunk": 0.25969815254211426, "time_to_first_chunk": 0.4591703414916992,
"error": null, "error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens500_run2_stream.wav", "audio_path": "C:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens500_run2_stream.wav",
"audio_length": 157.875, "audio_length": 156.475,
"target_tokens": 500, "target_tokens": 500,
"actual_tokens": 500, "actual_tokens": 500,
"run_number": 2 "run_number": 2
}, },
{ {
"text_length": 2232, "text_length": 2232,
"token_count": 500, "token_count": null,
"total_time": 4.138366222381592, "total_time": 9.767668962478638,
"time_to_first_chunk": 0.1831505298614502, "time_to_first_chunk": 0.31237053871154785,
"error": null, "error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens500_run3_stream.wav", "audio_path": "C:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens500_run3_stream.wav",
"audio_length": 157.875, "audio_length": 156.475,
"target_tokens": 500, "target_tokens": 500,
"actual_tokens": 500, "actual_tokens": 500,
"run_number": 3 "run_number": 3
}, },
{ {
"text_length": 2232, "text_length": 2232,
"token_count": 500, "token_count": null,
"total_time": 3.980635643005371, "total_time": 9.090342998504639,
"time_to_first_chunk": 0.20493030548095703, "time_to_first_chunk": 0.41753244400024414,
"error": null, "error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens500_run4_stream.wav", "audio_path": "C:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens500_run4_stream.wav",
"audio_length": 157.875, "audio_length": 156.475,
"target_tokens": 500, "target_tokens": 500,
"actual_tokens": 500, "actual_tokens": 500,
"run_number": 4 "run_number": 4
}, },
{ {
"text_length": 2232, "text_length": 2232,
"token_count": 500, "token_count": null,
"total_time": 4.1370298862457275, "total_time": 9.876578330993652,
"time_to_first_chunk": 0.19150757789611816, "time_to_first_chunk": 0.3965120315551758,
"error": null, "error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens500_run5_stream.wav", "audio_path": "C:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens500_run5_stream.wav",
"audio_length": 157.875, "audio_length": 156.475,
"target_tokens": 500, "target_tokens": 500,
"actual_tokens": 500, "actual_tokens": 500,
"run_number": 5 "run_number": 5
} }
], ],
"summary": { "summary": {
"10": {
"avg_time_to_first_chunk": 0.296,
"avg_total_time": 0.316,
"avg_audio_length": 3.45,
"num_successful_runs": 5
},
"50": { "50": {
"avg_time_to_first_chunk": 0.233, "avg_time_to_first_chunk": 0.325,
"avg_total_time": 0.527, "avg_total_time": 0.934,
"avg_audio_length": 16.325, "avg_audio_length": 15.925,
"num_successful_runs": 5 "num_successful_runs": 5
}, },
"100": { "100": {
"avg_time_to_first_chunk": 0.196, "avg_time_to_first_chunk": 0.335,
"avg_total_time": 0.739, "avg_total_time": 1.442,
"avg_audio_length": 31.1, "avg_audio_length": 30.5,
"num_successful_runs": 5 "num_successful_runs": 5
}, },
"200": { "250": {
"avg_time_to_first_chunk": 0.192, "avg_time_to_first_chunk": 0.362,
"avg_total_time": 1.469, "avg_total_time": 4.114,
"avg_audio_length": 62.625, "avg_audio_length": 78.775,
"num_successful_runs": 5 "num_successful_runs": 5
}, },
"500": { "500": {
"avg_time_to_first_chunk": 0.208, "avg_time_to_first_chunk": 0.426,
"avg_total_time": 4.051, "avg_total_time": 9.564,
"avg_audio_length": 157.875, "avg_audio_length": 156.475,
"num_successful_runs": 5 "num_successful_runs": 5
} }
}, },
"timestamp": "2025-01-04 22:16:30" "timestamp": "2025-01-06 00:00:43"
} }
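For readers skimming these results: each summary block is simply the mean over the five runs in the corresponding token bucket. A minimal sketch of how the summary could be recomputed from the raw runs (the filename is hypothetical, and rounding to three decimals is an assumption):

```
import json
from collections import defaultdict

# Recompute the per-bucket averages from the individual runs.
# Assumes the benchmark JSON layout shown above; filename is hypothetical.
with open("benchmark_results_stream.json") as f:
    results = json.load(f)

buckets = defaultdict(list)
for run in results["individual_runs"]:
    if run["error"] is None:
        buckets[run["target_tokens"]].append(run)

summary = {
    str(tokens): {
        "avg_time_to_first_chunk": round(
            sum(r["time_to_first_chunk"] for r in runs) / len(runs), 3
        ),
        "avg_total_time": round(sum(r["total_time"] for r in runs) / len(runs), 3),
        "avg_audio_length": round(sum(r["audio_length"] for r in runs) / len(runs), 3),
        "num_successful_runs": len(runs),
    }
    for tokens, runs in buckets.items()
}
print(json.dumps(summary, indent=2))
```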

View file

@ -1,271 +1,337 @@
{ {
"individual_runs": [ "individual_runs": [
{ {
"text_length": 212, "text_length": 37,
"token_count": 50, "token_count": null,
"total_time": 1.149611473083496, "total_time": 0.7105245590209961,
"time_to_first_chunk": 0.8767304420471191, "time_to_first_chunk": 0.6905441284179688,
"error": null, "error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream_openai\\benchmark_tokens50_run1_stream_openai.wav", "audio_path": "C:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream_openai\\benchmark_tokens10_run1_stream_openai.wav",
"audio_length": 16.325, "audio_length": 3.45,
"target_tokens": 10,
"actual_tokens": 10,
"run_number": 1
},
{
"text_length": 37,
"token_count": null,
"total_time": 0.35063982009887695,
"time_to_first_chunk": 0.32647228240966797,
"error": null,
"audio_path": "C:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream_openai\\benchmark_tokens10_run2_stream_openai.wav",
"audio_length": 3.45,
"target_tokens": 10,
"actual_tokens": 10,
"run_number": 2
},
{
"text_length": 37,
"token_count": null,
"total_time": 0.43519043922424316,
"time_to_first_chunk": 0.41011548042297363,
"error": null,
"audio_path": "C:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream_openai\\benchmark_tokens10_run3_stream_openai.wav",
"audio_length": 3.45,
"target_tokens": 10,
"actual_tokens": 10,
"run_number": 3
},
{
"text_length": 37,
"token_count": null,
"total_time": 0.33886170387268066,
"time_to_first_chunk": 0.32068943977355957,
"error": null,
"audio_path": "C:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream_openai\\benchmark_tokens10_run4_stream_openai.wav",
"audio_length": 3.45,
"target_tokens": 10,
"actual_tokens": 10,
"run_number": 4
},
{
"text_length": 37,
"token_count": null,
"total_time": 0.31725525856018066,
"time_to_first_chunk": 0.29624342918395996,
"error": null,
"audio_path": "C:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream_openai\\benchmark_tokens10_run5_stream_openai.wav",
"audio_length": 3.45,
"target_tokens": 10,
"actual_tokens": 10,
"run_number": 5
},
{
"text_length": 212,
"token_count": null,
"total_time": 1.0215234756469727,
"time_to_first_chunk": 0.38323354721069336,
"error": null,
"audio_path": "C:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream_openai\\benchmark_tokens50_run1_stream_openai.wav",
"audio_length": 15.925,
"target_tokens": 50, "target_tokens": 50,
"actual_tokens": 50, "actual_tokens": 50,
"run_number": 1 "run_number": 1
}, },
{ {
"text_length": 212, "text_length": 212,
"token_count": 50, "token_count": null,
"total_time": 0.9325947761535645, "total_time": 1.38511061668396,
"time_to_first_chunk": 0.5965914726257324, "time_to_first_chunk": 0.47052764892578125,
"error": null, "error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream_openai\\benchmark_tokens50_run2_stream_openai.wav", "audio_path": "C:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream_openai\\benchmark_tokens50_run2_stream_openai.wav",
"audio_length": 16.325, "audio_length": 15.925,
"target_tokens": 50, "target_tokens": 50,
"actual_tokens": 50, "actual_tokens": 50,
"run_number": 2 "run_number": 2
}, },
{ {
"text_length": 212, "text_length": 212,
"token_count": 50, "token_count": null,
"total_time": 0.9205234050750732, "total_time": 1.0185234546661377,
"time_to_first_chunk": 0.5961906909942627, "time_to_first_chunk": 0.3535764217376709,
"error": null, "error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream_openai\\benchmark_tokens50_run3_stream_openai.wav", "audio_path": "C:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream_openai\\benchmark_tokens50_run3_stream_openai.wav",
"audio_length": 16.325, "audio_length": 15.925,
"target_tokens": 50, "target_tokens": 50,
"actual_tokens": 50, "actual_tokens": 50,
"run_number": 3 "run_number": 3
}, },
{ {
"text_length": 212, "text_length": 212,
"token_count": 50, "token_count": null,
"total_time": 1.1321916580200195, "total_time": 0.8875925540924072,
"time_to_first_chunk": 0.6946916580200195, "time_to_first_chunk": 0.3373105525970459,
"error": null, "error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream_openai\\benchmark_tokens50_run4_stream_openai.wav", "audio_path": "C:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream_openai\\benchmark_tokens50_run4_stream_openai.wav",
"audio_length": 16.325, "audio_length": 15.925,
"target_tokens": 50, "target_tokens": 50,
"actual_tokens": 50, "actual_tokens": 50,
"run_number": 4 "run_number": 4
}, },
{ {
"text_length": 212, "text_length": 212,
"token_count": 50, "token_count": null,
"total_time": 1.1146185398101807, "total_time": 0.9557526111602783,
"time_to_first_chunk": 0.6918885707855225, "time_to_first_chunk": 0.3364882469177246,
"error": null, "error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream_openai\\benchmark_tokens50_run5_stream_openai.wav", "audio_path": "C:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream_openai\\benchmark_tokens50_run5_stream_openai.wav",
"audio_length": 16.325, "audio_length": 15.925,
"target_tokens": 50, "target_tokens": 50,
"actual_tokens": 50, "actual_tokens": 50,
"run_number": 5 "run_number": 5
}, },
{ {
"text_length": 448, "text_length": 448,
"token_count": 100, "token_count": null,
"total_time": 1.3645410537719727, "total_time": 1.569596767425537,
"time_to_first_chunk": 0.6802399158477783, "time_to_first_chunk": 0.42070746421813965,
"error": null, "error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream_openai\\benchmark_tokens100_run1_stream_openai.wav", "audio_path": "C:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream_openai\\benchmark_tokens100_run1_stream_openai.wav",
"audio_length": 31.1, "audio_length": 30.5,
"target_tokens": 100, "target_tokens": 100,
"actual_tokens": 100, "actual_tokens": 100,
"run_number": 1 "run_number": 1
}, },
{ {
"text_length": 448, "text_length": 448,
"token_count": 100, "token_count": null,
"total_time": 1.4154777526855469, "total_time": 1.5172030925750732,
"time_to_first_chunk": 0.7297353744506836, "time_to_first_chunk": 0.3982264995574951,
"error": null, "error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream_openai\\benchmark_tokens100_run2_stream_openai.wav", "audio_path": "C:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream_openai\\benchmark_tokens100_run2_stream_openai.wav",
"audio_length": 31.1, "audio_length": 30.5,
"target_tokens": 100, "target_tokens": 100,
"actual_tokens": 100, "actual_tokens": 100,
"run_number": 2 "run_number": 2
}, },
{ {
"text_length": 448, "text_length": 448,
"token_count": 100, "token_count": null,
"total_time": 1.3589520454406738, "total_time": 1.5318474769592285,
"time_to_first_chunk": 0.698603630065918, "time_to_first_chunk": 0.3533785343170166,
"error": null, "error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream_openai\\benchmark_tokens100_run3_stream_openai.wav", "audio_path": "C:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream_openai\\benchmark_tokens100_run3_stream_openai.wav",
"audio_length": 31.1, "audio_length": 30.5,
"target_tokens": 100, "target_tokens": 100,
"actual_tokens": 100, "actual_tokens": 100,
"run_number": 3 "run_number": 3
}, },
{ {
"text_length": 448, "text_length": 448,
"token_count": 100, "token_count": null,
"total_time": 1.2276430130004883, "total_time": 1.3858752250671387,
"time_to_first_chunk": 0.6705801486968994, "time_to_first_chunk": 0.3360786437988281,
"error": null, "error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream_openai\\benchmark_tokens100_run4_stream_openai.wav", "audio_path": "C:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream_openai\\benchmark_tokens100_run4_stream_openai.wav",
"audio_length": 31.1, "audio_length": 30.5,
"target_tokens": 100, "target_tokens": 100,
"actual_tokens": 100, "actual_tokens": 100,
"run_number": 4 "run_number": 4
}, },
{ {
"text_length": 448, "text_length": 448,
"token_count": 100, "token_count": null,
"total_time": 1.0949454307556152, "total_time": 1.7841475009918213,
"time_to_first_chunk": 0.5698442459106445, "time_to_first_chunk": 0.34446048736572266,
"error": null, "error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream_openai\\benchmark_tokens100_run5_stream_openai.wav", "audio_path": "C:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream_openai\\benchmark_tokens100_run5_stream_openai.wav",
"audio_length": 31.1, "audio_length": 30.5,
"target_tokens": 100, "target_tokens": 100,
"actual_tokens": 100, "actual_tokens": 100,
"run_number": 5 "run_number": 5
}, },
{ {
"text_length": 906, "text_length": 1140,
"token_count": 200, "token_count": null,
"total_time": 1.8211240768432617, "total_time": 4.334965467453003,
"time_to_first_chunk": 0.6070489883422852, "time_to_first_chunk": 0.4336512088775635,
"error": null, "error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream_openai\\benchmark_tokens200_run1_stream_openai.wav", "audio_path": "C:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream_openai\\benchmark_tokens250_run1_stream_openai.wav",
"audio_length": 62.625, "audio_length": 78.775,
"target_tokens": 200, "target_tokens": 250,
"actual_tokens": 200, "actual_tokens": 250,
"run_number": 1 "run_number": 1
}, },
{ {
"text_length": 906, "text_length": 1140,
"token_count": 200, "token_count": null,
"total_time": 1.8376774787902832, "total_time": 5.265941858291626,
"time_to_first_chunk": 0.6538689136505127, "time_to_first_chunk": 0.5461773872375488,
"error": null, "error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream_openai\\benchmark_tokens200_run2_stream_openai.wav", "audio_path": "C:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream_openai\\benchmark_tokens250_run2_stream_openai.wav",
"audio_length": 62.625, "audio_length": 78.775,
"target_tokens": 200, "target_tokens": 250,
"actual_tokens": 200, "actual_tokens": 250,
"run_number": 2 "run_number": 2
}, },
{ {
"text_length": 906, "text_length": 1140,
"token_count": 200, "token_count": null,
"total_time": 1.6953792572021484, "total_time": 5.66066575050354,
"time_to_first_chunk": 0.5554308891296387, "time_to_first_chunk": 0.4757547378540039,
"error": null, "error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream_openai\\benchmark_tokens200_run3_stream_openai.wav", "audio_path": "C:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream_openai\\benchmark_tokens250_run3_stream_openai.wav",
"audio_length": 62.625, "audio_length": 78.775,
"target_tokens": 200, "target_tokens": 250,
"actual_tokens": 200, "actual_tokens": 250,
"run_number": 3 "run_number": 3
}, },
{ {
"text_length": 906, "text_length": 1140,
"token_count": 200, "token_count": null,
"total_time": 1.887030839920044, "total_time": 9.289174318313599,
"time_to_first_chunk": 0.5866930484771729, "time_to_first_chunk": 0.40159058570861816,
"error": null, "error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream_openai\\benchmark_tokens200_run4_stream_openai.wav", "audio_path": "C:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream_openai\\benchmark_tokens250_run4_stream_openai.wav",
"audio_length": 62.625, "audio_length": 78.775,
"target_tokens": 200, "target_tokens": 250,
"actual_tokens": 200, "actual_tokens": 250,
"run_number": 4 "run_number": 4
}, },
{ {
"text_length": 906, "text_length": 1140,
"token_count": 200, "token_count": null,
"total_time": 1.7908406257629395, "total_time": 4.425869703292847,
"time_to_first_chunk": 0.5897490978240967, "time_to_first_chunk": 0.40808558464050293,
"error": null, "error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream_openai\\benchmark_tokens200_run5_stream_openai.wav", "audio_path": "C:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream_openai\\benchmark_tokens250_run5_stream_openai.wav",
"audio_length": 62.625, "audio_length": 78.775,
"target_tokens": 200, "target_tokens": 250,
"actual_tokens": 200, "actual_tokens": 250,
"run_number": 5 "run_number": 5
}, },
{ {
"text_length": 2232, "text_length": 2232,
"token_count": 500, "token_count": null,
"total_time": 4.228837013244629, "total_time": 9.600461483001709,
"time_to_first_chunk": 0.5315976142883301, "time_to_first_chunk": 0.3966805934906006,
"error": null, "error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream_openai\\benchmark_tokens500_run1_stream_openai.wav", "audio_path": "C:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream_openai\\benchmark_tokens500_run1_stream_openai.wav",
"audio_length": 157.875, "audio_length": 156.475,
"target_tokens": 500, "target_tokens": 500,
"actual_tokens": 500, "actual_tokens": 500,
"run_number": 1 "run_number": 1
}, },
{ {
"text_length": 2232, "text_length": 2232,
"token_count": 500, "token_count": null,
"total_time": 4.489210367202759, "total_time": 8.82239580154419,
"time_to_first_chunk": 0.5261838436126709, "time_to_first_chunk": 0.3900904655456543,
"error": null, "error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream_openai\\benchmark_tokens500_run2_stream_openai.wav", "audio_path": "C:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream_openai\\benchmark_tokens500_run2_stream_openai.wav",
"audio_length": 157.875, "audio_length": 156.475,
"target_tokens": 500, "target_tokens": 500,
"actual_tokens": 500, "actual_tokens": 500,
"run_number": 2 "run_number": 2
}, },
{ {
"text_length": 2232, "text_length": 2232,
"token_count": 500, "token_count": null,
"total_time": 4.5290446281433105, "total_time": 10.99152159690857,
"time_to_first_chunk": 0.6186764240264893, "time_to_first_chunk": 0.4041757583618164,
"error": null, "error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream_openai\\benchmark_tokens500_run3_stream_openai.wav", "audio_path": "C:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream_openai\\benchmark_tokens500_run3_stream_openai.wav",
"audio_length": 157.875, "audio_length": 156.475,
"target_tokens": 500, "target_tokens": 500,
"actual_tokens": 500, "actual_tokens": 500,
"run_number": 3 "run_number": 3
}, },
{ {
"text_length": 2232, "text_length": 2232,
"token_count": 500, "token_count": null,
"total_time": 4.209261178970337, "total_time": 9.12995958328247,
"time_to_first_chunk": 0.5990591049194336, "time_to_first_chunk": 0.43430614471435547,
"error": null, "error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream_openai\\benchmark_tokens500_run4_stream_openai.wav", "audio_path": "C:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream_openai\\benchmark_tokens500_run4_stream_openai.wav",
"audio_length": 157.875, "audio_length": 156.475,
"target_tokens": 500, "target_tokens": 500,
"actual_tokens": 500, "actual_tokens": 500,
"run_number": 4 "run_number": 4
}, },
{ {
"text_length": 2232, "text_length": 2232,
"token_count": 500, "token_count": null,
"total_time": 4.218762636184692, "total_time": 10.043727159500122,
"time_to_first_chunk": 0.5466251373291016, "time_to_first_chunk": 0.41181445121765137,
"error": null, "error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream_openai\\benchmark_tokens500_run5_stream_openai.wav", "audio_path": "C:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream_openai\\benchmark_tokens500_run5_stream_openai.wav",
"audio_length": 157.875, "audio_length": 156.475,
"target_tokens": 500, "target_tokens": 500,
"actual_tokens": 500, "actual_tokens": 500,
"run_number": 5 "run_number": 5
} }
], ],
"summary": { "summary": {
"10": {
"avg_time_to_first_chunk": 0.409,
"avg_total_time": 0.43,
"avg_audio_length": 3.45,
"num_successful_runs": 5
},
"50": { "50": {
"avg_time_to_first_chunk": 0.691, "avg_time_to_first_chunk": 0.376,
"avg_total_time": 1.05, "avg_total_time": 1.054,
"avg_audio_length": 16.325, "avg_audio_length": 15.925,
"num_successful_runs": 5 "num_successful_runs": 5
}, },
"100": { "100": {
"avg_time_to_first_chunk": 0.67, "avg_time_to_first_chunk": 0.371,
"avg_total_time": 1.292, "avg_total_time": 1.558,
"avg_audio_length": 31.1, "avg_audio_length": 30.5,
"num_successful_runs": 5 "num_successful_runs": 5
}, },
"200": { "250": {
"avg_time_to_first_chunk": 0.599, "avg_time_to_first_chunk": 0.453,
"avg_total_time": 1.806, "avg_total_time": 5.795,
"avg_audio_length": 62.625, "avg_audio_length": 78.775,
"num_successful_runs": 5 "num_successful_runs": 5
}, },
"500": { "500": {
"avg_time_to_first_chunk": 0.564, "avg_time_to_first_chunk": 0.407,
"avg_total_time": 4.335, "avg_total_time": 9.718,
"avg_audio_length": 157.875, "avg_audio_length": 156.475,
"num_successful_runs": 5 "num_successful_runs": 5
} }
}, },
"timestamp": "2025-01-04 22:18:03" "timestamp": "2025-01-06 00:02:21"
} }

View file

@ -1,23 +1,23 @@
=== Benchmark Statistics (with correct RTF) === === Benchmark Statistics (with correct RTF) ===
Total tokens processed: 17150 Total tokens processed: 3150
Total audio generated (s): 5296.38 Total audio generated (s): 1056.03
Total test duration (s): 155.23 Total test duration (s): 70.20
Average processing rate (tokens/s): 102.86 Average processing rate (tokens/s): 46.46
Average RTF: 0.03 Average RTF: 0.07
Average Real Time Speed: 31.25 Average Real Time Speed: 15.00
=== Per-chunk Stats === === Per-chunk Stats ===
Average chunk size (tokens): 1715.00 Average chunk size (tokens): 525.00
Min chunk size (tokens): 150 Min chunk size (tokens): 150
Max chunk size (tokens): 5000 Max chunk size (tokens): 900
Average processing time (s): 15.39 Average processing time (s): 11.57
Average output length (s): 529.64 Average output length (s): 176.00
=== Performance Ranges === === Performance Ranges ===
Processing rate range (tokens/s): 80.65 - 125.10 Processing rate range (tokens/s): 40.07 - 53.57
RTF range: 0.03x - 0.04x RTF range: 0.06x - 0.08x
Real Time Speed range: 25.00x - 33.33x Real Time Speed range: 12.50x - 16.67x
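For context on the figures above: RTF is processing time divided by generated audio duration, and Real Time Speed is its reciprocal. A quick sanity check against the new column's totals (the reported 46.46 tokens/s is presumably a per-chunk mean rather than this overall ratio):

```
# Sanity-check the headline numbers above (values copied from the stats).
total_audio_s = 1056.03   # total audio generated (s)
total_test_s = 70.20      # total wall-clock test duration (s)
tokens = 3150

rtf = total_test_s / total_audio_s    # ~0.066, reported as 0.07
real_time_speed = 1 / rtf             # ~15x faster than realtime
tokens_per_s = tokens / total_test_s  # ~44.9 overall; 46.46 is likely a per-chunk mean
print(f"RTF={rtf:.2f}, speed={real_time_speed:.1f}x, {tokens_per_s:.1f} tok/s")
```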

Binary plot files not shown (16 PNG images: 12 regenerated, 181–459 KiB before vs 206–491 KiB after; 4 removed, 198–764 KiB).

View file

@ -0,0 +1,198 @@
#!/usr/bin/env python3
"""Script to generate all plots needed for the README."""
import os
import sys
import shutil
from pathlib import Path

from validate_wav import validate_tts

# Get absolute paths
script_dir = Path(__file__).parent.resolve()
project_root = script_dir.parent.parent

# Add directories to Python path for imports
sys.path.append(str(script_dir))
sys.path.append(str(script_dir / "benchmarks"))

# Import test scripts
from benchmark_tts_rtf import main as benchmark_rtf
from test_formats.test_audio_formats import main as test_formats
from benchmark_first_token_stream_unified import main as benchmark_stream
from test_combinations.test_analyze_combined_voices import main as test_voice_analysis

# Remove directories from path after imports
sys.path.remove(str(script_dir))
sys.path.remove(str(script_dir / "benchmarks"))


def ensure_assets_dir():
    """Create assets directory if it doesn't exist."""
    assets_dir = project_root / "assets"
    assets_dir.mkdir(exist_ok=True)
    return assets_dir


def copy_plot(src_path: str, dest_name: str, assets_dir: Path):
    """Copy a plot to the assets directory with a new name."""
    if os.path.exists(src_path):
        shutil.copy2(src_path, assets_dir / dest_name)
        print(f"Copied {src_path} to {assets_dir / dest_name}")
    else:
        print(f"Warning: Source plot not found at {src_path}")


def validate_and_print(wav_path: str, category: str):
    """Validate a WAV file and print results."""
    if not os.path.exists(wav_path):
        print(f"Warning: WAV file not found at {wav_path}")
        return

    print(f"\n=== Validating {category} Audio ===")
    result = validate_tts(wav_path)
    if "error" in result:
        print(f"Error: {result['error']}")
    else:
        print(f"Duration: {result['duration']}")
        print(f"Sample Rate: {result['sample_rate']} Hz")
        print(f"Peak Amplitude: {result['peak_amplitude']}")
        print(f"RMS Level: {result['rms_level']}")
        if result["issues"]:
            print("\nIssues Found:")
            for issue in result["issues"]:
                print(f"- {issue}")
        else:
            print("\nNo issues found")


def main():
    """Generate all plots needed for the README."""
    prefix = "gpu"
    # Ensure assets directory exists
    assets_dir = ensure_assets_dir()

    print("\n=== Generating Format Comparison Plot ===")
    test_formats()
    copy_plot(
        str(script_dir / "test_formats/output/test_formats/format_comparison.png"),
        "format_comparison.png",
        assets_dir,
    )
    # Validate WAV output from format test
    validate_and_print(
        str(script_dir / "test_formats/output/test_formats/speech.wav"),
        "Format Test WAV",
    )

    print("\n=== Generating Voice Analysis Plot ===")
    test_voice_analysis()
    copy_plot(
        str(script_dir / "test_combinations/output/analysis_comparison.png"),
        "voice_analysis.png",
        assets_dir,
    )
    # Validate combined voice output
    validate_and_print(
        str(
            script_dir
            / "test_combinations/output/analysis_combined_af_bella_af_nicole.wav"
        ),
        "Combined Voice",
    )

    print("\n=== Generating Performance Benchmark Plots ===")
    benchmark_rtf()
    copy_plot(
        str(script_dir / f"benchmarks/output_plots/{prefix}_processing_time_rtf.png"),
        f"{prefix}_processing_time.png",
        assets_dir,
    )
    copy_plot(
        str(script_dir / f"benchmarks/output_plots/{prefix}_realtime_factor_rtf.png"),
        f"{prefix}_realtime_factor.png",
        assets_dir,
    )
    # Validate RTF benchmark output (~500 tokens)
    validate_and_print(
        str(script_dir / "benchmarks/output_audio/chunk_450_tokens.wav"),
        "RTF Benchmark",
    )

    print("\n=== Generating Streaming Benchmark Plots ===")
    benchmark_stream()
    # Copy direct streaming plots
    copy_plot(
        str(script_dir / "benchmarks/output_plots/first_token_latency_stream.png"),
        f"{prefix}_first_token_latency_direct.png",
        assets_dir,
    )
    copy_plot(
        str(script_dir / "benchmarks/output_plots/first_token_timeline_stream.png"),
        f"{prefix}_first_token_timeline_direct.png",
        assets_dir,
    )
    copy_plot(
        str(script_dir / "benchmarks/output_plots/total_time_latency_stream.png"),
        f"{prefix}_total_time_latency_direct.png",
        assets_dir,
    )
    # Copy OpenAI streaming plots
    copy_plot(
        str(
            script_dir / "benchmarks/output_plots/first_token_latency_stream_openai.png"
        ),
        f"{prefix}_first_token_latency_openai.png",
        assets_dir,
    )
    copy_plot(
        str(
            script_dir
            / "benchmarks/output_plots/first_token_timeline_stream_openai.png"
        ),
        f"{prefix}_first_token_timeline_openai.png",
        assets_dir,
    )
    copy_plot(
        str(
            script_dir / "benchmarks/output_plots/total_time_latency_stream_openai.png"
        ),
        f"{prefix}_total_time_latency_openai.png",
        assets_dir,
    )

    # Wait a moment for files to be generated
    import time

    time.sleep(2)

    # Validate streaming outputs (~500 tokens)
    validate_and_print(
        str(
            script_dir
            / "benchmarks/output_audio_stream/benchmark_tokens500_run1_stream.wav"
        ),
        "Direct Streaming",
    )
    validate_and_print(
        str(
            script_dir
            / "benchmarks/output_audio_stream_openai/benchmark_tokens500_run1_stream_openai.wav"
        ),
        "OpenAI Streaming",
    )
    validate_and_print(
        str(script_dir / "test_formats/output/test_formats/test_audio.wav"),
        "Format Test WAV",
    )

    print("\nAll plots have been generated and copied to the assets directory")


if __name__ == "__main__":
    main()

View file

@ -73,6 +73,7 @@ def generate_speech(
"voice": voice, "voice": voice,
"speed": 1.0, "speed": 1.0,
"response_format": "wav", # Use WAV for analysis "response_format": "wav", # Use WAV for analysis
"stream": False,
}, },
) )
@ -193,9 +194,10 @@ def plot_analysis(audio_files: Dict[str, str], output_dir: str):
fig.patch.set_facecolor("#1a1a2e") fig.patch.set_facecolor("#1a1a2e")
num_files = len(audio_files) num_files = len(audio_files)
# Create subplot grid with proper spacing # Create subplot grid with proper spacing for waveforms and metrics
total_rows = num_files + 2 # Add one more row for metrics
gs = plt.GridSpec( gs = plt.GridSpec(
num_files + 1, 2, height_ratios=[1.5] * num_files + [1], hspace=0.4, wspace=0.3 total_rows, 2, height_ratios=[1.5] * num_files + [1, 1], hspace=0.4, wspace=0.3
) )
# Analyze all files first # Analyze all files first
@ -216,48 +218,74 @@ def plot_analysis(audio_files: Dict[str, str], output_dir: str):
# Colors for voices # Colors for voices
colors = ["#ff2a6d", "#05d9e8", "#d1f7ff"] colors = ["#ff2a6d", "#05d9e8", "#d1f7ff"]
# Create two subplots for metrics with similar scales # Create metrics for each subplot
# Left subplot: Brightness and Volume metrics = [
ax1 = plt.subplot(gs[num_files, 0])
metrics1 = [
( (
"Brightness", plt.subplot(gs[num_files, 0]),
[chars["spectral_centroid"] / 1000 for chars in all_chars.values()], [
"kHz", (
), "Volume",
("Volume", [chars["rms"] * 100 for chars in all_chars.values()], "RMS×100"), [chars["rms"] * 100 for chars in all_chars.values()],
] "RMS×100",
)
# Right subplot: Voice Pitch and Texture ],
ax2 = plt.subplot(gs[num_files, 1])
metrics2 = [
(
"Voice Pitch",
[min(chars["dominant_frequencies"]) for chars in all_chars.values()],
"Hz",
), ),
( (
"Texture", plt.subplot(gs[num_files, 1]),
[chars["zero_crossing_rate"] * 1000 for chars in all_chars.values()], [
"ZCR×1000", (
"Brightness",
[chars["spectral_centroid"] / 1000 for chars in all_chars.values()],
"kHz",
)
],
),
(
plt.subplot(gs[num_files + 1, 0]),
[
(
"Voice Pitch",
[
min(chars["dominant_frequencies"])
for chars in all_chars.values()
],
"Hz",
)
],
),
(
plt.subplot(gs[num_files + 1, 1]),
[
(
"Texture",
[
chars["zero_crossing_rate"] * 1000
for chars in all_chars.values()
],
"ZCR×1000",
)
],
), ),
] ]
def plot_grouped_bars(ax, metrics, show_legend=True): # Plot each metric
n_groups = len(metrics) for i, (ax, metric_data) in enumerate(metrics):
n_voices = len(audio_files) n_voices = len(audio_files)
bar_width = 0.25 bar_width = 0.25
indices = np.array([0])
indices = np.arange(n_groups) values = metric_data[0][1]
max_val = max(values)
# Get max value for y-axis scaling for j, (voice, color) in enumerate(zip(audio_files.keys(), colors)):
max_val = max(max(m[1]) for m in metrics) offset = (j - n_voices / 2 + 0.5) * bar_width
for i, (voice, color) in enumerate(zip(audio_files.keys(), colors)):
values = [m[1][i] for m in metrics]
offset = (i - n_voices / 2 + 0.5) * bar_width
bars = ax.bar( bars = ax.bar(
indices + offset, values, bar_width, label=voice, color=color, alpha=0.8 indices + offset,
[values[j]],
bar_width,
label=voice,
color=color,
alpha=0.8,
) )
# Add value labels on top of bars # Add value labels on top of bars
@ -274,12 +302,12 @@ def plot_analysis(audio_files: Dict[str, str], output_dir: str):
) )
ax.set_xticks(indices) ax.set_xticks(indices)
ax.set_xticklabels([f"{m[0]}\n({m[2]})" for m in metrics]) ax.set_xticklabels([f"{metric_data[0][0]}\n({metric_data[0][2]})"])
# Set y-axis limits with some padding
ax.set_ylim(0, max_val * 1.2) ax.set_ylim(0, max_val * 1.2)
ax.set_ylabel("Value")
if show_legend: # Only show legend on first metric plot
if i == 0:
ax.legend( ax.legend(
bbox_to_anchor=(1.05, 1), bbox_to_anchor=(1.05, 1),
loc="upper left", loc="upper left",
@ -287,22 +315,11 @@ def plot_analysis(audio_files: Dict[str, str], output_dir: str):
edgecolor="#ffffff", edgecolor="#ffffff",
) )
# Plot both subplots # Style the subplot
plot_grouped_bars(ax1, metrics1, show_legend=True) setup_plot(fig, ax, metric_data[0][0])
plot_grouped_bars(ax2, metrics2, show_legend=False)
# Style both subplots # Adjust the figure size and padding
setup_plot(fig, ax1, "Brightness and Volume") fig.set_size_inches(15, 20)
setup_plot(fig, ax2, "Voice Pitch and Texture")
# Add y-axis labels
ax1.set_ylabel("Value")
ax2.set_ylabel("Value")
# Adjust the figure size to accommodate the legend
fig.set_size_inches(15, 15)
# Add padding around the entire figure
plt.subplots_adjust(right=0.85, top=0.95, bottom=0.05, left=0.1) plt.subplots_adjust(right=0.85, top=0.95, bottom=0.05, left=0.1)
plt.savefig(os.path.join(output_dir, "analysis_comparison.png"), dpi=300) plt.savefig(os.path.join(output_dir, "analysis_comparison.png"), dpi=300)
print(f"Saved analysis comparison to {output_dir}/analysis_comparison.png") print(f"Saved analysis comparison to {output_dir}/analysis_comparison.png")
@ -332,7 +349,7 @@ def main():
) )
parser.add_argument("--url", default="http://localhost:8880", help="API base URL") parser.add_argument("--url", default="http://localhost:8880", help="API base URL")
parser.add_argument( parser.add_argument(
"--output-dir", "--output-dir",
default="examples/assorted_checks/test_combinations/output", default="examples/assorted_checks/test_combinations/output",
help="Output directory for audio files", help="Output directory for audio files",
) )
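The four bars this script plots map to standard signal statistics. A rough numpy-only recomputation for a single file, to make the units concrete (the peak-picking used for "Voice Pitch" here is a simplification; the script's actual method may differ):

```
import numpy as np
import soundfile as sf

# Rough recomputation of the four plotted metrics for one voice file.
# Windowing and peak-picking details are assumptions, not the script's exact math.
audio, sr = sf.read("analysis_combined_af_bella_af_nicole.wav")
if audio.ndim > 1:
    audio = audio.mean(axis=1)

rms = np.sqrt(np.mean(audio**2))                    # "Volume" (plotted as RMS x100)
zcr = np.mean(np.abs(np.diff(np.sign(audio))) > 0)  # "Texture" (plotted as ZCR x1000)

spectrum = np.abs(np.fft.rfft(audio))
freqs = np.fft.rfftfreq(len(audio), d=1 / sr)
centroid = np.sum(freqs * spectrum) / np.sum(spectrum)  # "Brightness" (plotted in kHz)

dominant = freqs[np.argsort(spectrum)[-5:]]  # crude top-5 spectral peaks
pitch = dominant.min()                       # "Voice Pitch" = lowest dominant frequency
print(f"RMS={rms:.3f} ZCR={zcr:.4f} centroid={centroid/1000:.2f}kHz pitch={pitch:.0f}Hz")
```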

View file

@ -66,26 +66,27 @@ def plot_format_comparison(stats: list, output_dir: str):
for i, stat in enumerate(stats): for i, stat in enumerate(stats):
format_name = stat["format"].upper() format_name = stat["format"].upper()
try: try:
# Handle PCM format differently file_path = os.path.join(output_dir, f"test_audio.{stat['format']}")
if stat["format"] == "pcm":
# Read raw PCM data (16-bit mono)
with open(
os.path.join(output_dir, f"test_audio.{stat['format']}"), "rb"
) as f:
raw_data = f.read()
data = np.frombuffer(raw_data, dtype=np.int16)
data = data.astype(np.float32) / 32768.0 # Convert to float [-1, 1]
sr = 24000
else:
# Read other formats with soundfile
data, sr = sf.read(
os.path.join(output_dir, f"test_audio.{stat['format']}")
)
# Plot waveform if stat["format"] == "wav":
# Use scipy.io.wavfile for WAV files
sr, data = wavfile.read(file_path)
data = data.astype(np.float32) / 32768.0 # Convert to float [-1, 1]
elif stat["format"] == "pcm":
# Read raw 16-bit signed little-endian PCM data at 24kHz
data = np.frombuffer(
open(file_path, "rb").read(), dtype="<i2"
) # '<i2' means little-endian 16-bit signed int
data = data.astype(np.float32) / 32768.0 # Convert to float [-1, 1]
sr = 24000 # Known sample rate for our endpoint
else:
# Use soundfile for other formats (mp3, opus, flac)
data, sr = sf.read(file_path)
# Plot waveform with consistent normalization
ax = plt.subplot(gs_waves[i]) ax = plt.subplot(gs_waves[i])
time = np.arange(len(data)) / sr time = np.arange(len(data)) / sr
plt.plot(time, data / np.max(np.abs(data)), linewidth=0.5, color="#ff2a6d") plt.plot(time, data, linewidth=0.5, color="#ff2a6d")
ax.set_xlabel("Time (seconds)") ax.set_xlabel("Time (seconds)")
ax.set_ylabel("") ax.set_ylabel("")
ax.set_ylim(-1.1, 1.1) ax.set_ylim(-1.1, 1.1)
@ -200,41 +201,42 @@ def get_audio_stats(file_path: str) -> dict:
"""Get audio file statistics""" """Get audio file statistics"""
file_size = os.path.getsize(file_path) file_size = os.path.getsize(file_path)
file_size_kb = file_size / 1024 # Convert to KB file_size_kb = file_size / 1024 # Convert to KB
format_name = Path(file_path).suffix[1:]
try: if format_name == "wav":
# Try reading with soundfile first # Use scipy.io.wavfile for WAV files
sample_rate, data = wavfile.read(file_path)
data = data.astype(np.float32) / 32768.0 # Convert to float [-1, 1]
duration = len(data) / sample_rate
channels = 1 if len(data.shape) == 1 else data.shape[1]
elif format_name == "pcm":
# For PCM, read raw 16-bit signed little-endian PCM data at 24kHz
data = np.frombuffer(
open(file_path, "rb").read(), dtype="<i2"
) # '<i2' means little-endian 16-bit signed int
data = data.astype(np.float32) / 32768.0 # Normalize to [-1, 1]
sample_rate = 24000 # Known sample rate for our endpoint
duration = len(data) / sample_rate
channels = 1
else:
# Use soundfile for other formats (mp3, opus, flac)
data, sample_rate = sf.read(file_path) data, sample_rate = sf.read(file_path)
duration = len(data) / sample_rate duration = len(data) / sample_rate
channels = 1 if len(data.shape) == 1 else data.shape[1] channels = 1 if len(data.shape) == 1 else data.shape[1]
# Calculate audio statistics # Calculate audio statistics
stats = { stats = {
"format": Path(file_path).suffix[1:], "format": format_name,
"file_size_kb": round(file_size_kb, 2), "file_size_kb": round(file_size_kb, 2),
"duration_seconds": round(duration, 2), "duration_seconds": round(duration, 2),
"sample_rate": sample_rate, "sample_rate": sample_rate,
"channels": channels, "channels": channels,
"min_amplitude": float(np.min(data)), "min_amplitude": float(np.min(data)),
"max_amplitude": float(np.max(data)), "max_amplitude": float(np.max(data)),
"mean_amplitude": float(np.mean(np.abs(data))), "mean_amplitude": float(np.mean(np.abs(data))),
"rms_amplitude": float(np.sqrt(np.mean(np.square(data)))), "rms_amplitude": float(np.sqrt(np.mean(np.square(data)))),
} }
return stats return stats
except:
# For PCM, read raw bytes and estimate duration
with open(file_path, "rb") as f:
data = f.read()
# Assuming 16-bit PCM mono at 24kHz
samples = len(data) // 2 # 2 bytes per sample
duration = samples / 24000
return {
"format": "pcm",
"file_size_kb": round(file_size_kb, 2),
"duration_seconds": round(duration, 2),
"sample_rate": 24000,
"channels": 1,
"note": "PCM stats are estimated from raw bytes",
}
def main(): def main():
@ -254,13 +256,49 @@ def main():
# Generate and save # Generate and save
start_time = time.time() start_time = time.time()
response = client.audio.speech.create(
model="kokoro", voice=voice, input=SAMPLE_TEXT, response_format=fmt # Use requests with stream=False for consistent data handling
response = requests.post(
"http://localhost:8880/v1/audio/speech",
json={
"model": "kokoro",
"voice": voice,
"input": SAMPLE_TEXT,
"response_format": fmt,
"stream": False, # Explicitly disable streaming to get single complete chunk
},
stream=False,
headers={"Accept": f"audio/{fmt}"}, # Explicitly request audio format
) )
generation_time = time.time() - start_time generation_time = time.time() - start_time
with open(output_path, "wb") as f: print(f"\nResponse headers for {fmt}:")
f.write(response.content) for header, value in response.headers.items():
print(f"{header}: {value}")
print(f"Content length: {len(response.content)} bytes")
print(f"First few bytes: {response.content[:20].hex()}")
# Write the file and verify it was written correctly
try:
with open(output_path, "wb") as f:
f.write(response.content)
# Verify file was written
if not output_path.exists():
raise Exception(f"Failed to write {fmt} file")
# Check file size matches content length
written_size = output_path.stat().st_size
if written_size != len(response.content):
raise Exception(
f"File size mismatch: expected {len(response.content)} bytes, got {written_size}"
)
print(f"Successfully wrote {fmt} file")
except Exception as e:
print(f"Error writing {fmt} file: {e}")
continue
# Get stats # Get stats
file_stats = get_audio_stats(str(output_path)) file_stats = get_audio_stats(str(output_path))
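The PCM branch deserves a note: a .pcm file is headerless, so the dtype ('<i2', i.e. little-endian int16), the channel count, and the 24 kHz rate all have to be known out of band, exactly as the comments say. Standalone, the decode looks like this (filename hypothetical):

```
import numpy as np

# Decode headerless PCM: 16-bit signed little-endian ('<i2'), mono, 24 kHz.
# None of this is stored in the file itself; it must be known out of band.
with open("test_audio.pcm", "rb") as f:
    data = np.frombuffer(f.read(), dtype="<i2")

audio = data.astype(np.float32) / 32768.0  # scale int16 range to [-1.0, 1.0]
sample_rate = 24000
duration = len(audio) / sample_rate
print(f"{duration:.2f}s of audio, peak {np.abs(audio).max():.3f}")
```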

View file

@ -4,15 +4,19 @@ import random
import string import string
from typing import List, Tuple from typing import List, Tuple
def create_test_cases() -> List[str]: def create_test_cases() -> List[str]:
"""Create a variety of test cases with different characteristics""" """Create a variety of test cases with different characteristics"""
# Helper to create random text with specific patterns # Helper to create random text with specific patterns
def random_text(length: int) -> str: def random_text(length: int) -> str:
return ''.join(random.choice(string.ascii_letters + string.digits + " .,!?") for _ in range(length)) return "".join(
random.choice(string.ascii_letters + string.digits + " .,!?")
for _ in range(length)
)
test_cases = [] test_cases = []
# Base test cases that hit specific patterns # Base test cases that hit specific patterns
base_cases = [ base_cases = [
"Dr. Smith and Mr. Jones discussed the $1,234.56 million investment.", "Dr. Smith and Mr. Jones discussed the $1,234.56 million investment.",
@ -21,10 +25,10 @@ def create_test_cases() -> List[str]:
"X's and Y's properties cost £50 million in the 1990s", "X's and Y's properties cost £50 million in the 1990s",
"こんにちは。今日は!", "こんにちは。今日は!",
] ]
# Add base cases # Add base cases
test_cases.extend(base_cases) test_cases.extend(base_cases)
# Add variations with random content # Add variations with random content
for length in [100, 1000, 10000]: for length in [100, 1000, 10000]:
# Create 3 variations of each length # Create 3 variations of each length
@ -35,23 +39,24 @@ def create_test_cases() -> List[str]:
text = text.replace(text[30:40], "$1,234.56") text = text.replace(text[30:40], "$1,234.56")
text = text.replace(text[50:60], "A.B.C. xyz") text = text.replace(text[50:60], "A.B.C. xyz")
test_cases.append(text) test_cases.append(text)
return test_cases return test_cases
class TextNormalizerInline: class TextNormalizerInline:
"""Text normalizer using inline patterns""" """Text normalizer using inline patterns"""
def normalize(self, text: str) -> str: def normalize(self, text: str) -> str:
# Replace quotes and brackets # Replace quotes and brackets
text = text.replace(chr(8216), "'").replace(chr(8217), "'") text = text.replace(chr(8216), "'").replace(chr(8217), "'")
text = text.replace("«", chr(8220)).replace("»", chr(8221)) text = text.replace("«", chr(8220)).replace("»", chr(8221))
text = text.replace(chr(8220), '"').replace(chr(8221), '"') text = text.replace(chr(8220), '"').replace(chr(8221), '"')
text = text.replace("(", "«").replace(")", "»") text = text.replace("(", "«").replace(")", "»")
# Handle CJK punctuation # Handle CJK punctuation
for a, b in zip("、。!,:;?", ",.!,:;?"): for a, b in zip("、。!,:;?", ",.!,:;?"):
text = text.replace(a, b + " ") text = text.replace(a, b + " ")
text = re.sub(r"[^\S \n]", " ", text) text = re.sub(r"[^\S \n]", " ", text)
text = re.sub(r" +", " ", text) text = re.sub(r" +", " ", text)
text = re.sub(r"(?<=\n) +(?=\n)", "", text) text = re.sub(r"(?<=\n) +(?=\n)", "", text)
@ -61,108 +66,132 @@ class TextNormalizerInline:
text = re.sub(r"\b(?:Mrs\.|MRS\.(?= [A-Z]))", "Mrs", text) text = re.sub(r"\b(?:Mrs\.|MRS\.(?= [A-Z]))", "Mrs", text)
text = re.sub(r"\betc\.(?! [A-Z])", "etc", text) text = re.sub(r"\betc\.(?! [A-Z])", "etc", text)
text = re.sub(r"(?i)\b(y)eah?\b", r"\1e'a", text) text = re.sub(r"(?i)\b(y)eah?\b", r"\1e'a", text)
text = re.sub(r"\d*\.\d+|\b\d{4}s?\b|(?<!:)\b(?:[1-9]|1[0-2]):[0-5]\d\b(?!:)", split_num, text) text = re.sub(
r"\d*\.\d+|\b\d{4}s?\b|(?<!:)\b(?:[1-9]|1[0-2]):[0-5]\d\b(?!:)",
split_num,
text,
)
text = re.sub(r"(?<=\d),(?=\d)", "", text) text = re.sub(r"(?<=\d),(?=\d)", "", text)
text = re.sub(r"(?i)[$£]\d+(?:\.\d+)?(?: hundred| thousand| (?:[bm]|tr)illion)*\b|[$£]\d+\.\d\d?\b", handle_money, text) text = re.sub(
r"(?i)[$£]\d+(?:\.\d+)?(?: hundred| thousand| (?:[bm]|tr)illion)*\b|[$£]\d+\.\d\d?\b",
handle_money,
text,
)
text = re.sub(r"\d*\.\d+", handle_decimal, text) text = re.sub(r"\d*\.\d+", handle_decimal, text)
text = re.sub(r"(?<=\d)-(?=\d)", " to ", text) text = re.sub(r"(?<=\d)-(?=\d)", " to ", text)
text = re.sub(r"(?<=\d)S", " S", text) text = re.sub(r"(?<=\d)S", " S", text)
text = re.sub(r"(?<=[BCDFGHJ-NP-TV-Z])'?s\b", "'S", text) text = re.sub(r"(?<=[BCDFGHJ-NP-TV-Z])'?s\b", "'S", text)
text = re.sub(r"(?<=X')S\b", "s", text) text = re.sub(r"(?<=X')S\b", "s", text)
text = re.sub(r"(?:[A-Za-z]\.){2,} [a-z]", lambda m: m.group().replace(".", "-"), text) text = re.sub(
r"(?:[A-Za-z]\.){2,} [a-z]", lambda m: m.group().replace(".", "-"), text
)
text = re.sub(r"(?i)(?<=[A-Z])\.(?=[A-Z])", "-", text) text = re.sub(r"(?i)(?<=[A-Z])\.(?=[A-Z])", "-", text)
return text.strip() return text.strip()
class TextNormalizerCompiled: class TextNormalizerCompiled:
"""Text normalizer using all compiled patterns""" """Text normalizer using all compiled patterns"""
def __init__(self): def __init__(self):
self.patterns = { self.patterns = {
'whitespace': re.compile(r"[^\S \n]"), "whitespace": re.compile(r"[^\S \n]"),
'multi_space': re.compile(r" +"), "multi_space": re.compile(r" +"),
'newline_space': re.compile(r"(?<=\n) +(?=\n)"), "newline_space": re.compile(r"(?<=\n) +(?=\n)"),
'doctor': re.compile(r"\bD[Rr]\.(?= [A-Z])"), "doctor": re.compile(r"\bD[Rr]\.(?= [A-Z])"),
'mister': re.compile(r"\b(?:Mr\.|MR\.(?= [A-Z]))"), "mister": re.compile(r"\b(?:Mr\.|MR\.(?= [A-Z]))"),
'miss': re.compile(r"\b(?:Ms\.|MS\.(?= [A-Z]))"), "miss": re.compile(r"\b(?:Ms\.|MS\.(?= [A-Z]))"),
'mrs': re.compile(r"\b(?:Mrs\.|MRS\.(?= [A-Z]))"), "mrs": re.compile(r"\b(?:Mrs\.|MRS\.(?= [A-Z]))"),
'etc': re.compile(r"\betc\.(?! [A-Z])"), "etc": re.compile(r"\betc\.(?! [A-Z])"),
'yeah': re.compile(r"(?i)\b(y)eah?\b"), "yeah": re.compile(r"(?i)\b(y)eah?\b"),
'numbers': re.compile(r"\d*\.\d+|\b\d{4}s?\b|(?<!:)\b(?:[1-9]|1[0-2]):[0-5]\d\b(?!:)"), "numbers": re.compile(
'comma_in_number': re.compile(r"(?<=\d),(?=\d)"), r"\d*\.\d+|\b\d{4}s?\b|(?<!:)\b(?:[1-9]|1[0-2]):[0-5]\d\b(?!:)"
'money': re.compile(r"(?i)[$£]\d+(?:\.\d+)?(?: hundred| thousand| (?:[bm]|tr)illion)*\b|[$£]\d+\.\d\d?\b"), ),
'decimal': re.compile(r"\d*\.\d+"), "comma_in_number": re.compile(r"(?<=\d),(?=\d)"),
'range': re.compile(r"(?<=\d)-(?=\d)"), "money": re.compile(
's_after_number': re.compile(r"(?<=\d)S"), r"(?i)[$£]\d+(?:\.\d+)?(?: hundred| thousand| (?:[bm]|tr)illion)*\b|[$£]\d+\.\d\d?\b"
'possessive_s': re.compile(r"(?<=[BCDFGHJ-NP-TV-Z])'?s\b"), ),
'x_possessive': re.compile(r"(?<=X')S\b"), "decimal": re.compile(r"\d*\.\d+"),
'initials': re.compile(r"(?:[A-Za-z]\.){2,} [a-z]"), "range": re.compile(r"(?<=\d)-(?=\d)"),
'single_initial': re.compile(r"(?i)(?<=[A-Z])\.(?=[A-Z])") "s_after_number": re.compile(r"(?<=\d)S"),
"possessive_s": re.compile(r"(?<=[BCDFGHJ-NP-TV-Z])'?s\b"),
"x_possessive": re.compile(r"(?<=X')S\b"),
"initials": re.compile(r"(?:[A-Za-z]\.){2,} [a-z]"),
"single_initial": re.compile(r"(?i)(?<=[A-Z])\.(?=[A-Z])"),
} }
def normalize(self, text: str) -> str: def normalize(self, text: str) -> str:
# Replace quotes and brackets # Replace quotes and brackets
text = text.replace(chr(8216), "'").replace(chr(8217), "'") text = text.replace(chr(8216), "'").replace(chr(8217), "'")
text = text.replace("«", chr(8220)).replace("»", chr(8221)) text = text.replace("«", chr(8220)).replace("»", chr(8221))
text = text.replace(chr(8220), '"').replace(chr(8221), '"') text = text.replace(chr(8220), '"').replace(chr(8221), '"')
text = text.replace("(", "«").replace(")", "»") text = text.replace("(", "«").replace(")", "»")
# Handle CJK punctuation # Handle CJK punctuation
for a, b in zip("、。!,:;?", ",.!,:;?"): for a, b in zip("、。!,:;?", ",.!,:;?"):
text = text.replace(a, b + " ") text = text.replace(a, b + " ")
# Use compiled patterns # Use compiled patterns
text = self.patterns['whitespace'].sub(" ", text) text = self.patterns["whitespace"].sub(" ", text)
text = self.patterns['multi_space'].sub(" ", text) text = self.patterns["multi_space"].sub(" ", text)
text = self.patterns['newline_space'].sub("", text) text = self.patterns["newline_space"].sub("", text)
text = self.patterns['doctor'].sub("Doctor", text) text = self.patterns["doctor"].sub("Doctor", text)
text = self.patterns['mister'].sub("Mister", text) text = self.patterns["mister"].sub("Mister", text)
text = self.patterns['miss'].sub("Miss", text) text = self.patterns["miss"].sub("Miss", text)
text = self.patterns['mrs'].sub("Mrs", text) text = self.patterns["mrs"].sub("Mrs", text)
text = self.patterns['etc'].sub("etc", text) text = self.patterns["etc"].sub("etc", text)
text = self.patterns['yeah'].sub(r"\1e'a", text) text = self.patterns["yeah"].sub(r"\1e'a", text)
text = self.patterns['numbers'].sub(split_num, text) text = self.patterns["numbers"].sub(split_num, text)
text = self.patterns['comma_in_number'].sub("", text) text = self.patterns["comma_in_number"].sub("", text)
text = self.patterns['money'].sub(handle_money, text) text = self.patterns["money"].sub(handle_money, text)
text = self.patterns['decimal'].sub(handle_decimal, text) text = self.patterns["decimal"].sub(handle_decimal, text)
text = self.patterns['range'].sub(" to ", text) text = self.patterns["range"].sub(" to ", text)
text = self.patterns['s_after_number'].sub(" S", text) text = self.patterns["s_after_number"].sub(" S", text)
text = self.patterns['possessive_s'].sub("'S", text) text = self.patterns["possessive_s"].sub("'S", text)
text = self.patterns['x_possessive'].sub("s", text) text = self.patterns["x_possessive"].sub("s", text)
text = self.patterns['initials'].sub(lambda m: m.group().replace(".", "-"), text) text = self.patterns["initials"].sub(
text = self.patterns['single_initial'].sub("-", text) lambda m: m.group().replace(".", "-"), text
)
text = self.patterns["single_initial"].sub("-", text)
return text.strip() return text.strip()
class TextNormalizerHybrid: class TextNormalizerHybrid:
"""Text normalizer using hybrid approach - compile only complex/frequent patterns""" """Text normalizer using hybrid approach - compile only complex/frequent patterns"""
def __init__(self): def __init__(self):
# Only compile patterns that are complex or frequently used # Only compile patterns that are complex or frequently used
self.patterns = { self.patterns = {
'whitespace': re.compile(r"[^\S \n]"), "whitespace": re.compile(r"[^\S \n]"),
'numbers': re.compile(r"\d*\.\d+|\b\d{4}s?\b|(?<!:)\b(?:[1-9]|1[0-2]):[0-5]\d\b(?!:)"), "numbers": re.compile(
'money': re.compile(r"(?i)[$£]\d+(?:\.\d+)?(?: hundred| thousand| (?:[bm]|tr)illion)*\b|[$£]\d+\.\d\d?\b"), r"\d*\.\d+|\b\d{4}s?\b|(?<!:)\b(?:[1-9]|1[0-2]):[0-5]\d\b(?!:)"
'initials': re.compile(r"(?:[A-Za-z]\.){2,} [a-z]") ),
"money": re.compile(
r"(?i)[$£]\d+(?:\.\d+)?(?: hundred| thousand| (?:[bm]|tr)illion)*\b|[$£]\d+\.\d\d?\b"
),
"initials": re.compile(r"(?:[A-Za-z]\.){2,} [a-z]"),
} }
def normalize(self, text: str) -> str: def normalize(self, text: str) -> str:
# Replace quotes and brackets # Replace quotes and brackets
text = text.replace(chr(8216), "'").replace(chr(8217), "'") text = text.replace(chr(8216), "'").replace(chr(8217), "'")
text = text.replace("«", chr(8220)).replace("»", chr(8221)) text = text.replace("«", chr(8220)).replace("»", chr(8221))
text = text.replace(chr(8220), '"').replace(chr(8221), '"') text = text.replace(chr(8220), '"').replace(chr(8221), '"')
text = text.replace("(", "«").replace(")", "»") text = text.replace("(", "«").replace(")", "»")
# Handle CJK punctuation # Handle CJK punctuation
for a, b in zip("、。!,:;?", ",.!,:;?"): for a, b in zip("、。!,:;?", ",.!,:;?"):
text = text.replace(a, b + " ") text = text.replace(a, b + " ")
# Use compiled patterns for complex operations # Use compiled patterns for complex operations
text = self.patterns['whitespace'].sub(" ", text) text = self.patterns["whitespace"].sub(" ", text)
text = self.patterns['numbers'].sub(split_num, text) text = self.patterns["numbers"].sub(split_num, text)
text = self.patterns['money'].sub(handle_money, text) text = self.patterns["money"].sub(handle_money, text)
text = self.patterns['initials'].sub(lambda m: m.group().replace(".", "-"), text) text = self.patterns["initials"].sub(
lambda m: m.group().replace(".", "-"), text
)
# Use inline patterns for simpler operations # Use inline patterns for simpler operations
text = re.sub(r" +", " ", text) text = re.sub(r" +", " ", text)
text = re.sub(r"(?<=\n) +(?=\n)", "", text) text = re.sub(r"(?<=\n) +(?=\n)", "", text)
@ -179,9 +208,10 @@ class TextNormalizerHybrid:
text = re.sub(r"(?<=[BCDFGHJ-NP-TV-Z])'?s\b", "'S", text) text = re.sub(r"(?<=[BCDFGHJ-NP-TV-Z])'?s\b", "'S", text)
text = re.sub(r"(?<=X')S\b", "s", text) text = re.sub(r"(?<=X')S\b", "s", text)
text = re.sub(r"(?i)(?<=[A-Z])\.(?=[A-Z])", "-", text) text = re.sub(r"(?i)(?<=[A-Z])\.(?=[A-Z])", "-", text)
return text.strip() return text.strip()
def split_num(match: re.Match) -> str: def split_num(match: re.Match) -> str:
"""Split numbers for TTS processing""" """Split numbers for TTS processing"""
num = match.group(0) num = match.group(0)
@ -192,61 +222,70 @@ def split_num(match: re.Match) -> str:
return f"{num[:-1]} s" return f"{num[:-1]} s"
return num return num
def handle_money(match: re.Match) -> str: def handle_money(match: re.Match) -> str:
"""Format money strings for TTS""" """Format money strings for TTS"""
text = match.group(0) text = match.group(0)
return text.replace("$", " dollars ").replace("£", " pounds ") return text.replace("$", " dollars ").replace("£", " pounds ")
def handle_decimal(match: re.Match) -> str: def handle_decimal(match: re.Match) -> str:
"""Format decimal numbers for TTS""" """Format decimal numbers for TTS"""
num = match.group(0) num = match.group(0)
return num.replace(".", " point ") return num.replace(".", " point ")
def benchmark_normalizers(test_cases: List[str], iterations: int = 100) -> Tuple[float, float, float]:
def benchmark_normalizers(
test_cases: List[str], iterations: int = 100
) -> Tuple[float, float, float]:
"""Benchmark all three implementations""" """Benchmark all three implementations"""
normalizers = { normalizers = {
'inline': TextNormalizerInline(), "inline": TextNormalizerInline(),
'compiled': TextNormalizerCompiled(), "compiled": TextNormalizerCompiled(),
'hybrid': TextNormalizerHybrid() "hybrid": TextNormalizerHybrid(),
} }
results = {} results = {}
# Test each normalizer # Test each normalizer
for name, normalizer in normalizers.items(): for name, normalizer in normalizers.items():
start = time.perf_counter() start = time.perf_counter()
# Run normalizations # Run normalizations
for _ in range(iterations): for _ in range(iterations):
for test in test_cases: for test in test_cases:
normalizer.normalize(test) normalizer.normalize(test)
results[name] = time.perf_counter() - start results[name] = time.perf_counter() - start
return results return results
def verify_outputs(test_cases: List[str]) -> bool: def verify_outputs(test_cases: List[str]) -> bool:
"""Verify that all implementations produce identical output""" """Verify that all implementations produce identical output"""
normalizers = { normalizers = {
'inline': TextNormalizerInline(), "inline": TextNormalizerInline(),
'compiled': TextNormalizerCompiled(), "compiled": TextNormalizerCompiled(),
'hybrid': TextNormalizerHybrid() "hybrid": TextNormalizerHybrid(),
} }
for test in test_cases: for test in test_cases:
results = [norm.normalize(test) for norm in normalizers.values()] results = [norm.normalize(test) for norm in normalizers.values()]
if not all(r == results[0] for r in results): if not all(r == results[0] for r in results):
return False return False
return True return True
def main(): def main():
# Create test cases # Create test cases
print("Generating test cases...") print("Generating test cases...")
test_cases = create_test_cases() test_cases = create_test_cases()
total_chars = sum(len(t) for t in test_cases) total_chars = sum(len(t) for t in test_cases)
print(f"Created {len(test_cases)} test cases, total size: {total_chars:,} characters") print(
f"Created {len(test_cases)} test cases, total size: {total_chars:,} characters"
)
# Verify output consistency # Verify output consistency
print("\nVerifying output consistency...") print("\nVerifying output consistency...")
if verify_outputs(test_cases): if verify_outputs(test_cases):
@@ -254,15 +293,16 @@ def main():
    else:
        print("✗ Warning: Implementations produce different outputs!")
        return

    # Run benchmarks
    print("\nRunning benchmarks...")
    iterations = 100
    results = benchmark_normalizers(test_cases, iterations)

    # Print results
    print(f"\nResults for {iterations} iterations:")
    for name, time_taken in results.items():
        print(f"{name.capitalize()}: {time_taken:.3f}s")

main()
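The harness above checks output parity before timing, so the measured differences reflect the normalizers rather than the data. As a minimal sketch of the same timing pattern in isolation (the `TextNormalizer*` classes and `create_test_cases` belong to the script above; the placeholder callables and inputs here are hypothetical):

```python
import time
from typing import Callable, Dict, List

def time_normalizer(
    fn: Callable[[str], str], cases: List[str], iterations: int = 100
) -> float:
    """Total wall-clock time to normalize every case `iterations` times."""
    start = time.perf_counter()
    for _ in range(iterations):
        for case in cases:
            fn(case)
    return time.perf_counter() - start

# Hypothetical usage: compare two candidate implementations on identical inputs
cases = ["It costs $5.50 today.", "Call at 3.14 o'clock."]
timings: Dict[str, float] = {
    "baseline": time_normalizer(str.strip, cases),  # placeholder normalizer
    "upper": time_normalizer(str.upper, cases),     # placeholder normalizer
}
for name, seconds in timings.items():
    print(f"{name}: {seconds:.3f}s")
```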

View file

@@ -1,8 +1,11 @@
import argparse
from typing import Any, Dict
from pathlib import Path

import numpy as np
import soundfile as sf
from tqdm import tqdm

def validate_tts(wav_path: str) -> dict:
    """
@@ -13,34 +16,40 @@ def validate_tts(wav_path: str) -> dict:
        audio, sr = sf.read(wav_path)
        if len(audio.shape) > 1:
            audio = np.mean(audio, axis=1)

        duration = len(audio) / sr
        issues = []

        # Basic quality checks
        abs_audio = np.abs(audio)
        stats = {
            "rms": float(np.sqrt(np.mean(audio**2))),
            "peak": float(np.max(abs_audio)),
            "dc_offset": float(np.mean(audio)),
        }

        clip_count = np.sum(abs_audio >= 0.99)
        clip_percent = (clip_count / len(audio)) * 100

        if duration < 0.1:
            issues.append(
                "WARNING: Audio is suspiciously short - possible failed generation"
            )

        if stats["peak"] >= 1.0:
            if clip_percent > 1.0:
                issues.append(
                    f"WARNING: Significant clipping detected ({clip_percent:.2e}% of samples)"
                )
            elif clip_percent > 0.01:
                issues.append(
                    f"INFO: Minor peak limiting detected ({clip_percent:.2e}% of samples)"
                )

        if stats["rms"] < 0.01:
            issues.append("WARNING: Audio is very quiet - possible failed generation")

        if abs(stats["dc_offset"]) > 0.1:
            issues.append(f"WARNING: High DC offset ({stats['dc_offset']:.3f})")

        # Check for long silence gaps
@@ -51,66 +60,79 @@ def validate_tts(wav_path: str) -> dict:
        window_size = int(min_silence * sr)
        silence_count = 0
        last_silence = -1

        start_idx = int(0.2 * sr)  # Skip first 0.2s
        for i in tqdm(
            range(start_idx, len(db) - window_size, window_size),
            desc="Checking for silence",
        ):
            window = db[i : i + window_size]
            if np.mean(window) < silence_threshold:
                silent_ratio = np.mean(window < silence_threshold)
                if silent_ratio > 0.9:
                    if last_silence == -1 or (i / sr - last_silence) > 2.0:
                        silence_count += 1
                        last_silence = i / sr
                        issues.append(
                            f"WARNING: Long silence detected at {i/sr:.2f}s (duration: {min_silence:.1f}s)"
                        )

        if silence_count > 2:
            issues.append(
                f"WARNING: Multiple long silences found ({silence_count} total)"
            )

        # Detect audio artifacts: sample-to-sample jumps that stand far
        # above the local average difference
        diff = np.diff(audio)
        abs_diff = np.abs(diff)
        window_size = min(int(0.005 * sr), 256)
        window = np.ones(window_size) / window_size
        local_avg_diff = np.convolve(abs_diff, window, mode="same")

        spikes = (abs_diff > (10 * local_avg_diff)) & (abs_diff > 0.1)
        artifact_indices = np.nonzero(spikes)[0]

        artifacts = []
        if len(artifact_indices) > 0:
            # Group spike indices separated by less than 5 ms into single events
            gaps = np.diff(artifact_indices)
            min_gap = int(0.005 * sr)
            break_points = np.nonzero(gaps > min_gap)[0] + 1
            groups = np.split(artifact_indices, break_points)

            for group in groups:
                if len(group) >= 5:
                    severity = np.max(abs_diff[group])
                    if severity > 0.2:
                        center_idx = group[len(group) // 2]
                        artifacts.append(
                            {
                                "time": float(
                                    center_idx / sr
                                ),  # Ensure float for consistent timing
                                "severity": float(severity),
                            }
                        )
                        issues.append(
                            f"WARNING: Audio discontinuity at {center_idx/sr:.3f}s "
                            f"(severity: {severity:.3f})"
                        )

        # Check for repeated speech segments by correlating adjacent chunks
        for chunk_duration in tqdm(
            [0.5, 2.5, 5.0, 10.0], desc="Checking for repeated speech"
        ):
            chunk_size = int(chunk_duration * sr)
            overlap = int(0.2 * chunk_size)

            for i in range(0, len(audio) - 2 * chunk_size, overlap):
                chunk1 = audio[i : i + chunk_size]
                chunk2 = audio[i + chunk_size : i + 2 * chunk_size]

                # Skip near-silent chunks
                if np.mean(np.abs(chunk1)) < 0.01 or np.mean(np.abs(chunk2)) < 0.01:
                    continue

                try:
                    correlation = np.corrcoef(chunk1, chunk2)[0, 1]
                    if not np.isnan(correlation) and correlation > 0.92:
                        issues.append(
                            f"WARNING: Possible repeated speech at {i/sr:.1f}s "
@@ -128,92 +150,113 @@ def validate_tts(wav_path: str) -> dict:
"rms_level": f"{stats['rms']:.3f}", "rms_level": f"{stats['rms']:.3f}",
"dc_offset": f"{stats['dc_offset']:.3f}", "dc_offset": f"{stats['dc_offset']:.3f}",
"artifact_count": len(artifacts), "artifact_count": len(artifacts),
"artifact_locations": [a['time'] for a in artifacts], "artifact_locations": [a["time"] for a in artifacts],
"artifact_severities": [a['severity'] for a in artifacts], "artifact_severities": [a["severity"] for a in artifacts],
"issues": issues, "issues": issues,
"valid": len(issues) == 0 "valid": len(issues) == 0,
}
except Exception as e:
return {
"file": wav_path,
"error": str(e),
"valid": False
} }
def generate_analysis_plots(wav_path: str, output_dir: str, validation_result: Dict[str, Any]): except Exception as e:
return {"file": wav_path, "error": str(e), "valid": False}
def generate_analysis_plots(
wav_path: str, output_dir: str, validation_result: Dict[str, Any]
):
""" """
Generate analysis plots for audio file with time-aligned visualizations. Generate analysis plots for audio file with time-aligned visualizations.
""" """
import matplotlib.pyplot as plt import matplotlib.pyplot as plt
from scipy.signal import spectrogram from scipy.signal import spectrogram
# Load audio # Load audio
audio, sr = sf.read(wav_path) audio, sr = sf.read(wav_path)
if len(audio.shape) > 1: if len(audio.shape) > 1:
audio = np.mean(audio, axis=1) audio = np.mean(audio, axis=1)
# Create figure with shared x-axis # Create figure with shared x-axis
fig = plt.figure(figsize=(15, 8)) fig = plt.figure(figsize=(15, 8))
gs = plt.GridSpec(2, 1, height_ratios=[1.2, 0.8], hspace=0.1) gs = plt.GridSpec(2, 1, height_ratios=[1.2, 0.8], hspace=0.1)
ax1 = fig.add_subplot(gs[0]) ax1 = fig.add_subplot(gs[0])
ax2 = fig.add_subplot(gs[1], sharex=ax1) ax2 = fig.add_subplot(gs[1], sharex=ax1)
# Calculate spectrogram # Calculate spectrogram
nperseg = 2048 nperseg = 2048
noverlap = 1536 noverlap = 1536
f, t, Sxx = spectrogram(audio, sr, nperseg=nperseg, noverlap=noverlap, f, t, Sxx = spectrogram(
window='hann', scaling='spectrum') audio, sr, nperseg=nperseg, noverlap=noverlap, window="hann", scaling="spectrum"
)
# Plot spectrogram # Plot spectrogram
im = ax1.pcolormesh(t, f, 10 * np.log10(Sxx + 1e-10), im = ax1.pcolormesh(
shading='gouraud', cmap='viridis', t,
vmin=-100, vmax=-20) f,
ax1.set_ylabel('Frequency [Hz]', fontsize=10) 10 * np.log10(Sxx + 1e-10),
cbar = plt.colorbar(im, ax=ax1, label='dB') shading="gouraud",
ax1.set_title('Spectrogram', pad=10, fontsize=12) cmap="viridis",
vmin=-100,
vmax=-20,
)
ax1.set_ylabel("Frequency [Hz]", fontsize=10)
cbar = plt.colorbar(im, ax=ax1, label="dB")
ax1.set_title("Spectrogram", pad=10, fontsize=12)
# Plot waveform with exact time alignment # Plot waveform with exact time alignment
times = np.arange(len(audio)) / sr times = np.arange(len(audio)) / sr
ax2.plot(times, audio, color='#2E5596', alpha=0.7, linewidth=0.5, label='Audio') ax2.plot(times, audio, color="#2E5596", alpha=0.7, linewidth=0.5, label="Audio")
ax2.set_ylabel('Amplitude', fontsize=10) ax2.set_ylabel("Amplitude", fontsize=10)
ax2.set_xlabel('Time [sec]', fontsize=10) ax2.set_xlabel("Time [sec]", fontsize=10)
ax2.grid(True, alpha=0.2) ax2.grid(True, alpha=0.2)
# Add artifact markers # Add artifact markers
if 'artifact_locations' in validation_result and validation_result['artifact_locations']: if (
for loc in validation_result['artifact_locations']: "artifact_locations" in validation_result
ax1.axvline(x=loc, color='red', alpha=0.7, linewidth=2) and validation_result["artifact_locations"]
ax2.axvline(x=loc, color='red', alpha=0.7, linewidth=2, label='Detected Artifacts') ):
for loc in validation_result["artifact_locations"]:
ax1.axvline(x=loc, color="red", alpha=0.7, linewidth=2)
ax2.axvline(
x=loc, color="red", alpha=0.7, linewidth=2, label="Detected Artifacts"
)
# Add legend to both plots # Add legend to both plots
if len(validation_result['artifact_locations']) > 0: if len(validation_result["artifact_locations"]) > 0:
ax1.plot([], [], color='red', linewidth=2, label='Detected Artifacts') ax1.plot([], [], color="red", linewidth=2, label="Detected Artifacts")
ax1.legend(loc='upper right', fontsize=8) ax1.legend(loc="upper right", fontsize=8)
# Only add unique labels to legend # Only add unique labels to legend
handles, labels = ax2.get_legend_handles_labels() handles, labels = ax2.get_legend_handles_labels()
unique_labels = dict(zip(labels, handles)) unique_labels = dict(zip(labels, handles))
ax2.legend(unique_labels.values(), unique_labels.keys(), ax2.legend(
loc='upper right', fontsize=8) unique_labels.values(),
unique_labels.keys(),
loc="upper right",
fontsize=8,
)
# Set common x limits # Set common x limits
xlim = (0, len(audio)/sr) xlim = (0, len(audio) / sr)
ax1.set_xlim(xlim) ax1.set_xlim(xlim)
ax2.set_xlim(xlim) ax2.set_xlim(xlim)
og_filename = Path(wav_path).name.split(".")[0] og_filename = Path(wav_path).name.split(".")[0]
# Save plot # Save plot
plt.savefig(Path(output_dir) / f"{og_filename}_audio_analysis.png", dpi=300, bbox_inches='tight') plt.savefig(
Path(output_dir) / f"{og_filename}_audio_analysis.png",
dpi=300,
bbox_inches="tight",
)
plt.close() plt.close()
if __name__ == "__main__":
wav_file = r"C:\Users\jerem\Desktop\Kokoro-FastAPI\examples\output.wav"
silent=False
if __name__ == "__main__":
wav_file = r"C:\Users\jerem\Desktop\Kokoro-FastAPI\examples\assorted_checks\benchmarks\output_audio\chunk_600_tokens.wav"
silent = False
print(f"\n\n Processing:\n\t{wav_file}")
result = validate_tts(wav_file) result = validate_tts(wav_file)
if not silent: if not silent:
wav_root_dir = Path(wav_file).parent wav_root_dir = Path(wav_file).parent
generate_analysis_plots(wav_file, wav_root_dir, result) generate_analysis_plots(wav_file, wav_root_dir, result)
print(f"\nValidating: {result['file']}") print(f"\nValidating: {result['file']}")
if "error" in result: if "error" in result:
print(f"Error: {result['error']}") print(f"Error: {result['error']}")
@ -224,10 +267,10 @@ if __name__ == "__main__":
print(f"RMS Level: {result['rms_level']}") print(f"RMS Level: {result['rms_level']}")
print(f"DC Offset: {result['dc_offset']}") print(f"DC Offset: {result['dc_offset']}")
print(f"Detected Artifacts: {result['artifact_count']}") print(f"Detected Artifacts: {result['artifact_count']}")
if result["issues"]: if result["issues"]:
print("\nIssues Found:") print("\nIssues Found:")
for issue in result["issues"]: for issue in result["issues"]:
print(f"- {issue}") print(f"- {issue}")
else: else:
print("\nNo issues found") print("\nNo issues found")

View file

@ -1,7 +1,9 @@
import argparse import argparse
from pathlib import Path from pathlib import Path
from validate_wav import validate_tts from validate_wav import validate_tts
def print_validation_result(result: dict, rel_path: Path): def print_validation_result(result: dict, rel_path: Path):
"""Print full validation details for a single file.""" """Print full validation details for a single file."""
print(f"\nValidating: {rel_path}") print(f"\nValidating: {rel_path}")
@ -13,7 +15,7 @@ def print_validation_result(result: dict, rel_path: Path):
print(f"Peak Amplitude: {result['peak_amplitude']}") print(f"Peak Amplitude: {result['peak_amplitude']}")
print(f"RMS Level: {result['rms_level']}") print(f"RMS Level: {result['rms_level']}")
print(f"DC Offset: {result['dc_offset']}") print(f"DC Offset: {result['dc_offset']}")
if result["issues"]: if result["issues"]:
print("\nIssues Found:") print("\nIssues Found:")
for issue in result["issues"]: for issue in result["issues"]:
@ -21,25 +23,26 @@ def print_validation_result(result: dict, rel_path: Path):
else: else:
print("\nNo issues found") print("\nNo issues found")
def validate_directory(directory: str): def validate_directory(directory: str):
"""Validate all wav files in a directory with detailed output and summary.""" """Validate all wav files in a directory with detailed output and summary."""
dir_path = Path(directory) dir_path = Path(directory)
# Find all wav files (including nested directories) # Find all wav files (including nested directories)
wav_files = list(dir_path.rglob("*.wav")) wav_files = list(dir_path.rglob("*.wav"))
wav_files.extend(dir_path.rglob("*.mp3")) # Also check mp3s wav_files.extend(dir_path.rglob("*.mp3")) # Also check mp3s
wav_files = sorted(wav_files) wav_files = sorted(wav_files)
if not wav_files: if not wav_files:
print(f"No .wav or .mp3 files found in {directory}") print(f"No .wav or .mp3 files found in {directory}")
return return
print(f"Found {len(wav_files)} files in {directory}") print(f"Found {len(wav_files)} files in {directory}")
print("=" * 80) print("=" * 80)
# Store results for summary # Store results for summary
results = [] results = []
# Detailed validation output # Detailed validation output
for wav_file in wav_files: for wav_file in wav_files:
result = validate_tts(str(wav_file)) result = validate_tts(str(wav_file))
@ -47,7 +50,7 @@ def validate_directory(directory: str):
print_validation_result(result, rel_path) print_validation_result(result, rel_path)
results.append((rel_path, result)) results.append((rel_path, result))
print("=" * 80) print("=" * 80)
# Summary with detailed issues # Summary with detailed issues
print("\nSUMMARY:") print("\nSUMMARY:")
for rel_path, result in results: for rel_path, result in results:
@ -58,15 +61,18 @@ def validate_directory(directory: str):
issues = result["issues"] issues = result["issues"]
first_issue = issues[0].replace("WARNING: ", "") first_issue = issues[0].replace("WARNING: ", "")
if len(issues) > 1: if len(issues) > 1:
print(f"{rel_path}: FAIL - {first_issue} (+{len(issues)-1} more issues)") print(
f"{rel_path}: FAIL - {first_issue} (+{len(issues)-1} more issues)"
)
else: else:
print(f"{rel_path}: FAIL - {first_issue}") print(f"{rel_path}: FAIL - {first_issue}")
else: else:
print(f"{rel_path}: PASS") print(f"{rel_path}: PASS")
if __name__ == "__main__": if __name__ == "__main__":
parser = argparse.ArgumentParser(description="Batch validate TTS wav files") parser = argparse.ArgumentParser(description="Batch validate TTS wav files")
parser.add_argument("directory", help="Directory containing wav files to validate") parser.add_argument("directory", help="Directory containing wav files to validate")
args = parser.parse_args() args = parser.parse_args()
validate_directory(args.directory) validate_directory(args.directory)
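Typical use is `python validate_wavs.py <directory>`, but the entry point can also be called directly. A short sketch (the directory name is illustrative):

```python
from validate_wavs import validate_directory

# Recursively validates every .wav/.mp3 under the folder,
# printing per-file details followed by a PASS/FAIL summary
validate_directory("output_audio")
```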

View file

@@ -13,7 +13,7 @@ numpy==2.2.1
scipy==1.14.1

# Audio processing
soundfile==0.13.0  # was 0.12.1

# Text processing
phonemizer==3.3.0
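The soundfile pin moves from 0.12.1 to 0.13.0. A quick round-trip sanity check after upgrading, as a sketch (the tone and filename are arbitrary; 24 kHz matches the sample rate used elsewhere in the project):

```python
import numpy as np
import soundfile as sf

print(sf.__version__)  # expect 0.13.0 after the upgrade

# Write and re-read one second of a 440 Hz sine at 24 kHz
sr = 24000
t = np.linspace(0, 1.0, sr, endpoint=False)
tone = 0.5 * np.sin(2 * np.pi * 440 * t)
sf.write("roundtrip.wav", tone, sr)
audio, sr_read = sf.read("roundtrip.wav")
assert sr_read == sr and len(audio) == len(tone)
```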