Merge pull request #9 from remsky/feat/streaming

- Added streaming support
- Improved model warmup operations
- Minor optimizations to inference model structure
- Chunking configurations added
remsky 2025-01-06 03:53:50 -07:00 committed by GitHub
commit ab8e3c98f6
76 changed files with 6247 additions and 4141 deletions

BIN
.coverage

Binary file not shown.

3
.gitignore vendored
View file

@ -23,4 +23,7 @@ examples/assorted_checks/test_openai/output/*
examples/assorted_checks/test_voices/output/*
examples/assorted_checks/test_formats/output/*
examples/assorted_checks/benchmarks/output_audio_stream/*
ui/RepoScreenshot.png
examples/assorted_checks/benchmarks/output_audio_stream_openai/*

View file

@ -10,12 +10,12 @@
Dockerized FastAPI wrapper for [Kokoro-82M](https://huggingface.co/hexgrad/Kokoro-82M) text-to-speech model
- OpenAI-compatible Speech endpoint, with voice combination functionality
- NVIDIA GPU accelerated inference (or CPU) option
- very fast generation time (~35x real time generation speed via 4060Ti)
- very fast generation time (~30x real time speed via 4060Ti)
- automatic chunking/stitching for long texts
- streaming support w/ variable chunking to control latency
- simple audio generation web ui utility
## Quick Start
The service can be accessed through either the API endpoints or the Gradio web interface.
@ -129,7 +129,7 @@ response = requests.post(
)
```
<p align="center">
<img src="examples/benchmarks/analysis_comparison.png" width="80%" alt="Voice Analysis Comparison" style="border: 2px solid #333; padding: 10px;">
<img src="assets/voice_analysis.png" width="80%" alt="Voice Analysis Comparison" style="border: 2px solid #333; padding: 10px;">
</p>
</details>
@ -144,7 +144,7 @@ response = requests.post(
- pcm
<p align="center">
<img src="examples/benchmarks/format_comparison.png" width="80%" alt="Audio Format Comparison" style="border: 2px solid #333; padding: 10px;">
<img src="assets/format_comparison.png" width="80%" alt="Audio Format Comparison" style="border: 2px solid #333; padding: 10px;">
</p>
</details>
@ -162,6 +162,76 @@ If you only want the API, just comment out everything in the docker-compose.yml
Currently, voices created via the API are accessible here, but voice combination/creation has not yet been added.
</details>
<details>
<summary>Streaming Support</summary>
```python
# OpenAI-compatible streaming
from openai import OpenAI
client = OpenAI(
base_url="http://localhost:8880/v1", api_key="not-needed")
# Stream to file
with client.audio.speech.with_streaming_response.create(
model="kokoro",
voice="af_bella",
input="Hello world!"
) as response:
response.stream_to_file("output.mp3")
# Stream to speakers (requires PyAudio)
import pyaudio
player = pyaudio.PyAudio().open(
format=pyaudio.paInt16,
channels=1,
rate=24000,
output=True
)
with client.audio.speech.with_streaming_response.create(
model="kokoro",
voice="af_bella",
response_format="pcm",
input="Hello world!"
) as response:
for chunk in response.iter_bytes(chunk_size=1024):
player.write(chunk)
```
Or via requests:
```python
import requests
response = requests.post(
"http://localhost:8880/v1/audio/speech",
json={
"input": "Hello world!",
"voice": "af_bella",
"response_format": "pcm"
},
stream=True
)
for chunk in response.iter_content(chunk_size=1024):
if chunk:
# Process streaming chunks
pass
```
<p align="center">
<img src="assets/gpu_first_token_timeline_openai.png" width="45%" alt="GPU First Token Timeline" style="border: 2px solid #333; padding: 10px; margin-right: 1%;">
<img src="assets/cpu_first_token_timeline_stream_openai.png" width="45%" alt="CPU First Token Timeline" style="border: 2px solid #333; padding: 10px;">
</p>
Key Streaming Metrics:
- First token latency @ chunk size
- ~300ms (GPU) @ 400
- ~3500ms (CPU) @ 200
- Adjustable chunking settings for real-time playback
*Note: Artifacts in intonation can increase with smaller chunks*
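For reference, time-to-first-chunk can be measured client-side with a short script along these lines (a rough sketch against the same endpoint and payload as above, mirroring the first-token benchmark script included in this change):
```python
import time
import requests

start = time.time()
response = requests.post(
    "http://localhost:8880/v1/audio/speech",
    json={
        "input": "Hello world!",
        "voice": "af_bella",
        "response_format": "pcm",
        "stream": True,
    },
    stream=True,
)
response.raise_for_status()

first_chunk_time = None
for chunk in response.iter_content(chunk_size=1024):
    if chunk and first_chunk_time is None:
        first_chunk_time = time.time() - start
        print(f"Time to first audio chunk: {first_chunk_time * 1000:.0f}ms")
```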
</details>
## Processing Details
<details>
<summary>Performance Benchmarks</summary>
@ -175,8 +245,8 @@ Benchmarking was performed on generation via the local API using text lengths up
- H.G. Wells - The Time Machine (full text)
<p align="center">
<img src="examples/benchmarks/processing_time.png" width="45%" alt="Processing Time" style="border: 2px solid #333; padding: 10px; margin-right: 1%;">
<img src="examples/benchmarks/realtime_factor.png" width="45%" alt="Realtime Factor" style="border: 2px solid #333; padding: 10px;">
<img src="assets/gpu_processing_time.png" width="45%" alt="Processing Time" style="border: 2px solid #333; padding: 10px; margin-right: 1%;">
<img src="assets/gpu_realtime_factor.png" width="45%" alt="Realtime Factor" style="border: 2px solid #333; padding: 10px;">
</p>
Key Performance Metrics:

View file

@ -18,6 +18,8 @@ class Settings(BaseSettings):
onnx_model_path: str = "kokoro-v0_19.onnx"
voices_dir: str = "voices"
sample_rate: int = 24000
max_chunk_size: int = 300 # Maximum size of text chunks for processing
gap_trim_ms: int = 250 # Amount to trim from streaming chunk ends in milliseconds
# ONNX Optimization Settings
onnx_num_threads: int = 4 # Number of threads for intra-op parallelism
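Since `Settings` is a pydantic `BaseSettings` class, the new chunking knobs should be overridable via environment variables. A small sketch, assuming pydantic's default field-name mapping (the `MAX_CHUNK_SIZE`/`GAP_TRIM_MS` names are not spelled out elsewhere in this diff):
```python
import os

# Hypothetical overrides; smaller chunks trade some intonation quality for lower latency.
os.environ["MAX_CHUNK_SIZE"] = "200"
os.environ["GAP_TRIM_MS"] = "100"

from api.src.core.config import Settings  # path per this repo's layout

settings = Settings()
print(settings.max_chunk_size, settings.gap_trim_ms)  # 200 100, if the mapping holds
```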

View file

@ -0,0 +1,9 @@
In a village of La Mancha, the name of which I have no desire to call
to mind, there lived not long since one of those gentlemen that keep a
lance in the lance-rack, an old buckler, a lean hack, and a greyhound
for coursing. An olla of rather more beef than mutton, a salad on most
nights, scraps on Saturdays, lentils on Fridays, and a pigeon or so
extra on Sundays, made away with three-quarters of his income. The rest
of it went in a doublet of fine cloth and velvet breeches and shoes to
match for holidays, while on week-days he made a brave figure in his
best homespun.

View file

@ -22,9 +22,28 @@ async def lifespan(app: FastAPI):
logger.info("Loading TTS model and voice packs...")
# Initialize the main model with warm-up
voicepack_count = TTSModel.setup()
logger.info(f"Model loaded and warmed up on {TTSModel.get_device()}")
logger.info(f"{voicepack_count} voice packs loaded successfully")
voicepack_count = await TTSModel.setup()
# boundary = "█████╗"*9
boundary = "░" * 24
startup_msg = f"""
{boundary}
{boundary}
"""
# TODO: Improve CPU warmup, threads, memory, etc
startup_msg += f"\nModel warmed up on {TTSModel.get_device()}"
startup_msg += f"\n{voicepack_count} voice packs loaded\n"
startup_msg += f"\n{boundary}\n"
logger.info(startup_msg)
yield

View file

@ -2,10 +2,12 @@ from typing import List
from loguru import logger
from fastapi import Depends, Response, APIRouter, HTTPException
from fastapi import Header
from fastapi.responses import StreamingResponse
from ..services.tts_service import TTSService
from ..services.audio import AudioService
from ..structures.schemas import OpenAISpeechRequest
from typing import AsyncGenerator
router = APIRouter(
tags=["OpenAI Compatible TTS"],
@ -18,9 +20,23 @@ def get_tts_service() -> TTSService:
return TTSService() # Initialize TTSService with default settings
async def stream_audio_chunks(tts_service: TTSService, request: OpenAISpeechRequest) -> AsyncGenerator[bytes, None]:
"""Stream audio chunks as they're generated"""
async for chunk in tts_service.generate_audio_stream(
text=request.input,
voice=request.voice,
speed=request.speed,
output_format=request.response_format
):
yield chunk
@router.post("/audio/speech")
async def create_speech(
request: OpenAISpeechRequest, tts_service: TTSService = Depends(get_tts_service)
request: OpenAISpeechRequest,
tts_service: TTSService = Depends(get_tts_service),
x_raw_response: str = Header(None, alias="x-raw-response"),
):
"""OpenAI-compatible endpoint for text-to-speech"""
try:
@ -31,24 +47,53 @@ async def create_speech(
f"Voice '{request.voice}' not found. Available voices: {', '.join(sorted(available_voices))}"
)
# Generate audio directly using TTSService's method
audio, _ = tts_service._generate_audio(
text=request.input,
voice=request.voice,
speed=request.speed,
stitch_long_output=True,
)
# Set content type based on format
content_type = {
"mp3": "audio/mpeg",
"opus": "audio/opus",
"aac": "audio/aac",
"flac": "audio/flac",
"wav": "audio/wav",
"pcm": "audio/pcm",
}.get(request.response_format, f"audio/{request.response_format}")
# Convert to requested format
content = AudioService.convert_audio(audio, 24000, request.response_format)
# Check if streaming is requested (default for OpenAI client)
if request.stream:
# Stream audio chunks as they're generated
return StreamingResponse(
stream_audio_chunks(tts_service, request),
media_type=content_type,
headers={
"Content-Disposition": f"attachment; filename=speech.{request.response_format}",
"X-Accel-Buffering": "no", # Disable proxy buffering
"Cache-Control": "no-cache", # Prevent caching
},
)
else:
# Generate complete audio
audio, _ = tts_service._generate_audio(
text=request.input,
voice=request.voice,
speed=request.speed,
stitch_long_output=True,
)
return Response(
content=content,
media_type=f"audio/{request.response_format}",
headers={
"Content-Disposition": f"attachment; filename=speech.{request.response_format}"
},
)
# Convert to requested format
content = AudioService.convert_audio(
audio,
24000,
request.response_format,
is_first_chunk=True,
stream=False)
return Response(
content=content,
media_type=content_type,
headers={
"Content-Disposition": f"attachment; filename=speech.{request.response_format}",
"Cache-Control": "no-cache", # Prevent caching
},
)
except ValueError as e:
logger.error(f"Invalid request: {str(e)}")

View file

@ -4,15 +4,61 @@ from io import BytesIO
import numpy as np
import soundfile as sf
import scipy.io.wavfile as wavfile
from loguru import logger
from ..core.config import settings
class AudioNormalizer:
"""Handles audio normalization state for a single stream"""
def __init__(self):
self.int16_max = np.iinfo(np.int16).max
self.chunk_trim_ms = settings.gap_trim_ms
self.sample_rate = 24000 # Sample rate of the audio
self.samples_to_trim = int(self.chunk_trim_ms * self.sample_rate / 1000)
def normalize(self, audio_data: np.ndarray, is_last_chunk: bool = False) -> np.ndarray:
"""Normalize audio data to int16 range and trim chunk boundaries"""
# Convert to float32 if not already
audio_float = audio_data.astype(np.float32)
# Normalize to [-1, 1] range first
if np.max(np.abs(audio_float)) > 0:
audio_float = audio_float / np.max(np.abs(audio_float))
# Trim end of non-final chunks to reduce gaps
if not is_last_chunk and len(audio_float) > self.samples_to_trim:
audio_float = audio_float[:-self.samples_to_trim]
# Scale to int16 range
return (audio_float * self.int16_max).astype(np.int16)
class AudioService:
"""Service for audio format conversions"""
# Default audio format settings balanced for speed and compression
DEFAULT_SETTINGS = {
"mp3": {
"bitrate_mode": "CONSTANT", # Faster than variable bitrate
"compression_level": 0.0, # Balanced compression
},
"opus": {
"compression_level": 0.0, # Good balance for speech
},
"flac": {
"compression_level": 0.0, # Light compression, still fast
}
}
@staticmethod
def convert_audio(
audio_data: np.ndarray, sample_rate: int, output_format: str
audio_data: np.ndarray,
sample_rate: int,
output_format: str,
is_first_chunk: bool = True,
is_last_chunk: bool = False,
normalizer: AudioNormalizer = None,
format_settings: dict = None,
stream: bool = True
) -> bytes:
"""Convert audio data to specified format
@ -20,6 +66,20 @@ class AudioService:
audio_data: Numpy array of audio samples
sample_rate: Sample rate of the audio
output_format: Target format (wav, mp3, opus, flac, pcm)
is_first_chunk: Whether this is the first chunk of a stream
normalizer: Optional AudioNormalizer instance for consistent normalization across chunks
format_settings: Optional dict of format-specific settings to override defaults
Example: {
"mp3": {
"bitrate_mode": "VARIABLE",
"compression_level": 0.8
}
}
Default settings balance speed and compression:
optimized for low-latency localhost use at compression level 0.0
- MP3: constant bitrate, no compression (0.0)
- OPUS: no compression (0.0)
- FLAC: no compression (0.0)
Returns:
Bytes of the converted audio
@ -27,34 +87,58 @@ class AudioService:
buffer = BytesIO()
try:
if output_format == "wav":
logger.info("Writing to WAV format...")
# Ensure audio_data is in int16 format for WAV
audio_data_wav = (
audio_data / np.abs(audio_data).max() * np.iinfo(np.int16).max
).astype(np.int16) # Normalize
sf.write(buffer, audio_data_wav, sample_rate, format="WAV")
elif output_format == "mp3":
logger.info("Converting to MP3 format...")
# soundfile can write MP3 if ffmpeg or libsox is installed
sf.write(buffer, audio_data, sample_rate, format="MP3")
elif output_format == "opus":
logger.info("Converting to Opus format...")
sf.write(buffer, audio_data, sample_rate, format="OGG", subtype="OPUS")
elif output_format == "flac":
logger.info("Converting to FLAC format...")
sf.write(buffer, audio_data, sample_rate, format="FLAC")
elif output_format == "pcm":
logger.info("Extracting PCM data...")
# Ensure audio_data is in int16 format for PCM
audio_data_pcm = (
audio_data / np.abs(audio_data).max() * np.iinfo(np.int16).max
).astype(np.int16) # Normalize
buffer.write(audio_data_pcm.tobytes())
# Always normalize audio to ensure proper amplitude scaling
if stream:
if normalizer is None:
normalizer = AudioNormalizer()
normalized_audio = normalizer.normalize(audio_data, is_last_chunk=is_last_chunk)
else:
raise ValueError(
f"Format {output_format} not supported. Supported formats are: wav, mp3, opus, flac, pcm."
)
normalized_audio = audio_data
if output_format == "pcm":
# Raw 16-bit PCM samples, no header
buffer.write(normalized_audio.tobytes())
elif output_format == "wav":
if stream:
# Use soundfile for streaming to ensure proper headers
sf.write(buffer, normalized_audio, sample_rate, format="WAV", subtype='PCM_16')
else:
# Trying scipy.io.wavfile for non-streaming WAV generation
# seems faster than soundfile
# avoids overhead from header generation and PCM encoding
wavfile.write(buffer, sample_rate, normalized_audio)
elif output_format == "mp3":
# Use format settings or defaults
settings = format_settings.get("mp3", {}) if format_settings else {}
settings = {**AudioService.DEFAULT_SETTINGS["mp3"], **settings}
sf.write(
buffer, normalized_audio,
sample_rate, format="MP3",
**settings
)
elif output_format == "opus":
settings = format_settings.get("opus", {}) if format_settings else {}
settings = {**AudioService.DEFAULT_SETTINGS["opus"], **settings}
sf.write(buffer, normalized_audio, sample_rate, format="OGG",
subtype="OPUS", **settings)
elif output_format == "flac":
if is_first_chunk:
logger.info("Starting FLAC stream...")
settings = format_settings.get("flac", {}) if format_settings else {}
settings = {**AudioService.DEFAULT_SETTINGS["flac"], **settings}
sf.write(buffer, normalized_audio, sample_rate, format="FLAC",
subtype='PCM_16', **settings)
else:
if output_format == "aac":
raise ValueError(
"Format aac not supported. Supported formats are: wav, mp3, opus, flac, pcm."
)
else:
raise ValueError(
f"Format {output_format} not supported. Supported formats are: wav, mp3, opus, flac, pcm."
)
buffer.seek(0)
return buffer.getvalue()
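A hedged sketch of the intended calling pattern for streaming: a single `AudioNormalizer` is shared across all chunks of one stream so levels stay consistent and the ends of non-final chunks get trimmed (dummy noise stands in for model output; the import path follows the test layout in this PR):
```python
import numpy as np

from api.src.services.audio import AudioNormalizer, AudioService

# One normalizer per stream, reused for every chunk.
normalizer = AudioNormalizer()
chunks = [np.random.randn(24000).astype(np.float32) for _ in range(3)]  # 3 x 1s of noise

encoded = []
for i, chunk in enumerate(chunks):
    encoded.append(
        AudioService.convert_audio(
            chunk,
            24000,
            "pcm",
            is_first_chunk=(i == 0),
            is_last_chunk=(i == len(chunks) - 1),
            normalizer=normalizer,
        )
    )
# Non-final chunks come back gap_trim_ms shorter; the last chunk keeps its full length.
```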

View file

@ -0,0 +1,52 @@
"""Text chunking service"""
import re
from ...core.config import settings
def split_text(text: str, max_chunk=None):
"""Split text into chunks on natural pause points
Args:
text: Text to split into chunks
max_chunk: Maximum chunk size (defaults to settings.max_chunk_size)
"""
if max_chunk is None:
max_chunk = settings.max_chunk_size
if not isinstance(text, str):
text = str(text) if text is not None else ""
text = text.strip()
if not text:
return
# First split into sentences
sentences = re.split(r"(?<=[.!?])\s+", text)
for sentence in sentences:
sentence = sentence.strip()
if not sentence:
continue
# For medium-length sentences, split on punctuation
if len(sentence) > max_chunk: # Lower threshold for more consistent sizes
# First try splitting on semicolons and colons
parts = re.split(r"(?<=[;:])\s+", sentence)
for part in parts:
part = part.strip()
if not part:
continue
# If part is still long, split on commas
if len(part) > max_chunk:
subparts = re.split(r"(?<=,)\s+", part)
for subpart in subparts:
subpart = subpart.strip()
if subpart:
yield subpart
else:
yield part
else:
yield sentence
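A quick usage sketch of this generator (same import path the new `test_chunker.py` uses; the sample text and `max_chunk` value are only illustrative):
```python
from api.src.services.text_processing import chunker

text = "First sentence. Second sentence! A long clause, with commas, goes here."
for piece in chunker.split_text(text, max_chunk=30):
    print(repr(piece))
# Short sentences are yielded whole; the long one is split again on its commas.
```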

View file

@ -1,4 +1,5 @@
import re
from functools import lru_cache
def split_num(num: re.Match) -> str:
"""Handle number splitting for various formats"""
@ -48,6 +49,7 @@ def handle_decimal(num: re.Match) -> str:
a, b = num.group().split(".")
return " point ".join([a, " ".join(b)])
# @lru_cache(maxsize=1000) # Cache normalized text results
def normalize_text(text: str) -> str:
"""Normalize text for TTS processing

View file

@ -15,7 +15,7 @@ class TTSBaseModel(ABC):
VOICES_DIR = os.path.join(os.path.dirname(os.path.dirname(__file__)), "voices")
@classmethod
def setup(cls):
async def setup(cls):
"""Initialize model and setup voices"""
with cls._lock:
# Set device
@ -59,19 +59,23 @@ class TTSBaseModel(ABC):
except Exception as e:
logger.error(f"Error copying voice {voice_name}: {str(e)}")
# Warm up with default voice
# Load warmup text
try:
dummy_text = "Hello"
voice_path = os.path.join(cls.VOICES_DIR, "af.pt")
dummy_voicepack = torch.load(voice_path, map_location=cls._device, weights_only=True)
# Process text and generate audio
phonemes, tokens = cls.process_text(dummy_text, "a")
cls.generate_from_tokens(tokens, dummy_voicepack, 1.0)
logger.info("Model warm-up complete")
with open(os.path.join(os.path.dirname(os.path.dirname(__file__)), "core", "don_quixote.txt")) as f:
warmup_text = f.read()
except Exception as e:
logger.warning(f"Model warm-up failed: {e}")
logger.warning(f"Failed to load warmup text: {e}")
warmup_text = "This is a warmup text that will be split into chunks for processing."
# Use warmup service
from .warmup import WarmupService
warmup = WarmupService()
# Load and warm up voices
loaded_voices = warmup.load_voices()
await warmup.warmup_voices(warmup_text, loaded_voices)
logger.info("Model warm-up complete")
# Count voices in directory
voice_count = len([f for f in os.listdir(cls.VOICES_DIR) if f.endswith(".pt")])

View file

@ -1,6 +1,7 @@
import os
import numpy as np
import torch
import time
from loguru import logger
from models import build_model
from .text_processing import phonemize, tokenize
@ -8,42 +9,97 @@ from .text_processing import phonemize, tokenize
from .tts_base import TTSBaseModel
from ..core.config import settings
# @torch.no_grad()
# def forward(model, tokens, ref_s, speed):
# """Forward pass through the model"""
# device = ref_s.device
# tokens = torch.LongTensor([[0, *tokens, 0]]).to(device)
# input_lengths = torch.LongTensor([tokens.shape[-1]]).to(device)
# text_mask = length_to_mask(input_lengths).to(device)
# bert_dur = model.bert(tokens, attention_mask=(~text_mask).int())
# d_en = model.bert_encoder(bert_dur).transpose(-1, -2)
# s = ref_s[:, 128:]
# d = model.predictor.text_encoder(d_en, s, input_lengths, text_mask)
# x, _ = model.predictor.lstm(d)
# duration = model.predictor.duration_proj(x)
# duration = torch.sigmoid(duration).sum(axis=-1) / speed
# pred_dur = torch.round(duration).clamp(min=1).long()
# pred_aln_trg = torch.zeros(input_lengths, pred_dur.sum().item())
# c_frame = 0
# for i in range(pred_aln_trg.size(0)):
# pred_aln_trg[i, c_frame : c_frame + pred_dur[0, i].item()] = 1
# c_frame += pred_dur[0, i].item()
# en = d.transpose(-1, -2) @ pred_aln_trg.unsqueeze(0).to(device)
# F0_pred, N_pred = model.predictor.F0Ntrain(en, s)
# t_en = model.text_encoder(tokens, input_lengths, text_mask)
# asr = t_en @ pred_aln_trg.unsqueeze(0).to(device)
# return model.decoder(asr, F0_pred, N_pred, ref_s[:, :128]).squeeze().cpu().numpy()
@torch.no_grad()
def forward(model, tokens, ref_s, speed):
"""Forward pass through the model"""
"""Forward pass through the model with light optimizations that preserve output quality"""
device = ref_s.device
# Keep original token handling but optimize device placement
tokens = torch.LongTensor([[0, *tokens, 0]]).to(device)
input_lengths = torch.LongTensor([tokens.shape[-1]]).to(device)
text_mask = length_to_mask(input_lengths).to(device)
# BERT and encoder pass
bert_dur = model.bert(tokens, attention_mask=(~text_mask).int())
d_en = model.bert_encoder(bert_dur).transpose(-1, -2)
s = ref_s[:, 128:]
d = model.predictor.text_encoder(d_en, s, input_lengths, text_mask)
# Split reference signal once for efficiency
s_content = ref_s[:, 128:]
s_ref = ref_s[:, :128]
# Predictor forward pass
d = model.predictor.text_encoder(d_en, s_content, input_lengths, text_mask)
x, _ = model.predictor.lstm(d)
# Duration prediction - keeping original logic
duration = model.predictor.duration_proj(x)
duration = torch.sigmoid(duration).sum(axis=-1) / speed
pred_dur = torch.round(duration).clamp(min=1).long()
pred_aln_trg = torch.zeros(input_lengths, pred_dur.sum().item())
# Alignment matrix construction - keeping original approach for quality
pred_aln_trg = torch.zeros(input_lengths, pred_dur.sum().item(), device=device)
c_frame = 0
for i in range(pred_aln_trg.size(0)):
pred_aln_trg[i, c_frame : c_frame + pred_dur[0, i].item()] = 1
pred_aln_trg[i, c_frame:c_frame + pred_dur[0, i].item()] = 1
c_frame += pred_dur[0, i].item()
en = d.transpose(-1, -2) @ pred_aln_trg.unsqueeze(0).to(device)
F0_pred, N_pred = model.predictor.F0Ntrain(en, s)
# Matrix multiplications - reuse unsqueezed tensor
pred_aln_trg = pred_aln_trg.unsqueeze(0) # Do unsqueeze once
en = d.transpose(-1, -2) @ pred_aln_trg
F0_pred, N_pred = model.predictor.F0Ntrain(en, s_content)
# Text encoding and final decoding
t_en = model.text_encoder(tokens, input_lengths, text_mask)
asr = t_en @ pred_aln_trg.unsqueeze(0).to(device)
return model.decoder(asr, F0_pred, N_pred, ref_s[:, :128]).squeeze().cpu().numpy()
asr = t_en @ pred_aln_trg
return model.decoder(asr, F0_pred, N_pred, s_ref).squeeze().cpu().numpy()
# def length_to_mask(lengths):
# """Create attention mask from lengths"""
# mask = (
# torch.arange(lengths.max())
# .unsqueeze(0)
# .expand(lengths.shape[0], -1)
# .type_as(lengths)
# )
# mask = torch.gt(mask + 1, lengths.unsqueeze(1))
# return mask
def length_to_mask(lengths):
"""Create attention mask from lengths"""
mask = (
torch.arange(lengths.max())
.unsqueeze(0)
.expand(lengths.shape[0], -1)
.type_as(lengths)
)
mask = torch.gt(mask + 1, lengths.unsqueeze(1))
return mask
"""Create attention mask from lengths - possibly optimized version"""
max_len = lengths.max()
# Create mask directly on the same device as lengths
mask = torch.arange(max_len, device=lengths.device)[None, :].expand(lengths.shape[0], -1)
# Avoid type_as by using the correct dtype from the start
if lengths.dtype != mask.dtype:
mask = mask.to(dtype=lengths.dtype)
# Fuse operations using broadcasting
return mask + 1 > lengths[:, None]
class TTSGPUModel(TTSBaseModel):
_instance = None

View file

@ -3,26 +3,29 @@ import os
import re
import time
from typing import List, Tuple, Optional
from functools import lru_cache
import numpy as np
import torch
import scipy.io.wavfile as wavfile
from .text_processing import normalize_text
from .text_processing import normalize_text, chunker
from loguru import logger
from ..core.config import settings
from .tts_model import TTSModel
from .audio import AudioService, AudioNormalizer
class TTSService:
def __init__(self, output_dir: str = None):
self.output_dir = output_dir
def _split_text(self, text: str) -> List[str]:
"""Split text into sentences"""
if not isinstance(text, str):
text = str(text) if text is not None else ""
return [s.strip() for s in re.split(r"(?<=[.!?])\s+", text) if s.strip()]
@staticmethod
@lru_cache(maxsize=20)  # Cache up to 20 most recently used voices
def _load_voice(voice_path: str) -> torch.Tensor:
"""Load and cache a voice model"""
return torch.load(voice_path, map_location=TTSModel.get_device(), weights_only=True)
def _get_voice_path(self, voice_name: str) -> Optional[str]:
"""Get the path to a voice file"""
@ -31,6 +34,13 @@ class TTSService:
def _generate_audio(
self, text: str, voice: str, speed: float, stitch_long_output: bool = True
) -> Tuple[torch.Tensor, float]:
"""Generate complete audio and return with processing time"""
audio, processing_time = self._generate_audio_internal(text, voice, speed, stitch_long_output)
return audio, processing_time
def _generate_audio_internal(
self, text: str, voice: str, speed: float, stitch_long_output: bool = True
) -> Tuple[torch.Tensor, float]:
"""Generate audio and measure processing time"""
start_time = time.time()
@ -49,42 +59,42 @@ class TTSService:
if not voice_path:
raise ValueError(f"Voice not found: {voice}")
# Load voice
voicepack = torch.load(
voice_path, map_location=TTSModel.get_device(), weights_only=True
)
# Load voice using cached loader
voicepack = self._load_voice(voice_path)
# Generate audio with or without stitching
# For non-streaming, preprocess all chunks first
if stitch_long_output:
chunks = self._split_text(text)
audio_chunks = []
# Process all chunks
for i, chunk in enumerate(chunks):
# Preprocess all chunks to phonemes/tokens
chunks_data = []
for chunk in chunker.split_text(text):
try:
# Process text and generate audio
phonemes, tokens = TTSModel.process_text(chunk, voice[0])
chunks_data.append((chunk, tokens))
except Exception as e:
logger.error(f"Failed to process chunk: '{chunk}'. Error: {str(e)}")
continue
if not chunks_data:
raise ValueError("No chunks were processed successfully")
# Generate audio for all chunks
audio_chunks = []
for chunk, tokens in chunks_data:
try:
chunk_audio = TTSModel.generate_from_tokens(tokens, voicepack, speed)
if chunk_audio is not None:
audio_chunks.append(chunk_audio)
else:
logger.error(f"No audio generated for chunk {i + 1}/{len(chunks)}")
logger.error(f"No audio generated for chunk: '{chunk}'")
except Exception as e:
logger.error(
f"Failed to generate audio for chunk {i + 1}/{len(chunks)}: '{chunk}'. Error: {str(e)}"
)
logger.error(f"Failed to generate audio for chunk: '{chunk}'. Error: {str(e)}")
continue
if not audio_chunks:
raise ValueError("No audio chunks were generated successfully")
audio = (
np.concatenate(audio_chunks)
if len(audio_chunks) > 1
else audio_chunks[0]
)
# Concatenate all chunks
audio = np.concatenate(audio_chunks) if len(audio_chunks) > 1 else audio_chunks[0]
else:
# Process single chunk
phonemes, tokens = TTSModel.process_text(text, voice[0])
@ -97,6 +107,99 @@ class TTSService:
logger.error(f"Error in audio generation: {str(e)}")
raise
async def generate_audio_stream(
self, text: str, voice: str, speed: float, output_format: str = "wav", silent=False
):
"""Generate and yield audio chunks as they're generated for real-time streaming"""
try:
stream_start = time.time()
# Create normalizer for consistent audio levels
stream_normalizer = AudioNormalizer()
# Input validation and preprocessing
if not text:
raise ValueError("Text is empty")
preprocess_start = time.time()
normalized = normalize_text(text)
if not normalized:
raise ValueError("Text is empty after preprocessing")
text = str(normalized)
logger.debug(f"Text preprocessing took: {(time.time() - preprocess_start)*1000:.1f}ms")
# Voice validation and loading
voice_start = time.time()
voice_path = self._get_voice_path(voice)
if not voice_path:
raise ValueError(f"Voice not found: {voice}")
voicepack = self._load_voice(voice_path)
logger.debug(f"Voice loading took: {(time.time() - voice_start)*1000:.1f}ms")
# Process chunks as they're generated
is_first = True
chunks_processed = 0
# last_chunk_end = time.time()
# Process chunks as they come from generator
chunk_gen = chunker.split_text(text)
current_chunk = next(chunk_gen, None)
while current_chunk is not None:
next_chunk = next(chunk_gen, None) # Peek at next chunk
# chunk_start = time.time()
chunks_processed += 1
try:
# Process text and generate audio
# text_process_start = time.time()
phonemes, tokens = TTSModel.process_text(current_chunk, voice[0])
# text_process_time = time.time() - text_process_start
# audio_gen_start = time.time()
chunk_audio = TTSModel.generate_from_tokens(tokens, voicepack, speed)
# audio_gen_time = time.time() - audio_gen_start
if chunk_audio is not None:
# Convert chunk with proper header handling
convert_start = time.time()
chunk_bytes = AudioService.convert_audio(
chunk_audio,
24000,
output_format,
is_first_chunk=is_first,
normalizer=stream_normalizer,
is_last_chunk=(next_chunk is None) # Last if no next chunk
)
# convert_time = time.time() - convert_start
# Calculate gap from last chunk
# gap_time = chunk_start - last_chunk_end
# Log timing details if not silent
# if not silent:
# logger.debug(
# f"\nChunk {chunks_processed} timing:"
# f"\n Gap from last chunk: {gap_time*1000:.1f}ms"
# f"\n Text processing: {text_process_time*1000:.1f}ms"
# f"\n Audio generation: {audio_gen_time*1000:.1f}ms"
# f"\n Audio conversion: {convert_time*1000:.1f}ms"
# f"\n Total chunk time: {(time.time() - chunk_start)*1000:.1f}ms"
# )
yield chunk_bytes
is_first = False
# last_chunk_end = time.time()
else:
logger.error(f"No audio generated for chunk: '{current_chunk}'")
except Exception as e:
logger.error(f"Failed to generate audio for chunk: '{current_chunk}'. Error: {str(e)}")
current_chunk = next_chunk # Move to next chunk
except Exception as e:
logger.error(f"Error in audio generation stream: {str(e)}")
raise
def _save_audio(self, audio: torch.Tensor, filepath: str):
"""Save audio to file"""
os.makedirs(os.path.dirname(filepath), exist_ok=True)
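For illustration, a minimal sketch of driving the streaming generator directly, assuming the model and voice packs have already been initialized via `TTSModel.setup()` and that the `af` voice is available:
```python
import asyncio

from api.src.services.tts_service import TTSService


async def demo():
    service = TTSService()
    with open("output.pcm", "wb") as f:
        # Raw 16-bit PCM chunks, written as they are generated.
        async for chunk in service.generate_audio_stream("Hello there!", "af", 1.0, "pcm"):
            f.write(chunk)


asyncio.run(demo())
```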

View file

@ -0,0 +1,52 @@
import os
from typing import List, Tuple
import torch
from loguru import logger
from .tts_service import TTSService
from .tts_model import TTSModel
class WarmupService:
"""Service for warming up TTS models and voice caches"""
def __init__(self):
self.tts_service = TTSService()
def load_voices(self) -> List[Tuple[str, torch.Tensor]]:
"""Load and cache voices up to LRU limit"""
# Get all voices sorted by filename length (shorter names first, usually base voices)
voice_files = sorted(
[f for f in os.listdir(TTSModel.VOICES_DIR) if f.endswith(".pt")],
key=len
)
# Load up to LRU cache limit (20)
loaded_voices = []
for voice_file in voice_files[:20]:
try:
voice_path = os.path.join(TTSModel.VOICES_DIR, voice_file)
voicepack = torch.load(voice_path, map_location=TTSModel.get_device(), weights_only=True)
loaded_voices.append((voice_file[:-3], voicepack)) # Store name and tensor
# logger.info(f"Loaded voice {voice_file[:-3]} into cache")
except Exception as e:
logger.error(f"Failed to load voice {voice_file}: {e}")
logger.info(f"Pre-loaded {len(loaded_voices)} voices into cache")
return loaded_voices
async def warmup_voices(self, warmup_text: str, loaded_voices: List[Tuple[str, torch.Tensor]]):
"""Warm up voice inference and streaming"""
n_warmups = 1
for voice_name, _ in loaded_voices[:n_warmups]:
try:
logger.info(f"Running warmup inference on voice {voice_name}")
async for _ in self.tts_service.generate_audio_stream(
warmup_text,
voice_name,
1.0,
"pcm"
):
pass # Process all chunks to properly warm up
logger.info(f"Completed warmup for voice {voice_name}")
except Exception as e:
logger.warning(f"Warmup failed for voice {voice_name}: {e}")

View file

@ -22,7 +22,7 @@ class OpenAISpeechRequest(BaseModel):
)
response_format: Literal["mp3", "opus", "aac", "flac", "wav", "pcm"] = Field(
default="mp3",
description="The format to return audio in. Supported formats: mp3, opus, flac, wav. AAC and PCM are not currently supported.",
description="The format to return audio in. Supported formats: mp3, opus, flac, wav, pcm. PCM format returns raw 16-bit samples without headers. AAC is not currently supported.",
)
speed: float = Field(
default=1.0,
@ -30,3 +30,7 @@ class OpenAISpeechRequest(BaseModel):
le=4.0,
description="The speed of the generated audio. Select a value from 0.25 to 4.0.",
)
stream: bool = Field(
default=True, # Default to streaming for OpenAI compatibility
description="If true (default), audio will be streamed as it's generated. Each chunk will be a complete sentence.",
)
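Because streaming is now the default, callers that want a single complete file must opt out explicitly. A minimal sketch against the same endpoint as in the README:
```python
import requests

response = requests.post(
    "http://localhost:8880/v1/audio/speech",
    json={
        "input": "Hello world!",
        "voice": "af_bella",
        "response_format": "mp3",
        "stream": False,  # return the whole file in one response
    },
)
response.raise_for_status()
with open("output.mp3", "wb") as f:
    f.write(response.content)
```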

35
api/tests/test_chunker.py Normal file
View file

@ -0,0 +1,35 @@
"""Tests for text chunking service"""
import pytest
from api.src.services.text_processing import chunker
def test_split_text():
"""Test text splitting into sentences"""
text = "First sentence. Second sentence! Third sentence?"
sentences = list(chunker.split_text(text))
assert len(sentences) == 3
assert sentences[0] == "First sentence."
assert sentences[1] == "Second sentence!"
assert sentences[2] == "Third sentence?"
def test_split_text_empty():
"""Test splitting empty text"""
assert list(chunker.split_text("")) == []
def test_split_text_single_sentence():
"""Test splitting single sentence"""
text = "Just one sentence."
assert list(chunker.split_text(text)) == ["Just one sentence."]
def test_split_text_with_custom_chunk_size():
"""Test splitting with custom max chunk size"""
text = "First part, second part, third part."
chunks = list(chunker.split_text(text, max_chunk=15))
assert len(chunks) == 3
assert chunks[0] == "First part,"
assert chunks[1] == "second part,"
assert chunks[2] == "third part."

View file

@ -1,19 +1,34 @@
from unittest.mock import Mock
from unittest.mock import Mock, AsyncMock
import pytest
import pytest_asyncio
import asyncio
from fastapi.testclient import TestClient
from httpx import AsyncClient
from ..src.main import app
# Create test client
client = TestClient(app)
# Create async client fixture
@pytest_asyncio.fixture
async def async_client():
async with AsyncClient(app=app, base_url="http://test") as ac:
yield ac
# Mock services
@pytest.fixture
def mock_tts_service(monkeypatch):
mock_service = Mock()
mock_service._generate_audio.return_value = (bytes([0, 1, 2, 3]), 1.0)
# Create proper async generator mock
async def mock_stream(*args, **kwargs):
for chunk in [b"chunk1", b"chunk2"]:
yield chunk
mock_service.generate_audio_stream = mock_stream
mock_service.list_voices.return_value = [
"af",
"bm_lewis",
@ -34,12 +49,12 @@ def mock_tts_service(monkeypatch):
@pytest.fixture
def mock_audio_service(monkeypatch):
def mock_convert(*args):
return b"converted mock audio data"
mock_service = Mock()
mock_service.convert_audio.return_value = b"converted mock audio data"
monkeypatch.setattr(
"api.src.routers.openai_compatible.AudioService.convert_audio", mock_convert
"api.src.routers.openai_compatible.AudioService", mock_service
)
return mock_service
def test_health_check():
@ -57,6 +72,7 @@ def test_openai_speech_endpoint(mock_tts_service, mock_audio_service):
"voice": "bm_lewis",
"response_format": "wav",
"speed": 1.0,
"stream": False # Explicitly disable streaming
}
response = client.post("/v1/audio/speech", json=test_request)
assert response.status_code == 200
@ -76,6 +92,7 @@ def test_openai_speech_invalid_voice(mock_tts_service):
"voice": "invalid_voice",
"response_format": "wav",
"speed": 1.0,
"stream": False # Explicitly disable streaming
}
response = client.post("/v1/audio/speech", json=test_request)
assert response.status_code == 400 # Bad request
@ -90,6 +107,7 @@ def test_openai_speech_invalid_speed(mock_tts_service):
"voice": "af",
"response_format": "wav",
"speed": -1.0, # Invalid speed
"stream": False # Explicitly disable streaming
}
response = client.post("/v1/audio/speech", json=test_request)
assert response.status_code == 422 # Validation error
@ -104,6 +122,7 @@ def test_openai_speech_generation_error(mock_tts_service):
"voice": "af",
"response_format": "wav",
"speed": 1.0,
"stream": False # Explicitly disable streaming
}
response = client.post("/v1/audio/speech", json=test_request)
assert response.status_code == 500
@ -153,3 +172,89 @@ def test_combine_voices_error(mock_tts_service):
assert response.status_code == 500
assert "Combination failed" in response.json()["detail"]["message"]
@pytest.mark.asyncio
async def test_openai_speech_pcm_streaming(mock_tts_service, async_client):
"""Test streaming PCM audio for real-time playback"""
test_request = {
"model": "kokoro",
"input": "Hello world",
"voice": "af",
"response_format": "pcm",
"stream": True
}
# Create streaming mock for this test
async def mock_stream(*args, **kwargs):
for chunk in [b"chunk1", b"chunk2"]:
yield chunk
mock_tts_service.generate_audio_stream = mock_stream
# Add streaming header
headers = {"x-raw-response": "stream"}
response = await async_client.post("/v1/audio/speech", json=test_request, headers=headers)
assert response.status_code == 200
assert response.headers["content-type"] == "audio/pcm"
# Just verify status and content type
assert response.status_code == 200
assert response.headers["content-type"] == "audio/pcm"
@pytest.mark.asyncio
async def test_openai_speech_streaming_mp3(mock_tts_service, async_client):
"""Test streaming MP3 audio to file"""
test_request = {
"model": "kokoro",
"input": "Hello world",
"voice": "af",
"response_format": "mp3",
"stream": True
}
# Create streaming mock for this test
async def mock_stream(*args, **kwargs):
for chunk in [b"mp3header", b"mp3data"]:
yield chunk
mock_tts_service.generate_audio_stream = mock_stream
# Add streaming header
headers = {"x-raw-response": "stream"}
response = await async_client.post("/v1/audio/speech", json=test_request, headers=headers)
assert response.status_code == 200
assert response.headers["content-type"] == "audio/mpeg"
assert response.headers["content-disposition"] == "attachment; filename=speech.mp3"
# Just verify status and content type
assert response.status_code == 200
assert response.headers["content-type"] == "audio/mpeg"
assert response.headers["content-disposition"] == "attachment; filename=speech.mp3"
@pytest.mark.asyncio
async def test_openai_speech_streaming_generator(mock_tts_service, async_client):
"""Test streaming with async generator"""
test_request = {
"model": "kokoro",
"input": "Hello world",
"voice": "af",
"response_format": "pcm",
"stream": True
}
# Create streaming mock for this test
async def mock_stream(*args, **kwargs):
for chunk in [b"chunk1", b"chunk2"]:
yield chunk
mock_tts_service.generate_audio_stream = mock_stream
# Add streaming header
headers = {"x-raw-response": "stream"}
response = await async_client.post("/v1/audio/speech", json=test_request, headers=headers)
assert response.status_code == 200
assert response.headers["content-type"] == "audio/pcm"
# Just verify status and content type
assert response.status_code == 200
assert response.headers["content-type"] == "audio/pcm"

View file

@ -1,6 +1,6 @@
"""Tests for FastAPI application"""
from unittest.mock import MagicMock, patch
from unittest.mock import MagicMock, patch, call
import pytest
from fastapi.testclient import TestClient
@ -28,25 +28,34 @@ async def test_lifespan_successful_warmup(mock_logger, mock_tts_model):
"""Test successful model warmup in lifespan"""
# Mock file system for voice counting
mock_tts_model.VOICES_DIR = "/mock/voices"
# Create async mock
async def async_setup():
return 3
mock_tts_model.setup = MagicMock()
mock_tts_model.setup.side_effect = async_setup
mock_tts_model.get_device.return_value = "cuda"
with patch("os.listdir", return_value=["voice1.pt", "voice2.pt", "voice3.pt"]):
mock_tts_model.setup.return_value = 3 # 3 voice files
mock_tts_model.get_device.return_value = "cuda"
# Create an async generator from the lifespan context manager
async_gen = lifespan(MagicMock())
# Start the context manager
await async_gen.__aenter__()
# Create an async generator from the lifespan context manager
async_gen = lifespan(MagicMock())
# Start the context manager
await async_gen.__aenter__()
# Verify the expected logging sequence
mock_logger.info.assert_any_call("Loading TTS model and voice packs...")
# Check for the startup message containing the required info
startup_calls = [call[0][0] for call in mock_logger.info.call_args_list]
startup_msg = next(msg for msg in startup_calls if "Model warmed up on" in msg)
assert "Model warmed up on" in startup_msg
assert "3 voice packs loaded" in startup_msg
# Verify the expected logging sequence
mock_logger.info.assert_any_call("Loading TTS model and voice packs...")
mock_logger.info.assert_any_call("Model loaded and warmed up on cuda")
mock_logger.info.assert_any_call("3 voice packs loaded successfully")
# Verify model setup was called
mock_tts_model.setup.assert_called_once()
# Verify model setup was called
mock_tts_model.setup.assert_called_once()
# Clean up
await async_gen.__aexit__(None, None, None)
# Clean up
await async_gen.__aexit__(None, None, None)
@pytest.mark.asyncio
@ -77,39 +86,21 @@ async def test_lifespan_cuda_warmup(mock_tts_model):
"""Test model warmup specifically on CUDA"""
# Mock file system for voice counting
mock_tts_model.VOICES_DIR = "/mock/voices"
# Create async mock
async def async_setup():
return 2
mock_tts_model.setup = MagicMock()
mock_tts_model.setup.side_effect = async_setup
mock_tts_model.get_device.return_value = "cuda"
with patch("os.listdir", return_value=["voice1.pt", "voice2.pt"]):
mock_tts_model.setup.return_value = 2 # 2 voice files
mock_tts_model.get_device.return_value = "cuda"
# Create an async generator from the lifespan context manager
async_gen = lifespan(MagicMock())
await async_gen.__aenter__()
# Create an async generator from the lifespan context manager
async_gen = lifespan(MagicMock())
await async_gen.__aenter__()
# Verify model setup was called
mock_tts_model.setup.assert_called_once()
# Verify model setup was called
mock_tts_model.setup.assert_called_once()
# Clean up
await async_gen.__aexit__(None, None, None)
@pytest.mark.asyncio
@patch("api.src.main.TTSModel")
async def test_lifespan_cpu_fallback(mock_tts_model):
"""Test model warmup falling back to CPU"""
# Mock file system for voice counting
mock_tts_model.VOICES_DIR = "/mock/voices"
with patch(
"os.listdir", return_value=["voice1.pt", "voice2.pt", "voice3.pt", "voice4.pt"]
):
mock_tts_model.setup.return_value = 4 # 4 voice files
mock_tts_model.get_device.return_value = "cpu"
# Create an async generator from the lifespan context manager
async_gen = lifespan(MagicMock())
await async_gen.__aenter__()
# Verify model setup was called
mock_tts_model.setup.assert_called_once()
# Clean up
await async_gen.__aexit__(None, None, None)
# Clean up
await async_gen.__aexit__(None, None, None)

View file

@ -16,13 +16,14 @@ def test_get_device_error():
with pytest.raises(RuntimeError, match="Model not initialized"):
TTSBaseModel.get_device()
@pytest.mark.asyncio
@patch('torch.cuda.is_available')
@patch('os.path.exists')
@patch('os.path.join')
@patch('os.listdir')
@patch('torch.load')
@patch('torch.save')
def test_setup_cuda_available(mock_save, mock_load, mock_listdir, mock_join, mock_exists, mock_cuda_available):
async def test_setup_cuda_available(mock_save, mock_load, mock_listdir, mock_join, mock_exists, mock_cuda_available):
"""Test setup with CUDA available"""
TTSBaseModel._device = None
mock_cuda_available.return_value = True
@ -36,17 +37,18 @@ def test_setup_cuda_available(mock_save, mock_load, mock_listdir, mock_join, moc
TTSBaseModel.process_text = MagicMock(return_value=("dummy", [1,2,3]))
TTSBaseModel.generate_from_tokens = MagicMock(return_value=np.zeros(1000))
voice_count = TTSBaseModel.setup()
voice_count = await TTSBaseModel.setup()
assert TTSBaseModel._device == "cuda"
assert voice_count == 2
@pytest.mark.asyncio
@patch('torch.cuda.is_available')
@patch('os.path.exists')
@patch('os.path.join')
@patch('os.listdir')
@patch('torch.load')
@patch('torch.save')
def test_setup_cuda_unavailable(mock_save, mock_load, mock_listdir, mock_join, mock_exists, mock_cuda_available):
async def test_setup_cuda_unavailable(mock_save, mock_load, mock_listdir, mock_join, mock_exists, mock_cuda_available):
"""Test setup with CUDA unavailable"""
TTSBaseModel._device = None
mock_cuda_available.return_value = False
@ -60,7 +62,7 @@ def test_setup_cuda_unavailable(mock_save, mock_load, mock_listdir, mock_join, m
TTSBaseModel.process_text = MagicMock(return_value=("dummy", [1,2,3]))
TTSBaseModel.generate_from_tokens = MagicMock(return_value=np.zeros(1000))
voice_count = TTSBaseModel.setup()
voice_count = await TTSBaseModel.setup()
assert TTSBaseModel._device == "cpu"
assert voice_count == 2

View file

@ -31,27 +31,6 @@ def sample_audio():
return np.sin(2 * np.pi * frequency * t).astype(np.float32)
def test_split_text(tts_service):
"""Test text splitting into sentences"""
text = "First sentence. Second sentence! Third sentence?"
sentences = tts_service._split_text(text)
assert len(sentences) == 3
assert sentences[0] == "First sentence."
assert sentences[1] == "Second sentence!"
assert sentences[2] == "Third sentence?"
def test_split_text_empty(tts_service):
"""Test splitting empty text"""
assert tts_service._split_text("") == []
def test_split_text_single_sentence(tts_service):
"""Test splitting single sentence"""
text = "Just one sentence."
assert tts_service._split_text(text) == ["Just one sentence."]
def test_audio_to_bytes(tts_service, sample_audio):
"""Test converting audio tensor to bytes"""
audio_bytes = tts_service._audio_to_bytes(sample_audio)
@ -152,7 +131,7 @@ def test_generate_audio_phonemize_error(
mock_torch_load.return_value = torch.zeros((10, 24000))
mock_generate.return_value = (None, None)
with pytest.raises(ValueError, match="No audio chunks were generated successfully"):
with pytest.raises(ValueError, match="No chunks were processed successfully"):
tts_service._generate_audio("Test text", "af", 1.0)
@ -185,7 +164,7 @@ def test_generate_audio_error(
mock_exists.return_value = True
mock_torch_load.return_value = torch.zeros((10, 24000))
with pytest.raises(ValueError, match="No audio chunks were generated successfully"):
with pytest.raises(ValueError, match="No chunks were processed successfully"):
tts_service._generate_audio("Test text", "af", 1.0)

Binary image assets added (10 files, previews not shown; individual sizes 227-774 KiB)

BIN
assets/voice_analysis.png Normal file (958 KiB, preview not shown)

View file

@ -45,6 +45,7 @@ services:
- ONNX_OPTIMIZATION_LEVEL=all
- ONNX_MEMORY_PATTERN=true
- ONNX_ARENA_EXTEND_STRATEGY=kNextPowerOfTwo
depends_on:
model-fetcher:
condition: service_healthy

View file

@ -1,20 +1,26 @@
services:
model-fetcher:
image: datamachines/git-lfs:latest
environment:
- SKIP_MODEL_FETCH=${SKIP_MODEL_FETCH:-false}
volumes:
- ./Kokoro-82M:/app/Kokoro-82M
working_dir: /app/Kokoro-82M
command: >
sh -c "
rm -f .git/index.lock;
if [ -z \"$(ls -A .)\" ]; then
git clone https://huggingface.co/hexgrad/Kokoro-82M .
touch .cloned;
if [ \"$$SKIP_MODEL_FETCH\" = \"true\" ]; then
echo 'Skipping model fetch...' && touch .cloned;
else
rm -f .git/index.lock && \
git checkout main && \
git pull origin main && \
touch .cloned;
rm -f .git/index.lock;
if [ -z \"$(ls -A .)\" ]; then
git clone https://huggingface.co/hexgrad/Kokoro-82M .
touch .cloned;
else
rm -f .git/index.lock && \
git checkout main && \
git pull origin main && \
touch .cloned;
fi;
fi;
tail -f /dev/null
"
@ -26,10 +32,10 @@ services:
start_period: 1s
kokoro-tts:
image: ghcr.io/remsky/kokoro-fastapi:latest
# image: ghcr.io/remsky/kokoro-fastapi:latest
# Uncomment below to build from source instead of using the released image
# build:
# context: .
build:
context: .
volumes:
- ./api/src:/app/api/src
- ./Kokoro-82M:/app/Kokoro-82M

View file

@ -0,0 +1,172 @@
#!/usr/bin/env python3
import os
import json
import time
import numpy as np
import pandas as pd
import requests
from lib.shared_utils import save_json_results
from lib.shared_plotting import plot_timeline, plot_correlation
from lib.shared_benchmark_utils import enc, get_text_for_tokens
def measure_first_token(
text: str, output_dir: str, tokens: int, run_number: int
) -> dict:
"""Measure time to audio via API calls and save the audio output"""
results = {
"text_length": len(text),
"token_count": len(enc.encode(text)),
"total_time": None,
"time_to_first_chunk": None,
"error": None,
"audio_path": None,
"audio_length": None, # Length of output audio in seconds
}
try:
start_time = time.time()
# Make request without streaming
response = requests.post(
"http://localhost:8880/v1/audio/speech",
json={
"model": "kokoro",
"input": text,
"voice": "af",
"response_format": "wav",
"stream": False,
},
timeout=1800,
)
response.raise_for_status()
# Save complete audio
audio_filename = f"benchmark_tokens{tokens}_run{run_number}.wav"
audio_path = os.path.join(output_dir, audio_filename)
results["audio_path"] = audio_path
content = response.content
with open(audio_path, "wb") as f:
f.write(content)
# Calculate audio length using scipy
import scipy.io.wavfile as wavfile
sample_rate, audio_data = wavfile.read(audio_path)
results["audio_length"] = len(audio_data) / sample_rate # Length in seconds
results["time_to_first_chunk"] = time.time() - start_time
results["total_time"] = time.time() - start_time
return results
except Exception as e:
results["error"] = str(e)
return results
def main():
# Set up paths
script_dir = os.path.dirname(os.path.abspath(__file__))
output_dir = os.path.join(script_dir, "output_audio")
output_data_dir = os.path.join(script_dir, "output_data")
# Create output directories
os.makedirs(output_dir, exist_ok=True)
os.makedirs(output_data_dir, exist_ok=True)
# Load sample text
with open(
os.path.join(script_dir, "the_time_machine_hg_wells.txt"), "r", encoding="utf-8"
) as f:
text = f.read()
# Test specific token counts
token_sizes = [10, 25, 50, 100, 200, 500]
all_results = []
for tokens in token_sizes:
print(f"\nTesting {tokens} tokens")
test_text = get_text_for_tokens(text, tokens)
actual_tokens = len(enc.encode(test_text))
print(f"Text preview: {test_text[:50]}...")
# Run test 5 times for each size to get average
for i in range(5):
print(f"Run {i+1}/5...")
result = measure_first_token(test_text, output_dir, tokens, i + 1)
result["target_tokens"] = tokens
result["actual_tokens"] = actual_tokens
result["run_number"] = i + 1
print(f"Time to Audio: {result.get('time_to_first_chunk', 'N/A'):.3f}s")
print(f"Total time: {result.get('total_time', 'N/A'):.3f}s")
if result["error"]:
print(f"Error: {result['error']}")
all_results.append(result)
# Calculate averages per token size
summary = {}
for tokens in token_sizes:
matching_results = [
r for r in all_results if r["target_tokens"] == tokens and not r["error"]
]
if matching_results:
avg_first_chunk = sum(
r["time_to_first_chunk"] for r in matching_results
) / len(matching_results)
avg_total = sum(r["total_time"] for r in matching_results) / len(
matching_results
)
avg_audio_length = sum(r["audio_length"] for r in matching_results) / len(
matching_results
)
summary[tokens] = {
"avg_time_to_first_chunk": round(avg_first_chunk, 3),
"avg_total_time": round(avg_total, 3),
"avg_audio_length": round(avg_audio_length, 3),
"num_successful_runs": len(matching_results),
}
# Save results
# Save results
results_data = {
"individual_runs": all_results,
"summary": summary,
"timestamp": time.strftime("%Y-%m-%d %H:%M:%S"),
}
save_json_results(
results_data, os.path.join(output_data_dir, "first_token_benchmark.json")
)
# Create plot directory if it doesn't exist
output_plots_dir = os.path.join(script_dir, "output_plots")
os.makedirs(output_plots_dir, exist_ok=True)
# Create DataFrame for plotting
df = pd.DataFrame(all_results)
# Create both plots
plot_correlation(
df,
"target_tokens",
"time_to_first_chunk",
"Time to Audio vs Input Size",
"Number of Input Tokens",
"Time to Audio (seconds)",
os.path.join(output_plots_dir, "first_token_latency.png"),
)
plot_timeline(df, os.path.join(output_plots_dir, "first_token_timeline.png"))
print("\nResults and plots saved to:")
print(f"- {os.path.join(output_data_dir, 'first_token_benchmark.json')}")
print(f"- {os.path.join(output_plots_dir, 'first_token_latency.png')}")
print(f"- {os.path.join(output_plots_dir, 'first_token_timeline.png')}")
if __name__ == "__main__":
main()

View file

@ -0,0 +1,195 @@
#!/usr/bin/env python3
import os
import time
import requests
from openai import OpenAI
from lib.stream_utils import run_benchmark
OPENAI_CLIENT = OpenAI(
base_url="http://localhost:8880/v1", api_key="not-needed-for-local"
)
def measure_first_token_requests(
text: str, output_dir: str, tokens: int, run_number: int
) -> dict:
"""Measure time to audio via direct API calls and save the audio output"""
results = {
"text_length": len(text),
"token_count": None, # Will be set by run_benchmark
"total_time": None,
"time_to_first_chunk": None,
"error": None,
"audio_path": None,
"audio_length": None,
}
try:
start_time = time.time()
# Make request with streaming enabled
response = requests.post(
"http://localhost:8880/v1/audio/speech",
json={
"model": "kokoro",
"input": text,
"voice": "af",
"response_format": "pcm",
"stream": True,
},
stream=True,
timeout=1800,
)
response.raise_for_status()
# Save complete audio
audio_filename = f"benchmark_tokens{tokens}_run{run_number}_stream.wav"
audio_path = os.path.join(output_dir, audio_filename)
results["audio_path"] = audio_path
first_chunk_time = None
chunks = []
for chunk in response.iter_content(chunk_size=1024):
if chunk:
if first_chunk_time is None:
first_chunk_time = time.time()
results["time_to_first_chunk"] = first_chunk_time - start_time
chunks.append(chunk)
# Concatenate all PCM chunks
if not chunks:
raise ValueError("No audio chunks received")
all_audio_data = b"".join(chunks)
# Write as WAV file
import wave
with wave.open(audio_path, "wb") as wav_file:
wav_file.setnchannels(1) # Mono
wav_file.setsampwidth(2) # 2 bytes per sample (16-bit)
wav_file.setframerate(24000) # Known sample rate for Kokoro
wav_file.writeframes(all_audio_data)
# Calculate audio length using scipy
import scipy.io.wavfile as wavfile
sample_rate, audio_data = wavfile.read(audio_path)
results["audio_length"] = len(audio_data) / sample_rate # Length in seconds
results["total_time"] = time.time() - start_time
# Print debug info
print(f"Complete audio size: {len(all_audio_data)} bytes")
print(f"Number of chunks received: {len(chunks)}")
print(f"Audio length: {results['audio_length']:.3f}s")
return results
except Exception as e:
results["error"] = str(e)
return results
def measure_first_token_openai(
text: str, output_dir: str, tokens: int, run_number: int
) -> dict:
"""Measure time to audio via OpenAI API calls and save the audio output"""
results = {
"text_length": len(text),
"token_count": None, # Will be set by run_benchmark
"total_time": None,
"time_to_first_chunk": None,
"error": None,
"audio_path": None,
"audio_length": None,
}
try:
start_time = time.time()
# Initialize OpenAI client
# Save complete audio
audio_filename = f"benchmark_tokens{tokens}_run{run_number}_stream_openai.wav"
audio_path = os.path.join(output_dir, audio_filename)
results["audio_path"] = audio_path
first_chunk_time = None
all_audio_data = bytearray()
chunk_count = 0
# Make streaming request using OpenAI client
with OPENAI_CLIENT.audio.speech.with_streaming_response.create(
model="kokoro",
voice="af",
response_format="pcm",
input=text,
) as response:
for chunk in response.iter_bytes(chunk_size=1024):
if chunk:
chunk_count += 1
if first_chunk_time is None:
first_chunk_time = time.time()
results["time_to_first_chunk"] = first_chunk_time - start_time
all_audio_data.extend(chunk)
# Write as WAV file
import wave
with wave.open(audio_path, "wb") as wav_file:
wav_file.setnchannels(1) # Mono
wav_file.setsampwidth(2) # 2 bytes per sample (16-bit)
wav_file.setframerate(24000) # Known sample rate for Kokoro
wav_file.writeframes(all_audio_data)
# Calculate audio length using scipy
import scipy.io.wavfile as wavfile
sample_rate, audio_data = wavfile.read(audio_path)
results["audio_length"] = len(audio_data) / sample_rate # Length in seconds
results["total_time"] = time.time() - start_time
# Print debug info
print(f"Complete audio size: {len(all_audio_data)} bytes")
print(f"Number of chunks received: {chunk_count}")
print(f"Audio length: {results['audio_length']:.3f}s")
return results
except Exception as e:
results["error"] = str(e)
return results
def main():
script_dir = os.path.dirname(os.path.abspath(__file__))
prefix='cpu'
# Run requests benchmark
print("\n=== Running Direct Requests Benchmark ===")
run_benchmark(
measure_first_token_requests,
output_dir=os.path.join(script_dir, "output_audio_stream"),
output_data_dir=os.path.join(script_dir, "output_data"),
output_plots_dir=os.path.join(script_dir, "output_plots"),
suffix="_stream",
plot_title_suffix="(Streaming)",
prefix=prefix
)
# Run OpenAI benchmark
print("\n=== Running OpenAI Library Benchmark ===")
run_benchmark(
measure_first_token_openai,
output_dir=os.path.join(script_dir, "output_audio_stream_openai"),
output_data_dir=os.path.join(script_dir, "output_data"),
output_plots_dir=os.path.join(script_dir, "output_plots"),
suffix="_stream_openai",
plot_title_suffix="(OpenAI Streaming)",
prefix=prefix
)
if __name__ == "__main__":
main()

View file

@ -1,30 +1,37 @@
#!/usr/bin/env python3
import os
import sys
import json
import time
import threading
import queue
import pandas as pd
import sys
import threading
from datetime import datetime
from lib.shared_plotting import plot_system_metrics, plot_correlation
import pandas as pd
from lib.shared_utils import (
get_system_metrics, save_json_results, write_benchmark_stats,
real_time_factor
real_time_factor,
save_json_results,
get_system_metrics,
write_benchmark_stats,
)
from lib.shared_plotting import plot_correlation, plot_system_metrics
from lib.shared_benchmark_utils import (
get_text_for_tokens, make_tts_request, generate_token_sizes, enc
enc,
make_tts_request,
get_text_for_tokens,
generate_token_sizes,
)
class SystemMonitor:
def __init__(self, interval=1.0):
"""Rough system tracker: Not always accurate"""
self.interval = interval
self.metrics_queue = queue.Queue()
self.stop_event = threading.Event()
self.metrics_timeline = []
self.start_time = None
def _monitor_loop(self):
"""Background thread function to collect system metrics."""
while not self.stop_event.is_set():
@ -32,20 +39,20 @@ class SystemMonitor:
metrics["relative_time"] = time.time() - self.start_time
self.metrics_queue.put(metrics)
time.sleep(self.interval)
def start(self):
"""Start the monitoring thread."""
self.start_time = time.time()
self.monitor_thread = threading.Thread(target=self._monitor_loop)
self.monitor_thread.daemon = True
self.monitor_thread.start()
def stop(self):
"""Stop the monitoring thread and collect final metrics."""
self.stop_event.set()
if hasattr(self, 'monitor_thread'):
if hasattr(self, "monitor_thread"):
self.monitor_thread.join(timeout=2)
# Collect all metrics from queue
while True:
try:
@ -53,23 +60,24 @@ class SystemMonitor:
self.metrics_timeline.append(metrics)
except queue.Empty:
break
return self.metrics_timeline
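# Minimal usage sketch (illustrative only): wrap a workload with the monitor
# to collect a metrics timeline.
#
# monitor = SystemMonitor(interval=1.0)
# monitor.start()
# time.sleep(3)  # stand-in for the benchmark workload
# timeline = monitor.stop()  # list of metric dicts, one per sampling interval
# print(f"Collected {len(timeline)} samples")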
def main():
# Initialize system monitor
monitor = SystemMonitor(interval=1.0) # 1 second interval
# Set prefix for output files (e.g. "gpu", "cpu", "onnx", etc.)
prefix = "gpu"
prefix = "cpu"
# Generate token sizes
if 'gpu' in prefix:
if "gpu" in prefix:
token_sizes = generate_token_sizes(
max_tokens=5000, dense_step=150,
dense_max=1000, sparse_step=1000)
elif 'cpu' in prefix:
max_tokens=1000, dense_step=150, dense_max=1000, sparse_step=1000
)
elif "cpu" in prefix:
token_sizes = generate_token_sizes(
max_tokens=1000, dense_step=300,
dense_max=1000, sparse_step=0)
max_tokens=1000, dense_step=100, dense_max=500, sparse_step=250
)
else:
token_sizes = generate_token_sizes(max_tokens=3000)
@ -78,7 +86,7 @@ def main():
output_dir = os.path.join(script_dir, "output_audio")
output_data_dir = os.path.join(script_dir, "output_data")
output_plots_dir = os.path.join(script_dir, "output_plots")
# Create output directories
os.makedirs(output_dir, exist_ok=True)
os.makedirs(output_data_dir, exist_ok=True)
@ -90,7 +98,9 @@ def main():
filename = f"{prefix}_{filename}"
return os.path.join(path, filename)
with open(os.path.join(script_dir, "the_time_machine_hg_wells.txt"), "r", encoding="utf-8") as f:
with open(
os.path.join(script_dir, "the_time_machine_hg_wells.txt"), "r", encoding="utf-8"
) as f:
text = f.read()
total_tokens = len(enc.encode(text))
@ -100,7 +110,7 @@ def main():
results = []
test_start_time = time.time()
# Start system monitoring
monitor.start()
@ -114,7 +124,8 @@ def main():
processing_time, audio_length = make_tts_request(
chunk,
output_dir=output_dir,
prefix=prefix
prefix=prefix,
stream=False, # Use non-streaming mode for RTF benchmarking
)
if processing_time is None or audio_length is None:
print("Breaking loop due to error")
@ -123,14 +134,16 @@ def main():
# Calculate RTF using the correct formula
rtf = real_time_factor(processing_time, audio_length)
print(f"Real-Time Factor: {rtf:.5f}")
results.append({
"tokens": actual_tokens,
"processing_time": processing_time,
"output_length": audio_length,
"rtf": rtf,
"elapsed_time": round(time.time() - test_start_time, 2),
})
results.append(
{
"tokens": actual_tokens,
"processing_time": processing_time,
"output_length": audio_length,
"rtf": rtf,
"elapsed_time": round(time.time() - test_start_time, 5),
}
)
df = pd.DataFrame(results)
if df.empty:
@ -144,89 +157,101 @@ def main():
{
"title": "Benchmark Statistics (with correct RTF)",
"stats": {
"Total tokens processed": df['tokens'].sum(),
"Total audio generated (s)": df['output_length'].sum(),
"Total test duration (s)": df['elapsed_time'].max(),
"Average processing rate (tokens/s)": df['tokens_per_second'].mean(),
"Average RTF": df['rtf'].mean(),
"Average Real Time Speed": 1/df['rtf'].mean()
}
"Total tokens processed": df["tokens"].sum(),
"Total audio generated (s)": df["output_length"].sum(),
"Total test duration (s)": df["elapsed_time"].max(),
"Average processing rate (tokens/s)": df["tokens_per_second"].mean(),
"Average RTF": df["rtf"].mean(),
"Average Real Time Speed": 1 / df["rtf"].mean(),
},
},
{
"title": "Per-chunk Stats",
"stats": {
"Average chunk size (tokens)": df['tokens'].mean(),
"Min chunk size (tokens)": df['tokens'].min(),
"Max chunk size (tokens)": df['tokens'].max(),
"Average processing time (s)": df['processing_time'].mean(),
"Average output length (s)": df['output_length'].mean()
}
"Average chunk size (tokens)": df["tokens"].mean(),
"Min chunk size (tokens)": df["tokens"].min(),
"Max chunk size (tokens)": df["tokens"].max(),
"Average processing time (s)": df["processing_time"].mean(),
"Average output length (s)": df["output_length"].mean(),
},
},
{
"title": "Performance Ranges",
"stats": {
"Processing rate range (tokens/s)": f"{df['tokens_per_second'].min():.2f} - {df['tokens_per_second'].max():.2f}",
"RTF range": f"{df['rtf'].min():.2f}x - {df['rtf'].max():.2f}x",
"Real Time Speed range": f"{1/df['rtf'].max():.2f}x - {1/df['rtf'].min():.2f}x"
}
}
"Real Time Speed range": f"{1/df['rtf'].max():.2f}x - {1/df['rtf'].min():.2f}x",
},
},
]
write_benchmark_stats(stats, prefix_path(output_data_dir, "benchmark_stats_rtf.txt"))
write_benchmark_stats(
stats, prefix_path(output_data_dir, "benchmark_stats_rtf.txt")
)
# Plot Processing Time vs Token Count
plot_correlation(
df, "tokens", "processing_time",
df,
"tokens",
"processing_time",
"Processing Time vs Input Size",
"Number of Input Tokens",
"Processing Time (seconds)",
prefix_path(output_plots_dir, "processing_time_rtf.png")
prefix_path(output_plots_dir, "processing_time_rtf.png"),
)
# Plot RTF vs Token Count
plot_correlation(
df, "tokens", "rtf",
df,
"tokens",
"rtf",
"Real-Time Factor vs Input Size",
"Number of Input Tokens",
"Real-Time Factor (processing time / audio length)",
prefix_path(output_plots_dir, "realtime_factor_rtf.png")
prefix_path(output_plots_dir, "realtime_factor_rtf.png"),
)
# Stop monitoring and get final metrics
final_metrics = monitor.stop()
# Convert metrics timeline to DataFrame for stats
metrics_df = pd.DataFrame(final_metrics)
# Add system usage stats
if not metrics_df.empty:
stats.append({
"title": "System Usage Statistics",
"stats": {
"Peak CPU Usage (%)": metrics_df['cpu_percent'].max(),
"Avg CPU Usage (%)": metrics_df['cpu_percent'].mean(),
"Peak RAM Usage (%)": metrics_df['ram_percent'].max(),
"Avg RAM Usage (%)": metrics_df['ram_percent'].mean(),
"Peak RAM Used (GB)": metrics_df['ram_used_gb'].max(),
"Avg RAM Used (GB)": metrics_df['ram_used_gb'].mean(),
stats.append(
{
"title": "System Usage Statistics",
"stats": {
"Peak CPU Usage (%)": metrics_df["cpu_percent"].max(),
"Avg CPU Usage (%)": metrics_df["cpu_percent"].mean(),
"Peak RAM Usage (%)": metrics_df["ram_percent"].max(),
"Avg RAM Usage (%)": metrics_df["ram_percent"].mean(),
"Peak RAM Used (GB)": metrics_df["ram_used_gb"].max(),
"Avg RAM Used (GB)": metrics_df["ram_used_gb"].mean(),
},
}
})
if 'gpu_memory_used' in metrics_df:
stats[-1]["stats"].update({
"Peak GPU Memory (MB)": metrics_df['gpu_memory_used'].max(),
"Avg GPU Memory (MB)": metrics_df['gpu_memory_used'].mean(),
})
)
if "gpu_memory_used" in metrics_df:
stats[-1]["stats"].update(
{
"Peak GPU Memory (MB)": metrics_df["gpu_memory_used"].max(),
"Avg GPU Memory (MB)": metrics_df["gpu_memory_used"].mean(),
}
)
# Plot system metrics
plot_system_metrics(final_metrics, prefix_path(output_plots_dir, "system_usage_rtf.png"))
plot_system_metrics(
final_metrics, prefix_path(output_plots_dir, "system_usage_rtf.png")
)
# Save final results
save_json_results(
{
"results": results,
"system_metrics": final_metrics,
"test_duration": time.time() - test_start_time
"test_duration": time.time() - test_start_time,
},
prefix_path(output_data_dir, "benchmark_results_rtf.json")
prefix_path(output_data_dir, "benchmark_results_rtf.json"),
)
print("\nResults saved to:")

View file

@ -1,19 +1,30 @@
import os
import json
import time
import pandas as pd
from examples.assorted_checks.lib.shared_plotting import plot_system_metrics, plot_correlation
from examples.assorted_checks.lib.shared_utils import (
get_system_metrics, save_json_results, write_benchmark_stats
save_json_results,
get_system_metrics,
write_benchmark_stats,
)
from examples.assorted_checks.lib.shared_plotting import (
plot_correlation,
plot_system_metrics,
)
from examples.assorted_checks.lib.shared_benchmark_utils import (
get_text_for_tokens, make_tts_request, generate_token_sizes, enc
enc,
make_tts_request,
get_text_for_tokens,
generate_token_sizes,
)
def main():
# Get optional prefix from first command line argument
import sys
prefix = sys.argv[1] if len(sys.argv) > 1 else ""
# Set up paths relative to this file
@ -21,7 +32,7 @@ def main():
output_dir = os.path.join(script_dir, "output_audio")
output_data_dir = os.path.join(script_dir, "output_data")
output_plots_dir = os.path.join(script_dir, "output_plots")
# Create output directories
os.makedirs(output_dir, exist_ok=True)
os.makedirs(output_data_dir, exist_ok=True)
@ -43,7 +54,6 @@ def main():
total_tokens = len(enc.encode(text))
print(f"Total tokens in file: {total_tokens}")
token_sizes = generate_token_sizes(total_tokens)
print(f"Testing sizes: {token_sizes}")
@ -85,7 +95,7 @@ def main():
# Save intermediate results
save_json_results(
{"results": results, "system_metrics": system_metrics},
prefix_path(output_data_dir, "benchmark_results.json")
prefix_path(output_data_dir, "benchmark_results.json"),
)
# Create DataFrame and calculate stats
@ -102,53 +112,59 @@ def main():
{
"title": "Benchmark Statistics",
"stats": {
"Total tokens processed": df['tokens'].sum(),
"Total audio generated (s)": df['output_length'].sum(),
"Total test duration (s)": df['elapsed_time'].max(),
"Average processing rate (tokens/s)": df['tokens_per_second'].mean(),
"Average realtime factor": df['realtime_factor'].mean()
}
"Total tokens processed": df["tokens"].sum(),
"Total audio generated (s)": df["output_length"].sum(),
"Total test duration (s)": df["elapsed_time"].max(),
"Average processing rate (tokens/s)": df["tokens_per_second"].mean(),
"Average realtime factor": df["realtime_factor"].mean(),
},
},
{
"title": "Per-chunk Stats",
"stats": {
"Average chunk size (tokens)": df['tokens'].mean(),
"Min chunk size (tokens)": df['tokens'].min(),
"Max chunk size (tokens)": df['tokens'].max(),
"Average processing time (s)": df['processing_time'].mean(),
"Average output length (s)": df['output_length'].mean()
}
"Average chunk size (tokens)": df["tokens"].mean(),
"Min chunk size (tokens)": df["tokens"].min(),
"Max chunk size (tokens)": df["tokens"].max(),
"Average processing time (s)": df["processing_time"].mean(),
"Average output length (s)": df["output_length"].mean(),
},
},
{
"title": "Performance Ranges",
"stats": {
"Processing rate range (tokens/s)": f"{df['tokens_per_second'].min():.2f} - {df['tokens_per_second'].max():.2f}",
"Realtime factor range": f"{df['realtime_factor'].min():.2f}x - {df['realtime_factor'].max():.2f}x"
}
}
"Realtime factor range": f"{df['realtime_factor'].min():.2f}x - {df['realtime_factor'].max():.2f}x",
},
},
]
write_benchmark_stats(stats, prefix_path(output_data_dir, "benchmark_stats.txt"))
# Plot Processing Time vs Token Count
plot_correlation(
df, "tokens", "processing_time",
df,
"tokens",
"processing_time",
"Processing Time vs Input Size",
"Number of Input Tokens",
"Processing Time (seconds)",
prefix_path(output_plots_dir, "processing_time.png")
prefix_path(output_plots_dir, "processing_time.png"),
)
# Plot Realtime Factor vs Token Count
plot_correlation(
df, "tokens", "realtime_factor",
df,
"tokens",
"realtime_factor",
"Realtime Factor vs Input Size",
"Number of Input Tokens",
"Realtime Factor (output length / processing time)",
prefix_path(output_plots_dir, "realtime_factor.png")
prefix_path(output_plots_dir, "realtime_factor.png"),
)
# Plot system metrics
plot_system_metrics(system_metrics, prefix_path(output_plots_dir, "system_usage.png"))
plot_system_metrics(
system_metrics, prefix_path(output_plots_dir, "system_usage.png")
)
print("\nResults saved to:")
print(f"- {prefix_path(output_data_dir, 'benchmark_results.json')}")

View file

@ -1,11 +1,12 @@
"""Shared utilities specific to TTS benchmarking."""
import time
from typing import List, Optional, Tuple
from typing import List, Tuple, Optional
import requests
import tiktoken
from .shared_utils import get_audio_length, save_audio_file
from .shared_utils import save_audio_file, get_audio_length
# Global tokenizer instance
enc = tiktoken.get_encoding("cl100k_base")
@ -13,11 +14,11 @@ enc = tiktoken.get_encoding("cl100k_base")
def get_text_for_tokens(text: str, num_tokens: int) -> str:
"""Get a slice of text that contains exactly num_tokens tokens.
Args:
text: Input text to slice
num_tokens: Desired number of tokens
Returns:
str: Text slice containing exactly num_tokens tokens
"""
@ -31,44 +32,69 @@ def make_tts_request(
text: str,
output_dir: str = None,
timeout: int = 1800,
prefix: str = ""
prefix: str = "",
stream: bool = True,
) -> Tuple[Optional[float], Optional[float]]:
"""Make TTS request using OpenAI-compatible endpoint.
Args:
text: Input text to convert to speech
output_dir: Directory to save audio files. If None, audio won't be saved.
timeout: Request timeout in seconds
prefix: Optional prefix for output filenames
stream: If True, collect audio from the streaming endpoint; otherwise download it in a single response
Returns:
tuple: (processing_time, audio_length) in seconds, or (None, None) on error
"""
try:
start_time = time.time()
response = requests.post(
"http://localhost:8880/v1/audio/speech",
json={
"model": "kokoro",
"input": text,
"voice": "af",
"response_format": "wav",
},
timeout=timeout,
)
response.raise_for_status()
if stream:
# For streaming, we need to collect all chunks
audio_chunks = []
response = requests.post(
"http://localhost:8880/v1/audio/speech",
json={
"model": "kokoro",
"input": text,
"voice": "af",
"response_format": "wav",
"stream": True,
},
timeout=timeout,
stream=True,
)
response.raise_for_status()
for chunk in response.iter_content(chunk_size=8192):
if chunk:
audio_chunks.append(chunk)
# Combine all chunks
audio_data = b"".join(audio_chunks)
else:
response = requests.post(
"http://localhost:8880/v1/audio/speech",
json={
"model": "kokoro",
"input": text,
"voice": "af",
"response_format": "wav",
"stream": False,
},
timeout=timeout,
)
response.raise_for_status()
audio_data = response.content
processing_time = round(time.time() - start_time, 2)
# Calculate audio length from response content
audio_length = get_audio_length(response.content)
# Calculate audio length from audio data
audio_length = get_audio_length(audio_data)
# Save the audio file if output_dir is provided
if output_dir:
token_count = len(enc.encode(text))
output_file = save_audio_file(
response.content,
f"chunk_{token_count}_tokens",
output_dir
audio_data, f"chunk_{token_count}_tokens", output_dir
)
print(f"Saved audio to {output_file}")
@ -86,26 +112,26 @@ def generate_token_sizes(
max_tokens: int,
dense_step: int = 100,
dense_max: int = 1000,
sparse_step: int = 1000
sparse_step: int = 1000,
) -> List[int]:
"""Generate token size ranges with dense sampling at start.
Args:
max_tokens: Maximum number of tokens to generate sizes up to
dense_step: Step size for dense sampling range
dense_max: Maximum value for dense sampling
sparse_step: Step size for sparse sampling range
Returns:
list: Sorted list of token sizes
"""
# Dense sampling at start
dense_range = list(range(dense_step, dense_max + 1, dense_step))
if max_tokens <= dense_max or sparse_step < dense_max:
return sorted(dense_range)
# Sparse sampling for larger sizes
sparse_range = list(range(dense_max + sparse_step, max_tokens + 1, sparse_step))
# Combine and deduplicate
return sorted(list(set(dense_range + sparse_range)))
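# Worked example (illustrative):
# generate_token_sizes(max_tokens=3000, dense_step=100, dense_max=1000, sparse_step=1000)
# -> [100, 200, ..., 900, 1000, 2000, 3000]
# i.e. dense sampling every 100 tokens up to 1000, then sparse steps of 1000 up to max_tokens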

View file

@ -1,7 +1,10 @@
"""Shared plotting utilities for benchmarks and tests."""
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.patches as patches
# Common style configurations
STYLE_CONFIG = {
@ -10,66 +13,71 @@ STYLE_CONFIG = {
"secondary_color": "#05d9e8",
"grid_color": "#ffffff",
"text_color": "#ffffff",
"font_sizes": {
"title": 16,
"label": 14,
"tick": 12,
"text": 10
}
"font_sizes": {"title": 16, "label": 14, "tick": 12, "text": 10},
}
def setup_plot(fig, ax, title, xlabel=None, ylabel=None):
"""Configure plot styling with consistent theme.
Args:
fig: matplotlib figure object
ax: matplotlib axis object
title: str, plot title
xlabel: str, optional x-axis label
ylabel: str, optional y-axis label
Returns:
tuple: (fig, ax) with applied styling
"""
# Grid styling
ax.grid(True, linestyle="--", alpha=0.3, color=STYLE_CONFIG["grid_color"])
# Title and labels
ax.set_title(title, pad=20,
fontsize=STYLE_CONFIG["font_sizes"]["title"],
fontweight="bold",
color=STYLE_CONFIG["text_color"])
ax.set_title(
title,
pad=20,
fontsize=STYLE_CONFIG["font_sizes"]["title"],
fontweight="bold",
color=STYLE_CONFIG["text_color"],
)
if xlabel:
ax.set_xlabel(xlabel,
fontsize=STYLE_CONFIG["font_sizes"]["label"],
fontweight="medium",
color=STYLE_CONFIG["text_color"])
ax.set_xlabel(
xlabel,
fontsize=STYLE_CONFIG["font_sizes"]["label"],
fontweight="medium",
color=STYLE_CONFIG["text_color"],
)
if ylabel:
ax.set_ylabel(ylabel,
fontsize=STYLE_CONFIG["font_sizes"]["label"],
fontweight="medium",
color=STYLE_CONFIG["text_color"])
ax.set_ylabel(
ylabel,
fontsize=STYLE_CONFIG["font_sizes"]["label"],
fontweight="medium",
color=STYLE_CONFIG["text_color"],
)
# Tick styling
ax.tick_params(labelsize=STYLE_CONFIG["font_sizes"]["tick"],
colors=STYLE_CONFIG["text_color"])
ax.tick_params(
labelsize=STYLE_CONFIG["font_sizes"]["tick"], colors=STYLE_CONFIG["text_color"]
)
# Spine styling
for spine in ax.spines.values():
spine.set_color(STYLE_CONFIG["text_color"])
spine.set_alpha(0.3)
spine.set_linewidth(0.5)
# Background colors
ax.set_facecolor(STYLE_CONFIG["background_color"])
fig.patch.set_facecolor(STYLE_CONFIG["background_color"])
return fig, ax
def plot_system_metrics(metrics_data, output_path):
"""Create plots for system metrics over time.
Args:
metrics_data: list of dicts containing system metrics
output_path: str, path to save the output plot
@ -77,68 +85,281 @@ def plot_system_metrics(metrics_data, output_path):
df = pd.DataFrame(metrics_data)
df["timestamp"] = pd.to_datetime(df["timestamp"])
elapsed_time = (df["timestamp"] - df["timestamp"].iloc[0]).dt.total_seconds()
# Get baseline values
baseline_cpu = df["cpu_percent"].iloc[0]
baseline_ram = df["ram_used_gb"].iloc[0]
baseline_gpu = df["gpu_memory_used"].iloc[0] / 1024 if "gpu_memory_used" in df.columns else None
baseline_gpu = (
df["gpu_memory_used"].iloc[0] / 1024
if "gpu_memory_used" in df.columns
else None
)
# Convert GPU memory to GB if present
if "gpu_memory_used" in df.columns:
df["gpu_memory_gb"] = df["gpu_memory_used"] / 1024
plt.style.use("dark_background")
# Create subplots based on available metrics
has_gpu = "gpu_memory_used" in df.columns
num_plots = 3 if has_gpu else 2
fig, axes = plt.subplots(num_plots, 1, figsize=(15, 5 * num_plots))
fig.patch.set_facecolor(STYLE_CONFIG["background_color"])
# Smoothing window
window = min(5, len(df) // 2)
# Plot CPU Usage
smoothed_cpu = df["cpu_percent"].rolling(window=window, center=True).mean()
sns.lineplot(x=elapsed_time, y=smoothed_cpu, ax=axes[0],
color=STYLE_CONFIG["primary_color"], linewidth=2)
axes[0].axhline(y=baseline_cpu, color=STYLE_CONFIG["secondary_color"],
linestyle="--", alpha=0.5, label="Baseline")
setup_plot(fig, axes[0], "CPU Usage Over Time",
xlabel="Time (seconds)", ylabel="CPU Usage (%)")
sns.lineplot(
x=elapsed_time,
y=smoothed_cpu,
ax=axes[0],
color=STYLE_CONFIG["primary_color"],
linewidth=2,
)
axes[0].axhline(
y=baseline_cpu,
color=STYLE_CONFIG["secondary_color"],
linestyle="--",
alpha=0.5,
label="Baseline",
)
setup_plot(
fig,
axes[0],
"CPU Usage Over Time",
xlabel="Time (seconds)",
ylabel="CPU Usage (%)",
)
axes[0].set_ylim(0, max(df["cpu_percent"]) * 1.1)
axes[0].legend()
# Plot RAM Usage
smoothed_ram = df["ram_used_gb"].rolling(window=window, center=True).mean()
sns.lineplot(x=elapsed_time, y=smoothed_ram, ax=axes[1],
color=STYLE_CONFIG["secondary_color"], linewidth=2)
axes[1].axhline(y=baseline_ram, color=STYLE_CONFIG["primary_color"],
linestyle="--", alpha=0.5, label="Baseline")
setup_plot(fig, axes[1], "RAM Usage Over Time",
xlabel="Time (seconds)", ylabel="RAM Usage (GB)")
sns.lineplot(
x=elapsed_time,
y=smoothed_ram,
ax=axes[1],
color=STYLE_CONFIG["secondary_color"],
linewidth=2,
)
axes[1].axhline(
y=baseline_ram,
color=STYLE_CONFIG["primary_color"],
linestyle="--",
alpha=0.5,
label="Baseline",
)
setup_plot(
fig,
axes[1],
"RAM Usage Over Time",
xlabel="Time (seconds)",
ylabel="RAM Usage (GB)",
)
axes[1].set_ylim(0, max(df["ram_used_gb"]) * 1.1)
axes[1].legend()
# Plot GPU Memory if available
if has_gpu:
smoothed_gpu = df["gpu_memory_gb"].rolling(window=window, center=True).mean()
sns.lineplot(x=elapsed_time, y=smoothed_gpu, ax=axes[2],
color=STYLE_CONFIG["primary_color"], linewidth=2)
axes[2].axhline(y=baseline_gpu, color=STYLE_CONFIG["secondary_color"],
linestyle="--", alpha=0.5, label="Baseline")
setup_plot(fig, axes[2], "GPU Memory Usage Over Time",
xlabel="Time (seconds)", ylabel="GPU Memory (GB)")
sns.lineplot(
x=elapsed_time,
y=smoothed_gpu,
ax=axes[2],
color=STYLE_CONFIG["primary_color"],
linewidth=2,
)
axes[2].axhline(
y=baseline_gpu,
color=STYLE_CONFIG["secondary_color"],
linestyle="--",
alpha=0.5,
label="Baseline",
)
setup_plot(
fig,
axes[2],
"GPU Memory Usage Over Time",
xlabel="Time (seconds)",
ylabel="GPU Memory (GB)",
)
axes[2].set_ylim(0, max(df["gpu_memory_gb"]) * 1.1)
axes[2].legend()
plt.tight_layout()
plt.savefig(output_path, dpi=300, bbox_inches="tight")
plt.close()
def plot_timeline(df, output_path, suffix="", prefix=""):
"""Create timeline plot showing latency for each run.
Args:
df: pandas DataFrame containing run data with columns:
- target_tokens: number of tokens
- run_number: run iteration
- time_to_first_chunk: latency to first audio chunk
- audio_length: length of the generated audio in seconds
output_path: str, path to save the output plot
suffix: str, optional suffix appended to the plot title
prefix: str, optional prefix (e.g. "cpu", "gpu") prepended to the plot title
"""
plt.style.use("dark_background")
# Sort by tokens and run number
df = df.sort_values(["target_tokens", "run_number"])
# Create figure and axis
fig, ax = plt.subplots(figsize=(12, 6))
# Calculate y positions for each run with tighter grouping
unique_tokens = sorted(df["target_tokens"].unique())
y_positions = {}
current_y = 0
group_spacing = 0.8 # Space between groups
run_spacing = 0.2 # Space between runs in a group
for tokens in unique_tokens:
runs = df[df["target_tokens"] == tokens]
base_y = current_y
for i, (_, run) in enumerate(runs.iterrows()):
y_positions[(tokens, run["run_number"])] = base_y + (i * run_spacing)
current_y = base_y + (len(runs) * run_spacing) + group_spacing
# Plot bars and points with more transparency
bar_height = 0.15
for _, row in df.iterrows():
y = y_positions[(row["target_tokens"], row["run_number"])]
latency = row["time_to_first_chunk"]
# Latency bar
ax.add_patch(
patches.Rectangle(
(0, y - bar_height / 2),
latency,
bar_height,
facecolor=STYLE_CONFIG["primary_color"],
alpha=0.3,
)
)
# End point
ax.plot(
latency,
y,
"o",
color=STYLE_CONFIG["secondary_color"],
markersize=4,
alpha=0.5,
)
# Add mean lines and values for each token group
for tokens in unique_tokens:
token_runs = df[df["target_tokens"] == tokens]
mean_latency = token_runs["time_to_first_chunk"].mean()
y_positions_for_token = [
y_positions[(tokens, run["run_number"])] for _, run in token_runs.iterrows()
]
min_y = min(y_positions_for_token)
max_y = max(y_positions_for_token)
group_center = (min_y + max_y) / 2
# Plot mean line with gradient alpha
gradient = np.linspace(0.2, 0.8, 100)
for i in range(len(gradient) - 1):
y1 = (
min_y
- bar_height
+ (max_y - min_y + 2 * bar_height) * (i / len(gradient))
)
y2 = (
min_y
- bar_height
+ (max_y - min_y + 2 * bar_height) * ((i + 1) / len(gradient))
)
ax.plot(
[mean_latency, mean_latency],
[y1, y2],
"-",
color=STYLE_CONFIG["secondary_color"],
linewidth=3,
alpha=gradient[i],
)
# Add mean value label with background
label_text = f"Mean: {mean_latency:.3f}s"
bbox_props = dict(
facecolor=STYLE_CONFIG["background_color"],
edgecolor=STYLE_CONFIG["secondary_color"],
alpha=0.8,
pad=3,
linewidth=1,
)
ax.text(
mean_latency + 0.02,
group_center,
label_text,
color=STYLE_CONFIG["secondary_color"],
va="center",
fontsize=10,
fontweight="bold",
bbox=bbox_props,
)
# Customize plot
ax.set_ylim(-1, current_y)
ax.set_xlim(0, df["time_to_first_chunk"].max() * 1.3) # Extra space for labels
# Add labels for token groups with tighter spacing
group_positions = {}
for tokens in unique_tokens:
runs = df[df["target_tokens"] == tokens]
y_positions_for_token = [
y_positions[(tokens, run["run_number"])] for _, run in runs.iterrows()
]
group_positions[tokens] = sum(y_positions_for_token) / len(
y_positions_for_token
)
plt.axhline(
y=min(y_positions_for_token) - bar_height,
color="white",
alpha=0.1,
linestyle="-",
)
# Calculate mean audio length for each token group
audio_lengths = {}
for tokens in unique_tokens:
token_runs = df[df["target_tokens"] == tokens]
audio_lengths[tokens] = token_runs["audio_length"].mean()
# Set y-ticks at group centers with token counts and audio lengths
plt.yticks(
list(group_positions.values()),
[
f"{tokens} tokens\n({audio_lengths[tokens]:.1f}s)"
for tokens in group_positions.keys()
],
fontsize=10,
)
# Customize appearance
setup_plot(
fig,
ax,
prefix.upper() + " Time-To-Audio Latency " + suffix,
xlabel="Time (seconds)",
ylabel="Input Size",
)
plt.tight_layout()
plt.savefig(output_path, dpi=300, bbox_inches="tight")
plt.close()
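# Usage sketch (illustrative): the DataFrame only needs the columns listed in the docstring.
# runs = pd.DataFrame({
#     "target_tokens": [10, 10, 50],
#     "run_number": [1, 2, 1],
#     "time_to_first_chunk": [1.61, 1.58, 3.15],
#     "audio_length": [3.45, 3.45, 15.83],
# })
# plot_timeline(runs, "first_token_timeline.png", suffix="(Streaming)", prefix="cpu")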
def plot_correlation(df, x, y, title, xlabel, ylabel, output_path):
"""Create correlation plot with regression line and correlation coefficient.
Args:
df: pandas DataFrame containing the data
x: str, column name for x-axis
@ -149,28 +370,40 @@ def plot_correlation(df, x, y, title, xlabel, ylabel, output_path):
output_path: str, path to save the output plot
"""
plt.style.use("dark_background")
fig, ax = plt.subplots(figsize=(12, 8))
# Scatter plot
sns.scatterplot(data=df, x=x, y=y, s=100, alpha=0.6,
color=STYLE_CONFIG["primary_color"])
sns.scatterplot(
data=df, x=x, y=y, s=100, alpha=0.6, color=STYLE_CONFIG["primary_color"]
)
# Regression line
sns.regplot(data=df, x=x, y=y, scatter=False,
color=STYLE_CONFIG["secondary_color"],
line_kws={"linewidth": 2})
sns.regplot(
data=df,
x=x,
y=y,
scatter=False,
color=STYLE_CONFIG["secondary_color"],
line_kws={"linewidth": 2},
)
# Add correlation coefficient
corr = df[x].corr(df[y])
plt.text(0.05, 0.95, f"Correlation: {corr:.2f}",
transform=ax.transAxes,
fontsize=STYLE_CONFIG["font_sizes"]["text"],
color=STYLE_CONFIG["text_color"],
bbox=dict(facecolor=STYLE_CONFIG["background_color"],
edgecolor=STYLE_CONFIG["text_color"],
alpha=0.7))
plt.text(
0.05,
0.95,
f"Correlation: {corr:.2f}",
transform=ax.transAxes,
fontsize=STYLE_CONFIG["font_sizes"]["text"],
color=STYLE_CONFIG["text_color"],
bbox=dict(
facecolor=STYLE_CONFIG["background_color"],
edgecolor=STYLE_CONFIG["text_color"],
alpha=0.7,
),
)
setup_plot(fig, ax, title, xlabel=xlabel, ylabel=ylabel)
plt.savefig(output_path, dpi=300, bbox_inches="tight")
plt.close()

View file

@ -1,9 +1,10 @@
"""Shared utilities for benchmarks and tests."""
import os
import json
import subprocess
from typing import Any, Dict, List, Union, Optional
from datetime import datetime
from typing import Any, Dict, List, Optional, Union
import psutil
import scipy.io.wavfile as wavfile
@ -12,28 +13,46 @@ import scipy.io.wavfile as wavfile
TORCH_AVAILABLE = False
try:
import torch
TORCH_AVAILABLE = torch.cuda.is_available()
except ImportError:
pass
def check_audio_file_is_silent(audio_path: str, threshold: float = 0.01) -> bool:
"""Check if an audio file is silent by comparing peak amplitude to a threshold.
Args:
audio_path: Path to the audio file
threshold: Peak amplitude threshold for silence
Returns:
bool: True if audio is silent, False otherwise
"""
rate, data = wavfile.read(audio_path)
peak_amplitude = max(abs(data.min()), abs(data.max())) / 32768.0 # 16-bit audio
return peak_amplitude < threshold
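# Quick numeric check (illustrative): a 16-bit file whose loudest sample is +/-200
# has a normalized peak of 200 / 32768 ~ 0.006, below the default 0.01 threshold,
# so it is reported as silent.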
def get_audio_length(audio_data: bytes, temp_dir: str = None) -> float:
"""Get audio length in seconds from bytes data.
Args:
audio_data: Raw audio bytes
temp_dir: Directory for temporary file. If None, uses system temp directory.
Returns:
float: Audio length in seconds
"""
if temp_dir is None:
import tempfile
temp_dir = tempfile.gettempdir()
temp_path = os.path.join(temp_dir, "temp.wav")
os.makedirs(temp_dir, exist_ok=True)
with open(temp_path, "wb") as f:
f.write(audio_data)
@ -47,11 +66,11 @@ def get_audio_length(audio_data: bytes, temp_dir: str = None) -> float:
def get_gpu_memory(average: bool = True) -> Optional[Union[float, List[float]]]:
"""Get GPU memory usage using PyTorch if available, falling back to nvidia-smi.
Args:
average: If True and multiple GPUs present, returns average memory usage.
If False, returns list of memory usage per GPU.
Returns:
float or List[float] or None: GPU memory usage in MB. Returns None if no GPU available.
If average=False and multiple GPUs present, returns list of values.
@ -60,19 +79,23 @@ def get_gpu_memory(average: bool = True) -> Optional[Union[float, List[float]]]:
n_gpus = torch.cuda.device_count()
memory_used = []
for i in range(n_gpus):
memory_used.append(torch.cuda.memory_allocated(i) / 1024**2) # Convert to MB
memory_used.append(
torch.cuda.memory_allocated(i) / 1024**2
) # Convert to MB
if average and len(memory_used) > 0:
return sum(memory_used) / len(memory_used)
return memory_used if len(memory_used) > 1 else memory_used[0]
# Fall back to nvidia-smi
try:
result = subprocess.check_output(
["nvidia-smi", "--query-gpu=memory.used", "--format=csv,nounits,noheader"]
)
memory_values = [float(x.strip()) for x in result.decode("utf-8").split("\n") if x.strip()]
memory_values = [
float(x.strip()) for x in result.decode("utf-8").split("\n") if x.strip()
]
if average and len(memory_values) > 0:
return sum(memory_values) / len(memory_values)
return memory_values if len(memory_values) > 1 else memory_values[0]
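# Usage sketch (illustrative): get_gpu_memory() returns the average used MB across GPUs,
# get_gpu_memory(average=False) returns a per-GPU list (or a single float with one GPU),
# and None when neither torch CUDA nor nvidia-smi is available.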
@ -82,14 +105,14 @@ def get_gpu_memory(average: bool = True) -> Optional[Union[float, List[float]]]:
def get_system_metrics() -> Dict[str, Union[str, float]]:
"""Get current system metrics including CPU, RAM, and GPU if available.
Returns:
dict: System metrics including timestamp, CPU%, RAM%, RAM GB, and GPU MB if available
"""
# Get per-CPU percentages and calculate average
cpu_percentages = psutil.cpu_percent(percpu=True)
avg_cpu = sum(cpu_percentages) / len(cpu_percentages)
metrics = {
"timestamp": datetime.now().isoformat(),
"cpu_percent": round(avg_cpu, 2),
@ -106,40 +129,40 @@ def get_system_metrics() -> Dict[str, Union[str, float]]:
def save_audio_file(audio_data: bytes, identifier: str, output_dir: str) -> str:
"""Save audio data to a file with proper naming and directory creation.
Args:
audio_data: Raw audio bytes
identifier: String to identify this audio file (e.g. token count, test name)
output_dir: Directory to save the file
Returns:
str: Path to the saved audio file
"""
os.makedirs(output_dir, exist_ok=True)
output_file = os.path.join(output_dir, f"{identifier}.wav")
with open(output_file, "wb") as f:
f.write(audio_data)
return output_file
def write_benchmark_stats(stats: List[Dict[str, Any]], output_file: str) -> None:
"""Write benchmark statistics to a file in a clean, organized format.
Args:
stats: List of dictionaries containing stat name/value pairs
output_file: Path to output file
"""
os.makedirs(os.path.dirname(output_file), exist_ok=True)
with open(output_file, "w") as f:
for section in stats:
# Write section header
f.write(f"=== {section['title']} ===\n\n")
# Write stats
for label, value in section['stats'].items():
for label, value in section["stats"].items():
if isinstance(value, float):
f.write(f"{label}: {value:.2f}\n")
else:
@ -149,7 +172,7 @@ def write_benchmark_stats(stats: List[Dict[str, Any]], output_file: str) -> None
def save_json_results(results: Dict[str, Any], output_file: str) -> None:
"""Save benchmark results to a JSON file with proper formatting.
Args:
results: Dictionary of results to save
output_file: Path to output file
@ -159,14 +182,16 @@ def save_json_results(results: Dict[str, Any], output_file: str) -> None:
json.dump(results, f, indent=2)
def real_time_factor(processing_time: float, audio_length: float, decimals: int = 2) -> float:
def real_time_factor(
processing_time: float, audio_length: float, decimals: int = 2
) -> float:
"""Calculate Real-Time Factor (RTF) as processing-time / length-of-audio.
Args:
processing_time: Time taken to process/generate audio
audio_length: Length of the generated audio
decimals: Number of decimal places to round to
Returns:
float: RTF value
"""

View file

@ -0,0 +1,205 @@
#!/usr/bin/env python3
import os
import time
import wave
from typing import Any, Dict, List, Callable, Optional
import pandas as pd
import scipy.io.wavfile as wavfile
from .shared_utils import save_json_results
from .shared_plotting import plot_timeline, plot_correlation
from .shared_benchmark_utils import enc, get_text_for_tokens
def check_audio_silence(audio_path: str) -> bool:
"""Check if audio file contains only silence"""
sample_rate, audio_data = wavfile.read(audio_path)
# Convert to float for RMS calculation
audio_float = audio_data.astype(float)
# Calculate RMS value
rms = (audio_float**2).mean() ** 0.5
# Define silence threshold (adjust if needed)
SILENCE_THRESHOLD = 50.0
return rms < SILENCE_THRESHOLD
def process_benchmark_results(
all_results: List[Dict[str, Any]], token_sizes: List[int]
) -> Dict[str, Any]:
"""Process benchmark results and generate summary"""
summary = {}
for tokens in token_sizes:
matching_results = [
r for r in all_results if r["target_tokens"] == tokens and not r["error"]
]
if matching_results:
avg_first_chunk = sum(
r["time_to_first_chunk"] for r in matching_results
) / len(matching_results)
avg_total = sum(r["total_time"] for r in matching_results) / len(
matching_results
)
avg_audio_length = sum(r["audio_length"] for r in matching_results) / len(
matching_results
)
summary[tokens] = {
"avg_time_to_first_chunk": round(avg_first_chunk, 3),
"avg_total_time": round(avg_total, 3),
"avg_audio_length": round(avg_audio_length, 3),
"num_successful_runs": len(matching_results),
}
return summary
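# Resulting shape (illustrative values):
# {100: {"avg_time_to_first_chunk": 3.555, "avg_total_time": 14.56,
#        "avg_audio_length": 30.35, "num_successful_runs": 5}, ...}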
def save_benchmark_results(
all_results: List[Dict[str, Any]],
summary: Dict[str, Any],
output_data_dir: str,
output_plots_dir: str,
suffix: str,
plot_title_suffix: str,
prefix: str = "",
):
"""Save benchmark results and generate plots"""
# Save results
results_data = {
"individual_runs": all_results,
"summary": summary,
"timestamp": time.strftime("%Y-%m-%d %H:%M:%S"),
}
save_json_results(
results_data,
os.path.join(output_data_dir, f"{prefix}first_token_benchmark{suffix}.json"),
)
# Create DataFrame for plotting
df = pd.DataFrame(all_results)
# Create plots
plot_correlation(
df,
"target_tokens",
"time_to_first_chunk",
f"Time to First Audio vs Input Size {plot_title_suffix}",
"Number of Input Tokens",
"Time to First Audio (seconds)",
os.path.join(output_plots_dir, f"{prefix}first_token_latency{suffix}.png"),
)
plot_correlation(
df,
"target_tokens",
"total_time",
f"Total Time vs Input Size {plot_title_suffix}",
"Number of Input Tokens",
"Total Time (seconds)",
os.path.join(output_plots_dir, f"{prefix}total_time_latency{suffix}.png"),
)
plot_timeline(
df,
os.path.join(output_plots_dir, f"{prefix}first_token_timeline{suffix}.png"),
suffix=plot_title_suffix,
)
def run_benchmark(
measure_func: Callable,
output_dir: str,
output_data_dir: str,
output_plots_dir: str,
suffix: str = "",
plot_title_suffix: str = "",
num_runs: int = 5,
client=None,
prefix="",
):
"""Run benchmark with the given measurement function"""
# Create output directories
os.makedirs(output_dir, exist_ok=True)
os.makedirs(output_data_dir, exist_ok=True)
os.makedirs(output_plots_dir, exist_ok=True)
# Load sample text
script_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
with open(
os.path.join(script_dir, "the_time_machine_hg_wells.txt"), "r", encoding="utf-8"
) as f:
text = f.read()
# Test specific token counts
token_sizes = [10, 50, 100, 250, 500]
all_results = []
silent_files = []
for tokens in token_sizes:
print(
f"\nTesting {tokens} tokens{' ' + plot_title_suffix if plot_title_suffix else ''}"
)
test_text = get_text_for_tokens(text, tokens)
actual_tokens = len(enc.encode(test_text))
print(f"Text preview: {test_text[:50]}...")
for i in range(num_runs):
print(f"Run {i+1}/{num_runs}...")
result = measure_func(test_text, output_dir, tokens, i + 1)
result["target_tokens"] = tokens
result["actual_tokens"] = actual_tokens
result["run_number"] = i + 1
# Handle time to first audio
first_chunk = result.get('time_to_first_chunk')
print(
f"Time to First Audio: {f'{first_chunk:.3f}s' if first_chunk is not None else 'N/A'}"
)
# Handle total time
total_time = result.get('total_time')
print(
f"Time to Save Complete: {f'{total_time:.3f}s' if total_time is not None else 'N/A'}"
)
# Handle audio length
audio_length = result.get('audio_length')
print(
f"Audio length: {f'{audio_length:.3f}s' if audio_length is not None else 'N/A'}"
)
# Calculate streaming overhead only if both values exist
if total_time is not None and first_chunk is not None:
print(f"Streaming overhead: {(total_time - first_chunk):.3f}s")
else:
print("Streaming overhead: N/A")
if result["error"]:
print(f"Error: {result['error']}")
elif result["audio_path"] and check_audio_silence(result["audio_path"]):
silent_files.append(result["audio_path"])
all_results.append(result)
# Process and save results
summary = process_benchmark_results(all_results, token_sizes)
save_benchmark_results(
all_results,
summary,
output_data_dir,
output_plots_dir,
suffix,
plot_title_suffix,
prefix=prefix,
)
# Print paths
print("\nResults and plots saved to:")
print(f"- {os.path.join(output_data_dir, f'{prefix}first_token_benchmark{suffix}.json')}")
print(f"- {os.path.join(output_plots_dir, f'{prefix}first_token_latency{suffix}.png')}")
print(f"- {os.path.join(output_plots_dir, f'{prefix}total_time_latency{suffix}.png')}")
print(f"- {os.path.join(output_plots_dir, f'{prefix}first_token_timeline{suffix}.png')}")
# Print silence check summary
if silent_files:
print("\nWARNING: The following files contain only silence:")
for file in silent_files:
print(f"- {file}")
else:
print("\nAll generated audio files contain valid audio content.")

View file

@ -1,111 +0,0 @@
{
"results": [
{
"tokens": 100,
"processing_time": 18.833295583724976,
"output_length": 31.15,
"realtime_factor": 1.6539856161403135,
"elapsed_time": 19.024322748184204
},
{
"tokens": 200,
"processing_time": 38.95506024360657,
"output_length": 62.6,
"realtime_factor": 1.6069799304257042,
"elapsed_time": 58.21527123451233
},
{
"tokens": 300,
"processing_time": 49.74252939224243,
"output_length": 96.325,
"realtime_factor": 1.9364716908630366,
"elapsed_time": 108.19673728942871
},
{
"tokens": 400,
"processing_time": 61.349056243896484,
"output_length": 128.575,
"realtime_factor": 2.095794261102292,
"elapsed_time": 169.733656167984
},
{
"tokens": 500,
"processing_time": 82.86568236351013,
"output_length": 158.575,
"realtime_factor": 1.9136389815071193,
"elapsed_time": 252.7968451976776
}
],
"system_metrics": [
{
"timestamp": "2025-01-03T00:13:49.865330",
"cpu_percent": 8.0,
"ram_percent": 39.4,
"ram_used_gb": 25.03811264038086,
"gpu_memory_used": 1204.0
},
{
"timestamp": "2025-01-03T00:14:08.781551",
"cpu_percent": 26.8,
"ram_percent": 42.6,
"ram_used_gb": 27.090862274169922,
"gpu_memory_used": 1225.0
},
{
"timestamp": "2025-01-03T00:14:08.916973",
"cpu_percent": 16.1,
"ram_percent": 42.6,
"ram_used_gb": 27.089553833007812,
"gpu_memory_used": 1225.0
},
{
"timestamp": "2025-01-03T00:14:47.979053",
"cpu_percent": 31.5,
"ram_percent": 43.6,
"ram_used_gb": 27.714427947998047,
"gpu_memory_used": 1225.0
},
{
"timestamp": "2025-01-03T00:14:48.098976",
"cpu_percent": 20.0,
"ram_percent": 43.6,
"ram_used_gb": 27.704315185546875,
"gpu_memory_used": 1211.0
},
{
"timestamp": "2025-01-03T00:15:37.944729",
"cpu_percent": 29.7,
"ram_percent": 38.6,
"ram_used_gb": 24.53925323486328,
"gpu_memory_used": 1217.0
},
{
"timestamp": "2025-01-03T00:15:38.071915",
"cpu_percent": 8.6,
"ram_percent": 38.5,
"ram_used_gb": 24.51690673828125,
"gpu_memory_used": 1208.0
},
{
"timestamp": "2025-01-03T00:16:39.525449",
"cpu_percent": 23.4,
"ram_percent": 38.8,
"ram_used_gb": 24.71230697631836,
"gpu_memory_used": 1221.0
},
{
"timestamp": "2025-01-03T00:16:39.612442",
"cpu_percent": 5.5,
"ram_percent": 38.9,
"ram_used_gb": 24.72066879272461,
"gpu_memory_used": 1221.0
},
{
"timestamp": "2025-01-03T00:18:02.569076",
"cpu_percent": 27.4,
"ram_percent": 39.1,
"ram_used_gb": 24.868202209472656,
"gpu_memory_used": 1264.0
}
]
}

View file

@ -1,216 +0,0 @@
{
"results": [
{
"tokens": 100,
"processing_time": 14.349808931350708,
"output_length": 31.15,
"rtf": 0.46,
"elapsed_time": 14.716031074523926
},
{
"tokens": 200,
"processing_time": 28.341803312301636,
"output_length": 62.6,
"rtf": 0.45,
"elapsed_time": 43.44207406044006
},
{
"tokens": 300,
"processing_time": 43.352553606033325,
"output_length": 96.325,
"rtf": 0.45,
"elapsed_time": 87.26906609535217
},
{
"tokens": 400,
"processing_time": 71.02449822425842,
"output_length": 128.575,
"rtf": 0.55,
"elapsed_time": 158.7198133468628
},
{
"tokens": 500,
"processing_time": 70.92521691322327,
"output_length": 158.575,
"rtf": 0.45,
"elapsed_time": 230.01379895210266
},
{
"tokens": 600,
"processing_time": 83.6328592300415,
"output_length": 189.25,
"rtf": 0.44,
"elapsed_time": 314.02610969543457
},
{
"tokens": 700,
"processing_time": 103.0810194015503,
"output_length": 222.075,
"rtf": 0.46,
"elapsed_time": 417.5678551197052
},
{
"tokens": 800,
"processing_time": 127.02162909507751,
"output_length": 253.85,
"rtf": 0.5,
"elapsed_time": 545.0128681659698
},
{
"tokens": 900,
"processing_time": 130.49781227111816,
"output_length": 283.775,
"rtf": 0.46,
"elapsed_time": 675.8943417072296
},
{
"tokens": 1000,
"processing_time": 154.76425909996033,
"output_length": 315.475,
"rtf": 0.49,
"elapsed_time": 831.0677945613861
}
],
"system_metrics": [
{
"timestamp": "2025-01-03T00:23:52.896889",
"cpu_percent": 4.5,
"ram_percent": 39.1,
"ram_used_gb": 24.86032485961914,
"gpu_memory_used": 1281.0
},
{
"timestamp": "2025-01-03T00:24:07.429461",
"cpu_percent": 4.5,
"ram_percent": 39.1,
"ram_used_gb": 24.847564697265625,
"gpu_memory_used": 1285.0
},
{
"timestamp": "2025-01-03T00:24:07.620587",
"cpu_percent": 2.7,
"ram_percent": 39.1,
"ram_used_gb": 24.846607208251953,
"gpu_memory_used": 1275.0
},
{
"timestamp": "2025-01-03T00:24:36.140754",
"cpu_percent": 5.4,
"ram_percent": 39.1,
"ram_used_gb": 24.857810974121094,
"gpu_memory_used": 1267.0
},
{
"timestamp": "2025-01-03T00:24:36.340675",
"cpu_percent": 6.2,
"ram_percent": 39.1,
"ram_used_gb": 24.85773468017578,
"gpu_memory_used": 1267.0
},
{
"timestamp": "2025-01-03T00:25:19.905634",
"cpu_percent": 29.1,
"ram_percent": 39.2,
"ram_used_gb": 24.920318603515625,
"gpu_memory_used": 1256.0
},
{
"timestamp": "2025-01-03T00:25:20.182219",
"cpu_percent": 20.0,
"ram_percent": 39.2,
"ram_used_gb": 24.930198669433594,
"gpu_memory_used": 1256.0
},
{
"timestamp": "2025-01-03T00:26:31.414760",
"cpu_percent": 5.3,
"ram_percent": 39.5,
"ram_used_gb": 25.127891540527344,
"gpu_memory_used": 1259.0
},
{
"timestamp": "2025-01-03T00:26:31.617256",
"cpu_percent": 3.6,
"ram_percent": 39.5,
"ram_used_gb": 25.126346588134766,
"gpu_memory_used": 1252.0
},
{
"timestamp": "2025-01-03T00:27:42.736097",
"cpu_percent": 10.5,
"ram_percent": 39.5,
"ram_used_gb": 25.100231170654297,
"gpu_memory_used": 1249.0
},
{
"timestamp": "2025-01-03T00:27:42.912870",
"cpu_percent": 5.3,
"ram_percent": 39.5,
"ram_used_gb": 25.098285675048828,
"gpu_memory_used": 1249.0
},
{
"timestamp": "2025-01-03T00:29:06.725264",
"cpu_percent": 8.9,
"ram_percent": 39.5,
"ram_used_gb": 25.123123168945312,
"gpu_memory_used": 1239.0
},
{
"timestamp": "2025-01-03T00:29:06.928826",
"cpu_percent": 5.5,
"ram_percent": 39.5,
"ram_used_gb": 25.128646850585938,
"gpu_memory_used": 1239.0
},
{
"timestamp": "2025-01-03T00:30:50.206349",
"cpu_percent": 49.6,
"ram_percent": 39.6,
"ram_used_gb": 25.162948608398438,
"gpu_memory_used": 1245.0
},
{
"timestamp": "2025-01-03T00:30:50.491837",
"cpu_percent": 14.8,
"ram_percent": 39.5,
"ram_used_gb": 25.13379669189453,
"gpu_memory_used": 1245.0
},
{
"timestamp": "2025-01-03T00:32:57.721467",
"cpu_percent": 6.2,
"ram_percent": 39.6,
"ram_used_gb": 25.187721252441406,
"gpu_memory_used": 1384.0
},
{
"timestamp": "2025-01-03T00:32:57.913350",
"cpu_percent": 3.6,
"ram_percent": 39.6,
"ram_used_gb": 25.199390411376953,
"gpu_memory_used": 1384.0
},
{
"timestamp": "2025-01-03T00:35:08.608730",
"cpu_percent": 6.3,
"ram_percent": 39.8,
"ram_used_gb": 25.311710357666016,
"gpu_memory_used": 1330.0
},
{
"timestamp": "2025-01-03T00:35:08.791851",
"cpu_percent": 5.3,
"ram_percent": 39.8,
"ram_used_gb": 25.326683044433594,
"gpu_memory_used": 1333.0
},
{
"timestamp": "2025-01-03T00:37:43.782406",
"cpu_percent": 6.8,
"ram_percent": 40.6,
"ram_used_gb": 25.803058624267578,
"gpu_memory_used": 1409.0
}
]
}

View file

@ -1,300 +0,0 @@
{
"results": [
{
"tokens": 100,
"processing_time": 0.96,
"output_length": 31.1,
"rtf": 0.03,
"elapsed_time": 1.11
},
{
"tokens": 250,
"processing_time": 2.23,
"output_length": 77.17,
"rtf": 0.03,
"elapsed_time": 3.49
},
{
"tokens": 400,
"processing_time": 4.05,
"output_length": 128.05,
"rtf": 0.03,
"elapsed_time": 7.77
},
{
"tokens": 550,
"processing_time": 4.06,
"output_length": 171.45,
"rtf": 0.02,
"elapsed_time": 12.0
},
{
"tokens": 700,
"processing_time": 6.01,
"output_length": 221.6,
"rtf": 0.03,
"elapsed_time": 18.16
},
{
"tokens": 850,
"processing_time": 6.9,
"output_length": 269.1,
"rtf": 0.03,
"elapsed_time": 25.21
},
{
"tokens": 1000,
"processing_time": 7.65,
"output_length": 315.05,
"rtf": 0.02,
"elapsed_time": 33.03
},
{
"tokens": 6000,
"processing_time": 48.7,
"output_length": 1837.1,
"rtf": 0.03,
"elapsed_time": 82.21
},
{
"tokens": 11000,
"processing_time": 92.44,
"output_length": 3388.57,
"rtf": 0.03,
"elapsed_time": 175.46
},
{
"tokens": 16000,
"processing_time": 163.61,
"output_length": 4977.32,
"rtf": 0.03,
"elapsed_time": 340.46
},
{
"tokens": 21000,
"processing_time": 209.72,
"output_length": 6533.3,
"rtf": 0.03,
"elapsed_time": 551.92
},
{
"tokens": 26000,
"processing_time": 329.35,
"output_length": 8068.15,
"rtf": 0.04,
"elapsed_time": 883.37
},
{
"tokens": 31000,
"processing_time": 473.52,
"output_length": 9611.48,
"rtf": 0.05,
"elapsed_time": 1359.28
},
{
"tokens": 36000,
"processing_time": 650.98,
"output_length": 11157.15,
"rtf": 0.06,
"elapsed_time": 2012.9
}
],
"system_metrics": [
{
"timestamp": "2025-01-03T14:41:01.331735",
"cpu_percent": 7.5,
"ram_percent": 50.2,
"ram_used_gb": 31.960269927978516,
"gpu_memory_used": 3191.0
},
{
"timestamp": "2025-01-03T14:41:02.357116",
"cpu_percent": 17.01,
"ram_percent": 50.2,
"ram_used_gb": 31.96163558959961,
"gpu_memory_used": 3426.0
},
{
"timestamp": "2025-01-03T14:41:02.445009",
"cpu_percent": 9.5,
"ram_percent": 50.3,
"ram_used_gb": 31.966781616210938,
"gpu_memory_used": 3426.0
},
{
"timestamp": "2025-01-03T14:41:04.742152",
"cpu_percent": 18.27,
"ram_percent": 50.4,
"ram_used_gb": 32.08788299560547,
"gpu_memory_used": 3642.0
},
{
"timestamp": "2025-01-03T14:41:04.847795",
"cpu_percent": 16.27,
"ram_percent": 50.5,
"ram_used_gb": 32.094364166259766,
"gpu_memory_used": 3640.0
},
{
"timestamp": "2025-01-03T14:41:09.019590",
"cpu_percent": 15.97,
"ram_percent": 50.7,
"ram_used_gb": 32.23244094848633,
"gpu_memory_used": 3640.0
},
{
"timestamp": "2025-01-03T14:41:09.110324",
"cpu_percent": 3.54,
"ram_percent": 50.7,
"ram_used_gb": 32.234458923339844,
"gpu_memory_used": 3640.0
},
{
"timestamp": "2025-01-03T14:41:13.252607",
"cpu_percent": 13.4,
"ram_percent": 50.6,
"ram_used_gb": 32.194271087646484,
"gpu_memory_used": 3935.0
},
{
"timestamp": "2025-01-03T14:41:13.327557",
"cpu_percent": 4.69,
"ram_percent": 50.6,
"ram_used_gb": 32.191776275634766,
"gpu_memory_used": 3935.0
},
{
"timestamp": "2025-01-03T14:41:19.413633",
"cpu_percent": 12.92,
"ram_percent": 50.9,
"ram_used_gb": 32.3467903137207,
"gpu_memory_used": 4250.0
},
{
"timestamp": "2025-01-03T14:41:19.492758",
"cpu_percent": 7.5,
"ram_percent": 50.8,
"ram_used_gb": 32.34375,
"gpu_memory_used": 4250.0
},
{
"timestamp": "2025-01-03T14:41:26.467284",
"cpu_percent": 13.09,
"ram_percent": 51.2,
"ram_used_gb": 32.56281280517578,
"gpu_memory_used": 4249.0
},
{
"timestamp": "2025-01-03T14:41:26.553559",
"cpu_percent": 8.39,
"ram_percent": 51.2,
"ram_used_gb": 32.56183624267578,
"gpu_memory_used": 4249.0
},
{
"timestamp": "2025-01-03T14:41:34.284362",
"cpu_percent": 12.61,
"ram_percent": 51.7,
"ram_used_gb": 32.874778747558594,
"gpu_memory_used": 4250.0
},
{
"timestamp": "2025-01-03T14:41:34.362353",
"cpu_percent": 1.25,
"ram_percent": 51.7,
"ram_used_gb": 32.87461471557617,
"gpu_memory_used": 4250.0
},
{
"timestamp": "2025-01-03T14:42:23.471312",
"cpu_percent": 11.64,
"ram_percent": 54.9,
"ram_used_gb": 34.90264129638672,
"gpu_memory_used": 4647.0
},
{
"timestamp": "2025-01-03T14:42:23.547203",
"cpu_percent": 5.31,
"ram_percent": 54.9,
"ram_used_gb": 34.91563415527344,
"gpu_memory_used": 4647.0
},
{
"timestamp": "2025-01-03T14:43:56.724933",
"cpu_percent": 12.97,
"ram_percent": 59.5,
"ram_used_gb": 37.84241485595703,
"gpu_memory_used": 4655.0
},
{
"timestamp": "2025-01-03T14:43:56.815453",
"cpu_percent": 11.75,
"ram_percent": 59.5,
"ram_used_gb": 37.832679748535156,
"gpu_memory_used": 4655.0
},
{
"timestamp": "2025-01-03T14:46:41.705155",
"cpu_percent": 12.94,
"ram_percent": 66.3,
"ram_used_gb": 42.1534538269043,
"gpu_memory_used": 4729.0
},
{
"timestamp": "2025-01-03T14:46:41.835177",
"cpu_percent": 7.73,
"ram_percent": 66.2,
"ram_used_gb": 42.13554000854492,
"gpu_memory_used": 4729.0
},
{
"timestamp": "2025-01-03T14:50:13.166236",
"cpu_percent": 11.62,
"ram_percent": 73.4,
"ram_used_gb": 46.71288299560547,
"gpu_memory_used": 4676.0
},
{
"timestamp": "2025-01-03T14:50:13.261611",
"cpu_percent": 8.16,
"ram_percent": 73.4,
"ram_used_gb": 46.71356201171875,
"gpu_memory_used": 4676.0
},
{
"timestamp": "2025-01-03T14:55:44.623607",
"cpu_percent": 12.92,
"ram_percent": 82.8,
"ram_used_gb": 52.65533447265625,
"gpu_memory_used": 4636.0
},
{
"timestamp": "2025-01-03T14:55:44.735410",
"cpu_percent": 15.29,
"ram_percent": 82.7,
"ram_used_gb": 52.63290786743164,
"gpu_memory_used": 4636.0
},
{
"timestamp": "2025-01-03T15:03:40.534449",
"cpu_percent": 13.88,
"ram_percent": 85.0,
"ram_used_gb": 54.050071716308594,
"gpu_memory_used": 4771.0
},
{
"timestamp": "2025-01-03T15:03:40.638708",
"cpu_percent": 12.21,
"ram_percent": 85.0,
"ram_used_gb": 54.053733825683594,
"gpu_memory_used": 4771.0
},
{
"timestamp": "2025-01-03T15:14:34.159142",
"cpu_percent": 14.51,
"ram_percent": 78.1,
"ram_used_gb": 49.70396423339844,
"gpu_memory_used": 4739.0
}
]
}

View file

@ -1,19 +0,0 @@
=== Benchmark Statistics (with correct RTF) ===
Overall Stats:
Total tokens processed: 5500
Total audio generated: 1741.65s
Total test duration: 831.07s
Average processing rate: 6.72 tokens/second
Average RTF: 0.47x
Per-chunk Stats:
Average chunk size: 550.00 tokens
Min chunk size: 100.00 tokens
Max chunk size: 1000.00 tokens
Average processing time: 82.70s
Average output length: 174.17s
Performance Ranges:
Processing rate range: 5.63 - 7.17 tokens/second
RTF range: 0.44x - 0.55x

View file

@ -1,9 +0,0 @@
=== Benchmark Statistics (with correct RTF) ===
Overall Stats:
Total tokens processed: 150850
Total audio generated: 46786.59s
Total test duration: 2012.90s
Average processing rate: 104.34 tokens/second
Average RTF: 0.03x

View file

@ -2,22 +2,22 @@
Total tokens processed: 1800
Total audio generated (s): 568.53
Total test duration (s): 244.10
Average processing rate (tokens/s): 7.34
Average RTF: 0.43
Average Real Time Speed: 2.33
Total test duration (s): 306.02
Average processing rate (tokens/s): 5.75
Average RTF: 0.55
Average Real Time Speed: 1.81
=== Per-chunk Stats ===
Average chunk size (tokens): 600.00
Min chunk size (tokens): 300
Max chunk size (tokens): 900
Average processing time (s): 81.30
Average processing time (s): 101.89
Average output length (s): 189.51
=== Performance Ranges ===
Processing rate range (tokens/s): 7.21 - 7.47
RTF range: 0.43x - 0.43x
Real Time Speed range: 2.33x - 2.33x
Processing rate range (tokens/s): 5.30 - 6.26
RTF range: 0.51x - 0.59x
Real Time Speed range: 1.69x - 1.96x

View file

@ -0,0 +1,337 @@
{
"individual_runs": [
{
"text_length": 37,
"token_count": null,
"total_time": 1.818483829498291,
"time_to_first_chunk": 1.8067498207092285,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens10_run1_stream.wav",
"audio_length": 3.45,
"target_tokens": 10,
"actual_tokens": 10,
"run_number": 1
},
{
"text_length": 37,
"token_count": null,
"total_time": 1.6271553039550781,
"time_to_first_chunk": 1.610968828201294,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens10_run2_stream.wav",
"audio_length": 3.45,
"target_tokens": 10,
"actual_tokens": 10,
"run_number": 2
},
{
"text_length": 37,
"token_count": null,
"total_time": 1.5759549140930176,
"time_to_first_chunk": 1.561316967010498,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens10_run3_stream.wav",
"audio_length": 3.45,
"target_tokens": 10,
"actual_tokens": 10,
"run_number": 3
},
{
"text_length": 37,
"token_count": null,
"total_time": 1.615680456161499,
"time_to_first_chunk": 1.6035709381103516,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens10_run4_stream.wav",
"audio_length": 3.45,
"target_tokens": 10,
"actual_tokens": 10,
"run_number": 4
},
{
"text_length": 37,
"token_count": null,
"total_time": 1.6515357494354248,
"time_to_first_chunk": 1.6268820762634277,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens10_run5_stream.wav",
"audio_length": 3.45,
"target_tokens": 10,
"actual_tokens": 10,
"run_number": 5
},
{
"text_length": 212,
"token_count": null,
"total_time": 7.368175268173218,
"time_to_first_chunk": 3.4540352821350098,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens50_run1_stream.wav",
"audio_length": 15.825,
"target_tokens": 50,
"actual_tokens": 50,
"run_number": 1
},
{
"text_length": 212,
"token_count": null,
"total_time": 6.931752443313599,
"time_to_first_chunk": 3.1553661823272705,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens50_run2_stream.wav",
"audio_length": 15.825,
"target_tokens": 50,
"actual_tokens": 50,
"run_number": 2
},
{
"text_length": 212,
"token_count": null,
"total_time": 6.867500066757202,
"time_to_first_chunk": 3.127124309539795,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens50_run3_stream.wav",
"audio_length": 15.825,
"target_tokens": 50,
"actual_tokens": 50,
"run_number": 3
},
{
"text_length": 212,
"token_count": null,
"total_time": 6.933881521224976,
"time_to_first_chunk": 3.1872360706329346,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens50_run4_stream.wav",
"audio_length": 15.825,
"target_tokens": 50,
"actual_tokens": 50,
"run_number": 4
},
{
"text_length": 212,
"token_count": null,
"total_time": 7.605916738510132,
"time_to_first_chunk": 3.6397976875305176,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens50_run5_stream.wav",
"audio_length": 15.825,
"target_tokens": 50,
"actual_tokens": 50,
"run_number": 5
},
{
"text_length": 448,
"token_count": null,
"total_time": 14.777218580245972,
"time_to_first_chunk": 3.625889778137207,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens100_run1_stream.wav",
"audio_length": 30.35,
"target_tokens": 100,
"actual_tokens": 100,
"run_number": 1
},
{
"text_length": 448,
"token_count": null,
"total_time": 13.911701202392578,
"time_to_first_chunk": 3.298157215118408,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens100_run2_stream.wav",
"audio_length": 30.35,
"target_tokens": 100,
"actual_tokens": 100,
"run_number": 2
},
{
"text_length": 448,
"token_count": null,
"total_time": 14.451806783676147,
"time_to_first_chunk": 3.8353848457336426,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens100_run3_stream.wav",
"audio_length": 30.35,
"target_tokens": 100,
"actual_tokens": 100,
"run_number": 3
},
{
"text_length": 448,
"token_count": null,
"total_time": 13.941124200820923,
"time_to_first_chunk": 3.3754897117614746,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens100_run4_stream.wav",
"audio_length": 30.35,
"target_tokens": 100,
"actual_tokens": 100,
"run_number": 4
},
{
"text_length": 448,
"token_count": null,
"total_time": 15.717307329177856,
"time_to_first_chunk": 3.6421003341674805,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens100_run5_stream.wav",
"audio_length": 30.35,
"target_tokens": 100,
"actual_tokens": 100,
"run_number": 5
},
{
"text_length": 1140,
"token_count": null,
"total_time": 41.16162133216858,
"time_to_first_chunk": 3.7044918537139893,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens250_run1_stream.wav",
"audio_length": 78.175,
"target_tokens": 250,
"actual_tokens": 250,
"run_number": 1
},
{
"text_length": 1140,
"token_count": null,
"total_time": 35.43009877204895,
"time_to_first_chunk": 3.1040024757385254,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens250_run2_stream.wav",
"audio_length": 78.175,
"target_tokens": 250,
"actual_tokens": 250,
"run_number": 2
},
{
"text_length": 1140,
"token_count": null,
"total_time": 35.285505294799805,
"time_to_first_chunk": 3.657808780670166,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens250_run3_stream.wav",
"audio_length": 78.175,
"target_tokens": 250,
"actual_tokens": 250,
"run_number": 3
},
{
"text_length": 1140,
"token_count": null,
"total_time": 34.47842836380005,
"time_to_first_chunk": 3.2033851146698,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens250_run4_stream.wav",
"audio_length": 78.175,
"target_tokens": 250,
"actual_tokens": 250,
"run_number": 4
},
{
"text_length": 1140,
"token_count": null,
"total_time": 36.50936222076416,
"time_to_first_chunk": 3.1159815788269043,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens250_run5_stream.wav",
"audio_length": 78.175,
"target_tokens": 250,
"actual_tokens": 250,
"run_number": 5
},
{
"text_length": 2232,
"token_count": null,
"total_time": 86.84899735450745,
"time_to_first_chunk": 5.405678987503052,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens500_run1_stream.wav",
"audio_length": 155.125,
"target_tokens": 500,
"actual_tokens": 500,
"run_number": 1
},
{
"text_length": 2232,
"token_count": null,
"total_time": 74.72578477859497,
"time_to_first_chunk": 3.966891050338745,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens500_run2_stream.wav",
"audio_length": 155.125,
"target_tokens": 500,
"actual_tokens": 500,
"run_number": 2
},
{
"text_length": 2232,
"token_count": null,
"total_time": 68.1974081993103,
"time_to_first_chunk": 3.27712082862854,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens500_run3_stream.wav",
"audio_length": 155.125,
"target_tokens": 500,
"actual_tokens": 500,
"run_number": 3
},
{
"text_length": 2232,
"token_count": null,
"total_time": 72.68819260597229,
"time_to_first_chunk": 3.153608560562134,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens500_run4_stream.wav",
"audio_length": 155.125,
"target_tokens": 500,
"actual_tokens": 500,
"run_number": 4
},
{
"text_length": 2232,
"token_count": null,
"total_time": 67.94887590408325,
"time_to_first_chunk": 3.954728841781616,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens500_run5_stream.wav",
"audio_length": 155.125,
"target_tokens": 500,
"actual_tokens": 500,
"run_number": 5
}
],
"summary": {
"10": {
"avg_time_to_first_chunk": 1.642,
"avg_total_time": 1.658,
"avg_audio_length": 3.45,
"num_successful_runs": 5
},
"50": {
"avg_time_to_first_chunk": 3.313,
"avg_total_time": 7.141,
"avg_audio_length": 15.825,
"num_successful_runs": 5
},
"100": {
"avg_time_to_first_chunk": 3.555,
"avg_total_time": 14.56,
"avg_audio_length": 30.35,
"num_successful_runs": 5
},
"250": {
"avg_time_to_first_chunk": 3.357,
"avg_total_time": 36.573,
"avg_audio_length": 78.175,
"num_successful_runs": 5
},
"500": {
"avg_time_to_first_chunk": 3.952,
"avg_total_time": 74.082,
"avg_audio_length": 155.125,
"num_successful_runs": 5
}
},
"timestamp": "2025-01-06 03:31:37"
}

View file

@ -0,0 +1,337 @@
{
"individual_runs": [
{
"text_length": 37,
"token_count": null,
"total_time": 1.638200044631958,
"time_to_first_chunk": 1.6232295036315918,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream_openai\\benchmark_tokens10_run1_stream_openai.wav",
"audio_length": 3.45,
"target_tokens": 10,
"actual_tokens": 10,
"run_number": 1
},
{
"text_length": 37,
"token_count": null,
"total_time": 1.4960439205169678,
"time_to_first_chunk": 1.4854960441589355,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream_openai\\benchmark_tokens10_run2_stream_openai.wav",
"audio_length": 3.45,
"target_tokens": 10,
"actual_tokens": 10,
"run_number": 2
},
{
"text_length": 37,
"token_count": null,
"total_time": 1.5055279731750488,
"time_to_first_chunk": 1.4948456287384033,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream_openai\\benchmark_tokens10_run3_stream_openai.wav",
"audio_length": 3.45,
"target_tokens": 10,
"actual_tokens": 10,
"run_number": 3
},
{
"text_length": 37,
"token_count": null,
"total_time": 1.496837854385376,
"time_to_first_chunk": 1.4835176467895508,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream_openai\\benchmark_tokens10_run4_stream_openai.wav",
"audio_length": 3.45,
"target_tokens": 10,
"actual_tokens": 10,
"run_number": 4
},
{
"text_length": 37,
"token_count": null,
"total_time": 1.7330272197723389,
"time_to_first_chunk": 1.7219843864440918,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream_openai\\benchmark_tokens10_run5_stream_openai.wav",
"audio_length": 3.45,
"target_tokens": 10,
"actual_tokens": 10,
"run_number": 5
},
{
"text_length": 212,
"token_count": null,
"total_time": 6.865253925323486,
"time_to_first_chunk": 3.1809072494506836,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream_openai\\benchmark_tokens50_run1_stream_openai.wav",
"audio_length": 15.825,
"target_tokens": 50,
"actual_tokens": 50,
"run_number": 1
},
{
"text_length": 212,
"token_count": null,
"total_time": 7.975425720214844,
"time_to_first_chunk": 3.2910428047180176,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream_openai\\benchmark_tokens50_run2_stream_openai.wav",
"audio_length": 15.825,
"target_tokens": 50,
"actual_tokens": 50,
"run_number": 2
},
{
"text_length": 212,
"token_count": null,
"total_time": 6.793715715408325,
"time_to_first_chunk": 3.210068464279175,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream_openai\\benchmark_tokens50_run3_stream_openai.wav",
"audio_length": 15.825,
"target_tokens": 50,
"actual_tokens": 50,
"run_number": 3
},
{
"text_length": 212,
"token_count": null,
"total_time": 6.639606237411499,
"time_to_first_chunk": 3.0641400814056396,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream_openai\\benchmark_tokens50_run4_stream_openai.wav",
"audio_length": 15.825,
"target_tokens": 50,
"actual_tokens": 50,
"run_number": 4
},
{
"text_length": 212,
"token_count": null,
"total_time": 8.100529193878174,
"time_to_first_chunk": 3.3910109996795654,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream_openai\\benchmark_tokens50_run5_stream_openai.wav",
"audio_length": 15.825,
"target_tokens": 50,
"actual_tokens": 50,
"run_number": 5
},
{
"text_length": 448,
"token_count": null,
"total_time": 15.246968984603882,
"time_to_first_chunk": 3.1980819702148438,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream_openai\\benchmark_tokens100_run1_stream_openai.wav",
"audio_length": 30.35,
"target_tokens": 100,
"actual_tokens": 100,
"run_number": 1
},
{
"text_length": 448,
"token_count": null,
"total_time": 15.934760332107544,
"time_to_first_chunk": 4.23082709312439,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream_openai\\benchmark_tokens100_run2_stream_openai.wav",
"audio_length": 30.35,
"target_tokens": 100,
"actual_tokens": 100,
"run_number": 2
},
{
"text_length": 448,
"token_count": null,
"total_time": 13.799078226089478,
"time_to_first_chunk": 3.42996883392334,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream_openai\\benchmark_tokens100_run3_stream_openai.wav",
"audio_length": 30.35,
"target_tokens": 100,
"actual_tokens": 100,
"run_number": 3
},
{
"text_length": 448,
"token_count": null,
"total_time": 13.400063037872314,
"time_to_first_chunk": 3.2097883224487305,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream_openai\\benchmark_tokens100_run4_stream_openai.wav",
"audio_length": 30.35,
"target_tokens": 100,
"actual_tokens": 100,
"run_number": 4
},
{
"text_length": 448,
"token_count": null,
"total_time": 14.833694219589233,
"time_to_first_chunk": 3.1589744091033936,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream_openai\\benchmark_tokens100_run5_stream_openai.wav",
"audio_length": 30.35,
"target_tokens": 100,
"actual_tokens": 100,
"run_number": 5
},
{
"text_length": 1140,
"token_count": null,
"total_time": 35.49378156661987,
"time_to_first_chunk": 3.852027177810669,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream_openai\\benchmark_tokens250_run1_stream_openai.wav",
"audio_length": 78.175,
"target_tokens": 250,
"actual_tokens": 250,
"run_number": 1
},
{
"text_length": 1140,
"token_count": null,
"total_time": 33.59433174133301,
"time_to_first_chunk": 3.2059006690979004,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream_openai\\benchmark_tokens250_run2_stream_openai.wav",
"audio_length": 78.175,
"target_tokens": 250,
"actual_tokens": 250,
"run_number": 2
},
{
"text_length": 1140,
"token_count": null,
"total_time": 34.23120045661926,
"time_to_first_chunk": 3.1464977264404297,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream_openai\\benchmark_tokens250_run3_stream_openai.wav",
"audio_length": 78.175,
"target_tokens": 250,
"actual_tokens": 250,
"run_number": 3
},
{
"text_length": 1140,
"token_count": null,
"total_time": 36.18487215042114,
"time_to_first_chunk": 3.188844919204712,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream_openai\\benchmark_tokens250_run4_stream_openai.wav",
"audio_length": 78.175,
"target_tokens": 250,
"actual_tokens": 250,
"run_number": 4
},
{
"text_length": 1140,
"token_count": null,
"total_time": 38.142744302749634,
"time_to_first_chunk": 3.6997063159942627,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream_openai\\benchmark_tokens250_run5_stream_openai.wav",
"audio_length": 78.175,
"target_tokens": 250,
"actual_tokens": 250,
"run_number": 5
},
{
"text_length": 2232,
"token_count": null,
"total_time": 71.48920440673828,
"time_to_first_chunk": 3.148237943649292,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream_openai\\benchmark_tokens500_run1_stream_openai.wav",
"audio_length": 155.125,
"target_tokens": 500,
"actual_tokens": 500,
"run_number": 1
},
{
"text_length": 2232,
"token_count": null,
"total_time": 73.53017520904541,
"time_to_first_chunk": 3.464594841003418,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream_openai\\benchmark_tokens500_run2_stream_openai.wav",
"audio_length": 155.125,
"target_tokens": 500,
"actual_tokens": 500,
"run_number": 2
},
{
"text_length": 2232,
"token_count": null,
"total_time": 75.52278685569763,
"time_to_first_chunk": 3.5506417751312256,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream_openai\\benchmark_tokens500_run3_stream_openai.wav",
"audio_length": 155.125,
"target_tokens": 500,
"actual_tokens": 500,
"run_number": 3
},
{
"text_length": 2232,
"token_count": null,
"total_time": 69.45922994613647,
"time_to_first_chunk": 3.495962619781494,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream_openai\\benchmark_tokens500_run4_stream_openai.wav",
"audio_length": 155.125,
"target_tokens": 500,
"actual_tokens": 500,
"run_number": 4
},
{
"text_length": 2232,
"token_count": null,
"total_time": 66.66928672790527,
"time_to_first_chunk": 3.301323175430298,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream_openai\\benchmark_tokens500_run5_stream_openai.wav",
"audio_length": 155.125,
"target_tokens": 500,
"actual_tokens": 500,
"run_number": 5
}
],
"summary": {
"10": {
"avg_time_to_first_chunk": 1.562,
"avg_total_time": 1.574,
"avg_audio_length": 3.45,
"num_successful_runs": 5
},
"50": {
"avg_time_to_first_chunk": 3.227,
"avg_total_time": 7.275,
"avg_audio_length": 15.825,
"num_successful_runs": 5
},
"100": {
"avg_time_to_first_chunk": 3.446,
"avg_total_time": 14.643,
"avg_audio_length": 30.35,
"num_successful_runs": 5
},
"250": {
"avg_time_to_first_chunk": 3.419,
"avg_total_time": 35.529,
"avg_audio_length": 78.175,
"num_successful_runs": 5
},
"500": {
"avg_time_to_first_chunk": 3.392,
"avg_total_time": 71.334,
"avg_audio_length": 155.125,
"num_successful_runs": 5
}
},
"timestamp": "2025-01-06 03:42:32"
}
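Both streaming benchmark files above share this schema, and each `summary` block is just a per-token-count average over the successful entries in `individual_runs`. A minimal aggregation sketch (illustrative only; the benchmark script's own implementation is not shown in this diff):

```python
import json
from collections import defaultdict


def summarize(path: str) -> dict:
    """Rebuild the 'summary' block from 'individual_runs' in a benchmark JSON file."""
    with open(path) as f:
        runs = json.load(f)["individual_runs"]

    grouped = defaultdict(list)
    for run in runs:
        if run["error"] is None:
            grouped[run["target_tokens"]].append(run)

    return {
        str(tokens): {
            "avg_time_to_first_chunk": round(sum(r["time_to_first_chunk"] for r in group) / len(group), 3),
            "avg_total_time": round(sum(r["total_time"] for r in group) / len(group), 3),
            "avg_audio_length": round(sum(r["audio_length"] for r in group) / len(group), 3),
            "num_successful_runs": len(group),
        }
        for tokens, group in sorted(grouped.items())
    }
```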

View file

@ -1,23 +1,23 @@
=== Benchmark Statistics (with correct RTF) ===
Total tokens processed: 17150
Total audio generated (s): 5296.38
Total test duration (s): 155.23
Average processing rate (tokens/s): 102.86
Average RTF: 0.03
Average Real Time Speed: 31.25
Total tokens processed: 3150
Total audio generated (s): 994.22
Total test duration (s): 73.81
Average processing rate (tokens/s): 49.36
Average RTF: 0.07
Average Real Time Speed: 15.00
=== Per-chunk Stats ===
Average chunk size (tokens): 1715.00
Average chunk size (tokens): 525.00
Min chunk size (tokens): 150
Max chunk size (tokens): 5000
Average processing time (s): 15.39
Average output length (s): 529.64
Max chunk size (tokens): 900
Average processing time (s): 12.12
Average output length (s): 165.70
=== Performance Ranges ===
Processing rate range (tokens/s): 80.65 - 125.10
RTF range: 0.03x - 0.04x
Real Time Speed range: 25.00x - 33.33x
Processing rate range (tokens/s): 30.33 - 63.56
RTF range: 0.05x - 0.10x
Real Time Speed range: 10.00x - 20.00x
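For orientation, the RTF and real-time-speed figures in both versions of these stats are per-chunk ratios of processing time to generated audio length, and the reported averages are taken over chunks (so they need not equal the ratio of the totals). A minimal sketch with illustrative names, not the benchmark script's actual variables:

```python
def chunk_metrics(processing_time_s: float, audio_length_s: float) -> tuple[float, float]:
    """Real-time factor (lower is faster) and real-time speed (higher is faster) for one chunk."""
    rtf = processing_time_s / audio_length_s
    return rtf, 1.0 / rtf


# e.g. a chunk that takes 10 s to generate 150 s of audio:
#   rtf = 10 / 150 ≈ 0.067, real-time speed = 150 / 10 = 15x
```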

Binary file not shown. (Before: 231 KiB → After: 230 KiB)

Binary file not shown. (Before: 181 KiB → After: 206 KiB)

Binary file not shown. (Before: 454 KiB → After: 491 KiB)

Binary file not shown. (After: 238 KiB)

Binary file not shown. (After: 236 KiB)

Binary file not shown. (After: 226 KiB)

Binary file not shown. (After: 236 KiB)

Binary file not shown. (Before: 764 KiB)

Binary file not shown. (Before: 238 KiB → After: 224 KiB)

Binary file not shown. (Before: 250 KiB → After: 221 KiB)

Binary file not shown. (Before: 459 KiB → After: 463 KiB)

Binary file not shown. (Before: 198 KiB)

Binary file not shown. (After: 238 KiB)

Binary file not shown. (After: 260 KiB)

View file

@ -0,0 +1,198 @@
#!/usr/bin/env python3
"""Script to generate all plots needed for the README."""
import os
import sys
import shutil
from pathlib import Path
from validate_wav import validate_tts
# Get absolute paths
script_dir = Path(__file__).parent.resolve()
project_root = script_dir.parent.parent
# Add directories to Python path for imports
sys.path.append(str(script_dir))
sys.path.append(str(script_dir / "benchmarks"))
# Import test scripts
from benchmark_tts_rtf import main as benchmark_rtf
from test_formats.test_audio_formats import main as test_formats
from benchmark_first_token_stream_unified import main as benchmark_stream
from test_combinations.test_analyze_combined_voices import main as test_voice_analysis
# Remove directories from path after imports
sys.path.remove(str(script_dir))
sys.path.remove(str(script_dir / "benchmarks"))
def ensure_assets_dir():
"""Create assets directory if it doesn't exist."""
assets_dir = project_root / "assets"
assets_dir.mkdir(exist_ok=True)
return assets_dir
def copy_plot(src_path: str, dest_name: str, assets_dir: Path):
"""Copy a plot to the assets directory with a new name."""
if os.path.exists(src_path):
shutil.copy2(src_path, assets_dir / dest_name)
print(f"Copied {src_path} to {assets_dir / dest_name}")
else:
print(f"Warning: Source plot not found at {src_path}")
def validate_and_print(wav_path: str, category: str):
"""Validate a WAV file and print results."""
if not os.path.exists(wav_path):
print(f"Warning: WAV file not found at {wav_path}")
return
print(f"\n=== Validating {category} Audio ===")
result = validate_tts(wav_path)
if "error" in result:
print(f"Error: {result['error']}")
else:
print(f"Duration: {result['duration']}")
print(f"Sample Rate: {result['sample_rate']} Hz")
print(f"Peak Amplitude: {result['peak_amplitude']}")
print(f"RMS Level: {result['rms_level']}")
if result["issues"]:
print("\nIssues Found:")
for issue in result["issues"]:
print(f"- {issue}")
else:
print("\nNo issues found")
def main():
"""Generate all plots needed for the README."""
# Ensure assets directory exists
prefix = "gpu"
assets_dir = ensure_assets_dir()
print("\n=== Generating Format Comparison Plot ===")
test_formats()
copy_plot(
str(script_dir / "test_formats/output/test_formats/format_comparison.png"),
"format_comparison.png",
assets_dir,
)
# Validate WAV output from format test
validate_and_print(
str(script_dir / "test_formats/output/test_formats/speech.wav"),
"Format Test WAV",
)
print("\n=== Generating Voice Analysis Plot ===")
test_voice_analysis()
copy_plot(
str(script_dir / "test_combinations/output/analysis_comparison.png"),
"voice_analysis.png",
assets_dir,
)
# Validate combined voice output
validate_and_print(
str(
script_dir
/ "test_combinations/output/analysis_combined_af_bella_af_nicole.wav"
),
"Combined Voice",
)
print("\n=== Generating Performance Benchmark Plots ===")
benchmark_rtf()
copy_plot(
str(script_dir / f"benchmarks/output_plots/{prefix}_processing_time_rtf.png"),
f"{prefix}_processing_time.png",
assets_dir,
)
copy_plot(
str(script_dir / f"benchmarks/output_plots/{prefix}_realtime_factor_rtf.png"),
f"{prefix}_realtime_factor.png",
assets_dir,
)
# Validate RTF benchmark output (~500 tokens)
validate_and_print(
str(script_dir / "benchmarks/output_audio/chunk_450_tokens.wav"),
"RTF Benchmark",
)
print("\n=== Generating Streaming Benchmark Plots ===")
benchmark_stream()
# Copy direct streaming plots
copy_plot(
str(script_dir / "benchmarks/output_plots/first_token_latency_stream.png"),
f"{prefix}_first_token_latency_direct.png",
assets_dir,
)
copy_plot(
str(script_dir / "benchmarks/output_plots/first_token_timeline_stream.png"),
f"{prefix}_first_token_timeline_direct.png",
assets_dir,
)
copy_plot(
str(script_dir / "benchmarks/output_plots/total_time_latency_stream.png"),
f"{prefix}_total_time_latency_direct.png",
assets_dir,
)
# Copy OpenAI streaming plots
copy_plot(
str(
script_dir / "benchmarks/output_plots/first_token_latency_stream_openai.png"
),
f"{prefix}_first_token_latency_openai.png",
assets_dir,
)
copy_plot(
str(
script_dir
/ "benchmarks/output_plots/first_token_timeline_stream_openai.png"
),
f"{prefix}_first_token_timeline_openai.png",
assets_dir,
)
copy_plot(
str(
script_dir / "benchmarks/output_plots/total_time_latency_stream_openai.png"
),
f"{prefix}_total_time_latency_openai.png",
assets_dir,
)
# Wait a moment for files to be generated
import time
time.sleep(2)
# Validate streaming outputs (~500 tokens)
validate_and_print(
str(
script_dir
/ "benchmarks/output_audio_stream/benchmark_tokens500_run1_stream.wav"
),
"Direct Streaming",
)
validate_and_print(
str(
script_dir
/ "benchmarks/output_audio_stream_openai/benchmark_tokens500_run1_stream_openai.wav"
),
"OpenAI Streaming",
)
validate_and_print(
str(script_dir / "test_formats/output/test_formats/test_audio.wav"),
"Format Test WAV",
)
print("\nAll plots have been generated and copied to the assets directory")
if __name__ == "__main__":
main()

View file

@ -73,6 +73,7 @@ def generate_speech(
"voice": voice,
"speed": 1.0,
"response_format": "wav", # Use WAV for analysis
"stream": False,
},
)
@ -193,9 +194,10 @@ def plot_analysis(audio_files: Dict[str, str], output_dir: str):
fig.patch.set_facecolor("#1a1a2e")
num_files = len(audio_files)
# Create subplot grid with proper spacing
# Create subplot grid with proper spacing for waveforms and metrics
total_rows = num_files + 2 # Add one more row for metrics
gs = plt.GridSpec(
num_files + 1, 2, height_ratios=[1.5] * num_files + [1], hspace=0.4, wspace=0.3
total_rows, 2, height_ratios=[1.5] * num_files + [1, 1], hspace=0.4, wspace=0.3
)
# Analyze all files first
@ -216,48 +218,74 @@ def plot_analysis(audio_files: Dict[str, str], output_dir: str):
# Colors for voices
colors = ["#ff2a6d", "#05d9e8", "#d1f7ff"]
# Create two subplots for metrics with similar scales
# Left subplot: Brightness and Volume
ax1 = plt.subplot(gs[num_files, 0])
metrics1 = [
# Create metrics for each subplot
metrics = [
(
"Brightness",
[chars["spectral_centroid"] / 1000 for chars in all_chars.values()],
"kHz",
),
("Volume", [chars["rms"] * 100 for chars in all_chars.values()], "RMS×100"),
]
# Right subplot: Voice Pitch and Texture
ax2 = plt.subplot(gs[num_files, 1])
metrics2 = [
(
"Voice Pitch",
[min(chars["dominant_frequencies"]) for chars in all_chars.values()],
"Hz",
plt.subplot(gs[num_files, 0]),
[
(
"Volume",
[chars["rms"] * 100 for chars in all_chars.values()],
"RMS×100",
)
],
),
(
"Texture",
[chars["zero_crossing_rate"] * 1000 for chars in all_chars.values()],
"ZCR×1000",
plt.subplot(gs[num_files, 1]),
[
(
"Brightness",
[chars["spectral_centroid"] / 1000 for chars in all_chars.values()],
"kHz",
)
],
),
(
plt.subplot(gs[num_files + 1, 0]),
[
(
"Voice Pitch",
[
min(chars["dominant_frequencies"])
for chars in all_chars.values()
],
"Hz",
)
],
),
(
plt.subplot(gs[num_files + 1, 1]),
[
(
"Texture",
[
chars["zero_crossing_rate"] * 1000
for chars in all_chars.values()
],
"ZCR×1000",
)
],
),
]
def plot_grouped_bars(ax, metrics, show_legend=True):
n_groups = len(metrics)
# Plot each metric
for i, (ax, metric_data) in enumerate(metrics):
n_voices = len(audio_files)
bar_width = 0.25
indices = np.array([0])
indices = np.arange(n_groups)
values = metric_data[0][1]
max_val = max(values)
# Get max value for y-axis scaling
max_val = max(max(m[1]) for m in metrics)
for i, (voice, color) in enumerate(zip(audio_files.keys(), colors)):
values = [m[1][i] for m in metrics]
offset = (i - n_voices / 2 + 0.5) * bar_width
for j, (voice, color) in enumerate(zip(audio_files.keys(), colors)):
offset = (j - n_voices / 2 + 0.5) * bar_width
bars = ax.bar(
indices + offset, values, bar_width, label=voice, color=color, alpha=0.8
indices + offset,
[values[j]],
bar_width,
label=voice,
color=color,
alpha=0.8,
)
# Add value labels on top of bars
@ -274,12 +302,12 @@ def plot_analysis(audio_files: Dict[str, str], output_dir: str):
)
ax.set_xticks(indices)
ax.set_xticklabels([f"{m[0]}\n({m[2]})" for m in metrics])
# Set y-axis limits with some padding
ax.set_xticklabels([f"{metric_data[0][0]}\n({metric_data[0][2]})"])
ax.set_ylim(0, max_val * 1.2)
ax.set_ylabel("Value")
if show_legend:
# Only show legend on first metric plot
if i == 0:
ax.legend(
bbox_to_anchor=(1.05, 1),
loc="upper left",
@ -287,22 +315,11 @@ def plot_analysis(audio_files: Dict[str, str], output_dir: str):
edgecolor="#ffffff",
)
# Plot both subplots
plot_grouped_bars(ax1, metrics1, show_legend=True)
plot_grouped_bars(ax2, metrics2, show_legend=False)
# Style the subplot
setup_plot(fig, ax, metric_data[0][0])
# Style both subplots
setup_plot(fig, ax1, "Brightness and Volume")
setup_plot(fig, ax2, "Voice Pitch and Texture")
# Add y-axis labels
ax1.set_ylabel("Value")
ax2.set_ylabel("Value")
# Adjust the figure size to accommodate the legend
fig.set_size_inches(15, 15)
# Add padding around the entire figure
# Adjust the figure size and padding
fig.set_size_inches(15, 20)
plt.subplots_adjust(right=0.85, top=0.95, bottom=0.05, left=0.1)
plt.savefig(os.path.join(output_dir, "analysis_comparison.png"), dpi=300)
print(f"Saved analysis comparison to {output_dir}/analysis_comparison.png")
@ -332,7 +349,7 @@ def main():
)
parser.add_argument("--url", default="http://localhost:8880", help="API base URL")
parser.add_argument(
"--output-dir",
"--output-dir",
default="examples/assorted_checks/test_combinations/output",
help="Output directory for audio files",
)

View file

@ -66,26 +66,27 @@ def plot_format_comparison(stats: list, output_dir: str):
for i, stat in enumerate(stats):
format_name = stat["format"].upper()
try:
# Handle PCM format differently
if stat["format"] == "pcm":
# Read raw PCM data (16-bit mono)
with open(
os.path.join(output_dir, f"test_audio.{stat['format']}"), "rb"
) as f:
raw_data = f.read()
data = np.frombuffer(raw_data, dtype=np.int16)
data = data.astype(np.float32) / 32768.0 # Convert to float [-1, 1]
sr = 24000
else:
# Read other formats with soundfile
data, sr = sf.read(
os.path.join(output_dir, f"test_audio.{stat['format']}")
)
file_path = os.path.join(output_dir, f"test_audio.{stat['format']}")
# Plot waveform
if stat["format"] == "wav":
# Use scipy.io.wavfile for WAV files
sr, data = wavfile.read(file_path)
data = data.astype(np.float32) / 32768.0 # Convert to float [-1, 1]
elif stat["format"] == "pcm":
# Read raw 16-bit signed little-endian PCM data at 24kHz
data = np.frombuffer(
open(file_path, "rb").read(), dtype="<i2"
) # '<i2' means little-endian 16-bit signed int
data = data.astype(np.float32) / 32768.0 # Convert to float [-1, 1]
sr = 24000 # Known sample rate for our endpoint
else:
# Use soundfile for other formats (mp3, opus, flac)
data, sr = sf.read(file_path)
# Plot waveform with consistent normalization
ax = plt.subplot(gs_waves[i])
time = np.arange(len(data)) / sr
plt.plot(time, data / np.max(np.abs(data)), linewidth=0.5, color="#ff2a6d")
plt.plot(time, data, linewidth=0.5, color="#ff2a6d")
ax.set_xlabel("Time (seconds)")
ax.set_ylabel("")
ax.set_ylim(-1.1, 1.1)
@ -200,41 +201,42 @@ def get_audio_stats(file_path: str) -> dict:
"""Get audio file statistics"""
file_size = os.path.getsize(file_path)
file_size_kb = file_size / 1024 # Convert to KB
format_name = Path(file_path).suffix[1:]
try:
# Try reading with soundfile first
if format_name == "wav":
# Use scipy.io.wavfile for WAV files
sample_rate, data = wavfile.read(file_path)
data = data.astype(np.float32) / 32768.0 # Convert to float [-1, 1]
duration = len(data) / sample_rate
channels = 1 if len(data.shape) == 1 else data.shape[1]
elif format_name == "pcm":
# For PCM, read raw 16-bit signed little-endian PCM data at 24kHz
data = np.frombuffer(
open(file_path, "rb").read(), dtype="<i2"
) # '<i2' means little-endian 16-bit signed int
data = data.astype(np.float32) / 32768.0 # Normalize to [-1, 1]
sample_rate = 24000 # Known sample rate for our endpoint
duration = len(data) / sample_rate
channels = 1
else:
# Use soundfile for other formats (mp3, opus, flac)
data, sample_rate = sf.read(file_path)
duration = len(data) / sample_rate
channels = 1 if len(data.shape) == 1 else data.shape[1]
# Calculate audio statistics
stats = {
"format": Path(file_path).suffix[1:],
"file_size_kb": round(file_size_kb, 2),
"duration_seconds": round(duration, 2),
"sample_rate": sample_rate,
"channels": channels,
"min_amplitude": float(np.min(data)),
"max_amplitude": float(np.max(data)),
"mean_amplitude": float(np.mean(np.abs(data))),
"rms_amplitude": float(np.sqrt(np.mean(np.square(data)))),
}
return stats
except:
# For PCM, read raw bytes and estimate duration
with open(file_path, "rb") as f:
data = f.read()
# Assuming 16-bit PCM mono at 24kHz
samples = len(data) // 2 # 2 bytes per sample
duration = samples / 24000
return {
"format": "pcm",
"file_size_kb": round(file_size_kb, 2),
"duration_seconds": round(duration, 2),
"sample_rate": 24000,
"channels": 1,
"note": "PCM stats are estimated from raw bytes",
}
# Calculate audio statistics
stats = {
"format": format_name,
"file_size_kb": round(file_size_kb, 2),
"duration_seconds": round(duration, 2),
"sample_rate": sample_rate,
"channels": channels,
"min_amplitude": float(np.min(data)),
"max_amplitude": float(np.max(data)),
"mean_amplitude": float(np.mean(np.abs(data))),
"rms_amplitude": float(np.sqrt(np.mean(np.square(data)))),
}
return stats
def main():
@ -254,13 +256,49 @@ def main():
# Generate and save
start_time = time.time()
response = client.audio.speech.create(
model="kokoro", voice=voice, input=SAMPLE_TEXT, response_format=fmt
# Use requests with stream=False for consistent data handling
response = requests.post(
"http://localhost:8880/v1/audio/speech",
json={
"model": "kokoro",
"voice": voice,
"input": SAMPLE_TEXT,
"response_format": fmt,
"stream": False, # Explicitly disable streaming to get single complete chunk
},
stream=False,
headers={"Accept": f"audio/{fmt}"}, # Explicitly request audio format
)
generation_time = time.time() - start_time
with open(output_path, "wb") as f:
f.write(response.content)
print(f"\nResponse headers for {fmt}:")
for header, value in response.headers.items():
print(f"{header}: {value}")
print(f"Content length: {len(response.content)} bytes")
print(f"First few bytes: {response.content[:20].hex()}")
# Write the file and verify it was written correctly
try:
with open(output_path, "wb") as f:
f.write(response.content)
# Verify file was written
if not output_path.exists():
raise Exception(f"Failed to write {fmt} file")
# Check file size matches content length
written_size = output_path.stat().st_size
if written_size != len(response.content):
raise Exception(
f"File size mismatch: expected {len(response.content)} bytes, got {written_size}"
)
print(f"Successfully wrote {fmt} file")
except Exception as e:
print(f"Error writing {fmt} file: {e}")
continue
# Get stats
file_stats = get_audio_stats(str(output_path))
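Because raw PCM from this endpoint carries no header, the duration estimate in `get_audio_stats` above reduces to bytes divided by bytes-per-second. A standalone sketch of that arithmetic (assumes 16-bit mono at 24 kHz, as in the script):

```python
def pcm_duration_seconds(num_bytes: int, sample_rate: int = 24000, sample_width: int = 2) -> float:
    """Duration of headerless 16-bit mono PCM audio."""
    return num_bytes / (sample_rate * sample_width)


# e.g. 759,600 bytes -> 759600 / (24000 * 2) = 15.825 s,
# matching the 50-token clips in the streaming benchmarks above.
```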

View file

@ -0,0 +1,308 @@
import re
import time
import random
import string
from typing import List, Tuple
def create_test_cases() -> List[str]:
"""Create a variety of test cases with different characteristics"""
# Helper to create random text with specific patterns
def random_text(length: int) -> str:
return "".join(
random.choice(string.ascii_letters + string.digits + " .,!?")
for _ in range(length)
)
test_cases = []
# Base test cases that hit specific patterns
base_cases = [
"Dr. Smith and Mr. Jones discussed the $1,234.56 million investment.",
"Yeah, they met at 10:30 and reviewed A.B.C. documentation with Mrs. Brown etc.",
'The temperature was 72.5 degrees (quite normal) for "this time" of year.',
"X's and Y's properties cost £50 million in the 1990s",
"こんにちは。今日は!",
]
# Add base cases
test_cases.extend(base_cases)
# Add variations with random content
for length in [100, 1000, 10000]:
# Create 3 variations of each length
for _ in range(3):
text = random_text(length)
# Insert some patterns we're looking for
text = text.replace(text[10:20], "Dr. Smith")
text = text.replace(text[30:40], "$1,234.56")
text = text.replace(text[50:60], "A.B.C. xyz")
test_cases.append(text)
return test_cases
class TextNormalizerInline:
"""Text normalizer using inline patterns"""
def normalize(self, text: str) -> str:
# Replace quotes and brackets
text = text.replace(chr(8216), "'").replace(chr(8217), "'")
text = text.replace("«", chr(8220)).replace("»", chr(8221))
text = text.replace(chr(8220), '"').replace(chr(8221), '"')
text = text.replace("(", "«").replace(")", "»")
# Handle CJK punctuation
for a, b in zip("、。!,:;?", ",.!,:;?"):
text = text.replace(a, b + " ")
text = re.sub(r"[^\S \n]", " ", text)
text = re.sub(r" +", " ", text)
text = re.sub(r"(?<=\n) +(?=\n)", "", text)
text = re.sub(r"\bD[Rr]\.(?= [A-Z])", "Doctor", text)
text = re.sub(r"\b(?:Mr\.|MR\.(?= [A-Z]))", "Mister", text)
text = re.sub(r"\b(?:Ms\.|MS\.(?= [A-Z]))", "Miss", text)
text = re.sub(r"\b(?:Mrs\.|MRS\.(?= [A-Z]))", "Mrs", text)
text = re.sub(r"\betc\.(?! [A-Z])", "etc", text)
text = re.sub(r"(?i)\b(y)eah?\b", r"\1e'a", text)
text = re.sub(
r"\d*\.\d+|\b\d{4}s?\b|(?<!:)\b(?:[1-9]|1[0-2]):[0-5]\d\b(?!:)",
split_num,
text,
)
text = re.sub(r"(?<=\d),(?=\d)", "", text)
text = re.sub(
r"(?i)[$£]\d+(?:\.\d+)?(?: hundred| thousand| (?:[bm]|tr)illion)*\b|[$£]\d+\.\d\d?\b",
handle_money,
text,
)
text = re.sub(r"\d*\.\d+", handle_decimal, text)
text = re.sub(r"(?<=\d)-(?=\d)", " to ", text)
text = re.sub(r"(?<=\d)S", " S", text)
text = re.sub(r"(?<=[BCDFGHJ-NP-TV-Z])'?s\b", "'S", text)
text = re.sub(r"(?<=X')S\b", "s", text)
text = re.sub(
r"(?:[A-Za-z]\.){2,} [a-z]", lambda m: m.group().replace(".", "-"), text
)
text = re.sub(r"(?i)(?<=[A-Z])\.(?=[A-Z])", "-", text)
return text.strip()
class TextNormalizerCompiled:
"""Text normalizer using all compiled patterns"""
def __init__(self):
self.patterns = {
"whitespace": re.compile(r"[^\S \n]"),
"multi_space": re.compile(r" +"),
"newline_space": re.compile(r"(?<=\n) +(?=\n)"),
"doctor": re.compile(r"\bD[Rr]\.(?= [A-Z])"),
"mister": re.compile(r"\b(?:Mr\.|MR\.(?= [A-Z]))"),
"miss": re.compile(r"\b(?:Ms\.|MS\.(?= [A-Z]))"),
"mrs": re.compile(r"\b(?:Mrs\.|MRS\.(?= [A-Z]))"),
"etc": re.compile(r"\betc\.(?! [A-Z])"),
"yeah": re.compile(r"(?i)\b(y)eah?\b"),
"numbers": re.compile(
r"\d*\.\d+|\b\d{4}s?\b|(?<!:)\b(?:[1-9]|1[0-2]):[0-5]\d\b(?!:)"
),
"comma_in_number": re.compile(r"(?<=\d),(?=\d)"),
"money": re.compile(
r"(?i)[$£]\d+(?:\.\d+)?(?: hundred| thousand| (?:[bm]|tr)illion)*\b|[$£]\d+\.\d\d?\b"
),
"decimal": re.compile(r"\d*\.\d+"),
"range": re.compile(r"(?<=\d)-(?=\d)"),
"s_after_number": re.compile(r"(?<=\d)S"),
"possessive_s": re.compile(r"(?<=[BCDFGHJ-NP-TV-Z])'?s\b"),
"x_possessive": re.compile(r"(?<=X')S\b"),
"initials": re.compile(r"(?:[A-Za-z]\.){2,} [a-z]"),
"single_initial": re.compile(r"(?i)(?<=[A-Z])\.(?=[A-Z])"),
}
def normalize(self, text: str) -> str:
# Replace quotes and brackets
text = text.replace(chr(8216), "'").replace(chr(8217), "'")
text = text.replace("«", chr(8220)).replace("»", chr(8221))
text = text.replace(chr(8220), '"').replace(chr(8221), '"')
text = text.replace("(", "«").replace(")", "»")
# Handle CJK punctuation
for a, b in zip("、。!,:;?", ",.!,:;?"):
text = text.replace(a, b + " ")
# Use compiled patterns
text = self.patterns["whitespace"].sub(" ", text)
text = self.patterns["multi_space"].sub(" ", text)
text = self.patterns["newline_space"].sub("", text)
text = self.patterns["doctor"].sub("Doctor", text)
text = self.patterns["mister"].sub("Mister", text)
text = self.patterns["miss"].sub("Miss", text)
text = self.patterns["mrs"].sub("Mrs", text)
text = self.patterns["etc"].sub("etc", text)
text = self.patterns["yeah"].sub(r"\1e'a", text)
text = self.patterns["numbers"].sub(split_num, text)
text = self.patterns["comma_in_number"].sub("", text)
text = self.patterns["money"].sub(handle_money, text)
text = self.patterns["decimal"].sub(handle_decimal, text)
text = self.patterns["range"].sub(" to ", text)
text = self.patterns["s_after_number"].sub(" S", text)
text = self.patterns["possessive_s"].sub("'S", text)
text = self.patterns["x_possessive"].sub("s", text)
text = self.patterns["initials"].sub(
lambda m: m.group().replace(".", "-"), text
)
text = self.patterns["single_initial"].sub("-", text)
return text.strip()
class TextNormalizerHybrid:
"""Text normalizer using hybrid approach - compile only complex/frequent patterns"""
def __init__(self):
# Only compile patterns that are complex or frequently used
self.patterns = {
"whitespace": re.compile(r"[^\S \n]"),
"numbers": re.compile(
r"\d*\.\d+|\b\d{4}s?\b|(?<!:)\b(?:[1-9]|1[0-2]):[0-5]\d\b(?!:)"
),
"money": re.compile(
r"(?i)[$£]\d+(?:\.\d+)?(?: hundred| thousand| (?:[bm]|tr)illion)*\b|[$£]\d+\.\d\d?\b"
),
"initials": re.compile(r"(?:[A-Za-z]\.){2,} [a-z]"),
}
def normalize(self, text: str) -> str:
# Replace quotes and brackets
text = text.replace(chr(8216), "'").replace(chr(8217), "'")
text = text.replace("«", chr(8220)).replace("»", chr(8221))
text = text.replace(chr(8220), '"').replace(chr(8221), '"')
text = text.replace("(", "«").replace(")", "»")
# Handle CJK punctuation
for a, b in zip("、。!,:;?", ",.!,:;?"):
text = text.replace(a, b + " ")
# Use compiled patterns for complex operations
text = self.patterns["whitespace"].sub(" ", text)
text = self.patterns["numbers"].sub(split_num, text)
text = self.patterns["money"].sub(handle_money, text)
text = self.patterns["initials"].sub(
lambda m: m.group().replace(".", "-"), text
)
# Use inline patterns for simpler operations
text = re.sub(r" +", " ", text)
text = re.sub(r"(?<=\n) +(?=\n)", "", text)
text = re.sub(r"\bD[Rr]\.(?= [A-Z])", "Doctor", text)
text = re.sub(r"\b(?:Mr\.|MR\.(?= [A-Z]))", "Mister", text)
text = re.sub(r"\b(?:Ms\.|MS\.(?= [A-Z]))", "Miss", text)
text = re.sub(r"\b(?:Mrs\.|MRS\.(?= [A-Z]))", "Mrs", text)
text = re.sub(r"\betc\.(?! [A-Z])", "etc", text)
text = re.sub(r"(?i)\b(y)eah?\b", r"\1e'a", text)
text = re.sub(r"(?<=\d),(?=\d)", "", text)
text = re.sub(r"\d*\.\d+", handle_decimal, text)
text = re.sub(r"(?<=\d)-(?=\d)", " to ", text)
text = re.sub(r"(?<=\d)S", " S", text)
text = re.sub(r"(?<=[BCDFGHJ-NP-TV-Z])'?s\b", "'S", text)
text = re.sub(r"(?<=X')S\b", "s", text)
text = re.sub(r"(?i)(?<=[A-Z])\.(?=[A-Z])", "-", text)
return text.strip()
def split_num(match: re.Match) -> str:
"""Split numbers for TTS processing"""
num = match.group(0)
if ":" in num:
h, m = num.split(":")
return f"{h} {m}"
if num.endswith("s"):
return f"{num[:-1]} s"
return num
def handle_money(match: re.Match) -> str:
"""Format money strings for TTS"""
text = match.group(0)
return text.replace("$", " dollars ").replace("£", " pounds ")
def handle_decimal(match: re.Match) -> str:
"""Format decimal numbers for TTS"""
num = match.group(0)
return num.replace(".", " point ")
def benchmark_normalizers(
test_cases: List[str], iterations: int = 100
) -> Tuple[float, float, float]:
"""Benchmark all three implementations"""
normalizers = {
"inline": TextNormalizerInline(),
"compiled": TextNormalizerCompiled(),
"hybrid": TextNormalizerHybrid(),
}
results = {}
# Test each normalizer
for name, normalizer in normalizers.items():
start = time.perf_counter()
# Run normalizations
for _ in range(iterations):
for test in test_cases:
normalizer.normalize(test)
results[name] = time.perf_counter() - start
return results
def verify_outputs(test_cases: List[str]) -> bool:
"""Verify that all implementations produce identical output"""
normalizers = {
"inline": TextNormalizerInline(),
"compiled": TextNormalizerCompiled(),
"hybrid": TextNormalizerHybrid(),
}
for test in test_cases:
results = [norm.normalize(test) for norm in normalizers.values()]
if not all(r == results[0] for r in results):
return False
return True
def main():
# Create test cases
print("Generating test cases...")
test_cases = create_test_cases()
total_chars = sum(len(t) for t in test_cases)
print(
f"Created {len(test_cases)} test cases, total size: {total_chars:,} characters"
)
# Verify output consistency
print("\nVerifying output consistency...")
if verify_outputs(test_cases):
print("✓ All implementations produce identical output")
else:
print("✗ Warning: Implementations produce different outputs!")
return
# Run benchmarks
print("\nRunning benchmarks...")
iterations = 100
results = benchmark_normalizers(test_cases, iterations)
# Print results
print(f"\nResults for {iterations} iterations: ")
for name, time_taken in results.items():
print(f"{name.capitalize()}: {time_taken:.3f}s")
main()
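As a quick spot check outside the timing loop, any of the three normalizers above can be run on a single base case; the output is not asserted here, since it depends on the pattern ordering shown:

```python
# Appended to the script above; uses the classes it defines.
sample = "Dr. Smith and Mr. Jones discussed the $1,234.56 million investment."
for name, normalizer in [
    ("inline", TextNormalizerInline()),
    ("compiled", TextNormalizerCompiled()),
    ("hybrid", TextNormalizerHybrid()),
]:
    print(f"{name:>8}: {normalizer.normalize(sample)}")
```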

View file

@ -1,218 +1,262 @@
import argparse
from typing import Any, Dict
from pathlib import Path
import numpy as np
import soundfile as sf
import argparse
from pathlib import Path
from tqdm import tqdm
def validate_tts(wav_path: str) -> dict:
"""
Quick validation checks for TTS-generated audio files to detect common artifacts.
Checks for:
- Unnatural silence gaps
- Audio glitches and artifacts
- Repeated speech segments (stuck/looping)
- Abrupt changes in speech
- Audio quality issues
Args:
wav_path: Path to audio file (wav, mp3, etc)
Returns:
Dictionary with validation results
Validation checks for TTS-generated audio files to detect common artifacts.
"""
try:
# Load audio
# Load and process audio
audio, sr = sf.read(wav_path)
if len(audio.shape) > 1:
audio = audio.mean(axis=1) # Convert to mono
# Basic audio stats
audio = np.mean(audio, axis=1)
duration = len(audio) / sr
rms = np.sqrt(np.mean(audio**2))
peak = np.max(np.abs(audio))
dc_offset = np.mean(audio)
# Calculate clipping stats if we're near peak
clip_count = np.sum(np.abs(audio) >= 0.99)
clip_percent = (clip_count / len(audio)) * 100
if clip_percent > 0:
clip_stats = f" ({clip_percent:.2e} ratio near peak)"
else:
clip_stats = " (no samples near peak)"
# Convert to dB for analysis
eps = np.finfo(float).eps
db = 20 * np.log10(np.abs(audio) + eps)
issues = []
# Check if audio is too short (likely failed generation)
if duration < 0.1: # Less than 100ms
issues.append("WARNING: Audio is suspiciously short - possible failed generation")
# 1. Check for basic audio quality
if peak >= 1.0:
# Calculate percentage of samples that are clipping
clip_count = np.sum(np.abs(audio) >= 0.99)
clip_percent = (clip_count / len(audio)) * 100
if clip_percent > 1.0: # Only warn if more than 1% of samples clip
issues.append(f"WARNING: Significant clipping detected ({clip_percent:.2e}% of samples)")
elif clip_percent > 0.01: # Add info if more than 0.01% but less than 1%
issues.append(f"INFO: Minor peak limiting detected ({clip_percent:.2e}% of samples) - likely intentional normalization")
if rms < 0.01:
# Basic quality checks
abs_audio = np.abs(audio)
stats = {
"rms": float(np.sqrt(np.mean(audio**2))),
"peak": float(np.max(abs_audio)),
"dc_offset": float(np.mean(audio)),
}
clip_count = np.sum(abs_audio >= 0.99)
clip_percent = (clip_count / len(audio)) * 100
if duration < 0.1:
issues.append(
"WARNING: Audio is suspiciously short - possible failed generation"
)
if stats["peak"] >= 1.0:
if clip_percent > 1.0:
issues.append(
f"WARNING: Significant clipping detected ({clip_percent:.2e}% of samples)"
)
elif clip_percent > 0.01:
issues.append(
f"INFO: Minor peak limiting detected ({clip_percent:.2e}% of samples)"
)
if stats["rms"] < 0.01:
issues.append("WARNING: Audio is very quiet - possible failed generation")
if abs(dc_offset) > 0.1: # DC offset is particularly bad for speech
issues.append(f"WARNING: High DC offset ({dc_offset:.3f}) - may cause audio artifacts")
# 2. Check for long silence gaps (potential TTS failures)
if abs(stats["dc_offset"]) > 0.1:
issues.append(f"WARNING: High DC offset ({stats['dc_offset']:.3f})")
# Check for long silence gaps
eps = np.finfo(float).eps
db = 20 * np.log10(abs_audio + eps)
silence_threshold = -45 # dB
min_silence = 2.0 # Only detect silences longer than 2 seconds
min_silence = 2.0 # seconds
window_size = int(min_silence * sr)
silence_count = 0
last_silence = -1
# Skip the first 0.2s for silence detection (avoid false positives at start)
start_idx = int(0.2 * sr)
for i in range(start_idx, len(db) - window_size, window_size):
window = db[i:i+window_size]
start_idx = int(0.2 * sr) # Skip first 0.2s
for i in tqdm(
range(start_idx, len(db) - window_size, window_size),
desc="Checking for silence",
):
window = db[i : i + window_size]
if np.mean(window) < silence_threshold:
# Verify the entire window is mostly silence
silent_ratio = np.mean(window < silence_threshold)
if silent_ratio > 0.9: # 90% of the window should be below threshold
if last_silence == -1 or (i/sr - last_silence) > 2.0: # Only count silences more than 2s apart
if silent_ratio > 0.9:
if last_silence == -1 or (i / sr - last_silence) > 2.0:
silence_count += 1
last_silence = i/sr
issues.append(f"WARNING: Long silence detected at {i/sr:.2f}s (duration: {min_silence:.1f}s)")
if silence_count > 2: # Only warn if there are multiple long silences
issues.append(f"WARNING: Multiple long silences found ({silence_count} total) - possible generation issue")
# 3. Check for extreme audio artifacts (changes too rapid for natural speech)
# Use a longer window to avoid flagging normal phoneme transitions
window_size = int(0.02 * sr) # 20ms window
db_smooth = np.convolve(db, np.ones(window_size)/window_size, 'same')
db_diff = np.abs(np.diff(db_smooth))
# Much higher threshold to only catch truly unnatural changes
artifact_threshold = 40 # dB
min_duration = int(0.01 * sr) # Minimum 10ms duration
# Find regions where the smoothed dB change is extreme
artifact_points = np.where(db_diff > artifact_threshold)[0]
if len(artifact_points) > 0:
# Group artifacts that are very close together
grouped_artifacts = []
current_group = [artifact_points[0]]
for i in range(1, len(artifact_points)):
if (artifact_points[i] - current_group[-1]) < min_duration:
current_group.append(artifact_points[i])
else:
if len(current_group) * (1/sr) >= 0.01: # Only keep groups lasting >= 10ms
grouped_artifacts.append(current_group)
current_group = [artifact_points[i]]
if len(current_group) * (1/sr) >= 0.01:
grouped_artifacts.append(current_group)
# Report only the most severe artifacts
for group in grouped_artifacts[:2]: # Report up to 2 worst artifacts
center_idx = group[len(group)//2]
db_change = db_diff[center_idx]
if db_change > 45: # Only report very extreme changes
issues.append(
f"WARNING: Possible audio artifact at {center_idx/sr:.2f}s "
f"({db_change:.1f}dB change over {len(group)/sr*1000:.0f}ms)"
)
# 4. Check for repeated speech segments (stuck/looping)
# Check both short and long sentence durations at audiobook speed (150-160 wpm)
for chunk_duration in [5.0, 10.0]: # 5s (~12 words) and 10s (~25 words) at ~audiobook speed
last_silence = i / sr
issues.append(
f"WARNING: Long silence detected at {i/sr:.2f}s (duration: {min_silence:.1f}s)"
)
if silence_count > 2:
issues.append(
f"WARNING: Multiple long silences found ({silence_count} total)"
)
# Detect audio artifacts
diff = np.diff(audio)
abs_diff = np.abs(diff)
window_size = min(int(0.005 * sr), 256)
window = np.ones(window_size) / window_size
local_avg_diff = np.convolve(abs_diff, window, mode="same")
spikes = (abs_diff > (10 * local_avg_diff)) & (abs_diff > 0.1)
artifact_indices = np.nonzero(spikes)[0]
artifacts = []
if len(artifact_indices) > 0:
gaps = np.diff(artifact_indices)
min_gap = int(0.005 * sr)
break_points = np.nonzero(gaps > min_gap)[0] + 1
groups = np.split(artifact_indices, break_points)
for group in groups:
if len(group) >= 5:
severity = np.max(abs_diff[group])
if severity > 0.2:
center_idx = group[len(group) // 2]
artifacts.append(
{
"time": float(
center_idx / sr
), # Ensure float for consistent timing
"severity": float(severity),
}
)
issues.append(
f"WARNING: Audio discontinuity at {center_idx/sr:.3f}s "
f"(severity: {severity:.3f})"
)
# Check for repeated speech segments
for chunk_duration in tqdm(
[0.5, 2.5, 5.0, 10.0], desc="Checking for repeated speech"
):
chunk_size = int(chunk_duration * sr)
overlap = int(0.2 * chunk_size) # 20% overlap between chunks
for i in range(0, len(audio) - 2*chunk_size, overlap):
chunk1 = audio[i:i+chunk_size]
chunk2 = audio[i+chunk_size:i+2*chunk_size]
# Ignore chunks that are mostly silence
overlap = int(0.2 * chunk_size)
for i in range(0, len(audio) - 2 * chunk_size, overlap):
chunk1 = audio[i : i + chunk_size]
chunk2 = audio[i + chunk_size : i + 2 * chunk_size]
if np.mean(np.abs(chunk1)) < 0.01 or np.mean(np.abs(chunk2)) < 0.01:
continue
try:
correlation = np.corrcoef(chunk1, chunk2)[0,1]
if not np.isnan(correlation) and correlation > 0.92: # Lower threshold for sentence-length chunks
correlation = np.corrcoef(chunk1, chunk2)[0, 1]
if not np.isnan(correlation) and correlation > 0.92:
issues.append(
f"WARNING: Possible repeated speech at {i/sr:.1f}s "
f"(~{int(chunk_duration*160/60):d} words, correlation: {correlation:.3f})"
)
break # Found repetition at this duration, try next duration
break
except:
continue
# 5. Check for extreme amplitude discontinuities (common in failed TTS)
amplitude_envelope = np.abs(audio)
window_size = sr // 10 # 100ms window for smoother envelope
smooth_env = np.convolve(amplitude_envelope, np.ones(window_size)/float(window_size), 'same')
env_diff = np.abs(np.diff(smooth_env))
# Only detect very extreme amplitude changes
jump_threshold = 0.5 # Much higher threshold
jumps = np.where(env_diff > jump_threshold)[0]
if len(jumps) > 0:
# Group jumps that are close together
grouped_jumps = []
current_group = [jumps[0]]
for i in range(1, len(jumps)):
if (jumps[i] - current_group[-1]) < 0.05 * sr: # Group within 50ms
current_group.append(jumps[i])
else:
if len(current_group) >= 3: # Only keep significant discontinuities
grouped_jumps.append(current_group)
current_group = [jumps[i]]
if len(current_group) >= 3:
grouped_jumps.append(current_group)
# Report only the most severe discontinuities
for group in grouped_jumps[:2]: # Report up to 2 worst cases
center_idx = group[len(group)//2]
jump_size = env_diff[center_idx]
if jump_size > 0.6: # Only report very extreme changes
issues.append(
f"WARNING: Possible audio discontinuity at {center_idx/sr:.2f}s "
f"({jump_size:.2f} amplitude ratio change)"
)
return {
"file": wav_path,
"duration": f"{duration:.2f}s",
"sample_rate": sr,
"peak_amplitude": f"{peak:.3f}{clip_stats}",
"rms_level": f"{rms:.3f}",
"dc_offset": f"{dc_offset:.3f}",
"peak_amplitude": f"{stats['peak']:.3f}",
"rms_level": f"{stats['rms']:.3f}",
"dc_offset": f"{stats['dc_offset']:.3f}",
"artifact_count": len(artifacts),
"artifact_locations": [a["time"] for a in artifacts],
"artifact_severities": [a["severity"] for a in artifacts],
"issues": issues,
"valid": len(issues) == 0
}
except Exception as e:
return {
"file": wav_path,
"error": str(e),
"valid": False
"valid": len(issues) == 0,
}
except Exception as e:
return {"file": wav_path, "error": str(e), "valid": False}
def generate_analysis_plots(
wav_path: str, output_dir: str, validation_result: Dict[str, Any]
):
"""
Generate analysis plots for audio file with time-aligned visualizations.
"""
import matplotlib.pyplot as plt
from scipy.signal import spectrogram
# Load audio
audio, sr = sf.read(wav_path)
if len(audio.shape) > 1:
audio = np.mean(audio, axis=1)
# Create figure with shared x-axis
fig = plt.figure(figsize=(15, 8))
gs = plt.GridSpec(2, 1, height_ratios=[1.2, 0.8], hspace=0.1)
ax1 = fig.add_subplot(gs[0])
ax2 = fig.add_subplot(gs[1], sharex=ax1)
# Calculate spectrogram
nperseg = 2048
noverlap = 1536
f, t, Sxx = spectrogram(
audio, sr, nperseg=nperseg, noverlap=noverlap, window="hann", scaling="spectrum"
)
# Plot spectrogram
im = ax1.pcolormesh(
t,
f,
10 * np.log10(Sxx + 1e-10),
shading="gouraud",
cmap="viridis",
vmin=-100,
vmax=-20,
)
ax1.set_ylabel("Frequency [Hz]", fontsize=10)
cbar = plt.colorbar(im, ax=ax1, label="dB")
ax1.set_title("Spectrogram", pad=10, fontsize=12)
# Plot waveform with exact time alignment
times = np.arange(len(audio)) / sr
ax2.plot(times, audio, color="#2E5596", alpha=0.7, linewidth=0.5, label="Audio")
ax2.set_ylabel("Amplitude", fontsize=10)
ax2.set_xlabel("Time [sec]", fontsize=10)
ax2.grid(True, alpha=0.2)
# Add artifact markers
if (
"artifact_locations" in validation_result
and validation_result["artifact_locations"]
):
for loc in validation_result["artifact_locations"]:
ax1.axvline(x=loc, color="red", alpha=0.7, linewidth=2)
ax2.axvline(
x=loc, color="red", alpha=0.7, linewidth=2, label="Detected Artifacts"
)
# Add legend to both plots
if len(validation_result["artifact_locations"]) > 0:
ax1.plot([], [], color="red", linewidth=2, label="Detected Artifacts")
ax1.legend(loc="upper right", fontsize=8)
# Only add unique labels to legend
handles, labels = ax2.get_legend_handles_labels()
unique_labels = dict(zip(labels, handles))
ax2.legend(
unique_labels.values(),
unique_labels.keys(),
loc="upper right",
fontsize=8,
)
# Set common x limits
xlim = (0, len(audio) / sr)
ax1.set_xlim(xlim)
ax2.set_xlim(xlim)
og_filename = Path(wav_path).name.split(".")[0]
# Save plot
plt.savefig(
Path(output_dir) / f"{og_filename}_audio_analysis.png",
dpi=300,
bbox_inches="tight",
)
plt.close()
if __name__ == "__main__":
parser = argparse.ArgumentParser(description="TTS Output Validator")
parser.add_argument("wav_file", help="Path to audio file to validate")
args = parser.parse_args()
result = validate_tts(args.wav_file)
wav_file = r"C:\Users\jerem\Desktop\Kokoro-FastAPI\examples\assorted_checks\benchmarks\output_audio\chunk_600_tokens.wav"
silent = False
print(f"\n\n Processing:\n\t{wav_file}")
result = validate_tts(wav_file)
if not silent:
wav_root_dir = Path(wav_file).parent
generate_analysis_plots(wav_file, wav_root_dir, result)
print(f"\nValidating: {result['file']}")
if "error" in result:
print(f"Error: {result['error']}")
@ -222,7 +266,8 @@ if __name__ == "__main__":
print(f"Peak Amplitude: {result['peak_amplitude']}")
print(f"RMS Level: {result['rms_level']}")
print(f"DC Offset: {result['dc_offset']}")
print(f"Detected Artifacts: {result['artifact_count']}")
if result["issues"]:
print("\nIssues Found:")
for issue in result["issues"]:

View file

@ -1,7 +1,9 @@
import argparse
from pathlib import Path
from validate_wav import validate_tts
def print_validation_result(result: dict, rel_path: Path):
"""Print full validation details for a single file."""
print(f"\nValidating: {rel_path}")
@ -13,7 +15,7 @@ def print_validation_result(result: dict, rel_path: Path):
print(f"Peak Amplitude: {result['peak_amplitude']}")
print(f"RMS Level: {result['rms_level']}")
print(f"DC Offset: {result['dc_offset']}")
if result["issues"]:
print("\nIssues Found:")
for issue in result["issues"]:
@ -21,25 +23,26 @@ def print_validation_result(result: dict, rel_path: Path):
else:
print("\nNo issues found")
def validate_directory(directory: str):
"""Validate all wav files in a directory with detailed output and summary."""
dir_path = Path(directory)
# Find all wav files (including nested directories)
wav_files = list(dir_path.rglob("*.wav"))
wav_files.extend(dir_path.rglob("*.mp3")) # Also check mp3s
wav_files = sorted(wav_files)
if not wav_files:
print(f"No .wav or .mp3 files found in {directory}")
return
print(f"Found {len(wav_files)} files in {directory}")
print("=" * 80)
# Store results for summary
results = []
# Detailed validation output
for wav_file in wav_files:
result = validate_tts(str(wav_file))
@ -47,7 +50,7 @@ def validate_directory(directory: str):
print_validation_result(result, rel_path)
results.append((rel_path, result))
print("=" * 80)
# Summary with detailed issues
print("\nSUMMARY:")
for rel_path, result in results:
@ -58,15 +61,18 @@ def validate_directory(directory: str):
issues = result["issues"]
first_issue = issues[0].replace("WARNING: ", "")
if len(issues) > 1:
print(f"{rel_path}: FAIL - {first_issue} (+{len(issues)-1} more issues)")
print(
f"{rel_path}: FAIL - {first_issue} (+{len(issues)-1} more issues)"
)
else:
print(f"{rel_path}: FAIL - {first_issue}")
else:
print(f"{rel_path}: PASS")
if __name__ == "__main__":
parser = argparse.ArgumentParser(description="Batch validate TTS wav files")
parser.add_argument("directory", help="Directory containing wav files to validate")
args = parser.parse_args()
validate_directory(args.directory)

BIN
examples/audio_analysis.png Normal file

Binary file not shown.


View file

@ -0,0 +1,49 @@
#!/usr/bin/env rye run python
import time
from pathlib import Path
from openai import OpenAI
# Talks to the local Kokoro-FastAPI server; no real OpenAI API key is required
openai = OpenAI(base_url="http://localhost:8880/v1", api_key="not-needed-for-local")
speech_file_path = Path(__file__).parent / "speech.mp3"
def main() -> None:
stream_to_speakers()
# Create text-to-speech audio file
with openai.audio.speech.with_streaming_response.create(
model="kokoro",
voice="af",
input="the quick brown fox jumped over the lazy dogs",
) as response:
response.stream_to_file(speech_file_path)
def stream_to_speakers() -> None:
import pyaudio
player_stream = pyaudio.PyAudio().open(format=pyaudio.paInt16, channels=1, rate=24000, output=True)
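# 24 kHz, mono, 16-bit matches the raw PCM that Kokoro streams back,
# so each received chunk can be written straight to the output device.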
start_time = time.time()
with openai.audio.speech.with_streaming_response.create(
model="kokoro",
voice="af",
response_format="pcm", # similar to WAV, but without a header chunk at the start.
input="""My dear sir, that is just where you are wrong. That is just where the whole world has gone wrong. We are always getting away from the present moment. Our mental existences, which are immaterial and have no dimensions, are passing along the Time-Dimension with a uniform velocity from the cradle to the grave. Just as we should travel down if we began our existence fifty miles above the earths surface""",
) as response:
print(f"Time to first byte: {int((time.time() - start_time) * 1000)}ms")
for chunk in response.iter_bytes(chunk_size=1024):
player_stream.write(chunk)
print(f"Done in {int((time.time() - start_time) * 1000)}ms.")
if __name__ == "__main__":
main()

View file

@ -0,0 +1,125 @@
#!/usr/bin/env python3
import requests
import numpy as np
import sounddevice as sd
import time
import os
import wave
def play_streaming_tts(text: str, output_file: str = None, voice: str = "af"):
"""Stream TTS audio and play it back in real-time"""
print("\nStarting TTS stream request...")
start_time = time.time()
# Initialize variables
sample_rate = 24000 # Known sample rate for Kokoro
audio_started = False
chunk_count = 0
total_bytes = 0
first_chunk_time = None
all_audio_data = bytearray() # Raw PCM audio data
# Start sounddevice stream with buffer
stream = sd.OutputStream(
samplerate=sample_rate,
channels=1,
dtype=np.int16,
blocksize=1024, # Buffer size in samples
latency='low' # Request low latency
)
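# Rough latency math (assuming 24 kHz mono int16): a 1024-sample block is ~43 ms of audio,
# so playback can begin almost as soon as the first block has been received.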
stream.start()
# Make streaming request to API
try:
response = requests.post(
"http://localhost:8880/v1/audio/speech",
json={
"model": "kokoro",
"input": text,
"voice": voice,
"response_format": "pcm",
"stream": True
},
stream=True,
timeout=1800
)
response.raise_for_status()
print(f"Request started successfully after {time.time() - start_time:.2f}s")
# Process streaming response with smaller chunks for lower latency
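# 512 bytes of 16-bit PCM = 256 samples, i.e. ~10.7 ms of audio per chunk at 24 kHz,
# trading a little HTTP overhead for a quicker time-to-first-sound.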
for chunk in response.iter_content(chunk_size=512): # 512 bytes = 256 samples at 16-bit
if chunk:
chunk_count += 1
total_bytes += len(chunk)
# Handle first chunk
if not audio_started:
first_chunk_time = time.time()
print(f"\nReceived first chunk after {first_chunk_time - start_time:.2f}s")
print(f"First chunk size: {len(chunk)} bytes")
audio_started = True
# Convert bytes to numpy array and play
audio_chunk = np.frombuffer(chunk, dtype=np.int16)
stream.write(audio_chunk)
# Accumulate raw audio data
all_audio_data.extend(chunk)
# Log progress every 10 chunks
if chunk_count % 10 == 0:
elapsed = time.time() - start_time
print(f"Progress: {chunk_count} chunks, {total_bytes/1024:.1f}KB received, {elapsed:.1f}s elapsed")
# Final stats
total_time = time.time() - start_time
print(f"\nStream complete:")
print(f"Total chunks: {chunk_count}")
print(f"Total data: {total_bytes/1024:.1f}KB")
print(f"Total time: {total_time:.2f}s")
print(f"Average speed: {(total_bytes/1024)/total_time:.1f}KB/s")
# Save as WAV file
if output_file:
print(f"\nWriting audio to {output_file}")
with wave.open(output_file, 'wb') as wav_file:
wav_file.setnchannels(1) # Mono
wav_file.setsampwidth(2) # 2 bytes per sample (16-bit)
wav_file.setframerate(sample_rate)
wav_file.writeframes(all_audio_data)
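# 16-bit mono at 24 kHz is 48,000 bytes per second,
# so len(all_audio_data) / 48000 approximates the clip duration in seconds.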
print(f"Saved {len(all_audio_data)} bytes of audio data")
# Clean up
stream.stop()
stream.close()
except requests.exceptions.ConnectionError as e:
print(f"Connection error - Is the server running? Error: {str(e)}")
stream.stop()
stream.close()
except Exception as e:
print(f"Error during streaming: {str(e)}")
stream.stop()
stream.close()
def main():
# Load sample text from HG Wells
script_dir = os.path.dirname(os.path.abspath(__file__))
wells_path = os.path.join(script_dir, "assorted_checks/benchmarks/the_time_machine_hg_wells.txt")
output_path = os.path.join(script_dir, "output.wav")
with open(wells_path, "r", encoding="utf-8") as f:
full_text = f.read()
# Take first few paragraphs
text = " ".join(full_text.split("\n\n")[:2])
print("\nStarting TTS stream playback...")
print(f"Text length: {len(text)} characters")
print("\nFirst 100 characters:")
print(text[:100] + "...")
play_streaming_tts(text, output_file=output_path)
if __name__ == "__main__":
main()

View file

@ -13,7 +13,7 @@ numpy==2.2.1
scipy==1.14.1
# Audio processing
soundfile==0.12.1
soundfile==0.13.0
# Text processing
phonemizer==3.3.0