- Update soundfile version
- Alignment with streaming standards
- Audio processing config settings
- More comprehensive model warmup
- Minor model improvements
- Enhanced testing and benchmarking
- Cool ASCII logo
BIN  .coverage
@@ -129,7 +129,7 @@ response = requests.post(
 )
 ```
 <p align="center">
-  <img src="examples/benchmarks/analysis_comparison.png" width="80%" alt="Voice Analysis Comparison" style="border: 2px solid #333; padding: 10px;">
+  <img src="assets/voice_analysis.png" width="80%" alt="Voice Analysis Comparison" style="border: 2px solid #333; padding: 10px;">
 </p>
 </details>

@@ -144,7 +144,7 @@ response = requests.post(
 - pcm

 <p align="center">
-  <img src="examples/benchmarks/format_comparison.png" width="80%" alt="Audio Format Comparison" style="border: 2px solid #333; padding: 10px;">
+  <img src="assets/format_comparison.png" width="80%" alt="Audio Format Comparison" style="border: 2px solid #333; padding: 10px;">
 </p>

 </details>

@@ -175,8 +175,8 @@ Benchmarking was performed on generation via the local API using text lengths up
 - H.G. Wells - The Time Machine (full text)

 <p align="center">
-  <img src="examples/benchmarks/processing_time.png" width="45%" alt="Processing Time" style="border: 2px solid #333; padding: 10px; margin-right: 1%;">
-  <img src="examples/benchmarks/realtime_factor.png" width="45%" alt="Realtime Factor" style="border: 2px solid #333; padding: 10px;">
+  <img src="assets/gpu_processing_time.png" width="45%" alt="Processing Time" style="border: 2px solid #333; padding: 10px; margin-right: 1%;">
+  <img src="assets/gpu_realtime_factor.png" width="45%" alt="Realtime Factor" style="border: 2px solid #333; padding: 10px;">
 </p>

 Key Performance Metrics:
@@ -18,6 +18,8 @@ class Settings(BaseSettings):
     onnx_model_path: str = "kokoro-v0_19.onnx"
     voices_dir: str = "voices"
     sample_rate: int = 24000
+    max_chunk_size: int = 300  # Maximum size of text chunks for processing
+    gap_trim_ms: int = 250  # Amount to trim from streaming chunk ends in milliseconds

     # ONNX Optimization Settings
     onnx_num_threads: int = 4  # Number of threads for intra-op parallelism
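Since `Settings` subclasses pydantic's `BaseSettings`, the two new knobs should be overridable through environment variables without code changes; a hedged sketch (env var names assume pydantic's default case-insensitive field matching, and the import path is assumed from the test imports elsewhere in this diff):

```python
# Hypothetical override via environment; the values are illustrative.
import os

os.environ["MAX_CHUNK_SIZE"] = "200"  # smaller chunks -> lower first-chunk latency
os.environ["GAP_TRIM_MS"] = "100"     # trim less audio from chunk boundaries

# Settings are read at import time, so set env vars first.
from api.src.core.config import settings  # assumed import path

assert settings.max_chunk_size == 200
assert settings.gap_trim_ms == 100
```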
api/src/core/don_quixote.txt (new file, 9 lines)

@@ -0,0 +1,9 @@
+In a village of La Mancha, the name of which I have no desire to call
+to mind, there lived not long since one of those gentlemen that keep a
+lance in the lance-rack, an old buckler, a lean hack, and a greyhound
+for coursing. An olla of rather more beef than mutton, a salad on most
+nights, scraps on Saturdays, lentils on Fridays, and a pigeon or so
+extra on Sundays, made away with three-quarters of his income. The rest
+of it went in a doublet of fine cloth and velvet breeches and shoes to
+match for holidays, while on week-days he made a brave figure in his
+best homespun.
@@ -22,10 +22,11 @@ async def lifespan(app: FastAPI):
     logger.info("Loading TTS model and voice packs...")

     # Initialize the main model with warm-up
-    voicepack_count = TTSModel.setup()
-    boundary = "░" * 30
+    voicepack_count = await TTSModel.setup()
+    # boundary = "█████╗"*9
+    boundary = "░" * 24
     startup_msg =f"""

{boundary}

    ╔═╗┌─┐┌─┐┌┬┐
@@ -37,8 +38,9 @@ async def lifespan(app: FastAPI):

{boundary}
"""
-    startup_msg += f"\nModel loaded and warmed up on {TTSModel.get_device()}"
-    startup_msg += f"\n{voicepack_count} voice packs loaded successfully\n"
+    # TODO: Improve CPU warmup, threads, memory, etc
+    startup_msg += f"\nModel warmed up on {TTSModel.get_device()}"
+    startup_msg += f"\n{voicepack_count} voice packs loaded\n"
    startup_msg += f"\n{boundary}\n"
    logger.info(startup_msg)
@@ -83,8 +83,8 @@ async def create_speech(
             audio,
             24000,
             request.response_format,
-            is_first_chunk=True
-        )
+            is_first_chunk=True,
+            stream=False)

         return Response(
             content=content,
@@ -4,22 +4,30 @@ from io import BytesIO

 import numpy as np
 import soundfile as sf
+import scipy.io.wavfile as wavfile
 from loguru import logger

+from ..core.config import settings
+

 class AudioNormalizer:
     """Handles audio normalization state for a single stream"""

     def __init__(self):
         self.int16_max = np.iinfo(np.int16).max
+        self.chunk_trim_ms = settings.gap_trim_ms
+        self.sample_rate = 24000  # Sample rate of the audio
+        self.samples_to_trim = int(self.chunk_trim_ms * self.sample_rate / 1000)

-    def normalize(self, audio_data: np.ndarray) -> np.ndarray:
-        """Normalize audio data to int16 range"""
+    def normalize(self, audio_data: np.ndarray, is_last_chunk: bool = False) -> np.ndarray:
+        """Normalize audio data to int16 range and trim chunk boundaries"""
         # Convert to float32 if not already
         audio_float = audio_data.astype(np.float32)

         # Normalize to [-1, 1] range first
         if np.max(np.abs(audio_float)) > 0:
             audio_float = audio_float / np.max(np.abs(audio_float))

+        # Trim end of non-final chunks to reduce gaps
+        if not is_last_chunk and len(audio_float) > self.samples_to_trim:
+            audio_float = audio_float[:-self.samples_to_trim]
+
         # Scale to int16 range
         return (audio_float * self.int16_max).astype(np.int16)
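For intuition on the new trimming: every non-final chunk loses `gap_trim_ms` worth of samples from its tail. A quick check of the arithmetic, using the config defaults from this diff:

```python
import numpy as np

sample_rate = 24000          # fixed sample rate used by AudioNormalizer
gap_trim_ms = 250            # settings.gap_trim_ms default
samples_to_trim = int(gap_trim_ms * sample_rate / 1000)  # -> 6000 samples

chunk = np.zeros(24000, dtype=np.float32)  # one second of audio
trimmed = chunk[:-samples_to_trim]         # 0.75 s remain for a non-final chunk
assert len(trimmed) == 18000
```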
@@ -27,13 +35,30 @@ class AudioNormalizer:
 class AudioService:
     """Service for audio format conversions"""

+    # Default audio format settings balanced for speed and compression
+    DEFAULT_SETTINGS = {
+        "mp3": {
+            "bitrate_mode": "CONSTANT",  # Faster than variable bitrate
+            "compression_level": 0.0,  # Balanced compression
+        },
+        "opus": {
+            "compression_level": 0.0,  # Good balance for speech
+        },
+        "flac": {
+            "compression_level": 0.0,  # Light compression, still fast
+        }
+    }
+
     @staticmethod
     def convert_audio(
         audio_data: np.ndarray,
         sample_rate: int,
         output_format: str,
         is_first_chunk: bool = True,
-        normalizer: AudioNormalizer = None
+        is_last_chunk: bool = False,
+        normalizer: AudioNormalizer = None,
+        format_settings: dict = None,
+        stream: bool = True
     ) -> bytes:
         """Convert audio data to specified format
@@ -42,6 +67,19 @@ class AudioService:
             sample_rate: Sample rate of the audio
             output_format: Target format (wav, mp3, opus, flac, pcm)
             is_first_chunk: Whether this is the first chunk of a stream
             normalizer: Optional AudioNormalizer instance for consistent normalization across chunks
+            format_settings: Optional dict of format-specific settings to override defaults
+                Example: {
+                    "mp3": {
+                        "bitrate_mode": "VARIABLE",
+                        "compression_level": 0.8
+                    }
+                }
+                Default settings balance speed and compression:
+                optimized for localhost @ 0.0
+                - MP3: constant bitrate, no compression (0.0)
+                - OPUS: no compression (0.0)
+                - FLAC: no compression (0.0)

         Returns:
             Bytes of the converted audio
@@ -50,31 +88,48 @@ class AudioService:

         try:
             # Always normalize audio to ensure proper amplitude scaling
-            if normalizer is None:
-                normalizer = AudioNormalizer()
-            normalized_audio = normalizer.normalize(audio_data)
+            if stream:
+                if normalizer is None:
+                    normalizer = AudioNormalizer()
+                normalized_audio = normalizer.normalize(audio_data, is_last_chunk=is_last_chunk)
+            else:
+                normalized_audio = audio_data

             if output_format == "pcm":
                 logger.info("Writing PCM data...")
                 # Raw 16-bit PCM samples, no header
                 buffer.write(normalized_audio.tobytes())
             elif output_format == "wav":
                 logger.info("Writing to WAV format...")
-                # Always include WAV header for WAV format
-                sf.write(buffer, normalized_audio, sample_rate, format="WAV", subtype='PCM_16')
+                if stream:
+                    # Use soundfile for streaming to ensure proper headers
+                    sf.write(buffer, normalized_audio, sample_rate, format="WAV", subtype='PCM_16')
+                else:
+                    # Trying scipy.io.wavfile for non-streaming WAV generation
+                    # seems faster than soundfile
+                    # avoids overhead from header generation and PCM encoding
+                    wavfile.write(buffer, sample_rate, normalized_audio)
             elif output_format == "mp3":
                 logger.info("Converting to MP3 format...")
-                # Use lower bitrate for streaming
-                sf.write(buffer, normalized_audio, sample_rate, format="MP3")
+                # Use format settings or defaults
+                settings = format_settings.get("mp3", {}) if format_settings else {}
+                settings = {**AudioService.DEFAULT_SETTINGS["mp3"], **settings}
+                sf.write(
+                    buffer, normalized_audio,
+                    sample_rate, format="MP3",
+                    **settings
+                )
+
             elif output_format == "opus":
                 logger.info("Converting to Opus format...")
-                # Use lower bitrate and smaller frame size for streaming
-                sf.write(buffer, normalized_audio, sample_rate, format="OGG", subtype="OPUS")
+                settings = format_settings.get("opus", {}) if format_settings else {}
+                settings = {**AudioService.DEFAULT_SETTINGS["opus"], **settings}
+                sf.write(buffer, normalized_audio, sample_rate, format="OGG",
+                        subtype="OPUS", **settings)
+
             elif output_format == "flac":
                 logger.info("Converting to FLAC format...")
-                # Use smaller block size for streaming
                 if is_first_chunk:
                     logger.info("Starting FLAC stream...")
+                settings = format_settings.get("flac", {}) if format_settings else {}
+                settings = {**AudioService.DEFAULT_SETTINGS["flac"], **settings}
                 sf.write(buffer, normalized_audio, sample_rate, format="FLAC",
-                        subtype='PCM_16')
+                        subtype='PCM_16', **settings)
             else:
                 if output_format == "aac":
                     raise ValueError(
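A hedged usage sketch of the reworked `convert_audio` (argument names taken from this diff; the MP3 keys are passed straight through to soundfile, which assumes a libsndfile build with MP3 support):

```python
import numpy as np
from api.src.services.audio import AudioService  # assumed module path

audio = np.zeros(24000, dtype=np.int16)  # 1 s of silence at 24 kHz

# Non-streaming WAV: skips the normalizer and uses scipy's faster writer
wav_bytes = AudioService.convert_audio(audio, 24000, "wav", stream=False)

# Streaming MP3 with a per-call override merged over DEFAULT_SETTINGS
mp3_bytes = AudioService.convert_audio(
    audio, 24000, "mp3",
    format_settings={"mp3": {"bitrate_mode": "VARIABLE", "compression_level": 0.8}},
)
```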
api/src/services/text_processing/chunker.py (new file, 52 lines)

@@ -0,0 +1,52 @@
+"""Text chunking service"""
+
+import re
+
+from ...core.config import settings
+
+
+def split_text(text: str, max_chunk=None):
+    """Split text into chunks on natural pause points
+
+    Args:
+        text: Text to split into chunks
+        max_chunk: Maximum chunk size (defaults to settings.max_chunk_size)
+    """
+    if max_chunk is None:
+        max_chunk = settings.max_chunk_size
+
+    if not isinstance(text, str):
+        text = str(text) if text is not None else ""
+
+    text = text.strip()
+    if not text:
+        return
+
+    # First split into sentences
+    sentences = re.split(r"(?<=[.!?])\s+", text)
+
+    for sentence in sentences:
+        sentence = sentence.strip()
+        if not sentence:
+            continue
+
+        # For medium-length sentences, split on punctuation
+        if len(sentence) > max_chunk:  # Lower threshold for more consistent sizes
+            # First try splitting on semicolons and colons
+            parts = re.split(r"(?<=[;:])\s+", sentence)
+
+            for part in parts:
+                part = part.strip()
+                if not part:
+                    continue
+
+                # If part is still long, split on commas
+                if len(part) > max_chunk:
+                    subparts = re.split(r"(?<=,)\s+", part)
+                    for subpart in subparts:
+                        subpart = subpart.strip()
+                        if subpart:
+                            yield subpart
+                else:
+                    yield part
+        else:
+            yield sentence
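Behavior of the new generator, mirrored from the tests added later in this diff (`max_chunk` forced low so the comma fallback kicks in):

```python
from api.src.services.text_processing import chunker

text = "First part, second part, third part."
print(list(chunker.split_text(text, max_chunk=15)))
# ['First part,', 'second part,', 'third part.']
```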
@@ -15,7 +15,7 @@ class TTSBaseModel(ABC):
     VOICES_DIR = os.path.join(os.path.dirname(os.path.dirname(__file__)), "voices")

     @classmethod
-    def setup(cls):
+    async def setup(cls):
        """Initialize model and setup voices"""
        with cls._lock:
            # Set device
@@ -59,19 +59,23 @@ class TTSBaseModel(ABC):
             except Exception as e:
                 logger.error(f"Error copying voice {voice_name}: {str(e)}")

-        # Warm up with default voice
+        # Load warmup text
         try:
-            dummy_text = "Hello"
-            voice_path = os.path.join(cls.VOICES_DIR, "af.pt")
-            dummy_voicepack = torch.load(voice_path, map_location=cls._device, weights_only=True)
-
-            # Process text and generate audio
-            phonemes, tokens = cls.process_text(dummy_text, "a")
-            cls.generate_from_tokens(tokens, dummy_voicepack, 1.0)
-
-            logger.info("Model warm-up complete")
+            with open(os.path.join(os.path.dirname(os.path.dirname(__file__)), "core", "don_quixote.txt")) as f:
+                warmup_text = f.read()
         except Exception as e:
-            logger.warning(f"Model warm-up failed: {e}")
+            logger.warning(f"Failed to load warmup text: {e}")
+            warmup_text = "This is a warmup text that will be split into chunks for processing."
+
+        # Use warmup service
+        from .warmup import WarmupService
+        warmup = WarmupService()
+
+        # Load and warm up voices
+        loaded_voices = warmup.load_voices()
+        await warmup.warmup_voices(warmup_text, loaded_voices)
+
+        logger.info("Model warm-up complete")

         # Count voices in directory
         voice_count = len([f for f in os.listdir(cls.VOICES_DIR) if f.endswith(".pt")])
@@ -1,6 +1,7 @@
 import os
 import numpy as np
 import torch
+import time
 from loguru import logger
 from models import build_model
 from .text_processing import phonemize, tokenize
@@ -8,42 +9,97 @@ from .text_processing import phonemize, tokenize
 from .tts_base import TTSBaseModel
 from ..core.config import settings

+# @torch.no_grad()
+# def forward(model, tokens, ref_s, speed):
+#     """Forward pass through the model"""
+#     device = ref_s.device
+#     tokens = torch.LongTensor([[0, *tokens, 0]]).to(device)
+#     input_lengths = torch.LongTensor([tokens.shape[-1]]).to(device)
+#     text_mask = length_to_mask(input_lengths).to(device)
+#     bert_dur = model.bert(tokens, attention_mask=(~text_mask).int())
+#     d_en = model.bert_encoder(bert_dur).transpose(-1, -2)
+#     s = ref_s[:, 128:]
+#     d = model.predictor.text_encoder(d_en, s, input_lengths, text_mask)
+#     x, _ = model.predictor.lstm(d)
+#     duration = model.predictor.duration_proj(x)
+#     duration = torch.sigmoid(duration).sum(axis=-1) / speed
+#     pred_dur = torch.round(duration).clamp(min=1).long()
+#     pred_aln_trg = torch.zeros(input_lengths, pred_dur.sum().item())
+#     c_frame = 0
+#     for i in range(pred_aln_trg.size(0)):
+#         pred_aln_trg[i, c_frame : c_frame + pred_dur[0, i].item()] = 1
+#         c_frame += pred_dur[0, i].item()
+#     en = d.transpose(-1, -2) @ pred_aln_trg.unsqueeze(0).to(device)
+#     F0_pred, N_pred = model.predictor.F0Ntrain(en, s)
+#     t_en = model.text_encoder(tokens, input_lengths, text_mask)
+#     asr = t_en @ pred_aln_trg.unsqueeze(0).to(device)
+#     return model.decoder(asr, F0_pred, N_pred, ref_s[:, :128]).squeeze().cpu().numpy()
+
 @torch.no_grad()
 def forward(model, tokens, ref_s, speed):
-    """Forward pass through the model"""
+    """Forward pass through the model with light optimizations that preserve output quality"""
     device = ref_s.device
+
+    # Keep original token handling but optimize device placement
     tokens = torch.LongTensor([[0, *tokens, 0]]).to(device)
     input_lengths = torch.LongTensor([tokens.shape[-1]]).to(device)
     text_mask = length_to_mask(input_lengths).to(device)
+
+    # BERT and encoder pass
     bert_dur = model.bert(tokens, attention_mask=(~text_mask).int())
     d_en = model.bert_encoder(bert_dur).transpose(-1, -2)
-    s = ref_s[:, 128:]
-    d = model.predictor.text_encoder(d_en, s, input_lengths, text_mask)
+
+    # Split reference signal once for efficiency
+    s_content = ref_s[:, 128:]
+    s_ref = ref_s[:, :128]
+
+    # Predictor forward pass
+    d = model.predictor.text_encoder(d_en, s_content, input_lengths, text_mask)
     x, _ = model.predictor.lstm(d)
+
+    # Duration prediction - keeping original logic
     duration = model.predictor.duration_proj(x)
     duration = torch.sigmoid(duration).sum(axis=-1) / speed
     pred_dur = torch.round(duration).clamp(min=1).long()
-    pred_aln_trg = torch.zeros(input_lengths, pred_dur.sum().item())
+
+    # Alignment matrix construction - keeping original approach for quality
+    pred_aln_trg = torch.zeros(input_lengths, pred_dur.sum().item(), device=device)
     c_frame = 0
     for i in range(pred_aln_trg.size(0)):
-        pred_aln_trg[i, c_frame : c_frame + pred_dur[0, i].item()] = 1
+        pred_aln_trg[i, c_frame:c_frame + pred_dur[0, i].item()] = 1
         c_frame += pred_dur[0, i].item()
-    en = d.transpose(-1, -2) @ pred_aln_trg.unsqueeze(0).to(device)
-    F0_pred, N_pred = model.predictor.F0Ntrain(en, s)
+
+    # Matrix multiplications - reuse unsqueezed tensor
+    pred_aln_trg = pred_aln_trg.unsqueeze(0)  # Do unsqueeze once
+    en = d.transpose(-1, -2) @ pred_aln_trg
+    F0_pred, N_pred = model.predictor.F0Ntrain(en, s_content)
+
+    # Text encoding and final decoding
     t_en = model.text_encoder(tokens, input_lengths, text_mask)
-    asr = t_en @ pred_aln_trg.unsqueeze(0).to(device)
-    return model.decoder(asr, F0_pred, N_pred, ref_s[:, :128]).squeeze().cpu().numpy()
+    asr = t_en @ pred_aln_trg
+
+    return model.decoder(asr, F0_pred, N_pred, s_ref).squeeze().cpu().numpy()

+# def length_to_mask(lengths):
+#     """Create attention mask from lengths"""
+#     mask = (
+#         torch.arange(lengths.max())
+#         .unsqueeze(0)
+#         .expand(lengths.shape[0], -1)
+#         .type_as(lengths)
+#     )
+#     mask = torch.gt(mask + 1, lengths.unsqueeze(1))
+#     return mask
+
 def length_to_mask(lengths):
-    """Create attention mask from lengths"""
-    mask = (
-        torch.arange(lengths.max())
-        .unsqueeze(0)
-        .expand(lengths.shape[0], -1)
-        .type_as(lengths)
-    )
-    mask = torch.gt(mask + 1, lengths.unsqueeze(1))
-    return mask
+    """Create attention mask from lengths - possibly optimized version"""
+    max_len = lengths.max()
+    # Create mask directly on the same device as lengths
+    mask = torch.arange(max_len, device=lengths.device)[None, :].expand(lengths.shape[0], -1)
+    # Avoid type_as by using the correct dtype from the start
+    if lengths.dtype != mask.dtype:
+        mask = mask.to(dtype=lengths.dtype)
+    # Fuse operations using broadcasting
+    return mask + 1 > lengths[:, None]

 class TTSGPUModel(TTSBaseModel):
     _instance = None
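A small standalone sketch to sanity-check that the rewritten `length_to_mask` is equivalent to the commented-out chained version (both comparisons return boolean masks):

```python
import torch

def length_to_mask_old(lengths):
    # original chained implementation, kept above as comments
    mask = (torch.arange(lengths.max())
            .unsqueeze(0)
            .expand(lengths.shape[0], -1)
            .type_as(lengths))
    return torch.gt(mask + 1, lengths.unsqueeze(1))

def length_to_mask_new(lengths):
    # optimized version from this diff
    max_len = lengths.max()
    mask = torch.arange(max_len, device=lengths.device)[None, :].expand(lengths.shape[0], -1)
    if lengths.dtype != mask.dtype:
        mask = mask.to(dtype=lengths.dtype)
    return mask + 1 > lengths[:, None]

lengths = torch.tensor([3, 5, 1])
assert torch.equal(length_to_mask_old(lengths), length_to_mask_new(lengths))
```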
@@ -8,7 +8,7 @@ from functools import lru_cache
 import numpy as np
 import torch
 import scipy.io.wavfile as wavfile
-from .text_processing import normalize_text
+from .text_processing import normalize_text, chunker
 from loguru import logger

 from ..core.config import settings
@@ -20,40 +20,6 @@ class TTSService:
     def __init__(self, output_dir: str = None):
         self.output_dir = output_dir

-    def _split_text(self, text: str):
-        """Generate text chunks one at a time, splitting on natural pause points"""
-        if not isinstance(text, str):
-            text = str(text) if text is not None else ""
-
-        # First split into sentences
-        sentences = re.split(r"(?<=[.!?])\s+", text)
-
-        for sentence in sentences:
-            sentence = sentence.strip()
-            if not sentence:
-                continue
-
-            # For longer sentences, split on commas and semicolons
-            if len(sentence) > 300:  # Only split long sentences
-                # Split on pause points while preserving the punctuation
-                chunks = re.split(r"((?<=[,;])\s+)", sentence)
-
-                # Reassemble chunks with their trailing punctuation
-                current_chunk = ""
-                for i, chunk in enumerate(chunks):
-                    if i % 2 == 0:  # Text chunk
-                        current_chunk += chunk
-                    else:  # Punctuation/whitespace chunk
-                        current_chunk += chunk
-                        if current_chunk.strip():
-                            yield current_chunk.strip()
-                            current_chunk = ""
-
-                # Yield any remaining text
-                if current_chunk.strip():
-                    yield current_chunk.strip()
-            else:
-                yield sentence
-
     @staticmethod
     @lru_cache(maxsize=20)  # Cache up to 8 most recently used voices
@@ -96,28 +62,32 @@ class TTSService:
             # Load voice using cached loader
             voicepack = self._load_voice(voice_path)

-            # Generate audio with or without stitching
+            # For non-streaming, preprocess all chunks first
             if stitch_long_output:
-                audio_chunks = []
-                chunk_count = 0
-
-                # Process chunks as they're generated
-                for chunk in self._split_text(text):
+                # Preprocess all chunks to phonemes/tokens
+                chunks_data = []
+                for chunk in chunker.split_text(text):
                     try:
-                        # Process text and generate audio
                         phonemes, tokens = TTSModel.process_text(chunk, voice[0])
+                        chunks_data.append((chunk, tokens))
+                    except Exception as e:
+                        logger.error(f"Failed to process chunk: '{chunk}'. Error: {str(e)}")
+                        continue
+
+                if not chunks_data:
+                    raise ValueError("No chunks were processed successfully")
+
+                # Generate audio for all chunks
+                audio_chunks = []
+                for chunk, tokens in chunks_data:
+                    try:
                         chunk_audio = TTSModel.generate_from_tokens(tokens, voicepack, speed)
-
                         if chunk_audio is not None:
                             audio_chunks.append(chunk_audio)
-                            chunk_count += 1
                         else:
-                            logger.error(f"No audio generated for chunk {chunk_count + 1}")
-
+                            logger.error(f"No audio generated for chunk: '{chunk}'")
                     except Exception as e:
-                        logger.error(
-                            f"Failed to generate audio for chunk {chunk_count + 1}: '{chunk}'. Error: {str(e)}"
-                        )
+                        logger.error(f"Failed to generate audio for chunk: '{chunk}'. Error: {str(e)}")
                         continue

                 if not audio_chunks:
@@ -138,53 +108,93 @@ class TTSService:
             raise

     async def generate_audio_stream(
-        self, text: str, voice: str, speed: float, output_format: str = "wav"
+        self, text: str, voice: str, speed: float, output_format: str = "wav", silent=False
     ):
         """Generate and yield audio chunks as they're generated for real-time streaming"""
         try:
+            stream_start = time.time()
             # Create normalizer for consistent audio levels
             stream_normalizer = AudioNormalizer()

             # Input validation and preprocessing
             if not text:
                 raise ValueError("Text is empty")
+            preprocess_start = time.time()
             normalized = normalize_text(text)
             if not normalized:
                 raise ValueError("Text is empty after preprocessing")
             text = str(normalized)
+            logger.debug(f"Text preprocessing took: {(time.time() - preprocess_start)*1000:.1f}ms")

             # Voice validation and loading
+            voice_start = time.time()
             voice_path = self._get_voice_path(voice)
             if not voice_path:
                 raise ValueError(f"Voice not found: {voice}")
             voicepack = self._load_voice(voice_path)
+            logger.debug(f"Voice loading took: {(time.time() - voice_start)*1000:.1f}ms")

             # Process chunks as they're generated
             is_first = True
-            for chunk in self._split_text(text):
+            chunks_processed = 0
+            # last_chunk_end = time.time()
+
+            # Process chunks as they come from generator
+            chunk_gen = chunker.split_text(text)
+            current_chunk = next(chunk_gen, None)
+
+            while current_chunk is not None:
+                next_chunk = next(chunk_gen, None)  # Peek at next chunk
+                # chunk_start = time.time()
+                chunks_processed += 1
                 try:
-                    # Process text and generate audio
-                    phonemes, tokens = TTSModel.process_text(chunk, voice[0])
+                    # text_process_start = time.time()
+                    phonemes, tokens = TTSModel.process_text(current_chunk, voice[0])
+                    # text_process_time = time.time() - text_process_start
+
+                    # audio_gen_start = time.time()
                     chunk_audio = TTSModel.generate_from_tokens(tokens, voicepack, speed)
+                    # audio_gen_time = time.time() - audio_gen_start

                     if chunk_audio is not None:
                         # Convert chunk with proper header handling
                         convert_start = time.time()
                         chunk_bytes = AudioService.convert_audio(
                             chunk_audio,
                             24000,
                             output_format,
                             is_first_chunk=is_first,
-                            normalizer=stream_normalizer
+                            normalizer=stream_normalizer,
+                            is_last_chunk=(next_chunk is None)  # Last if no next chunk
                         )
+                        # convert_time = time.time() - convert_start
+
+                        # Calculate gap from last chunk
+                        # gap_time = chunk_start - last_chunk_end
+
+                        # Log timing details if not silent
+                        # if not silent:
+                        #     logger.debug(
+                        #         f"\nChunk {chunks_processed} timing:"
+                        #         f"\n  Gap from last chunk: {gap_time*1000:.1f}ms"
+                        #         f"\n  Text processing: {text_process_time*1000:.1f}ms"
+                        #         f"\n  Audio generation: {audio_gen_time*1000:.1f}ms"
+                        #         f"\n  Audio conversion: {convert_time*1000:.1f}ms"
+                        #         f"\n  Total chunk time: {(time.time() - chunk_start)*1000:.1f}ms"
+                        #     )

                         yield chunk_bytes
                         is_first = False
+                        # last_chunk_end = time.time()
                     else:
-                        logger.error(f"No audio generated for chunk: '{chunk}'")
-
+                        logger.error(f"No audio generated for chunk: '{current_chunk}'")
                 except Exception as e:
-                    logger.error(f"Failed to generate audio for chunk: '{chunk}'. Error: {str(e)}")
-                    continue
+                    logger.error(f"Failed to generate audio for chunk: '{current_chunk}'. Error: {str(e)}")
+
+                current_chunk = next_chunk  # Move to next chunk

         except Exception as e:
             logger.error(f"Error in audio generation stream: {str(e)}")
             raise
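The peek-ahead idiom above (fetch the next chunk before emitting the current one, so the final chunk can be flagged `is_last_chunk`) is general; a minimal standalone sketch:

```python
def with_last_flag(gen):
    """Yield (item, is_last) pairs by peeking one item ahead.

    Assumes the generator never yields None, which holds for text chunks.
    """
    current = next(gen, None)
    while current is not None:
        nxt = next(gen, None)       # peek at the next item
        yield current, nxt is None  # is_last when nothing follows
        current = nxt

for chunk, is_last in with_last_flag(iter(["First.", "Second.", "Third."])):
    print(chunk, "(last)" if is_last else "")
```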
api/src/services/warmup.py (new file, 52 lines)

@@ -0,0 +1,52 @@
+import os
+from typing import List, Tuple
+import torch
+from loguru import logger
+
+from .tts_service import TTSService
+from .tts_model import TTSModel
+
+
+class WarmupService:
+    """Service for warming up TTS models and voice caches"""
+
+    def __init__(self):
+        self.tts_service = TTSService()
+
+    def load_voices(self) -> List[Tuple[str, torch.Tensor]]:
+        """Load and cache voices up to LRU limit"""
+        # Get all voices sorted by filename length (shorter names first, usually base voices)
+        voice_files = sorted(
+            [f for f in os.listdir(TTSModel.VOICES_DIR) if f.endswith(".pt")],
+            key=len
+        )
+
+        # Load up to LRU cache limit (20)
+        loaded_voices = []
+        for voice_file in voice_files[:20]:
+            try:
+                voice_path = os.path.join(TTSModel.VOICES_DIR, voice_file)
+                voicepack = torch.load(voice_path, map_location=TTSModel.get_device(), weights_only=True)
+                loaded_voices.append((voice_file[:-3], voicepack))  # Store name and tensor
+                # logger.info(f"Loaded voice {voice_file[:-3]} into cache")
+            except Exception as e:
+                logger.error(f"Failed to load voice {voice_file}: {e}")
+        logger.info(f"Pre-loaded {len(loaded_voices)} voices into cache")
+        return loaded_voices
+
+    async def warmup_voices(self, warmup_text: str, loaded_voices: List[Tuple[str, torch.Tensor]]):
+        """Warm up voice inference and streaming"""
+        n_warmups = 1
+        for voice_name, _ in loaded_voices[:n_warmups]:
+            try:
+                logger.info(f"Running warmup inference on voice {voice_name}")
+                async for _ in self.tts_service.generate_audio_stream(
+                    warmup_text,
+                    voice_name,
+                    1.0,
+                    "pcm"
+                ):
+                    pass  # Process all chunks to properly warm up
+                logger.info(f"Completed warmup for voice {voice_name}")
+            except Exception as e:
+                logger.warning(f"Warmup failed for voice {voice_name}: {e}")
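How `setup()` drives this service (per the tts_base hunk above), condensed into a hedged standalone sketch with an asyncio entry point; it assumes the model and voices directory are already initialized:

```python
import asyncio
from api.src.services.warmup import WarmupService  # path introduced in this diff

async def main():
    warmup = WarmupService()
    loaded = warmup.load_voices()  # pre-populates up to 20 voices (the LRU limit)
    await warmup.warmup_voices("Warmup text to stream through the model.", loaded)

asyncio.run(main())
```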
api/tests/test_chunker.py (new file, 35 lines)

@@ -0,0 +1,35 @@
+"""Tests for text chunking service"""
+
+import pytest
+from api.src.services.text_processing import chunker
+
+
+def test_split_text():
+    """Test text splitting into sentences"""
+    text = "First sentence. Second sentence! Third sentence?"
+    sentences = list(chunker.split_text(text))
+    assert len(sentences) == 3
+    assert sentences[0] == "First sentence."
+    assert sentences[1] == "Second sentence!"
+    assert sentences[2] == "Third sentence?"
+
+
+def test_split_text_empty():
+    """Test splitting empty text"""
+    assert list(chunker.split_text("")) == []
+
+
+def test_split_text_single_sentence():
+    """Test splitting single sentence"""
+    text = "Just one sentence."
+    assert list(chunker.split_text(text)) == ["Just one sentence."]
+
+
+def test_split_text_with_custom_chunk_size():
+    """Test splitting with custom max chunk size"""
+    text = "First part, second part, third part."
+    chunks = list(chunker.split_text(text, max_chunk=15))
+    assert len(chunks) == 3
+    assert chunks[0] == "First part,"
+    assert chunks[1] == "second part,"
+    assert chunks[2] == "third part."
@@ -1,7 +1,8 @@
-from unittest.mock import Mock
+from unittest.mock import Mock, AsyncMock

 import pytest
+import pytest_asyncio
 import asyncio
 from fastapi.testclient import TestClient
 from httpx import AsyncClient

@@ -22,6 +23,12 @@ async def async_client():
 def mock_tts_service(monkeypatch):
     mock_service = Mock()
     mock_service._generate_audio.return_value = (bytes([0, 1, 2, 3]), 1.0)
+
+    # Create proper async generator mock
+    async def mock_stream(*args, **kwargs):
+        for chunk in [b"chunk1", b"chunk2"]:
+            yield chunk
+    mock_service.generate_audio_stream = mock_stream
     mock_service.list_voices.return_value = [
         "af",
         "bm_lewis",

@@ -65,6 +72,7 @@ def test_openai_speech_endpoint(mock_tts_service, mock_audio_service):
         "voice": "bm_lewis",
         "response_format": "wav",
         "speed": 1.0,
+        "stream": False  # Explicitly disable streaming
     }
     response = client.post("/v1/audio/speech", json=test_request)
     assert response.status_code == 200

@@ -84,6 +92,7 @@ def test_openai_speech_invalid_voice(mock_tts_service):
         "voice": "invalid_voice",
         "response_format": "wav",
         "speed": 1.0,
+        "stream": False  # Explicitly disable streaming
     }
     response = client.post("/v1/audio/speech", json=test_request)
     assert response.status_code == 400  # Bad request

@@ -98,6 +107,7 @@ def test_openai_speech_invalid_speed(mock_tts_service):
         "voice": "af",
         "response_format": "wav",
         "speed": -1.0,  # Invalid speed
+        "stream": False  # Explicitly disable streaming
     }
     response = client.post("/v1/audio/speech", json=test_request)
     assert response.status_code == 422  # Validation error

@@ -112,6 +122,7 @@ def test_openai_speech_generation_error(mock_tts_service):
         "voice": "af",
         "response_format": "wav",
         "speed": 1.0,
+        "stream": False  # Explicitly disable streaming
     }
     response = client.post("/v1/audio/speech", json=test_request)
     assert response.status_code == 500

@@ -171,13 +182,14 @@ async def test_openai_speech_pcm_streaming(mock_tts_service, async_client):
         "input": "Hello world",
         "voice": "af",
         "response_format": "pcm",
         "stream": True
     }

-    # Mock streaming response
-    async def mock_stream():
-        yield b"chunk1"
-        yield b"chunk2"
-    mock_tts_service.generate_audio_stream.return_value = mock_stream()
+    # Create streaming mock for this test
+    async def mock_stream(*args, **kwargs):
+        for chunk in [b"chunk1", b"chunk2"]:
+            yield chunk
+    mock_tts_service.generate_audio_stream = mock_stream

     # Add streaming header
     headers = {"x-raw-response": "stream"}

@@ -198,13 +210,14 @@ async def test_openai_speech_streaming_mp3(mock_tts_service, async_client):
         "input": "Hello world",
         "voice": "af",
         "response_format": "mp3",
         "stream": True
     }

-    # Mock streaming response
-    async def mock_stream():
-        yield b"mp3header"
-        yield b"mp3data"
-    mock_tts_service.generate_audio_stream.return_value = mock_stream()
+    # Create streaming mock for this test
+    async def mock_stream(*args, **kwargs):
+        for chunk in [b"mp3header", b"mp3data"]:
+            yield chunk
+    mock_tts_service.generate_audio_stream = mock_stream

     # Add streaming header
     headers = {"x-raw-response": "stream"}

@@ -227,14 +240,14 @@ async def test_openai_speech_streaming_generator(mock_tts_service, async_client)
         "input": "Hello world",
         "voice": "af",
         "response_format": "pcm",
         "stream": True
     }

-    # Mock streaming response
-    async def mock_stream():
-        yield b"chunk1"
-        yield b"chunk2"
-
-    mock_tts_service.generate_audio_stream.return_value = mock_stream()
+    # Create streaming mock for this test
+    async def mock_stream(*args, **kwargs):
+        for chunk in [b"chunk1", b"chunk2"]:
+            yield chunk
+    mock_tts_service.generate_audio_stream = mock_stream

     # Add streaming header
     headers = {"x-raw-response": "stream"}
@@ -28,29 +28,34 @@ async def test_lifespan_successful_warmup(mock_logger, mock_tts_model):
     """Test successful model warmup in lifespan"""
     # Mock file system for voice counting
     mock_tts_model.VOICES_DIR = "/mock/voices"
-    with patch("os.listdir", return_value=["voice1.pt", "voice2.pt", "voice3.pt"]):
-        mock_tts_model.setup.return_value = 3  # 3 voice files
-        mock_tts_model.get_device.return_value = "cuda"
-
-        # Create an async generator from the lifespan context manager
-        async_gen = lifespan(MagicMock())
-        # Start the context manager
-        await async_gen.__aenter__()
-
-        # Verify the expected logging sequence
-        mock_logger.info.assert_any_call("Loading TTS model and voice packs...")
-
-        # Check for the startup message containing the required info
-        startup_calls = [call[0][0] for call in mock_logger.info.call_args_list]
-        startup_msg = next(msg for msg in startup_calls if "Model loaded and warmed up on" in msg)
-        assert "Model loaded and warmed up on cuda" in startup_msg
-        assert "3 voice packs loaded successfully" in startup_msg
-
-        # Clean up
-        await async_gen.__aexit__(None, None, None)
-        # Verify model setup was called
-        mock_tts_model.setup.assert_called_once()
+    # Create async mock
+    async def async_setup():
+        return 3
+    mock_tts_model.setup = MagicMock()
+    mock_tts_model.setup.side_effect = async_setup
+    mock_tts_model.get_device.return_value = "cuda"
+
+    with patch("os.listdir", return_value=["voice1.pt", "voice2.pt", "voice3.pt"]):
+        # Create an async generator from the lifespan context manager
+        async_gen = lifespan(MagicMock())
+        # Start the context manager
+        await async_gen.__aenter__()
+
+        # Verify model setup was called
+        mock_tts_model.setup.assert_called_once()
+        # Verify the expected logging sequence
+        mock_logger.info.assert_any_call("Loading TTS model and voice packs...")
+
+        # Check for the startup message containing the required info
+        startup_calls = [call[0][0] for call in mock_logger.info.call_args_list]
+        startup_msg = next(msg for msg in startup_calls if "Model warmed up on" in msg)
+        assert "Model warmed up on" in startup_msg
+        assert "3 voice packs loaded" in startup_msg

         # Clean up
         await async_gen.__aexit__(None, None, None)


 @pytest.mark.asyncio

@@ -81,39 +86,21 @@ async def test_lifespan_cuda_warmup(mock_tts_model):
     """Test model warmup specifically on CUDA"""
     # Mock file system for voice counting
     mock_tts_model.VOICES_DIR = "/mock/voices"
+
+    # Create async mock
+    async def async_setup():
+        return 2
+    mock_tts_model.setup = MagicMock()
+    mock_tts_model.setup.side_effect = async_setup
+    mock_tts_model.get_device.return_value = "cuda"
+
     with patch("os.listdir", return_value=["voice1.pt", "voice2.pt"]):
-        mock_tts_model.setup.return_value = 2  # 2 voice files
-        mock_tts_model.get_device.return_value = "cuda"
-
         # Create an async generator from the lifespan context manager
         async_gen = lifespan(MagicMock())
         await async_gen.__aenter__()

         # Verify model setup was called
         mock_tts_model.setup.assert_called_once()

         # Clean up
         await async_gen.__aexit__(None, None, None)
-
-
-@pytest.mark.asyncio
-@patch("api.src.main.TTSModel")
-async def test_lifespan_cpu_fallback(mock_tts_model):
-    """Test model warmup falling back to CPU"""
-    # Mock file system for voice counting
-    mock_tts_model.VOICES_DIR = "/mock/voices"
-    with patch(
-        "os.listdir", return_value=["voice1.pt", "voice2.pt", "voice3.pt", "voice4.pt"]
-    ):
-        mock_tts_model.setup.return_value = 4  # 4 voice files
-        mock_tts_model.get_device.return_value = "cpu"
-
-        # Create an async generator from the lifespan context manager
-        async_gen = lifespan(MagicMock())
-        await async_gen.__aenter__()
-
-        # Verify model setup was called
-        mock_tts_model.setup.assert_called_once()
-
-        # Clean up
-        await async_gen.__aexit__(None, None, None)
@@ -16,13 +16,14 @@ def test_get_device_error():
     with pytest.raises(RuntimeError, match="Model not initialized"):
         TTSBaseModel.get_device()

+@pytest.mark.asyncio
 @patch('torch.cuda.is_available')
 @patch('os.path.exists')
 @patch('os.path.join')
 @patch('os.listdir')
 @patch('torch.load')
 @patch('torch.save')
-def test_setup_cuda_available(mock_save, mock_load, mock_listdir, mock_join, mock_exists, mock_cuda_available):
+async def test_setup_cuda_available(mock_save, mock_load, mock_listdir, mock_join, mock_exists, mock_cuda_available):
     """Test setup with CUDA available"""
     TTSBaseModel._device = None
     mock_cuda_available.return_value = True

@@ -36,17 +37,18 @@ def test_setup_cuda_available(mock_save, mock_load, mock_listdir, mock_join, moc
     TTSBaseModel.process_text = MagicMock(return_value=("dummy", [1,2,3]))
     TTSBaseModel.generate_from_tokens = MagicMock(return_value=np.zeros(1000))

-    voice_count = TTSBaseModel.setup()
+    voice_count = await TTSBaseModel.setup()
     assert TTSBaseModel._device == "cuda"
     assert voice_count == 2

+@pytest.mark.asyncio
 @patch('torch.cuda.is_available')
 @patch('os.path.exists')
 @patch('os.path.join')
 @patch('os.listdir')
 @patch('torch.load')
 @patch('torch.save')
-def test_setup_cuda_unavailable(mock_save, mock_load, mock_listdir, mock_join, mock_exists, mock_cuda_available):
+async def test_setup_cuda_unavailable(mock_save, mock_load, mock_listdir, mock_join, mock_exists, mock_cuda_available):
     """Test setup with CUDA unavailable"""
     TTSBaseModel._device = None
     mock_cuda_available.return_value = False

@@ -60,7 +62,7 @@ def test_setup_cuda_unavailable(mock_save, mock_load, mock_listdir, mock_join, m
     TTSBaseModel.process_text = MagicMock(return_value=("dummy", [1,2,3]))
     TTSBaseModel.generate_from_tokens = MagicMock(return_value=np.zeros(1000))

-    voice_count = TTSBaseModel.setup()
+    voice_count = await TTSBaseModel.setup()
     assert TTSBaseModel._device == "cpu"
     assert voice_count == 2
@@ -31,27 +31,6 @@ def sample_audio():
     return np.sin(2 * np.pi * frequency * t).astype(np.float32)


-def test_split_text(tts_service):
-    """Test text splitting into sentences"""
-    text = "First sentence. Second sentence! Third sentence?"
-    sentences = tts_service._split_text(text)
-    assert len(sentences) == 3
-    assert sentences[0] == "First sentence."
-    assert sentences[1] == "Second sentence!"
-    assert sentences[2] == "Third sentence?"
-
-
-def test_split_text_empty(tts_service):
-    """Test splitting empty text"""
-    assert tts_service._split_text("") == []
-
-
-def test_split_text_single_sentence(tts_service):
-    """Test splitting single sentence"""
-    text = "Just one sentence."
-    assert tts_service._split_text(text) == ["Just one sentence."]
-
-
 def test_audio_to_bytes(tts_service, sample_audio):
     """Test converting audio tensor to bytes"""
     audio_bytes = tts_service._audio_to_bytes(sample_audio)

@@ -152,7 +131,7 @@ def test_generate_audio_phonemize_error(
     mock_torch_load.return_value = torch.zeros((10, 24000))
     mock_generate.return_value = (None, None)

-    with pytest.raises(ValueError, match="No audio chunks were generated successfully"):
+    with pytest.raises(ValueError, match="No chunks were processed successfully"):
         tts_service._generate_audio("Test text", "af", 1.0)


@@ -185,7 +164,7 @@ def test_generate_audio_error(
     mock_exists.return_value = True
     mock_torch_load.return_value = torch.zeros((10, 24000))

-    with pytest.raises(ValueError, match="No audio chunks were generated successfully"):
+    with pytest.raises(ValueError, match="No chunks were processed successfully"):
         tts_service._generate_audio("Test text", "af", 1.0)
BIN  assets/format_comparison.png (new file, 774 KiB)
BIN  assets/gpu_first_token_latency_direct.png (new file, 237 KiB)
BIN  assets/gpu_first_token_latency_openai.png (new file, 238 KiB)
BIN  assets/gpu_first_token_timeline_direct.png (new file, 234 KiB)
BIN  assets/gpu_first_token_timeline_openai.png (new file, 241 KiB)
BIN  assets/gpu_processing_time.png (new file, 248 KiB)
BIN  assets/gpu_realtime_factor.png (new file, 237 KiB)
BIN  assets/gpu_total_time_latency_direct.png (new file, 248 KiB)
BIN  assets/gpu_total_time_latency_openai.png (new file, 258 KiB)
BIN  assets/voice_analysis.png (new file, 958 KiB)
@@ -43,6 +43,7 @@ services:
       - ONNX_OPTIMIZATION_LEVEL=all
       - ONNX_MEMORY_PATTERN=true
+      - ONNX_ARENA_EXTEND_STRATEGY=kNextPowerOfTwo

     depends_on:
       model-fetcher:
         condition: service_healthy
@@ -2,7 +2,7 @@ services:
   model-fetcher:
     image: datamachines/git-lfs:latest
     environment:
-      - SKIP_MODEL_FETCH=${SKIP_MODEL_FETCH:-true}
+      - SKIP_MODEL_FETCH=${SKIP_MODEL_FETCH:-false}
     volumes:
       - ./Kokoro-82M:/app/Kokoro-82M
     working_dir: /app/Kokoro-82M

@@ -32,10 +32,10 @@ services:
       start_period: 1s

   kokoro-tts:
-    image: ghcr.io/remsky/kokoro-fastapi:latest
+    # image: ghcr.io/remsky/kokoro-fastapi:latest
     # Uncomment below to build from source instead of using the released image
-    # build:
-    #   context: .
+    build:
+      context: .
     volumes:
       - ./api/src:/app/api/src
       - ./Kokoro-82M:/app/Kokoro-82M

@@ -54,14 +54,14 @@ services:
       model-fetcher:
         condition: service_healthy

-  # # Gradio UI service [Comment out everything below if you don't need it]
-  # gradio-ui:
-  #   build:
-  #     context: ./ui
-  #   ports:
-  #     - "7860:7860"
-  #   volumes:
-  #     - ./ui/data:/app/ui/data
-  #     - ./ui/app.py:/app/app.py  # Mount app.py for hot reload
-  #   environment:
-  #     - GRADIO_WATCH=True  # Enable hot reloading
+  # Gradio UI service [Comment out everything below if you don't need it]
+  gradio-ui:
+    build:
+      context: ./ui
+    ports:
+      - "7860:7860"
+    volumes:
+      - ./ui/data:/app/ui/data
+      - ./ui/app.py:/app/app.py  # Mount app.py for hot reload
+    environment:
+      - GRADIO_WATCH=True  # Enable hot reloading
@@ -1,15 +1,19 @@
 #!/usr/bin/env python3
 import os
-import time
 import json
-import numpy as np
-import requests
-import pandas as pd
-from lib.shared_benchmark_utils import get_text_for_tokens, enc
-from lib.shared_utils import save_json_results
-from lib.shared_plotting import plot_correlation, plot_timeline
+import time

-def measure_first_token(text: str, output_dir: str, tokens: int, run_number: int) -> dict:
+import numpy as np
+import pandas as pd
+import requests
+from lib.shared_utils import save_json_results
+from lib.shared_plotting import plot_timeline, plot_correlation
+from lib.shared_benchmark_utils import enc, get_text_for_tokens
+
+
+def measure_first_token(
+    text: str, output_dir: str, tokens: int, run_number: int
+) -> dict:
     """Measure time to audio via API calls and save the audio output"""
     results = {
         "text_length": len(text),

@@ -18,12 +22,12 @@ def measure_first_token(text: str, output_dir: str, tokens: int, run_number: int
         "time_to_first_chunk": None,
         "error": None,
         "audio_path": None,
-        "audio_length": None  # Length of output audio in seconds
+        "audio_length": None,  # Length of output audio in seconds
     }

     try:
         start_time = time.time()

         # Make request without streaming
         response = requests.post(
             "http://localhost:8880/v1/audio/speech",

@@ -32,58 +36,62 @@ def measure_first_token(text: str, output_dir: str, tokens: int, run_number: int
                 "input": text,
                 "voice": "af",
                 "response_format": "wav",
-                "stream": False
+                "stream": False,
             },
-            timeout=1800
+            timeout=1800,
         )
         response.raise_for_status()

         # Save complete audio
         audio_filename = f"benchmark_tokens{tokens}_run{run_number}.wav"
         audio_path = os.path.join(output_dir, audio_filename)
         results["audio_path"] = audio_path

         content = response.content
-        with open(audio_path, 'wb') as f:
+        with open(audio_path, "wb") as f:
             f.write(content)

         # Calculate audio length using scipy
         import scipy.io.wavfile as wavfile
+
         sample_rate, audio_data = wavfile.read(audio_path)
         results["audio_length"] = len(audio_data) / sample_rate  # Length in seconds
         results["time_to_first_chunk"] = time.time() - start_time

         results["total_time"] = time.time() - start_time
         return results

     except Exception as e:
         results["error"] = str(e)
         return results


 def main():
     # Set up paths
     script_dir = os.path.dirname(os.path.abspath(__file__))
     output_dir = os.path.join(script_dir, "output_audio")
     output_data_dir = os.path.join(script_dir, "output_data")

     # Create output directories
     os.makedirs(output_dir, exist_ok=True)
     os.makedirs(output_data_dir, exist_ok=True)

     # Load sample text
-    with open(os.path.join(script_dir, "the_time_machine_hg_wells.txt"), "r", encoding="utf-8") as f:
+    with open(
+        os.path.join(script_dir, "the_time_machine_hg_wells.txt"), "r", encoding="utf-8"
+    ) as f:
         text = f.read()

     # Test specific token counts
     token_sizes = [10, 25, 50, 100, 200, 500]
     all_results = []

     for tokens in token_sizes:
         print(f"\nTesting {tokens} tokens")
         test_text = get_text_for_tokens(text, tokens)
         actual_tokens = len(enc.encode(test_text))
         print(f"Text preview: {test_text[:50]}...")

         # Run test 3 times for each size to get average
         for i in range(5):
             print(f"Run {i+1}/3...")

@@ -91,67 +99,74 @@ def main():
             result["target_tokens"] = tokens
             result["actual_tokens"] = actual_tokens
             result["run_number"] = i + 1

             print(f"Time to Audio: {result.get('time_to_first_chunk', 'N/A'):.3f}s")
             print(f"Total time: {result.get('total_time', 'N/A'):.3f}s")

             if result["error"]:
                 print(f"Error: {result['error']}")

             all_results.append(result)

     # Calculate averages per token size
     summary = {}
     for tokens in token_sizes:
-        matching_results = [r for r in all_results if r["target_tokens"] == tokens and not r["error"]]
+        matching_results = [
+            r for r in all_results if r["target_tokens"] == tokens and not r["error"]
+        ]
         if matching_results:
-            avg_first_chunk = sum(r["time_to_first_chunk"] for r in matching_results) / len(matching_results)
-            avg_total = sum(r["total_time"] for r in matching_results) / len(matching_results)
-            avg_audio_length = sum(r["audio_length"] for r in matching_results) / len(matching_results)
+            avg_first_chunk = sum(
+                r["time_to_first_chunk"] for r in matching_results
+            ) / len(matching_results)
+            avg_total = sum(r["total_time"] for r in matching_results) / len(
+                matching_results
+            )
+            avg_audio_length = sum(r["audio_length"] for r in matching_results) / len(
+                matching_results
+            )
             summary[tokens] = {
                 "avg_time_to_first_chunk": round(avg_first_chunk, 3),
                 "avg_total_time": round(avg_total, 3),
                 "avg_audio_length": round(avg_audio_length, 3),
-                "num_successful_runs": len(matching_results)
+                "num_successful_runs": len(matching_results),
             }

     # Save results
     results_data = {
         "individual_runs": all_results,
         "summary": summary,
-        "timestamp": time.strftime("%Y-%m-%d %H:%M:%S")
+        "timestamp": time.strftime("%Y-%m-%d %H:%M:%S"),
     }
     save_json_results(
-        results_data,
-        os.path.join(output_data_dir, "first_token_benchmark.json")
+        results_data, os.path.join(output_data_dir, "first_token_benchmark.json")
     )

     # Create plot directory if it doesn't exist
     output_plots_dir = os.path.join(script_dir, "output_plots")
     os.makedirs(output_plots_dir, exist_ok=True)

     # Create DataFrame for plotting
     df = pd.DataFrame(all_results)

     # Create both plots
     plot_correlation(
-        df, "target_tokens", "time_to_first_chunk",
+        df,
+        "target_tokens",
+        "time_to_first_chunk",
         "Time to Audio vs Input Size",
         "Number of Input Tokens",
         "Time to Audio (seconds)",
-        os.path.join(output_plots_dir, "first_token_latency.png")
+        os.path.join(output_plots_dir, "first_token_latency.png"),
     )

-    plot_timeline(
-        df,
-        os.path.join(output_plots_dir, "first_token_timeline.png")
-    )
+    plot_timeline(df, os.path.join(output_plots_dir, "first_token_timeline.png"))

     print("\nResults and plots saved to:")
     print(f"- {os.path.join(output_data_dir, 'first_token_benchmark.json')}")
     print(f"- {os.path.join(output_plots_dir, 'first_token_latency.png')}")
     print(f"- {os.path.join(output_plots_dir, 'first_token_timeline.png')}")


 if __name__ == "__main__":
     main()
(deleted file, 193 lines)

@@ -1,193 +0,0 @@
-#!/usr/bin/env python3
-import os
-import time
-import json
-import numpy as np
-import requests
-import pandas as pd
-from lib.shared_benchmark_utils import get_text_for_tokens, enc
-from lib.shared_utils import save_json_results
-from lib.shared_plotting import plot_correlation, plot_timeline
-
-def measure_first_token(text: str, output_dir: str, tokens: int, run_number: int) -> dict:
-    """Measure time to audio via API calls and save the audio output"""
-    results = {
-        "text_length": len(text),
-        "token_count": len(enc.encode(text)),
-        "total_time": None,
-        "time_to_first_chunk": None,
-        "error": None,
-        "audio_path": None,
-        "audio_length": None  # Length of output audio in seconds
-    }
-
-    try:
-        start_time = time.time()
-
-        # Make request with streaming enabled
-        response = requests.post(
-            "http://localhost:8880/v1/audio/speech",
-            json={
-                "model": "kokoro",
-                "input": text,
-                "voice": "af",
-                "response_format": "pcm",
-                "stream": True
-            },
-            stream=True,
-            timeout=1800
-        )
-        response.raise_for_status()
-
-        # Save complete audio
-        audio_filename = f"benchmark_tokens{tokens}_run{run_number}_stream.wav"
-        audio_path = os.path.join(output_dir, audio_filename)
-        results["audio_path"] = audio_path
-
-        first_chunk_time = None
-        chunks = []
-        for chunk in response.iter_content(chunk_size=1024):
-            if chunk:
-                if first_chunk_time is None:
-                    first_chunk_time = time.time()
-                    results["time_to_first_chunk"] = first_chunk_time - start_time
-                chunks.append(chunk)
-
-        # Concatenate all PCM chunks
-        if not chunks:
-            raise ValueError("No audio chunks received")
-
-        all_audio_data = b''.join(chunks)
-
-        # Write as WAV file
-        import wave
-        with wave.open(audio_path, 'wb') as wav_file:
-            wav_file.setnchannels(1)  # Mono
-            wav_file.setsampwidth(2)  # 2 bytes per sample (16-bit)
-            wav_file.setframerate(24000)  # Known sample rate for Kokoro
-            wav_file.writeframes(all_audio_data)
-
-        # Calculate audio length using scipy
-        import scipy.io.wavfile as wavfile
-        sample_rate, audio_data = wavfile.read(audio_path)
-        results["audio_length"] = len(audio_data) / sample_rate  # Length in seconds
-
-        results["total_time"] = time.time() - start_time
-
-        # Print debug info
-        print(f"Complete audio size: {len(all_audio_data)} bytes")
-        print(f"Number of chunks received: {len(chunks)}")
-        print(f"Audio length: {results['audio_length']:.3f}s")
-
-        return results
-
-    except Exception as e:
-        results["error"] = str(e)
-        return results
-
-def main():
-    # Set up paths with _stream suffix
-    script_dir = os.path.dirname(os.path.abspath(__file__))
-    output_dir = os.path.join(script_dir, "output_audio_stream")
-    output_data_dir = os.path.join(script_dir, "output_data")
-
-    # Create output directories
-    os.makedirs(output_dir, exist_ok=True)
-    os.makedirs(output_data_dir, exist_ok=True)
-
-    # Load sample text
-    with open(os.path.join(script_dir, "the_time_machine_hg_wells.txt"), "r", encoding="utf-8") as f:
-        text = f.read()
-
-    # Test specific token counts
-    token_sizes = [50, 100, 200, 500, 1000, 2000, 5000, 10000]
-    all_results = []
-
-    for tokens in token_sizes:
-        print(f"\nTesting {tokens} tokens (streaming)")
-        test_text = get_text_for_tokens(text, tokens)
-        actual_tokens = len(enc.encode(test_text))
-        print(f"Text preview: {test_text[:50]}...")
-
-        # Run test 3 times for each size to get average
-        for i in range(5):
-            print(f"Run {i+1}/3...")
-            result = measure_first_token(test_text, output_dir, tokens, i + 1)
-            result["target_tokens"] = tokens
-            result["actual_tokens"] = actual_tokens
-            result["run_number"] = i + 1
-
-            print(f"Time to First Audio: {result.get('time_to_first_chunk', 'N/A'):.3f}s")
-            print(f"Time to Save Complete: {result.get('total_time', 'N/A'):.3f}s")
-            print(f"Audio length: {result.get('audio_length', 'N/A'):.3f}s")
-            print(f"Streaming overhead: {(result.get('total_time', 0) - result.get('time_to_first_chunk', 0)):.3f}s")
-
-            if result["error"]:
-                print(f"Error: {result['error']}")
-
-            all_results.append(result)
-
-    # Calculate averages per token size
-    summary = {}
-    for tokens in token_sizes:
-        matching_results = [r for r in all_results if r["target_tokens"] == tokens and not r["error"]]
-        if matching_results:
-            avg_first_chunk = sum(r["time_to_first_chunk"] for r in matching_results) / len(matching_results)
-            avg_total = sum(r["total_time"] for r in matching_results) / len(matching_results)
-            avg_audio_length = sum(r["audio_length"] for r in matching_results) / len(matching_results)
-            summary[tokens] = {
-                "avg_time_to_first_chunk": round(avg_first_chunk, 3),
-                "avg_total_time": round(avg_total, 3),
-                "avg_audio_length": round(avg_audio_length, 3),
-                "num_successful_runs": len(matching_results)
-            }
-
-    # Save results with _stream suffix
-    results_data = {
-        "individual_runs": all_results,
-        "summary": summary,
-        "timestamp": time.strftime("%Y-%m-%d %H:%M:%S")
-    }
-    save_json_results(
-        results_data,
-        os.path.join(output_data_dir, "first_token_benchmark_stream.json")
-    )
-
-    # Create plot directory if it doesn't exist
-    output_plots_dir = os.path.join(script_dir, "output_plots")
-    os.makedirs(output_plots_dir, exist_ok=True)
-
-    # Create DataFrame for plotting
-    df = pd.DataFrame(all_results)
-
-    # Create both plots with _stream suffix
-    # Plot correlation for both metrics
-    plot_correlation(
-        df, "target_tokens", "time_to_first_chunk",
-        "Time to First Audio vs Input Size (Streaming)",
-        "Number of Input Tokens",
-        "Time to First Audio (seconds)",
-        os.path.join(output_plots_dir, "first_token_latency_stream.png")
-    )
-
-    plot_correlation(
-        df, "target_tokens", "total_time",
-        "Total Time vs Input Size (Streaming)",
-        "Number of Input Tokens",
-        "Total Time (seconds)",
-        os.path.join(output_plots_dir, "total_time_latency_stream.png")
-    )
-
-    plot_timeline(
-        df,
-        os.path.join(output_plots_dir, "first_token_timeline_stream.png", suffix="(Streaming)")
-    )
-
-    print("\nResults and plots saved to:")
-    print(f"- {os.path.join(output_data_dir, 'first_token_benchmark_stream.json')}")
-    print(f"- {os.path.join(output_plots_dir, 'first_token_latency_stream.png')}")
-    print(f"- {os.path.join(output_plots_dir, 'total_time_latency_stream.png')}")
-    print(f"- {os.path.join(output_plots_dir, 'first_token_timeline_stream.png')}")
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
|
@@ -1,184 +0,0 @@
-#!/usr/bin/env python3
-import os
-import time
-import json
-import numpy as np
-import pandas as pd
-from openai import OpenAI
-from lib.shared_benchmark_utils import get_text_for_tokens, enc
-from lib.shared_utils import save_json_results
-from lib.shared_plotting import plot_correlation, plot_timeline
-
-
-def measure_first_token(text: str, output_dir: str, tokens: int, run_number: int) -> dict:
-    """Measure time to audio via OpenAI API calls and save the audio output"""
-    results = {
-        "text_length": len(text),
-        "token_count": len(enc.encode(text)),
-        "total_time": None,
-        "time_to_first_chunk": None,
-        "error": None,
-        "audio_path": None,
-        "audio_length": None  # Length of output audio in seconds
-    }
-
-    try:
-        start_time = time.time()
-
-        # Initialize OpenAI client
-        openai = OpenAI(base_url="http://localhost:8880/v1", api_key="not-needed-for-local")
-
-        # Save complete audio
-        audio_filename = f"benchmark_tokens{tokens}_run{run_number}_stream_openai.wav"
-        audio_path = os.path.join(output_dir, audio_filename)
-        results["audio_path"] = audio_path
-
-        first_chunk_time = None
-        all_audio_data = bytearray()
-        chunk_count = 0
-
-        # Make streaming request using OpenAI client
-        with openai.audio.speech.with_streaming_response.create(
-            model="kokoro",
-            voice="af",
-            response_format="pcm",
-            input=text,
-        ) as response:
-            for chunk in response.iter_bytes(chunk_size=1024):
-                if chunk:
-                    chunk_count += 1
-                    if first_chunk_time is None:
-                        first_chunk_time = time.time()
-                        results["time_to_first_chunk"] = first_chunk_time - start_time
-                    all_audio_data.extend(chunk)
-
-        # Write as WAV file
-        import wave
-        with wave.open(audio_path, 'wb') as wav_file:
-            wav_file.setnchannels(1)  # Mono
-            wav_file.setsampwidth(2)  # 2 bytes per sample (16-bit)
-            wav_file.setframerate(24000)  # Known sample rate for Kokoro
-            wav_file.writeframes(all_audio_data)
-
-        # Calculate audio length using scipy
-        import scipy.io.wavfile as wavfile
-        sample_rate, audio_data = wavfile.read(audio_path)
-        results["audio_length"] = len(audio_data) / sample_rate  # Length in seconds
-
-        results["total_time"] = time.time() - start_time
-
-        # Print debug info
-        print(f"Complete audio size: {len(all_audio_data)} bytes")
-        print(f"Number of chunks received: {chunk_count}")
-        print(f"Audio length: {results['audio_length']:.3f}s")
-
-        return results
-
-    except Exception as e:
-        results["error"] = str(e)
-        return results
-
-
-def main():
-    # Set up paths with _stream_openai suffix
-    script_dir = os.path.dirname(os.path.abspath(__file__))
-    output_dir = os.path.join(script_dir, "output_audio_stream_openai")
-    output_data_dir = os.path.join(script_dir, "output_data")
-
-    # Create output directories
-    os.makedirs(output_dir, exist_ok=True)
-    os.makedirs(output_data_dir, exist_ok=True)
-
-    # Load sample text
-    with open(os.path.join(script_dir, "the_time_machine_hg_wells.txt"), "r", encoding="utf-8") as f:
-        text = f.read()
-
-    # Test specific token counts
-    token_sizes = [50, 100, 200, 500]
-    all_results = []
-
-    for tokens in token_sizes:
-        print(f"\nTesting {tokens} tokens (streaming)")
-        test_text = get_text_for_tokens(text, tokens)
-        actual_tokens = len(enc.encode(test_text))
-        print(f"Text preview: {test_text[:50]}...")
-
-        # Run test 5 times for each size to get average
-        for i in range(5):
-            print(f"Run {i+1}/5...")
-            result = measure_first_token(test_text, output_dir, tokens, i + 1)
-            result["target_tokens"] = tokens
-            result["actual_tokens"] = actual_tokens
-            result["run_number"] = i + 1
-
-            print(f"Time to First Audio: {result.get('time_to_first_chunk', 'N/A'):.3f}s")
-            print(f"Time to Save Complete: {result.get('total_time', 'N/A'):.3f}s")
-            print(f"Audio length: {result.get('audio_length', 'N/A'):.3f}s")
-            print(f"Streaming overhead: {(result.get('total_time', 0) - result.get('time_to_first_chunk', 0)):.3f}s")
-
-            if result["error"]:
-                print(f"Error: {result['error']}")
-
-            all_results.append(result)
-
-    # Calculate averages per token size
-    summary = {}
-    for tokens in token_sizes:
-        matching_results = [r for r in all_results if r["target_tokens"] == tokens and not r["error"]]
-        if matching_results:
-            avg_first_chunk = sum(r["time_to_first_chunk"] for r in matching_results) / len(matching_results)
-            avg_total = sum(r["total_time"] for r in matching_results) / len(matching_results)
-            avg_audio_length = sum(r["audio_length"] for r in matching_results) / len(matching_results)
-            summary[tokens] = {
-                "avg_time_to_first_chunk": round(avg_first_chunk, 3),
-                "avg_total_time": round(avg_total, 3),
-                "avg_audio_length": round(avg_audio_length, 3),
-                "num_successful_runs": len(matching_results)
-            }
-
-    # Save results with _stream_openai suffix
-    results_data = {
-        "individual_runs": all_results,
-        "summary": summary,
-        "timestamp": time.strftime("%Y-%m-%d %H:%M:%S")
-    }
-    save_json_results(
-        results_data,
-        os.path.join(output_data_dir, "first_token_benchmark_stream_openai.json")
-    )
-
-    # Create plot directory if it doesn't exist
-    output_plots_dir = os.path.join(script_dir, "output_plots")
-    os.makedirs(output_plots_dir, exist_ok=True)
-
-    # Create DataFrame for plotting
-    df = pd.DataFrame(all_results)
-
-    # Create plots with _stream_openai suffix
-    plot_correlation(
-        df, "target_tokens", "time_to_first_chunk",
-        "Time to First Audio vs Input Size (OpenAI Streaming)",
-        "Number of Input Tokens",
-        "Time to First Audio (seconds)",
-        os.path.join(output_plots_dir, "first_token_latency_stream_openai.png")
-    )
-
-    plot_correlation(
-        df, "target_tokens", "total_time",
-        "Total Time vs Input Size (OpenAI Streaming)",
-        "Number of Input Tokens",
-        "Total Time (seconds)",
-        os.path.join(output_plots_dir, "total_time_latency_stream_openai.png")
-    )
-
-    plot_timeline(
-        df,
-        os.path.join(output_plots_dir, "first_token_timeline_stream_openai.png")
-    )
-
-    print("\nResults and plots saved to:")
-    print(f"- {os.path.join(output_data_dir, 'first_token_benchmark_stream_openai.json')}")
-    print(f"- {os.path.join(output_plots_dir, 'first_token_latency_stream_openai.png')}")
-    print(f"- {os.path.join(output_plots_dir, 'total_time_latency_stream_openai.png')}")
-    print(f"- {os.path.join(output_plots_dir, 'first_token_timeline_stream_openai.png')}")
-
-
-if __name__ == "__main__":
-    main()
@@ -0,0 +1,195 @@
+#!/usr/bin/env python3
+import os
+import time
+
+import requests
+from openai import OpenAI
+from lib.stream_utils import run_benchmark
+
+OPENAI_CLIENT = OpenAI(
+    base_url="http://localhost:8880/v1", api_key="not-needed-for-local"
+)
+
+
+def measure_first_token_requests(
+    text: str, output_dir: str, tokens: int, run_number: int
+) -> dict:
+    """Measure time to audio via direct API calls and save the audio output"""
+    results = {
+        "text_length": len(text),
+        "token_count": None,  # Will be set by run_benchmark
+        "total_time": None,
+        "time_to_first_chunk": None,
+        "error": None,
+        "audio_path": None,
+        "audio_length": None,
+    }
+
+    try:
+        start_time = time.time()
+
+        # Make request with streaming enabled
+        response = requests.post(
+            "http://localhost:8880/v1/audio/speech",
+            json={
+                "model": "kokoro",
+                "input": text,
+                "voice": "af",
+                "response_format": "pcm",
+                "stream": True,
+            },
+            stream=True,
+            timeout=1800,
+        )
+        response.raise_for_status()
+
+        # Save complete audio
+        audio_filename = f"benchmark_tokens{tokens}_run{run_number}_stream.wav"
+        audio_path = os.path.join(output_dir, audio_filename)
+        results["audio_path"] = audio_path
+
+        first_chunk_time = None
+        chunks = []
+        for chunk in response.iter_content(chunk_size=1024):
+            if chunk:
+                if first_chunk_time is None:
+                    first_chunk_time = time.time()
+                    results["time_to_first_chunk"] = first_chunk_time - start_time
+                chunks.append(chunk)
+
+        # Concatenate all PCM chunks
+        if not chunks:
+            raise ValueError("No audio chunks received")
+
+        all_audio_data = b"".join(chunks)
+
+        # Write as WAV file
+        import wave
+
+        with wave.open(audio_path, "wb") as wav_file:
+            wav_file.setnchannels(1)  # Mono
+            wav_file.setsampwidth(2)  # 2 bytes per sample (16-bit)
+            wav_file.setframerate(24000)  # Known sample rate for Kokoro
+            wav_file.writeframes(all_audio_data)
+
+        # Calculate audio length using scipy
+        import scipy.io.wavfile as wavfile
+
+        sample_rate, audio_data = wavfile.read(audio_path)
+        results["audio_length"] = len(audio_data) / sample_rate  # Length in seconds
+
+        results["total_time"] = time.time() - start_time
+
+        # Print debug info
+        print(f"Complete audio size: {len(all_audio_data)} bytes")
+        print(f"Number of chunks received: {len(chunks)}")
+        print(f"Audio length: {results['audio_length']:.3f}s")
+
+        return results
+
+    except Exception as e:
+        results["error"] = str(e)
+        return results
+
+
+def measure_first_token_openai(
+    text: str, output_dir: str, tokens: int, run_number: int
+) -> dict:
+    """Measure time to audio via OpenAI API calls and save the audio output"""
+    results = {
+        "text_length": len(text),
+        "token_count": None,  # Will be set by run_benchmark
+        "total_time": None,
+        "time_to_first_chunk": None,
+        "error": None,
+        "audio_path": None,
+        "audio_length": None,
+    }
+
+    try:
+        start_time = time.time()
+
+        # Save complete audio
+        audio_filename = f"benchmark_tokens{tokens}_run{run_number}_stream_openai.wav"
+        audio_path = os.path.join(output_dir, audio_filename)
+        results["audio_path"] = audio_path
+
+        first_chunk_time = None
+        all_audio_data = bytearray()
+        chunk_count = 0
+
+        # Make streaming request using OpenAI client
+        with OPENAI_CLIENT.audio.speech.with_streaming_response.create(
+            model="kokoro",
+            voice="af",
+            response_format="pcm",
+            input=text,
+        ) as response:
+            for chunk in response.iter_bytes(chunk_size=1024):
+                if chunk:
+                    chunk_count += 1
+                    if first_chunk_time is None:
+                        first_chunk_time = time.time()
+                        results["time_to_first_chunk"] = first_chunk_time - start_time
+                    all_audio_data.extend(chunk)
+
+        # Write as WAV file
+        import wave
+
+        with wave.open(audio_path, "wb") as wav_file:
+            wav_file.setnchannels(1)  # Mono
+            wav_file.setsampwidth(2)  # 2 bytes per sample (16-bit)
+            wav_file.setframerate(24000)  # Known sample rate for Kokoro
+            wav_file.writeframes(all_audio_data)
+
+        # Calculate audio length using scipy
+        import scipy.io.wavfile as wavfile
+
+        sample_rate, audio_data = wavfile.read(audio_path)
+        results["audio_length"] = len(audio_data) / sample_rate  # Length in seconds
+
+        results["total_time"] = time.time() - start_time
+
+        # Print debug info
+        print(f"Complete audio size: {len(all_audio_data)} bytes")
+        print(f"Number of chunks received: {chunk_count}")
+        print(f"Audio length: {results['audio_length']:.3f}s")
+
+        return results
+
+    except Exception as e:
+        results["error"] = str(e)
+        return results
+
+
+def main():
+    script_dir = os.path.dirname(os.path.abspath(__file__))
+    prefix = "cpu"
+    # Run requests benchmark
+    print("\n=== Running Direct Requests Benchmark ===")
+    run_benchmark(
+        measure_first_token_requests,
+        output_dir=os.path.join(script_dir, "output_audio_stream"),
+        output_data_dir=os.path.join(script_dir, "output_data"),
+        output_plots_dir=os.path.join(script_dir, "output_plots"),
+        suffix="_stream",
+        plot_title_suffix="(Streaming)",
+        prefix=prefix,
+    )
+
+    # Run OpenAI benchmark
+    print("\n=== Running OpenAI Library Benchmark ===")
+    run_benchmark(
+        measure_first_token_openai,
+        output_dir=os.path.join(script_dir, "output_audio_stream_openai"),
+        output_data_dir=os.path.join(script_dir, "output_data"),
+        output_plots_dir=os.path.join(script_dir, "output_plots"),
+        suffix="_stream_openai",
+        plot_title_suffix="(OpenAI Streaming)",
+        prefix=prefix,
+    )
+
+
+if __name__ == "__main__":
+    main()
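
Side note on the hard-coded WAV parameters above (mono, 16-bit, 24 kHz): they fully determine how long a raw PCM payload plays for, so the expected audio length can be sanity-checked without scipy. A minimal sketch, not part of the diff:

```python
def pcm_duration_seconds(
    pcm_bytes: bytes,
    sample_rate: int = 24000,  # Kokoro's output rate, per the wave params above
    sample_width: int = 2,     # 16-bit samples
    channels: int = 1,         # mono
) -> float:
    # duration = byte count / (bytes consumed per second of audio)
    return len(pcm_bytes) / (sample_rate * sample_width * channels)

assert pcm_duration_seconds(b"\x00" * 480_000) == 10.0  # 480 kB of PCM -> 10 s
```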
@@ -1,30 +1,37 @@
 #!/usr/bin/env python3
 import os
-import sys
 import json
 import time
-import threading
 import queue
-import pandas as pd
+import sys
+import threading
 from datetime import datetime
 
-from lib.shared_plotting import plot_system_metrics, plot_correlation
+import pandas as pd
 from lib.shared_utils import (
-    get_system_metrics, save_json_results, write_benchmark_stats,
-    real_time_factor
+    real_time_factor,
+    save_json_results,
+    get_system_metrics,
+    write_benchmark_stats,
 )
+from lib.shared_plotting import plot_correlation, plot_system_metrics
 from lib.shared_benchmark_utils import (
-    get_text_for_tokens, make_tts_request, generate_token_sizes, enc
+    enc,
+    make_tts_request,
+    get_text_for_tokens,
+    generate_token_sizes,
 )
 
 
 class SystemMonitor:
     def __init__(self, interval=1.0):
         """Rough system tracker: Not always accurate"""
         self.interval = interval
         self.metrics_queue = queue.Queue()
         self.stop_event = threading.Event()
         self.metrics_timeline = []
         self.start_time = None
 
     def _monitor_loop(self):
         """Background thread function to collect system metrics."""
         while not self.stop_event.is_set():
@@ -32,20 +39,20 @@ class SystemMonitor:
             metrics["relative_time"] = time.time() - self.start_time
             self.metrics_queue.put(metrics)
             time.sleep(self.interval)
 
     def start(self):
         """Start the monitoring thread."""
         self.start_time = time.time()
         self.monitor_thread = threading.Thread(target=self._monitor_loop)
         self.monitor_thread.daemon = True
         self.monitor_thread.start()
 
     def stop(self):
         """Stop the monitoring thread and collect final metrics."""
         self.stop_event.set()
-        if hasattr(self, 'monitor_thread'):
+        if hasattr(self, "monitor_thread"):
             self.monitor_thread.join(timeout=2)
 
         # Collect all metrics from queue
         while True:
             try:
@@ -53,23 +60,24 @@ class SystemMonitor:
                 self.metrics_timeline.append(metrics)
             except queue.Empty:
                 break
 
         return self.metrics_timeline
 
 
 def main():
     # Initialize system monitor
     monitor = SystemMonitor(interval=1.0)  # 1 second interval
     # Set prefix for output files (e.g. "gpu", "cpu", "onnx", etc.)
-    prefix = "gpu"
+    prefix = "cpu"
     # Generate token sizes
-    if 'gpu' in prefix:
+    if "gpu" in prefix:
         token_sizes = generate_token_sizes(
-            max_tokens=5000, dense_step=150,
-            dense_max=1000, sparse_step=1000)
-    elif 'cpu' in prefix:
+            max_tokens=1000, dense_step=150, dense_max=1000, sparse_step=1000
+        )
+    elif "cpu" in prefix:
         token_sizes = generate_token_sizes(
-            max_tokens=1000, dense_step=300,
-            dense_max=1000, sparse_step=0)
+            max_tokens=1000, dense_step=100, dense_max=500, sparse_step=250
+        )
     else:
         token_sizes = generate_token_sizes(max_tokens=3000)
 
@@ -78,7 +86,7 @@ def main():
     output_dir = os.path.join(script_dir, "output_audio")
     output_data_dir = os.path.join(script_dir, "output_data")
     output_plots_dir = os.path.join(script_dir, "output_plots")
 
     # Create output directories
     os.makedirs(output_dir, exist_ok=True)
     os.makedirs(output_data_dir, exist_ok=True)
@@ -90,7 +98,9 @@ def main():
         filename = f"{prefix}_{filename}"
         return os.path.join(path, filename)
 
-    with open(os.path.join(script_dir, "the_time_machine_hg_wells.txt"), "r", encoding="utf-8") as f:
+    with open(
+        os.path.join(script_dir, "the_time_machine_hg_wells.txt"), "r", encoding="utf-8"
+    ) as f:
         text = f.read()
 
     total_tokens = len(enc.encode(text))
@@ -100,7 +110,7 @@ def main():
 
     results = []
    test_start_time = time.time()
 
     # Start system monitoring
     monitor.start()
 
@@ -114,7 +124,8 @@ def main():
         processing_time, audio_length = make_tts_request(
             chunk,
             output_dir=output_dir,
-            prefix=prefix
+            prefix=prefix,
+            stream=False,  # Use non-streaming mode for RTF benchmarking
         )
         if processing_time is None or audio_length is None:
             print("Breaking loop due to error")
@@ -123,14 +134,16 @@ def main():
         # Calculate RTF using the correct formula
         rtf = real_time_factor(processing_time, audio_length)
         print(f"Real-Time Factor: {rtf:.5f}")
 
-        results.append({
-            "tokens": actual_tokens,
-            "processing_time": processing_time,
-            "output_length": audio_length,
-            "rtf": rtf,
-            "elapsed_time": round(time.time() - test_start_time, 2),
-        })
+        results.append(
+            {
+                "tokens": actual_tokens,
+                "processing_time": processing_time,
+                "output_length": audio_length,
+                "rtf": rtf,
+                "elapsed_time": round(time.time() - test_start_time, 5),
+            }
+        )
 
     df = pd.DataFrame(results)
     if df.empty:
@@ -144,89 +157,101 @@ def main():
         {
             "title": "Benchmark Statistics (with correct RTF)",
             "stats": {
-                "Total tokens processed": df['tokens'].sum(),
-                "Total audio generated (s)": df['output_length'].sum(),
-                "Total test duration (s)": df['elapsed_time'].max(),
-                "Average processing rate (tokens/s)": df['tokens_per_second'].mean(),
-                "Average RTF": df['rtf'].mean(),
-                "Average Real Time Speed": 1/df['rtf'].mean()
-            }
+                "Total tokens processed": df["tokens"].sum(),
+                "Total audio generated (s)": df["output_length"].sum(),
+                "Total test duration (s)": df["elapsed_time"].max(),
+                "Average processing rate (tokens/s)": df["tokens_per_second"].mean(),
+                "Average RTF": df["rtf"].mean(),
+                "Average Real Time Speed": 1 / df["rtf"].mean(),
+            },
         },
         {
             "title": "Per-chunk Stats",
             "stats": {
-                "Average chunk size (tokens)": df['tokens'].mean(),
-                "Min chunk size (tokens)": df['tokens'].min(),
-                "Max chunk size (tokens)": df['tokens'].max(),
-                "Average processing time (s)": df['processing_time'].mean(),
-                "Average output length (s)": df['output_length'].mean()
-            }
+                "Average chunk size (tokens)": df["tokens"].mean(),
+                "Min chunk size (tokens)": df["tokens"].min(),
+                "Max chunk size (tokens)": df["tokens"].max(),
+                "Average processing time (s)": df["processing_time"].mean(),
+                "Average output length (s)": df["output_length"].mean(),
+            },
        },
         {
             "title": "Performance Ranges",
             "stats": {
                 "Processing rate range (tokens/s)": f"{df['tokens_per_second'].min():.2f} - {df['tokens_per_second'].max():.2f}",
                 "RTF range": f"{df['rtf'].min():.2f}x - {df['rtf'].max():.2f}x",
-                "Real Time Speed range": f"{1/df['rtf'].max():.2f}x - {1/df['rtf'].min():.2f}x"
-            }
-        }
+                "Real Time Speed range": f"{1/df['rtf'].max():.2f}x - {1/df['rtf'].min():.2f}x",
+            },
+        },
     ]
-    write_benchmark_stats(stats, prefix_path(output_data_dir, "benchmark_stats_rtf.txt"))
+    write_benchmark_stats(
+        stats, prefix_path(output_data_dir, "benchmark_stats_rtf.txt")
+    )
 
     # Plot Processing Time vs Token Count
     plot_correlation(
-        df, "tokens", "processing_time",
+        df,
+        "tokens",
+        "processing_time",
         "Processing Time vs Input Size",
         "Number of Input Tokens",
         "Processing Time (seconds)",
-        prefix_path(output_plots_dir, "processing_time_rtf.png")
+        prefix_path(output_plots_dir, "processing_time_rtf.png"),
     )
 
     # Plot RTF vs Token Count
     plot_correlation(
-        df, "tokens", "rtf",
+        df,
+        "tokens",
+        "rtf",
         "Real-Time Factor vs Input Size",
         "Number of Input Tokens",
         "Real-Time Factor (processing time / audio length)",
-        prefix_path(output_plots_dir, "realtime_factor_rtf.png")
+        prefix_path(output_plots_dir, "realtime_factor_rtf.png"),
    )
 
     # Stop monitoring and get final metrics
     final_metrics = monitor.stop()
 
     # Convert metrics timeline to DataFrame for stats
     metrics_df = pd.DataFrame(final_metrics)
 
     # Add system usage stats
     if not metrics_df.empty:
-        stats.append({
-            "title": "System Usage Statistics",
-            "stats": {
-                "Peak CPU Usage (%)": metrics_df['cpu_percent'].max(),
-                "Avg CPU Usage (%)": metrics_df['cpu_percent'].mean(),
-                "Peak RAM Usage (%)": metrics_df['ram_percent'].max(),
-                "Avg RAM Usage (%)": metrics_df['ram_percent'].mean(),
-                "Peak RAM Used (GB)": metrics_df['ram_used_gb'].max(),
-                "Avg RAM Used (GB)": metrics_df['ram_used_gb'].mean(),
-            }
-        })
-        if 'gpu_memory_used' in metrics_df:
-            stats[-1]["stats"].update({
-                "Peak GPU Memory (MB)": metrics_df['gpu_memory_used'].max(),
-                "Avg GPU Memory (MB)": metrics_df['gpu_memory_used'].mean(),
-            })
+        stats.append(
+            {
+                "title": "System Usage Statistics",
+                "stats": {
+                    "Peak CPU Usage (%)": metrics_df["cpu_percent"].max(),
+                    "Avg CPU Usage (%)": metrics_df["cpu_percent"].mean(),
+                    "Peak RAM Usage (%)": metrics_df["ram_percent"].max(),
+                    "Avg RAM Usage (%)": metrics_df["ram_percent"].mean(),
+                    "Peak RAM Used (GB)": metrics_df["ram_used_gb"].max(),
+                    "Avg RAM Used (GB)": metrics_df["ram_used_gb"].mean(),
+                },
+            }
+        )
+        if "gpu_memory_used" in metrics_df:
+            stats[-1]["stats"].update(
+                {
+                    "Peak GPU Memory (MB)": metrics_df["gpu_memory_used"].max(),
+                    "Avg GPU Memory (MB)": metrics_df["gpu_memory_used"].mean(),
+                }
+            )
 
     # Plot system metrics
-    plot_system_metrics(final_metrics, prefix_path(output_plots_dir, "system_usage_rtf.png"))
+    plot_system_metrics(
+        final_metrics, prefix_path(output_plots_dir, "system_usage_rtf.png")
+    )
 
     # Save final results
     save_json_results(
         {
             "results": results,
             "system_metrics": final_metrics,
-            "test_duration": time.time() - test_start_time
+            "test_duration": time.time() - test_start_time,
         },
-        prefix_path(output_data_dir, "benchmark_results_rtf.json")
+        prefix_path(output_data_dir, "benchmark_results_rtf.json"),
     )
 
     print("\nResults saved to:")
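
For readers skimming the diff: the RTF reported by this script follows the axis label above, processing time divided by generated audio length, so values below 1.0 mean faster-than-real-time synthesis. A sketch of the arithmetic with illustrative numbers (the `real_time_factor` helper lives in `lib.shared_utils`, whose body is not shown here; this stand-in simply assumes it matches that label):

```python
def real_time_factor(processing_time: float, audio_length: float) -> float:
    # RTF = processing time / audio length; < 1.0 is faster than real time
    return processing_time / audio_length

rtf = real_time_factor(2.5, 10.0)  # 0.25: ten seconds of audio rendered in 2.5 s
speed = 1 / rtf                    # 4.0, the "Average Real Time Speed" stat above
```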
@@ -1,19 +1,30 @@
 import os
 import json
 import time
 
 import pandas as pd
-from examples.assorted_checks.lib.shared_plotting import plot_system_metrics, plot_correlation
-
 from examples.assorted_checks.lib.shared_utils import (
-    get_system_metrics, save_json_results, write_benchmark_stats
+    save_json_results,
+    get_system_metrics,
+    write_benchmark_stats,
 )
+from examples.assorted_checks.lib.shared_plotting import (
+    plot_correlation,
+    plot_system_metrics,
+)
 from examples.assorted_checks.lib.shared_benchmark_utils import (
-    get_text_for_tokens, make_tts_request, generate_token_sizes, enc
+    enc,
+    make_tts_request,
+    get_text_for_tokens,
+    generate_token_sizes,
 )
 
 
 def main():
     # Get optional prefix from first command line argument
     import sys
 
     prefix = sys.argv[1] if len(sys.argv) > 1 else ""
 
     # Set up paths relative to this file
@@ -21,7 +32,7 @@ def main():
     output_dir = os.path.join(script_dir, "output_audio")
     output_data_dir = os.path.join(script_dir, "output_data")
     output_plots_dir = os.path.join(script_dir, "output_plots")
 
     # Create output directories
     os.makedirs(output_dir, exist_ok=True)
     os.makedirs(output_data_dir, exist_ok=True)
@@ -43,7 +54,6 @@ def main():
     total_tokens = len(enc.encode(text))
     print(f"Total tokens in file: {total_tokens}")
 
-
     token_sizes = generate_token_sizes(total_tokens)
 
     print(f"Testing sizes: {token_sizes}")
@@ -85,7 +95,7 @@ def main():
         # Save intermediate results
         save_json_results(
             {"results": results, "system_metrics": system_metrics},
-            prefix_path(output_data_dir, "benchmark_results.json")
+            prefix_path(output_data_dir, "benchmark_results.json"),
         )
 
     # Create DataFrame and calculate stats
@@ -102,53 +112,59 @@ def main():
         {
             "title": "Benchmark Statistics",
             "stats": {
-                "Total tokens processed": df['tokens'].sum(),
-                "Total audio generated (s)": df['output_length'].sum(),
-                "Total test duration (s)": df['elapsed_time'].max(),
-                "Average processing rate (tokens/s)": df['tokens_per_second'].mean(),
-                "Average realtime factor": df['realtime_factor'].mean()
-            }
+                "Total tokens processed": df["tokens"].sum(),
+                "Total audio generated (s)": df["output_length"].sum(),
+                "Total test duration (s)": df["elapsed_time"].max(),
+                "Average processing rate (tokens/s)": df["tokens_per_second"].mean(),
+                "Average realtime factor": df["realtime_factor"].mean(),
+            },
         },
         {
             "title": "Per-chunk Stats",
             "stats": {
-                "Average chunk size (tokens)": df['tokens'].mean(),
-                "Min chunk size (tokens)": df['tokens'].min(),
-                "Max chunk size (tokens)": df['tokens'].max(),
-                "Average processing time (s)": df['processing_time'].mean(),
-                "Average output length (s)": df['output_length'].mean()
-            }
+                "Average chunk size (tokens)": df["tokens"].mean(),
+                "Min chunk size (tokens)": df["tokens"].min(),
+                "Max chunk size (tokens)": df["tokens"].max(),
+                "Average processing time (s)": df["processing_time"].mean(),
+                "Average output length (s)": df["output_length"].mean(),
+            },
         },
         {
             "title": "Performance Ranges",
             "stats": {
                 "Processing rate range (tokens/s)": f"{df['tokens_per_second'].min():.2f} - {df['tokens_per_second'].max():.2f}",
-                "Realtime factor range": f"{df['realtime_factor'].min():.2f}x - {df['realtime_factor'].max():.2f}x"
-            }
-        }
+                "Realtime factor range": f"{df['realtime_factor'].min():.2f}x - {df['realtime_factor'].max():.2f}x",
+            },
+        },
     ]
     write_benchmark_stats(stats, prefix_path(output_data_dir, "benchmark_stats.txt"))
 
     # Plot Processing Time vs Token Count
     plot_correlation(
-        df, "tokens", "processing_time",
+        df,
+        "tokens",
+        "processing_time",
         "Processing Time vs Input Size",
         "Number of Input Tokens",
         "Processing Time (seconds)",
-        prefix_path(output_plots_dir, "processing_time.png")
+        prefix_path(output_plots_dir, "processing_time.png"),
     )
 
     # Plot Realtime Factor vs Token Count
     plot_correlation(
-        df, "tokens", "realtime_factor",
+        df,
+        "tokens",
+        "realtime_factor",
         "Realtime Factor vs Input Size",
         "Number of Input Tokens",
         "Realtime Factor (output length / processing time)",
-        prefix_path(output_plots_dir, "realtime_factor.png")
+        prefix_path(output_plots_dir, "realtime_factor.png"),
    )
 
     # Plot system metrics
-    plot_system_metrics(system_metrics, prefix_path(output_plots_dir, "system_usage.png"))
+    plot_system_metrics(
+        system_metrics, prefix_path(output_plots_dir, "system_usage.png")
+    )
 
     print("\nResults saved to:")
     print(f"- {prefix_path(output_data_dir, 'benchmark_results.json')}")
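
Note that this script's `realtime_factor` is the inverse of the RTF used by the RTF benchmark above: here the axis label reads "output length / processing time", so larger is better, whereas RTF (processing time / audio length) is better when smaller. With the same illustrative numbers, 10 s of audio produced in 2.5 s gives a realtime factor of 4.0 and an RTF of 0.25.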
@@ -1,11 +1,12 @@
 """Shared utilities specific to TTS benchmarking."""
+
 import time
-from typing import List, Optional, Tuple
+from typing import List, Tuple, Optional
 
 import requests
 import tiktoken
 
-from .shared_utils import get_audio_length, save_audio_file
+from .shared_utils import save_audio_file, get_audio_length
 
 # Global tokenizer instance
 enc = tiktoken.get_encoding("cl100k_base")
@@ -13,11 +14,11 @@ enc = tiktoken.get_encoding("cl100k_base")
 
 def get_text_for_tokens(text: str, num_tokens: int) -> str:
     """Get a slice of text that contains exactly num_tokens tokens.
 
     Args:
         text: Input text to slice
         num_tokens: Desired number of tokens
 
     Returns:
         str: Text slice containing exactly num_tokens tokens
     """
@@ -31,44 +32,69 @@ def make_tts_request(
     text: str,
     output_dir: str = None,
     timeout: int = 1800,
-    prefix: str = ""
+    prefix: str = "",
+    stream: bool = True,
 ) -> Tuple[Optional[float], Optional[float]]:
     """Make TTS request using OpenAI-compatible endpoint.
 
     Args:
         text: Input text to convert to speech
         output_dir: Directory to save audio files. If None, audio won't be saved.
         timeout: Request timeout in seconds
         prefix: Optional prefix for output filenames
 
     Returns:
         tuple: (processing_time, audio_length) in seconds, or (None, None) on error
     """
     try:
         start_time = time.time()
-        response = requests.post(
-            "http://localhost:8880/v1/audio/speech",
-            json={
-                "model": "kokoro",
-                "input": text,
-                "voice": "af",
-                "response_format": "wav",
-            },
-            timeout=timeout,
-        )
-        response.raise_for_status()
+        if stream:
+            # For streaming, we need to collect all chunks
+            audio_chunks = []
+            response = requests.post(
+                "http://localhost:8880/v1/audio/speech",
+                json={
+                    "model": "kokoro",
+                    "input": text,
+                    "voice": "af",
+                    "response_format": "wav",
+                    "stream": True,
+                },
+                timeout=timeout,
+                stream=True,
+            )
+            response.raise_for_status()
+
+            for chunk in response.iter_content(chunk_size=8192):
+                if chunk:
+                    audio_chunks.append(chunk)
+
+            # Combine all chunks
+            audio_data = b"".join(audio_chunks)
+        else:
+            response = requests.post(
+                "http://localhost:8880/v1/audio/speech",
+                json={
+                    "model": "kokoro",
+                    "input": text,
+                    "voice": "af",
+                    "response_format": "wav",
+                    "stream": False,
+                },
+                timeout=timeout,
+            )
+            response.raise_for_status()
+            audio_data = response.content
 
         processing_time = round(time.time() - start_time, 2)
-        # Calculate audio length from response content
-        audio_length = get_audio_length(response.content)
+
+        # Calculate audio length from audio data
+        audio_length = get_audio_length(audio_data)
 
         # Save the audio file if output_dir is provided
         if output_dir:
             token_count = len(enc.encode(text))
             output_file = save_audio_file(
-                response.content,
-                f"chunk_{token_count}_tokens",
-                output_dir
+                audio_data, f"chunk_{token_count}_tokens", output_dir
             )
             print(f"Saved audio to {output_file}")
 
@@ -86,26 +112,26 @@ def generate_token_sizes(
     max_tokens: int,
     dense_step: int = 100,
     dense_max: int = 1000,
-    sparse_step: int = 1000
+    sparse_step: int = 1000,
 ) -> List[int]:
     """Generate token size ranges with dense sampling at start.
 
     Args:
         max_tokens: Maximum number of tokens to generate sizes up to
         dense_step: Step size for dense sampling range
         dense_max: Maximum value for dense sampling
         sparse_step: Step size for sparse sampling range
 
     Returns:
         list: Sorted list of token sizes
     """
     # Dense sampling at start
     dense_range = list(range(dense_step, dense_max + 1, dense_step))
 
     if max_tokens <= dense_max or sparse_step < dense_max:
         return sorted(dense_range)
     # Sparse sampling for larger sizes
     sparse_range = list(range(dense_max + sparse_step, max_tokens + 1, sparse_step))
 
     # Combine and deduplicate
     return sorted(list(set(dense_range + sparse_range)))
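
A quick sketch of what the sampling logic above produces, derived by hand from the function as written (not captured repo output):

```python
# Dense steps up to dense_max, then sparse steps out to max_tokens:
generate_token_sizes(max_tokens=3000, dense_step=100, dense_max=1000, sparse_step=1000)
# -> [100, 200, 300, 400, 500, 600, 700, 800, 900, 1000, 2000, 3000]

# If max_tokens <= dense_max (or sparse_step < dense_max), only the dense range is
# returned; note it is not clipped to max_tokens:
generate_token_sizes(max_tokens=800)
# -> [100, 200, 300, 400, 500, 600, 700, 800, 900, 1000]
```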
@@ -1,7 +1,8 @@
 """Shared plotting utilities for benchmarks and tests."""
+
+import numpy as np
 import pandas as pd
 import seaborn as sns
-import numpy as np
 import matplotlib.pyplot as plt
 import matplotlib.patches as patches
@@ -12,66 +13,71 @@ STYLE_CONFIG = {
     "secondary_color": "#05d9e8",
     "grid_color": "#ffffff",
     "text_color": "#ffffff",
-    "font_sizes": {
-        "title": 16,
-        "label": 14,
-        "tick": 12,
-        "text": 10
-    }
+    "font_sizes": {"title": 16, "label": 14, "tick": 12, "text": 10},
 }
 
 
 def setup_plot(fig, ax, title, xlabel=None, ylabel=None):
     """Configure plot styling with consistent theme.
 
     Args:
         fig: matplotlib figure object
         ax: matplotlib axis object
         title: str, plot title
         xlabel: str, optional x-axis label
         ylabel: str, optional y-axis label
 
     Returns:
         tuple: (fig, ax) with applied styling
     """
     # Grid styling
     ax.grid(True, linestyle="--", alpha=0.3, color=STYLE_CONFIG["grid_color"])
 
     # Title and labels
-    ax.set_title(title, pad=20,
-                 fontsize=STYLE_CONFIG["font_sizes"]["title"],
-                 fontweight="bold",
-                 color=STYLE_CONFIG["text_color"])
+    ax.set_title(
+        title,
+        pad=20,
+        fontsize=STYLE_CONFIG["font_sizes"]["title"],
+        fontweight="bold",
+        color=STYLE_CONFIG["text_color"],
+    )
 
     if xlabel:
-        ax.set_xlabel(xlabel,
-                      fontsize=STYLE_CONFIG["font_sizes"]["label"],
-                      fontweight="medium",
-                      color=STYLE_CONFIG["text_color"])
+        ax.set_xlabel(
+            xlabel,
+            fontsize=STYLE_CONFIG["font_sizes"]["label"],
+            fontweight="medium",
+            color=STYLE_CONFIG["text_color"],
+        )
     if ylabel:
-        ax.set_ylabel(ylabel,
-                      fontsize=STYLE_CONFIG["font_sizes"]["label"],
-                      fontweight="medium",
-                      color=STYLE_CONFIG["text_color"])
+        ax.set_ylabel(
+            ylabel,
+            fontsize=STYLE_CONFIG["font_sizes"]["label"],
+            fontweight="medium",
+            color=STYLE_CONFIG["text_color"],
+        )
 
     # Tick styling
-    ax.tick_params(labelsize=STYLE_CONFIG["font_sizes"]["tick"],
-                   colors=STYLE_CONFIG["text_color"])
+    ax.tick_params(
+        labelsize=STYLE_CONFIG["font_sizes"]["tick"], colors=STYLE_CONFIG["text_color"]
+    )
 
     # Spine styling
     for spine in ax.spines.values():
         spine.set_color(STYLE_CONFIG["text_color"])
         spine.set_alpha(0.3)
         spine.set_linewidth(0.5)
 
     # Background colors
     ax.set_facecolor(STYLE_CONFIG["background_color"])
     fig.patch.set_facecolor(STYLE_CONFIG["background_color"])
 
     return fig, ax
 
 
 def plot_system_metrics(metrics_data, output_path):
     """Create plots for system metrics over time.
 
     Args:
         metrics_data: list of dicts containing system metrics
         output_path: str, path to save the output plot
@@ -79,68 +85,118 @@ def plot_system_metrics(metrics_data, output_path):
     df = pd.DataFrame(metrics_data)
     df["timestamp"] = pd.to_datetime(df["timestamp"])
     elapsed_time = (df["timestamp"] - df["timestamp"].iloc[0]).dt.total_seconds()
 
     # Get baseline values
     baseline_cpu = df["cpu_percent"].iloc[0]
     baseline_ram = df["ram_used_gb"].iloc[0]
-    baseline_gpu = df["gpu_memory_used"].iloc[0] / 1024 if "gpu_memory_used" in df.columns else None
+    baseline_gpu = (
+        df["gpu_memory_used"].iloc[0] / 1024
+        if "gpu_memory_used" in df.columns
+        else None
+    )
 
     # Convert GPU memory to GB if present
     if "gpu_memory_used" in df.columns:
         df["gpu_memory_gb"] = df["gpu_memory_used"] / 1024
 
     plt.style.use("dark_background")
 
     # Create subplots based on available metrics
     has_gpu = "gpu_memory_used" in df.columns
     num_plots = 3 if has_gpu else 2
     fig, axes = plt.subplots(num_plots, 1, figsize=(15, 5 * num_plots))
     fig.patch.set_facecolor(STYLE_CONFIG["background_color"])
 
     # Smoothing window
     window = min(5, len(df) // 2)
 
     # Plot CPU Usage
     smoothed_cpu = df["cpu_percent"].rolling(window=window, center=True).mean()
-    sns.lineplot(x=elapsed_time, y=smoothed_cpu, ax=axes[0],
-                 color=STYLE_CONFIG["primary_color"], linewidth=2)
-    axes[0].axhline(y=baseline_cpu, color=STYLE_CONFIG["secondary_color"],
-                    linestyle="--", alpha=0.5, label="Baseline")
-    setup_plot(fig, axes[0], "CPU Usage Over Time",
-               xlabel="Time (seconds)", ylabel="CPU Usage (%)")
+    sns.lineplot(
+        x=elapsed_time,
+        y=smoothed_cpu,
+        ax=axes[0],
+        color=STYLE_CONFIG["primary_color"],
+        linewidth=2,
+    )
+    axes[0].axhline(
+        y=baseline_cpu,
+        color=STYLE_CONFIG["secondary_color"],
+        linestyle="--",
+        alpha=0.5,
+        label="Baseline",
+    )
+    setup_plot(
+        fig,
+        axes[0],
+        "CPU Usage Over Time",
+        xlabel="Time (seconds)",
+        ylabel="CPU Usage (%)",
+    )
     axes[0].set_ylim(0, max(df["cpu_percent"]) * 1.1)
     axes[0].legend()
 
     # Plot RAM Usage
     smoothed_ram = df["ram_used_gb"].rolling(window=window, center=True).mean()
-    sns.lineplot(x=elapsed_time, y=smoothed_ram, ax=axes[1],
-                 color=STYLE_CONFIG["secondary_color"], linewidth=2)
-    axes[1].axhline(y=baseline_ram, color=STYLE_CONFIG["primary_color"],
-                    linestyle="--", alpha=0.5, label="Baseline")
-    setup_plot(fig, axes[1], "RAM Usage Over Time",
-               xlabel="Time (seconds)", ylabel="RAM Usage (GB)")
+    sns.lineplot(
+        x=elapsed_time,
+        y=smoothed_ram,
+        ax=axes[1],
+        color=STYLE_CONFIG["secondary_color"],
+        linewidth=2,
+    )
+    axes[1].axhline(
+        y=baseline_ram,
+        color=STYLE_CONFIG["primary_color"],
+        linestyle="--",
+        alpha=0.5,
+        label="Baseline",
+    )
+    setup_plot(
+        fig,
+        axes[1],
+        "RAM Usage Over Time",
+        xlabel="Time (seconds)",
+        ylabel="RAM Usage (GB)",
+    )
     axes[1].set_ylim(0, max(df["ram_used_gb"]) * 1.1)
     axes[1].legend()
 
     # Plot GPU Memory if available
     if has_gpu:
         smoothed_gpu = df["gpu_memory_gb"].rolling(window=window, center=True).mean()
-        sns.lineplot(x=elapsed_time, y=smoothed_gpu, ax=axes[2],
-                     color=STYLE_CONFIG["primary_color"], linewidth=2)
-        axes[2].axhline(y=baseline_gpu, color=STYLE_CONFIG["secondary_color"],
-                        linestyle="--", alpha=0.5, label="Baseline")
-        setup_plot(fig, axes[2], "GPU Memory Usage Over Time",
-                   xlabel="Time (seconds)", ylabel="GPU Memory (GB)")
+        sns.lineplot(
+            x=elapsed_time,
+            y=smoothed_gpu,
+            ax=axes[2],
+            color=STYLE_CONFIG["primary_color"],
+            linewidth=2,
+        )
+        axes[2].axhline(
+            y=baseline_gpu,
+            color=STYLE_CONFIG["secondary_color"],
+            linestyle="--",
+            alpha=0.5,
+            label="Baseline",
+        )
+        setup_plot(
+            fig,
+            axes[2],
+            "GPU Memory Usage Over Time",
+            xlabel="Time (seconds)",
+            ylabel="GPU Memory (GB)",
+        )
         axes[2].set_ylim(0, max(df["gpu_memory_gb"]) * 1.1)
         axes[2].legend()
 
     plt.tight_layout()
     plt.savefig(output_path, dpi=300, bbox_inches="tight")
     plt.close()
 
-def plot_timeline(df, output_path, suffix=""):
+
+def plot_timeline(df, output_path, suffix="", prefix=""):
     """Create timeline plot showing latency for each run.
 
     Args:
         df: pandas DataFrame containing run data with columns:
            - target_tokens: number of tokens
@@ -149,124 +205,161 @@ def plot_timeline(df, output_path, suffix=""):
         output_path: str, path to save the output plot
     """
     plt.style.use("dark_background")
 
     # Sort by tokens and run number
-    df = df.sort_values(['target_tokens', 'run_number'])
+    df = df.sort_values(["target_tokens", "run_number"])
 
     # Create figure and axis
     fig, ax = plt.subplots(figsize=(12, 6))
 
     # Calculate y positions for each run with tighter grouping
-    unique_tokens = sorted(df['target_tokens'].unique())
+    unique_tokens = sorted(df["target_tokens"].unique())
     y_positions = {}
     current_y = 0
     group_spacing = 0.8  # Space between groups
     run_spacing = 0.2  # Space between runs in a group
 
     for tokens in unique_tokens:
-        runs = df[df['target_tokens'] == tokens]
+        runs = df[df["target_tokens"] == tokens]
         base_y = current_y
         for i, (_, run) in enumerate(runs.iterrows()):
-            y_positions[(tokens, run['run_number'])] = base_y + (i * run_spacing)
+            y_positions[(tokens, run["run_number"])] = base_y + (i * run_spacing)
         current_y = base_y + (len(runs) * run_spacing) + group_spacing
 
     # Plot bars and points with more transparency
     bar_height = 0.15
     for _, row in df.iterrows():
-        y = y_positions[(row['target_tokens'], row['run_number'])]
-        latency = row['time_to_first_chunk']
+        y = y_positions[(row["target_tokens"], row["run_number"])]
+        latency = row["time_to_first_chunk"]
 
         # Latency bar
-        ax.add_patch(patches.Rectangle(
-            (0, y - bar_height/2),
-            latency,
-            bar_height,
-            facecolor=STYLE_CONFIG["primary_color"],
-            alpha=0.3
-        ))
+        ax.add_patch(
+            patches.Rectangle(
+                (0, y - bar_height / 2),
+                latency,
+                bar_height,
+                facecolor=STYLE_CONFIG["primary_color"],
+                alpha=0.3,
+            )
+        )
 
         # End point
-        ax.plot(latency, y, 'o',
-                color=STYLE_CONFIG["secondary_color"],
-                markersize=4,
-                alpha=0.5)
+        ax.plot(
+            latency,
+            y,
+            "o",
+            color=STYLE_CONFIG["secondary_color"],
+            markersize=4,
+            alpha=0.5,
+        )
 
     # Add mean lines and values for each token group
     for tokens in unique_tokens:
-        token_runs = df[df['target_tokens'] == tokens]
-        mean_latency = token_runs['time_to_first_chunk'].mean()
-        y_positions_for_token = [y_positions[(tokens, run['run_number'])] for _, run in token_runs.iterrows()]
+        token_runs = df[df["target_tokens"] == tokens]
+        mean_latency = token_runs["time_to_first_chunk"].mean()
+        y_positions_for_token = [
+            y_positions[(tokens, run["run_number"])] for _, run in token_runs.iterrows()
+        ]
        min_y = min(y_positions_for_token)
         max_y = max(y_positions_for_token)
         group_center = (min_y + max_y) / 2
 
         # Plot mean line with gradient alpha
         gradient = np.linspace(0.2, 0.8, 100)
-        for i in range(len(gradient)-1):
-            y1 = min_y - bar_height + (max_y - min_y + 2*bar_height) * (i/len(gradient))
-            y2 = min_y - bar_height + (max_y - min_y + 2*bar_height) * ((i+1)/len(gradient))
-            ax.plot([mean_latency, mean_latency], [y1, y2],
-                    '-', color=STYLE_CONFIG["secondary_color"],
-                    linewidth=3, alpha=gradient[i])
+        for i in range(len(gradient) - 1):
+            y1 = (
+                min_y
+                - bar_height
+                + (max_y - min_y + 2 * bar_height) * (i / len(gradient))
+            )
+            y2 = (
+                min_y
+                - bar_height
+                + (max_y - min_y + 2 * bar_height) * ((i + 1) / len(gradient))
+            )
+            ax.plot(
+                [mean_latency, mean_latency],
+                [y1, y2],
+                "-",
+                color=STYLE_CONFIG["secondary_color"],
+                linewidth=3,
+                alpha=gradient[i],
+            )
 
         # Add mean value label with background
-        label_text = f'Mean: {mean_latency:.3f}s'
+        label_text = f"Mean: {mean_latency:.3f}s"
         bbox_props = dict(
             facecolor=STYLE_CONFIG["background_color"],
             edgecolor=STYLE_CONFIG["secondary_color"],
             alpha=0.8,
             pad=3,
-            linewidth=1
+            linewidth=1,
         )
-        ax.text(mean_latency + 0.02, group_center,
-                label_text,
-                color=STYLE_CONFIG["secondary_color"],
-                va='center',
-                fontsize=10,
-                fontweight='bold',
-                bbox=bbox_props)
+        ax.text(
+            mean_latency + 0.02,
+            group_center,
+            label_text,
+            color=STYLE_CONFIG["secondary_color"],
+            va="center",
+            fontsize=10,
+            fontweight="bold",
+            bbox=bbox_props,
+        )
 
     # Customize plot
     ax.set_ylim(-1, current_y)
-    ax.set_xlim(0, df['time_to_first_chunk'].max() * 1.3)  # Extra space for labels
+    ax.set_xlim(0, df["time_to_first_chunk"].max() * 1.3)  # Extra space for labels
 
     # Add labels for token groups with tighter spacing
     group_positions = {}
     for tokens in unique_tokens:
-        runs = df[df['target_tokens'] == tokens]
-        y_positions_for_token = [y_positions[(tokens, run['run_number'])] for _, run in runs.iterrows()]
-        group_positions[tokens] = sum(y_positions_for_token) / len(y_positions_for_token)
-        plt.axhline(y=min(y_positions_for_token) - bar_height,
-                    color='white', alpha=0.1, linestyle='-')
+        runs = df[df["target_tokens"] == tokens]
+        y_positions_for_token = [
+            y_positions[(tokens, run["run_number"])] for _, run in runs.iterrows()
+        ]
+        group_positions[tokens] = sum(y_positions_for_token) / len(
+            y_positions_for_token
+        )
+        plt.axhline(
+            y=min(y_positions_for_token) - bar_height,
+            color="white",
+            alpha=0.1,
+            linestyle="-",
+        )
 
     # Calculate mean audio length for each token group
     audio_lengths = {}
     for tokens in unique_tokens:
-        token_runs = df[df['target_tokens'] == tokens]
-        audio_lengths[tokens] = token_runs['audio_length'].mean()
+        token_runs = df[df["target_tokens"] == tokens]
+        audio_lengths[tokens] = token_runs["audio_length"].mean()
 
     # Set y-ticks at group centers with token counts and audio lengths
     plt.yticks(
         list(group_positions.values()),
-        [f'{tokens} tokens\n({audio_lengths[tokens]:.1f}s)' for tokens in group_positions.keys()],
-        fontsize=10
+        [
+            f"{tokens} tokens\n({audio_lengths[tokens]:.1f}s)"
+            for tokens in group_positions.keys()
+        ],
+        fontsize=10,
     )
 
     # Customize appearance
     setup_plot(
-        fig, ax,
-        "Time-To-Audio Latency" + suffix,
+        fig,
+        ax,
+        prefix.upper() + " Time-To-Audio Latency " + suffix,
         xlabel="Time (seconds)",
-        ylabel="Input Size"
+        ylabel="Input Size",
    )
 
     plt.tight_layout()
     plt.savefig(output_path, dpi=300, bbox_inches="tight")
     plt.close()
 
 
 def plot_correlation(df, x, y, title, xlabel, ylabel, output_path):
     """Create correlation plot with regression line and correlation coefficient.
 
     Args:
         df: pandas DataFrame containing the data
         x: str, column name for x-axis
@@ -277,28 +370,40 @@ def plot_correlation(df, x, y, title, xlabel, ylabel, output_path):
         output_path: str, path to save the output plot
     """
     plt.style.use("dark_background")
 
     fig, ax = plt.subplots(figsize=(12, 8))
 
     # Scatter plot
-    sns.scatterplot(data=df, x=x, y=y, s=100, alpha=0.6,
-                    color=STYLE_CONFIG["primary_color"])
+    sns.scatterplot(
+        data=df, x=x, y=y, s=100, alpha=0.6, color=STYLE_CONFIG["primary_color"]
+    )
 
     # Regression line
-    sns.regplot(data=df, x=x, y=y, scatter=False,
-                color=STYLE_CONFIG["secondary_color"],
-                line_kws={"linewidth": 2})
+    sns.regplot(
+        data=df,
+        x=x,
+        y=y,
+        scatter=False,
+        color=STYLE_CONFIG["secondary_color"],
+        line_kws={"linewidth": 2},
+    )
 
     # Add correlation coefficient
     corr = df[x].corr(df[y])
-    plt.text(0.05, 0.95, f"Correlation: {corr:.2f}",
-             transform=ax.transAxes,
-             fontsize=STYLE_CONFIG["font_sizes"]["text"],
-             color=STYLE_CONFIG["text_color"],
-             bbox=dict(facecolor=STYLE_CONFIG["background_color"],
-                       edgecolor=STYLE_CONFIG["text_color"],
-                       alpha=0.7))
+    plt.text(
+        0.05,
+        0.95,
+        f"Correlation: {corr:.2f}",
+        transform=ax.transAxes,
+        fontsize=STYLE_CONFIG["font_sizes"]["text"],
+        color=STYLE_CONFIG["text_color"],
+        bbox=dict(
+            facecolor=STYLE_CONFIG["background_color"],
+            edgecolor=STYLE_CONFIG["text_color"],
+            alpha=0.7,
+        ),
+    )
 
     setup_plot(fig, ax, title, xlabel=xlabel, ylabel=ylabel)
     plt.savefig(output_path, dpi=300, bbox_inches="tight")
     plt.close()
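
For orientation, the plotting entry points above are invoked by the benchmark scripts roughly like this (a sketch with made-up values and a hypothetical output path):

```python
import pandas as pd

df = pd.DataFrame({"tokens": [100, 200, 500], "processing_time": [0.9, 1.6, 3.8]})
plot_correlation(
    df,
    "tokens",
    "processing_time",
    "Processing Time vs Input Size",
    "Number of Input Tokens",
    "Processing Time (seconds)",
    "processing_time_example.png",  # hypothetical path
)
```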
@ -1,9 +1,10 @@
|
|||
"""Shared utilities for benchmarks and tests."""
|
||||
|
||||
import os
|
||||
import json
|
||||
import subprocess
|
||||
from typing import Any, Dict, List, Union, Optional
|
||||
from datetime import datetime
|
||||
from typing import Any, Dict, List, Optional, Union
|
||||
|
||||
import psutil
|
||||
import scipy.io.wavfile as wavfile
|
||||
|
@@ -12,28 +13,46 @@ import scipy.io.wavfile as wavfile
TORCH_AVAILABLE = False
try:
    import torch

    TORCH_AVAILABLE = torch.cuda.is_available()
except ImportError:
    pass


def check_audio_file_is_silent(audio_path: str, threshold: float = 0.01) -> bool:
    """Check if an audio file is silent by comparing peak amplitude to a threshold.

    Args:
        audio_path: Path to the audio file
        threshold: Peak amplitude threshold for silence

    Returns:
        bool: True if audio is silent, False otherwise
    """
    rate, data = wavfile.read(audio_path)
    peak_amplitude = max(abs(data.min()), abs(data.max())) / 32768.0  # 16-bit audio

    return peak_amplitude < threshold


def get_audio_length(audio_data: bytes, temp_dir: str = None) -> float:
    """Get audio length in seconds from bytes data.

    Args:
        audio_data: Raw audio bytes
        temp_dir: Directory for temporary file. If None, uses system temp directory.

    Returns:
        float: Audio length in seconds
    """
    if temp_dir is None:
        import tempfile

        temp_dir = tempfile.gettempdir()

    temp_path = os.path.join(temp_dir, "temp.wav")
    os.makedirs(temp_dir, exist_ok=True)

    with open(temp_path, "wb") as f:
        f.write(audio_data)
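As a quick illustration of the silence check above, here is a minimal sketch (not part of the diff; the test tone, file names, and flat import path are assumptions) that exercises it against a generated tone and pure silence:

```python
# Sketch: verify check_audio_file_is_silent on known inputs.
import numpy as np
import scipy.io.wavfile as wavfile

from shared_utils import check_audio_file_is_silent  # assumed import path

sample_rate = 24000
t = np.linspace(0, 1.0, sample_rate, endpoint=False)

# A 440 Hz tone at half full scale: peak amplitude ~0.5, well above 0.01
tone = (0.5 * np.sin(2 * np.pi * 440.0 * t) * 32767).astype(np.int16)
wavfile.write("tone.wav", sample_rate, tone)
assert not check_audio_file_is_silent("tone.wav")

# All zeros: peak amplitude 0.0, below the 0.01 threshold
silence = np.zeros(sample_rate, dtype=np.int16)
wavfile.write("silence.wav", sample_rate, silence)
assert check_audio_file_is_silent("silence.wav")
```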
@@ -47,11 +66,11 @@ def get_audio_length(audio_data: bytes, temp_dir: str = None) -> float:

def get_gpu_memory(average: bool = True) -> Optional[Union[float, List[float]]]:
    """Get GPU memory usage using PyTorch if available, falling back to nvidia-smi.

    Args:
        average: If True and multiple GPUs present, returns average memory usage.
            If False, returns list of memory usage per GPU.

    Returns:
        float or List[float] or None: GPU memory usage in MB. Returns None if no GPU available.
            If average=False and multiple GPUs present, returns list of values.
@@ -60,19 +79,23 @@ def get_gpu_memory(average: bool = True) -> Optional[Union[float, List[float]]]:
         n_gpus = torch.cuda.device_count()
         memory_used = []
         for i in range(n_gpus):
-            memory_used.append(torch.cuda.memory_allocated(i) / 1024**2)  # Convert to MB
+            memory_used.append(
+                torch.cuda.memory_allocated(i) / 1024**2
+            )  # Convert to MB

         if average and len(memory_used) > 0:
             return sum(memory_used) / len(memory_used)
         return memory_used if len(memory_used) > 1 else memory_used[0]

     # Fall back to nvidia-smi
     try:
         result = subprocess.check_output(
             ["nvidia-smi", "--query-gpu=memory.used", "--format=csv,nounits,noheader"]
         )
-        memory_values = [float(x.strip()) for x in result.decode("utf-8").split("\n") if x.strip()]
+        memory_values = [
+            float(x.strip()) for x in result.decode("utf-8").split("\n") if x.strip()
+        ]

         if average and len(memory_values) > 0:
             return sum(memory_values) / len(memory_values)
         return memory_values if len(memory_values) > 1 else memory_values[0]
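A small usage sketch (illustrative only; `synthesize` is a hypothetical stand-in for any model call) showing how the helper can bracket a workload to attribute its memory footprint:

```python
# Sketch: sample GPU memory before and after a workload.
from shared_utils import get_gpu_memory  # assumed import path


def synthesize():
    """Hypothetical stand-in for a model call."""
    pass


before = get_gpu_memory(average=True)  # MB, or None if no GPU is available
synthesize()
after = get_gpu_memory(average=True)

if before is not None and after is not None:
    print(f"GPU memory delta: {after - before:.1f} MB")
```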
@@ -82,14 +105,14 @@ def get_gpu_memory(average: bool = True) -> Optional[Union[float, List[float]]]:

def get_system_metrics() -> Dict[str, Union[str, float]]:
    """Get current system metrics including CPU, RAM, and GPU if available.

    Returns:
        dict: System metrics including timestamp, CPU%, RAM%, RAM GB, and GPU MB if available
    """
    # Get per-CPU percentages and calculate average
    cpu_percentages = psutil.cpu_percent(percpu=True)
    avg_cpu = sum(cpu_percentages) / len(cpu_percentages)

    metrics = {
        "timestamp": datetime.now().isoformat(),
        "cpu_percent": round(avg_cpu, 2),
@@ -106,40 +129,40 @@ def get_system_metrics() -> Dict[str, Union[str, float]]:

 def save_audio_file(audio_data: bytes, identifier: str, output_dir: str) -> str:
     """Save audio data to a file with proper naming and directory creation.

     Args:
         audio_data: Raw audio bytes
         identifier: String to identify this audio file (e.g. token count, test name)
         output_dir: Directory to save the file

     Returns:
         str: Path to the saved audio file
     """
     os.makedirs(output_dir, exist_ok=True)
     output_file = os.path.join(output_dir, f"{identifier}.wav")

     with open(output_file, "wb") as f:
         f.write(audio_data)

     return output_file


 def write_benchmark_stats(stats: List[Dict[str, Any]], output_file: str) -> None:
     """Write benchmark statistics to a file in a clean, organized format.

     Args:
         stats: List of dictionaries containing stat name/value pairs
         output_file: Path to output file
     """
     os.makedirs(os.path.dirname(output_file), exist_ok=True)

     with open(output_file, "w") as f:
         for section in stats:
             # Write section header
             f.write(f"=== {section['title']} ===\n\n")

             # Write stats
-            for label, value in section['stats'].items():
+            for label, value in section["stats"].items():
                 if isinstance(value, float):
                     f.write(f"{label}: {value:.2f}\n")
                 else:
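The `stats` argument is a list of sections, each with a `title` and a `stats` mapping. A minimal sketch (values illustrative, not from a real run) that would produce output in the same shape as the benchmark_stats text files removed later in this diff:

```python
# Sketch: the section structure write_benchmark_stats expects.
from shared_utils import write_benchmark_stats  # assumed import path

sections = [
    {
        "title": "Overall Stats",
        "stats": {
            "Total tokens processed": 5500,
            "Average RTF": 0.47,  # floats are formatted to two decimals
        },
    },
    {
        "title": "Performance Ranges",
        # non-floats presumably pass through unformatted
        # (the else branch is truncated in the hunk above)
        "stats": {"RTF range": "0.44x - 0.55x"},
    },
]
write_benchmark_stats(sections, "output/benchmark_stats.txt")
```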
@@ -149,7 +172,7 @@ def write_benchmark_stats(stats: List[Dict[str, Any]], output_file: str) -> None:

def save_json_results(results: Dict[str, Any], output_file: str) -> None:
    """Save benchmark results to a JSON file with proper formatting.

    Args:
        results: Dictionary of results to save
        output_file: Path to output file
@@ -159,14 +182,16 @@ def save_json_results(results: Dict[str, Any], output_file: str) -> None:
         json.dump(results, f, indent=2)


-def real_time_factor(processing_time: float, audio_length: float, decimals: int = 2) -> float:
+def real_time_factor(
+    processing_time: float, audio_length: float, decimals: int = 2
+) -> float:
     """Calculate Real-Time Factor (RTF) as processing-time / length-of-audio.

     Args:
         processing_time: Time taken to process/generate audio
         audio_length: Length of the generated audio
         decimals: Number of decimal places to round to

     Returns:
         float: RTF value
     """
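Since RTF is simply processing time divided by audio duration, a quick worked example (numbers illustrative, in the range of the GPU results further down) shows how it relates to real-time speed: RTF below 1 means generation is faster than playback.

```python
# Sketch: RTF and its inverse, real-time speed.
processing_time = 4.8  # seconds spent generating
audio_length = 157.9   # seconds of audio produced

rtf = round(processing_time / audio_length, 2)            # 0.03, well under real time
real_time_speed = round(audio_length / processing_time)   # ~33x faster than playback
print(f"RTF: {rtf}x, real-time speed: ~{real_time_speed}x")
```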
205
examples/assorted_checks/benchmarks/lib/stream_utils.py
Normal file
@@ -0,0 +1,205 @@
#!/usr/bin/env python3
import os
import time
import wave
from typing import Any, Dict, List, Callable, Optional

import pandas as pd
import scipy.io.wavfile as wavfile

from .shared_utils import save_json_results
from .shared_plotting import plot_timeline, plot_correlation
from .shared_benchmark_utils import enc, get_text_for_tokens


def check_audio_silence(audio_path: str) -> bool:
    """Check if audio file contains only silence"""
    sample_rate, audio_data = wavfile.read(audio_path)
    # Convert to float for RMS calculation
    audio_float = audio_data.astype(float)
    # Calculate RMS value
    rms = (audio_float**2).mean() ** 0.5
    # Define silence threshold (adjust if needed)
    SILENCE_THRESHOLD = 50.0
    return rms < SILENCE_THRESHOLD


def process_benchmark_results(
    all_results: List[Dict[str, Any]], token_sizes: List[int]
) -> Dict[str, Any]:
    """Process benchmark results and generate summary"""
    summary = {}
    for tokens in token_sizes:
        matching_results = [
            r for r in all_results if r["target_tokens"] == tokens and not r["error"]
        ]
        if matching_results:
            avg_first_chunk = sum(
                r["time_to_first_chunk"] for r in matching_results
            ) / len(matching_results)
            avg_total = sum(r["total_time"] for r in matching_results) / len(
                matching_results
            )
            avg_audio_length = sum(r["audio_length"] for r in matching_results) / len(
                matching_results
            )
            summary[tokens] = {
                "avg_time_to_first_chunk": round(avg_first_chunk, 3),
                "avg_total_time": round(avg_total, 3),
                "avg_audio_length": round(avg_audio_length, 3),
                "num_successful_runs": len(matching_results),
            }
    return summary


def save_benchmark_results(
    all_results: List[Dict[str, Any]],
    summary: Dict[str, Any],
    output_data_dir: str,
    output_plots_dir: str,
    suffix: str,
    plot_title_suffix: str,
    prefix: str = "",
):
    """Save benchmark results and generate plots"""
    # Save results
    results_data = {
        "individual_runs": all_results,
        "summary": summary,
        "timestamp": time.strftime("%Y-%m-%d %H:%M:%S"),
    }
    save_json_results(
        results_data,
        os.path.join(output_data_dir, f"{prefix}first_token_benchmark{suffix}.json"),
    )

    # Create DataFrame for plotting
    df = pd.DataFrame(all_results)

    # Create plots
    plot_correlation(
        df,
        "target_tokens",
        "time_to_first_chunk",
        f"Time to First Audio vs Input Size {plot_title_suffix}",
        "Number of Input Tokens",
        "Time to First Audio (seconds)",
        os.path.join(output_plots_dir, f"{prefix}first_token_latency{suffix}.png"),
    )

    plot_correlation(
        df,
        "target_tokens",
        "total_time",
        f"Total Time vs Input Size {plot_title_suffix}",
        "Number of Input Tokens",
        "Total Time (seconds)",
        os.path.join(output_plots_dir, f"{prefix}total_time_latency{suffix}.png"),
    )

    plot_timeline(
        df,
        os.path.join(output_plots_dir, f"{prefix}first_token_timeline{suffix}.png"),
        suffix=plot_title_suffix,
    )


def run_benchmark(
    measure_func: Callable,
    output_dir: str,
    output_data_dir: str,
    output_plots_dir: str,
    suffix: str = "",
    plot_title_suffix: str = "",
    num_runs: int = 5,
    client=None,
    prefix="",
):
    """Run benchmark with the given measurement function"""
    # Create output directories
    os.makedirs(output_dir, exist_ok=True)
    os.makedirs(output_data_dir, exist_ok=True)
    os.makedirs(output_plots_dir, exist_ok=True)

    # Load sample text
    script_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
    with open(
        os.path.join(script_dir, "the_time_machine_hg_wells.txt"), "r", encoding="utf-8"
    ) as f:
        text = f.read()

    # Test specific token counts
    token_sizes = [10, 50, 100, 250, 500]
    all_results = []
    silent_files = []

    for tokens in token_sizes:
        print(
            f"\nTesting {tokens} tokens{' ' + plot_title_suffix if plot_title_suffix else ''}"
        )
        test_text = get_text_for_tokens(text, tokens)
        actual_tokens = len(enc.encode(test_text))
        print(f"Text preview: {test_text[:50]}...")

        for i in range(num_runs):
            print(f"Run {i+1}/{num_runs}...")
            result = measure_func(test_text, output_dir, tokens, i + 1)
            result["target_tokens"] = tokens
            result["actual_tokens"] = actual_tokens
            result["run_number"] = i + 1

            # Handle time to first audio
            first_chunk = result.get('time_to_first_chunk')
            print(
                f"Time to First Audio: {f'{first_chunk:.3f}s' if first_chunk is not None else 'N/A'}"
            )

            # Handle total time
            total_time = result.get('total_time')
            print(
                f"Time to Save Complete: {f'{total_time:.3f}s' if total_time is not None else 'N/A'}"
            )

            # Handle audio length
            audio_length = result.get('audio_length')
            print(
                f"Audio length: {f'{audio_length:.3f}s' if audio_length is not None else 'N/A'}"
            )
            # Calculate streaming overhead only if both values exist
            if total_time is not None and first_chunk is not None:
                print(f"Streaming overhead: {(total_time - first_chunk):.3f}s")
            else:
                print("Streaming overhead: N/A")

            if result["error"]:
                print(f"Error: {result['error']}")
            elif result["audio_path"] and check_audio_silence(result["audio_path"]):
                silent_files.append(result["audio_path"])

            all_results.append(result)

    # Process and save results
    summary = process_benchmark_results(all_results, token_sizes)
    save_benchmark_results(
        all_results,
        summary,
        output_data_dir,
        output_plots_dir,
        suffix,
        plot_title_suffix,
    )

    # Print paths
    print("\nResults and plots saved to:")
    print(f"- {os.path.join(output_data_dir, f'{prefix}first_token_benchmark{suffix}.json')}")
    print(f"- {os.path.join(output_plots_dir, f'{prefix}first_token_latency{suffix}.png')}")
    print(f"- {os.path.join(output_plots_dir, f'{prefix}total_time_latency{suffix}.png')}")
    print(f"- {os.path.join(output_plots_dir, f'{prefix}first_token_timeline{suffix}.png')}")

    # Print silence check summary
    if silent_files:
        print("\nWARNING: The following files contain only silence:")
        for file in silent_files:
            print(f"- {file}")
    else:
        print("\nAll generated audio files contain valid audio content.")
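A hypothetical driver for the new helper (not part of the diff): `run_benchmark` only assumes that `measure_func` returns a dict with the fields read above (`error`, `audio_path`, and the timing/length keys). The endpoint URL, payload, and `lib.stream_utils` import path are assumptions for illustration.

```python
# Sketch: a non-streaming measure_func wired into run_benchmark.
import os
import time
import wave

import requests

from lib.stream_utils import run_benchmark  # assumes the benchmarks dir layout


def measure_once(text, output_dir, target_tokens, run_number):
    start = time.time()
    resp = requests.post(
        "http://localhost:8880/v1/audio/speech",  # illustrative endpoint
        json={"model": "kokoro", "input": text, "voice": "af"},
    )
    total = time.time() - start

    path = os.path.join(output_dir, f"benchmark_tokens{target_tokens}_run{run_number}.wav")
    with open(path, "wb") as f:
        f.write(resp.content)
    with wave.open(path, "rb") as w:
        audio_length = w.getnframes() / w.getframerate()

    return {
        "text_length": len(text),
        "token_count": None,
        "total_time": total,
        "time_to_first_chunk": total,  # non-streaming: first audio == full response
        "error": None if resp.ok else resp.text,
        "audio_path": path,
        "audio_length": audio_length,
    }


run_benchmark(
    measure_once,
    output_dir="output_audio",
    output_data_dir="output_data",
    output_plots_dir="output_plots",
    suffix="_example",
)
```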
@@ -1,111 +0,0 @@
{
  "results": [
    {"tokens": 100, "processing_time": 18.833295583724976, "output_length": 31.15, "realtime_factor": 1.6539856161403135, "elapsed_time": 19.024322748184204},
    {"tokens": 200, "processing_time": 38.95506024360657, "output_length": 62.6, "realtime_factor": 1.6069799304257042, "elapsed_time": 58.21527123451233},
    {"tokens": 300, "processing_time": 49.74252939224243, "output_length": 96.325, "realtime_factor": 1.9364716908630366, "elapsed_time": 108.19673728942871},
    {"tokens": 400, "processing_time": 61.349056243896484, "output_length": 128.575, "realtime_factor": 2.095794261102292, "elapsed_time": 169.733656167984},
    {"tokens": 500, "processing_time": 82.86568236351013, "output_length": 158.575, "realtime_factor": 1.9136389815071193, "elapsed_time": 252.7968451976776}
  ],
  "system_metrics": [
    {"timestamp": "2025-01-03T00:13:49.865330", "cpu_percent": 8.0, "ram_percent": 39.4, "ram_used_gb": 25.03811264038086, "gpu_memory_used": 1204.0},
    {"timestamp": "2025-01-03T00:14:08.781551", "cpu_percent": 26.8, "ram_percent": 42.6, "ram_used_gb": 27.090862274169922, "gpu_memory_used": 1225.0},
    {"timestamp": "2025-01-03T00:14:08.916973", "cpu_percent": 16.1, "ram_percent": 42.6, "ram_used_gb": 27.089553833007812, "gpu_memory_used": 1225.0},
    {"timestamp": "2025-01-03T00:14:47.979053", "cpu_percent": 31.5, "ram_percent": 43.6, "ram_used_gb": 27.714427947998047, "gpu_memory_used": 1225.0},
    {"timestamp": "2025-01-03T00:14:48.098976", "cpu_percent": 20.0, "ram_percent": 43.6, "ram_used_gb": 27.704315185546875, "gpu_memory_used": 1211.0},
    {"timestamp": "2025-01-03T00:15:37.944729", "cpu_percent": 29.7, "ram_percent": 38.6, "ram_used_gb": 24.53925323486328, "gpu_memory_used": 1217.0},
    {"timestamp": "2025-01-03T00:15:38.071915", "cpu_percent": 8.6, "ram_percent": 38.5, "ram_used_gb": 24.51690673828125, "gpu_memory_used": 1208.0},
    {"timestamp": "2025-01-03T00:16:39.525449", "cpu_percent": 23.4, "ram_percent": 38.8, "ram_used_gb": 24.71230697631836, "gpu_memory_used": 1221.0},
    {"timestamp": "2025-01-03T00:16:39.612442", "cpu_percent": 5.5, "ram_percent": 38.9, "ram_used_gb": 24.72066879272461, "gpu_memory_used": 1221.0},
    {"timestamp": "2025-01-03T00:18:02.569076", "cpu_percent": 27.4, "ram_percent": 39.1, "ram_used_gb": 24.868202209472656, "gpu_memory_used": 1264.0}
  ]
}
@@ -1,216 +0,0 @@
{
  "results": [
    {"tokens": 100, "processing_time": 14.349808931350708, "output_length": 31.15, "rtf": 0.46, "elapsed_time": 14.716031074523926},
    {"tokens": 200, "processing_time": 28.341803312301636, "output_length": 62.6, "rtf": 0.45, "elapsed_time": 43.44207406044006},
    {"tokens": 300, "processing_time": 43.352553606033325, "output_length": 96.325, "rtf": 0.45, "elapsed_time": 87.26906609535217},
    {"tokens": 400, "processing_time": 71.02449822425842, "output_length": 128.575, "rtf": 0.55, "elapsed_time": 158.7198133468628},
    {"tokens": 500, "processing_time": 70.92521691322327, "output_length": 158.575, "rtf": 0.45, "elapsed_time": 230.01379895210266},
    {"tokens": 600, "processing_time": 83.6328592300415, "output_length": 189.25, "rtf": 0.44, "elapsed_time": 314.02610969543457},
    {"tokens": 700, "processing_time": 103.0810194015503, "output_length": 222.075, "rtf": 0.46, "elapsed_time": 417.5678551197052},
    {"tokens": 800, "processing_time": 127.02162909507751, "output_length": 253.85, "rtf": 0.5, "elapsed_time": 545.0128681659698},
    {"tokens": 900, "processing_time": 130.49781227111816, "output_length": 283.775, "rtf": 0.46, "elapsed_time": 675.8943417072296},
    {"tokens": 1000, "processing_time": 154.76425909996033, "output_length": 315.475, "rtf": 0.49, "elapsed_time": 831.0677945613861}
  ],
  "system_metrics": [
    {"timestamp": "2025-01-03T00:23:52.896889", "cpu_percent": 4.5, "ram_percent": 39.1, "ram_used_gb": 24.86032485961914, "gpu_memory_used": 1281.0},
    {"timestamp": "2025-01-03T00:24:07.429461", "cpu_percent": 4.5, "ram_percent": 39.1, "ram_used_gb": 24.847564697265625, "gpu_memory_used": 1285.0},
    {"timestamp": "2025-01-03T00:24:07.620587", "cpu_percent": 2.7, "ram_percent": 39.1, "ram_used_gb": 24.846607208251953, "gpu_memory_used": 1275.0},
    {"timestamp": "2025-01-03T00:24:36.140754", "cpu_percent": 5.4, "ram_percent": 39.1, "ram_used_gb": 24.857810974121094, "gpu_memory_used": 1267.0},
    {"timestamp": "2025-01-03T00:24:36.340675", "cpu_percent": 6.2, "ram_percent": 39.1, "ram_used_gb": 24.85773468017578, "gpu_memory_used": 1267.0},
    {"timestamp": "2025-01-03T00:25:19.905634", "cpu_percent": 29.1, "ram_percent": 39.2, "ram_used_gb": 24.920318603515625, "gpu_memory_used": 1256.0},
    {"timestamp": "2025-01-03T00:25:20.182219", "cpu_percent": 20.0, "ram_percent": 39.2, "ram_used_gb": 24.930198669433594, "gpu_memory_used": 1256.0},
    {"timestamp": "2025-01-03T00:26:31.414760", "cpu_percent": 5.3, "ram_percent": 39.5, "ram_used_gb": 25.127891540527344, "gpu_memory_used": 1259.0},
    {"timestamp": "2025-01-03T00:26:31.617256", "cpu_percent": 3.6, "ram_percent": 39.5, "ram_used_gb": 25.126346588134766, "gpu_memory_used": 1252.0},
    {"timestamp": "2025-01-03T00:27:42.736097", "cpu_percent": 10.5, "ram_percent": 39.5, "ram_used_gb": 25.100231170654297, "gpu_memory_used": 1249.0},
    {"timestamp": "2025-01-03T00:27:42.912870", "cpu_percent": 5.3, "ram_percent": 39.5, "ram_used_gb": 25.098285675048828, "gpu_memory_used": 1249.0},
    {"timestamp": "2025-01-03T00:29:06.725264", "cpu_percent": 8.9, "ram_percent": 39.5, "ram_used_gb": 25.123123168945312, "gpu_memory_used": 1239.0},
    {"timestamp": "2025-01-03T00:29:06.928826", "cpu_percent": 5.5, "ram_percent": 39.5, "ram_used_gb": 25.128646850585938, "gpu_memory_used": 1239.0},
    {"timestamp": "2025-01-03T00:30:50.206349", "cpu_percent": 49.6, "ram_percent": 39.6, "ram_used_gb": 25.162948608398438, "gpu_memory_used": 1245.0},
    {"timestamp": "2025-01-03T00:30:50.491837", "cpu_percent": 14.8, "ram_percent": 39.5, "ram_used_gb": 25.13379669189453, "gpu_memory_used": 1245.0},
    {"timestamp": "2025-01-03T00:32:57.721467", "cpu_percent": 6.2, "ram_percent": 39.6, "ram_used_gb": 25.187721252441406, "gpu_memory_used": 1384.0},
    {"timestamp": "2025-01-03T00:32:57.913350", "cpu_percent": 3.6, "ram_percent": 39.6, "ram_used_gb": 25.199390411376953, "gpu_memory_used": 1384.0},
    {"timestamp": "2025-01-03T00:35:08.608730", "cpu_percent": 6.3, "ram_percent": 39.8, "ram_used_gb": 25.311710357666016, "gpu_memory_used": 1330.0},
    {"timestamp": "2025-01-03T00:35:08.791851", "cpu_percent": 5.3, "ram_percent": 39.8, "ram_used_gb": 25.326683044433594, "gpu_memory_used": 1333.0},
    {"timestamp": "2025-01-03T00:37:43.782406", "cpu_percent": 6.8, "ram_percent": 40.6, "ram_used_gb": 25.803058624267578, "gpu_memory_used": 1409.0}
  ]
}
@@ -1,300 +0,0 @@
{
  "results": [
    {"tokens": 100, "processing_time": 0.96, "output_length": 31.1, "rtf": 0.03, "elapsed_time": 1.11},
    {"tokens": 250, "processing_time": 2.23, "output_length": 77.17, "rtf": 0.03, "elapsed_time": 3.49},
    {"tokens": 400, "processing_time": 4.05, "output_length": 128.05, "rtf": 0.03, "elapsed_time": 7.77},
    {"tokens": 550, "processing_time": 4.06, "output_length": 171.45, "rtf": 0.02, "elapsed_time": 12.0},
    {"tokens": 700, "processing_time": 6.01, "output_length": 221.6, "rtf": 0.03, "elapsed_time": 18.16},
    {"tokens": 850, "processing_time": 6.9, "output_length": 269.1, "rtf": 0.03, "elapsed_time": 25.21},
    {"tokens": 1000, "processing_time": 7.65, "output_length": 315.05, "rtf": 0.02, "elapsed_time": 33.03},
    {"tokens": 6000, "processing_time": 48.7, "output_length": 1837.1, "rtf": 0.03, "elapsed_time": 82.21},
    {"tokens": 11000, "processing_time": 92.44, "output_length": 3388.57, "rtf": 0.03, "elapsed_time": 175.46},
    {"tokens": 16000, "processing_time": 163.61, "output_length": 4977.32, "rtf": 0.03, "elapsed_time": 340.46},
    {"tokens": 21000, "processing_time": 209.72, "output_length": 6533.3, "rtf": 0.03, "elapsed_time": 551.92},
    {"tokens": 26000, "processing_time": 329.35, "output_length": 8068.15, "rtf": 0.04, "elapsed_time": 883.37},
    {"tokens": 31000, "processing_time": 473.52, "output_length": 9611.48, "rtf": 0.05, "elapsed_time": 1359.28},
    {"tokens": 36000, "processing_time": 650.98, "output_length": 11157.15, "rtf": 0.06, "elapsed_time": 2012.9}
  ],
  "system_metrics": [
    {"timestamp": "2025-01-03T14:41:01.331735", "cpu_percent": 7.5, "ram_percent": 50.2, "ram_used_gb": 31.960269927978516, "gpu_memory_used": 3191.0},
    {"timestamp": "2025-01-03T14:41:02.357116", "cpu_percent": 17.01, "ram_percent": 50.2, "ram_used_gb": 31.96163558959961, "gpu_memory_used": 3426.0},
    {"timestamp": "2025-01-03T14:41:02.445009", "cpu_percent": 9.5, "ram_percent": 50.3, "ram_used_gb": 31.966781616210938, "gpu_memory_used": 3426.0},
    {"timestamp": "2025-01-03T14:41:04.742152", "cpu_percent": 18.27, "ram_percent": 50.4, "ram_used_gb": 32.08788299560547, "gpu_memory_used": 3642.0},
    {"timestamp": "2025-01-03T14:41:04.847795", "cpu_percent": 16.27, "ram_percent": 50.5, "ram_used_gb": 32.094364166259766, "gpu_memory_used": 3640.0},
    {"timestamp": "2025-01-03T14:41:09.019590", "cpu_percent": 15.97, "ram_percent": 50.7, "ram_used_gb": 32.23244094848633, "gpu_memory_used": 3640.0},
    {"timestamp": "2025-01-03T14:41:09.110324", "cpu_percent": 3.54, "ram_percent": 50.7, "ram_used_gb": 32.234458923339844, "gpu_memory_used": 3640.0},
    {"timestamp": "2025-01-03T14:41:13.252607", "cpu_percent": 13.4, "ram_percent": 50.6, "ram_used_gb": 32.194271087646484, "gpu_memory_used": 3935.0},
    {"timestamp": "2025-01-03T14:41:13.327557", "cpu_percent": 4.69, "ram_percent": 50.6, "ram_used_gb": 32.191776275634766, "gpu_memory_used": 3935.0},
    {"timestamp": "2025-01-03T14:41:19.413633", "cpu_percent": 12.92, "ram_percent": 50.9, "ram_used_gb": 32.3467903137207, "gpu_memory_used": 4250.0},
    {"timestamp": "2025-01-03T14:41:19.492758", "cpu_percent": 7.5, "ram_percent": 50.8, "ram_used_gb": 32.34375, "gpu_memory_used": 4250.0},
    {"timestamp": "2025-01-03T14:41:26.467284", "cpu_percent": 13.09, "ram_percent": 51.2, "ram_used_gb": 32.56281280517578, "gpu_memory_used": 4249.0},
    {"timestamp": "2025-01-03T14:41:26.553559", "cpu_percent": 8.39, "ram_percent": 51.2, "ram_used_gb": 32.56183624267578, "gpu_memory_used": 4249.0},
    {"timestamp": "2025-01-03T14:41:34.284362", "cpu_percent": 12.61, "ram_percent": 51.7, "ram_used_gb": 32.874778747558594, "gpu_memory_used": 4250.0},
    {"timestamp": "2025-01-03T14:41:34.362353", "cpu_percent": 1.25, "ram_percent": 51.7, "ram_used_gb": 32.87461471557617, "gpu_memory_used": 4250.0},
    {"timestamp": "2025-01-03T14:42:23.471312", "cpu_percent": 11.64, "ram_percent": 54.9, "ram_used_gb": 34.90264129638672, "gpu_memory_used": 4647.0},
    {"timestamp": "2025-01-03T14:42:23.547203", "cpu_percent": 5.31, "ram_percent": 54.9, "ram_used_gb": 34.91563415527344, "gpu_memory_used": 4647.0},
    {"timestamp": "2025-01-03T14:43:56.724933", "cpu_percent": 12.97, "ram_percent": 59.5, "ram_used_gb": 37.84241485595703, "gpu_memory_used": 4655.0},
    {"timestamp": "2025-01-03T14:43:56.815453", "cpu_percent": 11.75, "ram_percent": 59.5, "ram_used_gb": 37.832679748535156, "gpu_memory_used": 4655.0},
    {"timestamp": "2025-01-03T14:46:41.705155", "cpu_percent": 12.94, "ram_percent": 66.3, "ram_used_gb": 42.1534538269043, "gpu_memory_used": 4729.0},
    {"timestamp": "2025-01-03T14:46:41.835177", "cpu_percent": 7.73, "ram_percent": 66.2, "ram_used_gb": 42.13554000854492, "gpu_memory_used": 4729.0},
    {"timestamp": "2025-01-03T14:50:13.166236", "cpu_percent": 11.62, "ram_percent": 73.4, "ram_used_gb": 46.71288299560547, "gpu_memory_used": 4676.0},
    {"timestamp": "2025-01-03T14:50:13.261611", "cpu_percent": 8.16, "ram_percent": 73.4, "ram_used_gb": 46.71356201171875, "gpu_memory_used": 4676.0},
    {"timestamp": "2025-01-03T14:55:44.623607", "cpu_percent": 12.92, "ram_percent": 82.8, "ram_used_gb": 52.65533447265625, "gpu_memory_used": 4636.0},
    {"timestamp": "2025-01-03T14:55:44.735410", "cpu_percent": 15.29, "ram_percent": 82.7, "ram_used_gb": 52.63290786743164, "gpu_memory_used": 4636.0},
    {"timestamp": "2025-01-03T15:03:40.534449", "cpu_percent": 13.88, "ram_percent": 85.0, "ram_used_gb": 54.050071716308594, "gpu_memory_used": 4771.0},
    {"timestamp": "2025-01-03T15:03:40.638708", "cpu_percent": 12.21, "ram_percent": 85.0, "ram_used_gb": 54.053733825683594, "gpu_memory_used": 4771.0},
    {"timestamp": "2025-01-03T15:14:34.159142", "cpu_percent": 14.51, "ram_percent": 78.1, "ram_used_gb": 49.70396423339844, "gpu_memory_used": 4739.0}
  ]
}
@@ -1,19 +0,0 @@
=== Benchmark Statistics (with correct RTF) ===

Overall Stats:
Total tokens processed: 5500
Total audio generated: 1741.65s
Total test duration: 831.07s
Average processing rate: 6.72 tokens/second
Average RTF: 0.47x

Per-chunk Stats:
Average chunk size: 550.00 tokens
Min chunk size: 100.00 tokens
Max chunk size: 1000.00 tokens
Average processing time: 82.70s
Average output length: 174.17s

Performance Ranges:
Processing rate range: 5.63 - 7.17 tokens/second
RTF range: 0.44x - 0.55x
@@ -1,9 +0,0 @@
=== Benchmark Statistics (with correct RTF) ===

Overall Stats:
Total tokens processed: 150850
Total audio generated: 46786.59s
Total test duration: 2012.90s
Average processing rate: 104.34 tokens/second
Average RTF: 0.03x
@@ -1,23 +0,0 @@
=== Benchmark Statistics (with correct RTF) ===

Total tokens processed: 1800
Total audio generated (s): 568.53
Total test duration (s): 244.10
Average processing rate (tokens/s): 7.34
Average RTF: 0.43
Average Real Time Speed: 2.33

=== Per-chunk Stats ===

Average chunk size (tokens): 600.00
Min chunk size (tokens): 300
Max chunk size (tokens): 900
Average processing time (s): 81.30
Average output length (s): 189.51

=== Performance Ranges ===

Processing rate range (tokens/s): 7.21 - 7.47
RTF range: 0.43x - 0.43x
Real Time Speed range: 2.33x - 2.33x
@@ -1,403 +0,0 @@
{
  "individual_runs": [
    {"text_length": 37, "token_count": 10, "total_time": 0.16574740409851074, "time_to_first_chunk": 0.16574740409851074, "error": null, "audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio\\benchmark_tokens10_run1.wav", "audio_length": 3.45, "target_tokens": 10, "actual_tokens": 10, "run_number": 1},
    {"text_length": 37, "token_count": 10, "total_time": 0.18812799453735352, "time_to_first_chunk": 0.18812799453735352, "error": null, "audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio\\benchmark_tokens10_run2.wav", "audio_length": 3.45, "target_tokens": 10, "actual_tokens": 10, "run_number": 2},
    {"text_length": 37, "token_count": 10, "total_time": 0.18645429611206055, "time_to_first_chunk": 0.18645429611206055, "error": null, "audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio\\benchmark_tokens10_run3.wav", "audio_length": 3.45, "target_tokens": 10, "actual_tokens": 10, "run_number": 3},
    {"text_length": 37, "token_count": 10, "total_time": 0.17632031440734863, "time_to_first_chunk": 0.17632031440734863, "error": null, "audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio\\benchmark_tokens10_run4.wav", "audio_length": 3.45, "target_tokens": 10, "actual_tokens": 10, "run_number": 4},
    {"text_length": 37, "token_count": 10, "total_time": 0.13381195068359375, "time_to_first_chunk": 0.13381195068359375, "error": null, "audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio\\benchmark_tokens10_run5.wav", "audio_length": 3.45, "target_tokens": 10, "actual_tokens": 10, "run_number": 5},
    {"text_length": 102, "token_count": 25, "total_time": 0.2086498737335205, "time_to_first_chunk": 0.2086498737335205, "error": null, "audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio\\benchmark_tokens25_run1.wav", "audio_length": 7.225, "target_tokens": 25, "actual_tokens": 25, "run_number": 1},
    {"text_length": 102, "token_count": 25, "total_time": 0.2727653980255127, "time_to_first_chunk": 0.2727653980255127, "error": null, "audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio\\benchmark_tokens25_run2.wav", "audio_length": 7.225, "target_tokens": 25, "actual_tokens": 25, "run_number": 2},
    {"text_length": 102, "token_count": 25, "total_time": 0.2096250057220459, "time_to_first_chunk": 0.2096250057220459, "error": null, "audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio\\benchmark_tokens25_run3.wav", "audio_length": 7.225, "target_tokens": 25, "actual_tokens": 25, "run_number": 3},
    {"text_length": 102, "token_count": 25, "total_time": 0.2256758213043213, "time_to_first_chunk": 0.2256758213043213, "error": null, "audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio\\benchmark_tokens25_run4.wav", "audio_length": 7.225, "target_tokens": 25, "actual_tokens": 25, "run_number": 4},
    {"text_length": 102, "token_count": 25, "total_time": 0.1945042610168457, "time_to_first_chunk": 0.1945042610168457, "error": null, "audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio\\benchmark_tokens25_run5.wav", "audio_length": 7.225, "target_tokens": 25, "actual_tokens": 25, "run_number": 5},
    {"text_length": 212, "token_count": 50, "total_time": 0.4975121021270752, "time_to_first_chunk": 0.4975121021270752, "error": null, "audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio\\benchmark_tokens50_run1.wav", "audio_length": 16.325, "target_tokens": 50, "actual_tokens": 50, "run_number": 1},
    {"text_length": 212, "token_count": 50, "total_time": 0.4518404006958008, "time_to_first_chunk": 0.4518404006958008, "error": null, "audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio\\benchmark_tokens50_run2.wav", "audio_length": 16.325, "target_tokens": 50, "actual_tokens": 50, "run_number": 2},
    {"text_length": 212, "token_count": 50, "total_time": 0.5640325546264648, "time_to_first_chunk": 0.5640325546264648, "error": null, "audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio\\benchmark_tokens50_run3.wav", "audio_length": 16.325, "target_tokens": 50, "actual_tokens": 50, "run_number": 3},
    {"text_length": 212, "token_count": 50, "total_time": 0.5305957794189453, "time_to_first_chunk": 0.5305957794189453, "error": null, "audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio\\benchmark_tokens50_run4.wav", "audio_length": 16.325, "target_tokens": 50, "actual_tokens": 50, "run_number": 4},
    {"text_length": 212, "token_count": 50, "total_time": 0.5540030002593994, "time_to_first_chunk": 0.5540030002593994, "error": null, "audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio\\benchmark_tokens50_run5.wav", "audio_length": 16.325, "target_tokens": 50, "actual_tokens": 50, "run_number": 5},
    {"text_length": 448, "token_count": 100, "total_time": 0.7963137626647949, "time_to_first_chunk": 0.7963137626647949, "error": null, "audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio\\benchmark_tokens100_run1.wav", "audio_length": 31.1, "target_tokens": 100, "actual_tokens": 100, "run_number": 1},
    {"text_length": 448, "token_count": 100, "total_time": 0.9320805072784424, "time_to_first_chunk": 0.9320805072784424, "error": null, "audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio\\benchmark_tokens100_run2.wav", "audio_length": 31.1, "target_tokens": 100, "actual_tokens": 100, "run_number": 2},
    {"text_length": 448, "token_count": 100, "total_time": 0.824256181716919, "time_to_first_chunk": 0.824256181716919, "error": null, "audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio\\benchmark_tokens100_run3.wav", "audio_length": 31.1, "target_tokens": 100, "actual_tokens": 100, "run_number": 3},
    {"text_length": 448, "token_count": 100, "total_time": 0.9034836292266846, "time_to_first_chunk": 0.9034836292266846, "error": null, "audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio\\benchmark_tokens100_run4.wav", "audio_length": 31.1, "target_tokens": 100, "actual_tokens": 100, "run_number": 4},
    {"text_length": 448, "token_count": 100, "total_time": 0.8364357948303223, "time_to_first_chunk": 0.8364357948303223, "error": null, "audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio\\benchmark_tokens100_run5.wav", "audio_length": 31.1, "target_tokens": 100, "actual_tokens": 100, "run_number": 5},
    {"text_length": 906, "token_count": 200, "total_time": 1.8122682571411133, "time_to_first_chunk": 1.8122682571411133, "error": null, "audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio\\benchmark_tokens200_run1.wav", "audio_length": 62.625, "target_tokens": 200, "actual_tokens": 200, "run_number": 1},
    {"text_length": 906, "token_count": 200, "total_time": 1.7290427684783936, "time_to_first_chunk": 1.7290427684783936, "error": null, "audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio\\benchmark_tokens200_run2.wav", "audio_length": 62.625, "target_tokens": 200, "actual_tokens": 200, "run_number": 2},
    {"text_length": 906, "token_count": 200, "total_time": 2.141728401184082, "time_to_first_chunk": 2.141728401184082, "error": null, "audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio\\benchmark_tokens200_run3.wav", "audio_length": 62.625, "target_tokens": 200, "actual_tokens": 200, "run_number": 3},
    {"text_length": 906, "token_count": 200, "total_time": 2.0155680179595947, "time_to_first_chunk": 2.0155680179595947, "error": null, "audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio\\benchmark_tokens200_run4.wav", "audio_length": 62.625, "target_tokens": 200, "actual_tokens": 200, "run_number": 4},
    {"text_length": 906, "token_count": 200, "total_time": 1.8707575798034668, "time_to_first_chunk": 1.8707575798034668, "error": null, "audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio\\benchmark_tokens200_run5.wav", "audio_length": 62.625, "target_tokens": 200, "actual_tokens": 200, "run_number": 5},
    {"text_length": 2232, "token_count": 500, "total_time": 4.822713851928711, "time_to_first_chunk": 4.822713851928711, "error": null, "audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio\\benchmark_tokens500_run1.wav", "audio_length": 157.875, "target_tokens": 500, "actual_tokens": 500, "run_number": 1},
    {"text_length": 2232, "token_count": 500, "total_time": 4.227782726287842, "time_to_first_chunk": 4.227782726287842, "error": null, "audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio\\benchmark_tokens500_run2.wav", "audio_length": 157.875, "target_tokens": 500, "actual_tokens": 500, "run_number": 2},
    {"text_length": 2232, "token_count": 500, "total_time": 4.414916276931763, "time_to_first_chunk": 4.414916276931763, "error": null, "audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio\\benchmark_tokens500_run3.wav", "audio_length": 157.875, "target_tokens": 500, "actual_tokens": 500, "run_number": 3},
    {"text_length": 2232, "token_count": 500, "total_time": 4.579505681991577, "time_to_first_chunk": 4.579505681991577, "error": null, "audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio\\benchmark_tokens500_run4.wav", "audio_length": 157.875, "target_tokens": 500, "actual_tokens": 500, "run_number": 4},
    {"text_length": 2232, "token_count": 500, "total_time": 4.332529067993164, "time_to_first_chunk": 4.332529067993164, "error": null, "audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio\\benchmark_tokens500_run5.wav", "audio_length": 157.875, "target_tokens": 500, "actual_tokens": 500, "run_number": 5}
  ],
  "summary": {
    "10": {"avg_time_to_first_chunk": 0.17, "avg_total_time": 0.17, "avg_audio_length": 3.45, "num_successful_runs": 5},
    "25": {"avg_time_to_first_chunk": 0.222, "avg_total_time": 0.222, "avg_audio_length": 7.225, "num_successful_runs": 5},
    "50": {"avg_time_to_first_chunk": 0.52, "avg_total_time": 0.52, "avg_audio_length": 16.325, "num_successful_runs": 5},
    "100": {"avg_time_to_first_chunk": 0.859, "avg_total_time": 0.859, "avg_audio_length": 31.1, "num_successful_runs": 5},
    "200": {"avg_time_to_first_chunk": 1.914, "avg_total_time": 1.914, "avg_audio_length": 62.625, "num_successful_runs": 5},
    "500": {"avg_time_to_first_chunk": 4.475, "avg_total_time": 4.475, "avg_audio_length": 157.875, "num_successful_runs": 5}
  },
  "timestamp": "2025-01-04 13:52:28"
}
@@ -1,271 +1,337 @@
 {
   "individual_runs": [
-    {"text_length": 212, "token_count": 50, "total_time": 0.7278211116790771, "time_to_first_chunk": 0.3613290786743164, "error": null, "audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens50_run1_stream.wav", "audio_length": 16.325, "target_tokens": 50, "actual_tokens": 50, "run_number": 1},
+    {"text_length": 37, "token_count": null, "total_time": 0.4376556873321533, "time_to_first_chunk": 0.4189143180847168, "error": null, "audio_path": "C:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens10_run1_stream.wav", "audio_length": 3.45, "target_tokens": 10, "actual_tokens": 10, "run_number": 1},
+    {"text_length": 37, "token_count": null, "total_time": 0.37163758277893066, "time_to_first_chunk": 0.34892702102661133, "error": null, "audio_path": "C:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens10_run2_stream.wav", "audio_length": 3.45, "target_tokens": 10, "actual_tokens": 10, "run_number": 2},
+    {"text_length": 37, "token_count": null, "total_time": 0.2654602527618408, "time_to_first_chunk": 0.2409076690673828, "error": null, "audio_path": "C:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens10_run3_stream.wav", "audio_length": 3.45, "target_tokens": 10, "actual_tokens": 10, "run_number": 3},
+    {"text_length": 37, "token_count": null, "total_time": 0.24376440048217773, "time_to_first_chunk": 0.23003816604614258, "error": null, "audio_path": "C:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens10_run4_stream.wav", "audio_length": 3.45, "target_tokens": 10, "actual_tokens": 10, "run_number": 4},
+    {"text_length": 37, "token_count": null, "total_time": 0.25968003273010254, "time_to_first_chunk": 0.24081206321716309, "error": null, "audio_path": "C:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens10_run5_stream.wav", "audio_length": 3.45, "target_tokens": 10, "actual_tokens": 10, "run_number": 5},
+    {"text_length": 212, "token_count": null, "total_time": 1.049060344696045, "time_to_first_chunk": 0.3336215019226074, "error": null, "audio_path": "C:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens50_run1_stream.wav", "audio_length": 15.925, "target_tokens": 50, "actual_tokens": 50, "run_number": 1},
-    {"text_length": 212, "token_count": 50, "total_time": 0.4556088447570801, "time_to_first_chunk": 0.18642044067382812, "error": null, "audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens50_run2_stream.wav", "audio_length": 16.325, "target_tokens": 50, "actual_tokens": 50, "run_number": 2},
+    {"text_length": 212, "token_count": null, "total_time": 0.8934676647186279, "time_to_first_chunk": 0.3011031150817871, "error": null, "audio_path": "C:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens50_run2_stream.wav", "audio_length": 15.925, "target_tokens": 50, "actual_tokens": 50, "run_number": 2},
-    {"text_length": 212, "token_count": 50, "total_time": 0.5538768768310547, "time_to_first_chunk": 0.2720797061920166, "error": null, "audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens50_run3_stream.wav", "audio_length": 16.325, "target_tokens": 50, "actual_tokens": 50, "run_number": 3},
+    {"text_length": 212, "token_count": null, "total_time": 0.9444286823272705, "time_to_first_chunk": 0.3198091983795166, "error": null, "audio_path": "C:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens50_run3_stream.wav", "audio_length": 15.925, "target_tokens": 50, "actual_tokens": 50, "run_number": 3},
-    {"text_length": 212, "token_count": 50, "total_time": 0.4395604133605957, "time_to_first_chunk": 0.15613913536071777, "error": null, "audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens50_run4_stream.wav", "audio_length": 16.325, "target_tokens": 50, "actual_tokens": 50, "run_number": 4},
+    {"text_length": 212, "token_count": null, "total_time": 0.9735183715820312, "time_to_first_chunk": 0.369948148727417, "error": null, "audio_path": "C:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens50_run4_stream.wav", "audio_length": 15.925, "target_tokens": 50, "actual_tokens": 50, "run_number": 4},
-    {"text_length": 212, "token_count": 50, "total_time": 0.45748305320739746, "time_to_first_chunk": 0.18805718421936035, "error": null, "audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens50_run5_stream.wav", "audio_length": 16.325, "target_tokens": 50, "actual_tokens": 50, "run_number": 5},
+    {"text_length": 212, "token_count": null, "total_time": 0.8089118003845215, "time_to_first_chunk": 0.30179858207702637, "error": null, "audio_path": "C:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens50_run5_stream.wav", "audio_length": 15.925, "target_tokens": 50, "actual_tokens": 50, "run_number": 5},
-    {"text_length": 448, "token_count": 100, "total_time": 0.7347762584686279, "time_to_first_chunk": 0.16963744163513184, "error": null, "audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens100_run1_stream.wav", "audio_length": 31.1, "target_tokens": 100, "actual_tokens": 100, "run_number": 1},
+    {"text_length": 448, "token_count": null, "total_time": 1.641003131866455, "time_to_first_chunk": 0.2979745864868164, "error": null, "audio_path": "C:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens100_run1_stream.wav", "audio_length": 30.5, "target_tokens": 100, "actual_tokens": 100, "run_number": 1},
-    {"text_length": 448, "token_count": 100, "total_time": 0.8288509845733643, "time_to_first_chunk": 0.20123004913330078, "error": null, "audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens100_run2_stream.wav", "audio_length": 31.1, "target_tokens": 100, "actual_tokens": 100, "run_number": 2},
+    {"text_length": 448, "token_count": null, "total_time": 1.3709619045257568, "time_to_first_chunk": 0.4272146224975586, "error": null, "audio_path": "C:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens100_run2_stream.wav", "audio_length": 30.5, "target_tokens": 100, "actual_tokens": 100, "run_number": 2},
-    {"text_length": 448, "token_count": 100, "total_time": 0.7503848075866699, "time_to_first_chunk": 0.21662068367004395, "error": null, "audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens100_run3_stream.wav", "audio_length": 31.1, "target_tokens": 100, "actual_tokens": 100, "run_number": 3},
+    {"text_length": 448, "token_count": null, "total_time": 1.2554471492767334, "time_to_first_chunk": 0.29790568351745605, "error": null, "audio_path": "C:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens100_run3_stream.wav", "audio_length": 30.5, "target_tokens": 100, "actual_tokens": 100, "run_number": 3},
-    {"text_length": 448, "token_count": 100, "total_time": 0.694899320602417, "time_to_first_chunk": 0.1966841220855713, "error": null, "audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens100_run4_stream.wav", "audio_length": 31.1, "target_tokens": 100, "actual_tokens": 100, "run_number": 4},
+    {"text_length": 448, "token_count": null, "total_time": 1.3761844635009766, "time_to_first_chunk": 0.32633328437805176, "error": null, "audio_path": "C:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens100_run4_stream.wav", "audio_length": 30.5, "target_tokens": 100, "actual_tokens": 100, "run_number": 4},
-    {"text_length": 448, "token_count": 100, "total_time": 0.68701171875, "time_to_first_chunk": 0.19341063499450684, "error": null, "audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens100_run5_stream.wav", "audio_length": 31.1, "target_tokens": 100, "actual_tokens": 100, "run_number": 5},
+    {"text_length": 448, "token_count": null, "total_time": 1.56705904006958, "time_to_first_chunk": 0.32801246643066406, "error": null, "audio_path": "C:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens100_run5_stream.wav", "audio_length": 30.5, "target_tokens": 100, "actual_tokens": 100, "run_number": 5},
-    {"text_length": 906, "token_count": 200, "total_time": 1.6845426559448242, "time_to_first_chunk": 0.21096158027648926, "error": null, "audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens200_run1_stream.wav", "audio_length": 62.625, "target_tokens": 200, "actual_tokens": 200, "run_number": 1},
+    {"text_length": 1140, "token_count": null, "total_time": 5.086699962615967, "time_to_first_chunk": 0.33925390243530273, "error": null, "audio_path": "C:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens250_run1_stream.wav", "audio_length": 78.775, "target_tokens": 250, "actual_tokens": 250, "run_number": 1},
-    {"text_length": 906, "token_count": 200, "total_time": 1.3545098304748535, "time_to_first_chunk": 0.18648386001586914, "error": null, "audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens200_run2_stream.wav", "audio_length": 62.625, "target_tokens": 200, "actual_tokens": 200, "run_number": 2},
+    {"text_length": 1140, "token_count": null, "total_time": 3.827953338623047, "time_to_first_chunk": 0.39266157150268555, "error": null, "audio_path": "C:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens250_run2_stream.wav", "audio_length": 78.775, "target_tokens": 250, "actual_tokens": 250, "run_number": 2},
-    {"text_length": 906, "token_count": 200, "total_time": 1.426060676574707, "time_to_first_chunk": 0.20081472396850586, "error": null, "audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens200_run3_stream.wav", "audio_length": 62.625, "target_tokens": 200, "actual_tokens": 200, "run_number": 3},
+    {"text_length": 1140, "token_count": null, "total_time": 3.9389824867248535, "time_to_first_chunk": 0.3231511116027832, "error": null, "audio_path": "C:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens250_run3_stream.wav", "audio_length": 78.775, "target_tokens": 250, "actual_tokens": 250, "run_number": 3},
-    {"text_length": 906, "token_count": 200, "total_time": 1.4084081649780273, "time_to_first_chunk": 0.18551135063171387, "error": null, "audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens200_run4_stream.wav", "audio_length": 62.625, "target_tokens": 200, "actual_tokens": 200, "run_number": 4},
+    {"text_length": 1140, "token_count": null, "total_time": 3.942399740219116, "time_to_first_chunk": 0.34731340408325195, "error": null, "audio_path": "C:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens250_run4_stream.wav", "audio_length": 78.775, "target_tokens": 250, "actual_tokens": 250, "run_number": 4},
-    {"text_length": 906, "token_count": 200, "total_time": 1.4703152179718018, "time_to_first_chunk": 0.17750859260559082, "error": null, "audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens200_run5_stream.wav", "audio_length": 62.625, "target_tokens": 200, "actual_tokens": 200, "run_number": 5},
+    {"text_length": 1140, "token_count": null, "total_time": 3.7748308181762695, "time_to_first_chunk": 0.40787601470947266, "error": null, "audio_path": "C:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens250_run5_stream.wav", "audio_length": 78.775, "target_tokens": 250, "actual_tokens": 250, "run_number": 5},
-    {"text_length": 2232, "token_count": 500, "total_time": 4.289574384689331, "time_to_first_chunk": 0.1997976303100586, "error": null, "audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens500_run1_stream.wav", "audio_length": 157.875, "target_tokens": 500, "actual_tokens": 500, "run_number": 1},
+    {"text_length": 2232, "token_count": null, "total_time": 9.003147840499878, "time_to_first_chunk": 0.5455703735351562, "error": null, "audio_path": "C:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens500_run1_stream.wav", "audio_length": 156.475, "target_tokens": 500, "actual_tokens": 500, "run_number": 1},
-    {"text_length": 2232, "token_count": 500, "total_time": 3.7089381217956543,
|
||||
"time_to_first_chunk": 0.25969815254211426,
|
||||
"token_count": null,
|
||||
"total_time": 10.081491231918335,
|
||||
"time_to_first_chunk": 0.4591703414916992,
|
||||
"error": null,
|
||||
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens500_run2_stream.wav",
|
||||
"audio_length": 157.875,
|
||||
"audio_path": "C:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens500_run2_stream.wav",
|
||||
"audio_length": 156.475,
|
||||
"target_tokens": 500,
|
||||
"actual_tokens": 500,
|
||||
"run_number": 2
|
||||
},
|
||||
{
|
||||
"text_length": 2232,
|
||||
"token_count": 500,
|
||||
"total_time": 4.138366222381592,
|
||||
"time_to_first_chunk": 0.1831505298614502,
|
||||
"token_count": null,
|
||||
"total_time": 9.767668962478638,
|
||||
"time_to_first_chunk": 0.31237053871154785,
|
||||
"error": null,
|
||||
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens500_run3_stream.wav",
|
||||
"audio_length": 157.875,
|
||||
"audio_path": "C:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens500_run3_stream.wav",
|
||||
"audio_length": 156.475,
|
||||
"target_tokens": 500,
|
||||
"actual_tokens": 500,
|
||||
"run_number": 3
|
||||
},
|
||||
{
|
||||
"text_length": 2232,
|
||||
"token_count": 500,
|
||||
"total_time": 3.980635643005371,
|
||||
"time_to_first_chunk": 0.20493030548095703,
|
||||
"token_count": null,
|
||||
"total_time": 9.090342998504639,
|
||||
"time_to_first_chunk": 0.41753244400024414,
|
||||
"error": null,
|
||||
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens500_run4_stream.wav",
|
||||
"audio_length": 157.875,
|
||||
"audio_path": "C:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens500_run4_stream.wav",
|
||||
"audio_length": 156.475,
|
||||
"target_tokens": 500,
|
||||
"actual_tokens": 500,
|
||||
"run_number": 4
|
||||
},
|
||||
{
|
||||
"text_length": 2232,
|
||||
"token_count": 500,
|
||||
"total_time": 4.1370298862457275,
|
||||
"time_to_first_chunk": 0.19150757789611816,
|
||||
"token_count": null,
|
||||
"total_time": 9.876578330993652,
|
||||
"time_to_first_chunk": 0.3965120315551758,
|
||||
"error": null,
|
||||
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens500_run5_stream.wav",
|
||||
"audio_length": 157.875,
|
||||
"audio_path": "C:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens500_run5_stream.wav",
|
||||
"audio_length": 156.475,
|
||||
"target_tokens": 500,
|
||||
"actual_tokens": 500,
|
||||
"run_number": 5
|
||||
}
|
||||
],
|
||||
"summary": {
|
||||
"10": {
|
||||
"avg_time_to_first_chunk": 0.296,
|
||||
"avg_total_time": 0.316,
|
||||
"avg_audio_length": 3.45,
|
||||
"num_successful_runs": 5
|
||||
},
|
||||
"50": {
|
||||
"avg_time_to_first_chunk": 0.233,
|
||||
"avg_total_time": 0.527,
|
||||
"avg_audio_length": 16.325,
|
||||
"avg_time_to_first_chunk": 0.325,
|
||||
"avg_total_time": 0.934,
|
||||
"avg_audio_length": 15.925,
|
||||
"num_successful_runs": 5
|
||||
},
|
||||
"100": {
|
||||
"avg_time_to_first_chunk": 0.196,
|
||||
"avg_total_time": 0.739,
|
||||
"avg_audio_length": 31.1,
|
||||
"avg_time_to_first_chunk": 0.335,
|
||||
"avg_total_time": 1.442,
|
||||
"avg_audio_length": 30.5,
|
||||
"num_successful_runs": 5
|
||||
},
|
||||
"200": {
|
||||
"avg_time_to_first_chunk": 0.192,
|
||||
"avg_total_time": 1.469,
|
||||
"avg_audio_length": 62.625,
|
||||
"250": {
|
||||
"avg_time_to_first_chunk": 0.362,
|
||||
"avg_total_time": 4.114,
|
||||
"avg_audio_length": 78.775,
|
||||
"num_successful_runs": 5
|
||||
},
|
||||
"500": {
|
||||
"avg_time_to_first_chunk": 0.208,
|
||||
"avg_total_time": 4.051,
|
||||
"avg_audio_length": 157.875,
|
||||
"avg_time_to_first_chunk": 0.426,
|
||||
"avg_total_time": 9.564,
|
||||
"avg_audio_length": 156.475,
|
||||
"num_successful_runs": 5
|
||||
}
|
||||
},
|
||||
"timestamp": "2025-01-04 22:16:30"
|
||||
"timestamp": "2025-01-06 00:00:43"
|
||||
}
|
|
@ -1,271 +1,337 @@
|
|||
{
  "individual_runs": [
    {
      "text_length": 212,
      "token_count": 50,
      "total_time": 1.149611473083496,
      "time_to_first_chunk": 0.8767304420471191,
      "text_length": 37,
      "token_count": null,
      "total_time": 0.7105245590209961,
      "time_to_first_chunk": 0.6905441284179688,
      "error": null,
      "audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream_openai\\benchmark_tokens50_run1_stream_openai.wav",
      "audio_length": 16.325,
      "audio_path": "C:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream_openai\\benchmark_tokens10_run1_stream_openai.wav",
      "audio_length": 3.45,
      "target_tokens": 10,
      "actual_tokens": 10,
      "run_number": 1
    },
    {
      "text_length": 37,
      "token_count": null,
      "total_time": 0.35063982009887695,
      "time_to_first_chunk": 0.32647228240966797,
      "error": null,
      "audio_path": "C:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream_openai\\benchmark_tokens10_run2_stream_openai.wav",
      "audio_length": 3.45,
      "target_tokens": 10,
      "actual_tokens": 10,
      "run_number": 2
    },
    {
      "text_length": 37,
      "token_count": null,
      "total_time": 0.43519043922424316,
      "time_to_first_chunk": 0.41011548042297363,
      "error": null,
      "audio_path": "C:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream_openai\\benchmark_tokens10_run3_stream_openai.wav",
      "audio_length": 3.45,
      "target_tokens": 10,
      "actual_tokens": 10,
      "run_number": 3
    },
    {
      "text_length": 37,
      "token_count": null,
      "total_time": 0.33886170387268066,
      "time_to_first_chunk": 0.32068943977355957,
      "error": null,
      "audio_path": "C:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream_openai\\benchmark_tokens10_run4_stream_openai.wav",
      "audio_length": 3.45,
      "target_tokens": 10,
      "actual_tokens": 10,
      "run_number": 4
    },
    {
      "text_length": 37,
      "token_count": null,
      "total_time": 0.31725525856018066,
      "time_to_first_chunk": 0.29624342918395996,
      "error": null,
      "audio_path": "C:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream_openai\\benchmark_tokens10_run5_stream_openai.wav",
      "audio_length": 3.45,
      "target_tokens": 10,
      "actual_tokens": 10,
      "run_number": 5
    },
    {
      "text_length": 212,
      "token_count": null,
      "total_time": 1.0215234756469727,
      "time_to_first_chunk": 0.38323354721069336,
      "error": null,
      "audio_path": "C:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream_openai\\benchmark_tokens50_run1_stream_openai.wav",
      "audio_length": 15.925,
      "target_tokens": 50,
      "actual_tokens": 50,
      "run_number": 1
    },
    {
      "text_length": 212,
      "token_count": 50,
      "total_time": 0.9325947761535645,
      "time_to_first_chunk": 0.5965914726257324,
      "token_count": null,
      "total_time": 1.38511061668396,
      "time_to_first_chunk": 0.47052764892578125,
      "error": null,
      "audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream_openai\\benchmark_tokens50_run2_stream_openai.wav",
      "audio_length": 16.325,
      "audio_path": "C:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream_openai\\benchmark_tokens50_run2_stream_openai.wav",
      "audio_length": 15.925,
      "target_tokens": 50,
      "actual_tokens": 50,
      "run_number": 2
    },
    {
      "text_length": 212,
      "token_count": 50,
      "total_time": 0.9205234050750732,
      "time_to_first_chunk": 0.5961906909942627,
      "token_count": null,
      "total_time": 1.0185234546661377,
      "time_to_first_chunk": 0.3535764217376709,
      "error": null,
      "audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream_openai\\benchmark_tokens50_run3_stream_openai.wav",
      "audio_length": 16.325,
      "audio_path": "C:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream_openai\\benchmark_tokens50_run3_stream_openai.wav",
      "audio_length": 15.925,
      "target_tokens": 50,
      "actual_tokens": 50,
      "run_number": 3
    },
    {
      "text_length": 212,
      "token_count": 50,
      "total_time": 1.1321916580200195,
      "time_to_first_chunk": 0.6946916580200195,
      "token_count": null,
      "total_time": 0.8875925540924072,
      "time_to_first_chunk": 0.3373105525970459,
      "error": null,
      "audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream_openai\\benchmark_tokens50_run4_stream_openai.wav",
      "audio_length": 16.325,
      "audio_path": "C:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream_openai\\benchmark_tokens50_run4_stream_openai.wav",
      "audio_length": 15.925,
      "target_tokens": 50,
      "actual_tokens": 50,
      "run_number": 4
    },
    {
      "text_length": 212,
      "token_count": 50,
      "total_time": 1.1146185398101807,
      "time_to_first_chunk": 0.6918885707855225,
      "token_count": null,
      "total_time": 0.9557526111602783,
      "time_to_first_chunk": 0.3364882469177246,
      "error": null,
      "audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream_openai\\benchmark_tokens50_run5_stream_openai.wav",
      "audio_length": 16.325,
      "audio_path": "C:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream_openai\\benchmark_tokens50_run5_stream_openai.wav",
      "audio_length": 15.925,
      "target_tokens": 50,
      "actual_tokens": 50,
      "run_number": 5
    },
    {
      "text_length": 448,
      "token_count": 100,
      "total_time": 1.3645410537719727,
      "time_to_first_chunk": 0.6802399158477783,
      "token_count": null,
      "total_time": 1.569596767425537,
      "time_to_first_chunk": 0.42070746421813965,
      "error": null,
      "audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream_openai\\benchmark_tokens100_run1_stream_openai.wav",
      "audio_length": 31.1,
      "audio_path": "C:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream_openai\\benchmark_tokens100_run1_stream_openai.wav",
      "audio_length": 30.5,
      "target_tokens": 100,
      "actual_tokens": 100,
      "run_number": 1
    },
    {
      "text_length": 448,
      "token_count": 100,
      "total_time": 1.4154777526855469,
      "time_to_first_chunk": 0.7297353744506836,
      "token_count": null,
      "total_time": 1.5172030925750732,
      "time_to_first_chunk": 0.3982264995574951,
      "error": null,
      "audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream_openai\\benchmark_tokens100_run2_stream_openai.wav",
      "audio_length": 31.1,
      "audio_path": "C:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream_openai\\benchmark_tokens100_run2_stream_openai.wav",
      "audio_length": 30.5,
      "target_tokens": 100,
      "actual_tokens": 100,
      "run_number": 2
    },
    {
      "text_length": 448,
      "token_count": 100,
      "total_time": 1.3589520454406738,
      "time_to_first_chunk": 0.698603630065918,
      "token_count": null,
      "total_time": 1.5318474769592285,
      "time_to_first_chunk": 0.3533785343170166,
      "error": null,
      "audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream_openai\\benchmark_tokens100_run3_stream_openai.wav",
      "audio_length": 31.1,
      "audio_path": "C:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream_openai\\benchmark_tokens100_run3_stream_openai.wav",
      "audio_length": 30.5,
      "target_tokens": 100,
      "actual_tokens": 100,
      "run_number": 3
    },
    {
      "text_length": 448,
      "token_count": 100,
      "total_time": 1.2276430130004883,
      "time_to_first_chunk": 0.6705801486968994,
      "token_count": null,
      "total_time": 1.3858752250671387,
      "time_to_first_chunk": 0.3360786437988281,
      "error": null,
      "audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream_openai\\benchmark_tokens100_run4_stream_openai.wav",
      "audio_length": 31.1,
      "audio_path": "C:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream_openai\\benchmark_tokens100_run4_stream_openai.wav",
      "audio_length": 30.5,
      "target_tokens": 100,
      "actual_tokens": 100,
      "run_number": 4
    },
    {
      "text_length": 448,
      "token_count": 100,
      "total_time": 1.0949454307556152,
      "time_to_first_chunk": 0.5698442459106445,
      "token_count": null,
      "total_time": 1.7841475009918213,
      "time_to_first_chunk": 0.34446048736572266,
      "error": null,
      "audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream_openai\\benchmark_tokens100_run5_stream_openai.wav",
      "audio_length": 31.1,
      "audio_path": "C:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream_openai\\benchmark_tokens100_run5_stream_openai.wav",
      "audio_length": 30.5,
      "target_tokens": 100,
      "actual_tokens": 100,
      "run_number": 5
    },
    {
      "text_length": 906,
      "token_count": 200,
      "total_time": 1.8211240768432617,
      "time_to_first_chunk": 0.6070489883422852,
      "text_length": 1140,
      "token_count": null,
      "total_time": 4.334965467453003,
      "time_to_first_chunk": 0.4336512088775635,
      "error": null,
      "audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream_openai\\benchmark_tokens200_run1_stream_openai.wav",
      "audio_length": 62.625,
      "target_tokens": 200,
      "actual_tokens": 200,
      "audio_path": "C:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream_openai\\benchmark_tokens250_run1_stream_openai.wav",
      "audio_length": 78.775,
      "target_tokens": 250,
      "actual_tokens": 250,
      "run_number": 1
    },
    {
      "text_length": 906,
      "token_count": 200,
      "total_time": 1.8376774787902832,
      "time_to_first_chunk": 0.6538689136505127,
      "text_length": 1140,
      "token_count": null,
      "total_time": 5.265941858291626,
      "time_to_first_chunk": 0.5461773872375488,
      "error": null,
      "audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream_openai\\benchmark_tokens200_run2_stream_openai.wav",
      "audio_length": 62.625,
      "target_tokens": 200,
      "actual_tokens": 200,
      "audio_path": "C:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream_openai\\benchmark_tokens250_run2_stream_openai.wav",
      "audio_length": 78.775,
      "target_tokens": 250,
      "actual_tokens": 250,
      "run_number": 2
    },
    {
      "text_length": 906,
      "token_count": 200,
      "total_time": 1.6953792572021484,
      "time_to_first_chunk": 0.5554308891296387,
      "text_length": 1140,
      "token_count": null,
      "total_time": 5.66066575050354,
      "time_to_first_chunk": 0.4757547378540039,
      "error": null,
      "audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream_openai\\benchmark_tokens200_run3_stream_openai.wav",
      "audio_length": 62.625,
      "target_tokens": 200,
      "actual_tokens": 200,
      "audio_path": "C:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream_openai\\benchmark_tokens250_run3_stream_openai.wav",
      "audio_length": 78.775,
      "target_tokens": 250,
      "actual_tokens": 250,
      "run_number": 3
    },
    {
      "text_length": 906,
      "token_count": 200,
      "total_time": 1.887030839920044,
      "time_to_first_chunk": 0.5866930484771729,
      "text_length": 1140,
      "token_count": null,
      "total_time": 9.289174318313599,
      "time_to_first_chunk": 0.40159058570861816,
      "error": null,
      "audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream_openai\\benchmark_tokens200_run4_stream_openai.wav",
      "audio_length": 62.625,
      "target_tokens": 200,
      "actual_tokens": 200,
      "audio_path": "C:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream_openai\\benchmark_tokens250_run4_stream_openai.wav",
      "audio_length": 78.775,
      "target_tokens": 250,
      "actual_tokens": 250,
      "run_number": 4
    },
    {
      "text_length": 906,
      "token_count": 200,
      "total_time": 1.7908406257629395,
      "time_to_first_chunk": 0.5897490978240967,
      "text_length": 1140,
      "token_count": null,
      "total_time": 4.425869703292847,
      "time_to_first_chunk": 0.40808558464050293,
      "error": null,
      "audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream_openai\\benchmark_tokens200_run5_stream_openai.wav",
      "audio_length": 62.625,
      "target_tokens": 200,
      "actual_tokens": 200,
      "audio_path": "C:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream_openai\\benchmark_tokens250_run5_stream_openai.wav",
      "audio_length": 78.775,
      "target_tokens": 250,
      "actual_tokens": 250,
      "run_number": 5
    },
    {
      "text_length": 2232,
      "token_count": 500,
      "total_time": 4.228837013244629,
      "time_to_first_chunk": 0.5315976142883301,
      "token_count": null,
      "total_time": 9.600461483001709,
      "time_to_first_chunk": 0.3966805934906006,
      "error": null,
      "audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream_openai\\benchmark_tokens500_run1_stream_openai.wav",
      "audio_length": 157.875,
      "audio_path": "C:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream_openai\\benchmark_tokens500_run1_stream_openai.wav",
      "audio_length": 156.475,
      "target_tokens": 500,
      "actual_tokens": 500,
      "run_number": 1
    },
    {
      "text_length": 2232,
      "token_count": 500,
      "total_time": 4.489210367202759,
      "time_to_first_chunk": 0.5261838436126709,
      "token_count": null,
      "total_time": 8.82239580154419,
      "time_to_first_chunk": 0.3900904655456543,
      "error": null,
      "audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream_openai\\benchmark_tokens500_run2_stream_openai.wav",
      "audio_length": 157.875,
      "audio_path": "C:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream_openai\\benchmark_tokens500_run2_stream_openai.wav",
      "audio_length": 156.475,
      "target_tokens": 500,
      "actual_tokens": 500,
      "run_number": 2
    },
    {
      "text_length": 2232,
      "token_count": 500,
      "total_time": 4.5290446281433105,
      "time_to_first_chunk": 0.6186764240264893,
      "token_count": null,
      "total_time": 10.99152159690857,
      "time_to_first_chunk": 0.4041757583618164,
      "error": null,
      "audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream_openai\\benchmark_tokens500_run3_stream_openai.wav",
      "audio_length": 157.875,
      "audio_path": "C:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream_openai\\benchmark_tokens500_run3_stream_openai.wav",
      "audio_length": 156.475,
      "target_tokens": 500,
      "actual_tokens": 500,
      "run_number": 3
    },
    {
      "text_length": 2232,
      "token_count": 500,
      "total_time": 4.209261178970337,
      "time_to_first_chunk": 0.5990591049194336,
      "token_count": null,
      "total_time": 9.12995958328247,
      "time_to_first_chunk": 0.43430614471435547,
      "error": null,
      "audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream_openai\\benchmark_tokens500_run4_stream_openai.wav",
      "audio_length": 157.875,
      "audio_path": "C:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream_openai\\benchmark_tokens500_run4_stream_openai.wav",
      "audio_length": 156.475,
      "target_tokens": 500,
      "actual_tokens": 500,
      "run_number": 4
    },
    {
      "text_length": 2232,
      "token_count": 500,
      "total_time": 4.218762636184692,
      "time_to_first_chunk": 0.5466251373291016,
      "token_count": null,
      "total_time": 10.043727159500122,
      "time_to_first_chunk": 0.41181445121765137,
      "error": null,
      "audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream_openai\\benchmark_tokens500_run5_stream_openai.wav",
      "audio_length": 157.875,
      "audio_path": "C:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream_openai\\benchmark_tokens500_run5_stream_openai.wav",
      "audio_length": 156.475,
      "target_tokens": 500,
      "actual_tokens": 500,
      "run_number": 5
    }
  ],
  "summary": {
    "10": {
      "avg_time_to_first_chunk": 0.409,
      "avg_total_time": 0.43,
      "avg_audio_length": 3.45,
      "num_successful_runs": 5
    },
    "50": {
      "avg_time_to_first_chunk": 0.691,
      "avg_total_time": 1.05,
      "avg_audio_length": 16.325,
      "avg_time_to_first_chunk": 0.376,
      "avg_total_time": 1.054,
      "avg_audio_length": 15.925,
      "num_successful_runs": 5
    },
    "100": {
      "avg_time_to_first_chunk": 0.67,
      "avg_total_time": 1.292,
      "avg_audio_length": 31.1,
      "avg_time_to_first_chunk": 0.371,
      "avg_total_time": 1.558,
      "avg_audio_length": 30.5,
      "num_successful_runs": 5
    },
    "200": {
      "avg_time_to_first_chunk": 0.599,
      "avg_total_time": 1.806,
      "avg_audio_length": 62.625,
    "250": {
      "avg_time_to_first_chunk": 0.453,
      "avg_total_time": 5.795,
      "avg_audio_length": 78.775,
      "num_successful_runs": 5
    },
    "500": {
      "avg_time_to_first_chunk": 0.564,
      "avg_total_time": 4.335,
      "avg_audio_length": 157.875,
      "avg_time_to_first_chunk": 0.407,
      "avg_total_time": 9.718,
      "avg_audio_length": 156.475,
      "num_successful_runs": 5
    }
  },
  "timestamp": "2025-01-04 22:18:03"
  "timestamp": "2025-01-06 00:02:21"
}

@ -1,23 +1,23 @@
=== Benchmark Statistics (with correct RTF) ===

Total tokens processed: 17150
Total audio generated (s): 5296.38
Total test duration (s): 155.23
Average processing rate (tokens/s): 102.86
Average RTF: 0.03
Average Real Time Speed: 31.25
Total tokens processed: 3150
Total audio generated (s): 1056.03
Total test duration (s): 70.20
Average processing rate (tokens/s): 46.46
Average RTF: 0.07
Average Real Time Speed: 15.00

=== Per-chunk Stats ===

Average chunk size (tokens): 1715.00
Average chunk size (tokens): 525.00
Min chunk size (tokens): 150
Max chunk size (tokens): 5000
Average processing time (s): 15.39
Average output length (s): 529.64
Max chunk size (tokens): 900
Average processing time (s): 11.57
Average output length (s): 176.00

=== Performance Ranges ===

Processing rate range (tokens/s): 80.65 - 125.10
RTF range: 0.03x - 0.04x
Real Time Speed range: 25.00x - 33.33x
Processing rate range (tokens/s): 40.07 - 53.57
RTF range: 0.06x - 0.08x
Real Time Speed range: 12.50x - 16.67x
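For reference, the aggregates above are related by RTF = processing time / audio duration and Real Time Speed = 1 / RTF. A minimal sketch recomputing them from the new run's totals (the helper name is illustrative; note the printed "Average processing rate" is a per-chunk mean, so it differs slightly from total tokens / total time):

```python
def throughput_stats(total_tokens: int, audio_s: float, wall_s: float) -> dict:
    """Recompute the aggregate metrics from raw totals."""
    rtf = wall_s / audio_s  # seconds of compute per second of audio
    return {
        "tokens_per_s": total_tokens / wall_s,
        "rtf": rtf,                  # < 1.0 means faster than real time
        "real_time_speed": 1 / rtf,  # seconds of audio per second of compute
    }

# New run totals: 3150 tokens -> 1056.03 s of audio in 70.20 s of wall time
print(throughput_stats(3150, 1056.03, 70.20))
# {'tokens_per_s': 44.87..., 'rtf': 0.066..., 'real_time_speed': 15.04...}
```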
[Binary image diffs: 12 PNG plots regenerated with changed sizes (e.g. 231→230 KiB, 181→206 KiB, 454→491 KiB, 268→236 KiB), and 4 PNG plots deleted (246 KiB, 233 KiB, 764 KiB, 198 KiB).]
examples/assorted_checks/generate_readme_plots.py (new file, 198 lines)

@ -0,0 +1,198 @@
#!/usr/bin/env python3
"""Script to generate all plots needed for the README."""

import os
import sys
import shutil
from pathlib import Path

from validate_wav import validate_tts

# Get absolute paths
script_dir = Path(__file__).parent.resolve()
project_root = script_dir.parent.parent

# Add directories to Python path for imports
sys.path.append(str(script_dir))
sys.path.append(str(script_dir / "benchmarks"))

# Import test scripts
from benchmark_tts_rtf import main as benchmark_rtf
from test_formats.test_audio_formats import main as test_formats
from benchmark_first_token_stream_unified import main as benchmark_stream
from test_combinations.test_analyze_combined_voices import main as test_voice_analysis

# Remove directories from path after imports
sys.path.remove(str(script_dir))
sys.path.remove(str(script_dir / "benchmarks"))


def ensure_assets_dir():
    """Create assets directory if it doesn't exist."""
    assets_dir = project_root / "assets"
    assets_dir.mkdir(exist_ok=True)
    return assets_dir


def copy_plot(src_path: str, dest_name: str, assets_dir: Path):
    """Copy a plot to the assets directory with a new name."""
    if os.path.exists(src_path):
        shutil.copy2(src_path, assets_dir / dest_name)
        print(f"Copied {src_path} to {assets_dir / dest_name}")
    else:
        print(f"Warning: Source plot not found at {src_path}")


def validate_and_print(wav_path: str, category: str):
    """Validate a WAV file and print results."""
    if not os.path.exists(wav_path):
        print(f"Warning: WAV file not found at {wav_path}")
        return

    print(f"\n=== Validating {category} Audio ===")
    result = validate_tts(wav_path)

    if "error" in result:
        print(f"Error: {result['error']}")
    else:
        print(f"Duration: {result['duration']}")
        print(f"Sample Rate: {result['sample_rate']} Hz")
        print(f"Peak Amplitude: {result['peak_amplitude']}")
        print(f"RMS Level: {result['rms_level']}")

        if result["issues"]:
            print("\nIssues Found:")
            for issue in result["issues"]:
                print(f"- {issue}")
        else:
            print("\nNo issues found")


def main():
    """Generate all plots needed for the README."""
    # Ensure assets directory exists
    prefix = "gpu"
    assets_dir = ensure_assets_dir()

    print("\n=== Generating Format Comparison Plot ===")
    test_formats()
    copy_plot(
        str(script_dir / "test_formats/output/test_formats/format_comparison.png"),
        "format_comparison.png",
        assets_dir,
    )
    # Validate WAV output from format test
    validate_and_print(
        str(script_dir / "test_formats/output/test_formats/speech.wav"),
        "Format Test WAV",
    )

    print("\n=== Generating Voice Analysis Plot ===")
    test_voice_analysis()
    copy_plot(
        str(script_dir / "test_combinations/output/analysis_comparison.png"),
        "voice_analysis.png",
        assets_dir,
    )
    # Validate combined voice output
    validate_and_print(
        str(
            script_dir
            / "test_combinations/output/analysis_combined_af_bella_af_nicole.wav"
        ),
        "Combined Voice",
    )

    print("\n=== Generating Performance Benchmark Plots ===")
    benchmark_rtf()
    copy_plot(
        str(script_dir / f"benchmarks/output_plots/{prefix}_processing_time_rtf.png"),
        f"{prefix}_processing_time.png",
        assets_dir,
    )
    copy_plot(
        str(script_dir / f"benchmarks/output_plots/{prefix}_realtime_factor_rtf.png"),
        f"{prefix}_realtime_factor.png",
        assets_dir,
    )
    # Validate RTF benchmark output (~500 tokens)
    validate_and_print(
        str(script_dir / "benchmarks/output_audio/chunk_450_tokens.wav"),
        "RTF Benchmark",
    )

    print("\n=== Generating Streaming Benchmark Plots ===")
    benchmark_stream()

    # Copy direct streaming plots
    copy_plot(
        str(script_dir / "benchmarks/output_plots/first_token_latency_stream.png"),
        f"{prefix}_first_token_latency_direct.png",
        assets_dir,
    )
    copy_plot(
        str(script_dir / "benchmarks/output_plots/first_token_timeline_stream.png"),
        f"{prefix}_first_token_timeline_direct.png",
        assets_dir,
    )
    copy_plot(
        str(script_dir / "benchmarks/output_plots/total_time_latency_stream.png"),
        f"{prefix}_total_time_latency_direct.png",
        assets_dir,
    )

    # Copy OpenAI streaming plots
    copy_plot(
        str(
            script_dir / "benchmarks/output_plots/first_token_latency_stream_openai.png"
        ),
        f"{prefix}_first_token_latency_openai.png",
        assets_dir,
    )
    copy_plot(
        str(
            script_dir
            / "benchmarks/output_plots/first_token_timeline_stream_openai.png"
        ),
        f"{prefix}_first_token_timeline_openai.png",
        assets_dir,
    )
    copy_plot(
        str(
            script_dir / "benchmarks/output_plots/total_time_latency_stream_openai.png"
        ),
        f"{prefix}_total_time_latency_openai.png",
        assets_dir,
    )

    # Wait a moment for files to be generated
    import time

    time.sleep(2)

    # Validate streaming outputs (~500 tokens)
    validate_and_print(
        str(
            script_dir
            / "benchmarks/output_audio_stream/benchmark_tokens500_run1_stream.wav"
        ),
        "Direct Streaming",
    )
    validate_and_print(
        str(
            script_dir
            / "benchmarks/output_audio_stream_openai/benchmark_tokens500_run1_stream_openai.wav"
        ),
        "OpenAI Streaming",
    )

    validate_and_print(
        str(script_dir / "test_formats/output/test_formats/test_audio.wav"),
        "Format Test WAV",
    )

    print("\nAll plots have been generated and copied to the assets directory")


if __name__ == "__main__":
    main()
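The script assumes `validate_wav.validate_tts` returns a dict with `duration`, `sample_rate`, `peak_amplitude`, `rms_level`, and `issues` keys. A minimal stand-in with that shape, assuming 16-bit PCM WAV input (the thresholds and helper name are illustrative, not the repo's actual checks):

```python
import numpy as np
from scipy.io import wavfile

def validate_tts_sketch(wav_path: str) -> dict:
    """Illustrative validator matching the result shape used above."""
    try:
        sample_rate, data = wavfile.read(wav_path)
    except Exception as e:
        return {"error": str(e)}
    samples = data.astype(np.float32) / 32768.0  # assumes 16-bit PCM
    peak = float(np.max(np.abs(samples)))
    rms = float(np.sqrt(np.mean(np.square(samples))))
    issues = []
    if peak >= 1.0:
        issues.append("Possible clipping (peak at full scale)")
    if rms < 0.01:
        issues.append("Very low signal level")
    return {
        "duration": f"{len(samples) / sample_rate:.2f}s",
        "sample_rate": sample_rate,
        "peak_amplitude": round(peak, 4),
        "rms_level": round(rms, 4),
        "issues": issues,
    }
```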
@ -73,6 +73,7 @@ def generate_speech(
            "voice": voice,
            "speed": 1.0,
            "response_format": "wav",  # Use WAV for analysis
            "stream": False,
        },
    )

@ -193,9 +194,10 @@ def plot_analysis(audio_files: Dict[str, str], output_dir: str):
    fig.patch.set_facecolor("#1a1a2e")
    num_files = len(audio_files)

    # Create subplot grid with proper spacing
    # Create subplot grid with proper spacing for waveforms and metrics
    total_rows = num_files + 2  # Add one more row for metrics
    gs = plt.GridSpec(
        num_files + 1, 2, height_ratios=[1.5] * num_files + [1], hspace=0.4, wspace=0.3
        total_rows, 2, height_ratios=[1.5] * num_files + [1, 1], hspace=0.4, wspace=0.3
    )

    # Analyze all files first
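The reshaped grid keeps one tall row per waveform and reserves two shorter rows at the bottom for the four metric panels. A standalone sketch of the same layout logic (`num_files` is a placeholder value):

```python
import matplotlib.pyplot as plt

num_files = 3  # e.g. three voices being compared
total_rows = num_files + 2  # waveform rows plus two metric rows

fig = plt.figure(figsize=(15, 20))
# Waveform rows get 1.5x the height of the two metric rows at the bottom.
gs = plt.GridSpec(
    total_rows, 2, height_ratios=[1.5] * num_files + [1, 1], hspace=0.4, wspace=0.3
)
wave_axes = [fig.add_subplot(gs[i, :]) for i in range(num_files)]
metric_axes = [fig.add_subplot(gs[num_files + r, c]) for r in (0, 1) for c in (0, 1)]
```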
@ -216,48 +218,74 @@ def plot_analysis(audio_files: Dict[str, str], output_dir: str):
    # Colors for voices
    colors = ["#ff2a6d", "#05d9e8", "#d1f7ff"]

    # Create two subplots for metrics with similar scales
    # Left subplot: Brightness and Volume
    ax1 = plt.subplot(gs[num_files, 0])
    metrics1 = [
    # Create metrics for each subplot
    metrics = [
        (
            "Brightness",
            [chars["spectral_centroid"] / 1000 for chars in all_chars.values()],
            "kHz",
        ),
        ("Volume", [chars["rms"] * 100 for chars in all_chars.values()], "RMS×100"),
    ]

    # Right subplot: Voice Pitch and Texture
    ax2 = plt.subplot(gs[num_files, 1])
    metrics2 = [
        (
            "Voice Pitch",
            [min(chars["dominant_frequencies"]) for chars in all_chars.values()],
            "Hz",
            plt.subplot(gs[num_files, 0]),
            [
                (
                    "Volume",
                    [chars["rms"] * 100 for chars in all_chars.values()],
                    "RMS×100",
                )
            ],
        ),
        (
            "Texture",
            [chars["zero_crossing_rate"] * 1000 for chars in all_chars.values()],
            "ZCR×1000",
            plt.subplot(gs[num_files, 1]),
            [
                (
                    "Brightness",
                    [chars["spectral_centroid"] / 1000 for chars in all_chars.values()],
                    "kHz",
                )
            ],
        ),
        (
            plt.subplot(gs[num_files + 1, 0]),
            [
                (
                    "Voice Pitch",
                    [
                        min(chars["dominant_frequencies"])
                        for chars in all_chars.values()
                    ],
                    "Hz",
                )
            ],
        ),
        (
            plt.subplot(gs[num_files + 1, 1]),
            [
                (
                    "Texture",
                    [
                        chars["zero_crossing_rate"] * 1000
                        for chars in all_chars.values()
                    ],
                    "ZCR×1000",
                )
            ],
        ),
    ]

    def plot_grouped_bars(ax, metrics, show_legend=True):
        n_groups = len(metrics)
    # Plot each metric
    for i, (ax, metric_data) in enumerate(metrics):
        n_voices = len(audio_files)
        bar_width = 0.25
        indices = np.array([0])

        indices = np.arange(n_groups)
        values = metric_data[0][1]
        max_val = max(values)

        # Get max value for y-axis scaling
        max_val = max(max(m[1]) for m in metrics)

        for i, (voice, color) in enumerate(zip(audio_files.keys(), colors)):
            values = [m[1][i] for m in metrics]
            offset = (i - n_voices / 2 + 0.5) * bar_width
        for j, (voice, color) in enumerate(zip(audio_files.keys(), colors)):
            offset = (j - n_voices / 2 + 0.5) * bar_width
            bars = ax.bar(
                indices + offset, values, bar_width, label=voice, color=color, alpha=0.8
                indices + offset,
                [values[j]],
                bar_width,
                label=voice,
                color=color,
                alpha=0.8,
            )

            # Add value labels on top of bars
@ -274,12 +302,12 @@ def plot_analysis(audio_files: Dict[str, str], output_dir: str):
            )

        ax.set_xticks(indices)
        ax.set_xticklabels([f"{m[0]}\n({m[2]})" for m in metrics])

        # Set y-axis limits with some padding
        ax.set_xticklabels([f"{metric_data[0][0]}\n({metric_data[0][2]})"])
        ax.set_ylim(0, max_val * 1.2)
        ax.set_ylabel("Value")

        if show_legend:
        # Only show legend on first metric plot
        if i == 0:
            ax.legend(
                bbox_to_anchor=(1.05, 1),
                loc="upper left",
@ -287,22 +315,11 @@ def plot_analysis(audio_files: Dict[str, str], output_dir: str):
                edgecolor="#ffffff",
            )

    # Plot both subplots
    plot_grouped_bars(ax1, metrics1, show_legend=True)
    plot_grouped_bars(ax2, metrics2, show_legend=False)
        # Style the subplot
        setup_plot(fig, ax, metric_data[0][0])

    # Style both subplots
    setup_plot(fig, ax1, "Brightness and Volume")
    setup_plot(fig, ax2, "Voice Pitch and Texture")

    # Add y-axis labels
    ax1.set_ylabel("Value")
    ax2.set_ylabel("Value")

    # Adjust the figure size to accommodate the legend
    fig.set_size_inches(15, 15)

    # Add padding around the entire figure
    # Adjust the figure size and padding
    fig.set_size_inches(15, 20)
    plt.subplots_adjust(right=0.85, top=0.95, bottom=0.05, left=0.1)
    plt.savefig(os.path.join(output_dir, "analysis_comparison.png"), dpi=300)
    print(f"Saved analysis comparison to {output_dir}/analysis_comparison.png")
@ -332,7 +349,7 @@ def main():
    )
    parser.add_argument("--url", default="http://localhost:8880", help="API base URL")
    parser.add_argument(
        "--output-dir",
        "--output-dir",
        default="examples/assorted_checks/test_combinations/output",
        help="Output directory for audio files",
    )

@ -66,26 +66,27 @@ def plot_format_comparison(stats: list, output_dir: str):
    for i, stat in enumerate(stats):
        format_name = stat["format"].upper()
        try:
            # Handle PCM format differently
            if stat["format"] == "pcm":
                # Read raw PCM data (16-bit mono)
                with open(
                    os.path.join(output_dir, f"test_audio.{stat['format']}"), "rb"
                ) as f:
                    raw_data = f.read()
                data = np.frombuffer(raw_data, dtype=np.int16)
                data = data.astype(np.float32) / 32768.0  # Convert to float [-1, 1]
                sr = 24000
            else:
                # Read other formats with soundfile
                data, sr = sf.read(
                    os.path.join(output_dir, f"test_audio.{stat['format']}")
                )
            file_path = os.path.join(output_dir, f"test_audio.{stat['format']}")

            # Plot waveform
            if stat["format"] == "wav":
                # Use scipy.io.wavfile for WAV files
                sr, data = wavfile.read(file_path)
                data = data.astype(np.float32) / 32768.0  # Convert to float [-1, 1]
            elif stat["format"] == "pcm":
                # Read raw 16-bit signed little-endian PCM data at 24kHz
                data = np.frombuffer(
                    open(file_path, "rb").read(), dtype="<i2"
                )  # '<i2' means little-endian 16-bit signed int
                data = data.astype(np.float32) / 32768.0  # Convert to float [-1, 1]
                sr = 24000  # Known sample rate for our endpoint
            else:
                # Use soundfile for other formats (mp3, opus, flac)
                data, sr = sf.read(file_path)

            # Plot waveform with consistent normalization
            ax = plt.subplot(gs_waves[i])
            time = np.arange(len(data)) / sr
            plt.plot(time, data / np.max(np.abs(data)), linewidth=0.5, color="#ff2a6d")
            plt.plot(time, data, linewidth=0.5, color="#ff2a6d")
            ax.set_xlabel("Time (seconds)")
            ax.set_ylabel("")
            ax.set_ylim(-1.1, 1.1)
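Both branches reduce to the same decode step: raw PCM carries no header, so the sample width, byte order, and rate must be known out of band. A self-contained sketch of that decode (the file path is illustrative):

```python
import numpy as np

# Decode raw PCM bytes ('<i2' = little-endian 16-bit signed int) to float samples.
with open("test_audio.pcm", "rb") as f:  # illustrative path
    raw = f.read()
samples = np.frombuffer(raw, dtype="<i2").astype(np.float32) / 32768.0
sr = 24000  # the rate is not stored in raw PCM, so it must be assumed
print(f"{len(samples) / sr:.2f}s of audio, peak {np.max(np.abs(samples)):.3f}")
```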
@ -200,41 +201,42 @@ def get_audio_stats(file_path: str) -> dict:
    """Get audio file statistics"""
    file_size = os.path.getsize(file_path)
    file_size_kb = file_size / 1024  # Convert to KB
    format_name = Path(file_path).suffix[1:]

    try:
        # Try reading with soundfile first
        if format_name == "wav":
            # Use scipy.io.wavfile for WAV files
            sample_rate, data = wavfile.read(file_path)
            data = data.astype(np.float32) / 32768.0  # Convert to float [-1, 1]
            duration = len(data) / sample_rate
            channels = 1 if len(data.shape) == 1 else data.shape[1]
        elif format_name == "pcm":
            # For PCM, read raw 16-bit signed little-endian PCM data at 24kHz
            data = np.frombuffer(
                open(file_path, "rb").read(), dtype="<i2"
            )  # '<i2' means little-endian 16-bit signed int
            data = data.astype(np.float32) / 32768.0  # Normalize to [-1, 1]
            sample_rate = 24000  # Known sample rate for our endpoint
            duration = len(data) / sample_rate
            channels = 1
        else:
            # Use soundfile for other formats (mp3, opus, flac)
            data, sample_rate = sf.read(file_path)
            duration = len(data) / sample_rate
            channels = 1 if len(data.shape) == 1 else data.shape[1]

        # Calculate audio statistics
        stats = {
            "format": Path(file_path).suffix[1:],
            "file_size_kb": round(file_size_kb, 2),
            "duration_seconds": round(duration, 2),
            "sample_rate": sample_rate,
            "channels": channels,
            "min_amplitude": float(np.min(data)),
            "max_amplitude": float(np.max(data)),
            "mean_amplitude": float(np.mean(np.abs(data))),
            "rms_amplitude": float(np.sqrt(np.mean(np.square(data)))),
        }
        return stats
    except:
        # For PCM, read raw bytes and estimate duration
        with open(file_path, "rb") as f:
            data = f.read()
        # Assuming 16-bit PCM mono at 24kHz
        samples = len(data) // 2  # 2 bytes per sample
        duration = samples / 24000
        return {
            "format": "pcm",
            "file_size_kb": round(file_size_kb, 2),
            "duration_seconds": round(duration, 2),
            "sample_rate": 24000,
            "channels": 1,
            "note": "PCM stats are estimated from raw bytes",
        }
    # Calculate audio statistics
    stats = {
        "format": format_name,
        "file_size_kb": round(file_size_kb, 2),
        "duration_seconds": round(duration, 2),
        "sample_rate": sample_rate,
        "channels": channels,
        "min_amplitude": float(np.min(data)),
        "max_amplitude": float(np.max(data)),
        "mean_amplitude": float(np.mean(np.abs(data))),
        "rms_amplitude": float(np.sqrt(np.mean(np.square(data)))),
    }
    return stats


def main():
@ -254,13 +256,49 @@ def main():

        # Generate and save
        start_time = time.time()
        response = client.audio.speech.create(
            model="kokoro", voice=voice, input=SAMPLE_TEXT, response_format=fmt

        # Use requests with stream=False for consistent data handling
        response = requests.post(
            "http://localhost:8880/v1/audio/speech",
            json={
                "model": "kokoro",
                "voice": voice,
                "input": SAMPLE_TEXT,
                "response_format": fmt,
                "stream": False,  # Explicitly disable streaming to get single complete chunk
            },
            stream=False,
            headers={"Accept": f"audio/{fmt}"},  # Explicitly request audio format
        )
        generation_time = time.time() - start_time

        with open(output_path, "wb") as f:
            f.write(response.content)
        print(f"\nResponse headers for {fmt}:")
        for header, value in response.headers.items():
            print(f"{header}: {value}")
        print(f"Content length: {len(response.content)} bytes")
        print(f"First few bytes: {response.content[:20].hex()}")

        # Write the file and verify it was written correctly
        try:
            with open(output_path, "wb") as f:
                f.write(response.content)

            # Verify file was written
            if not output_path.exists():
                raise Exception(f"Failed to write {fmt} file")

            # Check file size matches content length
            written_size = output_path.stat().st_size
            if written_size != len(response.content):
                raise Exception(
                    f"File size mismatch: expected {len(response.content)} bytes, got {written_size}"
                )

            print(f"Successfully wrote {fmt} file")

        except Exception as e:
            print(f"Error writing {fmt} file: {e}")
            continue

        # Get stats
        file_stats = get_audio_stats(str(output_path))
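The hex dump of the first bytes is enough to confirm the payload actually matches the requested format. A small sketch of that check (the helper name is illustrative; the signatures are standard container magic bytes):

```python
def sniff_audio_format(payload: bytes) -> str:
    """Best-effort container detection from leading magic bytes."""
    if payload[:4] == b"RIFF" and payload[8:12] == b"WAVE":
        return "wav"
    if payload[:4] == b"fLaC":
        return "flac"
    if payload[:4] == b"OggS":  # Ogg container, the usual wrapper for opus
        return "opus"
    if payload[:3] == b"ID3" or payload[:2] in (b"\xff\xfb", b"\xff\xf3"):
        return "mp3"
    return "unknown (possibly headerless raw pcm)"
```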
@ -4,15 +4,19 @@ import random
import string
from typing import List, Tuple


def create_test_cases() -> List[str]:
    """Create a variety of test cases with different characteristics"""

    # Helper to create random text with specific patterns
    def random_text(length: int) -> str:
        return ''.join(random.choice(string.ascii_letters + string.digits + " .,!?") for _ in range(length))
        return "".join(
            random.choice(string.ascii_letters + string.digits + " .,!?")
            for _ in range(length)
        )

    test_cases = []

    # Base test cases that hit specific patterns
    base_cases = [
        "Dr. Smith and Mr. Jones discussed the $1,234.56 million investment.",
@ -21,10 +25,10 @@ def create_test_cases() -> List[str]:
        "X's and Y's properties cost £50 million in the 1990s",
        "こんにちは。今日は!",
    ]

    # Add base cases
    test_cases.extend(base_cases)

    # Add variations with random content
    for length in [100, 1000, 10000]:
        # Create 3 variations of each length
|
@ -35,23 +39,24 @@ def create_test_cases() -> List[str]:
|
|||
text = text.replace(text[30:40], "$1,234.56")
|
||||
text = text.replace(text[50:60], "A.B.C. xyz")
|
||||
test_cases.append(text)
|
||||
|
||||
|
||||
return test_cases
|
||||
|
||||
|
||||
class TextNormalizerInline:
|
||||
"""Text normalizer using inline patterns"""
|
||||
|
||||
|
||||
def normalize(self, text: str) -> str:
|
||||
# Replace quotes and brackets
|
||||
text = text.replace(chr(8216), "'").replace(chr(8217), "'")
|
||||
text = text.replace("«", chr(8220)).replace("»", chr(8221))
|
||||
text = text.replace(chr(8220), '"').replace(chr(8221), '"')
|
||||
text = text.replace("(", "«").replace(")", "»")
|
||||
|
||||
|
||||
# Handle CJK punctuation
|
||||
for a, b in zip("、。!,:;?", ",.!,:;?"):
|
||||
text = text.replace(a, b + " ")
|
||||
|
||||
|
||||
text = re.sub(r"[^\S \n]", " ", text)
|
||||
text = re.sub(r" +", " ", text)
|
||||
text = re.sub(r"(?<=\n) +(?=\n)", "", text)
|
||||
|
@ -61,108 +66,132 @@ class TextNormalizerInline:
|
|||
text = re.sub(r"\b(?:Mrs\.|MRS\.(?= [A-Z]))", "Mrs", text)
|
||||
text = re.sub(r"\betc\.(?! [A-Z])", "etc", text)
|
||||
text = re.sub(r"(?i)\b(y)eah?\b", r"\1e'a", text)
|
||||
text = re.sub(r"\d*\.\d+|\b\d{4}s?\b|(?<!:)\b(?:[1-9]|1[0-2]):[0-5]\d\b(?!:)", split_num, text)
|
||||
text = re.sub(
|
||||
r"\d*\.\d+|\b\d{4}s?\b|(?<!:)\b(?:[1-9]|1[0-2]):[0-5]\d\b(?!:)",
|
||||
split_num,
|
||||
text,
|
||||
)
|
||||
text = re.sub(r"(?<=\d),(?=\d)", "", text)
|
||||
text = re.sub(r"(?i)[$£]\d+(?:\.\d+)?(?: hundred| thousand| (?:[bm]|tr)illion)*\b|[$£]\d+\.\d\d?\b", handle_money, text)
|
||||
text = re.sub(
|
||||
r"(?i)[$£]\d+(?:\.\d+)?(?: hundred| thousand| (?:[bm]|tr)illion)*\b|[$£]\d+\.\d\d?\b",
|
||||
handle_money,
|
||||
text,
|
||||
)
|
||||
text = re.sub(r"\d*\.\d+", handle_decimal, text)
|
||||
text = re.sub(r"(?<=\d)-(?=\d)", " to ", text)
|
||||
text = re.sub(r"(?<=\d)S", " S", text)
|
||||
text = re.sub(r"(?<=[BCDFGHJ-NP-TV-Z])'?s\b", "'S", text)
|
||||
text = re.sub(r"(?<=X')S\b", "s", text)
|
||||
text = re.sub(r"(?:[A-Za-z]\.){2,} [a-z]", lambda m: m.group().replace(".", "-"), text)
|
||||
text = re.sub(
|
||||
r"(?:[A-Za-z]\.){2,} [a-z]", lambda m: m.group().replace(".", "-"), text
|
||||
)
|
||||
text = re.sub(r"(?i)(?<=[A-Z])\.(?=[A-Z])", "-", text)
|
||||
|
||||
|
||||
return text.strip()
|
||||
|
||||
|
||||
class TextNormalizerCompiled:
|
||||
"""Text normalizer using all compiled patterns"""
|
||||
|
||||
|
||||
def __init__(self):
|
||||
self.patterns = {
|
||||
'whitespace': re.compile(r"[^\S \n]"),
|
||||
'multi_space': re.compile(r" +"),
|
||||
'newline_space': re.compile(r"(?<=\n) +(?=\n)"),
|
||||
'doctor': re.compile(r"\bD[Rr]\.(?= [A-Z])"),
|
||||
'mister': re.compile(r"\b(?:Mr\.|MR\.(?= [A-Z]))"),
|
||||
'miss': re.compile(r"\b(?:Ms\.|MS\.(?= [A-Z]))"),
|
||||
'mrs': re.compile(r"\b(?:Mrs\.|MRS\.(?= [A-Z]))"),
|
||||
'etc': re.compile(r"\betc\.(?! [A-Z])"),
|
||||
'yeah': re.compile(r"(?i)\b(y)eah?\b"),
|
||||
'numbers': re.compile(r"\d*\.\d+|\b\d{4}s?\b|(?<!:)\b(?:[1-9]|1[0-2]):[0-5]\d\b(?!:)"),
|
||||
'comma_in_number': re.compile(r"(?<=\d),(?=\d)"),
|
||||
'money': re.compile(r"(?i)[$£]\d+(?:\.\d+)?(?: hundred| thousand| (?:[bm]|tr)illion)*\b|[$£]\d+\.\d\d?\b"),
|
||||
'decimal': re.compile(r"\d*\.\d+"),
|
||||
'range': re.compile(r"(?<=\d)-(?=\d)"),
|
||||
's_after_number': re.compile(r"(?<=\d)S"),
|
||||
'possessive_s': re.compile(r"(?<=[BCDFGHJ-NP-TV-Z])'?s\b"),
|
||||
'x_possessive': re.compile(r"(?<=X')S\b"),
|
||||
'initials': re.compile(r"(?:[A-Za-z]\.){2,} [a-z]"),
|
||||
'single_initial': re.compile(r"(?i)(?<=[A-Z])\.(?=[A-Z])")
|
||||
"whitespace": re.compile(r"[^\S \n]"),
|
||||
"multi_space": re.compile(r" +"),
|
||||
"newline_space": re.compile(r"(?<=\n) +(?=\n)"),
|
||||
"doctor": re.compile(r"\bD[Rr]\.(?= [A-Z])"),
|
||||
"mister": re.compile(r"\b(?:Mr\.|MR\.(?= [A-Z]))"),
|
||||
"miss": re.compile(r"\b(?:Ms\.|MS\.(?= [A-Z]))"),
|
||||
"mrs": re.compile(r"\b(?:Mrs\.|MRS\.(?= [A-Z]))"),
|
||||
"etc": re.compile(r"\betc\.(?! [A-Z])"),
|
||||
"yeah": re.compile(r"(?i)\b(y)eah?\b"),
|
||||
"numbers": re.compile(
|
||||
r"\d*\.\d+|\b\d{4}s?\b|(?<!:)\b(?:[1-9]|1[0-2]):[0-5]\d\b(?!:)"
|
||||
),
|
||||
"comma_in_number": re.compile(r"(?<=\d),(?=\d)"),
|
||||
"money": re.compile(
|
||||
r"(?i)[$£]\d+(?:\.\d+)?(?: hundred| thousand| (?:[bm]|tr)illion)*\b|[$£]\d+\.\d\d?\b"
|
||||
),
|
||||
"decimal": re.compile(r"\d*\.\d+"),
|
||||
"range": re.compile(r"(?<=\d)-(?=\d)"),
|
||||
"s_after_number": re.compile(r"(?<=\d)S"),
|
||||
"possessive_s": re.compile(r"(?<=[BCDFGHJ-NP-TV-Z])'?s\b"),
|
||||
"x_possessive": re.compile(r"(?<=X')S\b"),
|
||||
"initials": re.compile(r"(?:[A-Za-z]\.){2,} [a-z]"),
|
||||
"single_initial": re.compile(r"(?i)(?<=[A-Z])\.(?=[A-Z])"),
|
||||
}
|
||||
|
||||
|
||||
    def normalize(self, text: str) -> str:
        # Replace quotes and brackets
        text = text.replace(chr(8216), "'").replace(chr(8217), "'")
        text = text.replace("«", chr(8220)).replace("»", chr(8221))
        text = text.replace(chr(8220), '"').replace(chr(8221), '"')
        text = text.replace("(", "«").replace(")", "»")

        # Handle CJK punctuation
        for a, b in zip("、。!,:;?", ",.!,:;?"):
            text = text.replace(a, b + " ")

        # Use compiled patterns
        text = self.patterns['whitespace'].sub(" ", text)
        text = self.patterns['multi_space'].sub(" ", text)
        text = self.patterns['newline_space'].sub("", text)
        text = self.patterns['doctor'].sub("Doctor", text)
        text = self.patterns['mister'].sub("Mister", text)
        text = self.patterns['miss'].sub("Miss", text)
        text = self.patterns['mrs'].sub("Mrs", text)
        text = self.patterns['etc'].sub("etc", text)
        text = self.patterns['yeah'].sub(r"\1e'a", text)
        text = self.patterns['numbers'].sub(split_num, text)
        text = self.patterns['comma_in_number'].sub("", text)
        text = self.patterns['money'].sub(handle_money, text)
        text = self.patterns['decimal'].sub(handle_decimal, text)
        text = self.patterns['range'].sub(" to ", text)
        text = self.patterns['s_after_number'].sub(" S", text)
        text = self.patterns['possessive_s'].sub("'S", text)
        text = self.patterns['x_possessive'].sub("s", text)
        text = self.patterns['initials'].sub(lambda m: m.group().replace(".", "-"), text)
        text = self.patterns['single_initial'].sub("-", text)
        text = self.patterns["whitespace"].sub(" ", text)
        text = self.patterns["multi_space"].sub(" ", text)
        text = self.patterns["newline_space"].sub("", text)
        text = self.patterns["doctor"].sub("Doctor", text)
        text = self.patterns["mister"].sub("Mister", text)
        text = self.patterns["miss"].sub("Miss", text)
        text = self.patterns["mrs"].sub("Mrs", text)
        text = self.patterns["etc"].sub("etc", text)
        text = self.patterns["yeah"].sub(r"\1e'a", text)
        text = self.patterns["numbers"].sub(split_num, text)
        text = self.patterns["comma_in_number"].sub("", text)
        text = self.patterns["money"].sub(handle_money, text)
        text = self.patterns["decimal"].sub(handle_decimal, text)
        text = self.patterns["range"].sub(" to ", text)
        text = self.patterns["s_after_number"].sub(" S", text)
        text = self.patterns["possessive_s"].sub("'S", text)
        text = self.patterns["x_possessive"].sub("s", text)
        text = self.patterns["initials"].sub(
            lambda m: m.group().replace(".", "-"), text
        )
        text = self.patterns["single_initial"].sub("-", text)

        return text.strip()
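As a usage sketch for the class above (editorial aside; it depends on the `split_num`, `handle_money`, and `handle_decimal` helpers defined later in this file, and the sample string is illustrative), the normalizer is built once and then reused across calls, which is the point of pre-compiling:

```python
normalizer = TextNormalizerCompiled()
print(normalizer.normalize("Dr. Smith paid $50 on 3.14 acres."))
```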
class TextNormalizerHybrid:
    """Text normalizer using hybrid approach - compile only complex/frequent patterns"""

    def __init__(self):
        # Only compile patterns that are complex or frequently used
        self.patterns = {
            'whitespace': re.compile(r"[^\S \n]"),
            'numbers': re.compile(r"\d*\.\d+|\b\d{4}s?\b|(?<!:)\b(?:[1-9]|1[0-2]):[0-5]\d\b(?!:)"),
            'money': re.compile(r"(?i)[$£]\d+(?:\.\d+)?(?: hundred| thousand| (?:[bm]|tr)illion)*\b|[$£]\d+\.\d\d?\b"),
            'initials': re.compile(r"(?:[A-Za-z]\.){2,} [a-z]")
            "whitespace": re.compile(r"[^\S \n]"),
            "numbers": re.compile(
                r"\d*\.\d+|\b\d{4}s?\b|(?<!:)\b(?:[1-9]|1[0-2]):[0-5]\d\b(?!:)"
            ),
            "money": re.compile(
                r"(?i)[$£]\d+(?:\.\d+)?(?: hundred| thousand| (?:[bm]|tr)illion)*\b|[$£]\d+\.\d\d?\b"
            ),
            "initials": re.compile(r"(?:[A-Za-z]\.){2,} [a-z]"),
        }
    def normalize(self, text: str) -> str:
        # Replace quotes and brackets
        text = text.replace(chr(8216), "'").replace(chr(8217), "'")
        text = text.replace("«", chr(8220)).replace("»", chr(8221))
        text = text.replace(chr(8220), '"').replace(chr(8221), '"')
        text = text.replace("(", "«").replace(")", "»")

        # Handle CJK punctuation
        for a, b in zip("、。!,:;?", ",.!,:;?"):
            text = text.replace(a, b + " ")

        # Use compiled patterns for complex operations
        text = self.patterns['whitespace'].sub(" ", text)
        text = self.patterns['numbers'].sub(split_num, text)
        text = self.patterns['money'].sub(handle_money, text)
        text = self.patterns['initials'].sub(lambda m: m.group().replace(".", "-"), text)
        text = self.patterns["whitespace"].sub(" ", text)
        text = self.patterns["numbers"].sub(split_num, text)
        text = self.patterns["money"].sub(handle_money, text)
        text = self.patterns["initials"].sub(
            lambda m: m.group().replace(".", "-"), text
        )

        # Use inline patterns for simpler operations
        text = re.sub(r" +", " ", text)
        text = re.sub(r"(?<=\n) +(?=\n)", "", text)

@@ -179,9 +208,10 @@ class TextNormalizerHybrid:
        text = re.sub(r"(?<=[BCDFGHJ-NP-TV-Z])'?s\b", "'S", text)
        text = re.sub(r"(?<=X')S\b", "s", text)
        text = re.sub(r"(?i)(?<=[A-Z])\.(?=[A-Z])", "-", text)

        return text.strip()
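One reason all three variants tend to land close together in the benchmark below: the `re` module caches compiled patterns internally, so module-level `re.sub` calls skip recompilation after the first use. A minimal, self-contained illustration:

```python
import re
import timeit

pat = re.compile(r" +")
text = "a  b   c " * 1000

# Pre-compiled pattern vs. cached module-level call; timings usually land in
# the same ballpark because re reuses its internal pattern cache.
print(timeit.timeit(lambda: pat.sub(" ", text), number=1000))
print(timeit.timeit(lambda: re.sub(r" +", " ", text), number=1000))
```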
def split_num(match: re.Match) -> str:
    """Split numbers for TTS processing"""
    num = match.group(0)

@@ -192,61 +222,70 @@ def split_num(match: re.Match) -> str:
        return f"{num[:-1]} s"
    return num


def handle_money(match: re.Match) -> str:
    """Format money strings for TTS"""
    text = match.group(0)
    return text.replace("$", " dollars ").replace("£", " pounds ")


def handle_decimal(match: re.Match) -> str:
    """Format decimal numbers for TTS"""
    num = match.group(0)
    return num.replace(".", " point ")
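For a quick read on what the helpers above emit (behavior taken directly from the code; the sample strings are illustrative):

```python
import re

# handle_decimal and handle_money operate on re.Match objects, as in the
# normalizers above.
m = re.search(r"\d*\.\d+", "pi is 3.14")
print(handle_decimal(m))  # -> "3 point 14"

m = re.search(r"[$£]\d+", "it costs $50")
print(handle_money(m))    # -> " dollars 50"
```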
def benchmark_normalizers(test_cases: List[str], iterations: int = 100) -> Tuple[float, float, float]:
def benchmark_normalizers(
    test_cases: List[str], iterations: int = 100
) -> dict:  # {normalizer name: elapsed seconds}
    """Benchmark all three implementations"""

    normalizers = {
        'inline': TextNormalizerInline(),
        'compiled': TextNormalizerCompiled(),
        'hybrid': TextNormalizerHybrid()
        "inline": TextNormalizerInline(),
        "compiled": TextNormalizerCompiled(),
        "hybrid": TextNormalizerHybrid(),
    }

    results = {}

    # Test each normalizer
    for name, normalizer in normalizers.items():
        start = time.perf_counter()

        # Run normalizations
        for _ in range(iterations):
            for test in test_cases:
                normalizer.normalize(test)

        results[name] = time.perf_counter() - start

    return results
def verify_outputs(test_cases: List[str]) -> bool:
    """Verify that all implementations produce identical output"""
    normalizers = {
        'inline': TextNormalizerInline(),
        'compiled': TextNormalizerCompiled(),
        'hybrid': TextNormalizerHybrid()
        "inline": TextNormalizerInline(),
        "compiled": TextNormalizerCompiled(),
        "hybrid": TextNormalizerHybrid(),
    }

    for test in test_cases:
        results = [norm.normalize(test) for norm in normalizers.values()]
        if not all(r == results[0] for r in results):
            return False
    return True
def main():
    # Create test cases
    print("Generating test cases...")
    test_cases = create_test_cases()
    total_chars = sum(len(t) for t in test_cases)
    print(f"Created {len(test_cases)} test cases, total size: {total_chars:,} characters")
    print(
        f"Created {len(test_cases)} test cases, total size: {total_chars:,} characters"
    )

    # Verify output consistency
    print("\nVerifying output consistency...")
    if verify_outputs(test_cases):

@@ -254,15 +293,16 @@ def main():
    else:
        print("✗ Warning: Implementations produce different outputs!")
        return

    # Run benchmarks
    print("\nRunning benchmarks...")
    iterations = 100
    results = benchmark_normalizers(test_cases, iterations)

    # Print results
    print(f"\nResults for {iterations} iterations: ")
    for name, time_taken in results.items():
        print(f"{name.capitalize()}: {time_taken:.3f}s")


main()
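To make the timings from `benchmark_normalizers` easier to compare across machines, one option is to report each variant relative to the fastest (an editorial sketch; the numbers below are placeholders, not measured results):

```python
# Placeholder timings for illustration only; substitute the dict returned
# by benchmark_normalizers(test_cases, iterations).
results = {"inline": 1.40, "compiled": 1.10, "hybrid": 1.20}

fastest = min(results.values())
for name, elapsed in sorted(results.items(), key=lambda kv: kv[1]):
    print(f"{name}: {elapsed:.3f}s ({elapsed / fastest:.2f}x fastest)")
```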
@@ -1,8 +1,11 @@
import argparse
from typing import Any, Dict
from pathlib import Path

import numpy as np
import soundfile as sf
import argparse
from pathlib import Path
from typing import Dict, Any
from tqdm import tqdm
def validate_tts(wav_path: str) -> dict:
    """

@@ -13,34 +16,40 @@ def validate_tts(wav_path: str) -> dict:
        audio, sr = sf.read(wav_path)
        if len(audio.shape) > 1:
            audio = np.mean(audio, axis=1)

        duration = len(audio) / sr
        issues = []

        # Basic quality checks
        abs_audio = np.abs(audio)
        stats = {
            'rms': float(np.sqrt(np.mean(audio**2))),
            'peak': float(np.max(abs_audio)),
            'dc_offset': float(np.mean(audio))
            "rms": float(np.sqrt(np.mean(audio**2))),
            "peak": float(np.max(abs_audio)),
            "dc_offset": float(np.mean(audio)),
        }

        clip_count = np.sum(abs_audio >= 0.99)
        clip_percent = (clip_count / len(audio)) * 100

        if duration < 0.1:
            issues.append("WARNING: Audio is suspiciously short - possible failed generation")

        if stats['peak'] >= 1.0:
            issues.append(
                "WARNING: Audio is suspiciously short - possible failed generation"
            )

        if stats["peak"] >= 1.0:
            if clip_percent > 1.0:
                issues.append(f"WARNING: Significant clipping detected ({clip_percent:.2e}% of samples)")
                issues.append(
                    f"WARNING: Significant clipping detected ({clip_percent:.2e}% of samples)"
                )
            elif clip_percent > 0.01:
                issues.append(f"INFO: Minor peak limiting detected ({clip_percent:.2e}% of samples)")

        if stats['rms'] < 0.01:
                issues.append(
                    f"INFO: Minor peak limiting detected ({clip_percent:.2e}% of samples)"
                )

        if stats["rms"] < 0.01:
            issues.append("WARNING: Audio is very quiet - possible failed generation")

        if abs(stats['dc_offset']) > 0.1:
        if abs(stats["dc_offset"]) > 0.1:
            issues.append(f"WARNING: High DC offset ({stats['dc_offset']:.3f})")
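As a sanity check on the `stats` block above (an editorial sketch, not project code): a pure tone should show an RMS of about peak divided by sqrt(2) and a near-zero DC offset, which is exactly what those three expressions compute:

```python
import numpy as np

sr = 24000  # matches the project's sample rate
t = np.linspace(0, 1, sr, endpoint=False)
tone = 0.5 * np.sin(2 * np.pi * 440 * t)

print(float(np.sqrt(np.mean(tone**2))))  # ~0.354, i.e. 0.5 / sqrt(2)
print(float(np.max(np.abs(tone))))       # ~0.5 (peak)
print(float(np.mean(tone)))              # ~0.0 (DC offset)
```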
        # Check for long silence gaps

@@ -51,66 +60,79 @@ def validate_tts(wav_path: str) -> dict:
        window_size = int(min_silence * sr)
        silence_count = 0
        last_silence = -1

        start_idx = int(0.2 * sr)  # Skip first 0.2s
        for i in range(start_idx, len(db) - window_size, window_size):
            window = db[i:i+window_size]
        for i in tqdm(
            range(start_idx, len(db) - window_size, window_size),
            desc="Checking for silence",
        ):
            window = db[i : i + window_size]
            if np.mean(window) < silence_threshold:
                silent_ratio = np.mean(window < silence_threshold)
                if silent_ratio > 0.9:
                    if last_silence == -1 or (i/sr - last_silence) > 2.0:
                    if last_silence == -1 or (i / sr - last_silence) > 2.0:
                        silence_count += 1
                        last_silence = i/sr
                        issues.append(f"WARNING: Long silence detected at {i/sr:.2f}s (duration: {min_silence:.1f}s)")
                        last_silence = i / sr
                        issues.append(
                            f"WARNING: Long silence detected at {i/sr:.2f}s (duration: {min_silence:.1f}s)"
                        )

        if silence_count > 2:
            issues.append(f"WARNING: Multiple long silences found ({silence_count} total)")
            issues.append(
                f"WARNING: Multiple long silences found ({silence_count} total)"
            )

        # Detect audio artifacts
        diff = np.diff(audio)
        abs_diff = np.abs(diff)
        window_size = min(int(0.005 * sr), 256)
        window = np.ones(window_size)/window_size
        local_avg_diff = np.convolve(abs_diff, window, mode='same')
        window = np.ones(window_size) / window_size
        local_avg_diff = np.convolve(abs_diff, window, mode="same")

        spikes = (abs_diff > (10 * local_avg_diff)) & (abs_diff > 0.1)
        artifact_indices = np.nonzero(spikes)[0]

        artifacts = []
        if len(artifact_indices) > 0:
            gaps = np.diff(artifact_indices)
            min_gap = int(0.005 * sr)
            break_points = np.nonzero(gaps > min_gap)[0] + 1
            groups = np.split(artifact_indices, break_points)

            for group in groups:
                if len(group) >= 5:
                    severity = np.max(abs_diff[group])
                    if severity > 0.2:
                        center_idx = group[len(group)//2]
                        artifacts.append({
                            'time': float(center_idx/sr),  # Ensure float for consistent timing
                            'severity': float(severity)
                        })
                        center_idx = group[len(group) // 2]
                        artifacts.append(
                            {
                                "time": float(
                                    center_idx / sr
                                ),  # Ensure float for consistent timing
                                "severity": float(severity),
                            }
                        )
                        issues.append(
                            f"WARNING: Audio discontinuity at {center_idx/sr:.3f}s "
                            f"(severity: {severity:.3f})"
                        )

        # Check for repeated speech segments
        for chunk_duration in [5.0, 10.0]:
        for chunk_duration in tqdm(
            [0.5, 2.5, 5.0, 10.0], desc="Checking for repeated speech"
        ):
            chunk_size = int(chunk_duration * sr)
            overlap = int(0.2 * chunk_size)

            for i in range(0, len(audio) - 2*chunk_size, overlap):
                chunk1 = audio[i:i+chunk_size]
                chunk2 = audio[i+chunk_size:i+2*chunk_size]
            for i in range(0, len(audio) - 2 * chunk_size, overlap):
                chunk1 = audio[i : i + chunk_size]
                chunk2 = audio[i + chunk_size : i + 2 * chunk_size]

                if np.mean(np.abs(chunk1)) < 0.01 or np.mean(np.abs(chunk2)) < 0.01:
                    continue

                try:
                    correlation = np.corrcoef(chunk1, chunk2)[0,1]
                    correlation = np.corrcoef(chunk1, chunk2)[0, 1]
                    if not np.isnan(correlation) and correlation > 0.92:
                        issues.append(
                            f"WARNING: Possible repeated speech at {i/sr:.1f}s "
@@ -128,92 +150,113 @@ def validate_tts(wav_path: str) -> dict:
            "rms_level": f"{stats['rms']:.3f}",
            "dc_offset": f"{stats['dc_offset']:.3f}",
            "artifact_count": len(artifacts),
            "artifact_locations": [a['time'] for a in artifacts],
            "artifact_severities": [a['severity'] for a in artifacts],
            "artifact_locations": [a["time"] for a in artifacts],
            "artifact_severities": [a["severity"] for a in artifacts],
            "issues": issues,
            "valid": len(issues) == 0
        }

    except Exception as e:
        return {
            "file": wav_path,
            "error": str(e),
            "valid": False
            "valid": len(issues) == 0,
        }

def generate_analysis_plots(wav_path: str, output_dir: str, validation_result: Dict[str, Any]):
    except Exception as e:
        return {"file": wav_path, "error": str(e), "valid": False}


def generate_analysis_plots(
    wav_path: str, output_dir: str, validation_result: Dict[str, Any]
):
    """
    Generate analysis plots for audio file with time-aligned visualizations.
    """
    import matplotlib.pyplot as plt
    from scipy.signal import spectrogram
    # Load audio
    audio, sr = sf.read(wav_path)
    if len(audio.shape) > 1:
        audio = np.mean(audio, axis=1)

    # Create figure with shared x-axis
    fig = plt.figure(figsize=(15, 8))
    gs = plt.GridSpec(2, 1, height_ratios=[1.2, 0.8], hspace=0.1)
    ax1 = fig.add_subplot(gs[0])
    ax2 = fig.add_subplot(gs[1], sharex=ax1)

    # Calculate spectrogram
    nperseg = 2048
    noverlap = 1536
    f, t, Sxx = spectrogram(audio, sr, nperseg=nperseg, noverlap=noverlap,
                            window='hann', scaling='spectrum')
    f, t, Sxx = spectrogram(
        audio, sr, nperseg=nperseg, noverlap=noverlap, window="hann", scaling="spectrum"
    )

    # Plot spectrogram
    im = ax1.pcolormesh(t, f, 10 * np.log10(Sxx + 1e-10),
                        shading='gouraud', cmap='viridis',
                        vmin=-100, vmax=-20)
    ax1.set_ylabel('Frequency [Hz]', fontsize=10)
    cbar = plt.colorbar(im, ax=ax1, label='dB')
    ax1.set_title('Spectrogram', pad=10, fontsize=12)
    im = ax1.pcolormesh(
        t,
        f,
        10 * np.log10(Sxx + 1e-10),
        shading="gouraud",
        cmap="viridis",
        vmin=-100,
        vmax=-20,
    )
    ax1.set_ylabel("Frequency [Hz]", fontsize=10)
    cbar = plt.colorbar(im, ax=ax1, label="dB")
    ax1.set_title("Spectrogram", pad=10, fontsize=12)

    # Plot waveform with exact time alignment
    times = np.arange(len(audio)) / sr
    ax2.plot(times, audio, color='#2E5596', alpha=0.7, linewidth=0.5, label='Audio')
    ax2.set_ylabel('Amplitude', fontsize=10)
    ax2.set_xlabel('Time [sec]', fontsize=10)
    ax2.plot(times, audio, color="#2E5596", alpha=0.7, linewidth=0.5, label="Audio")
    ax2.set_ylabel("Amplitude", fontsize=10)
    ax2.set_xlabel("Time [sec]", fontsize=10)
    ax2.grid(True, alpha=0.2)

    # Add artifact markers
    if 'artifact_locations' in validation_result and validation_result['artifact_locations']:
        for loc in validation_result['artifact_locations']:
            ax1.axvline(x=loc, color='red', alpha=0.7, linewidth=2)
            ax2.axvline(x=loc, color='red', alpha=0.7, linewidth=2, label='Detected Artifacts')
    if (
        "artifact_locations" in validation_result
        and validation_result["artifact_locations"]
    ):
        for loc in validation_result["artifact_locations"]:
            ax1.axvline(x=loc, color="red", alpha=0.7, linewidth=2)
            ax2.axvline(
                x=loc, color="red", alpha=0.7, linewidth=2, label="Detected Artifacts"
            )

        # Add legend to both plots
        if len(validation_result['artifact_locations']) > 0:
            ax1.plot([], [], color='red', linewidth=2, label='Detected Artifacts')
            ax1.legend(loc='upper right', fontsize=8)
        if len(validation_result["artifact_locations"]) > 0:
            ax1.plot([], [], color="red", linewidth=2, label="Detected Artifacts")
            ax1.legend(loc="upper right", fontsize=8)
            # Only add unique labels to legend
            handles, labels = ax2.get_legend_handles_labels()
            unique_labels = dict(zip(labels, handles))
            ax2.legend(unique_labels.values(), unique_labels.keys(),
                       loc='upper right', fontsize=8)
            ax2.legend(
                unique_labels.values(),
                unique_labels.keys(),
                loc="upper right",
                fontsize=8,
            )

    # Set common x limits
    xlim = (0, len(audio)/sr)
    xlim = (0, len(audio) / sr)
    ax1.set_xlim(xlim)
    ax2.set_xlim(xlim)
    og_filename = Path(wav_path).name.split(".")[0]
    # Save plot
    plt.savefig(Path(output_dir) / f"{og_filename}_audio_analysis.png", dpi=300, bbox_inches='tight')
    plt.savefig(
        Path(output_dir) / f"{og_filename}_audio_analysis.png",
        dpi=300,
        bbox_inches="tight",
    )
    plt.close()
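The repeated-speech heuristic in `validate_tts` above compares adjacent chunks with `np.corrcoef`; a toy illustration (editorial sketch, not project code) of why identical audio clears the 0.92 threshold while unrelated audio does not:

```python
import numpy as np

rng = np.random.default_rng(0)
a = rng.standard_normal(24000)  # one second at the project's sample rate
b = rng.standard_normal(24000)

print(np.corrcoef(a, a)[0, 1])  # identical chunks -> 1.0
print(np.corrcoef(a, b)[0, 1])  # unrelated chunks -> ~0.0
```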
if __name__ == "__main__":
    wav_file = r"C:\Users\jerem\Desktop\Kokoro-FastAPI\examples\output.wav"
    silent=False

if __name__ == "__main__":
    wav_file = r"C:\Users\jerem\Desktop\Kokoro-FastAPI\examples\assorted_checks\benchmarks\output_audio\chunk_600_tokens.wav"
    silent = False

    print(f"\n\n Processing:\n\t{wav_file}")
    result = validate_tts(wav_file)
    if not silent:
        wav_root_dir = Path(wav_file).parent
        generate_analysis_plots(wav_file, wav_root_dir, result)

    print(f"\nValidating: {result['file']}")
    if "error" in result:
        print(f"Error: {result['error']}")

@@ -224,10 +267,10 @@ if __name__ == "__main__":
        print(f"RMS Level: {result['rms_level']}")
        print(f"DC Offset: {result['dc_offset']}")
        print(f"Detected Artifacts: {result['artifact_count']}")

    if result["issues"]:
        print("\nIssues Found:")
        for issue in result["issues"]:
            print(f"- {issue}")
    else:
        print("\nNo issues found")
@@ -1,7 +1,9 @@
import argparse
from pathlib import Path

from validate_wav import validate_tts


def print_validation_result(result: dict, rel_path: Path):
    """Print full validation details for a single file."""
    print(f"\nValidating: {rel_path}")

@@ -13,7 +15,7 @@ def print_validation_result(result: dict, rel_path: Path):
    print(f"Peak Amplitude: {result['peak_amplitude']}")
    print(f"RMS Level: {result['rms_level']}")
    print(f"DC Offset: {result['dc_offset']}")

    if result["issues"]:
        print("\nIssues Found:")
        for issue in result["issues"]:

@@ -21,25 +23,26 @@ def print_validation_result(result: dict, rel_path: Path):
    else:
        print("\nNo issues found")


def validate_directory(directory: str):
    """Validate all wav files in a directory with detailed output and summary."""
    dir_path = Path(directory)

    # Find all wav files (including nested directories)
    wav_files = list(dir_path.rglob("*.wav"))
    wav_files.extend(dir_path.rglob("*.mp3"))  # Also check mp3s
    wav_files = sorted(wav_files)

    if not wav_files:
        print(f"No .wav or .mp3 files found in {directory}")
        return

    print(f"Found {len(wav_files)} files in {directory}")
    print("=" * 80)

    # Store results for summary
    results = []

    # Detailed validation output
    for wav_file in wav_files:
        result = validate_tts(str(wav_file))

@@ -47,7 +50,7 @@ def validate_directory(directory: str):
        print_validation_result(result, rel_path)
        results.append((rel_path, result))
        print("=" * 80)

    # Summary with detailed issues
    print("\nSUMMARY:")
    for rel_path, result in results:

@@ -58,15 +61,18 @@ def validate_directory(directory: str):
            issues = result["issues"]
            first_issue = issues[0].replace("WARNING: ", "")
            if len(issues) > 1:
                print(f"{rel_path}: FAIL - {first_issue} (+{len(issues)-1} more issues)")
                print(
                    f"{rel_path}: FAIL - {first_issue} (+{len(issues)-1} more issues)"
                )
            else:
                print(f"{rel_path}: FAIL - {first_issue}")
        else:
            print(f"{rel_path}: PASS")


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Batch validate TTS wav files")
    parser.add_argument("directory", help="Directory containing wav files to validate")
    args = parser.parse_args()

    validate_directory(args.directory)
@@ -13,7 +13,7 @@ numpy==2.2.1
scipy==1.14.1

# Audio processing
soundfile==0.12.1
soundfile==0.13.0

# Text processing
phonemizer==3.3.0
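Given the bump from soundfile 0.12.1 to 0.13.0 above, a quick environment check (editorial sketch) can catch a stale install before running the API:

```python
import soundfile as sf

# Fails loudly if the installed wheel doesn't match the requirements pin.
assert sf.__version__ == "0.13.0", f"expected soundfile 0.13.0, got {sf.__version__}"
```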
|