- CPU (ONNX) and GPU (PyTorch CUDA) inference paths, both functional

- Incorporated the text processing module as a service, working toward modularization and optimization
- Added text processing router for phonemization
- Enhanced benchmark statistics with real-time speed metrics
This commit is contained in:
remsky 2025-01-03 17:54:17 -07:00
parent 9496a3a63f
commit 7df2a68fb4
60 changed files with 5478 additions and 1680 deletions

10
.gitignore vendored
View file

@@ -1,5 +1,6 @@
output/
output/*
output_audio/*
ui/data/*
*.db
@@ -16,3 +17,10 @@ env/
.coverage
examples/assorted_checks/benchmarks/output_audio/*
examples/assorted_checks/test_combinations/output/*
examples/assorted_checks/test_openai/output/*
examples/assorted_checks/test_voices/output/*
examples/assorted_checks/test_formats/output/*
ui/RepoScreenshot.png

View file

@@ -3,8 +3,8 @@
</p>
# Kokoro TTS API
[![Tests](https://img.shields.io/badge/tests-89%20passed-darkgreen)]()
[![Coverage](https://img.shields.io/badge/coverage-80%25-darkgreen)]()
[![Tests](https://img.shields.io/badge/tests-95%20passed-darkgreen)]()
[![Coverage](https://img.shields.io/badge/coverage-72%25-darkgreen)]()
[![Tested at Model Commit](https://img.shields.io/badge/last--tested--model--commit-a67f113-blue)](https://huggingface.co/hexgrad/Kokoro-82M/tree/c3b0d86e2a980e027ef71c28819ea02e351c2667) [![Try on Spaces](https://img.shields.io/badge/%F0%9F%A4%97%20Try%20on-Spaces-blue)](https://huggingface.co/spaces/Remsky/Kokoro-TTS-Zero)
Dockerized FastAPI wrapper for [Kokoro-82M](https://huggingface.co/hexgrad/Kokoro-82M) text-to-speech model

View file

@@ -14,7 +14,8 @@ class Settings(BaseSettings):
output_dir_size_limit_mb: float = 500.0 # Maximum size of output directory in MB
default_voice: str = "af"
model_dir: str = "/app/Kokoro-82M" # Base directory for model files
model_path: str = "kokoro-v0_19.pth"
pytorch_model_path: str = "kokoro-v0_19.pth"
onnx_model_path: str = "kokoro-v0_19.onnx"
voices_dir: str = "voices"
sample_rate: int = 24000

185
api/src/core/kokoro.py Normal file
View file

@@ -0,0 +1,185 @@
import re
import torch
import phonemizer
def split_num(num):
num = num.group()
if "." in num:
return num
elif ":" in num:
h, m = [int(n) for n in num.split(":")]
if m == 0:
return f"{h} o'clock"
elif m < 10:
return f"{h} oh {m}"
return f"{h} {m}"
year = int(num[:4])
if year < 1100 or year % 1000 < 10:
return num
left, right = num[:2], int(num[2:4])
s = "s" if num.endswith("s") else ""
if 100 <= year % 1000 <= 999:
if right == 0:
return f"{left} hundred{s}"
elif right < 10:
return f"{left} oh {right}{s}"
return f"{left} {right}{s}"
def flip_money(m):
m = m.group()
bill = "dollar" if m[0] == "$" else "pound"
if m[-1].isalpha():
return f"{m[1:]} {bill}s"
elif "." not in m:
s = "" if m[1:] == "1" else "s"
return f"{m[1:]} {bill}{s}"
b, c = m[1:].split(".")
s = "" if b == "1" else "s"
c = int(c.ljust(2, "0"))
coins = (
f"cent{'' if c == 1 else 's'}"
if m[0] == "$"
else ("penny" if c == 1 else "pence")
)
return f"{b} {bill}{s} and {c} {coins}"
def point_num(num):
a, b = num.group().split(".")
return " point ".join([a, " ".join(b)])
def normalize_text(text):
text = text.replace(chr(8216), "'").replace(chr(8217), "'")
text = text.replace("«", chr(8220)).replace("»", chr(8221))
text = text.replace(chr(8220), '"').replace(chr(8221), '"')
text = text.replace("(", "«").replace(")", "»")
for a, b in zip("、。!,:;?", ",.!,:;?"):
text = text.replace(a, b + " ")
text = re.sub(r"[^\S \n]", " ", text)
text = re.sub(r" +", " ", text)
text = re.sub(r"(?<=\n) +(?=\n)", "", text)
text = re.sub(r"\bD[Rr]\.(?= [A-Z])", "Doctor", text)
text = re.sub(r"\b(?:Mr\.|MR\.(?= [A-Z]))", "Mister", text)
text = re.sub(r"\b(?:Ms\.|MS\.(?= [A-Z]))", "Miss", text)
text = re.sub(r"\b(?:Mrs\.|MRS\.(?= [A-Z]))", "Mrs", text)
text = re.sub(r"\betc\.(?! [A-Z])", "etc", text)
text = re.sub(r"(?i)\b(y)eah?\b", r"\1e'a", text)
text = re.sub(
r"\d*\.\d+|\b\d{4}s?\b|(?<!:)\b(?:[1-9]|1[0-2]):[0-5]\d\b(?!:)", split_num, text
)
text = re.sub(r"(?<=\d),(?=\d)", "", text)
text = re.sub(
r"(?i)[$£]\d+(?:\.\d+)?(?: hundred| thousand| (?:[bm]|tr)illion)*\b|[$£]\d+\.\d\d?\b",
flip_money,
text,
)
text = re.sub(r"\d*\.\d+", point_num, text)
text = re.sub(r"(?<=\d)-(?=\d)", " to ", text)
text = re.sub(r"(?<=\d)S", " S", text)
text = re.sub(r"(?<=[BCDFGHJ-NP-TV-Z])'?s\b", "'S", text)
text = re.sub(r"(?<=X')S\b", "s", text)
text = re.sub(
r"(?:[A-Za-z]\.){2,} [a-z]", lambda m: m.group().replace(".", "-"), text
)
text = re.sub(r"(?i)(?<=[A-Z])\.(?=[A-Z])", "-", text)
return text.strip()
def get_vocab():
_pad = "$"
_punctuation = ';:,.!?¡¿—…"«»“” '
_letters = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"
_letters_ipa = "ɑɐɒæɓʙβɔɕçɗɖðʤəɘɚɛɜɝɞɟʄɡɠɢʛɦɧħɥʜɨɪʝɭɬɫɮʟɱɯɰŋɳɲɴøɵɸθœɶʘɹɺɾɻʀʁɽʂʃʈʧʉʊʋⱱʌɣɤʍχʎʏʑʐʒʔʡʕʢǀǁǂǃˈˌːˑʼʴʰʱʲʷˠˤ˞↓↑→↗↘'̩'"
symbols = [_pad] + list(_punctuation) + list(_letters) + list(_letters_ipa)
dicts = {}
for i in range(len((symbols))):
dicts[symbols[i]] = i
return dicts
VOCAB = get_vocab()
def tokenize(ps):
return [i for i in map(VOCAB.get, ps) if i is not None]
phonemizers = dict(
a=phonemizer.backend.EspeakBackend(
language="en-us", preserve_punctuation=True, with_stress=True
),
b=phonemizer.backend.EspeakBackend(
language="en-gb", preserve_punctuation=True, with_stress=True
),
)
def phonemize(text, lang, norm=True):
if norm:
text = normalize_text(text)
ps = phonemizers[lang].phonemize([text])
ps = ps[0] if ps else ""
# https://en.wiktionary.org/wiki/kokoro#English
ps = ps.replace("kəkˈoːɹoʊ", "kˈoʊkəɹoʊ").replace("kəkˈɔːɹəʊ", "kˈəʊkəɹəʊ")
ps = ps.replace("ʲ", "j").replace("r", "ɹ").replace("x", "k").replace("ɬ", "l")
ps = re.sub(r"(?<=[a-zɹː])(?=hˈʌndɹɪd)", " ", ps)
ps = re.sub(r' z(?=[;:,.!?¡¿—…"«»“” ]|$)', "z", ps)
if lang == "a":
ps = re.sub(r"(?<=nˈaɪn)ti(?!ː)", "di", ps)
ps = "".join(filter(lambda p: p in VOCAB, ps))
return ps.strip()
def length_to_mask(lengths):
mask = (
torch.arange(lengths.max())
.unsqueeze(0)
.expand(lengths.shape[0], -1)
.type_as(lengths)
)
mask = torch.gt(mask + 1, lengths.unsqueeze(1))
return mask
@torch.no_grad()
def forward(model, tokens, ref_s, speed):
device = ref_s.device
tokens = torch.LongTensor([[0, *tokens, 0]]).to(device)
input_lengths = torch.LongTensor([tokens.shape[-1]]).to(device)
text_mask = length_to_mask(input_lengths).to(device)
bert_dur = model.bert(tokens, attention_mask=(~text_mask).int())
d_en = model.bert_encoder(bert_dur).transpose(-1, -2)
s = ref_s[:, 128:]
d = model.predictor.text_encoder(d_en, s, input_lengths, text_mask)
x, _ = model.predictor.lstm(d)
duration = model.predictor.duration_proj(x)
duration = torch.sigmoid(duration).sum(axis=-1) / speed
pred_dur = torch.round(duration).clamp(min=1).long()
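# Expand each token's predicted duration into a hard monotonic alignment matrix
# (tokens x frames): row i is set to 1 over that token's span of output frames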
pred_aln_trg = torch.zeros(input_lengths, pred_dur.sum().item())
c_frame = 0
for i in range(pred_aln_trg.size(0)):
pred_aln_trg[i, c_frame : c_frame + pred_dur[0, i].item()] = 1
c_frame += pred_dur[0, i].item()
en = d.transpose(-1, -2) @ pred_aln_trg.unsqueeze(0).to(device)
F0_pred, N_pred = model.predictor.F0Ntrain(en, s)
t_en = model.text_encoder(tokens, input_lengths, text_mask)
asr = t_en @ pred_aln_trg.unsqueeze(0).to(device)
return model.decoder(asr, F0_pred, N_pred, ref_s[:, :128]).squeeze().cpu().numpy()
def generate(model, text, voicepack, lang="a", speed=1):
ps = phonemize(text, lang)
tokens = tokenize(ps)
if not tokens:
return None
elif len(tokens) > 510:
tokens = tokens[:510]
print("Truncated to 510 tokens")
ref_s = voicepack[len(tokens)]
out = forward(model, tokens, ref_s, speed)
ps = "".join(next(k for k, v in VOCAB.items() if i == v) for i in tokens)
return out, ps

View file

@@ -13,6 +13,7 @@ from .core.config import settings
from .services.tts_model import TTSModel
from .services.tts_service import TTSService
from .routers.openai_compatible import router as openai_router
from .routers.text_processing import router as text_router
@asynccontextmanager
@@ -45,8 +46,9 @@ app.add_middleware(
allow_headers=["*"],
)
# Include OpenAI compatible router
# Include routers
app.include_router(openai_router, prefix="/v1")
app.include_router(text_router)
# Health check endpoint

View file

@@ -0,0 +1,30 @@
from fastapi import APIRouter
from ..structures.text_schemas import PhonemeRequest, PhonemeResponse
from ..services.text_processing import phonemize, tokenize
router = APIRouter(
prefix="/text",
tags=["text processing"]
)
@router.post("/phonemize", response_model=PhonemeResponse)
async def phonemize_text(request: PhonemeRequest) -> PhonemeResponse:
"""Convert text to phonemes and tokens: Rough attempt
Args:
request: Request containing text and language
Returns:
Phonemes and token IDs
"""
# Get phonemes
phonemes = phonemize(request.text, request.language)
# Get tokens
tokens = tokenize(phonemes)
tokens = [0] + tokens + [0] # Add start/end tokens
return PhonemeResponse(
phonemes=phonemes,
tokens=tokens
)
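A quick usage sketch for the new endpoint; the base URL below is an assumption for a local deployment and should be adjusted to wherever the API is actually served:

import requests

resp = requests.post(
    "http://localhost:8880/text/phonemize",         # assumed local host/port
    json={"text": "Hello world", "language": "a"},  # "a" = American English
)
payload = resp.json()
print(payload["phonemes"])  # phoneme string
print(payload["tokens"])    # token IDs, padded with the start/end token 0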

View file

@@ -0,0 +1,13 @@
from .normalizer import normalize_text
from .phonemizer import phonemize, PhonemizerBackend, EspeakBackend
from .vocabulary import tokenize, decode_tokens, VOCAB
__all__ = [
'normalize_text',
'phonemize',
'tokenize',
'decode_tokens',
'VOCAB',
'PhonemizerBackend',
'EspeakBackend'
]

View file

@@ -0,0 +1,111 @@
import re
def split_num(num: re.Match) -> str:
"""Handle number splitting for various formats"""
num = num.group()
if "." in num:
return num
elif ":" in num:
h, m = [int(n) for n in num.split(":")]
if m == 0:
return f"{h} o'clock"
elif m < 10:
return f"{h} oh {m}"
return f"{h} {m}"
year = int(num[:4])
if year < 1100 or year % 1000 < 10:
return num
left, right = num[:2], int(num[2:4])
s = "s" if num.endswith("s") else ""
if 100 <= year % 1000 <= 999:
if right == 0:
return f"{left} hundred{s}"
elif right < 10:
return f"{left} oh {right}{s}"
return f"{left} {right}{s}"
def handle_money(m: re.Match) -> str:
"""Convert money expressions to spoken form"""
m = m.group()
bill = "dollar" if m[0] == "$" else "pound"
if m[-1].isalpha():
return f"{m[1:]} {bill}s"
elif "." not in m:
s = "" if m[1:] == "1" else "s"
return f"{m[1:]} {bill}{s}"
b, c = m[1:].split(".")
s = "" if b == "1" else "s"
c = int(c.ljust(2, "0"))
coins = (
f"cent{'' if c == 1 else 's'}"
if m[0] == "$"
else ("penny" if c == 1 else "pence")
)
return f"{b} {bill}{s} and {c} {coins}"
def handle_decimal(num: re.Match) -> str:
"""Convert decimal numbers to spoken form"""
a, b = num.group().split(".")
return " point ".join([a, " ".join(b)])
def normalize_text(text: str) -> str:
"""Normalize text for TTS processing
Args:
text: Input text to normalize
Returns:
Normalized text
"""
# Replace quotes and brackets
text = text.replace(chr(8216), "'").replace(chr(8217), "'")
text = text.replace("«", chr(8220)).replace("»", chr(8221))
text = text.replace(chr(8220), '"').replace(chr(8221), '"')
text = text.replace("(", "«").replace(")", "»")
# Handle CJK punctuation
for a, b in zip("、。!,:;?", ",.!,:;?"):
text = text.replace(a, b + " ")
# Clean up whitespace
text = re.sub(r"[^\S \n]", " ", text)
text = re.sub(r" +", " ", text)
text = re.sub(r"(?<=\n) +(?=\n)", "", text)
# Handle titles and abbreviations
text = re.sub(r"\bD[Rr]\.(?= [A-Z])", "Doctor", text)
text = re.sub(r"\b(?:Mr\.|MR\.(?= [A-Z]))", "Mister", text)
text = re.sub(r"\b(?:Ms\.|MS\.(?= [A-Z]))", "Miss", text)
text = re.sub(r"\b(?:Mrs\.|MRS\.(?= [A-Z]))", "Mrs", text)
text = re.sub(r"\betc\.(?! [A-Z])", "etc", text)
# Handle common words
text = re.sub(r"(?i)\b(y)eah?\b", r"\1e'a", text)
# Handle numbers and money
text = re.sub(
r"\d*\.\d+|\b\d{4}s?\b|(?<!:)\b(?:[1-9]|1[0-2]):[0-5]\d\b(?!:)",
split_num,
text
)
text = re.sub(r"(?<=\d),(?=\d)", "", text)
text = re.sub(
r"(?i)[$£]\d+(?:\.\d+)?(?: hundred| thousand| (?:[bm]|tr)illion)*\b|[$£]\d+\.\d\d?\b",
handle_money,
text,
)
text = re.sub(r"\d*\.\d+", handle_decimal, text)
# Handle various formatting
text = re.sub(r"(?<=\d)-(?=\d)", " to ", text)
text = re.sub(r"(?<=\d)S", " S", text)
text = re.sub(r"(?<=[BCDFGHJ-NP-TV-Z])'?s\b", "'S", text)
text = re.sub(r"(?<=X')S\b", "s", text)
text = re.sub(
r"(?:[A-Za-z]\.){2,} [a-z]",
lambda m: m.group().replace(".", "-"),
text
)
text = re.sub(r"(?i)(?<=[A-Z])\.(?=[A-Z])", "-", text)
return text.strip()
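For illustration only, a hand-traced example of the kind of rewriting normalize_text performs (not verified against the running code):

normalize_text("Dr. Smith paid $5.50 at 3:30")
# roughly -> "Doctor Smith paid 5 dollars and 50 cents at 3 30"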

View file

@@ -0,0 +1,97 @@
import re
from abc import ABC, abstractmethod
import phonemizer
from .normalizer import normalize_text
class PhonemizerBackend(ABC):
"""Abstract base class for phonemization backends"""
@abstractmethod
def phonemize(self, text: str) -> str:
"""Convert text to phonemes
Args:
text: Text to convert to phonemes
Returns:
Phonemized text
"""
pass
class EspeakBackend(PhonemizerBackend):
"""Espeak-based phonemizer implementation"""
def __init__(self, language: str):
"""Initialize espeak backend
Args:
language: Language code ('en-us' or 'en-gb')
"""
self.backend = phonemizer.backend.EspeakBackend(
language=language,
preserve_punctuation=True,
with_stress=True
)
self.language = language
def phonemize(self, text: str) -> str:
"""Convert text to phonemes using espeak
Args:
text: Text to convert to phonemes
Returns:
Phonemized text
"""
# Phonemize text
ps = self.backend.phonemize([text])
ps = ps[0] if ps else ""
# Handle special cases
ps = ps.replace("kəkˈoːɹoʊ", "kˈoʊkəɹoʊ").replace("kəkˈɔːɹəʊ", "kˈəʊkəɹəʊ")
ps = ps.replace("ʲ", "j").replace("r", "ɹ").replace("x", "k").replace("ɬ", "l")
ps = re.sub(r"(?<=[a-zɹː])(?=hˈʌndɹɪd)", " ", ps)
ps = re.sub(r' z(?=[;:,.!?¡¿—…"«»“” ]|$)', "z", ps)
# Language-specific rules
if self.language == "en-us":
ps = re.sub(r"(?<=nˈaɪn)ti(?!ː)", "di", ps)
return ps.strip()
def create_phonemizer(language: str = "a") -> PhonemizerBackend:
"""Factory function to create phonemizer backend
Args:
language: Language code ('a' for US English, 'b' for British English)
Returns:
Phonemizer backend instance
"""
# Map language codes to espeak language codes
lang_map = {
"a": "en-us",
"b": "en-gb"
}
if language not in lang_map:
raise ValueError(f"Unsupported language code: {language}")
return EspeakBackend(lang_map[language])
def phonemize(text: str, language: str = "a", normalize: bool = True) -> str:
"""Convert text to phonemes
Args:
text: Text to convert to phonemes
language: Language code ('a' for US English, 'b' for British English)
normalize: Whether to normalize text before phonemization
Returns:
Phonemized text
"""
if normalize:
text = normalize_text(text)
phonemizer = create_phonemizer(language)
return phonemizer.phonemize(text)
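A minimal usage sketch of the module-level helper, assuming espeak is installed and the import path matches the project layout:

from api.src.services.text_processing import phonemize

ps = phonemize("Hello world", language="a")  # "a" maps to en-us via create_phonemizer
print(ps)  # IPA phoneme string; input is normalized first since normalize=True by default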

View file

@@ -0,0 +1,37 @@
def get_vocab():
"""Get the vocabulary dictionary mapping characters to token IDs"""
_pad = "$"
_punctuation = ';:,.!?¡¿—…"«»“” '
_letters = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"
_letters_ipa = "ɑɐɒæɓʙβɔɕçɗɖðʤəɘɚɛɜɝɞɟʄɡɠɢʛɦɧħɥʜɨɪʝɭɬɫɮʟɱɯɰŋɳɲɴøɵɸθœɶʘɹɺɾɻʀʁɽʂʃʈʧʉʊʋⱱʌɣɤʍχʎʏʑʐʒʔʡʕʢǀǁǂǃˈˌːˑʼʴʰʱʲʷˠˤ˞↓↑→↗↘'̩'"
# Create vocabulary dictionary
symbols = [_pad] + list(_punctuation) + list(_letters) + list(_letters_ipa)
return {symbol: i for i, symbol in enumerate(symbols)}
# Initialize vocabulary
VOCAB = get_vocab()
def tokenize(phonemes: str) -> list[int]:
"""Convert phonemes string to token IDs
Args:
phonemes: String of phonemes to tokenize
Returns:
List of token IDs
"""
return [i for i in map(VOCAB.get, phonemes) if i is not None]
def decode_tokens(tokens: list[int]) -> str:
"""Convert token IDs back to phonemes string
Args:
tokens: List of token IDs
Returns:
String of phonemes
"""
# Create reverse mapping
id_to_symbol = {i: s for s, i in VOCAB.items()}
return "".join(id_to_symbol[t] for t in tokens)

View file

@@ -1,15 +1,13 @@
import os
import threading
from abc import ABC, abstractmethod
from typing import List, Tuple
import torch
import numpy as np
from loguru import logger
from kokoro import tokenize, phonemize
from typing import Union, List
from ..core.config import settings
class TTSBaseModel(ABC):
_instance = None
_lock = threading.Lock()
@@ -28,16 +26,18 @@ class TTSBaseModel(ABC):
# Test CUDA device
test_tensor = torch.zeros(1).cuda()
logger.info("CUDA test successful")
model_path = os.path.join(settings.model_dir, settings.pytorch_model_path)
cls._device = "cuda"
except Exception as e:
logger.error(f"CUDA test failed: {e}")
cls._device = "cpu"
else:
cls._device = "cpu"
model_path = os.path.join(settings.model_dir, settings.onnx_model_path)
logger.info(f"Initializing model on {cls._device}")
# Initialize model
if not cls.initialize(settings.model_dir, settings.model_path):
if not cls.initialize(settings.model_dir, model_path=model_path):
raise RuntimeError(f"Failed to initialize {cls._device.upper()} model")
# Setup voices directory
@@ -65,13 +65,9 @@ class TTSBaseModel(ABC):
voice_path = os.path.join(cls.VOICES_DIR, "af.pt")
dummy_voicepack = torch.load(voice_path, map_location=cls._device, weights_only=True)
if cls._device == "cuda":
cls.generate(dummy_text, dummy_voicepack, "a", 1.0)
else:
ps = phonemize(dummy_text, "a")
tokens = tokenize(ps)
tokens = [0] + tokens + [0]
cls.generate(tokens, dummy_voicepack, 1.0)
# Process text and generate audio
phonemes, tokens = cls.process_text(dummy_text, "a")
cls.generate_from_tokens(tokens, dummy_voicepack, 1.0)
logger.info("Model warm-up complete")
except Exception as e:
@@ -89,13 +85,43 @@ class TTSBaseModel(ABC):
@classmethod
@abstractmethod
def generate(cls, input_data: Union[str, List[int]], voicepack: torch.Tensor, *args) -> np.ndarray:
"""Generate audio from input
def process_text(cls, text: str, language: str) -> Tuple[str, List[int]]:
"""Process text into phonemes and tokens
Args:
input_data: Either text string (GPU) or tokenized input (CPU)
text: Input text
language: Language code
Returns:
tuple[str, list[int]]: Phonemes and token IDs
"""
pass
@classmethod
@abstractmethod
def generate_from_text(cls, text: str, voicepack: torch.Tensor, language: str, speed: float) -> Tuple[np.ndarray, str]:
"""Generate audio from text
Args:
text: Input text
voicepack: Voice tensor
*args: Additional args (lang+speed for GPU, speed for CPU)
language: Language code
speed: Speed factor
Returns:
tuple[np.ndarray, str]: Generated audio samples and phonemes
"""
pass
@classmethod
@abstractmethod
def generate_from_tokens(cls, tokens: List[int], voicepack: torch.Tensor, speed: float) -> np.ndarray:
"""Generate audio from tokens
Args:
tokens: Token IDs
voicepack: Voice tensor
speed: Speed factor
Returns:
np.ndarray: Generated audio samples

View file

@@ -5,6 +5,8 @@ from onnxruntime import InferenceSession, SessionOptions, GraphOptimizationLevel
from loguru import logger
from .tts_base import TTSBaseModel
from .text_processing import phonemize, tokenize
from ..core.config import settings
class TTSCPUModel(TTSBaseModel):
_instance = None
@@ -15,22 +17,12 @@ class TTSCPUModel(TTSBaseModel):
"""Initialize ONNX model for CPU inference"""
if cls._onnx_session is None:
# Try loading ONNX model
# First try the specified path if provided
if model_path and model_path.endswith('.onnx'):
onnx_path = os.path.join(model_dir, model_path)
if os.path.exists(onnx_path):
logger.info(f"Loading specified ONNX model from {onnx_path}")
else:
onnx_path = None
onnx_path = os.path.join(model_dir, settings.onnx_model_path)
if os.path.exists(onnx_path):
logger.info(f"Loading ONNX model from {onnx_path}")
else:
# Look for any .onnx file in the directory as fallback
onnx_files = [f for f in os.listdir(model_dir) if f.endswith('.onnx')]
if onnx_files:
onnx_path = os.path.join(model_dir, onnx_files[0])
logger.info(f"Found ONNX model: {onnx_path}")
else:
logger.error(f"No ONNX model found in {model_dir}")
return None
logger.error(f"ONNX model not found at {onnx_path}")
return None
if not onnx_path:
return None
@@ -62,13 +54,53 @@
return cls._onnx_session
@classmethod
def generate(cls, input_data: list[int], voicepack: torch.Tensor, *args) -> np.ndarray:
"""Generate audio using ONNX model
def process_text(cls, text: str, language: str) -> tuple[str, list[int]]:
"""Process text into phonemes and tokens
Args:
input_data: list of token IDs
text: Input text
language: Language code
Returns:
tuple[str, list[int]]: Phonemes and token IDs
"""
phonemes = phonemize(text, language)
tokens = tokenize(phonemes)
tokens = [0] + tokens + [0] # Add start/end tokens
return phonemes, tokens
@classmethod
def generate_from_text(cls, text: str, voicepack: torch.Tensor, language: str, speed: float) -> tuple[np.ndarray, str]:
"""Generate audio from text
Args:
text: Input text
voicepack: Voice tensor
*args: (speed,) tuple
language: Language code
speed: Speed factor
Returns:
tuple[np.ndarray, str]: Generated audio samples and phonemes
"""
if cls._onnx_session is None:
raise RuntimeError("ONNX model not initialized")
# Process text
phonemes, tokens = cls.process_text(text, language)
# Generate audio
audio = cls.generate_from_tokens(tokens, voicepack, speed)
return audio, phonemes
@classmethod
def generate_from_tokens(cls, tokens: list[int], voicepack: torch.Tensor, speed: float) -> np.ndarray:
"""Generate audio from tokens
Args:
tokens: Token IDs
voicepack: Voice tensor
speed: Speed factor
Returns:
np.ndarray: Generated audio samples
@@ -76,10 +108,9 @@ class TTSCPUModel(TTSBaseModel):
if cls._onnx_session is None:
raise RuntimeError("ONNX model not initialized")
speed = args[0]
# Pre-allocate and prepare inputs
tokens_input = np.array([input_data], dtype=np.int64)
style_input = voicepack[len(input_data)-2].numpy() # Already has correct dimensions
tokens_input = np.array([tokens], dtype=np.int64)
style_input = voicepack[len(tokens)-2].numpy() # Already has correct dimensions
speed_input = np.full(1, speed, dtype=np.float32) # More efficient than ones * speed
# Run inference with optimized inputs

View file

@@ -3,9 +3,47 @@ import numpy as np
import torch
from loguru import logger
from models import build_model
from kokoro import generate
from .text_processing import phonemize, tokenize
from .tts_base import TTSBaseModel
from ..core.config import settings
@torch.no_grad()
def forward(model, tokens, ref_s, speed):
"""Forward pass through the model"""
device = ref_s.device
tokens = torch.LongTensor([[0, *tokens, 0]]).to(device)
input_lengths = torch.LongTensor([tokens.shape[-1]]).to(device)
text_mask = length_to_mask(input_lengths).to(device)
bert_dur = model.bert(tokens, attention_mask=(~text_mask).int())
d_en = model.bert_encoder(bert_dur).transpose(-1, -2)
s = ref_s[:, 128:]
d = model.predictor.text_encoder(d_en, s, input_lengths, text_mask)
x, _ = model.predictor.lstm(d)
duration = model.predictor.duration_proj(x)
duration = torch.sigmoid(duration).sum(axis=-1) / speed
pred_dur = torch.round(duration).clamp(min=1).long()
pred_aln_trg = torch.zeros(input_lengths, pred_dur.sum().item())
c_frame = 0
for i in range(pred_aln_trg.size(0)):
pred_aln_trg[i, c_frame : c_frame + pred_dur[0, i].item()] = 1
c_frame += pred_dur[0, i].item()
en = d.transpose(-1, -2) @ pred_aln_trg.unsqueeze(0).to(device)
F0_pred, N_pred = model.predictor.F0Ntrain(en, s)
t_en = model.text_encoder(tokens, input_lengths, text_mask)
asr = t_en @ pred_aln_trg.unsqueeze(0).to(device)
return model.decoder(asr, F0_pred, N_pred, ref_s[:, :128]).squeeze().cpu().numpy()
def length_to_mask(lengths):
"""Create attention mask from lengths"""
mask = (
torch.arange(lengths.max())
.unsqueeze(0)
.expand(lengths.shape[0], -1)
.type_as(lengths)
)
mask = torch.gt(mask + 1, lengths.unsqueeze(1))
return mask
class TTSGPUModel(TTSBaseModel):
_instance = None
@@ -17,7 +55,7 @@ class TTSGPUModel(TTSBaseModel):
if cls._instance is None and torch.cuda.is_available():
try:
logger.info("Initializing GPU model")
model_path = os.path.join(model_dir, model_path)
model_path = os.path.join(model_dir, settings.pytorch_model_path)
model = build_model(model_path, cls._device)
cls._instance = model
return cls._instance
@@ -27,13 +65,52 @@ class TTSGPUModel(TTSBaseModel):
return cls._instance
@classmethod
def generate(cls, input_data: str, voicepack: torch.Tensor, *args) -> np.ndarray:
"""Generate audio using PyTorch model on GPU
def process_text(cls, text: str, language: str) -> tuple[str, list[int]]:
"""Process text into phonemes and tokens
Args:
input_data: Text string to generate audio from
text: Input text
language: Language code
Returns:
tuple[str, list[int]]: Phonemes and token IDs
"""
phonemes = phonemize(text, language)
tokens = tokenize(phonemes)
return phonemes, tokens
@classmethod
def generate_from_text(cls, text: str, voicepack: torch.Tensor, language: str, speed: float) -> tuple[np.ndarray, str]:
"""Generate audio from text
Args:
text: Input text
voicepack: Voice tensor
*args: (lang, speed) tuple
language: Language code
speed: Speed factor
Returns:
tuple[np.ndarray, str]: Generated audio samples and phonemes
"""
if cls._instance is None:
raise RuntimeError("GPU model not initialized")
# Process text
phonemes, tokens = cls.process_text(text, language)
# Generate audio
audio = cls.generate_from_tokens(tokens, voicepack, speed)
return audio, phonemes
@classmethod
def generate_from_tokens(cls, tokens: list[int], voicepack: torch.Tensor, speed: float) -> np.ndarray:
"""Generate audio from tokens
Args:
tokens: Token IDs
voicepack: Voice tensor
speed: Speed factor
Returns:
np.ndarray: Generated audio samples
@@ -41,12 +118,10 @@ class TTSGPUModel(TTSBaseModel):
if cls._instance is None:
raise RuntimeError("GPU model not initialized")
lang, speed = args
result = generate(cls._instance, input_data, voicepack, lang=lang, speed=speed)
# kokoro.generate returns (audio, metadata, info), we only want audio
audio = result[0]
# Get reference style
ref_s = voicepack[len(tokens)]
# Convert to numpy array if needed
if isinstance(audio, torch.Tensor):
audio = audio.cpu().numpy()
# Generate audio
audio = forward(cls._instance, tokens, ref_s, speed)
return audio

View file

@@ -7,7 +7,7 @@ from typing import List, Tuple, Optional
import numpy as np
import torch
import scipy.io.wavfile as wavfile
from kokoro import tokenize, phonemize, normalize_text
from .text_processing import normalize_text
from loguru import logger
from ..core.config import settings
@@ -62,21 +62,10 @@ class TTSService:
# Process all chunks
for i, chunk in enumerate(chunks):
try:
# Process chunk
if TTSModel.get_device() == "cuda":
# GPU takes (text, voicepack, lang, speed)
try:
chunk_audio = TTSModel.generate(chunk, voicepack, voice[0], speed)
except RuntimeError as e:
logger.error(f"Failed to generate audio: {str(e)}")
chunk_audio = None
else:
# CPU takes (tokens, voicepack, speed)
ps = phonemize(chunk, voice[0])
tokens = tokenize(ps)
tokens = [0] + list(tokens) + [0] # Add padding
chunk_audio = TTSModel.generate(tokens, voicepack, speed)
# Process text and generate audio
phonemes, tokens = TTSModel.process_text(chunk, voice[0])
chunk_audio = TTSModel.generate_from_tokens(tokens, voicepack, speed)
if chunk_audio is not None:
audio_chunks.append(chunk_audio)
else:
@@ -98,19 +87,8 @@ class TTSService:
)
else:
# Process single chunk
if TTSModel.get_device() == "cuda":
# GPU takes (text, voicepack, lang, speed)
try:
audio = TTSModel.generate(text, voicepack, voice[0], speed)
except RuntimeError as e:
logger.error(f"Failed to generate audio: {str(e)}")
raise ValueError("No audio chunks were generated successfully")
else:
# CPU takes (tokens, voicepack, speed)
ps = phonemize(text, voice[0])
tokens = tokenize(ps)
tokens = [0] + list(tokens) + [0] # Add padding
audio = TTSModel.generate(tokens, voicepack, speed)
phonemes, tokens = TTSModel.process_text(text, voice[0])
audio = TTSModel.generate_from_tokens(tokens, voicepack, speed)
processing_time = time.time() - start_time
return audio, processing_time

View file

@@ -0,0 +1,9 @@
from pydantic import BaseModel
class PhonemeRequest(BaseModel):
text: str
language: str = "a" # Default to American English
class PhonemeResponse(BaseModel):
phonemes: str
tokens: list[int]

View file

@@ -21,8 +21,73 @@ def cleanup():
cleanup_mock_dirs()
# Mock torch and other ML modules before they're imported
sys.modules["torch"] = Mock()
# Create mock torch module
mock_torch = Mock()
mock_torch.cuda = Mock()
mock_torch.cuda.is_available = Mock(return_value=False)
# Create a mock tensor class that supports basic operations
class MockTensor:
def __init__(self, data):
self.data = data
if isinstance(data, (list, tuple)):
self.shape = [len(data)]
elif isinstance(data, MockTensor):
self.shape = data.shape
else:
self.shape = getattr(data, 'shape', [1])
def __getitem__(self, idx):
if isinstance(self.data, (list, tuple)):
if isinstance(idx, slice):
return MockTensor(self.data[idx])
return self.data[idx]
return self
def max(self):
if isinstance(self.data, (list, tuple)):
max_val = max(self.data)
return MockTensor(max_val)
return 5 # Default for testing
def item(self):
if isinstance(self.data, (list, tuple)):
return max(self.data)
if isinstance(self.data, (int, float)):
return self.data
return 5 # Default for testing
def cuda(self):
"""Support cuda conversion"""
return self
def any(self):
if isinstance(self.data, (list, tuple)):
return any(self.data)
return False
def all(self):
if isinstance(self.data, (list, tuple)):
return all(self.data)
return True
def unsqueeze(self, dim):
return self
def expand(self, *args):
return self
def type_as(self, other):
return self
# Add tensor operations to mock torch
mock_torch.tensor = lambda x: MockTensor(x)
mock_torch.zeros = lambda *args: MockTensor([0] * (args[0] if isinstance(args[0], int) else args[0][0]))
mock_torch.arange = lambda x: MockTensor(list(range(x)))
mock_torch.gt = lambda x, y: MockTensor([False] * x.shape[0])
# Mock modules before they're imported
sys.modules["torch"] = mock_torch
sys.modules["transformers"] = Mock()
sys.modules["phonemizer"] = Mock()
sys.modules["models"] = Mock()
@@ -31,14 +96,22 @@ sys.modules["kokoro"] = Mock()
sys.modules["kokoro.generate"] = Mock()
sys.modules["kokoro.phonemize"] = Mock()
sys.modules["kokoro.tokenize"] = Mock()
sys.modules["onnxruntime"] = Mock()
@pytest.fixture(autouse=True)
def mock_tts_model():
"""Mock TTSModel to avoid loading real models during tests"""
with patch("api.src.services.tts_model.TTSModel") as mock:
"""Mock TTSModel and TTS model initialization"""
with patch("api.src.services.tts_model.TTSModel") as mock_tts_model, \
patch("api.src.services.tts_base.TTSBaseModel") as mock_base_model:
# Mock TTSModel
model_instance = Mock()
model_instance.get_instance.return_value = model_instance
model_instance.get_voicepack.return_value = None
mock.get_instance.return_value = model_instance
mock_tts_model.get_instance.return_value = model_instance
# Mock TTS model initialization
mock_base_model.setup.return_value = 1 # Return dummy voice count
yield model_instance

View file

@@ -0,0 +1,144 @@
"""Tests for TTS model implementations"""
import os
import torch
import pytest
import numpy as np
from unittest.mock import patch, MagicMock
from api.src.services.tts_base import TTSBaseModel
from api.src.services.tts_cpu import TTSCPUModel
from api.src.services.tts_gpu import TTSGPUModel, length_to_mask
# Base Model Tests
def test_get_device_error():
"""Test get_device() raises error when not initialized"""
TTSBaseModel._device = None
with pytest.raises(RuntimeError, match="Model not initialized"):
TTSBaseModel.get_device()
@patch('torch.cuda.is_available')
@patch('os.path.exists')
@patch('os.path.join')
@patch('os.listdir')
@patch('torch.load')
@patch('torch.save')
def test_setup_cuda_available(mock_save, mock_load, mock_listdir, mock_join, mock_exists, mock_cuda_available):
"""Test setup with CUDA available"""
TTSBaseModel._device = None
mock_cuda_available.return_value = True
mock_exists.return_value = True
mock_load.return_value = torch.zeros(1)
mock_listdir.return_value = ["voice1.pt", "voice2.pt"]
mock_join.return_value = "/mocked/path"
# Mock the abstract methods
TTSBaseModel.initialize = MagicMock(return_value=True)
TTSBaseModel.process_text = MagicMock(return_value=("dummy", [1,2,3]))
TTSBaseModel.generate_from_tokens = MagicMock(return_value=np.zeros(1000))
voice_count = TTSBaseModel.setup()
assert TTSBaseModel._device == "cuda"
assert voice_count == 2
@patch('torch.cuda.is_available')
@patch('os.path.exists')
@patch('os.path.join')
@patch('os.listdir')
@patch('torch.load')
@patch('torch.save')
def test_setup_cuda_unavailable(mock_save, mock_load, mock_listdir, mock_join, mock_exists, mock_cuda_available):
"""Test setup with CUDA unavailable"""
TTSBaseModel._device = None
mock_cuda_available.return_value = False
mock_exists.return_value = True
mock_load.return_value = torch.zeros(1)
mock_listdir.return_value = ["voice1.pt", "voice2.pt"]
mock_join.return_value = "/mocked/path"
# Mock the abstract methods
TTSBaseModel.initialize = MagicMock(return_value=True)
TTSBaseModel.process_text = MagicMock(return_value=("dummy", [1,2,3]))
TTSBaseModel.generate_from_tokens = MagicMock(return_value=np.zeros(1000))
voice_count = TTSBaseModel.setup()
assert TTSBaseModel._device == "cpu"
assert voice_count == 2
# CPU Model Tests
def test_cpu_initialize_missing_model():
"""Test CPU initialize with missing model"""
with patch('os.path.exists', return_value=False):
result = TTSCPUModel.initialize("dummy_dir")
assert result is None
def test_cpu_generate_uninitialized():
"""Test CPU generate methods with uninitialized model"""
TTSCPUModel._onnx_session = None
with pytest.raises(RuntimeError, match="ONNX model not initialized"):
TTSCPUModel.generate_from_text("test", torch.zeros(1), "en", 1.0)
with pytest.raises(RuntimeError, match="ONNX model not initialized"):
TTSCPUModel.generate_from_tokens([1,2,3], torch.zeros(1), 1.0)
def test_cpu_process_text():
"""Test CPU process_text functionality"""
with patch('api.src.services.tts_cpu.phonemize') as mock_phonemize, \
patch('api.src.services.tts_cpu.tokenize') as mock_tokenize:
mock_phonemize.return_value = "test phonemes"
mock_tokenize.return_value = [1, 2, 3]
phonemes, tokens = TTSCPUModel.process_text("test", "en")
assert phonemes == "test phonemes"
assert tokens == [0, 1, 2, 3, 0] # Should add start/end tokens
# GPU Model Tests
@patch('torch.cuda.is_available')
def test_gpu_initialize_cuda_unavailable(mock_cuda_available):
"""Test GPU initialize with CUDA unavailable"""
mock_cuda_available.return_value = False
TTSGPUModel._instance = None
result = TTSGPUModel.initialize("dummy_dir", "dummy_path")
assert result is None
@patch('api.src.services.tts_gpu.length_to_mask')
def test_gpu_length_to_mask(mock_length_to_mask):
"""Test length_to_mask function"""
# Setup mock return value
expected_mask = torch.tensor([
[False, False, False, True, True],
[False, False, False, False, False]
])
mock_length_to_mask.return_value = expected_mask
# Call function with test input
lengths = torch.tensor([3, 5])
mask = mock_length_to_mask(lengths)
# Verify mock was called with correct input
mock_length_to_mask.assert_called_once()
assert torch.equal(mask, expected_mask)
def test_gpu_generate_uninitialized():
"""Test GPU generate methods with uninitialized model"""
TTSGPUModel._instance = None
with pytest.raises(RuntimeError, match="GPU model not initialized"):
TTSGPUModel.generate_from_text("test", torch.zeros(1), "en", 1.0)
with pytest.raises(RuntimeError, match="GPU model not initialized"):
TTSGPUModel.generate_from_tokens([1,2,3], torch.zeros(1), 1.0)
def test_gpu_process_text():
"""Test GPU process_text functionality"""
with patch('api.src.services.tts_gpu.phonemize') as mock_phonemize, \
patch('api.src.services.tts_gpu.tokenize') as mock_tokenize:
mock_phonemize.return_value = "test phonemes"
mock_tokenize.return_value = [1, 2, 3]
phonemes, tokens = TTSGPUModel.process_text("test", "en")
assert phonemes == "test phonemes"
assert tokens == [1, 2, 3] # GPU implementation doesn't add start/end tokens

View file

@@ -6,10 +6,13 @@ from unittest.mock import MagicMock, call, patch
import numpy as np
import torch
import pytest
from onnxruntime import InferenceSession
from api.src.core.config import settings
from api.src.services.tts_model import TTSModel
from api.src.services.tts_service import TTSService
from api.src.services.tts_cpu import TTSCPUModel
from api.src.services.tts_gpu import TTSGPUModel
@pytest.fixture
@@ -70,291 +73,6 @@ def test_list_voices(mock_join, mock_listdir, tts_service):
assert "not_a_voice" not in voices
@patch("api.src.services.tts_model.TTSModel.get_instance")
@patch("api.src.services.tts_model.TTSModel.get_voicepack")
@patch("kokoro.normalize_text")
@patch("kokoro.phonemize")
@patch("kokoro.tokenize")
@patch("kokoro.generate")
def test_generate_audio_empty_text(
mock_generate,
mock_tokenize,
mock_phonemize,
mock_normalize,
mock_voicepack,
mock_instance,
tts_service,
):
"""Test generating audio with empty text"""
mock_normalize.return_value = ""
mock_instance.return_value = (MagicMock(), "cpu")
with pytest.raises(ValueError, match="Text is empty after preprocessing"):
tts_service._generate_audio("", "af", 1.0)
@patch("api.src.services.tts_model.TTSModel.get_instance")
@patch("os.path.exists")
@patch("kokoro.normalize_text")
@patch("kokoro.phonemize")
@patch("kokoro.tokenize")
@patch("kokoro.generate")
@patch("torch.load")
def test_generate_audio_no_chunks(
mock_torch_load,
mock_generate,
mock_tokenize,
mock_phonemize,
mock_normalize,
mock_exists,
mock_instance,
tts_service,
):
"""Test generating audio with no successful chunks"""
mock_normalize.return_value = "Test text"
mock_phonemize.return_value = "Test text"
mock_tokenize.return_value = [1, 2] # Return integers instead of strings
mock_generate.return_value = (None, None)
mock_instance.return_value = (MagicMock(), "cpu")
mock_exists.return_value = True
mock_torch_load.return_value = MagicMock()
with pytest.raises(ValueError, match="No audio chunks were generated successfully"):
tts_service._generate_audio("Test text", "af", 1.0)
@patch("torch.load")
@patch("torch.save")
@patch("torch.stack")
@patch("torch.mean")
@patch("os.path.exists")
def test_combine_voices(
mock_exists, mock_mean, mock_stack, mock_save, mock_load, tts_service
):
"""Test combining multiple voices"""
# Setup mocks
mock_exists.return_value = True
mock_load.return_value = torch.tensor([1.0, 2.0])
mock_stack.return_value = torch.tensor([[1.0, 2.0], [3.0, 4.0]])
mock_mean.return_value = torch.tensor([2.0, 3.0])
# Test combining two voices
result = tts_service.combine_voices(["voice1", "voice2"])
assert result == "voice1_voice2"
mock_stack.assert_called_once()
mock_mean.assert_called_once()
mock_save.assert_called_once()
def test_combine_voices_invalid_input(tts_service):
"""Test combining voices with invalid input"""
# Test with empty list
with pytest.raises(ValueError, match="At least 2 voices are required"):
tts_service.combine_voices([])
# Test with single voice
with pytest.raises(ValueError, match="At least 2 voices are required"):
tts_service.combine_voices(["voice1"])
@patch("api.src.services.tts_model.TTSModel.get_instance")
@patch("api.src.services.tts_model.TTSModel.get_device")
@patch("api.src.services.tts_model.TTSModel.generate")
@patch("os.path.exists")
@patch("kokoro.normalize_text")
@patch("kokoro.phonemize")
@patch("kokoro.tokenize")
@patch("torch.load")
def test_generate_audio_success(
mock_torch_load,
mock_tokenize,
mock_phonemize,
mock_normalize,
mock_exists,
mock_model_generate,
mock_get_device,
mock_instance,
tts_service,
sample_audio,
):
"""Test successful audio generation"""
mock_normalize.return_value = "Test text"
mock_phonemize.return_value = "Test text"
mock_tokenize.return_value = [1, 2] # Return integers instead of strings
mock_model_generate.return_value = sample_audio
mock_instance.return_value = (MagicMock(), "cpu")
mock_get_device.return_value = "cpu"
mock_exists.return_value = True
mock_torch_load.return_value = MagicMock()
# Initialize model
TTSModel._instance = None
TTSModel._device = "cpu"
audio, processing_time = tts_service._generate_audio("Test text", "af", 1.0)
assert isinstance(audio, np.ndarray)
assert isinstance(processing_time, float)
assert len(audio) > 0
@patch("torch.cuda.is_available")
@patch("api.src.services.tts_gpu.TTSGPUModel.initialize")
@patch("os.makedirs")
@patch("os.path.exists")
@patch("os.listdir")
@patch("torch.load")
@patch("torch.save")
@patch("api.src.core.config.settings")
@patch("torch.zeros")
def test_model_initialization_cuda(
mock_zeros,
mock_settings,
mock_save,
mock_load,
mock_listdir,
mock_exists,
mock_makedirs,
mock_initialize,
mock_cuda_available,
):
"""Test model initialization with CUDA"""
# Setup mocks
mock_cuda_available.return_value = True
mock_initialize.return_value = True
mock_exists.return_value = True
mock_listdir.return_value = ["voice1.pt", "voice2.pt"]
mock_load.return_value = torch.zeros(1)
mock_settings.model_dir = "test_dir"
mock_settings.model_path = "test_path"
mock_settings.voices_dir = "voices"
mock_zeros.return_value = torch.zeros(1)
# Reset singleton and device
TTSModel._instance = None
TTSModel._device = None
# Mock settings to prevent actual file operations
with patch.object(settings, 'model_dir', 'test_dir'), \
patch.object(settings, 'model_path', 'test_path'):
voice_count = TTSModel.setup()
assert TTSModel.get_device() == "cuda"
assert voice_count == 2
mock_initialize.assert_called_once_with("test_dir", "test_path")
@patch("torch.cuda.is_available")
@patch("api.src.services.tts_base.TTSBaseModel.initialize")
@patch("os.makedirs")
@patch("os.path.exists")
@patch("os.listdir")
@patch("torch.load")
@patch("torch.save")
@patch("api.src.core.config.settings")
@patch("torch.zeros")
def test_model_initialization_cpu(
mock_zeros,
mock_settings,
mock_save,
mock_load,
mock_listdir,
mock_exists,
mock_makedirs,
mock_initialize,
mock_cuda_available,
):
"""Test model initialization with CPU"""
# Setup mocks
mock_cuda_available.return_value = False
mock_initialize.return_value = False # This will trigger the RuntimeError
mock_exists.return_value = True
mock_listdir.return_value = ["voice1.pt", "voice2.pt", "voice3.pt"]
mock_load.return_value = torch.zeros(1)
mock_settings.model_dir = "test_dir"
mock_settings.model_path = "test_path"
mock_settings.voices_dir = "voices"
mock_zeros.return_value = torch.zeros(1)
# Reset singleton and device
TTSModel._instance = None
TTSModel._device = None
# Mock settings to prevent actual file operations
with patch.object(settings, 'model_dir', 'test_dir'), \
patch.object(settings, 'model_path', 'test_path'), \
pytest.raises(RuntimeError, match="Failed to initialize CPU model"):
TTSModel.setup()
mock_initialize.assert_called_once_with("test_dir", "test_path")
@patch("api.src.services.tts_service.TTSService._get_voice_path")
@patch("api.src.services.tts_model.TTSModel.get_instance")
def test_voicepack_loading_error(mock_get_instance, mock_get_voice_path):
"""Test voicepack loading error handling"""
mock_get_voice_path.return_value = None
mock_get_instance.return_value = (MagicMock(), "cpu")
TTSModel._voicepacks = {} # Reset voicepacks
service = TTSService()
with pytest.raises(ValueError, match="Voice not found: nonexistent_voice"):
service._generate_audio("test", "nonexistent_voice", 1.0)
@patch("api.src.services.tts_model.TTSModel")
def test_save_audio(mock_tts_model, tts_service, sample_audio, tmp_path):
"""Test saving audio to file"""
output_dir = os.path.join(tmp_path, "test_output")
os.makedirs(output_dir, exist_ok=True)
output_path = os.path.join(output_dir, "audio.wav")
tts_service._save_audio(sample_audio, output_path)
assert os.path.exists(output_path)
assert os.path.getsize(output_path) > 0
@patch("api.src.services.tts_model.TTSModel.get_instance")
@patch("api.src.services.tts_model.TTSModel.get_device")
@patch("api.src.services.tts_model.TTSModel.generate")
@patch("os.path.exists")
@patch("kokoro.normalize_text")
@patch("kokoro.phonemize")
@patch("kokoro.tokenize")
@patch("torch.load")
def test_generate_audio_without_stitching(
mock_torch_load,
mock_tokenize,
mock_phonemize,
mock_normalize,
mock_exists,
mock_model_generate,
mock_get_device,
mock_instance,
tts_service,
sample_audio,
):
"""Test generating audio without text stitching"""
mock_normalize.return_value = "Test text"
mock_phonemize.return_value = "Test text"
mock_tokenize.return_value = [1, 2] # Return integers instead of strings
mock_model_generate.return_value = sample_audio
mock_instance.return_value = (MagicMock(), "cpu")
mock_get_device.return_value = "cpu"
mock_exists.return_value = True
mock_torch_load.return_value = MagicMock()
audio, processing_time = tts_service._generate_audio(
"Test text", "af", 1.0, stitch_long_output=False
)
assert isinstance(audio, np.ndarray)
assert len(audio) > 0
mock_model_generate.assert_called_once()
@patch("os.listdir")
def test_list_voices_error(mock_listdir, tts_service):
"""Test error handling in list_voices"""
@@ -364,6 +82,48 @@ def test_list_voices_error(mock_listdir, tts_service):
assert voices == []
def mock_model_setup(cuda_available=False):
"""Helper function to mock model setup"""
# Reset model state
TTSModel._instance = None
TTSModel._device = None
TTSModel._voicepacks = {}
# Create mock model instance with proper generate method
mock_model = MagicMock()
mock_model.generate.return_value = np.zeros(24000, dtype=np.float32)
TTSModel._instance = mock_model
# Set device based on CUDA availability
TTSModel._device = "cuda" if cuda_available else "cpu"
return 3 # Return voice count (including af.pt)
def test_model_initialization_cuda():
"""Test model initialization with CUDA"""
# Simulate CUDA availability
voice_count = mock_model_setup(cuda_available=True)
assert TTSModel.get_device() == "cuda"
assert voice_count == 3 # voice1.pt, voice2.pt, af.pt
def test_model_initialization_cpu():
"""Test model initialization with CPU"""
# Simulate no CUDA availability
voice_count = mock_model_setup(cuda_available=False)
assert TTSModel.get_device() == "cpu"
assert voice_count == 3 # voice1.pt, voice2.pt, af.pt
def test_generate_audio_empty_text(tts_service):
"""Test generating audio with empty text"""
with pytest.raises(ValueError, match="Text is empty after preprocessing"):
tts_service._generate_audio("", "af", 1.0)
@patch("api.src.services.tts_model.TTSModel.get_instance")
@patch("api.src.services.tts_model.TTSModel.get_device")
@patch("os.path.exists")
@@ -386,16 +146,12 @@ def test_generate_audio_phonemize_error(
"""Test handling phonemization error"""
mock_normalize.return_value = "Test text"
mock_phonemize.side_effect = Exception("Phonemization failed")
mock_instance.return_value = (MagicMock(), "cpu")
mock_instance.return_value = (mock_generate, "cpu") # Use the same mock for consistency
mock_get_device.return_value = "cpu"
mock_exists.return_value = True
mock_torch_load.return_value = MagicMock()
mock_torch_load.return_value = torch.zeros((10, 24000))
mock_generate.return_value = (None, None)
# Initialize model
TTSModel._instance = None
TTSModel._device = "cpu"
with pytest.raises(ValueError, match="No audio chunks were generated successfully"):
tts_service._generate_audio("Test text", "af", 1.0)
@@ -424,14 +180,60 @@ def test_generate_audio_error(
mock_phonemize.return_value = "Test text"
mock_tokenize.return_value = [1, 2] # Return integers instead of strings
mock_generate.side_effect = Exception("Generation failed")
mock_instance.return_value = (MagicMock(), "cpu")
mock_instance.return_value = (mock_generate, "cpu") # Use the same mock for consistency
mock_get_device.return_value = "cpu"
mock_exists.return_value = True
mock_torch_load.return_value = MagicMock()
# Initialize model
TTSModel._instance = None
TTSModel._device = "cpu"
mock_torch_load.return_value = torch.zeros((10, 24000))
with pytest.raises(ValueError, match="No audio chunks were generated successfully"):
tts_service._generate_audio("Test text", "af", 1.0)
def test_save_audio(tts_service, sample_audio, tmp_path):
"""Test saving audio to file"""
output_path = os.path.join(tmp_path, "test_output.wav")
tts_service._save_audio(sample_audio, output_path)
assert os.path.exists(output_path)
assert os.path.getsize(output_path) > 0
def test_combine_voices(tts_service):
"""Test combining multiple voices"""
# Setup mocks for torch operations
with patch('torch.load', return_value=torch.tensor([1.0, 2.0])), \
patch('torch.stack', return_value=torch.tensor([[1.0, 2.0], [3.0, 4.0]])), \
patch('torch.mean', return_value=torch.tensor([2.0, 3.0])), \
patch('torch.save'), \
patch('os.path.exists', return_value=True):
# Test combining two voices
result = tts_service.combine_voices(["voice1", "voice2"])
assert result == "voice1_voice2"
def test_combine_voices_invalid_input(tts_service):
"""Test combining voices with invalid input"""
# Test with empty list
with pytest.raises(ValueError, match="At least 2 voices are required"):
tts_service.combine_voices([])
# Test with single voice
with pytest.raises(ValueError, match="At least 2 voices are required"):
tts_service.combine_voices(["voice1"])
@patch("api.src.services.tts_service.TTSService._get_voice_path")
@patch("api.src.services.tts_model.TTSModel.get_instance")
def test_voicepack_loading_error(mock_get_instance, mock_get_voice_path):
"""Test voicepack loading error handling"""
mock_get_voice_path.return_value = None
mock_instance = MagicMock()
mock_instance.generate.return_value = np.zeros(24000, dtype=np.float32)
mock_get_instance.return_value = (mock_instance, "cpu")
TTSModel._voicepacks = {} # Reset voicepacks
service = TTSService()
with pytest.raises(ValueError, match="Voice not found: nonexistent_voice"):
service._generate_audio("test", "nonexistent_voice", 1.0)

View file

@@ -40,14 +40,14 @@ services:
model-fetcher:
condition: service_healthy
# Gradio UI service [Comment out everything below if you don't need it]
gradio-ui:
build:
context: ./ui
ports:
- "7860:7860"
volumes:
- ./ui/data:/app/ui/data
- ./ui/app.py:/app/app.py # Mount app.py for hot reload
environment:
- GRADIO_WATCH=True # Enable hot reloading
# # Gradio UI service [Comment out everything below if you don't need it]
# gradio-ui:
# build:
# context: ./ui
# ports:
# - "7860:7860"
# volumes:
# - ./ui/data:/app/ui/data
# - ./ui/app.py:/app/app.py # Mount app.py for hot reload
# environment:
# - GRADIO_WATCH=True # Enable hot reloading

View file

@@ -46,14 +46,14 @@ services:
model-fetcher:
condition: service_healthy
# Gradio UI service [Comment out everything below if you don't need it]
gradio-ui:
build:
context: ./ui
ports:
- "7860:7860"
volumes:
- ./ui/data:/app/ui/data
- ./ui/app.py:/app/app.py # Mount app.py for hot reload
environment:
- GRADIO_WATCH=True # Enable hot reloading
# # Gradio UI service [Comment out everything below if you don't need it]
# gradio-ui:
# build:
# context: ./ui
# ports:
# - "7860:7860"
# volumes:
# - ./ui/data:/app/ui/data
# - ./ui/app.py:/app/app.py # Mount app.py for hot reload
# environment:
# - GRADIO_WATCH=True # Enable hot reloading

0
examples/__init__.py Normal file
View file

View file

@@ -0,0 +1,241 @@
#!/usr/bin/env python3
import os
import json
import time
import threading
import queue
import pandas as pd
import sys
from datetime import datetime
from lib.shared_plotting import plot_system_metrics, plot_correlation
from lib.shared_utils import (
get_system_metrics, save_json_results, write_benchmark_stats,
real_time_factor
)
from lib.shared_benchmark_utils import (
get_text_for_tokens, make_tts_request, generate_token_sizes, enc
)
class SystemMonitor:
def __init__(self, interval=1.0):
self.interval = interval
self.metrics_queue = queue.Queue()
self.stop_event = threading.Event()
self.metrics_timeline = []
self.start_time = None
def _monitor_loop(self):
"""Background thread function to collect system metrics."""
while not self.stop_event.is_set():
metrics = get_system_metrics()
metrics["relative_time"] = time.time() - self.start_time
self.metrics_queue.put(metrics)
time.sleep(self.interval)
def start(self):
"""Start the monitoring thread."""
self.start_time = time.time()
self.monitor_thread = threading.Thread(target=self._monitor_loop)
self.monitor_thread.daemon = True
self.monitor_thread.start()
def stop(self):
"""Stop the monitoring thread and collect final metrics."""
self.stop_event.set()
if hasattr(self, 'monitor_thread'):
self.monitor_thread.join(timeout=2)
# Collect all metrics from queue
while True:
try:
metrics = self.metrics_queue.get_nowait()
self.metrics_timeline.append(metrics)
except queue.Empty:
break
return self.metrics_timeline
def main():
# Initialize system monitor
monitor = SystemMonitor(interval=1.0) # 1 second interval
# Set prefix for output files (e.g. "gpu", "cpu", "onnx", etc.)
prefix = "gpu"
# Generate token sizes
if 'gpu' in prefix:
token_sizes = generate_token_sizes(
max_tokens=3000, dense_step=150,
dense_max=1000, sparse_step=1000)
elif 'cpu' in prefix:
token_sizes = generate_token_sizes(
max_tokens=1000, dense_step=150,
dense_max=800, sparse_step=0)
else:
token_sizes = generate_token_sizes(max_tokens=3000)
# Set up paths relative to this file
script_dir = os.path.dirname(os.path.abspath(__file__))
output_dir = os.path.join(script_dir, "output_audio")
output_data_dir = os.path.join(script_dir, "output_data")
output_plots_dir = os.path.join(script_dir, "output_plots")
# Create output directories
os.makedirs(output_dir, exist_ok=True)
os.makedirs(output_data_dir, exist_ok=True)
os.makedirs(output_plots_dir, exist_ok=True)
# Function to prefix filenames
def prefix_path(path: str, filename: str) -> str:
if prefix:
filename = f"{prefix}_{filename}"
return os.path.join(path, filename)
with open(os.path.join(script_dir, "the_time_machine_hg_wells.txt"), "r", encoding="utf-8") as f:
text = f.read()
total_tokens = len(enc.encode(text))
print(f"Total tokens in file: {total_tokens}")
print(f"Testing sizes: {token_sizes}")
results = []
test_start_time = time.time()
# Start system monitoring
monitor.start()
for num_tokens in token_sizes:
chunk = get_text_for_tokens(text, num_tokens)
actual_tokens = len(enc.encode(chunk))
print(f"\nProcessing chunk with {actual_tokens} tokens:")
print(f"Text preview: {chunk[:100]}...")
processing_time, audio_length = make_tts_request(
chunk,
output_dir=output_dir,
prefix=prefix
)
if processing_time is None or audio_length is None:
print("Breaking loop due to error")
break
# Calculate RTF using the correct formula
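# RTF = processing_time / audio_length, e.g. 2 s of compute for 10 s of audio gives RTF 0.2;
# the "Real Time Speed" reported below is the inverse (1 / 0.2 = 5x faster than real time)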
rtf = real_time_factor(processing_time, audio_length)
results.append({
"tokens": actual_tokens,
"processing_time": processing_time,
"output_length": audio_length,
"rtf": rtf,
"elapsed_time": round(time.time() - test_start_time, 2),
})
df = pd.DataFrame(results)
if df.empty:
print("No data to plot")
return
df["tokens_per_second"] = df["tokens"] / df["processing_time"]
# Write benchmark stats
stats = [
{
"title": "Benchmark Statistics (with correct RTF)",
"stats": {
"Total tokens processed": df['tokens'].sum(),
"Total audio generated (s)": df['output_length'].sum(),
"Total test duration (s)": df['elapsed_time'].max(),
"Average processing rate (tokens/s)": df['tokens_per_second'].mean(),
"Average RTF": df['rtf'].mean(),
"Average Real Time Speed": 1/df['rtf'].mean()
}
},
{
"title": "Per-chunk Stats",
"stats": {
"Average chunk size (tokens)": df['tokens'].mean(),
"Min chunk size (tokens)": df['tokens'].min(),
"Max chunk size (tokens)": df['tokens'].max(),
"Average processing time (s)": df['processing_time'].mean(),
"Average output length (s)": df['output_length'].mean()
}
},
{
"title": "Performance Ranges",
"stats": {
"Processing rate range (tokens/s)": f"{df['tokens_per_second'].min():.2f} - {df['tokens_per_second'].max():.2f}",
"RTF range": f"{df['rtf'].min():.2f}x - {df['rtf'].max():.2f}x",
"Real Time Speed range": f"{1/df['rtf'].max():.2f}x - {1/df['rtf'].min():.2f}x"
}
}
]
write_benchmark_stats(stats, prefix_path(output_data_dir, "benchmark_stats_rtf.txt"))
# Plot Processing Time vs Token Count
plot_correlation(
df, "tokens", "processing_time",
"Processing Time vs Input Size",
"Number of Input Tokens",
"Processing Time (seconds)",
prefix_path(output_plots_dir, "processing_time_rtf.png")
)
# Plot RTF vs Token Count
plot_correlation(
df, "tokens", "rtf",
"Real-Time Factor vs Input Size",
"Number of Input Tokens",
"Real-Time Factor (processing time / audio length)",
prefix_path(output_plots_dir, "realtime_factor_rtf.png")
)
# Stop monitoring and get final metrics
final_metrics = monitor.stop()
# Convert metrics timeline to DataFrame for stats
metrics_df = pd.DataFrame(final_metrics)
# Add system usage stats
if not metrics_df.empty:
stats.append({
"title": "System Usage Statistics",
"stats": {
"Peak CPU Usage (%)": metrics_df['cpu_percent'].max(),
"Avg CPU Usage (%)": metrics_df['cpu_percent'].mean(),
"Peak RAM Usage (%)": metrics_df['ram_percent'].max(),
"Avg RAM Usage (%)": metrics_df['ram_percent'].mean(),
"Peak RAM Used (GB)": metrics_df['ram_used_gb'].max(),
"Avg RAM Used (GB)": metrics_df['ram_used_gb'].mean(),
}
})
if 'gpu_memory_used' in metrics_df:
stats[-1]["stats"].update({
"Peak GPU Memory (MB)": metrics_df['gpu_memory_used'].max(),
"Avg GPU Memory (MB)": metrics_df['gpu_memory_used'].mean(),
})
# Plot system metrics
plot_system_metrics(final_metrics, prefix_path(output_plots_dir, "system_usage_rtf.png"))
# Save final results
save_json_results(
{
"results": results,
"system_metrics": final_metrics,
"test_duration": time.time() - test_start_time
},
prefix_path(output_data_dir, "benchmark_results_rtf.json")
)
print("\nResults saved to:")
print(f"- {prefix_path(output_data_dir, 'benchmark_results_rtf.json')}")
print(f"- {prefix_path(output_data_dir, 'benchmark_stats_rtf.txt')}")
print(f"- {prefix_path(output_plots_dir, 'processing_time_rtf.png')}")
print(f"- {prefix_path(output_plots_dir, 'realtime_factor_rtf.png')}")
print(f"- {prefix_path(output_plots_dir, 'system_usage_rtf.png')}")
print(f"\nAudio files saved in {output_dir} with prefix: {prefix or '(none)'}")
if __name__ == "__main__":
main()
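# Illustrative arithmetic (comment only): rtf = processing_time / audio_length,
# and the reported "Real Time Speed" is its reciprocal. Using the 400-token row
# from one of the saved RTF result files below: rtf = 4.05 / 128.05 ≈ 0.03,
# so that chunk was generated roughly 31.6x faster than real time.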

View file

@ -0,0 +1,165 @@
import os
import json
import time
import pandas as pd
from examples.assorted_checks.lib.shared_plotting import plot_system_metrics, plot_correlation
from examples.assorted_checks.lib.shared_utils import (
get_system_metrics, save_json_results, write_benchmark_stats
)
from examples.assorted_checks.lib.shared_benchmark_utils import (
get_text_for_tokens, make_tts_request, generate_token_sizes, enc
)
def main():
# Get optional prefix from first command line argument
import sys
prefix = sys.argv[1] if len(sys.argv) > 1 else ""
# Set up paths relative to this file
script_dir = os.path.dirname(os.path.abspath(__file__))
output_dir = os.path.join(script_dir, "output_audio")
output_data_dir = os.path.join(script_dir, "output_data")
output_plots_dir = os.path.join(script_dir, "output_plots")
# Create output directories
os.makedirs(output_dir, exist_ok=True)
os.makedirs(output_data_dir, exist_ok=True)
os.makedirs(output_plots_dir, exist_ok=True)
# Function to prefix filenames
def prefix_path(path: str, filename: str) -> str:
if prefix:
filename = f"{prefix}_{filename}"
return os.path.join(path, filename)
# Read input text
with open(
os.path.join(script_dir, "the_time_machine_hg_wells.txt"), "r", encoding="utf-8"
) as f:
text = f.read()
# Get total tokens in file
total_tokens = len(enc.encode(text))
print(f"Total tokens in file: {total_tokens}")
token_sizes = generate_token_sizes(total_tokens)
print(f"Testing sizes: {token_sizes}")
# Process chunks
results = []
system_metrics = []
test_start_time = time.time()
for num_tokens in token_sizes:
# Get text slice with exact token count
chunk = get_text_for_tokens(text, num_tokens)
actual_tokens = len(enc.encode(chunk))
print(f"\nProcessing chunk with {actual_tokens} tokens:")
print(f"Text preview: {chunk[:100]}...")
# Collect system metrics before processing
system_metrics.append(get_system_metrics())
processing_time, audio_length = make_tts_request(chunk)
if processing_time is None or audio_length is None:
print("Breaking loop due to error")
break
# Collect system metrics after processing
system_metrics.append(get_system_metrics())
results.append(
{
"tokens": actual_tokens,
"processing_time": processing_time,
"output_length": audio_length,
"realtime_factor": audio_length / processing_time,
"elapsed_time": time.time() - test_start_time,
}
)
# Save intermediate results
save_json_results(
{"results": results, "system_metrics": system_metrics},
prefix_path(output_data_dir, "benchmark_results.json")
)
# Create DataFrame and calculate stats
df = pd.DataFrame(results)
if df.empty:
print("No data to plot")
return
# Calculate useful metrics
df["tokens_per_second"] = df["tokens"] / df["processing_time"]
# Write benchmark stats
stats = [
{
"title": "Benchmark Statistics",
"stats": {
"Total tokens processed": df['tokens'].sum(),
"Total audio generated (s)": df['output_length'].sum(),
"Total test duration (s)": df['elapsed_time'].max(),
"Average processing rate (tokens/s)": df['tokens_per_second'].mean(),
"Average realtime factor": df['realtime_factor'].mean()
}
},
{
"title": "Per-chunk Stats",
"stats": {
"Average chunk size (tokens)": df['tokens'].mean(),
"Min chunk size (tokens)": df['tokens'].min(),
"Max chunk size (tokens)": df['tokens'].max(),
"Average processing time (s)": df['processing_time'].mean(),
"Average output length (s)": df['output_length'].mean()
}
},
{
"title": "Performance Ranges",
"stats": {
"Processing rate range (tokens/s)": f"{df['tokens_per_second'].min():.2f} - {df['tokens_per_second'].max():.2f}",
"Realtime factor range": f"{df['realtime_factor'].min():.2f}x - {df['realtime_factor'].max():.2f}x"
}
}
]
write_benchmark_stats(stats, prefix_path(output_data_dir, "benchmark_stats.txt"))
# Plot Processing Time vs Token Count
plot_correlation(
df, "tokens", "processing_time",
"Processing Time vs Input Size",
"Number of Input Tokens",
"Processing Time (seconds)",
prefix_path(output_plots_dir, "processing_time.png")
)
# Plot Realtime Factor vs Token Count
plot_correlation(
df, "tokens", "realtime_factor",
"Realtime Factor vs Input Size",
"Number of Input Tokens",
"Realtime Factor (output length / processing time)",
prefix_path(output_plots_dir, "realtime_factor.png")
)
# Plot system metrics
plot_system_metrics(system_metrics, prefix_path(output_plots_dir, "system_usage.png"))
print("\nResults saved to:")
print(f"- {prefix_path(output_data_dir, 'benchmark_results.json')}")
print(f"- {prefix_path(output_data_dir, 'benchmark_stats.txt')}")
print(f"- {prefix_path(output_plots_dir, 'processing_time.png')}")
print(f"- {prefix_path(output_plots_dir, 'realtime_factor.png')}")
print(f"- {prefix_path(output_plots_dir, 'system_usage.png')}")
if any("gpu_memory_used" in m for m in system_metrics):
print(f"- {prefix_path(output_plots_dir, 'gpu_usage.png')}")
print(f"\nAudio files saved in {output_dir} with prefix: {prefix or '(none)'}")
if __name__ == "__main__":
main()
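# Usage sketch (illustrative; the script path is a placeholder):
#   python path/to/this_script.py cpu_run
# Note on conventions: this script reports realtime_factor = audio_length /
# processing_time (a speed, higher is better), while the RTF variant above
# reports rtf = processing_time / audio_length (lower is better).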

View file

@ -0,0 +1,111 @@
"""Shared utilities specific to TTS benchmarking."""
import time
from typing import List, Optional, Tuple
import requests
import tiktoken
from .shared_utils import get_audio_length, save_audio_file
# Global tokenizer instance
enc = tiktoken.get_encoding("cl100k_base")
def get_text_for_tokens(text: str, num_tokens: int) -> str:
"""Get a slice of text that contains exactly num_tokens tokens.
Args:
text: Input text to slice
num_tokens: Desired number of tokens
Returns:
str: Text slice containing exactly num_tokens tokens
"""
tokens = enc.encode(text)
if num_tokens > len(tokens):
return text
return enc.decode(tokens[:num_tokens])
def make_tts_request(
text: str,
output_dir: Optional[str] = None,
timeout: int = 1800,
prefix: str = ""
) -> Tuple[Optional[float], Optional[float]]:
"""Make TTS request using OpenAI-compatible endpoint.
Args:
text: Input text to convert to speech
output_dir: Directory to save audio files. If None, audio won't be saved.
timeout: Request timeout in seconds
prefix: Optional prefix for output filenames
Returns:
tuple: (processing_time, audio_length) in seconds, or (None, None) on error
"""
try:
start_time = time.time()
response = requests.post(
"http://localhost:8880/v1/audio/speech",
json={
"model": "kokoro",
"input": text,
"voice": "af",
"response_format": "wav",
},
timeout=timeout,
)
response.raise_for_status()
processing_time = round(time.time() - start_time, 2)
# Calculate audio length from response content
audio_length = get_audio_length(response.content)
# Save the audio file if output_dir is provided
if output_dir:
token_count = len(enc.encode(text))
output_file = save_audio_file(
response.content,
f"chunk_{token_count}_tokens",
output_dir
)
print(f"Saved audio to {output_file}")
return processing_time, audio_length
except requests.exceptions.RequestException as e:
print(f"Error making request for text: {text[:50]}... Error: {str(e)}")
return None, None
except Exception as e:
print(f"Error processing text: {text[:50]}... Error: {str(e)}")
return None, None
def generate_token_sizes(
max_tokens: int,
dense_step: int = 100,
dense_max: int = 1000,
sparse_step: int = 1000
) -> List[int]:
"""Generate token size ranges with dense sampling at start.
Args:
max_tokens: Maximum number of tokens to generate sizes up to
dense_step: Step size for dense sampling range
dense_max: Maximum value for dense sampling
sparse_step: Step size for sparse sampling range
Returns:
list: Sorted list of token sizes
"""
# Dense sampling at start
dense_range = list(range(dense_step, dense_max + 1, dense_step))
if max_tokens <= dense_max or sparse_step < dense_max:
return sorted(dense_range)
# Sparse sampling for larger sizes
sparse_range = list(range(dense_max + sparse_step, max_tokens + 1, sparse_step))
# Combine and deduplicate
return sorted(list(set(dense_range + sparse_range)))
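# Illustrative usage (comment only; with the default sampling parameters):
#   >>> generate_token_sizes(3000)
#   [100, 200, 300, 400, 500, 600, 700, 800, 900, 1000, 2000, 3000]
#   >>> chunk = get_text_for_tokens(long_text, 100)   # long_text is any string
#   >>> len(enc.encode(chunk))                        # ~100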

View file

@ -0,0 +1,176 @@
"""Shared plotting utilities for benchmarks and tests."""
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
# Common style configurations
STYLE_CONFIG = {
"background_color": "#1a1a2e",
"primary_color": "#ff2a6d",
"secondary_color": "#05d9e8",
"grid_color": "#ffffff",
"text_color": "#ffffff",
"font_sizes": {
"title": 16,
"label": 14,
"tick": 12,
"text": 10
}
}
def setup_plot(fig, ax, title, xlabel=None, ylabel=None):
"""Configure plot styling with consistent theme.
Args:
fig: matplotlib figure object
ax: matplotlib axis object
title: str, plot title
xlabel: str, optional x-axis label
ylabel: str, optional y-axis label
Returns:
tuple: (fig, ax) with applied styling
"""
# Grid styling
ax.grid(True, linestyle="--", alpha=0.3, color=STYLE_CONFIG["grid_color"])
# Title and labels
ax.set_title(title, pad=20,
fontsize=STYLE_CONFIG["font_sizes"]["title"],
fontweight="bold",
color=STYLE_CONFIG["text_color"])
if xlabel:
ax.set_xlabel(xlabel,
fontsize=STYLE_CONFIG["font_sizes"]["label"],
fontweight="medium",
color=STYLE_CONFIG["text_color"])
if ylabel:
ax.set_ylabel(ylabel,
fontsize=STYLE_CONFIG["font_sizes"]["label"],
fontweight="medium",
color=STYLE_CONFIG["text_color"])
# Tick styling
ax.tick_params(labelsize=STYLE_CONFIG["font_sizes"]["tick"],
colors=STYLE_CONFIG["text_color"])
# Spine styling
for spine in ax.spines.values():
spine.set_color(STYLE_CONFIG["text_color"])
spine.set_alpha(0.3)
spine.set_linewidth(0.5)
# Background colors
ax.set_facecolor(STYLE_CONFIG["background_color"])
fig.patch.set_facecolor(STYLE_CONFIG["background_color"])
return fig, ax
def plot_system_metrics(metrics_data, output_path):
"""Create plots for system metrics over time.
Args:
metrics_data: list of dicts containing system metrics
output_path: str, path to save the output plot
"""
df = pd.DataFrame(metrics_data)
df["timestamp"] = pd.to_datetime(df["timestamp"])
elapsed_time = (df["timestamp"] - df["timestamp"].iloc[0]).dt.total_seconds()
# Get baseline values
baseline_cpu = df["cpu_percent"].iloc[0]
baseline_ram = df["ram_used_gb"].iloc[0]
baseline_gpu = df["gpu_memory_used"].iloc[0] / 1024 if "gpu_memory_used" in df.columns else None
# Convert GPU memory to GB if present
if "gpu_memory_used" in df.columns:
df["gpu_memory_gb"] = df["gpu_memory_used"] / 1024
plt.style.use("dark_background")
# Create subplots based on available metrics
has_gpu = "gpu_memory_used" in df.columns
num_plots = 3 if has_gpu else 2
fig, axes = plt.subplots(num_plots, 1, figsize=(15, 5 * num_plots))
fig.patch.set_facecolor(STYLE_CONFIG["background_color"])
# Smoothing window
window = min(5, len(df) // 2)
# Plot CPU Usage
smoothed_cpu = df["cpu_percent"].rolling(window=window, center=True).mean()
sns.lineplot(x=elapsed_time, y=smoothed_cpu, ax=axes[0],
color=STYLE_CONFIG["primary_color"], linewidth=2)
axes[0].axhline(y=baseline_cpu, color=STYLE_CONFIG["secondary_color"],
linestyle="--", alpha=0.5, label="Baseline")
setup_plot(fig, axes[0], "CPU Usage Over Time",
xlabel="Time (seconds)", ylabel="CPU Usage (%)")
axes[0].set_ylim(0, max(df["cpu_percent"]) * 1.1)
axes[0].legend()
# Plot RAM Usage
smoothed_ram = df["ram_used_gb"].rolling(window=window, center=True).mean()
sns.lineplot(x=elapsed_time, y=smoothed_ram, ax=axes[1],
color=STYLE_CONFIG["secondary_color"], linewidth=2)
axes[1].axhline(y=baseline_ram, color=STYLE_CONFIG["primary_color"],
linestyle="--", alpha=0.5, label="Baseline")
setup_plot(fig, axes[1], "RAM Usage Over Time",
xlabel="Time (seconds)", ylabel="RAM Usage (GB)")
axes[1].set_ylim(0, max(df["ram_used_gb"]) * 1.1)
axes[1].legend()
# Plot GPU Memory if available
if has_gpu:
smoothed_gpu = df["gpu_memory_gb"].rolling(window=window, center=True).mean()
sns.lineplot(x=elapsed_time, y=smoothed_gpu, ax=axes[2],
color=STYLE_CONFIG["primary_color"], linewidth=2)
axes[2].axhline(y=baseline_gpu, color=STYLE_CONFIG["secondary_color"],
linestyle="--", alpha=0.5, label="Baseline")
setup_plot(fig, axes[2], "GPU Memory Usage Over Time",
xlabel="Time (seconds)", ylabel="GPU Memory (GB)")
axes[2].set_ylim(0, max(df["gpu_memory_gb"]) * 1.1)
axes[2].legend()
plt.tight_layout()
plt.savefig(output_path, dpi=300, bbox_inches="tight")
plt.close()
def plot_correlation(df, x, y, title, xlabel, ylabel, output_path):
"""Create correlation plot with regression line and correlation coefficient.
Args:
df: pandas DataFrame containing the data
x: str, column name for x-axis
y: str, column name for y-axis
title: str, plot title
xlabel: str, x-axis label
ylabel: str, y-axis label
output_path: str, path to save the output plot
"""
plt.style.use("dark_background")
fig, ax = plt.subplots(figsize=(12, 8))
# Scatter plot
sns.scatterplot(data=df, x=x, y=y, s=100, alpha=0.6,
color=STYLE_CONFIG["primary_color"])
# Regression line
sns.regplot(data=df, x=x, y=y, scatter=False,
color=STYLE_CONFIG["secondary_color"],
line_kws={"linewidth": 2})
# Add correlation coefficient
corr = df[x].corr(df[y])
plt.text(0.05, 0.95, f"Correlation: {corr:.2f}",
transform=ax.transAxes,
fontsize=STYLE_CONFIG["font_sizes"]["text"],
color=STYLE_CONFIG["text_color"],
bbox=dict(facecolor=STYLE_CONFIG["background_color"],
edgecolor=STYLE_CONFIG["text_color"],
alpha=0.7))
setup_plot(fig, ax, title, xlabel=xlabel, ylabel=ylabel)
plt.savefig(output_path, dpi=300, bbox_inches="tight")
plt.close()
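# Illustrative usage (comment only; data and output path are placeholders):
#   demo = pd.DataFrame({"tokens": [100, 200, 300], "processing_time": [1.0, 2.1, 2.9]})
#   plot_correlation(demo, "tokens", "processing_time",
#                    "Processing Time vs Input Size", "Number of Input Tokens",
#                    "Processing Time (seconds)", "/tmp/example_correlation.png")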

View file

@ -0,0 +1,174 @@
"""Shared utilities for benchmarks and tests."""
import os
import json
import subprocess
from datetime import datetime
from typing import Any, Dict, List, Optional, Union
import psutil
import scipy.io.wavfile as wavfile
# Check for torch availability once at module level
TORCH_AVAILABLE = False
try:
import torch
TORCH_AVAILABLE = torch.cuda.is_available()
except ImportError:
pass
def get_audio_length(audio_data: bytes, temp_dir: Optional[str] = None) -> float:
"""Get audio length in seconds from bytes data.
Args:
audio_data: Raw audio bytes
temp_dir: Directory for temporary file. If None, uses system temp directory.
Returns:
float: Audio length in seconds
"""
if temp_dir is None:
import tempfile
temp_dir = tempfile.gettempdir()
temp_path = os.path.join(temp_dir, "temp.wav")
os.makedirs(temp_dir, exist_ok=True)
with open(temp_path, "wb") as f:
f.write(audio_data)
try:
rate, data = wavfile.read(temp_path)
return len(data) / rate
finally:
if os.path.exists(temp_path):
os.remove(temp_path)
def get_gpu_memory(average: bool = True) -> Optional[Union[float, List[float]]]:
"""Get GPU memory usage using PyTorch if available, falling back to nvidia-smi.
Args:
average: If True and multiple GPUs present, returns average memory usage.
If False, returns list of memory usage per GPU.
Returns:
float or List[float] or None: GPU memory usage in MB. Returns None if no GPU available.
If average=False and multiple GPUs present, returns list of values.
"""
if TORCH_AVAILABLE:
n_gpus = torch.cuda.device_count()
memory_used = []
for i in range(n_gpus):
memory_used.append(torch.cuda.memory_allocated(i) / 1024**2) # Convert to MB
if average and len(memory_used) > 0:
return sum(memory_used) / len(memory_used)
return memory_used if len(memory_used) > 1 else memory_used[0]
# Fall back to nvidia-smi
try:
result = subprocess.check_output(
["nvidia-smi", "--query-gpu=memory.used", "--format=csv,nounits,noheader"]
)
memory_values = [float(x.strip()) for x in result.decode("utf-8").split("\n") if x.strip()]
if average and len(memory_values) > 0:
return sum(memory_values) / len(memory_values)
return memory_values if len(memory_values) > 1 else memory_values[0]
except (subprocess.CalledProcessError, FileNotFoundError):
return None
def get_system_metrics() -> Dict[str, Union[str, float]]:
"""Get current system metrics including CPU, RAM, and GPU if available.
Returns:
dict: System metrics including timestamp, CPU%, RAM%, RAM GB, and GPU MB if available
"""
# Get per-CPU percentages and calculate average
cpu_percentages = psutil.cpu_percent(percpu=True)
avg_cpu = sum(cpu_percentages) / len(cpu_percentages)
metrics = {
"timestamp": datetime.now().isoformat(),
"cpu_percent": round(avg_cpu, 2),
"ram_percent": psutil.virtual_memory().percent,
"ram_used_gb": psutil.virtual_memory().used / (1024**3),
}
gpu_mem = get_gpu_memory(average=True) # Use average for system metrics
if gpu_mem is not None:
metrics["gpu_memory_used"] = round(gpu_mem, 2)
return metrics
def save_audio_file(audio_data: bytes, identifier: str, output_dir: str) -> str:
"""Save audio data to a file with proper naming and directory creation.
Args:
audio_data: Raw audio bytes
identifier: String to identify this audio file (e.g. token count, test name)
output_dir: Directory to save the file
Returns:
str: Path to the saved audio file
"""
os.makedirs(output_dir, exist_ok=True)
output_file = os.path.join(output_dir, f"{identifier}.wav")
with open(output_file, "wb") as f:
f.write(audio_data)
return output_file
def write_benchmark_stats(stats: List[Dict[str, Any]], output_file: str) -> None:
"""Write benchmark statistics to a file in a clean, organized format.
Args:
stats: List of dictionaries containing stat name/value pairs
output_file: Path to output file
"""
os.makedirs(os.path.dirname(output_file), exist_ok=True)
with open(output_file, "w") as f:
for section in stats:
# Write section header
f.write(f"=== {section['title']} ===\n\n")
# Write stats
for label, value in section['stats'].items():
if isinstance(value, float):
f.write(f"{label}: {value:.2f}\n")
else:
f.write(f"{label}: {value}\n")
f.write("\n")
def save_json_results(results: Dict[str, Any], output_file: str) -> None:
"""Save benchmark results to a JSON file with proper formatting.
Args:
results: Dictionary of results to save
output_file: Path to output file
"""
os.makedirs(os.path.dirname(output_file), exist_ok=True)
with open(output_file, "w") as f:
json.dump(results, f, indent=2)
def real_time_factor(processing_time: float, audio_length: float, decimals: int = 2) -> float:
"""Calculate Real-Time Factor (RTF) as processing-time / length-of-audio.
Args:
processing_time: Time taken to process/generate audio
audio_length: Length of the generated audio
decimals: Number of decimal places to round to
Returns:
float: RTF value
"""
rtf = processing_time / audio_length
return round(rtf, decimals)
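# Illustrative usage (comment only; output paths are placeholders):
#   metrics = get_system_metrics()              # single snapshot dict
#   rtf = real_time_factor(27.11, 931.15)       # -> 0.03
#   save_json_results({"system_metrics": [metrics]}, "/tmp/example_results.json")
#   write_benchmark_stats([{"title": "Example", "stats": {"Average RTF": rtf}}],
#                         "/tmp/example_stats.txt")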

View file

@ -0,0 +1,111 @@
{
"results": [
{
"tokens": 100,
"processing_time": 18.833295583724976,
"output_length": 31.15,
"realtime_factor": 1.6539856161403135,
"elapsed_time": 19.024322748184204
},
{
"tokens": 200,
"processing_time": 38.95506024360657,
"output_length": 62.6,
"realtime_factor": 1.6069799304257042,
"elapsed_time": 58.21527123451233
},
{
"tokens": 300,
"processing_time": 49.74252939224243,
"output_length": 96.325,
"realtime_factor": 1.9364716908630366,
"elapsed_time": 108.19673728942871
},
{
"tokens": 400,
"processing_time": 61.349056243896484,
"output_length": 128.575,
"realtime_factor": 2.095794261102292,
"elapsed_time": 169.733656167984
},
{
"tokens": 500,
"processing_time": 82.86568236351013,
"output_length": 158.575,
"realtime_factor": 1.9136389815071193,
"elapsed_time": 252.7968451976776
}
],
"system_metrics": [
{
"timestamp": "2025-01-03T00:13:49.865330",
"cpu_percent": 8.0,
"ram_percent": 39.4,
"ram_used_gb": 25.03811264038086,
"gpu_memory_used": 1204.0
},
{
"timestamp": "2025-01-03T00:14:08.781551",
"cpu_percent": 26.8,
"ram_percent": 42.6,
"ram_used_gb": 27.090862274169922,
"gpu_memory_used": 1225.0
},
{
"timestamp": "2025-01-03T00:14:08.916973",
"cpu_percent": 16.1,
"ram_percent": 42.6,
"ram_used_gb": 27.089553833007812,
"gpu_memory_used": 1225.0
},
{
"timestamp": "2025-01-03T00:14:47.979053",
"cpu_percent": 31.5,
"ram_percent": 43.6,
"ram_used_gb": 27.714427947998047,
"gpu_memory_used": 1225.0
},
{
"timestamp": "2025-01-03T00:14:48.098976",
"cpu_percent": 20.0,
"ram_percent": 43.6,
"ram_used_gb": 27.704315185546875,
"gpu_memory_used": 1211.0
},
{
"timestamp": "2025-01-03T00:15:37.944729",
"cpu_percent": 29.7,
"ram_percent": 38.6,
"ram_used_gb": 24.53925323486328,
"gpu_memory_used": 1217.0
},
{
"timestamp": "2025-01-03T00:15:38.071915",
"cpu_percent": 8.6,
"ram_percent": 38.5,
"ram_used_gb": 24.51690673828125,
"gpu_memory_used": 1208.0
},
{
"timestamp": "2025-01-03T00:16:39.525449",
"cpu_percent": 23.4,
"ram_percent": 38.8,
"ram_used_gb": 24.71230697631836,
"gpu_memory_used": 1221.0
},
{
"timestamp": "2025-01-03T00:16:39.612442",
"cpu_percent": 5.5,
"ram_percent": 38.9,
"ram_used_gb": 24.72066879272461,
"gpu_memory_used": 1221.0
},
{
"timestamp": "2025-01-03T00:18:02.569076",
"cpu_percent": 27.4,
"ram_percent": 39.1,
"ram_used_gb": 24.868202209472656,
"gpu_memory_used": 1264.0
}
]
}

View file

@ -0,0 +1,300 @@
{
"results": [
{
"tokens": 100,
"processing_time": 0.96,
"output_length": 31.1,
"rtf": 0.03,
"elapsed_time": 1.11
},
{
"tokens": 250,
"processing_time": 2.23,
"output_length": 77.17,
"rtf": 0.03,
"elapsed_time": 3.49
},
{
"tokens": 400,
"processing_time": 4.05,
"output_length": 128.05,
"rtf": 0.03,
"elapsed_time": 7.77
},
{
"tokens": 550,
"processing_time": 4.06,
"output_length": 171.45,
"rtf": 0.02,
"elapsed_time": 12.0
},
{
"tokens": 700,
"processing_time": 6.01,
"output_length": 221.6,
"rtf": 0.03,
"elapsed_time": 18.16
},
{
"tokens": 850,
"processing_time": 6.9,
"output_length": 269.1,
"rtf": 0.03,
"elapsed_time": 25.21
},
{
"tokens": 1000,
"processing_time": 7.65,
"output_length": 315.05,
"rtf": 0.02,
"elapsed_time": 33.03
},
{
"tokens": 6000,
"processing_time": 48.7,
"output_length": 1837.1,
"rtf": 0.03,
"elapsed_time": 82.21
},
{
"tokens": 11000,
"processing_time": 92.44,
"output_length": 3388.57,
"rtf": 0.03,
"elapsed_time": 175.46
},
{
"tokens": 16000,
"processing_time": 163.61,
"output_length": 4977.32,
"rtf": 0.03,
"elapsed_time": 340.46
},
{
"tokens": 21000,
"processing_time": 209.72,
"output_length": 6533.3,
"rtf": 0.03,
"elapsed_time": 551.92
},
{
"tokens": 26000,
"processing_time": 329.35,
"output_length": 8068.15,
"rtf": 0.04,
"elapsed_time": 883.37
},
{
"tokens": 31000,
"processing_time": 473.52,
"output_length": 9611.48,
"rtf": 0.05,
"elapsed_time": 1359.28
},
{
"tokens": 36000,
"processing_time": 650.98,
"output_length": 11157.15,
"rtf": 0.06,
"elapsed_time": 2012.9
}
],
"system_metrics": [
{
"timestamp": "2025-01-03T14:41:01.331735",
"cpu_percent": 7.5,
"ram_percent": 50.2,
"ram_used_gb": 31.960269927978516,
"gpu_memory_used": 3191.0
},
{
"timestamp": "2025-01-03T14:41:02.357116",
"cpu_percent": 17.01,
"ram_percent": 50.2,
"ram_used_gb": 31.96163558959961,
"gpu_memory_used": 3426.0
},
{
"timestamp": "2025-01-03T14:41:02.445009",
"cpu_percent": 9.5,
"ram_percent": 50.3,
"ram_used_gb": 31.966781616210938,
"gpu_memory_used": 3426.0
},
{
"timestamp": "2025-01-03T14:41:04.742152",
"cpu_percent": 18.27,
"ram_percent": 50.4,
"ram_used_gb": 32.08788299560547,
"gpu_memory_used": 3642.0
},
{
"timestamp": "2025-01-03T14:41:04.847795",
"cpu_percent": 16.27,
"ram_percent": 50.5,
"ram_used_gb": 32.094364166259766,
"gpu_memory_used": 3640.0
},
{
"timestamp": "2025-01-03T14:41:09.019590",
"cpu_percent": 15.97,
"ram_percent": 50.7,
"ram_used_gb": 32.23244094848633,
"gpu_memory_used": 3640.0
},
{
"timestamp": "2025-01-03T14:41:09.110324",
"cpu_percent": 3.54,
"ram_percent": 50.7,
"ram_used_gb": 32.234458923339844,
"gpu_memory_used": 3640.0
},
{
"timestamp": "2025-01-03T14:41:13.252607",
"cpu_percent": 13.4,
"ram_percent": 50.6,
"ram_used_gb": 32.194271087646484,
"gpu_memory_used": 3935.0
},
{
"timestamp": "2025-01-03T14:41:13.327557",
"cpu_percent": 4.69,
"ram_percent": 50.6,
"ram_used_gb": 32.191776275634766,
"gpu_memory_used": 3935.0
},
{
"timestamp": "2025-01-03T14:41:19.413633",
"cpu_percent": 12.92,
"ram_percent": 50.9,
"ram_used_gb": 32.3467903137207,
"gpu_memory_used": 4250.0
},
{
"timestamp": "2025-01-03T14:41:19.492758",
"cpu_percent": 7.5,
"ram_percent": 50.8,
"ram_used_gb": 32.34375,
"gpu_memory_used": 4250.0
},
{
"timestamp": "2025-01-03T14:41:26.467284",
"cpu_percent": 13.09,
"ram_percent": 51.2,
"ram_used_gb": 32.56281280517578,
"gpu_memory_used": 4249.0
},
{
"timestamp": "2025-01-03T14:41:26.553559",
"cpu_percent": 8.39,
"ram_percent": 51.2,
"ram_used_gb": 32.56183624267578,
"gpu_memory_used": 4249.0
},
{
"timestamp": "2025-01-03T14:41:34.284362",
"cpu_percent": 12.61,
"ram_percent": 51.7,
"ram_used_gb": 32.874778747558594,
"gpu_memory_used": 4250.0
},
{
"timestamp": "2025-01-03T14:41:34.362353",
"cpu_percent": 1.25,
"ram_percent": 51.7,
"ram_used_gb": 32.87461471557617,
"gpu_memory_used": 4250.0
},
{
"timestamp": "2025-01-03T14:42:23.471312",
"cpu_percent": 11.64,
"ram_percent": 54.9,
"ram_used_gb": 34.90264129638672,
"gpu_memory_used": 4647.0
},
{
"timestamp": "2025-01-03T14:42:23.547203",
"cpu_percent": 5.31,
"ram_percent": 54.9,
"ram_used_gb": 34.91563415527344,
"gpu_memory_used": 4647.0
},
{
"timestamp": "2025-01-03T14:43:56.724933",
"cpu_percent": 12.97,
"ram_percent": 59.5,
"ram_used_gb": 37.84241485595703,
"gpu_memory_used": 4655.0
},
{
"timestamp": "2025-01-03T14:43:56.815453",
"cpu_percent": 11.75,
"ram_percent": 59.5,
"ram_used_gb": 37.832679748535156,
"gpu_memory_used": 4655.0
},
{
"timestamp": "2025-01-03T14:46:41.705155",
"cpu_percent": 12.94,
"ram_percent": 66.3,
"ram_used_gb": 42.1534538269043,
"gpu_memory_used": 4729.0
},
{
"timestamp": "2025-01-03T14:46:41.835177",
"cpu_percent": 7.73,
"ram_percent": 66.2,
"ram_used_gb": 42.13554000854492,
"gpu_memory_used": 4729.0
},
{
"timestamp": "2025-01-03T14:50:13.166236",
"cpu_percent": 11.62,
"ram_percent": 73.4,
"ram_used_gb": 46.71288299560547,
"gpu_memory_used": 4676.0
},
{
"timestamp": "2025-01-03T14:50:13.261611",
"cpu_percent": 8.16,
"ram_percent": 73.4,
"ram_used_gb": 46.71356201171875,
"gpu_memory_used": 4676.0
},
{
"timestamp": "2025-01-03T14:55:44.623607",
"cpu_percent": 12.92,
"ram_percent": 82.8,
"ram_used_gb": 52.65533447265625,
"gpu_memory_used": 4636.0
},
{
"timestamp": "2025-01-03T14:55:44.735410",
"cpu_percent": 15.29,
"ram_percent": 82.7,
"ram_used_gb": 52.63290786743164,
"gpu_memory_used": 4636.0
},
{
"timestamp": "2025-01-03T15:03:40.534449",
"cpu_percent": 13.88,
"ram_percent": 85.0,
"ram_used_gb": 54.050071716308594,
"gpu_memory_used": 4771.0
},
{
"timestamp": "2025-01-03T15:03:40.638708",
"cpu_percent": 12.21,
"ram_percent": 85.0,
"ram_used_gb": 54.053733825683594,
"gpu_memory_used": 4771.0
},
{
"timestamp": "2025-01-03T15:14:34.159142",
"cpu_percent": 14.51,
"ram_percent": 78.1,
"ram_used_gb": 49.70396423339844,
"gpu_memory_used": 4739.0
}
]
}

View file

@ -0,0 +1,9 @@
=== Benchmark Statistics (with correct RTF) ===
Overall Stats:
Total tokens processed: 150850
Total audio generated: 46786.59s
Total test duration: 2012.90s
Average processing rate: 104.34 tokens/second
Average RTF: 0.03x

File diff suppressed because it is too large

View file

@ -0,0 +1,23 @@
=== Benchmark Statistics (with correct RTF) ===
Total tokens processed: 2250
Total audio generated (s): 710.80
Total test duration (s): 332.81
Average processing rate (tokens/s): 6.77
Average RTF: 0.47
Average Real Time Speed: 2.14
=== Per-chunk Stats ===
Average chunk size (tokens): 450.00
Min chunk size (tokens): 150
Max chunk size (tokens): 750
Average processing time (s): 66.51
Average output length (s): 142.16
=== Performance Ranges ===
Processing rate range (tokens/s): 6.50 - 7.00
RTF range: 0.45x - 0.50x
Real Time Speed range: 2.00x - 2.22x

View file

@ -0,0 +1,607 @@
{
"results": [
{
"tokens": 150,
"processing_time": 1.03,
"output_length": 45.9,
"rtf": 0.02,
"elapsed_time": 1.07
},
{
"tokens": 300,
"processing_time": 2.51,
"output_length": 96.425,
"rtf": 0.03,
"elapsed_time": 3.63
},
{
"tokens": 450,
"processing_time": 3.69,
"output_length": 143.1,
"rtf": 0.03,
"elapsed_time": 7.37
},
{
"tokens": 600,
"processing_time": 5.52,
"output_length": 188.675,
"rtf": 0.03,
"elapsed_time": 12.96
},
{
"tokens": 750,
"processing_time": 6.32,
"output_length": 236.7,
"rtf": 0.03,
"elapsed_time": 19.34
},
{
"tokens": 900,
"processing_time": 8.4,
"output_length": 283.425,
"rtf": 0.03,
"elapsed_time": 27.82
},
{
"tokens": 2000,
"processing_time": 15.46,
"output_length": 624.325,
"rtf": 0.02,
"elapsed_time": 43.4
},
{
"tokens": 3000,
"processing_time": 27.11,
"output_length": 931.15,
"rtf": 0.03,
"elapsed_time": 70.7
}
],
"system_metrics": [
{
"timestamp": "2025-01-03T17:45:10.797646",
"cpu_percent": 10.05,
"ram_percent": 54.2,
"ram_used_gb": 34.474674224853516,
"gpu_memory_used": 3992.0,
"relative_time": 0.06637930870056152
},
{
"timestamp": "2025-01-03T17:45:11.871315",
"cpu_percent": 13.54,
"ram_percent": 54.2,
"ram_used_gb": 34.47991180419922,
"gpu_memory_used": 3990.0,
"relative_time": 1.1326591968536377
},
{
"timestamp": "2025-01-03T17:45:12.932597",
"cpu_percent": 12.76,
"ram_percent": 54.2,
"ram_used_gb": 34.501747131347656,
"gpu_memory_used": 3990.0,
"relative_time": 2.192795515060425
},
{
"timestamp": "2025-01-03T17:45:13.995052",
"cpu_percent": 15.48,
"ram_percent": 54.2,
"ram_used_gb": 34.48517990112305,
"gpu_memory_used": 3989.0,
"relative_time": 3.253366231918335
},
{
"timestamp": "2025-01-03T17:45:15.056310",
"cpu_percent": 11.96,
"ram_percent": 54.2,
"ram_used_gb": 34.457679748535156,
"gpu_memory_used": 3980.0,
"relative_time": 4.331450462341309
},
{
"timestamp": "2025-01-03T17:45:16.128795",
"cpu_percent": 14.28,
"ram_percent": 54.2,
"ram_used_gb": 34.465850830078125,
"gpu_memory_used": 3980.0,
"relative_time": 5.386842727661133
},
{
"timestamp": "2025-01-03T17:45:17.185921",
"cpu_percent": 13.14,
"ram_percent": 54.2,
"ram_used_gb": 34.4874153137207,
"gpu_memory_used": 3980.0,
"relative_time": 6.450911998748779
},
{
"timestamp": "2025-01-03T17:45:18.248252",
"cpu_percent": 15.54,
"ram_percent": 54.4,
"ram_used_gb": 34.581886291503906,
"gpu_memory_used": 3986.0,
"relative_time": 7.525278329849243
},
{
"timestamp": "2025-01-03T17:45:19.324382",
"cpu_percent": 14.89,
"ram_percent": 54.4,
"ram_used_gb": 34.5898551940918,
"gpu_memory_used": 3987.0,
"relative_time": 8.588879585266113
},
{
"timestamp": "2025-01-03T17:45:20.394701",
"cpu_percent": 12.13,
"ram_percent": 54.4,
"ram_used_gb": 34.582420349121094,
"gpu_memory_used": 3986.0,
"relative_time": 9.65286660194397
},
{
"timestamp": "2025-01-03T17:45:21.455704",
"cpu_percent": 11.02,
"ram_percent": 54.4,
"ram_used_gb": 34.617252349853516,
"gpu_memory_used": 3986.0,
"relative_time": 10.71657395362854
},
{
"timestamp": "2025-01-03T17:45:22.525946",
"cpu_percent": 14.01,
"ram_percent": 54.5,
"ram_used_gb": 34.651466369628906,
"gpu_memory_used": 3989.0,
"relative_time": 11.787351846694946
},
{
"timestamp": "2025-01-03T17:45:23.584761",
"cpu_percent": 13.09,
"ram_percent": 54.5,
"ram_used_gb": 34.680885314941406,
"gpu_memory_used": 3989.0,
"relative_time": 12.846002101898193
},
{
"timestamp": "2025-01-03T17:45:24.645316",
"cpu_percent": 17.72,
"ram_percent": 54.6,
"ram_used_gb": 34.741127014160156,
"gpu_memory_used": 3985.0,
"relative_time": 13.918755054473877
},
{
"timestamp": "2025-01-03T17:45:25.718731",
"cpu_percent": 14.66,
"ram_percent": 54.6,
"ram_used_gb": 34.71047592163086,
"gpu_memory_used": 3982.0,
"relative_time": 14.974157810211182
},
{
"timestamp": "2025-01-03T17:45:26.774860",
"cpu_percent": 11.52,
"ram_percent": 54.6,
"ram_used_gb": 34.728397369384766,
"gpu_memory_used": 3982.0,
"relative_time": 16.034392833709717
},
{
"timestamp": "2025-01-03T17:45:27.837623",
"cpu_percent": 11.04,
"ram_percent": 54.6,
"ram_used_gb": 34.75224685668945,
"gpu_memory_used": 3981.0,
"relative_time": 17.096498250961304
},
{
"timestamp": "2025-01-03T17:45:28.898447",
"cpu_percent": 12.17,
"ram_percent": 54.7,
"ram_used_gb": 34.796974182128906,
"gpu_memory_used": 3977.0,
"relative_time": 18.157397270202637
},
{
"timestamp": "2025-01-03T17:45:29.959510",
"cpu_percent": 10.72,
"ram_percent": 54.7,
"ram_used_gb": 34.819969177246094,
"gpu_memory_used": 3991.0,
"relative_time": 19.22814679145813
},
{
"timestamp": "2025-01-03T17:45:31.033262",
"cpu_percent": 17.95,
"ram_percent": 55.0,
"ram_used_gb": 34.9871711730957,
"gpu_memory_used": 3995.0,
"relative_time": 20.29205060005188
},
{
"timestamp": "2025-01-03T17:45:32.091757",
"cpu_percent": 19.11,
"ram_percent": 55.0,
"ram_used_gb": 35.0067138671875,
"gpu_memory_used": 3995.0,
"relative_time": 21.353832006454468
},
{
"timestamp": "2025-01-03T17:45:33.156831",
"cpu_percent": 32.93,
"ram_percent": 55.1,
"ram_used_gb": 35.05879211425781,
"gpu_memory_used": 3995.0,
"relative_time": 22.416496992111206
},
{
"timestamp": "2025-01-03T17:45:34.217136",
"cpu_percent": 12.59,
"ram_percent": 55.2,
"ram_used_gb": 35.10686111450195,
"gpu_memory_used": 3994.0,
"relative_time": 23.476072549819946
},
{
"timestamp": "2025-01-03T17:45:35.275577",
"cpu_percent": 30.79,
"ram_percent": 55.4,
"ram_used_gb": 35.22132110595703,
"gpu_memory_used": 3989.0,
"relative_time": 24.564188957214355
},
{
"timestamp": "2025-01-03T17:45:36.365095",
"cpu_percent": 13.36,
"ram_percent": 55.2,
"ram_used_gb": 35.08255386352539,
"gpu_memory_used": 4000.0,
"relative_time": 25.64090871810913
},
{
"timestamp": "2025-01-03T17:45:37.451539",
"cpu_percent": 14.94,
"ram_percent": 55.2,
"ram_used_gb": 35.118614196777344,
"gpu_memory_used": 4000.0,
"relative_time": 26.71500325202942
},
{
"timestamp": "2025-01-03T17:45:38.525364",
"cpu_percent": 12.76,
"ram_percent": 55.4,
"ram_used_gb": 35.221614837646484,
"gpu_memory_used": 3999.0,
"relative_time": 27.806236505508423
},
{
"timestamp": "2025-01-03T17:45:39.616790",
"cpu_percent": 16.11,
"ram_percent": 55.4,
"ram_used_gb": 35.2247200012207,
"gpu_memory_used": 3999.0,
"relative_time": 28.875747203826904
},
{
"timestamp": "2025-01-03T17:45:40.675234",
"cpu_percent": 14.96,
"ram_percent": 55.4,
"ram_used_gb": 35.21339416503906,
"gpu_memory_used": 3999.0,
"relative_time": 29.94703769683838
},
{
"timestamp": "2025-01-03T17:45:41.746176",
"cpu_percent": 10.99,
"ram_percent": 55.4,
"ram_used_gb": 35.260677337646484,
"gpu_memory_used": 3994.0,
"relative_time": 31.006144046783447
},
{
"timestamp": "2025-01-03T17:45:42.807809",
"cpu_percent": 13.15,
"ram_percent": 55.5,
"ram_used_gb": 35.299591064453125,
"gpu_memory_used": 3994.0,
"relative_time": 32.0741171836853
},
{
"timestamp": "2025-01-03T17:45:43.879826",
"cpu_percent": 12.74,
"ram_percent": 55.6,
"ram_used_gb": 35.34665298461914,
"gpu_memory_used": 3994.0,
"relative_time": 33.14525270462036
},
{
"timestamp": "2025-01-03T17:45:44.954413",
"cpu_percent": 12.11,
"ram_percent": 55.6,
"ram_used_gb": 35.34089660644531,
"gpu_memory_used": 3990.0,
"relative_time": 34.21659064292908
},
{
"timestamp": "2025-01-03T17:45:46.025229",
"cpu_percent": 13.02,
"ram_percent": 55.6,
"ram_used_gb": 35.37482833862305,
"gpu_memory_used": 3991.0,
"relative_time": 35.28446078300476
},
{
"timestamp": "2025-01-03T17:45:47.085470",
"cpu_percent": 13.53,
"ram_percent": 55.6,
"ram_used_gb": 35.392356872558594,
"gpu_memory_used": 3988.0,
"relative_time": 36.34242486953735
},
{
"timestamp": "2025-01-03T17:45:48.155295",
"cpu_percent": 15.0,
"ram_percent": 55.7,
"ram_used_gb": 35.449764251708984,
"gpu_memory_used": 3987.0,
"relative_time": 37.418004512786865
},
{
"timestamp": "2025-01-03T17:45:49.218400",
"cpu_percent": 13.84,
"ram_percent": 55.8,
"ram_used_gb": 35.468841552734375,
"gpu_memory_used": 3986.0,
"relative_time": 38.48085808753967
},
{
"timestamp": "2025-01-03T17:45:50.281360",
"cpu_percent": 13.25,
"ram_percent": 55.8,
"ram_used_gb": 35.491825103759766,
"gpu_memory_used": 3987.0,
"relative_time": 39.5399751663208
},
{
"timestamp": "2025-01-03T17:45:51.343810",
"cpu_percent": 10.34,
"ram_percent": 55.8,
"ram_used_gb": 35.51161193847656,
"gpu_memory_used": 3985.0,
"relative_time": 40.60230302810669
},
{
"timestamp": "2025-01-03T17:45:52.402527",
"cpu_percent": 12.56,
"ram_percent": 55.9,
"ram_used_gb": 35.57502365112305,
"gpu_memory_used": 3984.0,
"relative_time": 41.660725116729736
},
{
"timestamp": "2025-01-03T17:45:53.460932",
"cpu_percent": 12.04,
"ram_percent": 56.0,
"ram_used_gb": 35.61081314086914,
"gpu_memory_used": 3978.0,
"relative_time": 42.71787190437317
},
{
"timestamp": "2025-01-03T17:45:54.521959",
"cpu_percent": 10.13,
"ram_percent": 56.3,
"ram_used_gb": 35.822574615478516,
"gpu_memory_used": 3978.0,
"relative_time": 43.783926010131836
},
{
"timestamp": "2025-01-03T17:45:55.583212",
"cpu_percent": 28.17,
"ram_percent": 56.3,
"ram_used_gb": 35.78395462036133,
"gpu_memory_used": 3976.0,
"relative_time": 44.858543157577515
},
{
"timestamp": "2025-01-03T17:45:56.657026",
"cpu_percent": 16.61,
"ram_percent": 56.3,
"ram_used_gb": 35.7921028137207,
"gpu_memory_used": 3984.0,
"relative_time": 45.918612003326416
},
{
"timestamp": "2025-01-03T17:45:57.716203",
"cpu_percent": 15.03,
"ram_percent": 56.3,
"ram_used_gb": 35.79140853881836,
"gpu_memory_used": 3984.0,
"relative_time": 46.97588872909546
},
{
"timestamp": "2025-01-03T17:45:58.775392",
"cpu_percent": 14.81,
"ram_percent": 56.3,
"ram_used_gb": 35.80635452270508,
"gpu_memory_used": 3984.0,
"relative_time": 48.03421711921692
},
{
"timestamp": "2025-01-03T17:45:59.834277",
"cpu_percent": 15.06,
"ram_percent": 56.3,
"ram_used_gb": 35.81984329223633,
"gpu_memory_used": 3984.0,
"relative_time": 49.0965371131897
},
{
"timestamp": "2025-01-03T17:46:00.896761",
"cpu_percent": 19.76,
"ram_percent": 56.3,
"ram_used_gb": 35.7983512878418,
"gpu_memory_used": 3989.0,
"relative_time": 50.177143812179565
},
{
"timestamp": "2025-01-03T17:46:01.981868",
"cpu_percent": 17.32,
"ram_percent": 56.3,
"ram_used_gb": 35.81730270385742,
"gpu_memory_used": 3990.0,
"relative_time": 51.242098331451416
},
{
"timestamp": "2025-01-03T17:46:03.046930",
"cpu_percent": 19.8,
"ram_percent": 56.5,
"ram_used_gb": 35.92729949951172,
"gpu_memory_used": 3990.0,
"relative_time": 52.3223512172699
},
{
"timestamp": "2025-01-03T17:46:04.122311",
"cpu_percent": 20.91,
"ram_percent": 56.5,
"ram_used_gb": 35.949684143066406,
"gpu_memory_used": 3991.0,
"relative_time": 53.3851900100708
},
{
"timestamp": "2025-01-03T17:46:05.182768",
"cpu_percent": 17.39,
"ram_percent": 56.5,
"ram_used_gb": 35.94847869873047,
"gpu_memory_used": 3991.0,
"relative_time": 54.45881199836731
},
{
"timestamp": "2025-01-03T17:46:06.257550",
"cpu_percent": 16.64,
"ram_percent": 56.5,
"ram_used_gb": 35.9198112487793,
"gpu_memory_used": 3989.0,
"relative_time": 55.51820731163025
},
{
"timestamp": "2025-01-03T17:46:07.317263",
"cpu_percent": 15.99,
"ram_percent": 56.3,
"ram_used_gb": 35.82686233520508,
"gpu_memory_used": 3989.0,
"relative_time": 56.59837555885315
},
{
"timestamp": "2025-01-03T17:46:08.409244",
"cpu_percent": 15.11,
"ram_percent": 56.4,
"ram_used_gb": 35.852657318115234,
"gpu_memory_used": 3988.0,
"relative_time": 57.669328927993774
},
{
"timestamp": "2025-01-03T17:46:09.473703",
"cpu_percent": 18.54,
"ram_percent": 56.4,
"ram_used_gb": 35.889339447021484,
"gpu_memory_used": 3979.0,
"relative_time": 58.76238036155701
},
{
"timestamp": "2025-01-03T17:46:10.562180",
"cpu_percent": 15.7,
"ram_percent": 56.4,
"ram_used_gb": 35.90079879760742,
"gpu_memory_used": 3975.0,
"relative_time": 59.82209253311157
},
{
"timestamp": "2025-01-03T17:46:11.634373",
"cpu_percent": 16.25,
"ram_percent": 56.5,
"ram_used_gb": 35.94197082519531,
"gpu_memory_used": 3976.0,
"relative_time": 60.91385841369629
},
{
"timestamp": "2025-01-03T17:46:12.723458",
"cpu_percent": 16.98,
"ram_percent": 56.6,
"ram_used_gb": 35.99095153808594,
"gpu_memory_used": 3976.0,
"relative_time": 61.981855154037476
},
{
"timestamp": "2025-01-03T17:46:13.781955",
"cpu_percent": 15.59,
"ram_percent": 56.6,
"ram_used_gb": 36.00953674316406,
"gpu_memory_used": 3976.0,
"relative_time": 63.04051613807678
},
{
"timestamp": "2025-01-03T17:46:14.852706",
"cpu_percent": 13.16,
"ram_percent": 56.7,
"ram_used_gb": 36.050899505615234,
"gpu_memory_used": 3976.0,
"relative_time": 64.11573505401611
},
{
"timestamp": "2025-01-03T17:46:15.927719",
"cpu_percent": 12.34,
"ram_percent": 56.7,
"ram_used_gb": 36.07988739013672,
"gpu_memory_used": 3976.0,
"relative_time": 65.18661308288574
},
{
"timestamp": "2025-01-03T17:46:16.999292",
"cpu_percent": 12.34,
"ram_percent": 56.8,
"ram_used_gb": 36.099937438964844,
"gpu_memory_used": 3976.0,
"relative_time": 66.25790786743164
},
{
"timestamp": "2025-01-03T17:46:18.058608",
"cpu_percent": 11.74,
"ram_percent": 56.8,
"ram_used_gb": 36.14547348022461,
"gpu_memory_used": 3975.0,
"relative_time": 67.31676268577576
},
{
"timestamp": "2025-01-03T17:46:19.122597",
"cpu_percent": 12.63,
"ram_percent": 56.9,
"ram_used_gb": 36.177284240722656,
"gpu_memory_used": 3974.0,
"relative_time": 68.3815085887909
},
{
"timestamp": "2025-01-03T17:46:20.182864",
"cpu_percent": 9.65,
"ram_percent": 56.9,
"ram_used_gb": 36.216495513916016,
"gpu_memory_used": 3973.0,
"relative_time": 69.44507431983948
},
{
"timestamp": "2025-01-03T17:46:21.244696",
"cpu_percent": 10.38,
"ram_percent": 57.4,
"ram_used_gb": 36.51596450805664,
"gpu_memory_used": 3973.0,
"relative_time": 70.51762080192566
},
{
"timestamp": "2025-01-03T17:46:22.448455",
"cpu_percent": 9.24,
"ram_percent": 57.5,
"ram_used_gb": 36.56745529174805,
"gpu_memory_used": 3974.0,
"relative_time": 71.72753357887268
}
],
"test_duration": 74.18872809410095
}

View file

@ -0,0 +1,23 @@
=== Benchmark Statistics (with correct RTF) ===
Total tokens processed: 8150
Total audio generated (s): 2549.70
Total test duration (s): 70.70
Average processing rate (tokens/s): 120.20
Average RTF: 0.03
Average Real Time Speed: 36.36
=== Per-chunk Stats ===
Average chunk size (tokens): 1018.75
Min chunk size (tokens): 150
Max chunk size (tokens): 3000
Average processing time (s): 8.75
Average output length (s): 318.71
=== Performance Ranges ===
Processing rate range (tokens/s): 107.14 - 145.63
RTF range: 0.02x - 0.03x
Real Time Speed range: 33.33x - 50.00x

Binary file not shown.

After

Width:  |  Height:  |  Size: 234 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 212 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 449 KiB

View file

Before

Width:  |  Height:  |  Size: 764 KiB

After

Width:  |  Height:  |  Size: 764 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 232 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 202 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 448 KiB

View file

Before

Width:  |  Height:  |  Size: 198 KiB

After

Width:  |  Height:  |  Size: 198 KiB

View file

@ -332,8 +332,8 @@ def main():
)
parser.add_argument("--url", default="http://localhost:8880", help="API base URL")
parser.add_argument(
"--output-dir",
default="examples/output",
"--output-dir",
default="examples/assorted_checks/test_combinations/output",
help="Output directory for audio files",
)
args = parser.parse_args()

Binary file not shown.

Before

Width:  |  Height:  |  Size: 754 KiB

View file

@ -1,531 +0,0 @@
{
"results": [
{
"tokens": 100,
"processing_time": 8.54442310333252,
"output_length": 31.15,
"realtime_factor": 3.6456527987068887,
"elapsed_time": 8.720048666000366
},
{
"tokens": 200,
"processing_time": 1.3838517665863037,
"output_length": 62.6,
"realtime_factor": 45.236058883981606,
"elapsed_time": 10.258155345916748
},
{
"tokens": 300,
"processing_time": 2.2024788856506348,
"output_length": 96.325,
"realtime_factor": 43.73481200095347,
"elapsed_time": 12.594647407531738
},
{
"tokens": 400,
"processing_time": 3.175424098968506,
"output_length": 128.55,
"realtime_factor": 40.48278150995886,
"elapsed_time": 16.005898475646973
},
{
"tokens": 500,
"processing_time": 3.205301523208618,
"output_length": 158.55,
"realtime_factor": 49.46492517224587,
"elapsed_time": 19.377076625823975
},
{
"tokens": 600,
"processing_time": 3.9976348876953125,
"output_length": 189.225,
"realtime_factor": 47.33423769700254,
"elapsed_time": 23.568575859069824
},
{
"tokens": 700,
"processing_time": 4.98036003112793,
"output_length": 222.05,
"realtime_factor": 44.58513011351734,
"elapsed_time": 28.767319917678833
},
{
"tokens": 800,
"processing_time": 5.156893491744995,
"output_length": 253.825,
"realtime_factor": 49.22052402406907,
"elapsed_time": 34.1369092464447
},
{
"tokens": 900,
"processing_time": 5.8110880851745605,
"output_length": 283.75,
"realtime_factor": 48.82906537312906,
"elapsed_time": 40.16419458389282
},
{
"tokens": 1000,
"processing_time": 6.686216354370117,
"output_length": 315.45,
"realtime_factor": 47.17914935460046,
"elapsed_time": 47.11375427246094
},
{
"tokens": 2000,
"processing_time": 13.290695905685425,
"output_length": 624.925,
"realtime_factor": 47.01973504131358,
"elapsed_time": 60.842002630233765
},
{
"tokens": 3000,
"processing_time": 20.058005571365356,
"output_length": 932.05,
"realtime_factor": 46.46773063671828,
"elapsed_time": 81.50969815254211
},
{
"tokens": 4000,
"processing_time": 26.38338828086853,
"output_length": 1222.975,
"realtime_factor": 46.353978002394015,
"elapsed_time": 108.76348638534546
},
{
"tokens": 5000,
"processing_time": 32.472310066223145,
"output_length": 1525.15,
"realtime_factor": 46.967708699801484,
"elapsed_time": 142.2994668483734
},
{
"tokens": 6000,
"processing_time": 42.67592263221741,
"output_length": 1837.525,
"realtime_factor": 43.0576514030137,
"elapsed_time": 186.26759266853333
},
{
"tokens": 7000,
"processing_time": 51.601537466049194,
"output_length": 2146.875,
"realtime_factor": 41.60486499869347,
"elapsed_time": 239.59922289848328
},
{
"tokens": 8000,
"processing_time": 51.86434292793274,
"output_length": 2458.425,
"realtime_factor": 47.401063258741466,
"elapsed_time": 293.4462616443634
},
{
"tokens": 9000,
"processing_time": 60.4497971534729,
"output_length": 2772.1,
"realtime_factor": 45.857887545297416,
"elapsed_time": 356.02399826049805
},
{
"tokens": 10000,
"processing_time": 71.75962543487549,
"output_length": 3085.625,
"realtime_factor": 42.99945800024164,
"elapsed_time": 430.50863671302795
},
{
"tokens": 11000,
"processing_time": 96.66409230232239,
"output_length": 3389.3,
"realtime_factor": 35.062657904030935,
"elapsed_time": 529.3296246528625
},
{
"tokens": 12000,
"processing_time": 85.70126295089722,
"output_length": 3703.175,
"realtime_factor": 43.21027336693678,
"elapsed_time": 618.0248212814331
},
{
"tokens": 13000,
"processing_time": 97.2874686717987,
"output_length": 4030.825,
"realtime_factor": 41.43210893479068,
"elapsed_time": 717.9070522785187
},
{
"tokens": 14000,
"processing_time": 105.1045708656311,
"output_length": 4356.775,
"realtime_factor": 41.451812838566596,
"elapsed_time": 826.1140224933624
},
{
"tokens": 15000,
"processing_time": 111.0716404914856,
"output_length": 4663.325,
"realtime_factor": 41.984839508672565,
"elapsed_time": 940.0645899772644
},
{
"tokens": 16000,
"processing_time": 116.61742973327637,
"output_length": 4978.65,
"realtime_factor": 42.692160266154104,
"elapsed_time": 1061.1957621574402
}
],
"system_metrics": [
{
"timestamp": "2024-12-31T03:12:36.009478",
"cpu_percent": 8.1,
"ram_percent": 66.8,
"ram_used_gb": 42.47850799560547,
"gpu_memory_used": 2124.0
},
{
"timestamp": "2024-12-31T03:12:44.639678",
"cpu_percent": 7.7,
"ram_percent": 69.1,
"ram_used_gb": 43.984352111816406,
"gpu_memory_used": 3486.0
},
{
"timestamp": "2024-12-31T03:12:44.731107",
"cpu_percent": 8.3,
"ram_percent": 69.1,
"ram_used_gb": 43.97468948364258,
"gpu_memory_used": 3484.0
},
{
"timestamp": "2024-12-31T03:12:46.189723",
"cpu_percent": 14.2,
"ram_percent": 69.1,
"ram_used_gb": 43.98275375366211,
"gpu_memory_used": 3697.0
},
{
"timestamp": "2024-12-31T03:12:46.265437",
"cpu_percent": 4.7,
"ram_percent": 69.1,
"ram_used_gb": 43.982975006103516,
"gpu_memory_used": 3697.0
},
{
"timestamp": "2024-12-31T03:12:48.536216",
"cpu_percent": 12.5,
"ram_percent": 69.0,
"ram_used_gb": 43.86142349243164,
"gpu_memory_used": 3697.0
},
{
"timestamp": "2024-12-31T03:12:48.603827",
"cpu_percent": 6.2,
"ram_percent": 69.0,
"ram_used_gb": 43.8692626953125,
"gpu_memory_used": 3694.0
},
{
"timestamp": "2024-12-31T03:12:51.905764",
"cpu_percent": 14.2,
"ram_percent": 69.1,
"ram_used_gb": 43.93961715698242,
"gpu_memory_used": 3690.0
},
{
"timestamp": "2024-12-31T03:12:52.028178",
"cpu_percent": 26.0,
"ram_percent": 69.1,
"ram_used_gb": 43.944759368896484,
"gpu_memory_used": 3690.0
},
{
"timestamp": "2024-12-31T03:12:55.320709",
"cpu_percent": 13.2,
"ram_percent": 69.1,
"ram_used_gb": 43.943058013916016,
"gpu_memory_used": 3685.0
},
{
"timestamp": "2024-12-31T03:12:55.386582",
"cpu_percent": 3.2,
"ram_percent": 69.1,
"ram_used_gb": 43.9305419921875,
"gpu_memory_used": 3685.0
},
{
"timestamp": "2024-12-31T03:12:59.492304",
"cpu_percent": 15.6,
"ram_percent": 69.1,
"ram_used_gb": 43.964195251464844,
"gpu_memory_used": 4053.0
},
{
"timestamp": "2024-12-31T03:12:59.586143",
"cpu_percent": 2.1,
"ram_percent": 69.1,
"ram_used_gb": 43.9642448425293,
"gpu_memory_used": 4053.0
},
{
"timestamp": "2024-12-31T03:13:04.705286",
"cpu_percent": 12.0,
"ram_percent": 69.2,
"ram_used_gb": 43.992374420166016,
"gpu_memory_used": 4059.0
},
{
"timestamp": "2024-12-31T03:13:04.779475",
"cpu_percent": 4.7,
"ram_percent": 69.2,
"ram_used_gb": 43.9922981262207,
"gpu_memory_used": 4059.0
},
{
"timestamp": "2024-12-31T03:13:10.063292",
"cpu_percent": 12.4,
"ram_percent": 69.2,
"ram_used_gb": 44.004146575927734,
"gpu_memory_used": 4041.0
},
{
"timestamp": "2024-12-31T03:13:10.155395",
"cpu_percent": 6.8,
"ram_percent": 69.2,
"ram_used_gb": 44.004215240478516,
"gpu_memory_used": 4041.0
},
{
"timestamp": "2024-12-31T03:13:16.097887",
"cpu_percent": 13.1,
"ram_percent": 69.2,
"ram_used_gb": 44.0260009765625,
"gpu_memory_used": 4042.0
},
{
"timestamp": "2024-12-31T03:13:16.171478",
"cpu_percent": 4.5,
"ram_percent": 69.2,
"ram_used_gb": 44.02027130126953,
"gpu_memory_used": 4042.0
},
{
"timestamp": "2024-12-31T03:13:23.044945",
"cpu_percent": 12.6,
"ram_percent": 69.2,
"ram_used_gb": 44.03746795654297,
"gpu_memory_used": 4044.0
},
{
"timestamp": "2024-12-31T03:13:23.127442",
"cpu_percent": 8.3,
"ram_percent": 69.2,
"ram_used_gb": 44.0373420715332,
"gpu_memory_used": 4044.0
},
{
"timestamp": "2024-12-31T03:13:36.780309",
"cpu_percent": 12.5,
"ram_percent": 69.2,
"ram_used_gb": 44.00790786743164,
"gpu_memory_used": 4034.0
},
{
"timestamp": "2024-12-31T03:13:36.853474",
"cpu_percent": 6.2,
"ram_percent": 69.2,
"ram_used_gb": 44.00779724121094,
"gpu_memory_used": 4034.0
},
{
"timestamp": "2024-12-31T03:13:57.449274",
"cpu_percent": 12.4,
"ram_percent": 69.2,
"ram_used_gb": 44.0432243347168,
"gpu_memory_used": 4034.0
},
{
"timestamp": "2024-12-31T03:13:57.524592",
"cpu_percent": 6.2,
"ram_percent": 69.2,
"ram_used_gb": 44.03204345703125,
"gpu_memory_used": 4034.0
},
{
"timestamp": "2024-12-31T03:14:24.698822",
"cpu_percent": 13.4,
"ram_percent": 69.5,
"ram_used_gb": 44.18327331542969,
"gpu_memory_used": 4480.0
},
{
"timestamp": "2024-12-31T03:14:24.783683",
"cpu_percent": 4.2,
"ram_percent": 69.5,
"ram_used_gb": 44.182212829589844,
"gpu_memory_used": 4480.0
},
{
"timestamp": "2024-12-31T03:14:58.242642",
"cpu_percent": 12.8,
"ram_percent": 69.5,
"ram_used_gb": 44.20225524902344,
"gpu_memory_used": 4476.0
},
{
"timestamp": "2024-12-31T03:14:58.310907",
"cpu_percent": 2.9,
"ram_percent": 69.5,
"ram_used_gb": 44.19659423828125,
"gpu_memory_used": 4476.0
},
{
"timestamp": "2024-12-31T03:15:42.196813",
"cpu_percent": 14.3,
"ram_percent": 69.9,
"ram_used_gb": 44.43781661987305,
"gpu_memory_used": 4494.0
},
{
"timestamp": "2024-12-31T03:15:42.288427",
"cpu_percent": 13.7,
"ram_percent": 69.9,
"ram_used_gb": 44.439701080322266,
"gpu_memory_used": 4494.0
},
{
"timestamp": "2024-12-31T03:16:35.483849",
"cpu_percent": 14.7,
"ram_percent": 65.0,
"ram_used_gb": 41.35385513305664,
"gpu_memory_used": 4506.0
},
{
"timestamp": "2024-12-31T03:16:35.626628",
"cpu_percent": 32.9,
"ram_percent": 65.0,
"ram_used_gb": 41.34442138671875,
"gpu_memory_used": 4506.0
},
{
"timestamp": "2024-12-31T03:17:29.378353",
"cpu_percent": 13.4,
"ram_percent": 64.3,
"ram_used_gb": 40.8721809387207,
"gpu_memory_used": 4485.0
},
{
"timestamp": "2024-12-31T03:17:29.457464",
"cpu_percent": 5.1,
"ram_percent": 64.3,
"ram_used_gb": 40.875389099121094,
"gpu_memory_used": 4485.0
},
{
"timestamp": "2024-12-31T03:18:31.955862",
"cpu_percent": 14.3,
"ram_percent": 65.0,
"ram_used_gb": 41.360206604003906,
"gpu_memory_used": 4484.0
},
{
"timestamp": "2024-12-31T03:18:32.038999",
"cpu_percent": 12.5,
"ram_percent": 65.0,
"ram_used_gb": 41.37223434448242,
"gpu_memory_used": 4484.0
},
{
"timestamp": "2024-12-31T03:19:46.454105",
"cpu_percent": 13.9,
"ram_percent": 65.3,
"ram_used_gb": 41.562198638916016,
"gpu_memory_used": 4487.0
},
{
"timestamp": "2024-12-31T03:19:46.524303",
"cpu_percent": 6.8,
"ram_percent": 65.3,
"ram_used_gb": 41.56681442260742,
"gpu_memory_used": 4487.0
},
{
"timestamp": "2024-12-31T03:21:25.251452",
"cpu_percent": 23.7,
"ram_percent": 62.0,
"ram_used_gb": 39.456459045410156,
"gpu_memory_used": 4488.0
},
{
"timestamp": "2024-12-31T03:21:25.348643",
"cpu_percent": 2.9,
"ram_percent": 62.0,
"ram_used_gb": 39.454288482666016,
"gpu_memory_used": 4487.0
},
{
"timestamp": "2024-12-31T03:22:53.939896",
"cpu_percent": 12.9,
"ram_percent": 62.1,
"ram_used_gb": 39.50320053100586,
"gpu_memory_used": 4488.0
},
{
"timestamp": "2024-12-31T03:22:54.041607",
"cpu_percent": 8.3,
"ram_percent": 62.1,
"ram_used_gb": 39.49895095825195,
"gpu_memory_used": 4488.0
},
{
"timestamp": "2024-12-31T03:24:33.835432",
"cpu_percent": 12.9,
"ram_percent": 62.3,
"ram_used_gb": 39.647212982177734,
"gpu_memory_used": 4503.0
},
{
"timestamp": "2024-12-31T03:24:33.923914",
"cpu_percent": 7.6,
"ram_percent": 62.3,
"ram_used_gb": 39.64302062988281,
"gpu_memory_used": 4503.0
},
{
"timestamp": "2024-12-31T03:26:22.021598",
"cpu_percent": 12.9,
"ram_percent": 58.4,
"ram_used_gb": 37.162540435791016,
"gpu_memory_used": 4491.0
},
{
"timestamp": "2024-12-31T03:26:22.142138",
"cpu_percent": 12.0,
"ram_percent": 58.4,
"ram_used_gb": 37.162010192871094,
"gpu_memory_used": 4487.0
},
{
"timestamp": "2024-12-31T03:28:15.970365",
"cpu_percent": 15.0,
"ram_percent": 58.2,
"ram_used_gb": 37.04011535644531,
"gpu_memory_used": 4481.0
},
{
"timestamp": "2024-12-31T03:28:16.096459",
"cpu_percent": 12.4,
"ram_percent": 58.2,
"ram_used_gb": 37.035972595214844,
"gpu_memory_used": 4473.0
},
{
"timestamp": "2024-12-31T03:30:17.092257",
"cpu_percent": 12.4,
"ram_percent": 58.4,
"ram_used_gb": 37.14639663696289,
"gpu_memory_used": 4459.0
}
]
}

View file

@ -1,19 +0,0 @@
=== Benchmark Statistics ===
Overall Stats:
Total tokens processed: 140500
Total audio generated: 43469.18s
Total test duration: 1061.20s
Average processing rate: 137.67 tokens/second
Average realtime factor: 42.93x
Per-chunk Stats:
Average chunk size: 5620.00 tokens
Min chunk size: 100.00 tokens
Max chunk size: 16000.00 tokens
Average processing time: 41.13s
Average output length: 1738.77s
Performance Ranges:
Processing rate range: 11.70 - 155.99 tokens/second
Realtime factor range: 3.65x - 49.46x

View file

@ -1,406 +0,0 @@
import os
import json
import time
import subprocess
from datetime import datetime
import pandas as pd
import psutil
import seaborn as sns
import requests
import tiktoken
import scipy.io.wavfile as wavfile
import matplotlib.pyplot as plt
enc = tiktoken.get_encoding("cl100k_base")
def setup_plot(fig, ax, title):
"""Configure plot styling"""
# Improve grid
ax.grid(True, linestyle="--", alpha=0.3, color="#ffffff")
# Set title and labels with better fonts
ax.set_title(title, pad=20, fontsize=16, fontweight="bold", color="#ffffff")
ax.set_xlabel(ax.get_xlabel(), fontsize=14, fontweight="medium", color="#ffffff")
ax.set_ylabel(ax.get_ylabel(), fontsize=14, fontweight="medium", color="#ffffff")
# Improve tick labels
ax.tick_params(labelsize=12, colors="#ffffff")
# Style spines
for spine in ax.spines.values():
spine.set_color("#ffffff")
spine.set_alpha(0.3)
spine.set_linewidth(0.5)
# Set background colors
ax.set_facecolor("#1a1a2e")
fig.patch.set_facecolor("#1a1a2e")
return fig, ax
def get_text_for_tokens(text: str, num_tokens: int) -> str:
"""Get a slice of text that contains exactly num_tokens tokens"""
tokens = enc.encode(text)
if num_tokens > len(tokens):
return text
return enc.decode(tokens[:num_tokens])
def get_audio_length(audio_data: bytes) -> float:
"""Get audio length in seconds from bytes data"""
# Save to a temporary file
temp_path = "examples/benchmarks/output/temp.wav"
os.makedirs(os.path.dirname(temp_path), exist_ok=True)
with open(temp_path, "wb") as f:
f.write(audio_data)
# Read the audio file
try:
rate, data = wavfile.read(temp_path)
return len(data) / rate
finally:
# Clean up temp file
if os.path.exists(temp_path):
os.remove(temp_path)
def get_gpu_memory():
"""Get GPU memory usage using nvidia-smi"""
try:
result = subprocess.check_output(
["nvidia-smi", "--query-gpu=memory.used", "--format=csv,nounits,noheader"]
)
return float(result.decode("utf-8").strip())
except (subprocess.CalledProcessError, FileNotFoundError):
return None
def get_system_metrics():
"""Get current system metrics"""
metrics = {
"timestamp": datetime.now().isoformat(),
"cpu_percent": psutil.cpu_percent(),
"ram_percent": psutil.virtual_memory().percent,
"ram_used_gb": psutil.virtual_memory().used / (1024**3),
}
gpu_mem = get_gpu_memory()
if gpu_mem is not None:
metrics["gpu_memory_used"] = gpu_mem
return metrics
def make_tts_request(text: str, timeout: int = 120) -> tuple[float, float]:
"""Make TTS request using OpenAI-compatible endpoint and return processing time and output length"""
try:
start_time = time.time()
# Make request to OpenAI-compatible endpoint
response = requests.post(
"http://localhost:8880/v1/audio/speech",
json={
"model": "kokoro",
"input": text,
"voice": "af",
"response_format": "wav",
},
timeout=timeout,
)
response.raise_for_status()
processing_time = time.time() - start_time
audio_length = get_audio_length(response.content)
# Save the audio file
token_count = len(enc.encode(text))
output_file = f"examples/benchmarks/output/chunk_{token_count}_tokens.wav"
os.makedirs(os.path.dirname(output_file), exist_ok=True)
with open(output_file, "wb") as f:
f.write(response.content)
print(f"Saved audio to {output_file}")
return processing_time, audio_length
except requests.exceptions.RequestException as e:
print(f"Error making request for text: {text[:50]}... Error: {str(e)}")
return None, None
except Exception as e:
print(f"Error processing text: {text[:50]}... Error: {str(e)}")
return None, None
def plot_system_metrics(metrics_data):
"""Create plots for system metrics over time"""
df = pd.DataFrame(metrics_data)
df["timestamp"] = pd.to_datetime(df["timestamp"])
elapsed_time = (df["timestamp"] - df["timestamp"].iloc[0]).dt.total_seconds()
# Get baseline values (first measurement)
baseline_cpu = df["cpu_percent"].iloc[0]
baseline_ram = df["ram_used_gb"].iloc[0]
baseline_gpu = (
df["gpu_memory_used"].iloc[0] / 1024
if "gpu_memory_used" in df.columns
else None
) # Convert MB to GB
# Convert GPU memory to GB
if "gpu_memory_used" in df.columns:
df["gpu_memory_gb"] = df["gpu_memory_used"] / 1024
# Set plotting style
plt.style.use("dark_background")
# Create figure with 3 subplots (or 2 if no GPU)
has_gpu = "gpu_memory_used" in df.columns
num_plots = 3 if has_gpu else 2
fig, axes = plt.subplots(num_plots, 1, figsize=(15, 5 * num_plots))
fig.patch.set_facecolor("#1a1a2e")
# Apply rolling average for smoothing
    window = min(5, len(df) // 2)  # Small rolling window for light smoothing
# Plot 1: CPU Usage
smoothed_cpu = df["cpu_percent"].rolling(window=window, center=True).mean()
sns.lineplot(
x=elapsed_time, y=smoothed_cpu, ax=axes[0], color="#ff2a6d", linewidth=2
)
axes[0].axhline(
y=baseline_cpu, color="#05d9e8", linestyle="--", alpha=0.5, label="Baseline"
)
axes[0].set_xlabel("Time (seconds)", fontsize=14)
axes[0].set_ylabel("CPU Usage (%)", fontsize=14)
axes[0].tick_params(labelsize=12)
axes[0].set_title("CPU Usage Over Time", pad=20, fontsize=16, fontweight="bold")
axes[0].set_ylim(0, max(df["cpu_percent"]) * 1.1) # Add 10% padding
axes[0].legend()
# Plot 2: RAM Usage
smoothed_ram = df["ram_used_gb"].rolling(window=window, center=True).mean()
sns.lineplot(
x=elapsed_time, y=smoothed_ram, ax=axes[1], color="#05d9e8", linewidth=2
)
axes[1].axhline(
y=baseline_ram, color="#ff2a6d", linestyle="--", alpha=0.5, label="Baseline"
)
axes[1].set_xlabel("Time (seconds)", fontsize=14)
axes[1].set_ylabel("RAM Usage (GB)", fontsize=14)
axes[1].tick_params(labelsize=12)
axes[1].set_title("RAM Usage Over Time", pad=20, fontsize=16, fontweight="bold")
axes[1].set_ylim(0, max(df["ram_used_gb"]) * 1.1) # Add 10% padding
axes[1].legend()
# Plot 3: GPU Memory (if available)
if has_gpu:
smoothed_gpu = df["gpu_memory_gb"].rolling(window=window, center=True).mean()
sns.lineplot(
x=elapsed_time, y=smoothed_gpu, ax=axes[2], color="#ff2a6d", linewidth=2
)
axes[2].axhline(
y=baseline_gpu, color="#05d9e8", linestyle="--", alpha=0.5, label="Baseline"
)
axes[2].set_xlabel("Time (seconds)", fontsize=14)
axes[2].set_ylabel("GPU Memory (GB)", fontsize=14)
axes[2].tick_params(labelsize=12)
axes[2].set_title(
"GPU Memory Usage Over Time", pad=20, fontsize=16, fontweight="bold"
)
axes[2].set_ylim(0, max(df["gpu_memory_gb"]) * 1.1) # Add 10% padding
axes[2].legend()
# Style all subplots
for ax in axes:
ax.grid(True, linestyle="--", alpha=0.3)
ax.set_facecolor("#1a1a2e")
for spine in ax.spines.values():
spine.set_color("#ffffff")
spine.set_alpha(0.3)
plt.tight_layout()
plt.savefig("examples/benchmarks/system_usage.png", dpi=300, bbox_inches="tight")
plt.close()
def main():
# Create output directory
os.makedirs("examples/benchmarks/output", exist_ok=True)
# Read input text
with open(
"examples/benchmarks/the_time_machine_hg_wells.txt", "r", encoding="utf-8"
) as f:
text = f.read()
# Get total tokens in file
total_tokens = len(enc.encode(text))
print(f"Total tokens in file: {total_tokens}")
# Generate token sizes with dense sampling at start and increasing intervals
dense_range = list(range(100, 1001, 100))
current = max(dense_range)
large_range = []
while current <= total_tokens:
large_range.append(current)
current += 1000
token_sizes = sorted(list(set(dense_range + large_range)))
print(f"Testing sizes: {token_sizes}")
# Process chunks
results = []
system_metrics = []
test_start_time = time.time()
for num_tokens in token_sizes:
# Get text slice with exact token count
chunk = get_text_for_tokens(text, num_tokens)
actual_tokens = len(enc.encode(chunk))
print(f"\nProcessing chunk with {actual_tokens} tokens:")
print(f"Text preview: {chunk[:100]}...")
# Collect system metrics before processing
system_metrics.append(get_system_metrics())
processing_time, audio_length = make_tts_request(chunk)
if processing_time is None or audio_length is None:
print("Breaking loop due to error")
break
# Collect system metrics after processing
system_metrics.append(get_system_metrics())
results.append(
{
"tokens": actual_tokens,
"processing_time": processing_time,
"output_length": audio_length,
"realtime_factor": audio_length / processing_time,
"elapsed_time": time.time() - test_start_time,
}
)
# Save intermediate results
with open("examples/benchmarks/benchmark_results.json", "w") as f:
json.dump(
{"results": results, "system_metrics": system_metrics}, f, indent=2
)
# Create DataFrame and calculate stats
df = pd.DataFrame(results)
if df.empty:
print("No data to plot")
return
# Calculate useful metrics
df["tokens_per_second"] = df["tokens"] / df["processing_time"]
# Write detailed stats
with open("examples/benchmarks/benchmark_stats.txt", "w") as f:
f.write("=== Benchmark Statistics ===\n\n")
f.write("Overall Stats:\n")
f.write(f"Total tokens processed: {df['tokens'].sum()}\n")
f.write(f"Total audio generated: {df['output_length'].sum():.2f}s\n")
f.write(f"Total test duration: {df['elapsed_time'].max():.2f}s\n")
f.write(
f"Average processing rate: {df['tokens_per_second'].mean():.2f} tokens/second\n"
)
f.write(f"Average realtime factor: {df['realtime_factor'].mean():.2f}x\n\n")
f.write("Per-chunk Stats:\n")
f.write(f"Average chunk size: {df['tokens'].mean():.2f} tokens\n")
f.write(f"Min chunk size: {df['tokens'].min():.2f} tokens\n")
f.write(f"Max chunk size: {df['tokens'].max():.2f} tokens\n")
f.write(f"Average processing time: {df['processing_time'].mean():.2f}s\n")
f.write(f"Average output length: {df['output_length'].mean():.2f}s\n\n")
f.write("Performance Ranges:\n")
f.write(
f"Processing rate range: {df['tokens_per_second'].min():.2f} - {df['tokens_per_second'].max():.2f} tokens/second\n"
)
f.write(
f"Realtime factor range: {df['realtime_factor'].min():.2f}x - {df['realtime_factor'].max():.2f}x\n"
)
# Set plotting style
plt.style.use("dark_background")
# Plot 1: Processing Time vs Token Count
fig, ax = plt.subplots(figsize=(12, 8))
sns.scatterplot(
data=df, x="tokens", y="processing_time", s=100, alpha=0.6, color="#ff2a6d"
)
sns.regplot(
data=df,
x="tokens",
y="processing_time",
scatter=False,
color="#05d9e8",
line_kws={"linewidth": 2},
)
corr = df["tokens"].corr(df["processing_time"])
plt.text(
0.05,
0.95,
f"Correlation: {corr:.2f}",
transform=ax.transAxes,
fontsize=10,
color="#ffffff",
bbox=dict(facecolor="#1a1a2e", edgecolor="#ffffff", alpha=0.7),
)
setup_plot(fig, ax, "Processing Time vs Input Size")
ax.set_xlabel("Number of Input Tokens")
ax.set_ylabel("Processing Time (seconds)")
plt.savefig("examples/benchmarks/processing_time.png", dpi=300, bbox_inches="tight")
plt.close()
# Plot 2: Realtime Factor vs Token Count
fig, ax = plt.subplots(figsize=(12, 8))
sns.scatterplot(
data=df, x="tokens", y="realtime_factor", s=100, alpha=0.6, color="#ff2a6d"
)
sns.regplot(
data=df,
x="tokens",
y="realtime_factor",
scatter=False,
color="#05d9e8",
line_kws={"linewidth": 2},
)
corr = df["tokens"].corr(df["realtime_factor"])
plt.text(
0.05,
0.95,
f"Correlation: {corr:.2f}",
transform=ax.transAxes,
fontsize=10,
color="#ffffff",
bbox=dict(facecolor="#1a1a2e", edgecolor="#ffffff", alpha=0.7),
)
setup_plot(fig, ax, "Realtime Factor vs Input Size")
ax.set_xlabel("Number of Input Tokens")
ax.set_ylabel("Realtime Factor (output length / processing time)")
plt.savefig("examples/benchmarks/realtime_factor.png", dpi=300, bbox_inches="tight")
plt.close()
# Plot system metrics
plot_system_metrics(system_metrics)
print("\nResults saved to:")
print("- examples/benchmarks/benchmark_results.json")
print("- examples/benchmarks/benchmark_stats.txt")
print("- examples/benchmarks/processing_time.png")
print("- examples/benchmarks/realtime_factor.png")
print("- examples/benchmarks/system_usage.png")
if any("gpu_memory_used" in m for m in system_metrics):
print("- examples/benchmarks/gpu_usage.png")
print("\nAudio files saved in examples/benchmarks/output/")
if __name__ == "__main__":
main()
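
Note that this deleted script reports its "realtime factor" as audio length divided by processing time (higher is faster), while the RTF-oriented script later in this diff inverts the ratio. A minimal sketch of the relationship between the two conventions, using the per-chunk averages from the stats file above (illustration only, not part of either script):

```python
# Per-chunk averages copied from the deleted benchmark stats
processing_time_s = 41.13    # average processing time per chunk
audio_length_s = 1738.77     # average output length per chunk

speed = audio_length_s / processing_time_s   # "realtime factor" here: ~42x faster than realtime
rtf = processing_time_s / audio_length_s     # RTF in the later script: ~0.024, lower is faster
assert abs(speed * rtf - 1.0) < 1e-9         # the two metrics are reciprocals of each other
```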

View file

@@ -1,314 +0,0 @@
import os
import json
import time
import subprocess
from datetime import datetime
import pandas as pd
import psutil
import seaborn as sns
import requests
import tiktoken
import scipy.io.wavfile as wavfile
import matplotlib.pyplot as plt
enc = tiktoken.get_encoding("cl100k_base")
def setup_plot(fig, ax, title):
"""Configure plot styling"""
ax.grid(True, linestyle="--", alpha=0.3, color="#ffffff")
ax.set_title(title, pad=20, fontsize=16, fontweight="bold", color="#ffffff")
ax.set_xlabel(ax.get_xlabel(), fontsize=14, fontweight="medium", color="#ffffff")
ax.set_ylabel(ax.get_ylabel(), fontsize=14, fontweight="medium", color="#ffffff")
ax.tick_params(labelsize=12, colors="#ffffff")
for spine in ax.spines.values():
spine.set_color("#ffffff")
spine.set_alpha(0.3)
spine.set_linewidth(0.5)
ax.set_facecolor("#1a1a2e")
fig.patch.set_facecolor("#1a1a2e")
return fig, ax
def get_text_for_tokens(text: str, num_tokens: int) -> str:
"""Get a slice of text that contains exactly num_tokens tokens"""
tokens = enc.encode(text)
if num_tokens > len(tokens):
return text
return enc.decode(tokens[:num_tokens])
def get_audio_length(audio_data: bytes) -> float:
"""Get audio length in seconds from bytes data"""
temp_path = "examples/benchmarks/output/temp.wav"
os.makedirs(os.path.dirname(temp_path), exist_ok=True)
with open(temp_path, "wb") as f:
f.write(audio_data)
try:
rate, data = wavfile.read(temp_path)
return len(data) / rate
finally:
if os.path.exists(temp_path):
os.remove(temp_path)
def get_gpu_memory():
"""Get GPU memory usage using nvidia-smi"""
try:
result = subprocess.check_output(
["nvidia-smi", "--query-gpu=memory.used", "--format=csv,nounits,noheader"]
)
return float(result.decode("utf-8").strip())
except (subprocess.CalledProcessError, FileNotFoundError):
return None
def get_system_metrics():
"""Get current system metrics"""
# Get per-CPU percentages and calculate average
cpu_percentages = psutil.cpu_percent(percpu=True)
avg_cpu = sum(cpu_percentages) / len(cpu_percentages)
metrics = {
"timestamp": datetime.now().isoformat(),
"cpu_percent": round(avg_cpu, 2),
"ram_percent": psutil.virtual_memory().percent,
"ram_used_gb": psutil.virtual_memory().used / (1024**3),
}
gpu_mem = get_gpu_memory()
if gpu_mem is not None:
metrics["gpu_memory_used"] = gpu_mem
return metrics
def real_time_factor(processing_time: float, audio_length: float, decimals: int = 2) -> float:
"""Calculate Real-Time Factor (RTF) as processing-time / length-of-audio"""
rtf = processing_time / audio_length
return round(rtf, decimals)
def make_tts_request(text: str, timeout: int = 1800) -> tuple[float, float]:
"""Make TTS request using OpenAI-compatible endpoint and return processing time and output length"""
try:
start_time = time.time()
response = requests.post(
"http://localhost:8880/v1/audio/speech",
json={
"model": "kokoro",
"input": text,
"voice": "af",
"response_format": "wav",
},
timeout=timeout,
)
response.raise_for_status()
processing_time = round(time.time() - start_time, 2)
audio_length = round(get_audio_length(response.content), 2)
# Save the audio file
token_count = len(enc.encode(text))
output_file = f"examples/benchmarks/output/chunk_{token_count}_tokens.wav"
os.makedirs(os.path.dirname(output_file), exist_ok=True)
with open(output_file, "wb") as f:
f.write(response.content)
print(f"Saved audio to {output_file}")
return processing_time, audio_length
except requests.exceptions.RequestException as e:
print(f"Error making request for text: {text[:50]}... Error: {str(e)}")
return None, None
except Exception as e:
print(f"Error processing text: {text[:50]}... Error: {str(e)}")
return None, None
def plot_system_metrics(metrics_data):
"""Create plots for system metrics over time"""
df = pd.DataFrame(metrics_data)
df["timestamp"] = pd.to_datetime(df["timestamp"])
elapsed_time = (df["timestamp"] - df["timestamp"].iloc[0]).dt.total_seconds()
baseline_cpu = df["cpu_percent"].iloc[0]
baseline_ram = df["ram_used_gb"].iloc[0]
baseline_gpu = df["gpu_memory_used"].iloc[0] / 1024 if "gpu_memory_used" in df.columns else None
if "gpu_memory_used" in df.columns:
df["gpu_memory_gb"] = df["gpu_memory_used"] / 1024
plt.style.use("dark_background")
has_gpu = "gpu_memory_used" in df.columns
num_plots = 3 if has_gpu else 2
fig, axes = plt.subplots(num_plots, 1, figsize=(15, 5 * num_plots))
fig.patch.set_facecolor("#1a1a2e")
window = min(5, len(df) // 2)
# Plot CPU Usage
smoothed_cpu = df["cpu_percent"].rolling(window=window, center=True).mean()
sns.lineplot(x=elapsed_time, y=smoothed_cpu, ax=axes[0], color="#ff2a6d", linewidth=2)
axes[0].axhline(y=baseline_cpu, color="#05d9e8", linestyle="--", alpha=0.5, label="Baseline")
axes[0].set_xlabel("Time (seconds)")
axes[0].set_ylabel("CPU Usage (%)")
axes[0].set_title("CPU Usage Over Time")
axes[0].set_ylim(0, max(df["cpu_percent"]) * 1.1)
axes[0].legend()
# Plot RAM Usage
smoothed_ram = df["ram_used_gb"].rolling(window=window, center=True).mean()
sns.lineplot(x=elapsed_time, y=smoothed_ram, ax=axes[1], color="#05d9e8", linewidth=2)
axes[1].axhline(y=baseline_ram, color="#ff2a6d", linestyle="--", alpha=0.5, label="Baseline")
axes[1].set_xlabel("Time (seconds)")
axes[1].set_ylabel("RAM Usage (GB)")
axes[1].set_title("RAM Usage Over Time")
axes[1].set_ylim(0, max(df["ram_used_gb"]) * 1.1)
axes[1].legend()
# Plot GPU Memory if available
if has_gpu:
smoothed_gpu = df["gpu_memory_gb"].rolling(window=window, center=True).mean()
sns.lineplot(x=elapsed_time, y=smoothed_gpu, ax=axes[2], color="#ff2a6d", linewidth=2)
axes[2].axhline(y=baseline_gpu, color="#05d9e8", linestyle="--", alpha=0.5, label="Baseline")
axes[2].set_xlabel("Time (seconds)")
axes[2].set_ylabel("GPU Memory (GB)")
axes[2].set_title("GPU Memory Usage Over Time")
axes[2].set_ylim(0, max(df["gpu_memory_gb"]) * 1.1)
axes[2].legend()
for ax in axes:
ax.grid(True, linestyle="--", alpha=0.3)
ax.set_facecolor("#1a1a2e")
for spine in ax.spines.values():
spine.set_color("#ffffff")
spine.set_alpha(0.3)
plt.tight_layout()
plt.savefig("examples/benchmarks/system_usage_rtf.png", dpi=300, bbox_inches="tight")
plt.close()
def main():
os.makedirs("examples/benchmarks/output", exist_ok=True)
with open("examples/benchmarks/the_time_machine_hg_wells.txt", "r", encoding="utf-8") as f:
text = f.read()
total_tokens = len(enc.encode(text))
print(f"Total tokens in file: {total_tokens}")
# Generate token sizes with dense sampling at start
dense_range = list(range(100, 1001, 100))
token_sizes = sorted(list(set(dense_range)))
print(f"Testing sizes: {token_sizes}")
results = []
system_metrics = []
test_start_time = time.time()
for num_tokens in token_sizes:
chunk = get_text_for_tokens(text, num_tokens)
actual_tokens = len(enc.encode(chunk))
print(f"\nProcessing chunk with {actual_tokens} tokens:")
print(f"Text preview: {chunk[:100]}...")
system_metrics.append(get_system_metrics())
processing_time, audio_length = make_tts_request(chunk)
if processing_time is None or audio_length is None:
print("Breaking loop due to error")
break
system_metrics.append(get_system_metrics())
# Calculate RTF using the correct formula
rtf = real_time_factor(processing_time, audio_length)
results.append({
"tokens": actual_tokens,
"processing_time": processing_time,
"output_length": audio_length,
"rtf": rtf,
"elapsed_time": round(time.time() - test_start_time, 2),
})
with open("examples/benchmarks/benchmark_results_rtf.json", "w") as f:
json.dump({"results": results, "system_metrics": system_metrics}, f, indent=2)
df = pd.DataFrame(results)
if df.empty:
print("No data to plot")
return
df["tokens_per_second"] = df["tokens"] / df["processing_time"]
with open("examples/benchmarks/benchmark_stats_rtf.txt", "w") as f:
f.write("=== Benchmark Statistics (with correct RTF) ===\n\n")
f.write("Overall Stats:\n")
f.write(f"Total tokens processed: {df['tokens'].sum()}\n")
f.write(f"Total audio generated: {df['output_length'].sum():.2f}s\n")
f.write(f"Total test duration: {df['elapsed_time'].max():.2f}s\n")
f.write(f"Average processing rate: {df['tokens_per_second'].mean():.2f} tokens/second\n")
f.write(f"Average RTF: {df['rtf'].mean():.2f}x\n\n")
f.write("Per-chunk Stats:\n")
f.write(f"Average chunk size: {df['tokens'].mean():.2f} tokens\n")
f.write(f"Min chunk size: {df['tokens'].min():.2f} tokens\n")
f.write(f"Max chunk size: {df['tokens'].max():.2f} tokens\n")
f.write(f"Average processing time: {df['processing_time'].mean():.2f}s\n")
f.write(f"Average output length: {df['output_length'].mean():.2f}s\n\n")
f.write("Performance Ranges:\n")
f.write(f"Processing rate range: {df['tokens_per_second'].min():.2f} - {df['tokens_per_second'].max():.2f} tokens/second\n")
f.write(f"RTF range: {df['rtf'].min():.2f}x - {df['rtf'].max():.2f}x\n")
plt.style.use("dark_background")
# Plot Processing Time vs Token Count
fig, ax = plt.subplots(figsize=(12, 8))
sns.scatterplot(data=df, x="tokens", y="processing_time", s=100, alpha=0.6, color="#ff2a6d")
sns.regplot(data=df, x="tokens", y="processing_time", scatter=False, color="#05d9e8", line_kws={"linewidth": 2})
corr = df["tokens"].corr(df["processing_time"])
plt.text(0.05, 0.95, f"Correlation: {corr:.2f}", transform=ax.transAxes, fontsize=10, color="#ffffff",
bbox=dict(facecolor="#1a1a2e", edgecolor="#ffffff", alpha=0.7))
setup_plot(fig, ax, "Processing Time vs Input Size")
ax.set_xlabel("Number of Input Tokens")
ax.set_ylabel("Processing Time (seconds)")
plt.savefig("examples/benchmarks/processing_time_rtf.png", dpi=300, bbox_inches="tight")
plt.close()
# Plot RTF vs Token Count
fig, ax = plt.subplots(figsize=(12, 8))
sns.scatterplot(data=df, x="tokens", y="rtf", s=100, alpha=0.6, color="#ff2a6d")
sns.regplot(data=df, x="tokens", y="rtf", scatter=False, color="#05d9e8", line_kws={"linewidth": 2})
corr = df["tokens"].corr(df["rtf"])
plt.text(0.05, 0.95, f"Correlation: {corr:.2f}", transform=ax.transAxes, fontsize=10, color="#ffffff",
bbox=dict(facecolor="#1a1a2e", edgecolor="#ffffff", alpha=0.7))
setup_plot(fig, ax, "Real-Time Factor vs Input Size")
ax.set_xlabel("Number of Input Tokens")
ax.set_ylabel("Real-Time Factor (processing time / audio length)")
plt.savefig("examples/benchmarks/realtime_factor_rtf.png", dpi=300, bbox_inches="tight")
plt.close()
plot_system_metrics(system_metrics)
print("\nResults saved to:")
print("- examples/benchmarks/benchmark_results_rtf.json")
print("- examples/benchmarks/benchmark_stats_rtf.txt")
print("- examples/benchmarks/processing_time_rtf.png")
print("- examples/benchmarks/realtime_factor_rtf.png")
print("- examples/benchmarks/system_usage_rtf.png")
print("\nAudio files saved in examples/benchmarks/output/")
if __name__ == "__main__":
main()
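
As a usage illustration of the `real_time_factor` helper defined in this script (a hedged sketch with made-up numbers, not output from a real run):

```python
# Hypothetical example: 12.5 s of processing produced 60 s of audio
rtf = real_time_factor(12.5, 60.0)
print(rtf)  # 0.21 -> values below 1.0 mean generation is faster than realtime
```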

Binary file not shown. (deleted image, 283 KiB)

Binary file not shown. (deleted image, 254 KiB)

Binary file not shown. (deleted image, 223 KiB)

Binary file not shown. (deleted image, 208 KiB)

Binary file not shown. (deleted image, 406 KiB)