First streaming attempt

BIN  .coverage

.gitignore
@@ -23,4 +23,5 @@ examples/assorted_checks/test_openai/output/*
 examples/assorted_checks/test_voices/output/*
 examples/assorted_checks/test_formats/output/*
+examples/assorted_checks/benchmarks/output_audio_stream/*
 ui/RepoScreenshot.png

@@ -23,8 +23,16 @@ async def lifespan(app: FastAPI):
     # Initialize the main model with warm-up
     voicepack_count = TTSModel.setup()
+    logger.info("""
+███████╗ █████╗ ███████╗████████╗██╗  ██╗ ██████╗ ██╗  ██╗ ██████╗
+██╔════╝██╔══██╗██╔════╝╚══██╔══╝██║ ██╔╝██╔═══██╗██║ ██╔╝██╔═══██╗
+█████╗  ███████║███████╗   ██║   █████╔╝ ██║   ██║█████╔╝ ██║   ██║
+██╔══╝  ██╔══██║╚════██║   ██║   ██╔═██╗ ██║   ██║██╔═██╗ ██║   ██║
+██║     ██║  ██║███████║   ██║   ██║  ██╗╚██████╔╝██║  ██╗╚██████╔╝
+╚═╝     ╚═╝  ╚═╝╚══════╝   ╚═╝   ╚═╝  ╚═╝ ╚═════╝ ╚═╝  ╚═╝ ╚═════╝ """)
     logger.info(f"Model loaded and warmed up on {TTSModel.get_device()}")
     logger.info(f"{voicepack_count} voice packs loaded successfully")
+    logger.info("#" * 80)
     yield

@@ -2,10 +2,12 @@ from typing import List

 from loguru import logger
 from fastapi import Depends, Response, APIRouter, HTTPException
+from fastapi.responses import StreamingResponse

 from ..services.tts_service import TTSService
 from ..services.audio import AudioService
 from ..structures.schemas import OpenAISpeechRequest
+from typing import AsyncGenerator

 router = APIRouter(
     tags=["OpenAI Compatible TTS"],

@@ -18,6 +20,16 @@ def get_tts_service() -> TTSService:
     return TTSService()  # Initialize TTSService with default settings

+async def stream_audio_chunks(tts_service: TTSService, request: OpenAISpeechRequest) -> AsyncGenerator[bytes, None]:
+    """Stream audio chunks as they're generated"""
+    async for chunk in tts_service.generate_audio_stream(
+        text=request.input,
+        voice=request.voice,
+        speed=request.speed,
+        output_format=request.response_format
+    ):
+        yield chunk
+
+
 @router.post("/audio/speech")
 async def create_speech(
     request: OpenAISpeechRequest, tts_service: TTSService = Depends(get_tts_service)

@@ -31,24 +43,52 @@ async def create_speech(
                 f"Voice '{request.voice}' not found. Available voices: {', '.join(sorted(available_voices))}"
             )

-        # Generate audio directly using TTSService's method
-        audio, _ = tts_service._generate_audio(
-            text=request.input,
-            voice=request.voice,
-            speed=request.speed,
-            stitch_long_output=True,
-        )
+        # Set content type based on format
+        content_type = {
+            "mp3": "audio/mpeg",
+            "opus": "audio/opus",
+            "aac": "audio/aac",
+            "flac": "audio/flac",
+            "wav": "audio/wav",
+            "pcm": "audio/pcm",
+        }.get(request.response_format, f"audio/{request.response_format}")

-        # Convert to requested format
-        content = AudioService.convert_audio(audio, 24000, request.response_format)
+        if request.stream:
+            # Stream audio chunks as they're generated
+            return StreamingResponse(
+                stream_audio_chunks(tts_service, request),
+                media_type=content_type,
+                headers={
+                    "Content-Disposition": f"attachment; filename=speech.{request.response_format}",
+                    "X-Accel-Buffering": "no",  # Disable proxy buffering
+                    "Cache-Control": "no-cache",  # Prevent caching
+                },
+            )
+        else:
+            # Generate complete audio
+            audio, _ = tts_service._generate_audio(
+                text=request.input,
+                voice=request.voice,
+                speed=request.speed,
+                stitch_long_output=True,
+            )

-        return Response(
-            content=content,
-            media_type=f"audio/{request.response_format}",
-            headers={
-                "Content-Disposition": f"attachment; filename=speech.{request.response_format}"
-            },
-        )
+            # Convert to requested format
+            content = AudioService.convert_audio(
+                audio,
+                24000,
+                request.response_format,
+                is_first_chunk=True
+            )
+
+            return Response(
+                content=content,
+                media_type=content_type,
+                headers={
+                    "Content-Disposition": f"attachment; filename=speech.{request.response_format}",
+                    "Cache-Control": "no-cache",  # Prevent caching
+                },
+            )

     except ValueError as e:
         logger.error(f"Invalid request: {str(e)}")

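For reviewers trying this out: a client opts into streaming with the new `stream` flag and reads the response body incrementally. A minimal sketch, assuming the server is on localhost:8880 as in the benchmark scripts added below (input text and output filename are illustrative):

```python
import requests

response = requests.post(
    "http://localhost:8880/v1/audio/speech",
    json={
        "model": "kokoro",
        "input": "Hello world. This is a streaming test.",
        "voice": "af",
        "response_format": "pcm",  # raw samples: easy to play as they arrive
        "stream": True,
    },
    stream=True,  # let requests yield bytes as the server sends them
)
response.raise_for_status()

with open("speech.pcm", "wb") as f:
    for chunk in response.iter_content(chunk_size=1024):
        if chunk:
            f.write(chunk)  # first bytes arrive before synthesis finishes
```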
@@ -7,12 +7,35 @@ import soundfile as sf
 from loguru import logger


+class AudioNormalizer:
+    """Handles audio normalization state for a single stream"""
+
+    def __init__(self):
+        self.int16_max = np.iinfo(np.int16).max
+
+    def normalize(self, audio_data: np.ndarray) -> np.ndarray:
+        """Normalize audio data to int16 range"""
+        # Convert to float64 for accurate scaling
+        audio_float = audio_data.astype(np.float64)
+
+        # Scale to int16 range while preserving relative amplitudes
+        max_val = np.abs(audio_float).max()
+        if max_val > 0:
+            scaling = self.int16_max / max_val
+            audio_float *= scaling
+
+        # Clip to int16 range and convert
+        return np.clip(audio_float, -self.int16_max, self.int16_max).astype(np.int16)
+
+
 class AudioService:
     """Service for audio format conversions"""

     @staticmethod
     def convert_audio(
-        audio_data: np.ndarray, sample_rate: int, output_format: str
+        audio_data: np.ndarray,
+        sample_rate: int,
+        output_format: str,
+        is_first_chunk: bool = True,
+        normalizer: AudioNormalizer = None
     ) -> bytes:
         """Convert audio data to specified format

@@ -20,6 +43,7 @@ class AudioService:
             audio_data: Numpy array of audio samples
             sample_rate: Sample rate of the audio
             output_format: Target format (wav, mp3, opus, flac, pcm)
+            is_first_chunk: Whether this is the first chunk of a stream

         Returns:
             Bytes of the converted audio

@@ -27,30 +51,34 @@ class AudioService:
         buffer = BytesIO()

         try:
-            if output_format == "wav":
-                logger.info("Writing to WAV format...")
-                # Ensure audio_data is in int16 format for WAV
-                audio_data_wav = (
-                    audio_data / np.abs(audio_data).max() * np.iinfo(np.int16).max
-                ).astype(np.int16)  # Normalize
-                sf.write(buffer, audio_data_wav, sample_rate, format="WAV")
-            elif output_format == "mp3":
-                logger.info("Converting to MP3 format...")
-                # soundfile can write MP3 if ffmpeg or libsox is installed
-                sf.write(buffer, audio_data, sample_rate, format="MP3")
-            elif output_format == "opus":
-                logger.info("Converting to Opus format...")
-                sf.write(buffer, audio_data, sample_rate, format="OGG", subtype="OPUS")
-            elif output_format == "flac":
-                logger.info("Converting to FLAC format...")
-                sf.write(buffer, audio_data, sample_rate, format="FLAC")
-            elif output_format == "pcm":
-                logger.info("Extracting PCM data...")
-                # Ensure audio_data is in int16 format for PCM
-                audio_data_pcm = (
-                    audio_data / np.abs(audio_data).max() * np.iinfo(np.int16).max
-                ).astype(np.int16)  # Normalize
-                buffer.write(audio_data_pcm.tobytes())
+            # Normalize audio if normalizer provided, otherwise just convert to int16
+            if normalizer is not None:
+                normalized_audio = normalizer.normalize(audio_data)
+            else:
+                normalized_audio = audio_data.astype(np.int16)
+
+            if output_format == "pcm":
+                logger.info("Writing PCM data...")
+                # Raw 16-bit PCM samples, no header
+                buffer.write(normalized_audio.tobytes())
+            elif output_format == "wav":
+                logger.info("Writing to WAV format...")
+                # Always include WAV header for WAV format
+                sf.write(buffer, normalized_audio, sample_rate, format="WAV", subtype='PCM_16')
+            elif output_format in ["mp3", "aac"]:
+                logger.info(f"Converting to {output_format.upper()} format...")
+                # Use lower bitrate for streaming
+                sf.write(buffer, normalized_audio, sample_rate, format=output_format.upper(),
+                         subtype='COMPRESSED')
+            elif output_format == "opus":
+                logger.info("Converting to Opus format...")
+                # Use lower bitrate and smaller frame size for streaming
+                sf.write(buffer, normalized_audio, sample_rate, format="OGG", subtype="OPUS")
+            elif output_format == "flac":
+                logger.info("Converting to FLAC format...")
+                # Use smaller block size for streaming
+                sf.write(buffer, normalized_audio, sample_rate, format="FLAC",
+                         subtype='PCM_16')
             else:
                 raise ValueError(
                     f"Format {output_format} not supported. Supported formats are: wav, mp3, opus, flac, pcm."

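One behavior worth noting in `AudioNormalizer.normalize`: despite the docstring's mention of per-stream state, no gain is carried across calls, so every chunk is peak-normalized independently and a quiet chunk comes out as loud as a loud one. A small sketch of that behavior, assuming the class above:

```python
import numpy as np

norm = AudioNormalizer()

loud = np.array([0.0, 0.5, -1.0])    # peak amplitude 1.0
quiet = np.array([0.0, 0.05, -0.1])  # peak amplitude 0.1

print(norm.normalize(loud))   # [0, 16383, -32767] - peak scaled to int16 max
print(norm.normalize(quiet))  # [0, 16383, -32767] - same output: each call rescales to full range
```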
@@ -1,4 +1,5 @@
 import re
+from functools import lru_cache

 def split_num(num: re.Match) -> str:
     """Handle number splitting for various formats"""

@@ -48,6 +49,7 @@ def handle_decimal(num: re.Match) -> str:
     a, b = num.group().split(".")
     return " point ".join([a, " ".join(b)])


+@lru_cache(maxsize=1000)  # Cache normalized text results
 def normalize_text(text: str) -> str:
     """Normalize text for TTS processing

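`lru_cache` memoizes on the argument value, which is safe here because `normalize_text` takes a plain string and returns a deterministic result. A quick way to observe the cache (the function body below is a stand-in, not the real normalizer):

```python
from functools import lru_cache

@lru_cache(maxsize=1000)
def normalize_text(text: str) -> str:  # stand-in body for illustration
    return text.strip().lower()

normalize_text("Hello World. ")
normalize_text("Hello World. ")     # identical argument: served from cache
print(normalize_text.cache_info())  # CacheInfo(hits=1, misses=1, maxsize=1000, currsize=1)
```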
@@ -3,6 +3,7 @@ import os
 import re
 import time
 from typing import List, Tuple, Optional
+from functools import lru_cache

 import numpy as np
 import torch

@@ -12,6 +13,7 @@ from loguru import logger

 from ..core.config import settings
 from .tts_model import TTSModel
+from .audio import AudioService, AudioNormalizer


 class TTSService:

@@ -24,6 +26,12 @@ class TTSService:
         text = str(text) if text is not None else ""
         return [s.strip() for s in re.split(r"(?<=[.!?])\s+", text) if s.strip()]

+    @staticmethod
+    @lru_cache(maxsize=20)  # Cache up to 20 most recently used voices
+    def _load_voice(voice_path: str) -> torch.Tensor:
+        """Load and cache a voice model"""
+        return torch.load(voice_path, map_location=TTSModel.get_device(), weights_only=True)
+
     def _get_voice_path(self, voice_name: str) -> Optional[str]:
         """Get the path to a voice file"""
         voice_path = os.path.join(TTSModel.VOICES_DIR, f"{voice_name}.pt")

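The decorator order matters here: `@staticmethod` sits on top so that `lru_cache` wraps the plain function and the cache is shared class-wide, keyed on the path string. A toy model of the behavior (stand-in types, not the real loader):

```python
from functools import lru_cache

class VoiceLoader:
    @staticmethod
    @lru_cache(maxsize=20)
    def load(path: str) -> str:  # stand-in: returns a string instead of a torch.Tensor
        print(f"cache miss, loading {path}")
        return f"<voicepack:{path}>"

VoiceLoader.load("voices/af.pt")           # prints: cache miss, loading voices/af.pt
VoiceLoader.load("voices/af.pt")           # silent; result served from the cache
print(VoiceLoader.load.cache_info().hits)  # 1
```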
@@ -31,6 +39,13 @@ class TTSService:

     def _generate_audio(
         self, text: str, voice: str, speed: float, stitch_long_output: bool = True
+    ) -> Tuple[torch.Tensor, float]:
+        """Generate complete audio and return with processing time"""
+        audio, processing_time = self._generate_audio_internal(text, voice, speed, stitch_long_output)
+        return audio, processing_time
+
+    def _generate_audio_internal(
+        self, text: str, voice: str, speed: float, stitch_long_output: bool = True
     ) -> Tuple[torch.Tensor, float]:
         """Generate audio and measure processing time"""
         start_time = time.time()

@@ -49,10 +64,8 @@ class TTSService:
         if not voice_path:
             raise ValueError(f"Voice not found: {voice}")

-        # Load voice
-        voicepack = torch.load(
-            voice_path, map_location=TTSModel.get_device(), weights_only=True
-        )
+        # Load voice using cached loader
+        voicepack = self._load_voice(voice_path)

         # Generate audio with or without stitching
         if stitch_long_output:

@@ -97,6 +110,78 @@ class TTSService:
             logger.error(f"Error in audio generation: {str(e)}")
             raise

+    async def generate_audio_stream(
+        self, text: str, voice: str, speed: float, output_format: str = "wav"
+    ):
+        """Generate and yield audio chunks as they're generated for real-time streaming"""
+        try:
+            # Create normalizer for consistent audio levels
+            stream_normalizer = AudioNormalizer()
+
+            # Input validation and preprocessing
+            if not text:
+                raise ValueError("Text is empty")
+            normalized = normalize_text(text)
+            if not normalized:
+                raise ValueError("Text is empty after preprocessing")
+            text = str(normalized)
+
+            # Voice validation and loading
+            voice_path = self._get_voice_path(voice)
+            if not voice_path:
+                raise ValueError(f"Voice not found: {voice}")
+            voicepack = self._load_voice(voice_path)
+
+            # Split text into smaller chunks for faster streaming
+            # Use shorter chunks for real-time delivery
+            chunks = []
+            sentences = self._split_text(text)
+            current_chunk = []
+            current_length = 0
+            target_length = 100  # Target ~100 characters per chunk for faster processing
+
+            for sentence in sentences:
+                current_chunk.append(sentence)
+                current_length += len(sentence)
+                if current_length >= target_length:
+                    chunks.append(" ".join(current_chunk))
+                    current_chunk = []
+                    current_length = 0
+
+            if current_chunk:
+                chunks.append(" ".join(current_chunk))
+
+            # Process and stream chunks
+            for i, chunk in enumerate(chunks):
+                try:
+                    # Process text and generate audio
+                    phonemes, tokens = TTSModel.process_text(chunk, voice[0])
+                    chunk_audio = TTSModel.generate_from_tokens(tokens, voicepack, speed)
+
+                    if chunk_audio is not None:
+                        # Convert chunk with proper header handling
+                        chunk_bytes = AudioService.convert_audio(
+                            chunk_audio,
+                            24000,
+                            output_format,
+                            is_first_chunk=(i == 0),
+                            normalizer=stream_normalizer
+                        )
+                        yield chunk_bytes
+                    else:
+                        logger.error(f"No audio generated for chunk {i + 1}/{len(chunks)}")
+
+                except Exception as e:
+                    logger.error(
+                        f"Failed to generate audio for chunk {i + 1}/{len(chunks)}: '{chunk}'. Error: {str(e)}"
+                    )
+                    continue
+
+        except Exception as e:
+            logger.error(f"Error in audio generation stream: {str(e)}")
+            raise
+
     def _save_audio(self, audio: torch.Tensor, filepath: str):
         """Save audio to file"""
         os.makedirs(os.path.dirname(filepath), exist_ok=True)

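The generator can also be exercised directly, without going through FastAPI. A hedged sketch, assuming the model and voice packs are already initialized (voice name and output path are illustrative):

```python
import asyncio

async def save_stream_to_file():
    service = TTSService()
    with open("out.pcm", "wb") as f:
        async for chunk in service.generate_audio_stream(
            text="First sentence. Second sentence. Third one is a little longer.",
            voice="af",
            speed=1.0,
            output_format="pcm",
        ):
            f.write(chunk)  # each chunk covers roughly 100 characters of input text

asyncio.run(save_stream_to_file())
```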
@@ -22,7 +22,7 @@ class OpenAISpeechRequest(BaseModel):
     )
     response_format: Literal["mp3", "opus", "aac", "flac", "wav", "pcm"] = Field(
         default="mp3",
-        description="The format to return audio in. Supported formats: mp3, opus, flac, wav. AAC and PCM are not currently supported.",
+        description="The format to return audio in. Supported formats: mp3, opus, flac, wav, pcm. PCM format returns raw 16-bit samples without headers. AAC is not currently supported.",
     )
     speed: float = Field(
         default=1.0,

@@ -30,3 +30,7 @@ class OpenAISpeechRequest(BaseModel):
         le=4.0,
         description="The speed of the generated audio. Select a value from 0.25 to 4.0.",
     )
+    stream: bool = Field(
+        default=False,
+        description="If true, audio will be streamed as it's generated. Each chunk will be a complete sentence.",
+    )

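With the new field, a request body that opts into streaming looks like this (values besides `stream` mirror the defaults above and the benchmark scripts below):

```json
{
  "model": "kokoro",
  "input": "Text to synthesize.",
  "voice": "af",
  "response_format": "mp3",
  "speed": 1.0,
  "stream": true
}
```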
@@ -46,14 +46,14 @@ services:
       model-fetcher:
         condition: service_healthy

-  # Gradio UI service [Comment out everything below if you don't need it]
-  gradio-ui:
-    build:
-      context: ./ui
-    ports:
-      - "7860:7860"
-    volumes:
-      - ./ui/data:/app/ui/data
-      - ./ui/app.py:/app/app.py  # Mount app.py for hot reload
-    environment:
-      - GRADIO_WATCH=True  # Enable hot reloading
+  # # Gradio UI service [Comment out everything below if you don't need it]
+  # gradio-ui:
+  #   build:
+  #     context: ./ui
+  #   ports:
+  #     - "7860:7860"
+  #   volumes:
+  #     - ./ui/data:/app/ui/data
+  #     - ./ui/app.py:/app/app.py  # Mount app.py for hot reload
+  #   environment:
+  #     - GRADIO_WATCH=True  # Enable hot reloading

examples/assorted_checks/benchmarks/benchmark_first_token.py (new file, 157 lines)
@@ -0,0 +1,157 @@
#!/usr/bin/env python3
import os
import time
import json
import numpy as np
import requests
import pandas as pd

from lib.shared_benchmark_utils import get_text_for_tokens, enc
from lib.shared_utils import save_json_results
from lib.shared_plotting import plot_correlation, plot_timeline


def measure_first_token(text: str, output_dir: str, tokens: int, run_number: int) -> dict:
    """Measure time to audio via API calls and save the audio output"""
    results = {
        "text_length": len(text),
        "token_count": len(enc.encode(text)),
        "total_time": None,
        "time_to_first_chunk": None,
        "error": None,
        "audio_path": None,
        "audio_length": None,  # Length of output audio in seconds
    }

    try:
        start_time = time.time()

        # Make request without streaming
        response = requests.post(
            "http://localhost:8880/v1/audio/speech",
            json={
                "model": "kokoro",
                "input": text,
                "voice": "af",
                "response_format": "wav",
                "stream": False,
            },
            timeout=1800,
        )
        response.raise_for_status()

        # Save complete audio
        audio_filename = f"benchmark_tokens{tokens}_run{run_number}.wav"
        audio_path = os.path.join(output_dir, audio_filename)
        results["audio_path"] = audio_path

        content = response.content
        with open(audio_path, "wb") as f:
            f.write(content)

        # Calculate audio length using scipy
        import scipy.io.wavfile as wavfile
        sample_rate, audio_data = wavfile.read(audio_path)
        results["audio_length"] = len(audio_data) / sample_rate  # Length in seconds
        # Without streaming there is no partial delivery, so time-to-first-chunk
        # equals total request time
        results["time_to_first_chunk"] = time.time() - start_time

        results["total_time"] = time.time() - start_time
        return results

    except Exception as e:
        results["error"] = str(e)
        return results


def main():
    # Set up paths
    script_dir = os.path.dirname(os.path.abspath(__file__))
    output_dir = os.path.join(script_dir, "output_audio")
    output_data_dir = os.path.join(script_dir, "output_data")

    # Create output directories
    os.makedirs(output_dir, exist_ok=True)
    os.makedirs(output_data_dir, exist_ok=True)

    # Load sample text
    with open(os.path.join(script_dir, "the_time_machine_hg_wells.txt"), "r", encoding="utf-8") as f:
        text = f.read()

    # Test specific token counts
    token_sizes = [10, 25, 50, 100, 200, 500]
    all_results = []

    for tokens in token_sizes:
        print(f"\nTesting {tokens} tokens")
        test_text = get_text_for_tokens(text, tokens)
        actual_tokens = len(enc.encode(test_text))
        print(f"Text preview: {test_text[:50]}...")

        # Run test 5 times for each size to get average
        for i in range(5):
            print(f"Run {i+1}/5...")
            result = measure_first_token(test_text, output_dir, tokens, i + 1)
            result["target_tokens"] = tokens
            result["actual_tokens"] = actual_tokens
            result["run_number"] = i + 1

            if result["error"]:
                print(f"Error: {result['error']}")
            else:
                print(f"Time to Audio: {result['time_to_first_chunk']:.3f}s")
                print(f"Total time: {result['total_time']:.3f}s")

            all_results.append(result)

    # Calculate averages per token size
    summary = {}
    for tokens in token_sizes:
        matching_results = [r for r in all_results if r["target_tokens"] == tokens and not r["error"]]
        if matching_results:
            avg_first_chunk = sum(r["time_to_first_chunk"] for r in matching_results) / len(matching_results)
            avg_total = sum(r["total_time"] for r in matching_results) / len(matching_results)
            avg_audio_length = sum(r["audio_length"] for r in matching_results) / len(matching_results)
            summary[tokens] = {
                "avg_time_to_first_chunk": round(avg_first_chunk, 3),
                "avg_total_time": round(avg_total, 3),
                "avg_audio_length": round(avg_audio_length, 3),
                "num_successful_runs": len(matching_results),
            }

    # Save results
    results_data = {
        "individual_runs": all_results,
        "summary": summary,
        "timestamp": time.strftime("%Y-%m-%d %H:%M:%S"),
    }
    save_json_results(
        results_data,
        os.path.join(output_data_dir, "first_token_benchmark.json"),
    )

    # Create plot directory if it doesn't exist
    output_plots_dir = os.path.join(script_dir, "output_plots")
    os.makedirs(output_plots_dir, exist_ok=True)

    # Create DataFrame for plotting
    df = pd.DataFrame(all_results)

    # Create both plots
    plot_correlation(
        df, "target_tokens", "time_to_first_chunk",
        "Time to Audio vs Input Size",
        "Number of Input Tokens",
        "Time to Audio (seconds)",
        os.path.join(output_plots_dir, "first_token_latency.png"),
    )

    plot_timeline(
        df,
        os.path.join(output_plots_dir, "first_token_timeline.png"),
    )

    print("\nResults and plots saved to:")
    print(f"- {os.path.join(output_data_dir, 'first_token_benchmark.json')}")
    print(f"- {os.path.join(output_plots_dir, 'first_token_latency.png')}")
    print(f"- {os.path.join(output_plots_dir, 'first_token_timeline.png')}")


if __name__ == "__main__":
    main()

@@ -0,0 +1,207 @@
#!/usr/bin/env python3
import os
import time
import json
import numpy as np
import requests
import pandas as pd

from lib.shared_benchmark_utils import get_text_for_tokens, enc
from lib.shared_utils import save_json_results
from lib.shared_plotting import plot_correlation, plot_timeline


def measure_first_token(text: str, output_dir: str, tokens: int, run_number: int) -> dict:
    """Measure time to audio via API calls and save the audio output"""
    results = {
        "text_length": len(text),
        "token_count": len(enc.encode(text)),
        "total_time": None,
        "time_to_first_chunk": None,
        "error": None,
        "audio_path": None,
        "audio_length": None,  # Length of output audio in seconds
    }

    try:
        start_time = time.time()

        # Make request with streaming enabled
        response = requests.post(
            "http://localhost:8880/v1/audio/speech",
            json={
                "model": "kokoro",
                "input": text,
                "voice": "af",
                "response_format": "wav",
                "stream": True,
            },
            stream=True,
            timeout=1800,
        )
        response.raise_for_status()

        # Save complete audio
        audio_filename = f"benchmark_tokens{tokens}_run{run_number}_stream.wav"
        audio_path = os.path.join(output_dir, audio_filename)
        results["audio_path"] = audio_path

        first_chunk_time = None
        chunks = []
        for chunk in response.iter_content(chunk_size=1024):
            if chunk:
                if first_chunk_time is None:
                    first_chunk_time = time.time()
                    results["time_to_first_chunk"] = first_chunk_time - start_time
                chunks.append(chunk)

        # Extract WAV header and data separately
        # First chunk has header + data, subsequent chunks are raw PCM
        if not chunks:
            raise ValueError("No audio chunks received")

        first_chunk = chunks[0]
        remaining_chunks = chunks[1:]

        # Find end of WAV header (44 bytes for standard WAV)
        header = first_chunk[:44]
        first_data = first_chunk[44:]

        # Concatenate all PCM data
        all_data = first_data + b"".join(remaining_chunks)

        # Update WAV header with total data size
        import struct
        data_size = len(all_data)
        # Update RIFF chunk size field (bytes 4-7): data size + 36 remaining header bytes
        header = header[:4] + struct.pack("<I", data_size + 36) + header[8:]
        # Update data subchunk size field (bytes 40-43)
        header = header[:40] + struct.pack("<I", data_size) + header[44:]

        # Write complete WAV file
        complete_audio = header + all_data
        with open(audio_path, "wb") as f:
            f.write(complete_audio)

        # Calculate audio length using scipy
        import scipy.io.wavfile as wavfile
        sample_rate, audio_data = wavfile.read(audio_path)
        results["audio_length"] = len(audio_data) / sample_rate  # Length in seconds

        results["total_time"] = time.time() - start_time

        # Print debug info
        print(f"Complete audio size: {len(complete_audio)} bytes")
        print(f"Number of chunks received: {len(chunks)}")
        print(f"Audio length: {results['audio_length']:.3f}s")

        return results

    except Exception as e:
        results["error"] = str(e)
        return results


def main():
    # Set up paths with _stream suffix
    script_dir = os.path.dirname(os.path.abspath(__file__))
    output_dir = os.path.join(script_dir, "output_audio_stream")
    output_data_dir = os.path.join(script_dir, "output_data")

    # Create output directories
    os.makedirs(output_dir, exist_ok=True)
    os.makedirs(output_data_dir, exist_ok=True)

    # Load sample text
    with open(os.path.join(script_dir, "the_time_machine_hg_wells.txt"), "r", encoding="utf-8") as f:
        text = f.read()

    # Test specific token counts
    token_sizes = [50, 100, 200, 500]
    all_results = []

    for tokens in token_sizes:
        print(f"\nTesting {tokens} tokens (streaming)")
        test_text = get_text_for_tokens(text, tokens)
        actual_tokens = len(enc.encode(test_text))
        print(f"Text preview: {test_text[:50]}...")

        # Run test 3 times for each size to get average
        for i in range(3):
            print(f"Run {i+1}/3...")
            result = measure_first_token(test_text, output_dir, tokens, i + 1)
            result["target_tokens"] = tokens
            result["actual_tokens"] = actual_tokens
            result["run_number"] = i + 1

            if result["error"]:
                print(f"Error: {result['error']}")
            else:
                print(f"Time to First Audio: {result['time_to_first_chunk']:.3f}s")
                print(f"Time to Save Complete: {result['total_time']:.3f}s")
                print(f"Audio length: {result['audio_length']:.3f}s")
                print(f"Streaming overhead: {result['total_time'] - result['time_to_first_chunk']:.3f}s")

            all_results.append(result)

    # Calculate averages per token size
    summary = {}
    for tokens in token_sizes:
        matching_results = [r for r in all_results if r["target_tokens"] == tokens and not r["error"]]
        if matching_results:
            avg_first_chunk = sum(r["time_to_first_chunk"] for r in matching_results) / len(matching_results)
            avg_total = sum(r["total_time"] for r in matching_results) / len(matching_results)
            avg_audio_length = sum(r["audio_length"] for r in matching_results) / len(matching_results)
            summary[tokens] = {
                "avg_time_to_first_chunk": round(avg_first_chunk, 3),
                "avg_total_time": round(avg_total, 3),
                "avg_audio_length": round(avg_audio_length, 3),
                "num_successful_runs": len(matching_results),
            }

    # Save results with _stream suffix
    results_data = {
        "individual_runs": all_results,
        "summary": summary,
        "timestamp": time.strftime("%Y-%m-%d %H:%M:%S"),
    }
    save_json_results(
        results_data,
        os.path.join(output_data_dir, "first_token_benchmark_stream.json"),
    )

    # Create plot directory if it doesn't exist
    output_plots_dir = os.path.join(script_dir, "output_plots")
    os.makedirs(output_plots_dir, exist_ok=True)

    # Create DataFrame for plotting
    df = pd.DataFrame(all_results)

    # Create both plots with _stream suffix
    # Plot correlation for both metrics
    plot_correlation(
        df, "target_tokens", "time_to_first_chunk",
        "Time to First Audio vs Input Size (Streaming)",
        "Number of Input Tokens",
        "Time to First Audio (seconds)",
        os.path.join(output_plots_dir, "first_token_latency_stream.png"),
    )

    plot_correlation(
        df, "target_tokens", "total_time",
        "Total Time vs Input Size (Streaming)",
        "Number of Input Tokens",
        "Total Time (seconds)",
        os.path.join(output_plots_dir, "total_time_latency_stream.png"),
    )

    plot_timeline(
        df,
        os.path.join(output_plots_dir, "first_token_timeline_stream.png"),
    )

    print("\nResults and plots saved to:")
    print(f"- {os.path.join(output_data_dir, 'first_token_benchmark_stream.json')}")
    print(f"- {os.path.join(output_plots_dir, 'first_token_latency_stream.png')}")
    print(f"- {os.path.join(output_plots_dir, 'total_time_latency_stream.png')}")
    print(f"- {os.path.join(output_plots_dir, 'first_token_timeline_stream.png')}")


if __name__ == "__main__":
    main()

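The header patching in the script above leans on the canonical 44-byte PCM WAV layout: the RIFF chunk size lives at byte offset 4 (total file size minus 8, i.e. data size + 36) and the data subchunk size at offset 40. A self-contained check of those offsets using only the standard library:

```python
import io
import struct
import wave

# Build a tiny WAV with the wave module, then read the two size fields
# that the benchmark script patches.
pcm = b"\x00\x01" * 100  # 100 dummy 16-bit mono samples (200 bytes)
buf = io.BytesIO()
with wave.open(buf, "wb") as w:
    w.setnchannels(1)
    w.setsampwidth(2)
    w.setframerate(24000)
    w.writeframes(pcm)

raw = buf.getvalue()
riff_size = struct.unpack_from("<I", raw, 4)[0]   # bytes 4-7
data_size = struct.unpack_from("<I", raw, 40)[0]  # bytes 40-43
assert riff_size == data_size + 36
assert data_size == len(pcm)
print(riff_size, data_size)  # 236 200
```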
@@ -1,7 +1,9 @@
 """Shared plotting utilities for benchmarks and tests."""
 import pandas as pd
 import seaborn as sns
+import numpy as np
 import matplotlib.pyplot as plt
+import matplotlib.patches as patches

 # Common style configurations
 STYLE_CONFIG = {

@@ -136,6 +138,132 @@ def plot_system_metrics(metrics_data, output_path):
     plt.savefig(output_path, dpi=300, bbox_inches="tight")
     plt.close()

+
+def plot_timeline(df, output_path):
+    """Create timeline plot showing latency for each run.
+
+    Args:
+        df: pandas DataFrame containing run data with columns:
+            - target_tokens: number of tokens
+            - run_number: run iteration
+            - time_to_first_chunk: latency to first audio chunk
+        output_path: str, path to save the output plot
+    """
+    plt.style.use("dark_background")
+
+    # Sort by tokens and run number
+    df = df.sort_values(['target_tokens', 'run_number'])
+
+    # Create figure and axis
+    fig, ax = plt.subplots(figsize=(12, 6))
+
+    # Calculate y positions for each run with tighter grouping
+    unique_tokens = sorted(df['target_tokens'].unique())
+    y_positions = {}
+    current_y = 0
+    group_spacing = 0.8  # Space between groups
+    run_spacing = 0.2  # Space between runs in a group
+
+    for tokens in unique_tokens:
+        runs = df[df['target_tokens'] == tokens]
+        base_y = current_y
+        for i, (_, run) in enumerate(runs.iterrows()):
+            y_positions[(tokens, run['run_number'])] = base_y + (i * run_spacing)
+        current_y = base_y + (len(runs) * run_spacing) + group_spacing
+
+    # Plot bars and points with more transparency
+    bar_height = 0.15
+    for _, row in df.iterrows():
+        y = y_positions[(row['target_tokens'], row['run_number'])]
+        latency = row['time_to_first_chunk']
+
+        # Latency bar
+        ax.add_patch(patches.Rectangle(
+            (0, y - bar_height/2),
+            latency,
+            bar_height,
+            facecolor=STYLE_CONFIG["primary_color"],
+            alpha=0.3
+        ))
+
+        # End point
+        ax.plot(latency, y, 'o',
+                color=STYLE_CONFIG["secondary_color"],
+                markersize=4,
+                alpha=0.5)
+
+    # Add mean lines and values for each token group
+    for tokens in unique_tokens:
+        token_runs = df[df['target_tokens'] == tokens]
+        mean_latency = token_runs['time_to_first_chunk'].mean()
+        y_positions_for_token = [y_positions[(tokens, run['run_number'])] for _, run in token_runs.iterrows()]
+        min_y = min(y_positions_for_token)
+        max_y = max(y_positions_for_token)
+        group_center = (min_y + max_y) / 2
+
+        # Plot mean line with gradient alpha
+        gradient = np.linspace(0.2, 0.8, 100)
+        for i in range(len(gradient)-1):
+            y1 = min_y - bar_height + (max_y - min_y + 2*bar_height) * (i/len(gradient))
+            y2 = min_y - bar_height + (max_y - min_y + 2*bar_height) * ((i+1)/len(gradient))
+            ax.plot([mean_latency, mean_latency], [y1, y2],
+                    '-', color=STYLE_CONFIG["secondary_color"],
+                    linewidth=3, alpha=gradient[i])
+
+        # Add mean value label with background
+        label_text = f'Mean: {mean_latency:.3f}s'
+        bbox_props = dict(
+            facecolor=STYLE_CONFIG["background_color"],
+            edgecolor=STYLE_CONFIG["secondary_color"],
+            alpha=0.8,
+            pad=3,
+            linewidth=1
+        )
+        ax.text(mean_latency + 0.02, group_center,
+                label_text,
+                color=STYLE_CONFIG["secondary_color"],
+                va='center',
+                fontsize=10,
+                fontweight='bold',
+                bbox=bbox_props)
+
+    # Customize plot
+    ax.set_ylim(-1, current_y)
+    ax.set_xlim(0, df['time_to_first_chunk'].max() * 1.3)  # Extra space for labels
+
+    # Add labels for token groups with tighter spacing
+    group_positions = {}
+    for tokens in unique_tokens:
+        runs = df[df['target_tokens'] == tokens]
+        y_positions_for_token = [y_positions[(tokens, run['run_number'])] for _, run in runs.iterrows()]
+        group_positions[tokens] = sum(y_positions_for_token) / len(y_positions_for_token)
+        plt.axhline(y=min(y_positions_for_token) - bar_height,
+                    color='white', alpha=0.1, linestyle='-')
+
+    # Calculate mean audio length for each token group
+    audio_lengths = {}
+    for tokens in unique_tokens:
+        token_runs = df[df['target_tokens'] == tokens]
+        audio_lengths[tokens] = token_runs['audio_length'].mean()
+
+    # Set y-ticks at group centers with token counts and audio lengths
+    plt.yticks(
+        list(group_positions.values()),
+        [f'{tokens} tokens\n({audio_lengths[tokens]:.1f}s)' for tokens in group_positions.keys()],
+        fontsize=10
+    )
+
+    # Customize appearance
+    setup_plot(
+        fig, ax,
+        "Time-To-Audio Latency",
+        xlabel="Time (seconds)",
+        ylabel="Input Size"
+    )
+
+    plt.tight_layout()
+    plt.savefig(output_path, dpi=300, bbox_inches="tight")
+    plt.close()
+
+
 def plot_correlation(df, x, y, title, xlabel, ylabel, output_path):
     """Create correlation plot with regression line and correlation coefficient.

@ -0,0 +1,403 @@
|
||||||
|
{
|
||||||
|
"individual_runs": [
|
||||||
|
{
|
||||||
|
"text_length": 37,
|
||||||
|
"token_count": 10,
|
||||||
|
"total_time": 0.16574740409851074,
|
||||||
|
"time_to_first_chunk": 0.16574740409851074,
|
||||||
|
"error": null,
|
||||||
|
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio\\benchmark_tokens10_run1.wav",
|
||||||
|
"audio_length": 3.45,
|
||||||
|
"target_tokens": 10,
|
||||||
|
"actual_tokens": 10,
|
||||||
|
"run_number": 1
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"text_length": 37,
|
||||||
|
"token_count": 10,
|
||||||
|
"total_time": 0.18812799453735352,
|
||||||
|
"time_to_first_chunk": 0.18812799453735352,
|
||||||
|
"error": null,
|
||||||
|
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio\\benchmark_tokens10_run2.wav",
|
||||||
|
"audio_length": 3.45,
|
||||||
|
"target_tokens": 10,
|
||||||
|
"actual_tokens": 10,
|
||||||
|
"run_number": 2
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"text_length": 37,
|
||||||
|
"token_count": 10,
|
||||||
|
"total_time": 0.18645429611206055,
|
||||||
|
"time_to_first_chunk": 0.18645429611206055,
|
||||||
|
"error": null,
|
||||||
|
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio\\benchmark_tokens10_run3.wav",
|
||||||
|
"audio_length": 3.45,
|
||||||
|
"target_tokens": 10,
|
||||||
|
"actual_tokens": 10,
|
||||||
|
"run_number": 3
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"text_length": 37,
|
||||||
|
"token_count": 10,
|
||||||
|
"total_time": 0.17632031440734863,
|
||||||
|
"time_to_first_chunk": 0.17632031440734863,
|
||||||
|
"error": null,
|
||||||
|
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio\\benchmark_tokens10_run4.wav",
|
||||||
|
"audio_length": 3.45,
|
||||||
|
"target_tokens": 10,
|
||||||
|
"actual_tokens": 10,
|
||||||
|
"run_number": 4
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"text_length": 37,
|
||||||
|
"token_count": 10,
|
||||||
|
"total_time": 0.13381195068359375,
|
||||||
|
"time_to_first_chunk": 0.13381195068359375,
|
||||||
|
"error": null,
|
||||||
|
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio\\benchmark_tokens10_run5.wav",
|
||||||
|
"audio_length": 3.45,
|
||||||
|
"target_tokens": 10,
|
||||||
|
"actual_tokens": 10,
|
||||||
|
"run_number": 5
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"text_length": 102,
|
||||||
|
"token_count": 25,
|
||||||
|
"total_time": 0.2086498737335205,
|
||||||
|
"time_to_first_chunk": 0.2086498737335205,
|
||||||
|
"error": null,
|
||||||
|
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio\\benchmark_tokens25_run1.wav",
|
||||||
|
"audio_length": 7.225,
|
||||||
|
"target_tokens": 25,
|
||||||
|
"actual_tokens": 25,
|
||||||
|
"run_number": 1
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"text_length": 102,
|
||||||
|
"token_count": 25,
|
||||||
|
"total_time": 0.2727653980255127,
|
||||||
|
"time_to_first_chunk": 0.2727653980255127,
|
||||||
|
"error": null,
|
||||||
|
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio\\benchmark_tokens25_run2.wav",
|
||||||
|
"audio_length": 7.225,
|
||||||
|
"target_tokens": 25,
|
||||||
|
"actual_tokens": 25,
|
||||||
|
"run_number": 2
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"text_length": 102,
|
||||||
|
"token_count": 25,
|
||||||
|
"total_time": 0.2096250057220459,
|
||||||
|
"time_to_first_chunk": 0.2096250057220459,
|
||||||
|
"error": null,
|
||||||
|
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio\\benchmark_tokens25_run3.wav",
|
||||||
|
"audio_length": 7.225,
|
||||||
|
"target_tokens": 25,
|
||||||
|
"actual_tokens": 25,
|
||||||
|
"run_number": 3
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"text_length": 102,
|
||||||
|
"token_count": 25,
|
||||||
|
"total_time": 0.2256758213043213,
|
||||||
|
"time_to_first_chunk": 0.2256758213043213,
|
||||||
|
"error": null,
|
||||||
|
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio\\benchmark_tokens25_run4.wav",
|
||||||
|
"audio_length": 7.225,
|
||||||
|
"target_tokens": 25,
|
||||||
|
"actual_tokens": 25,
|
||||||
|
"run_number": 4
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"text_length": 102,
|
||||||
|
"token_count": 25,
|
||||||
|
"total_time": 0.1945042610168457,
|
||||||
|
"time_to_first_chunk": 0.1945042610168457,
|
||||||
|
"error": null,
|
||||||
|
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio\\benchmark_tokens25_run5.wav",
|
||||||
|
"audio_length": 7.225,
|
||||||
|
"target_tokens": 25,
|
||||||
|
"actual_tokens": 25,
|
||||||
|
"run_number": 5
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"text_length": 212,
|
||||||
|
"token_count": 50,
|
||||||
|
"total_time": 0.4975121021270752,
|
||||||
|
"time_to_first_chunk": 0.4975121021270752,
|
||||||
|
"error": null,
|
||||||
|
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio\\benchmark_tokens50_run1.wav",
|
||||||
|
"audio_length": 16.325,
|
||||||
|
"target_tokens": 50,
|
||||||
|
"actual_tokens": 50,
|
||||||
|
"run_number": 1
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"text_length": 212,
|
||||||
|
"token_count": 50,
|
||||||
|
"total_time": 0.4518404006958008,
|
||||||
|
"time_to_first_chunk": 0.4518404006958008,
|
||||||
|
"error": null,
|
||||||
|
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio\\benchmark_tokens50_run2.wav",
|
||||||
|
"audio_length": 16.325,
|
||||||
|
"target_tokens": 50,
|
||||||
|
"actual_tokens": 50,
|
||||||
|
"run_number": 2
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"text_length": 212,
|
||||||
|
"token_count": 50,
|
||||||
|
"total_time": 0.5640325546264648,
|
||||||
|
"time_to_first_chunk": 0.5640325546264648,
|
||||||
|
"error": null,
|
||||||
|
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio\\benchmark_tokens50_run3.wav",
|
||||||
|
"audio_length": 16.325,
|
||||||
|
"target_tokens": 50,
|
||||||
|
"actual_tokens": 50,
|
||||||
|
"run_number": 3
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"text_length": 212,
|
||||||
|
"token_count": 50,
|
||||||
|
"total_time": 0.5305957794189453,
|
||||||
|
"time_to_first_chunk": 0.5305957794189453,
|
||||||
|
"error": null,
|
||||||
|
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio\\benchmark_tokens50_run4.wav",
|
||||||
|
"audio_length": 16.325,
|
||||||
|
"target_tokens": 50,
|
||||||
|
"actual_tokens": 50,
|
||||||
|
"run_number": 4
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"text_length": 212,
|
||||||
|
"token_count": 50,
|
||||||
|
"total_time": 0.5540030002593994,
|
||||||
|
"time_to_first_chunk": 0.5540030002593994,
|
||||||
|
"error": null,
|
||||||
|
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio\\benchmark_tokens50_run5.wav",
|
||||||
|
"audio_length": 16.325,
|
||||||
|
"target_tokens": 50,
|
||||||
|
"actual_tokens": 50,
|
||||||
|
"run_number": 5
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"text_length": 448,
|
||||||
|
"token_count": 100,
|
||||||
|
"total_time": 0.7963137626647949,
|
||||||
|
"time_to_first_chunk": 0.7963137626647949,
|
||||||
|
"error": null,
|
||||||
|
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio\\benchmark_tokens100_run1.wav",
|
||||||
|
"audio_length": 31.1,
|
||||||
|
"target_tokens": 100,
|
||||||
|
"actual_tokens": 100,
|
||||||
|
"run_number": 1
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"text_length": 448,
|
||||||
|
"token_count": 100,
|
||||||
|
"total_time": 0.9320805072784424,
|
||||||
|
"time_to_first_chunk": 0.9320805072784424,
|
||||||
|
"error": null,
|
||||||
|
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio\\benchmark_tokens100_run2.wav",
|
||||||
|
"audio_length": 31.1,
|
||||||
|
"target_tokens": 100,
|
||||||
|
"actual_tokens": 100,
|
||||||
|
"run_number": 2
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"text_length": 448,
|
||||||
|
"token_count": 100,
|
||||||
|
"total_time": 0.824256181716919,
|
||||||
|
"time_to_first_chunk": 0.824256181716919,
|
||||||
|
"error": null,
|
||||||
|
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio\\benchmark_tokens100_run3.wav",
|
||||||
|
"audio_length": 31.1,
|
||||||
|
"target_tokens": 100,
|
||||||
|
"actual_tokens": 100,
|
||||||
|
"run_number": 3
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"text_length": 448,
|
||||||
|
"token_count": 100,
|
||||||
|
"total_time": 0.9034836292266846,
|
||||||
|
"time_to_first_chunk": 0.9034836292266846,
|
||||||
|
"error": null,
|
||||||
|
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio\\benchmark_tokens100_run4.wav",
|
||||||
|
"audio_length": 31.1,
|
||||||
|
"target_tokens": 100,
|
||||||
|
"actual_tokens": 100,
|
||||||
|
"run_number": 4
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"text_length": 448,
|
||||||
|
"token_count": 100,
|
||||||
|
"total_time": 0.8364357948303223,
|
||||||
|
"time_to_first_chunk": 0.8364357948303223,
|
||||||
|
"error": null,
|
||||||
|
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio\\benchmark_tokens100_run5.wav",
|
||||||
|
"audio_length": 31.1,
|
||||||
|
"target_tokens": 100,
|
||||||
|
"actual_tokens": 100,
|
||||||
|
"run_number": 5
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"text_length": 906,
|
||||||
|
"token_count": 200,
|
||||||
|
"total_time": 1.8122682571411133,
|
||||||
|
"time_to_first_chunk": 1.8122682571411133,
|
||||||
|
"error": null,
|
||||||
|
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio\\benchmark_tokens200_run1.wav",
|
||||||
|
"audio_length": 62.625,
|
||||||
|
"target_tokens": 200,
|
||||||
|
"actual_tokens": 200,
|
||||||
|
"run_number": 1
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"text_length": 906,
|
||||||
|
"token_count": 200,
|
||||||
|
"total_time": 1.7290427684783936,
|
||||||
|
"time_to_first_chunk": 1.7290427684783936,
|
||||||
|
"error": null,
|
||||||
|
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio\\benchmark_tokens200_run2.wav",
|
||||||
|
"audio_length": 62.625,
|
||||||
|
"target_tokens": 200,
|
||||||
|
"actual_tokens": 200,
|
||||||
|
"run_number": 2
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"text_length": 906,
|
||||||
|
"token_count": 200,
|
||||||
|
"total_time": 2.141728401184082,
|
||||||
|
"time_to_first_chunk": 2.141728401184082,
|
||||||
|
"error": null,
|
||||||
|
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio\\benchmark_tokens200_run3.wav",
|
||||||
|
"audio_length": 62.625,
|
||||||
|
"target_tokens": 200,
|
||||||
|
"actual_tokens": 200,
|
||||||
|
"run_number": 3
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"text_length": 906,
|
||||||
|
"token_count": 200,
|
||||||
|
"total_time": 2.0155680179595947,
|
||||||
|
"time_to_first_chunk": 2.0155680179595947,
|
||||||
|
"error": null,
|
||||||
|
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio\\benchmark_tokens200_run4.wav",
|
||||||
|
"audio_length": 62.625,
|
||||||
|
"target_tokens": 200,
|
||||||
|
"actual_tokens": 200,
|
||||||
|
"run_number": 4
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"text_length": 906,
|
||||||
|
"token_count": 200,
|
||||||
|
"total_time": 1.8707575798034668,
|
||||||
|
"time_to_first_chunk": 1.8707575798034668,
|
||||||
|
"error": null,
|
||||||
|
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio\\benchmark_tokens200_run5.wav",
|
||||||
|
"audio_length": 62.625,
|
||||||
|
"target_tokens": 200,
|
||||||
|
"actual_tokens": 200,
|
||||||
|
"run_number": 5
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"text_length": 2232,
|
||||||
|
"token_count": 500,
|
||||||
|
"total_time": 4.822713851928711,
|
||||||
|
"time_to_first_chunk": 4.822713851928711,
|
||||||
|
"error": null,
|
||||||
|
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio\\benchmark_tokens500_run1.wav",
|
||||||
|
"audio_length": 157.875,
|
||||||
|
"target_tokens": 500,
|
||||||
|
"actual_tokens": 500,
|
||||||
|
"run_number": 1
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"text_length": 2232,
|
||||||
|
"token_count": 500,
|
||||||
|
"total_time": 4.227782726287842,
|
||||||
|
"time_to_first_chunk": 4.227782726287842,
|
||||||
|
"error": null,
|
||||||
|
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio\\benchmark_tokens500_run2.wav",
|
||||||
|
"audio_length": 157.875,
|
||||||
|
"target_tokens": 500,
|
||||||
|
"actual_tokens": 500,
|
||||||
|
"run_number": 2
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"text_length": 2232,
|
||||||
|
"token_count": 500,
|
||||||
|
"total_time": 4.414916276931763,
|
||||||
|
"time_to_first_chunk": 4.414916276931763,
|
||||||
|
"error": null,
|
||||||
|
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio\\benchmark_tokens500_run3.wav",
|
||||||
|
"audio_length": 157.875,
|
||||||
|
"target_tokens": 500,
|
||||||
|
"actual_tokens": 500,
|
||||||
|
"run_number": 3
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"text_length": 2232,
|
||||||
|
"token_count": 500,
|
||||||
|
"total_time": 4.579505681991577,
|
||||||
|
"time_to_first_chunk": 4.579505681991577,
|
||||||
|
"error": null,
|
||||||
|
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio\\benchmark_tokens500_run4.wav",
|
||||||
|
"audio_length": 157.875,
|
||||||
|
"target_tokens": 500,
|
||||||
|
"actual_tokens": 500,
|
||||||
|
"run_number": 4
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"text_length": 2232,
|
||||||
|
"token_count": 500,
|
||||||
|
"total_time": 4.332529067993164,
|
||||||
|
"time_to_first_chunk": 4.332529067993164,
|
||||||
|
"error": null,
|
||||||
|
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio\\benchmark_tokens500_run5.wav",
|
||||||
|
"audio_length": 157.875,
|
||||||
|
"target_tokens": 500,
|
||||||
|
"actual_tokens": 500,
|
||||||
|
"run_number": 5
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"summary": {
|
||||||
|
"10": {
|
||||||
|
"avg_time_to_first_chunk": 0.17,
|
||||||
|
"avg_total_time": 0.17,
|
||||||
|
"avg_audio_length": 3.45,
|
||||||
|
"num_successful_runs": 5
|
||||||
|
},
|
||||||
|
"25": {
|
||||||
|
"avg_time_to_first_chunk": 0.222,
|
||||||
|
"avg_total_time": 0.222,
|
||||||
|
"avg_audio_length": 7.225,
|
||||||
|
"num_successful_runs": 5
|
||||||
|
},
|
||||||
|
"50": {
|
||||||
|
"avg_time_to_first_chunk": 0.52,
|
||||||
|
"avg_total_time": 0.52,
|
||||||
|
"avg_audio_length": 16.325,
|
||||||
|
"num_successful_runs": 5
|
||||||
|
},
|
||||||
|
"100": {
|
||||||
|
"avg_time_to_first_chunk": 0.859,
|
||||||
|
"avg_total_time": 0.859,
|
||||||
|
"avg_audio_length": 31.1,
|
||||||
|
"num_successful_runs": 5
|
||||||
|
},
|
||||||
|
"200": {
|
||||||
|
"avg_time_to_first_chunk": 1.914,
|
||||||
|
"avg_total_time": 1.914,
|
||||||
|
"avg_audio_length": 62.625,
|
||||||
|
"num_successful_runs": 5
|
||||||
|
},
|
||||||
|
"500": {
|
||||||
|
"avg_time_to_first_chunk": 4.475,
|
||||||
|
"avg_total_time": 4.475,
|
||||||
|
"avg_audio_length": 157.875,
|
||||||
|
"num_successful_runs": 5
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"timestamp": "2025-01-04 13:52:28"
|
||||||
|
}
|
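Note: in the non-streaming results above, "time_to_first_chunk" always equals "total_time", since the whole file is returned in a single response. Each "summary" entry is simply the per-token-count mean over the successful individual runs; a minimal sketch of how it can be recomputed (the benchmark_results.json file name is an assumption about where this JSON is saved):

import json
from collections import defaultdict
from statistics import mean

# Assumed file name; point this at wherever the benchmark JSON above lives.
with open("benchmark_results.json") as f:
    results = json.load(f)

# Group successful runs by their target token count.
by_tokens = defaultdict(list)
for run in results["individual_runs"]:
    if run["error"] is None:
        by_tokens[run["target_tokens"]].append(run)

# Recompute the per-token-count averages reported in "summary".
for tokens, runs in sorted(by_tokens.items()):
    print(f"{tokens} tokens: "
          f"avg_time_to_first_chunk={mean(r['time_to_first_chunk'] for r in runs):.3f}s, "
          f"avg_total_time={mean(r['total_time'] for r in runs):.3f}s, "
          f"avg_audio_length={mean(r['audio_length'] for r in runs):.3f}s "
          f"({len(runs)} successful runs)")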
@ -0,0 +1,175 @@
{
  "individual_runs": [
    {
      "text_length": 212,
      "token_count": 50,
      "total_time": 0.9603095054626465,
      "time_to_first_chunk": 0.5916037559509277,
      "error": null,
      "audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens50_run1_stream.wav",
      "audio_length": 15.45,
      "target_tokens": 50,
      "actual_tokens": 50,
      "run_number": 1
    },
    {
      "text_length": 212,
      "token_count": 50,
      "total_time": 0.5130870342254639,
      "time_to_first_chunk": 0.27448558807373047,
      "error": null,
      "audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens50_run2_stream.wav",
      "audio_length": 15.45,
      "target_tokens": 50,
      "actual_tokens": 50,
      "run_number": 2
    },
    {
      "text_length": 212,
      "token_count": 50,
      "total_time": 0.4667215347290039,
      "time_to_first_chunk": 0.22882533073425293,
      "error": null,
      "audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens50_run3_stream.wav",
      "audio_length": 15.45,
      "target_tokens": 50,
      "actual_tokens": 50,
      "run_number": 3
    },
    {
      "text_length": 448,
      "token_count": 100,
      "total_time": 0.9051008224487305,
      "time_to_first_chunk": 0.2526383399963379,
      "error": null,
      "audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens100_run1_stream.wav",
      "audio_length": 30.25,
      "target_tokens": 100,
      "actual_tokens": 100,
      "run_number": 1
    },
    {
      "text_length": 448,
      "token_count": 100,
      "total_time": 0.8579132556915283,
      "time_to_first_chunk": 0.25691914558410645,
      "error": null,
      "audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens100_run2_stream.wav",
      "audio_length": 30.25,
      "target_tokens": 100,
      "actual_tokens": 100,
      "run_number": 2
    },
    {
      "text_length": 448,
      "token_count": 100,
      "total_time": 0.9683890342712402,
      "time_to_first_chunk": 0.26229000091552734,
      "error": null,
      "audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens100_run3_stream.wav",
      "audio_length": 30.25,
      "target_tokens": 100,
      "actual_tokens": 100,
      "run_number": 3
    },
    {
      "text_length": 906,
      "token_count": 200,
      "total_time": 1.8075971603393555,
      "time_to_first_chunk": 0.22536945343017578,
      "error": null,
      "audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens200_run1_stream.wav",
      "audio_length": 60.75,
      "target_tokens": 200,
      "actual_tokens": 200,
      "run_number": 1
    },
    {
      "text_length": 906,
      "token_count": 200,
      "total_time": 1.493518590927124,
      "time_to_first_chunk": 0.21502947807312012,
      "error": null,
      "audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens200_run2_stream.wav",
      "audio_length": 60.75,
      "target_tokens": 200,
      "actual_tokens": 200,
      "run_number": 2
    },
    {
      "text_length": 906,
      "token_count": 200,
      "total_time": 1.4910809993743896,
      "time_to_first_chunk": 0.21600556373596191,
      "error": null,
      "audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens200_run3_stream.wav",
      "audio_length": 60.75,
      "target_tokens": 200,
      "actual_tokens": 200,
      "run_number": 3
    },
    {
      "text_length": 2232,
      "token_count": 500,
      "total_time": 4.223623275756836,
      "time_to_first_chunk": 0.20010590553283691,
      "error": null,
      "audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens500_run1_stream.wav",
      "audio_length": 147.775,
      "target_tokens": 500,
      "actual_tokens": 500,
      "run_number": 1
    },
    {
      "text_length": 2232,
      "token_count": 500,
      "total_time": 3.8811349868774414,
      "time_to_first_chunk": 0.24638962745666504,
      "error": null,
      "audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens500_run2_stream.wav",
      "audio_length": 147.775,
      "target_tokens": 500,
      "actual_tokens": 500,
      "run_number": 2
    },
    {
      "text_length": 2232,
      "token_count": 500,
      "total_time": 4.045536994934082,
      "time_to_first_chunk": 0.2252039909362793,
      "error": null,
      "audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens500_run3_stream.wav",
      "audio_length": 147.775,
      "target_tokens": 500,
      "actual_tokens": 500,
      "run_number": 3
    }
  ],
  "summary": {
    "50": {
      "avg_time_to_first_chunk": 0.365,
      "avg_total_time": 0.647,
      "avg_audio_length": 15.45,
      "num_successful_runs": 3
    },
    "100": {
      "avg_time_to_first_chunk": 0.257,
      "avg_total_time": 0.91,
      "avg_audio_length": 30.25,
      "num_successful_runs": 3
    },
    "200": {
      "avg_time_to_first_chunk": 0.219,
      "avg_total_time": 1.597,
      "avg_audio_length": 60.75,
      "num_successful_runs": 3
    },
    "500": {
      "avg_time_to_first_chunk": 0.224,
      "avg_total_time": 4.05,
      "avg_audio_length": 147.775,
      "num_successful_runs": 3
    }
  },
  "timestamp": "2025-01-04 14:59:28"
}
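Note: these streaming results carry the headline number of this PR: average time-to-first-chunk stays roughly flat (about 0.22-0.37s) regardless of input length, whereas the non-streaming endpoint's first byte scales with input length (about 4.5s at 500 tokens), for essentially the same total generation time. A small comparison sketch, assuming the two result sets above are saved as benchmark_results.json and benchmark_results_stream.json (both file names are assumptions):

import json

# Assumed file names for the two result sets shown above.
with open("benchmark_results.json") as f:
    non_stream = json.load(f)["summary"]
with open("benchmark_results_stream.json") as f:
    stream = json.load(f)["summary"]

# Compare average time-to-first-chunk at the token counts both runs covered.
print(f"{'tokens':>6} {'TTFC (full)':>12} {'TTFC (stream)':>14} {'speedup':>8}")
for tokens in sorted(set(non_stream) & set(stream), key=int):
    full_ttfc = non_stream[tokens]["avg_time_to_first_chunk"]
    stream_ttfc = stream[tokens]["avg_time_to_first_chunk"]
    print(f"{tokens:>6} {full_ttfc:>11.3f}s {stream_ttfc:>13.3f}s "
          f"{full_ttfc / stream_ttfc:>7.1f}x")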
[5 binary images added: 246 KiB, 214 KiB, 233 KiB, 189 KiB, 243 KiB]
@ -2,199 +2,134 @@ import numpy as np
 import soundfile as sf
 import argparse
 from pathlib import Path
+from typing import Dict, Any
+
 def validate_tts(wav_path: str) -> dict:
     """
-    Quick validation checks for TTS-generated audio files to detect common artifacts.
-
-    Checks for:
-    - Unnatural silence gaps
-    - Audio glitches and artifacts
-    - Repeated speech segments (stuck/looping)
-    - Abrupt changes in speech
-    - Audio quality issues
-
-    Args:
-        wav_path: Path to audio file (wav, mp3, etc)
-    Returns:
-        Dictionary with validation results
+    Validation checks for TTS-generated audio files to detect common artifacts.
     """
     try:
-        # Load audio
+        # Load and process audio
         audio, sr = sf.read(wav_path)
         if len(audio.shape) > 1:
-            audio = audio.mean(axis=1)  # Convert to mono
+            audio = np.mean(audio, axis=1)
 
-        # Basic audio stats
         duration = len(audio) / sr
-        rms = np.sqrt(np.mean(audio**2))
-        peak = np.max(np.abs(audio))
-        dc_offset = np.mean(audio)
-
-        # Calculate clipping stats if we're near peak
-        clip_count = np.sum(np.abs(audio) >= 0.99)
-        clip_percent = (clip_count / len(audio)) * 100
-        if clip_percent > 0:
-            clip_stats = f" ({clip_percent:.2e} ratio near peak)"
-        else:
-            clip_stats = " (no samples near peak)"
-
-        # Convert to dB for analysis
-        eps = np.finfo(float).eps
-        db = 20 * np.log10(np.abs(audio) + eps)
-
         issues = []
 
-        # Check if audio is too short (likely failed generation)
-        if duration < 0.1:  # Less than 100ms
+        # Basic quality checks
+        abs_audio = np.abs(audio)
+        stats = {
+            'rms': float(np.sqrt(np.mean(audio**2))),
+            'peak': float(np.max(abs_audio)),
+            'dc_offset': float(np.mean(audio))
+        }
+
+        clip_count = np.sum(abs_audio >= 0.99)
+        clip_percent = (clip_count / len(audio)) * 100
+
+        if duration < 0.1:
             issues.append("WARNING: Audio is suspiciously short - possible failed generation")
 
-        # 1. Check for basic audio quality
-        if peak >= 1.0:
-            # Calculate percentage of samples that are clipping
-            clip_count = np.sum(np.abs(audio) >= 0.99)
-            clip_percent = (clip_count / len(audio)) * 100
-
-            if clip_percent > 1.0:  # Only warn if more than 1% of samples clip
+        if stats['peak'] >= 1.0:
+            if clip_percent > 1.0:
                 issues.append(f"WARNING: Significant clipping detected ({clip_percent:.2e}% of samples)")
-            elif clip_percent > 0.01:  # Add info if more than 0.01% but less than 1%
-                issues.append(f"INFO: Minor peak limiting detected ({clip_percent:.2e}% of samples) - likely intentional normalization")
+            elif clip_percent > 0.01:
+                issues.append(f"INFO: Minor peak limiting detected ({clip_percent:.2e}% of samples)")
 
-        if rms < 0.01:
+        if stats['rms'] < 0.01:
             issues.append("WARNING: Audio is very quiet - possible failed generation")
-        if abs(dc_offset) > 0.1:  # DC offset is particularly bad for speech
-            issues.append(f"WARNING: High DC offset ({dc_offset:.3f}) - may cause audio artifacts")
 
-        # 2. Check for long silence gaps (potential TTS failures)
+        if abs(stats['dc_offset']) > 0.1:
+            issues.append(f"WARNING: High DC offset ({stats['dc_offset']:.3f})")
+
+        # Check for long silence gaps
+        eps = np.finfo(float).eps
+        db = 20 * np.log10(abs_audio + eps)
         silence_threshold = -45  # dB
-        min_silence = 2.0  # Only detect silences longer than 2 seconds
+        min_silence = 2.0  # seconds
         window_size = int(min_silence * sr)
         silence_count = 0
         last_silence = -1
 
-        # Skip the first 0.2s for silence detection (avoid false positives at start)
-        start_idx = int(0.2 * sr)
+        start_idx = int(0.2 * sr)  # Skip first 0.2s
         for i in range(start_idx, len(db) - window_size, window_size):
             window = db[i:i+window_size]
             if np.mean(window) < silence_threshold:
-                # Verify the entire window is mostly silence
                 silent_ratio = np.mean(window < silence_threshold)
-                if silent_ratio > 0.9:  # 90% of the window should be below threshold
-                    if last_silence == -1 or (i/sr - last_silence) > 2.0:  # Only count silences more than 2s apart
+                if silent_ratio > 0.9:
+                    if last_silence == -1 or (i/sr - last_silence) > 2.0:
                         silence_count += 1
                         last_silence = i/sr
                         issues.append(f"WARNING: Long silence detected at {i/sr:.2f}s (duration: {min_silence:.1f}s)")
 
-        if silence_count > 2:  # Only warn if there are multiple long silences
-            issues.append(f"WARNING: Multiple long silences found ({silence_count} total) - possible generation issue")
+        if silence_count > 2:
+            issues.append(f"WARNING: Multiple long silences found ({silence_count} total)")
 
-        # 3. Check for extreme audio artifacts (changes too rapid for natural speech)
-        # Use a longer window to avoid flagging normal phoneme transitions
-        window_size = int(0.02 * sr)  # 20ms window
-        db_smooth = np.convolve(db, np.ones(window_size)/window_size, 'same')
-        db_diff = np.abs(np.diff(db_smooth))
-
-        # Much higher threshold to only catch truly unnatural changes
-        artifact_threshold = 40  # dB
-        min_duration = int(0.01 * sr)  # Minimum 10ms duration
-
-        # Find regions where the smoothed dB change is extreme
-        artifact_points = np.where(db_diff > artifact_threshold)[0]
-
-        if len(artifact_points) > 0:
-            # Group artifacts that are very close together
-            grouped_artifacts = []
-            current_group = [artifact_points[0]]
-
-            for i in range(1, len(artifact_points)):
-                if (artifact_points[i] - current_group[-1]) < min_duration:
-                    current_group.append(artifact_points[i])
-                else:
-                    if len(current_group) * (1/sr) >= 0.01:  # Only keep groups lasting >= 10ms
-                        grouped_artifacts.append(current_group)
-                    current_group = [artifact_points[i]]
-
-            if len(current_group) * (1/sr) >= 0.01:
-                grouped_artifacts.append(current_group)
-
-            # Report only the most severe artifacts
-            for group in grouped_artifacts[:2]:  # Report up to 2 worst artifacts
-                center_idx = group[len(group)//2]
-                db_change = db_diff[center_idx]
-                if db_change > 45:  # Only report very extreme changes
-                    issues.append(
-                        f"WARNING: Possible audio artifact at {center_idx/sr:.2f}s "
-                        f"({db_change:.1f}dB change over {len(group)/sr*1000:.0f}ms)"
-                    )
+        # Detect audio artifacts
+        diff = np.diff(audio)
+        abs_diff = np.abs(diff)
+        window_size = min(int(0.005 * sr), 256)
+        window = np.ones(window_size)/window_size
+        local_avg_diff = np.convolve(abs_diff, window, mode='same')
+
+        spikes = (abs_diff > (10 * local_avg_diff)) & (abs_diff > 0.1)
+        artifact_indices = np.nonzero(spikes)[0]
+
+        artifacts = []
+        if len(artifact_indices) > 0:
+            gaps = np.diff(artifact_indices)
+            min_gap = int(0.005 * sr)
+            break_points = np.nonzero(gaps > min_gap)[0] + 1
+            groups = np.split(artifact_indices, break_points)
+
+            for group in groups:
+                if len(group) >= 5:
+                    severity = np.max(abs_diff[group])
+                    if severity > 0.2:
+                        center_idx = group[len(group)//2]
+                        artifacts.append({
+                            'time': float(center_idx/sr),  # Ensure float for consistent timing
+                            'severity': float(severity)
+                        })
+                        issues.append(
+                            f"WARNING: Audio discontinuity at {center_idx/sr:.3f}s "
+                            f"(severity: {severity:.3f})"
+                        )
 
-        # 4. Check for repeated speech segments (stuck/looping)
-        # Check both short and long sentence durations at audiobook speed (150-160 wpm)
-        for chunk_duration in [5.0, 10.0]:  # 5s (~12 words) and 10s (~25 words) at ~audiobook speed
+        # Check for repeated speech segments
+        for chunk_duration in [5.0, 10.0]:
             chunk_size = int(chunk_duration * sr)
-            overlap = int(0.2 * chunk_size)  # 20% overlap between chunks
+            overlap = int(0.2 * chunk_size)
 
             for i in range(0, len(audio) - 2*chunk_size, overlap):
                 chunk1 = audio[i:i+chunk_size]
                 chunk2 = audio[i+chunk_size:i+2*chunk_size]
 
-                # Ignore chunks that are mostly silence
                 if np.mean(np.abs(chunk1)) < 0.01 or np.mean(np.abs(chunk2)) < 0.01:
                     continue
 
                 try:
                     correlation = np.corrcoef(chunk1, chunk2)[0,1]
-                    if not np.isnan(correlation) and correlation > 0.92:  # Lower threshold for sentence-length chunks
+                    if not np.isnan(correlation) and correlation > 0.92:
                         issues.append(
                             f"WARNING: Possible repeated speech at {i/sr:.1f}s "
                             f"(~{int(chunk_duration*160/60):d} words, correlation: {correlation:.3f})"
                         )
-                        break  # Found repetition at this duration, try next duration
+                        break
                 except:
                     continue
 
-        # 5. Check for extreme amplitude discontinuities (common in failed TTS)
-        amplitude_envelope = np.abs(audio)
-        window_size = sr // 10  # 100ms window for smoother envelope
-        smooth_env = np.convolve(amplitude_envelope, np.ones(window_size)/float(window_size), 'same')
-        env_diff = np.abs(np.diff(smooth_env))
-
-        # Only detect very extreme amplitude changes
-        jump_threshold = 0.5  # Much higher threshold
-        jumps = np.where(env_diff > jump_threshold)[0]
-
-        if len(jumps) > 0:
-            # Group jumps that are close together
-            grouped_jumps = []
-            current_group = [jumps[0]]
-
-            for i in range(1, len(jumps)):
-                if (jumps[i] - current_group[-1]) < 0.05 * sr:  # Group within 50ms
-                    current_group.append(jumps[i])
-                else:
-                    if len(current_group) >= 3:  # Only keep significant discontinuities
-                        grouped_jumps.append(current_group)
-                    current_group = [jumps[i]]
-
-            if len(current_group) >= 3:
-                grouped_jumps.append(current_group)
-
-            # Report only the most severe discontinuities
-            for group in grouped_jumps[:2]:  # Report up to 2 worst cases
-                center_idx = group[len(group)//2]
-                jump_size = env_diff[center_idx]
-                if jump_size > 0.6:  # Only report very extreme changes
-                    issues.append(
-                        f"WARNING: Possible audio discontinuity at {center_idx/sr:.2f}s "
-                        f"({jump_size:.2f} amplitude ratio change)"
-                    )
-
         return {
             "file": wav_path,
             "duration": f"{duration:.2f}s",
             "sample_rate": sr,
-            "peak_amplitude": f"{peak:.3f}{clip_stats}",
-            "rms_level": f"{rms:.3f}",
-            "dc_offset": f"{dc_offset:.3f}",
+            "peak_amplitude": f"{stats['peak']:.3f}",
+            "rms_level": f"{stats['rms']:.3f}",
+            "dc_offset": f"{stats['dc_offset']:.3f}",
+            "artifact_count": len(artifacts),
+            "artifact_locations": [a['time'] for a in artifacts],
+            "artifact_severities": [a['severity'] for a in artifacts],
            "issues": issues,
            "valid": len(issues) == 0
        }
@ -206,12 +141,78 @@ def validate_tts(wav_path: str) -> dict:
             "valid": False
         }
 
-if __name__ == "__main__":
-    parser = argparse.ArgumentParser(description="TTS Output Validator")
-    parser.add_argument("wav_file", help="Path to audio file to validate")
-    args = parser.parse_args()
-
-    result = validate_tts(args.wav_file)
+def generate_analysis_plots(wav_path: str, output_dir: str, validation_result: Dict[str, Any]):
+    """
+    Generate analysis plots for audio file with time-aligned visualizations.
+    """
+    import matplotlib.pyplot as plt
+    from scipy.signal import spectrogram
+
+    # Load audio
+    audio, sr = sf.read(wav_path)
+    if len(audio.shape) > 1:
+        audio = np.mean(audio, axis=1)
+
+    # Create figure with shared x-axis
+    fig = plt.figure(figsize=(15, 8))
+    gs = plt.GridSpec(2, 1, height_ratios=[1.2, 0.8], hspace=0.1)
+    ax1 = fig.add_subplot(gs[0])
+    ax2 = fig.add_subplot(gs[1], sharex=ax1)
+
+    # Calculate spectrogram
+    nperseg = 2048
+    noverlap = 1536
+    f, t, Sxx = spectrogram(audio, sr, nperseg=nperseg, noverlap=noverlap,
+                            window='hann', scaling='spectrum')
+
+    # Plot spectrogram
+    im = ax1.pcolormesh(t, f, 10 * np.log10(Sxx + 1e-10),
+                        shading='gouraud', cmap='viridis',
+                        vmin=-100, vmax=-20)
+    ax1.set_ylabel('Frequency [Hz]', fontsize=10)
+    cbar = plt.colorbar(im, ax=ax1, label='dB')
+    ax1.set_title('Spectrogram', pad=10, fontsize=12)
+
+    # Plot waveform with exact time alignment
+    times = np.arange(len(audio)) / sr
+    ax2.plot(times, audio, color='#2E5596', alpha=0.7, linewidth=0.5, label='Audio')
+    ax2.set_ylabel('Amplitude', fontsize=10)
+    ax2.set_xlabel('Time [sec]', fontsize=10)
+    ax2.grid(True, alpha=0.2)
+
+    # Add artifact markers
+    if 'artifact_locations' in validation_result and validation_result['artifact_locations']:
+        for loc in validation_result['artifact_locations']:
+            ax1.axvline(x=loc, color='red', alpha=0.7, linewidth=2)
+            ax2.axvline(x=loc, color='red', alpha=0.7, linewidth=2, label='Detected Artifacts')
+
+        # Add legend to both plots
+        if len(validation_result['artifact_locations']) > 0:
+            ax1.plot([], [], color='red', linewidth=2, label='Detected Artifacts')
+            ax1.legend(loc='upper right', fontsize=8)
+            # Only add unique labels to legend
+            handles, labels = ax2.get_legend_handles_labels()
+            unique_labels = dict(zip(labels, handles))
+            ax2.legend(unique_labels.values(), unique_labels.keys(),
+                       loc='upper right', fontsize=8)
+
+    # Set common x limits
+    xlim = (0, len(audio)/sr)
+    ax1.set_xlim(xlim)
+    ax2.set_xlim(xlim)
+
+    og_filename = Path(wav_path).name.split(".")[0]
+    # Save plot
+    plt.savefig(Path(output_dir) / f"{og_filename}_audio_analysis.png", dpi=300, bbox_inches='tight')
+    plt.close()
+
+if __name__ == "__main__":
+    wav_file = r"C:\Users\jerem\Desktop\Kokoro-FastAPI\examples\output.wav"
+    silent = False
+
+    result = validate_tts(wav_file)
+    if not silent:
+        wav_root_dir = Path(wav_file).parent
+        generate_analysis_plots(wav_file, wav_root_dir, result)
 
     print(f"\nValidating: {result['file']}")
     if "error" in result:
@ -222,6 +223,7 @@ if __name__ == "__main__":
     print(f"Peak Amplitude: {result['peak_amplitude']}")
     print(f"RMS Level: {result['rms_level']}")
     print(f"DC Offset: {result['dc_offset']}")
+    print(f"Detected Artifacts: {result['artifact_count']}")
 
     if result["issues"]:
         print("\nIssues Found:")
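Note: the reworked validator above now returns machine-readable artifact data ("artifact_count", "artifact_locations", "artifact_severities") alongside the human-readable "issues" list, which makes it usable in batch over the benchmark outputs. A minimal sketch of such batch usage; the validate_wav module name and the output directory path are assumptions:

from pathlib import Path

# Assumed module name for the validator file shown in the diff above.
from validate_wav import validate_tts, generate_analysis_plots

# Assumed location of the streaming benchmark WAVs.
out_dir = Path("examples/assorted_checks/benchmarks/output_audio_stream")
for wav in sorted(out_dir.glob("*.wav")):
    result = validate_tts(str(wav))
    status = "OK" if result["valid"] else f"{len(result['issues'])} issue(s)"
    print(f"{wav.name}: {status}")
    # Only render plots for files that failed validation (but loaded fine).
    if not result["valid"] and "error" not in result:
        generate_analysis_plots(str(wav), str(out_dir), result)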
BIN  examples/audio_analysis.png (new file, 5.2 MiB)
BIN  examples/output.wav (new file)
BIN  examples/output_audio_analysis.png (new file, 142 KiB)
144  examples/stream_tts_playback.py (new file)
@ -0,0 +1,144 @@
#!/usr/bin/env python3
import requests
import sounddevice as sd
import numpy as np
import time
import os
import wave

def play_streaming_tts(text: str, output_file: str = None, voice: str = "af"):
    """Stream TTS audio and play it back in real-time"""

    print("\nStarting TTS stream request...")
    start_time = time.time()

    # Initialize variables
    sample_rate = 24000  # Known sample rate for Kokoro
    audio_started = False
    stream = None
    chunk_count = 0
    total_bytes = 0
    first_chunk_time = None
    all_audio_data = bytearray()  # Raw PCM audio data

    # Make streaming request to API
    try:
        response = requests.post(
            "http://localhost:8880/v1/audio/speech",
            json={
                "model": "kokoro",
                "input": text,
                "voice": voice,
                "response_format": "pcm",
                "stream": True
            },
            stream=True,
            timeout=1800
        )
        response.raise_for_status()
        print(f"Request started successfully after {time.time() - start_time:.2f}s")

        # Process streaming response
        for chunk in response.iter_content(chunk_size=1024):
            if chunk:
                chunk_count += 1
                total_bytes += len(chunk)

                # Handle first chunk
                if not audio_started:
                    first_chunk_time = time.time()
                    print(f"\nReceived first chunk after {first_chunk_time - start_time:.2f}s")
                    print(f"First chunk size: {len(chunk)} bytes")

                    # Accumulate raw audio data
                    all_audio_data.extend(chunk)

                    # Convert PCM to float32 for playback
                    audio_data = np.frombuffer(chunk, dtype=np.int16).astype(np.float32)
                    # Scale to [-1, 1] range for sounddevice
                    audio_data = audio_data / 32768.0

                    # Start audio stream
                    stream = sd.OutputStream(
                        samplerate=sample_rate,
                        channels=1,
                        dtype=np.float32
                    )
                    stream.start()
                    audio_started = True
                    print("Audio playback started")

                    # Play first chunk
                    if len(audio_data) > 0:
                        stream.write(audio_data)

                # Handle subsequent chunks
                else:
                    # Accumulate raw audio data
                    all_audio_data.extend(chunk)

                    # Convert PCM to float32 for playback
                    audio_data = np.frombuffer(chunk, dtype=np.int16).astype(np.float32)
                    audio_data = audio_data / 32768.0
                    if len(audio_data) > 0:
                        stream.write(audio_data)

                # Log progress every 10 chunks
                if chunk_count % 10 == 0:
                    elapsed = time.time() - start_time
                    print(f"Progress: {chunk_count} chunks, {total_bytes/1024:.1f}KB received, {elapsed:.1f}s elapsed")

        # Final stats
        total_time = time.time() - start_time
        print(f"\nStream complete:")
        print(f"Total chunks: {chunk_count}")
        print(f"Total data: {total_bytes/1024:.1f}KB")
        print(f"Total time: {total_time:.2f}s")
        print(f"Average speed: {(total_bytes/1024)/total_time:.1f}KB/s")

        # Save as WAV file
        if output_file:
            print(f"\nWriting audio to {output_file}")
            with wave.open(output_file, 'wb') as wav_file:
                wav_file.setnchannels(1)  # Mono
                wav_file.setsampwidth(2)  # 2 bytes per sample (16-bit)
                wav_file.setframerate(sample_rate)
                wav_file.writeframes(all_audio_data)
            print(f"Saved {len(all_audio_data)} bytes of audio data")

        # Clean up
        if stream is not None:
            stream.stop()
            stream.close()

    except requests.exceptions.ConnectionError as e:
        print(f"Connection error - Is the server running? Error: {str(e)}")
        if stream is not None:
            stream.stop()
            stream.close()
    except Exception as e:
        print(f"Error during streaming: {str(e)}")
        if stream is not None:
            stream.stop()
            stream.close()

def main():
    # Load sample text from HG Wells
    script_dir = os.path.dirname(os.path.abspath(__file__))
    wells_path = os.path.join(script_dir, "assorted_checks/benchmarks/the_time_machine_hg_wells.txt")
    output_path = os.path.join(script_dir, "output.wav")

    with open(wells_path, "r", encoding="utf-8") as f:
        full_text = f.read()
        # Take first few paragraphs
        text = " ".join(full_text.split("\n\n")[:2])

    print("\nStarting TTS stream playback...")
    print(f"Text length: {len(text)} characters")
    print("\nFirst 100 characters:")
    print(text[:100] + "...")

    play_streaming_tts(text, output_file=output_path)

if __name__ == "__main__":
    main()
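Note: the script above hardcodes the server URL (http://localhost:8880) and Kokoro's 24 kHz 16-bit PCM output; sounddevice additionally needs a working PortAudio install on the machine. A minimal sketch of reusing the player with custom text, assuming the file is importable as stream_tts_playback (that import path is an assumption):

# Assumes the API server from this PR is already running on localhost:8880.
from stream_tts_playback import play_streaming_tts  # assumed import path

play_streaming_tts(
    "Hello from the streaming endpoint.",
    output_file="hello.wav",  # optional; omit to play without saving
    voice="af",
)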