First streaming attempt

This commit is contained in:
remsky 2025-01-04 17:54:54 -07:00
parent 65bf15f153
commit f1eb1d9590
24 changed files with 1583 additions and 199 deletions

BIN
.coverage

Binary file not shown.

1
.gitignore vendored
View file

@ -23,4 +23,5 @@ examples/assorted_checks/test_openai/output/*
examples/assorted_checks/test_voices/output/* examples/assorted_checks/test_voices/output/*
examples/assorted_checks/test_formats/output/* examples/assorted_checks/test_formats/output/*
examples/assorted_checks/benchmarks/output_audio_stream/*
ui/RepoScreenshot.png ui/RepoScreenshot.png

View file

@ -23,8 +23,16 @@ async def lifespan(app: FastAPI):
# Initialize the main model with warm-up # Initialize the main model with warm-up
voicepack_count = TTSModel.setup() voicepack_count = TTSModel.setup()
logger.info("""
""")
logger.info(f"Model loaded and warmed up on {TTSModel.get_device()}") logger.info(f"Model loaded and warmed up on {TTSModel.get_device()}")
logger.info(f"{voicepack_count} voice packs loaded successfully") logger.info(f"{voicepack_count} voice packs loaded successfully")
logger.info("#" * 80)
yield yield

View file

@ -2,10 +2,12 @@ from typing import List
from loguru import logger from loguru import logger
from fastapi import Depends, Response, APIRouter, HTTPException from fastapi import Depends, Response, APIRouter, HTTPException
from fastapi.responses import StreamingResponse
from ..services.tts_service import TTSService from ..services.tts_service import TTSService
from ..services.audio import AudioService from ..services.audio import AudioService
from ..structures.schemas import OpenAISpeechRequest from ..structures.schemas import OpenAISpeechRequest
from typing import AsyncGenerator
router = APIRouter( router = APIRouter(
tags=["OpenAI Compatible TTS"], tags=["OpenAI Compatible TTS"],
@ -18,6 +20,16 @@ def get_tts_service() -> TTSService:
return TTSService() # Initialize TTSService with default settings return TTSService() # Initialize TTSService with default settings
async def stream_audio_chunks(
    tts_service: TTSService, request: OpenAISpeechRequest
) -> AsyncGenerator[bytes, None]:
    """Relay encoded audio from the TTS service as it is produced.

    Args:
        tts_service: Service whose ``generate_audio_stream`` yields encoded audio.
        request: Speech request supplying ``input``, ``voice``, ``speed`` and
            ``response_format``.

    Yields:
        Each item produced by ``tts_service.generate_audio_stream``, unchanged.
    """
    audio_stream = tts_service.generate_audio_stream(
        text=request.input,
        voice=request.voice,
        speed=request.speed,
        output_format=request.response_format,
    )
    async for encoded in audio_stream:
        yield encoded
@router.post("/audio/speech") @router.post("/audio/speech")
async def create_speech( async def create_speech(
request: OpenAISpeechRequest, tts_service: TTSService = Depends(get_tts_service) request: OpenAISpeechRequest, tts_service: TTSService = Depends(get_tts_service)
@ -31,24 +43,52 @@ async def create_speech(
f"Voice '{request.voice}' not found. Available voices: {', '.join(sorted(available_voices))}" f"Voice '{request.voice}' not found. Available voices: {', '.join(sorted(available_voices))}"
) )
# Generate audio directly using TTSService's method # Set content type based on format
audio, _ = tts_service._generate_audio( content_type = {
text=request.input, "mp3": "audio/mpeg",
voice=request.voice, "opus": "audio/opus",
speed=request.speed, "aac": "audio/aac",
stitch_long_output=True, "flac": "audio/flac",
) "wav": "audio/wav",
"pcm": "audio/pcm",
}.get(request.response_format, f"audio/{request.response_format}")
# Convert to requested format if request.stream:
content = AudioService.convert_audio(audio, 24000, request.response_format) # Stream audio chunks as they're generated
return StreamingResponse(
stream_audio_chunks(tts_service, request),
media_type=content_type,
headers={
"Content-Disposition": f"attachment; filename=speech.{request.response_format}",
"X-Accel-Buffering": "no", # Disable proxy buffering
"Cache-Control": "no-cache", # Prevent caching
},
)
else:
# Generate complete audio
audio, _ = tts_service._generate_audio(
text=request.input,
voice=request.voice,
speed=request.speed,
stitch_long_output=True,
)
return Response( # Convert to requested format
content=content, content = AudioService.convert_audio(
media_type=f"audio/{request.response_format}", audio,
headers={ 24000,
"Content-Disposition": f"attachment; filename=speech.{request.response_format}" request.response_format,
}, is_first_chunk=True
) )
return Response(
content=content,
media_type=content_type,
headers={
"Content-Disposition": f"attachment; filename=speech.{request.response_format}",
"Cache-Control": "no-cache", # Prevent caching
},
)
except ValueError as e: except ValueError as e:
logger.error(f"Invalid request: {str(e)}") logger.error(f"Invalid request: {str(e)}")

View file

@ -7,12 +7,35 @@ import soundfile as sf
from loguru import logger from loguru import logger
class AudioNormalizer:
    """Scales audio samples into the int16 range.

    Each ``normalize`` call is independent: the input is scaled so that its own
    peak lands on the int16 maximum. The instance only stores that maximum.
    """

    def __init__(self):
        self.int16_max = np.iinfo(np.int16).max

    def normalize(self, audio_data: np.ndarray) -> np.ndarray:
        """Peak-normalize ``audio_data`` and return it as int16.

        Args:
            audio_data: Array of audio samples.

        Returns:
            An int16 array of the same shape. Empty and all-zero inputs are
            returned without scaling.
        """
        # Work in float64 so the scaling does not lose precision
        audio_float = audio_data.astype(np.float64)

        # An empty chunk has no peak; ``.max()`` on it would raise ValueError
        if audio_float.size == 0:
            return audio_float.astype(np.int16)

        # Scale to the int16 range while preserving relative amplitudes
        max_val = np.abs(audio_float).max()
        if max_val > 0:
            audio_float *= self.int16_max / max_val

        # Clip symmetrically and convert
        return np.clip(audio_float, -self.int16_max, self.int16_max).astype(np.int16)
class AudioService: class AudioService:
"""Service for audio format conversions""" """Service for audio format conversions"""
@staticmethod @staticmethod
def convert_audio( def convert_audio(
audio_data: np.ndarray, sample_rate: int, output_format: str audio_data: np.ndarray,
sample_rate: int,
output_format: str,
is_first_chunk: bool = True,
normalizer: AudioNormalizer = None
) -> bytes: ) -> bytes:
"""Convert audio data to specified format """Convert audio data to specified format
@ -20,6 +43,7 @@ class AudioService:
audio_data: Numpy array of audio samples audio_data: Numpy array of audio samples
sample_rate: Sample rate of the audio sample_rate: Sample rate of the audio
output_format: Target format (wav, mp3, opus, flac, pcm) output_format: Target format (wav, mp3, opus, flac, pcm)
is_first_chunk: Whether this is the first chunk of a stream
Returns: Returns:
Bytes of the converted audio Bytes of the converted audio
@ -27,30 +51,34 @@ class AudioService:
buffer = BytesIO() buffer = BytesIO()
try: try:
if output_format == "wav": # Normalize audio if normalizer provided, otherwise just convert to int16
if normalizer is not None:
normalized_audio = normalizer.normalize(audio_data)
else:
normalized_audio = audio_data.astype(np.int16)
if output_format == "pcm":
logger.info("Writing PCM data...")
# Raw 16-bit PCM samples, no header
buffer.write(normalized_audio.tobytes())
elif output_format == "wav":
logger.info("Writing to WAV format...") logger.info("Writing to WAV format...")
# Ensure audio_data is in int16 format for WAV # Always include WAV header for WAV format
audio_data_wav = ( sf.write(buffer, normalized_audio, sample_rate, format="WAV", subtype='PCM_16')
audio_data / np.abs(audio_data).max() * np.iinfo(np.int16).max elif output_format in ["mp3", "aac"]:
).astype(np.int16) # Normalize logger.info(f"Converting to {output_format.upper()} format...")
sf.write(buffer, audio_data_wav, sample_rate, format="WAV") # Use lower bitrate for streaming
elif output_format == "mp3": sf.write(buffer, normalized_audio, sample_rate, format=output_format.upper(),
logger.info("Converting to MP3 format...") subtype='COMPRESSED')
# soundfile can write MP3 if ffmpeg or libsox is installed
sf.write(buffer, audio_data, sample_rate, format="MP3")
elif output_format == "opus": elif output_format == "opus":
logger.info("Converting to Opus format...") logger.info("Converting to Opus format...")
sf.write(buffer, audio_data, sample_rate, format="OGG", subtype="OPUS") # Use lower bitrate and smaller frame size for streaming
sf.write(buffer, normalized_audio, sample_rate, format="OGG", subtype="OPUS")
elif output_format == "flac": elif output_format == "flac":
logger.info("Converting to FLAC format...") logger.info("Converting to FLAC format...")
sf.write(buffer, audio_data, sample_rate, format="FLAC") # Use smaller block size for streaming
elif output_format == "pcm": sf.write(buffer, normalized_audio, sample_rate, format="FLAC",
logger.info("Extracting PCM data...") subtype='PCM_16')
# Ensure audio_data is in int16 format for PCM
audio_data_pcm = (
audio_data / np.abs(audio_data).max() * np.iinfo(np.int16).max
).astype(np.int16) # Normalize
buffer.write(audio_data_pcm.tobytes())
else: else:
raise ValueError( raise ValueError(
f"Format {output_format} not supported. Supported formats are: wav, mp3, opus, flac, pcm." f"Format {output_format} not supported. Supported formats are: wav, mp3, opus, flac, pcm."

View file

@ -1,4 +1,5 @@
import re import re
from functools import lru_cache
def split_num(num: re.Match) -> str: def split_num(num: re.Match) -> str:
"""Handle number splitting for various formats""" """Handle number splitting for various formats"""
@ -48,6 +49,7 @@ def handle_decimal(num: re.Match) -> str:
a, b = num.group().split(".") a, b = num.group().split(".")
return " point ".join([a, " ".join(b)]) return " point ".join([a, " ".join(b)])
@lru_cache(maxsize=1000) # Cache normalized text results
def normalize_text(text: str) -> str: def normalize_text(text: str) -> str:
"""Normalize text for TTS processing """Normalize text for TTS processing

View file

@ -3,6 +3,7 @@ import os
import re import re
import time import time
from typing import List, Tuple, Optional from typing import List, Tuple, Optional
from functools import lru_cache
import numpy as np import numpy as np
import torch import torch
@ -12,6 +13,7 @@ from loguru import logger
from ..core.config import settings from ..core.config import settings
from .tts_model import TTSModel from .tts_model import TTSModel
from .audio import AudioService, AudioNormalizer
class TTSService: class TTSService:
@ -24,6 +26,12 @@ class TTSService:
text = str(text) if text is not None else "" text = str(text) if text is not None else ""
return [s.strip() for s in re.split(r"(?<=[.!?])\s+", text) if s.strip()] return [s.strip() for s in re.split(r"(?<=[.!?])\s+", text) if s.strip()]
@staticmethod
@lru_cache(maxsize=20)  # Cache up to 20 most recently used voices
def _load_voice(voice_path: str) -> torch.Tensor:
    """Load the voice file at ``voice_path`` onto the model's device.

    Results are memoized by path (least-recently-used, up to 20 entries), so
    repeated requests for the same voice skip the disk read. A file that
    changes on disk after its first load is not re-read while it is cached.
    """
    return torch.load(voice_path, map_location=TTSModel.get_device(), weights_only=True)
def _get_voice_path(self, voice_name: str) -> Optional[str]: def _get_voice_path(self, voice_name: str) -> Optional[str]:
"""Get the path to a voice file""" """Get the path to a voice file"""
voice_path = os.path.join(TTSModel.VOICES_DIR, f"{voice_name}.pt") voice_path = os.path.join(TTSModel.VOICES_DIR, f"{voice_name}.pt")
@ -31,6 +39,13 @@ class TTSService:
def _generate_audio( def _generate_audio(
self, text: str, voice: str, speed: float, stitch_long_output: bool = True self, text: str, voice: str, speed: float, stitch_long_output: bool = True
) -> Tuple[torch.Tensor, float]:
"""Generate complete audio and return with processing time"""
audio, processing_time = self._generate_audio_internal(text, voice, speed, stitch_long_output)
return audio, processing_time
def _generate_audio_internal(
self, text: str, voice: str, speed: float, stitch_long_output: bool = True
) -> Tuple[torch.Tensor, float]: ) -> Tuple[torch.Tensor, float]:
"""Generate audio and measure processing time""" """Generate audio and measure processing time"""
start_time = time.time() start_time = time.time()
@ -49,10 +64,8 @@ class TTSService:
if not voice_path: if not voice_path:
raise ValueError(f"Voice not found: {voice}") raise ValueError(f"Voice not found: {voice}")
# Load voice # Load voice using cached loader
voicepack = torch.load( voicepack = self._load_voice(voice_path)
voice_path, map_location=TTSModel.get_device(), weights_only=True
)
# Generate audio with or without stitching # Generate audio with or without stitching
if stitch_long_output: if stitch_long_output:
@ -97,6 +110,78 @@ class TTSService:
logger.error(f"Error in audio generation: {str(e)}") logger.error(f"Error in audio generation: {str(e)}")
raise raise
async def generate_audio_stream(
    self, text: str, voice: str, speed: float, output_format: str = "wav"
):
    """Generate speech chunk by chunk and yield each chunk's encoded bytes.

    The text is normalized, split into sentences, and the sentences are
    grouped into chunks of roughly 100 characters. Each chunk is synthesized
    and converted to ``output_format`` before being yielded.

    Args:
        text: Text to synthesize.
        voice: Voice name; its first character is passed to
            ``TTSModel.process_text`` (presumably a language code - TODO confirm).
        speed: Speed value forwarded to ``TTSModel.generate_from_tokens``.
        output_format: Target format handed to ``AudioService.convert_audio``.

    Yields:
        Encoded audio bytes, one item per successfully generated chunk.

    Raises:
        ValueError: If the text is empty (before or after normalization) or
            the voice cannot be found. Failures of individual chunks are
            logged and skipped rather than raised.
    """
    try:
        # A single normalizer instance is shared by every chunk of this stream
        stream_normalizer = AudioNormalizer()

        # Input validation and preprocessing
        if not text:
            raise ValueError("Text is empty")
        normalized = normalize_text(text)
        if not normalized:
            raise ValueError("Text is empty after preprocessing")
        text = str(normalized)

        # Voice validation and loading
        voice_path = self._get_voice_path(voice)
        if not voice_path:
            raise ValueError(f"Voice not found: {voice}")
        voicepack = self._load_voice(voice_path)

        # Group whole sentences until a group reaches the target size, so the
        # first chunk is short enough to be synthesized quickly
        target_length = 100
        chunks = []
        pending_sentences = []
        pending_chars = 0
        for sentence in self._split_text(text):
            pending_sentences.append(sentence)
            pending_chars += len(sentence)
            if pending_chars < target_length:
                continue
            chunks.append(" ".join(pending_sentences))
            pending_sentences = []
            pending_chars = 0
        if pending_sentences:
            chunks.append(" ".join(pending_sentences))

        # Synthesize and stream each chunk; a failing chunk is skipped
        for i, chunk in enumerate(chunks):
            try:
                _phonemes, tokens = TTSModel.process_text(chunk, voice[0])
                chunk_audio = TTSModel.generate_from_tokens(tokens, voicepack, speed)

                if chunk_audio is None:
                    logger.error(f"No audio generated for chunk {i + 1}/{len(chunks)}")
                    continue

                yield AudioService.convert_audio(
                    chunk_audio,
                    24000,
                    output_format,
                    is_first_chunk=(i == 0),
                    normalizer=stream_normalizer,
                )
            except Exception as e:
                logger.error(
                    f"Failed to generate audio for chunk {i + 1}/{len(chunks)}: '{chunk}'. Error: {str(e)}"
                )
                continue

    except Exception as e:
        logger.error(f"Error in audio generation stream: {str(e)}")
        raise
def _save_audio(self, audio: torch.Tensor, filepath: str): def _save_audio(self, audio: torch.Tensor, filepath: str):
"""Save audio to file""" """Save audio to file"""
os.makedirs(os.path.dirname(filepath), exist_ok=True) os.makedirs(os.path.dirname(filepath), exist_ok=True)

View file

@ -22,7 +22,7 @@ class OpenAISpeechRequest(BaseModel):
) )
response_format: Literal["mp3", "opus", "aac", "flac", "wav", "pcm"] = Field( response_format: Literal["mp3", "opus", "aac", "flac", "wav", "pcm"] = Field(
default="mp3", default="mp3",
description="The format to return audio in. Supported formats: mp3, opus, flac, wav. AAC and PCM are not currently supported.", description="The format to return audio in. Supported formats: mp3, opus, flac, wav, pcm. PCM format returns raw 16-bit samples without headers. AAC is not currently supported.",
) )
speed: float = Field( speed: float = Field(
default=1.0, default=1.0,
@ -30,3 +30,7 @@ class OpenAISpeechRequest(BaseModel):
le=4.0, le=4.0,
description="The speed of the generated audio. Select a value from 0.25 to 4.0.", description="The speed of the generated audio. Select a value from 0.25 to 4.0.",
) )
stream: bool = Field(
default=False,
description="If true, audio will be streamed as it's generated. Each chunk will be a complete sentence.",
)

View file

@ -46,14 +46,14 @@ services:
model-fetcher: model-fetcher:
condition: service_healthy condition: service_healthy
# Gradio UI service [Comment out everything below if you don't need it] # # Gradio UI service [Comment out everything below if you don't need it]
gradio-ui: # gradio-ui:
build: # build:
context: ./ui # context: ./ui
ports: # ports:
- "7860:7860" # - "7860:7860"
volumes: # volumes:
- ./ui/data:/app/ui/data # - ./ui/data:/app/ui/data
- ./ui/app.py:/app/app.py # Mount app.py for hot reload # - ./ui/app.py:/app/app.py # Mount app.py for hot reload
environment: # environment:
- GRADIO_WATCH=True # Enable hot reloading # - GRADIO_WATCH=True # Enable hot reloading

View file

@ -0,0 +1,157 @@
#!/usr/bin/env python3
import os
import time
import json
import numpy as np
import requests
import pandas as pd
from lib.shared_benchmark_utils import get_text_for_tokens, enc
from lib.shared_utils import save_json_results
from lib.shared_plotting import plot_correlation, plot_timeline
def measure_first_token(text: str, output_dir: str, tokens: int, run_number: int) -> dict:
    """Time a non-streaming speech request and save the returned WAV file.

    Args:
        text: Input text sent to the speech endpoint.
        output_dir: Directory the WAV file is written to.
        tokens: Target token count, used only in the output file name.
        run_number: Run index, used only in the output file name.

    Returns:
        A dict with ``text_length``, ``token_count``, ``total_time``,
        ``time_to_first_chunk``, ``error``, ``audio_path`` and
        ``audio_length``. On failure ``error`` holds the exception text and
        the timing fields stay ``None``.
    """
    results = {
        "text_length": len(text),
        "token_count": len(enc.encode(text)),
        "total_time": None,
        "time_to_first_chunk": None,
        "error": None,
        "audio_path": None,
        "audio_length": None,  # Length of output audio in seconds
    }

    try:
        start_time = time.time()

        # Make request without streaming
        response = requests.post(
            "http://localhost:8880/v1/audio/speech",
            json={
                "model": "kokoro",
                "input": text,
                "voice": "af",
                "response_format": "wav",
                "stream": False,
            },
            timeout=1800,
        )
        response.raise_for_status()
        content = response.content

        # Stop the clock as soon as the full body is in memory, so the
        # latency excludes the file write and WAV re-read below.
        elapsed = time.time() - start_time

        # Save complete audio
        audio_filename = f"benchmark_tokens{tokens}_run{run_number}.wav"
        audio_path = os.path.join(output_dir, audio_filename)
        results["audio_path"] = audio_path
        with open(audio_path, "wb") as f:
            f.write(content)

        # Calculate audio length using scipy
        import scipy.io.wavfile as wavfile

        sample_rate, audio_data = wavfile.read(audio_path)
        results["audio_length"] = len(audio_data) / sample_rate  # Length in seconds

        # Without streaming the whole response arrives at once, so both
        # timing fields carry the same value.
        results["time_to_first_chunk"] = elapsed
        results["total_time"] = elapsed
        return results

    except Exception as e:
        results["error"] = str(e)
        return results
def main():
    """Run the non-streaming time-to-audio benchmark and write results and plots.

    For each target token size the sample text is trimmed with
    ``get_text_for_tokens`` and sent several times through
    ``measure_first_token``. Per-run results and per-size averages are saved as
    JSON, then a correlation plot and a timeline plot are written.
    """
    # Set up paths
    script_dir = os.path.dirname(os.path.abspath(__file__))
    output_dir = os.path.join(script_dir, "output_audio")
    output_data_dir = os.path.join(script_dir, "output_data")

    # Create output directories
    os.makedirs(output_dir, exist_ok=True)
    os.makedirs(output_data_dir, exist_ok=True)

    # Load sample text
    with open(os.path.join(script_dir, "the_time_machine_hg_wells.txt"), "r", encoding="utf-8") as f:
        text = f.read()

    # Test specific token counts
    token_sizes = [10, 25, 50, 100, 200, 500]
    # Single source of truth for the run count, so the progress text matches the loop
    runs_per_size = 5
    all_results = []

    for tokens in token_sizes:
        print(f"\nTesting {tokens} tokens")
        test_text = get_text_for_tokens(text, tokens)
        actual_tokens = len(enc.encode(test_text))
        print(f"Text preview: {test_text[:50]}...")

        # Repeat each size several times to get an average
        for i in range(runs_per_size):
            print(f"Run {i+1}/{runs_per_size}...")
            result = measure_first_token(test_text, output_dir, tokens, i + 1)
            result["target_tokens"] = tokens
            result["actual_tokens"] = actual_tokens
            result["run_number"] = i + 1

            # A failed run has None timings; formatting those with ":.3f"
            # would raise, so report the error instead.
            if result["error"]:
                print(f"Error: {result['error']}")
            else:
                print(f"Time to Audio: {result['time_to_first_chunk']:.3f}s")
                print(f"Total time: {result['total_time']:.3f}s")

            all_results.append(result)

    # Calculate averages per token size (successful runs only)
    summary = {}
    for tokens in token_sizes:
        matching_results = [r for r in all_results if r["target_tokens"] == tokens and not r["error"]]
        if matching_results:
            avg_first_chunk = sum(r["time_to_first_chunk"] for r in matching_results) / len(matching_results)
            avg_total = sum(r["total_time"] for r in matching_results) / len(matching_results)
            avg_audio_length = sum(r["audio_length"] for r in matching_results) / len(matching_results)
            summary[tokens] = {
                "avg_time_to_first_chunk": round(avg_first_chunk, 3),
                "avg_total_time": round(avg_total, 3),
                "avg_audio_length": round(avg_audio_length, 3),
                "num_successful_runs": len(matching_results),
            }

    # Save results
    results_data = {
        "individual_runs": all_results,
        "summary": summary,
        "timestamp": time.strftime("%Y-%m-%d %H:%M:%S"),
    }
    save_json_results(
        results_data,
        os.path.join(output_data_dir, "first_token_benchmark.json")
    )

    # Create plot directory if it doesn't exist
    output_plots_dir = os.path.join(script_dir, "output_plots")
    os.makedirs(output_plots_dir, exist_ok=True)

    # Create DataFrame for plotting
    df = pd.DataFrame(all_results)

    # Create both plots
    plot_correlation(
        df, "target_tokens", "time_to_first_chunk",
        "Time to Audio vs Input Size",
        "Number of Input Tokens",
        "Time to Audio (seconds)",
        os.path.join(output_plots_dir, "first_token_latency.png")
    )
    plot_timeline(
        df,
        os.path.join(output_plots_dir, "first_token_timeline.png")
    )

    print("\nResults and plots saved to:")
    print(f"- {os.path.join(output_data_dir, 'first_token_benchmark.json')}")
    print(f"- {os.path.join(output_plots_dir, 'first_token_latency.png')}")
    print(f"- {os.path.join(output_plots_dir, 'first_token_timeline.png')}")


if __name__ == "__main__":
    main()

View file

@ -0,0 +1,207 @@
#!/usr/bin/env python3
import os
import time
import json
import numpy as np
import requests
import pandas as pd
from lib.shared_benchmark_utils import get_text_for_tokens, enc
from lib.shared_utils import save_json_results
from lib.shared_plotting import plot_correlation, plot_timeline
def measure_first_token(text: str, output_dir: str, tokens: int, run_number: int) -> dict:
    """Time a streaming speech request and save the reassembled WAV file.

    Args:
        text: Input text sent to the speech endpoint.
        output_dir: Directory the WAV file is written to.
        tokens: Target token count, used only in the output file name.
        run_number: Run index, used only in the output file name.

    Returns:
        A dict with ``text_length``, ``token_count``, ``total_time``,
        ``time_to_first_chunk``, ``error``, ``audio_path`` and
        ``audio_length``. ``time_to_first_chunk`` is the delay until the first
        non-empty network chunk; ``total_time`` also covers writing and
        re-reading the file. On failure ``error`` holds the exception text.
    """
    results = {
        "text_length": len(text),
        "token_count": len(enc.encode(text)),
        "total_time": None,
        "time_to_first_chunk": None,
        "error": None,
        "audio_path": None,
        "audio_length": None,  # Length of output audio in seconds
    }

    try:
        start_time = time.time()
        chunks = []

        # Make request with streaming enabled; the context manager releases
        # the connection even if reading the stream fails part-way.
        with requests.post(
            "http://localhost:8880/v1/audio/speech",
            json={
                "model": "kokoro",
                "input": text,
                "voice": "af",
                "response_format": "wav",
                "stream": True,
            },
            stream=True,
            timeout=1800,
        ) as response:
            response.raise_for_status()

            audio_filename = f"benchmark_tokens{tokens}_run{run_number}_stream.wav"
            audio_path = os.path.join(output_dir, audio_filename)
            results["audio_path"] = audio_path

            for chunk in response.iter_content(chunk_size=1024):
                if not chunk:
                    continue
                if results["time_to_first_chunk"] is None:
                    results["time_to_first_chunk"] = time.time() - start_time
                chunks.append(chunk)

        if not chunks:
            raise ValueError("No audio chunks received")

        # Split header from sample data on the joined stream rather than on
        # the first network chunk, which may be shorter than the header.
        # NOTE(review): assumes one 44-byte WAV header followed by raw PCM -
        # confirm against what the streaming endpoint actually sends.
        wav_header_size = 44
        stream_bytes = b"".join(chunks)
        if len(stream_bytes) < wav_header_size:
            raise ValueError("Incomplete WAV header received")
        header = stream_bytes[:wav_header_size]
        all_data = stream_bytes[wav_header_size:]

        # Patch the header sizes so they describe the full reassembled file
        import struct

        data_size = len(all_data)
        # RIFF chunk size field (bytes 4-7): data size plus the other 36 header bytes
        header = header[:4] + struct.pack('<I', data_size + 36) + header[8:]
        # Data sub-chunk size field (bytes 40-43)
        header = header[:40] + struct.pack('<I', data_size) + header[44:]

        # Write complete WAV file
        complete_audio = header + all_data
        with open(audio_path, 'wb') as f:
            f.write(complete_audio)

        # Calculate audio length using scipy
        import scipy.io.wavfile as wavfile

        sample_rate, audio_data = wavfile.read(audio_path)
        results["audio_length"] = len(audio_data) / sample_rate  # Length in seconds

        results["total_time"] = time.time() - start_time

        # Print debug info
        print(f"Complete audio size: {len(complete_audio)} bytes")
        print(f"Number of chunks received: {len(chunks)}")
        print(f"Audio length: {results['audio_length']:.3f}s")

        return results

    except Exception as e:
        results["error"] = str(e)
        return results
def main():
    """Run the streaming time-to-first-audio benchmark and write results and plots.

    For each target token size the sample text is trimmed with
    ``get_text_for_tokens`` and sent several times through
    ``measure_first_token``. Per-run results and per-size averages are saved as
    JSON, then two correlation plots and a timeline plot are written.
    """
    # Set up paths with _stream suffix
    script_dir = os.path.dirname(os.path.abspath(__file__))
    output_dir = os.path.join(script_dir, "output_audio_stream")
    output_data_dir = os.path.join(script_dir, "output_data")

    # Create output directories
    os.makedirs(output_dir, exist_ok=True)
    os.makedirs(output_data_dir, exist_ok=True)

    # Load sample text
    with open(os.path.join(script_dir, "the_time_machine_hg_wells.txt"), "r", encoding="utf-8") as f:
        text = f.read()

    # Test specific token counts
    token_sizes = [50, 100, 200, 500]
    # Single source of truth for the run count, so the progress text matches the loop
    runs_per_size = 3
    all_results = []

    for tokens in token_sizes:
        print(f"\nTesting {tokens} tokens (streaming)")
        test_text = get_text_for_tokens(text, tokens)
        actual_tokens = len(enc.encode(test_text))
        print(f"Text preview: {test_text[:50]}...")

        # Repeat each size several times to get an average
        for i in range(runs_per_size):
            print(f"Run {i+1}/{runs_per_size}...")
            result = measure_first_token(test_text, output_dir, tokens, i + 1)
            result["target_tokens"] = tokens
            result["actual_tokens"] = actual_tokens
            result["run_number"] = i + 1

            # A failed run leaves some timings as None; formatting or
            # subtracting those would raise, so report the error instead.
            if result["error"]:
                print(f"Error: {result['error']}")
            else:
                print(f"Time to First Audio: {result['time_to_first_chunk']:.3f}s")
                print(f"Time to Save Complete: {result['total_time']:.3f}s")
                print(f"Audio length: {result['audio_length']:.3f}s")
                print(f"Streaming overhead: {(result['total_time'] - result['time_to_first_chunk']):.3f}s")

            all_results.append(result)

    # Calculate averages per token size (successful runs only)
    summary = {}
    for tokens in token_sizes:
        matching_results = [r for r in all_results if r["target_tokens"] == tokens and not r["error"]]
        if matching_results:
            avg_first_chunk = sum(r["time_to_first_chunk"] for r in matching_results) / len(matching_results)
            avg_total = sum(r["total_time"] for r in matching_results) / len(matching_results)
            avg_audio_length = sum(r["audio_length"] for r in matching_results) / len(matching_results)
            summary[tokens] = {
                "avg_time_to_first_chunk": round(avg_first_chunk, 3),
                "avg_total_time": round(avg_total, 3),
                "avg_audio_length": round(avg_audio_length, 3),
                "num_successful_runs": len(matching_results),
            }

    # Save results with _stream suffix
    results_data = {
        "individual_runs": all_results,
        "summary": summary,
        "timestamp": time.strftime("%Y-%m-%d %H:%M:%S"),
    }
    save_json_results(
        results_data,
        os.path.join(output_data_dir, "first_token_benchmark_stream.json")
    )

    # Create plot directory if it doesn't exist
    output_plots_dir = os.path.join(script_dir, "output_plots")
    os.makedirs(output_plots_dir, exist_ok=True)

    # Create DataFrame for plotting
    df = pd.DataFrame(all_results)

    # Plot correlation for both metrics, then the timeline (all with _stream suffix)
    plot_correlation(
        df, "target_tokens", "time_to_first_chunk",
        "Time to First Audio vs Input Size (Streaming)",
        "Number of Input Tokens",
        "Time to First Audio (seconds)",
        os.path.join(output_plots_dir, "first_token_latency_stream.png")
    )
    plot_correlation(
        df, "target_tokens", "total_time",
        "Total Time vs Input Size (Streaming)",
        "Number of Input Tokens",
        "Total Time (seconds)",
        os.path.join(output_plots_dir, "total_time_latency_stream.png")
    )
    plot_timeline(
        df,
        os.path.join(output_plots_dir, "first_token_timeline_stream.png")
    )

    print("\nResults and plots saved to:")
    print(f"- {os.path.join(output_data_dir, 'first_token_benchmark_stream.json')}")
    print(f"- {os.path.join(output_plots_dir, 'first_token_latency_stream.png')}")
    print(f"- {os.path.join(output_plots_dir, 'total_time_latency_stream.png')}")
    print(f"- {os.path.join(output_plots_dir, 'first_token_timeline_stream.png')}")


if __name__ == "__main__":
    main()

View file

@ -1,7 +1,9 @@
"""Shared plotting utilities for benchmarks and tests.""" """Shared plotting utilities for benchmarks and tests."""
import pandas as pd import pandas as pd
import seaborn as sns import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt import matplotlib.pyplot as plt
import matplotlib.patches as patches
# Common style configurations # Common style configurations
STYLE_CONFIG = { STYLE_CONFIG = {
@ -136,6 +138,132 @@ def plot_system_metrics(metrics_data, output_path):
plt.savefig(output_path, dpi=300, bbox_inches="tight") plt.savefig(output_path, dpi=300, bbox_inches="tight")
plt.close() plt.close()
def plot_timeline(df, output_path):
    """Draw a horizontal latency timeline, one bar per benchmark run.

    Runs are grouped by token count. Every run gets a translucent bar from 0
    to its latency with a marker at the end, and every group gets a vertical
    line and label at its mean latency.

    Args:
        df: pandas DataFrame with one row per run and the columns:
            - target_tokens: number of tokens
            - run_number: run iteration
            - time_to_first_chunk: latency to first audio
            - audio_length: audio duration, shown in the y-axis labels
        output_path: str, path to save the output plot
    """
    plt.style.use("dark_background")

    # Order rows by group, then by run within the group
    df = df.sort_values(['target_tokens', 'run_number'])

    fig, ax = plt.subplots(figsize=(12, 6))

    # Layout constants
    group_spacing = 0.8  # Space between groups
    run_spacing = 0.2  # Space between runs in a group
    bar_height = 0.15

    unique_tokens = sorted(df['target_tokens'].unique())

    def rows_for(tokens):
        """Rows of ``df`` belonging to one token group."""
        return df[df['target_tokens'] == tokens]

    # Assign a y coordinate to every (token group, run) pair
    y_positions = {}
    current_y = 0
    for tokens in unique_tokens:
        group_rows = rows_for(tokens)
        for slot, (_, run) in enumerate(group_rows.iterrows()):
            y_positions[(tokens, run['run_number'])] = current_y + (slot * run_spacing)
        current_y = current_y + (len(group_rows) * run_spacing) + group_spacing

    def group_ys(tokens):
        """Y coordinates of all runs in one token group."""
        return [y_positions[(tokens, run['run_number'])] for _, run in rows_for(tokens).iterrows()]

    # One translucent bar plus an end marker per run
    for _, row in df.iterrows():
        y = y_positions[(row['target_tokens'], row['run_number'])]
        latency = row['time_to_first_chunk']

        latency_bar = patches.Rectangle(
            (0, y - bar_height/2),
            latency,
            bar_height,
            facecolor=STYLE_CONFIG["primary_color"],
            alpha=0.3
        )
        ax.add_patch(latency_bar)

        ax.plot(latency, y, 'o',
                color=STYLE_CONFIG["secondary_color"],
                markersize=4,
                alpha=0.5)

    # Mean marker per group: a vertical line drawn as short segments of
    # increasing alpha, plus a boxed label
    gradient = np.linspace(0.2, 0.8, 100)
    for tokens in unique_tokens:
        mean_latency = rows_for(tokens)['time_to_first_chunk'].mean()
        ys = group_ys(tokens)
        min_y = min(ys)
        max_y = max(ys)
        group_center = (min_y + max_y) / 2

        for i in range(len(gradient)-1):
            y1 = min_y - bar_height + (max_y - min_y + 2*bar_height) * (i/len(gradient))
            y2 = min_y - bar_height + (max_y - min_y + 2*bar_height) * ((i+1)/len(gradient))
            ax.plot([mean_latency, mean_latency], [y1, y2],
                    '-', color=STYLE_CONFIG["secondary_color"],
                    linewidth=3, alpha=gradient[i])

        bbox_props = dict(
            facecolor=STYLE_CONFIG["background_color"],
            edgecolor=STYLE_CONFIG["secondary_color"],
            alpha=0.8,
            pad=3,
            linewidth=1
        )
        label_text = f'Mean: {mean_latency:.3f}s'
        ax.text(mean_latency + 0.02, group_center,
                label_text,
                color=STYLE_CONFIG["secondary_color"],
                va='center',
                fontsize=10,
                fontweight='bold',
                bbox=bbox_props)

    # Axis limits; the x range leaves extra room for the mean labels
    ax.set_ylim(-1, current_y)
    ax.set_xlim(0, df['time_to_first_chunk'].max() * 1.3)

    # Group separators, tick positions and per-group mean audio length
    group_positions = {}
    audio_lengths = {}
    for tokens in unique_tokens:
        ys = group_ys(tokens)
        group_positions[tokens] = sum(ys) / len(ys)
        plt.axhline(y=min(ys) - bar_height,
                    color='white', alpha=0.1, linestyle='-')
        audio_lengths[tokens] = rows_for(tokens)['audio_length'].mean()

    # Y ticks sit at group centers and show token count and audio length
    plt.yticks(
        list(group_positions.values()),
        [f'{tokens} tokens\n({audio_lengths[tokens]:.1f}s)' for tokens in group_positions.keys()],
        fontsize=10
    )

    # Shared styling, then save
    setup_plot(
        fig, ax,
        "Time-To-Audio Latency",
        xlabel="Time (seconds)",
        ylabel="Input Size"
    )

    plt.tight_layout()
    plt.savefig(output_path, dpi=300, bbox_inches="tight")
    plt.close()
def plot_correlation(df, x, y, title, xlabel, ylabel, output_path): def plot_correlation(df, x, y, title, xlabel, ylabel, output_path):
"""Create correlation plot with regression line and correlation coefficient. """Create correlation plot with regression line and correlation coefficient.

View file

@ -0,0 +1,403 @@
{
"individual_runs": [
{
"text_length": 37,
"token_count": 10,
"total_time": 0.16574740409851074,
"time_to_first_chunk": 0.16574740409851074,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio\\benchmark_tokens10_run1.wav",
"audio_length": 3.45,
"target_tokens": 10,
"actual_tokens": 10,
"run_number": 1
},
{
"text_length": 37,
"token_count": 10,
"total_time": 0.18812799453735352,
"time_to_first_chunk": 0.18812799453735352,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio\\benchmark_tokens10_run2.wav",
"audio_length": 3.45,
"target_tokens": 10,
"actual_tokens": 10,
"run_number": 2
},
{
"text_length": 37,
"token_count": 10,
"total_time": 0.18645429611206055,
"time_to_first_chunk": 0.18645429611206055,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio\\benchmark_tokens10_run3.wav",
"audio_length": 3.45,
"target_tokens": 10,
"actual_tokens": 10,
"run_number": 3
},
{
"text_length": 37,
"token_count": 10,
"total_time": 0.17632031440734863,
"time_to_first_chunk": 0.17632031440734863,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio\\benchmark_tokens10_run4.wav",
"audio_length": 3.45,
"target_tokens": 10,
"actual_tokens": 10,
"run_number": 4
},
{
"text_length": 37,
"token_count": 10,
"total_time": 0.13381195068359375,
"time_to_first_chunk": 0.13381195068359375,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio\\benchmark_tokens10_run5.wav",
"audio_length": 3.45,
"target_tokens": 10,
"actual_tokens": 10,
"run_number": 5
},
{
"text_length": 102,
"token_count": 25,
"total_time": 0.2086498737335205,
"time_to_first_chunk": 0.2086498737335205,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio\\benchmark_tokens25_run1.wav",
"audio_length": 7.225,
"target_tokens": 25,
"actual_tokens": 25,
"run_number": 1
},
{
"text_length": 102,
"token_count": 25,
"total_time": 0.2727653980255127,
"time_to_first_chunk": 0.2727653980255127,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio\\benchmark_tokens25_run2.wav",
"audio_length": 7.225,
"target_tokens": 25,
"actual_tokens": 25,
"run_number": 2
},
{
"text_length": 102,
"token_count": 25,
"total_time": 0.2096250057220459,
"time_to_first_chunk": 0.2096250057220459,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio\\benchmark_tokens25_run3.wav",
"audio_length": 7.225,
"target_tokens": 25,
"actual_tokens": 25,
"run_number": 3
},
{
"text_length": 102,
"token_count": 25,
"total_time": 0.2256758213043213,
"time_to_first_chunk": 0.2256758213043213,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio\\benchmark_tokens25_run4.wav",
"audio_length": 7.225,
"target_tokens": 25,
"actual_tokens": 25,
"run_number": 4
},
{
"text_length": 102,
"token_count": 25,
"total_time": 0.1945042610168457,
"time_to_first_chunk": 0.1945042610168457,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio\\benchmark_tokens25_run5.wav",
"audio_length": 7.225,
"target_tokens": 25,
"actual_tokens": 25,
"run_number": 5
},
{
"text_length": 212,
"token_count": 50,
"total_time": 0.4975121021270752,
"time_to_first_chunk": 0.4975121021270752,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio\\benchmark_tokens50_run1.wav",
"audio_length": 16.325,
"target_tokens": 50,
"actual_tokens": 50,
"run_number": 1
},
{
"text_length": 212,
"token_count": 50,
"total_time": 0.4518404006958008,
"time_to_first_chunk": 0.4518404006958008,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio\\benchmark_tokens50_run2.wav",
"audio_length": 16.325,
"target_tokens": 50,
"actual_tokens": 50,
"run_number": 2
},
{
"text_length": 212,
"token_count": 50,
"total_time": 0.5640325546264648,
"time_to_first_chunk": 0.5640325546264648,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio\\benchmark_tokens50_run3.wav",
"audio_length": 16.325,
"target_tokens": 50,
"actual_tokens": 50,
"run_number": 3
},
{
"text_length": 212,
"token_count": 50,
"total_time": 0.5305957794189453,
"time_to_first_chunk": 0.5305957794189453,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio\\benchmark_tokens50_run4.wav",
"audio_length": 16.325,
"target_tokens": 50,
"actual_tokens": 50,
"run_number": 4
},
{
"text_length": 212,
"token_count": 50,
"total_time": 0.5540030002593994,
"time_to_first_chunk": 0.5540030002593994,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio\\benchmark_tokens50_run5.wav",
"audio_length": 16.325,
"target_tokens": 50,
"actual_tokens": 50,
"run_number": 5
},
{
"text_length": 448,
"token_count": 100,
"total_time": 0.7963137626647949,
"time_to_first_chunk": 0.7963137626647949,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio\\benchmark_tokens100_run1.wav",
"audio_length": 31.1,
"target_tokens": 100,
"actual_tokens": 100,
"run_number": 1
},
{
"text_length": 448,
"token_count": 100,
"total_time": 0.9320805072784424,
"time_to_first_chunk": 0.9320805072784424,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio\\benchmark_tokens100_run2.wav",
"audio_length": 31.1,
"target_tokens": 100,
"actual_tokens": 100,
"run_number": 2
},
{
"text_length": 448,
"token_count": 100,
"total_time": 0.824256181716919,
"time_to_first_chunk": 0.824256181716919,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio\\benchmark_tokens100_run3.wav",
"audio_length": 31.1,
"target_tokens": 100,
"actual_tokens": 100,
"run_number": 3
},
{
"text_length": 448,
"token_count": 100,
"total_time": 0.9034836292266846,
"time_to_first_chunk": 0.9034836292266846,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio\\benchmark_tokens100_run4.wav",
"audio_length": 31.1,
"target_tokens": 100,
"actual_tokens": 100,
"run_number": 4
},
{
"text_length": 448,
"token_count": 100,
"total_time": 0.8364357948303223,
"time_to_first_chunk": 0.8364357948303223,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio\\benchmark_tokens100_run5.wav",
"audio_length": 31.1,
"target_tokens": 100,
"actual_tokens": 100,
"run_number": 5
},
{
"text_length": 906,
"token_count": 200,
"total_time": 1.8122682571411133,
"time_to_first_chunk": 1.8122682571411133,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio\\benchmark_tokens200_run1.wav",
"audio_length": 62.625,
"target_tokens": 200,
"actual_tokens": 200,
"run_number": 1
},
{
"text_length": 906,
"token_count": 200,
"total_time": 1.7290427684783936,
"time_to_first_chunk": 1.7290427684783936,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio\\benchmark_tokens200_run2.wav",
"audio_length": 62.625,
"target_tokens": 200,
"actual_tokens": 200,
"run_number": 2
},
{
"text_length": 906,
"token_count": 200,
"total_time": 2.141728401184082,
"time_to_first_chunk": 2.141728401184082,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio\\benchmark_tokens200_run3.wav",
"audio_length": 62.625,
"target_tokens": 200,
"actual_tokens": 200,
"run_number": 3
},
{
"text_length": 906,
"token_count": 200,
"total_time": 2.0155680179595947,
"time_to_first_chunk": 2.0155680179595947,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio\\benchmark_tokens200_run4.wav",
"audio_length": 62.625,
"target_tokens": 200,
"actual_tokens": 200,
"run_number": 4
},
{
"text_length": 906,
"token_count": 200,
"total_time": 1.8707575798034668,
"time_to_first_chunk": 1.8707575798034668,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio\\benchmark_tokens200_run5.wav",
"audio_length": 62.625,
"target_tokens": 200,
"actual_tokens": 200,
"run_number": 5
},
{
"text_length": 2232,
"token_count": 500,
"total_time": 4.822713851928711,
"time_to_first_chunk": 4.822713851928711,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio\\benchmark_tokens500_run1.wav",
"audio_length": 157.875,
"target_tokens": 500,
"actual_tokens": 500,
"run_number": 1
},
{
"text_length": 2232,
"token_count": 500,
"total_time": 4.227782726287842,
"time_to_first_chunk": 4.227782726287842,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio\\benchmark_tokens500_run2.wav",
"audio_length": 157.875,
"target_tokens": 500,
"actual_tokens": 500,
"run_number": 2
},
{
"text_length": 2232,
"token_count": 500,
"total_time": 4.414916276931763,
"time_to_first_chunk": 4.414916276931763,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio\\benchmark_tokens500_run3.wav",
"audio_length": 157.875,
"target_tokens": 500,
"actual_tokens": 500,
"run_number": 3
},
{
"text_length": 2232,
"token_count": 500,
"total_time": 4.579505681991577,
"time_to_first_chunk": 4.579505681991577,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio\\benchmark_tokens500_run4.wav",
"audio_length": 157.875,
"target_tokens": 500,
"actual_tokens": 500,
"run_number": 4
},
{
"text_length": 2232,
"token_count": 500,
"total_time": 4.332529067993164,
"time_to_first_chunk": 4.332529067993164,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio\\benchmark_tokens500_run5.wav",
"audio_length": 157.875,
"target_tokens": 500,
"actual_tokens": 500,
"run_number": 5
}
],
"summary": {
"10": {
"avg_time_to_first_chunk": 0.17,
"avg_total_time": 0.17,
"avg_audio_length": 3.45,
"num_successful_runs": 5
},
"25": {
"avg_time_to_first_chunk": 0.222,
"avg_total_time": 0.222,
"avg_audio_length": 7.225,
"num_successful_runs": 5
},
"50": {
"avg_time_to_first_chunk": 0.52,
"avg_total_time": 0.52,
"avg_audio_length": 16.325,
"num_successful_runs": 5
},
"100": {
"avg_time_to_first_chunk": 0.859,
"avg_total_time": 0.859,
"avg_audio_length": 31.1,
"num_successful_runs": 5
},
"200": {
"avg_time_to_first_chunk": 1.914,
"avg_total_time": 1.914,
"avg_audio_length": 62.625,
"num_successful_runs": 5
},
"500": {
"avg_time_to_first_chunk": 4.475,
"avg_total_time": 4.475,
"avg_audio_length": 157.875,
"num_successful_runs": 5
}
},
"timestamp": "2025-01-04 13:52:28"
}

View file

@ -0,0 +1,175 @@
{
"individual_runs": [
{
"text_length": 212,
"token_count": 50,
"total_time": 0.9603095054626465,
"time_to_first_chunk": 0.5916037559509277,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens50_run1_stream.wav",
"audio_length": 15.45,
"target_tokens": 50,
"actual_tokens": 50,
"run_number": 1
},
{
"text_length": 212,
"token_count": 50,
"total_time": 0.5130870342254639,
"time_to_first_chunk": 0.27448558807373047,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens50_run2_stream.wav",
"audio_length": 15.45,
"target_tokens": 50,
"actual_tokens": 50,
"run_number": 2
},
{
"text_length": 212,
"token_count": 50,
"total_time": 0.4667215347290039,
"time_to_first_chunk": 0.22882533073425293,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens50_run3_stream.wav",
"audio_length": 15.45,
"target_tokens": 50,
"actual_tokens": 50,
"run_number": 3
},
{
"text_length": 448,
"token_count": 100,
"total_time": 0.9051008224487305,
"time_to_first_chunk": 0.2526383399963379,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens100_run1_stream.wav",
"audio_length": 30.25,
"target_tokens": 100,
"actual_tokens": 100,
"run_number": 1
},
{
"text_length": 448,
"token_count": 100,
"total_time": 0.8579132556915283,
"time_to_first_chunk": 0.25691914558410645,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens100_run2_stream.wav",
"audio_length": 30.25,
"target_tokens": 100,
"actual_tokens": 100,
"run_number": 2
},
{
"text_length": 448,
"token_count": 100,
"total_time": 0.9683890342712402,
"time_to_first_chunk": 0.26229000091552734,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens100_run3_stream.wav",
"audio_length": 30.25,
"target_tokens": 100,
"actual_tokens": 100,
"run_number": 3
},
{
"text_length": 906,
"token_count": 200,
"total_time": 1.8075971603393555,
"time_to_first_chunk": 0.22536945343017578,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens200_run1_stream.wav",
"audio_length": 60.75,
"target_tokens": 200,
"actual_tokens": 200,
"run_number": 1
},
{
"text_length": 906,
"token_count": 200,
"total_time": 1.493518590927124,
"time_to_first_chunk": 0.21502947807312012,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens200_run2_stream.wav",
"audio_length": 60.75,
"target_tokens": 200,
"actual_tokens": 200,
"run_number": 2
},
{
"text_length": 906,
"token_count": 200,
"total_time": 1.4910809993743896,
"time_to_first_chunk": 0.21600556373596191,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens200_run3_stream.wav",
"audio_length": 60.75,
"target_tokens": 200,
"actual_tokens": 200,
"run_number": 3
},
{
"text_length": 2232,
"token_count": 500,
"total_time": 4.223623275756836,
"time_to_first_chunk": 0.20010590553283691,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens500_run1_stream.wav",
"audio_length": 147.775,
"target_tokens": 500,
"actual_tokens": 500,
"run_number": 1
},
{
"text_length": 2232,
"token_count": 500,
"total_time": 3.8811349868774414,
"time_to_first_chunk": 0.24638962745666504,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens500_run2_stream.wav",
"audio_length": 147.775,
"target_tokens": 500,
"actual_tokens": 500,
"run_number": 2
},
{
"text_length": 2232,
"token_count": 500,
"total_time": 4.045536994934082,
"time_to_first_chunk": 0.2252039909362793,
"error": null,
"audio_path": "c:\\Users\\jerem\\Desktop\\Kokoro-FastAPI\\examples\\assorted_checks\\benchmarks\\output_audio_stream\\benchmark_tokens500_run3_stream.wav",
"audio_length": 147.775,
"target_tokens": 500,
"actual_tokens": 500,
"run_number": 3
}
],
"summary": {
"50": {
"avg_time_to_first_chunk": 0.365,
"avg_total_time": 0.647,
"avg_audio_length": 15.45,
"num_successful_runs": 3
},
"100": {
"avg_time_to_first_chunk": 0.257,
"avg_total_time": 0.91,
"avg_audio_length": 30.25,
"num_successful_runs": 3
},
"200": {
"avg_time_to_first_chunk": 0.219,
"avg_total_time": 1.597,
"avg_audio_length": 60.75,
"num_successful_runs": 3
},
"500": {
"avg_time_to_first_chunk": 0.224,
"avg_total_time": 4.05,
"avg_audio_length": 147.775,
"num_successful_runs": 3
}
},
"timestamp": "2025-01-04 14:59:28"
}

Binary file not shown.

After

Width:  |  Height:  |  Size: 246 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 214 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 233 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 189 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 243 KiB

View file

@ -2,199 +2,134 @@ import numpy as np
import soundfile as sf import soundfile as sf
import argparse import argparse
from pathlib import Path from pathlib import Path
from typing import Dict, Any
def validate_tts(wav_path: str) -> dict: def validate_tts(wav_path: str) -> dict:
""" """
Quick validation checks for TTS-generated audio files to detect common artifacts. Validation checks for TTS-generated audio files to detect common artifacts.
Checks for:
- Unnatural silence gaps
- Audio glitches and artifacts
- Repeated speech segments (stuck/looping)
- Abrupt changes in speech
- Audio quality issues
Args:
wav_path: Path to audio file (wav, mp3, etc)
Returns:
Dictionary with validation results
""" """
try: try:
# Load audio # Load and process audio
audio, sr = sf.read(wav_path) audio, sr = sf.read(wav_path)
if len(audio.shape) > 1: if len(audio.shape) > 1:
audio = audio.mean(axis=1) # Convert to mono audio = np.mean(audio, axis=1)
# Basic audio stats
duration = len(audio) / sr duration = len(audio) / sr
rms = np.sqrt(np.mean(audio**2))
peak = np.max(np.abs(audio))
dc_offset = np.mean(audio)
# Calculate clipping stats if we're near peak
clip_count = np.sum(np.abs(audio) >= 0.99)
clip_percent = (clip_count / len(audio)) * 100
if clip_percent > 0:
clip_stats = f" ({clip_percent:.2e} ratio near peak)"
else:
clip_stats = " (no samples near peak)"
# Convert to dB for analysis
eps = np.finfo(float).eps
db = 20 * np.log10(np.abs(audio) + eps)
issues = [] issues = []
# Check if audio is too short (likely failed generation) # Basic quality checks
if duration < 0.1: # Less than 100ms abs_audio = np.abs(audio)
stats = {
'rms': float(np.sqrt(np.mean(audio**2))),
'peak': float(np.max(abs_audio)),
'dc_offset': float(np.mean(audio))
}
clip_count = np.sum(abs_audio >= 0.99)
clip_percent = (clip_count / len(audio)) * 100
if duration < 0.1:
issues.append("WARNING: Audio is suspiciously short - possible failed generation") issues.append("WARNING: Audio is suspiciously short - possible failed generation")
# 1. Check for basic audio quality if stats['peak'] >= 1.0:
if peak >= 1.0: if clip_percent > 1.0:
# Calculate percentage of samples that are clipping
clip_count = np.sum(np.abs(audio) >= 0.99)
clip_percent = (clip_count / len(audio)) * 100
if clip_percent > 1.0: # Only warn if more than 1% of samples clip
issues.append(f"WARNING: Significant clipping detected ({clip_percent:.2e}% of samples)") issues.append(f"WARNING: Significant clipping detected ({clip_percent:.2e}% of samples)")
elif clip_percent > 0.01: # Add info if more than 0.01% but less than 1% elif clip_percent > 0.01:
issues.append(f"INFO: Minor peak limiting detected ({clip_percent:.2e}% of samples) - likely intentional normalization") issues.append(f"INFO: Minor peak limiting detected ({clip_percent:.2e}% of samples)")
if rms < 0.01: if stats['rms'] < 0.01:
issues.append("WARNING: Audio is very quiet - possible failed generation") issues.append("WARNING: Audio is very quiet - possible failed generation")
if abs(dc_offset) > 0.1: # DC offset is particularly bad for speech
issues.append(f"WARNING: High DC offset ({dc_offset:.3f}) - may cause audio artifacts")
# 2. Check for long silence gaps (potential TTS failures) if abs(stats['dc_offset']) > 0.1:
issues.append(f"WARNING: High DC offset ({stats['dc_offset']:.3f})")
# Check for long silence gaps
eps = np.finfo(float).eps
db = 20 * np.log10(abs_audio + eps)
silence_threshold = -45 # dB silence_threshold = -45 # dB
min_silence = 2.0 # Only detect silences longer than 2 seconds min_silence = 2.0 # seconds
window_size = int(min_silence * sr) window_size = int(min_silence * sr)
silence_count = 0 silence_count = 0
last_silence = -1 last_silence = -1
# Skip the first 0.2s for silence detection (avoid false positives at start) start_idx = int(0.2 * sr) # Skip first 0.2s
start_idx = int(0.2 * sr)
for i in range(start_idx, len(db) - window_size, window_size): for i in range(start_idx, len(db) - window_size, window_size):
window = db[i:i+window_size] window = db[i:i+window_size]
if np.mean(window) < silence_threshold: if np.mean(window) < silence_threshold:
# Verify the entire window is mostly silence
silent_ratio = np.mean(window < silence_threshold) silent_ratio = np.mean(window < silence_threshold)
if silent_ratio > 0.9: # 90% of the window should be below threshold if silent_ratio > 0.9:
if last_silence == -1 or (i/sr - last_silence) > 2.0: # Only count silences more than 2s apart if last_silence == -1 or (i/sr - last_silence) > 2.0:
silence_count += 1 silence_count += 1
last_silence = i/sr last_silence = i/sr
issues.append(f"WARNING: Long silence detected at {i/sr:.2f}s (duration: {min_silence:.1f}s)") issues.append(f"WARNING: Long silence detected at {i/sr:.2f}s (duration: {min_silence:.1f}s)")
if silence_count > 2: # Only warn if there are multiple long silences if silence_count > 2:
issues.append(f"WARNING: Multiple long silences found ({silence_count} total) - possible generation issue") issues.append(f"WARNING: Multiple long silences found ({silence_count} total)")
# 3. Check for extreme audio artifacts (changes too rapid for natural speech) # Detect audio artifacts
# Use a longer window to avoid flagging normal phoneme transitions diff = np.diff(audio)
window_size = int(0.02 * sr) # 20ms window abs_diff = np.abs(diff)
db_smooth = np.convolve(db, np.ones(window_size)/window_size, 'same') window_size = min(int(0.005 * sr), 256)
db_diff = np.abs(np.diff(db_smooth)) window = np.ones(window_size)/window_size
local_avg_diff = np.convolve(abs_diff, window, mode='same')
# Much higher threshold to only catch truly unnatural changes spikes = (abs_diff > (10 * local_avg_diff)) & (abs_diff > 0.1)
artifact_threshold = 40 # dB artifact_indices = np.nonzero(spikes)[0]
min_duration = int(0.01 * sr) # Minimum 10ms duration
# Find regions where the smoothed dB change is extreme artifacts = []
artifact_points = np.where(db_diff > artifact_threshold)[0] if len(artifact_indices) > 0:
gaps = np.diff(artifact_indices)
min_gap = int(0.005 * sr)
break_points = np.nonzero(gaps > min_gap)[0] + 1
groups = np.split(artifact_indices, break_points)
if len(artifact_points) > 0: for group in groups:
# Group artifacts that are very close together if len(group) >= 5:
grouped_artifacts = [] severity = np.max(abs_diff[group])
current_group = [artifact_points[0]] if severity > 0.2:
center_idx = group[len(group)//2]
artifacts.append({
'time': float(center_idx/sr), # Ensure float for consistent timing
'severity': float(severity)
})
issues.append(
f"WARNING: Audio discontinuity at {center_idx/sr:.3f}s "
f"(severity: {severity:.3f})"
)
for i in range(1, len(artifact_points)): # Check for repeated speech segments
if (artifact_points[i] - current_group[-1]) < min_duration: for chunk_duration in [5.0, 10.0]:
current_group.append(artifact_points[i])
else:
if len(current_group) * (1/sr) >= 0.01: # Only keep groups lasting >= 10ms
grouped_artifacts.append(current_group)
current_group = [artifact_points[i]]
if len(current_group) * (1/sr) >= 0.01:
grouped_artifacts.append(current_group)
# Report only the most severe artifacts
for group in grouped_artifacts[:2]: # Report up to 2 worst artifacts
center_idx = group[len(group)//2]
db_change = db_diff[center_idx]
if db_change > 45: # Only report very extreme changes
issues.append(
f"WARNING: Possible audio artifact at {center_idx/sr:.2f}s "
f"({db_change:.1f}dB change over {len(group)/sr*1000:.0f}ms)"
)
# 4. Check for repeated speech segments (stuck/looping)
# Check both short and long sentence durations at audiobook speed (150-160 wpm)
for chunk_duration in [5.0, 10.0]: # 5s (~12 words) and 10s (~25 words) at ~audiobook speed
chunk_size = int(chunk_duration * sr) chunk_size = int(chunk_duration * sr)
overlap = int(0.2 * chunk_size) # 20% overlap between chunks overlap = int(0.2 * chunk_size)
for i in range(0, len(audio) - 2*chunk_size, overlap): for i in range(0, len(audio) - 2*chunk_size, overlap):
chunk1 = audio[i:i+chunk_size] chunk1 = audio[i:i+chunk_size]
chunk2 = audio[i+chunk_size:i+2*chunk_size] chunk2 = audio[i+chunk_size:i+2*chunk_size]
# Ignore chunks that are mostly silence
if np.mean(np.abs(chunk1)) < 0.01 or np.mean(np.abs(chunk2)) < 0.01: if np.mean(np.abs(chunk1)) < 0.01 or np.mean(np.abs(chunk2)) < 0.01:
continue continue
try: try:
correlation = np.corrcoef(chunk1, chunk2)[0,1] correlation = np.corrcoef(chunk1, chunk2)[0,1]
if not np.isnan(correlation) and correlation > 0.92: # Lower threshold for sentence-length chunks if not np.isnan(correlation) and correlation > 0.92:
issues.append( issues.append(
f"WARNING: Possible repeated speech at {i/sr:.1f}s " f"WARNING: Possible repeated speech at {i/sr:.1f}s "
f"(~{int(chunk_duration*160/60):d} words, correlation: {correlation:.3f})" f"(~{int(chunk_duration*160/60):d} words, correlation: {correlation:.3f})"
) )
break # Found repetition at this duration, try next duration break
except: except:
continue continue
# 5. Check for extreme amplitude discontinuities (common in failed TTS)
amplitude_envelope = np.abs(audio)
window_size = sr // 10 # 100ms window for smoother envelope
smooth_env = np.convolve(amplitude_envelope, np.ones(window_size)/float(window_size), 'same')
env_diff = np.abs(np.diff(smooth_env))
# Only detect very extreme amplitude changes
jump_threshold = 0.5 # Much higher threshold
jumps = np.where(env_diff > jump_threshold)[0]
if len(jumps) > 0:
# Group jumps that are close together
grouped_jumps = []
current_group = [jumps[0]]
for i in range(1, len(jumps)):
if (jumps[i] - current_group[-1]) < 0.05 * sr: # Group within 50ms
current_group.append(jumps[i])
else:
if len(current_group) >= 3: # Only keep significant discontinuities
grouped_jumps.append(current_group)
current_group = [jumps[i]]
if len(current_group) >= 3:
grouped_jumps.append(current_group)
# Report only the most severe discontinuities
for group in grouped_jumps[:2]: # Report up to 2 worst cases
center_idx = group[len(group)//2]
jump_size = env_diff[center_idx]
if jump_size > 0.6: # Only report very extreme changes
issues.append(
f"WARNING: Possible audio discontinuity at {center_idx/sr:.2f}s "
f"({jump_size:.2f} amplitude ratio change)"
)
return { return {
"file": wav_path, "file": wav_path,
"duration": f"{duration:.2f}s", "duration": f"{duration:.2f}s",
"sample_rate": sr, "sample_rate": sr,
"peak_amplitude": f"{peak:.3f}{clip_stats}", "peak_amplitude": f"{stats['peak']:.3f}",
"rms_level": f"{rms:.3f}", "rms_level": f"{stats['rms']:.3f}",
"dc_offset": f"{dc_offset:.3f}", "dc_offset": f"{stats['dc_offset']:.3f}",
"artifact_count": len(artifacts),
"artifact_locations": [a['time'] for a in artifacts],
"artifact_severities": [a['severity'] for a in artifacts],
"issues": issues, "issues": issues,
"valid": len(issues) == 0 "valid": len(issues) == 0
} }
@ -206,12 +141,78 @@ def validate_tts(wav_path: str) -> dict:
"valid": False "valid": False
} }
def generate_analysis_plots(wav_path: str, output_dir: str, validation_result: Dict[str, Any]):
    """
    Generate analysis plots for audio file with time-aligned visualizations.

    Draws a spectrogram (top panel) and the waveform (bottom panel) on a
    shared time axis, marks every timestamp found under
    ``validation_result['artifact_locations']`` with a red vertical line on
    both panels, and saves the figure as
    ``<output_dir>/<stem of wav_path>_audio_analysis.png``.

    Args:
        wav_path: Path to the audio file; it is read with ``sf.read``.
        output_dir: Directory the PNG is written into (joined via ``Path``).
        validation_result: Validation result dictionary. Only the optional
            ``artifact_locations`` entry is read; its values are used as
            x positions (seconds) for the marker lines.
    """
    import matplotlib.pyplot as plt
    from scipy.signal import spectrogram

    # Load audio and down-mix multi-channel input to mono
    audio, sr = sf.read(wav_path)
    if len(audio.shape) > 1:
        audio = np.mean(audio, axis=1)

    # Read the marker list once and tolerate results that lack the key,
    # instead of indexing validation_result in several places.
    artifact_locations = validation_result.get('artifact_locations') or []

    # Create figure with shared x-axis
    fig = plt.figure(figsize=(15, 8))
    try:
        gs = plt.GridSpec(2, 1, height_ratios=[1.2, 0.8], hspace=0.1)
        ax1 = fig.add_subplot(gs[0])
        ax2 = fig.add_subplot(gs[1], sharex=ax1)

        # Calculate spectrogram. The window is clamped to the clip length and
        # the overlap kept at 75% of it (2048/1536 for normal clips), because
        # scipy rejects noverlap >= nperseg on very short audio.
        nperseg = min(2048, len(audio))
        noverlap = (nperseg * 3) // 4
        f, t, Sxx = spectrogram(audio, sr, nperseg=nperseg, noverlap=noverlap,
                                window='hann', scaling='spectrum')

        # Plot spectrogram in dB; the small offset avoids log10(0)
        im = ax1.pcolormesh(t, f, 10 * np.log10(Sxx + 1e-10),
                            shading='gouraud', cmap='viridis',
                            vmin=-100, vmax=-20)
        ax1.set_ylabel('Frequency [Hz]', fontsize=10)
        fig.colorbar(im, ax=ax1, label='dB')
        ax1.set_title('Spectrogram', pad=10, fontsize=12)

        # Plot waveform with exact time alignment
        times = np.arange(len(audio)) / sr
        ax2.plot(times, audio, color='#2E5596', alpha=0.7, linewidth=0.5, label='Audio')
        ax2.set_ylabel('Amplitude', fontsize=10)
        ax2.set_xlabel('Time [sec]', fontsize=10)
        ax2.grid(True, alpha=0.2)

        # Add artifact markers
        if artifact_locations:
            for loc in artifact_locations:
                ax1.axvline(x=loc, color='red', alpha=0.7, linewidth=2)
                ax2.axvline(x=loc, color='red', alpha=0.7, linewidth=2, label='Detected Artifacts')

            # Add legend to both plots; the empty line is a proxy handle so
            # the spectrogram legend gets a single 'Detected Artifacts' entry
            ax1.plot([], [], color='red', linewidth=2, label='Detected Artifacts')
            ax1.legend(loc='upper right', fontsize=8)

            # Only add unique labels to legend (every marker line carries the
            # same label, so collapse them by label text)
            handles, labels = ax2.get_legend_handles_labels()
            unique_labels = dict(zip(labels, handles))
            ax2.legend(unique_labels.values(), unique_labels.keys(),
                       loc='upper right', fontsize=8)

        # Set common x limits
        xlim = (0, len(audio)/sr)
        ax1.set_xlim(xlim)
        ax2.set_xlim(xlim)

        # Use the full stem so dotted names such as "take.1.wav" and
        # "take.2.wav" do not collapse onto the same output file.
        og_filename = Path(wav_path).stem

        # Save plot
        fig.savefig(Path(output_dir) / f"{og_filename}_audio_analysis.png", dpi=300, bbox_inches='tight')
    finally:
        # Always release the figure, even when plotting or saving fails
        plt.close(fig)
if __name__ == "__main__":
wav_file = r"C:\Users\jerem\Desktop\Kokoro-FastAPI\examples\output.wav"
silent=False
result = validate_tts(wav_file)
if not silent:
wav_root_dir = Path(wav_file).parent
generate_analysis_plots(wav_file, wav_root_dir, result)
print(f"\nValidating: {result['file']}") print(f"\nValidating: {result['file']}")
if "error" in result: if "error" in result:
@ -222,6 +223,7 @@ if __name__ == "__main__":
print(f"Peak Amplitude: {result['peak_amplitude']}") print(f"Peak Amplitude: {result['peak_amplitude']}")
print(f"RMS Level: {result['rms_level']}") print(f"RMS Level: {result['rms_level']}")
print(f"DC Offset: {result['dc_offset']}") print(f"DC Offset: {result['dc_offset']}")
print(f"Detected Artifacts: {result['artifact_count']}")
if result["issues"]: if result["issues"]:
print("\nIssues Found:") print("\nIssues Found:")

BIN
examples/audio_analysis.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 5.2 MiB

BIN
examples/output.wav Normal file

Binary file not shown.

Binary file not shown.

After

Width:  |  Height:  |  Size: 142 KiB

View file

@ -0,0 +1,144 @@
#!/usr/bin/env python3
import requests
import sounddevice as sd
import numpy as np
import time
import os
import wave
def _pcm16_to_float32(data: bytes):
    """Split raw 16-bit PCM bytes into playable samples and a leftover tail.

    Args:
        data: Raw PCM bytes; the length may be odd.

    Returns:
        A ``(samples, leftover)`` tuple. ``samples`` is a float32 array built
        from every complete 2-byte sample and divided by 32768.0;
        ``leftover`` holds the trailing byte (if any) that did not complete
        a sample and must be prepended to the next chunk.
    """
    usable = len(data) - (len(data) % 2)
    samples = np.frombuffer(data[:usable], dtype=np.int16).astype(np.float32)
    # Scale to [-1, 1] range for sounddevice
    return samples / 32768.0, data[usable:]


def play_streaming_tts(text: str, output_file: str = None, voice: str = "af"):
    """Stream TTS audio from the local API and play it back in real time.

    Posts ``text`` to ``http://localhost:8880/v1/audio/speech`` requesting a
    streamed PCM response, writes each received chunk to a sounddevice output
    stream as it arrives, and prints progress statistics along the way.

    Args:
        text: The text to synthesize.
        output_file: Optional path; when given, all received PCM bytes are
            written to it as a mono 16-bit WAV file once the stream ends.
        voice: Voice identifier sent in the request payload.

    Returns:
        None. Connection errors and any other exception raised while
        streaming are reported with ``print`` rather than propagated.
    """
    print("\nStarting TTS stream request...")
    start_time = time.time()

    # Initialize variables
    sample_rate = 24000  # Known sample rate for Kokoro
    stream = None  # Created lazily when the first chunk arrives
    chunk_count = 0
    total_bytes = 0
    all_audio_data = bytearray()  # Raw PCM audio data
    # Network chunks are not guaranteed to end on a 2-byte sample boundary;
    # an unpaired trailing byte is carried over to the next chunk so that
    # np.frombuffer never sees an odd-length buffer.
    pending = b""

    try:
        # Make streaming request to API; the context manager releases the
        # connection even when streaming is interrupted.
        with requests.post(
            "http://localhost:8880/v1/audio/speech",
            json={
                "model": "kokoro",
                "input": text,
                "voice": voice,
                "response_format": "pcm",
                "stream": True,
            },
            stream=True,
            timeout=1800,
        ) as response:
            response.raise_for_status()
            print(f"Request started successfully after {time.time() - start_time:.2f}s")

            # Process streaming response
            for chunk in response.iter_content(chunk_size=1024):
                if not chunk:
                    continue

                chunk_count += 1
                total_bytes += len(chunk)

                # Accumulate raw audio data
                all_audio_data.extend(chunk)

                # First chunk: report latency and open the playback stream
                if stream is None:
                    print(f"\nReceived first chunk after {time.time() - start_time:.2f}s")
                    print(f"First chunk size: {len(chunk)} bytes")
                    stream = sd.OutputStream(
                        samplerate=sample_rate,
                        channels=1,
                        dtype=np.float32,
                    )
                    stream.start()
                    print("Audio playback started")

                # Convert PCM to float32 for playback
                audio_data, pending = _pcm16_to_float32(pending + chunk)
                if len(audio_data) > 0:
                    stream.write(audio_data)

                # Log progress every 10 chunks
                if chunk_count % 10 == 0:
                    elapsed = time.time() - start_time
                    print(f"Progress: {chunk_count} chunks, {total_bytes/1024:.1f}KB received, {elapsed:.1f}s elapsed")

        # Final stats
        total_time = time.time() - start_time
        # Guard against a zero elapsed time on coarse clocks
        average_speed = (total_bytes / 1024) / total_time if total_time > 0 else 0.0
        print(f"\nStream complete:")
        print(f"Total chunks: {chunk_count}")
        print(f"Total data: {total_bytes/1024:.1f}KB")
        print(f"Total time: {total_time:.2f}s")
        print(f"Average speed: {average_speed:.1f}KB/s")

        # Save as WAV file
        if output_file:
            print(f"\nWriting audio to {output_file}")
            with wave.open(output_file, 'wb') as wav_file:
                wav_file.setnchannels(1)  # Mono
                wav_file.setsampwidth(2)  # 2 bytes per sample (16-bit)
                wav_file.setframerate(sample_rate)
                wav_file.writeframes(all_audio_data)
            print(f"Saved {len(all_audio_data)} bytes of audio data")

    except requests.exceptions.ConnectionError as e:
        print(f"Connection error - Is the server running? Error: {str(e)}")
    except Exception as e:
        print(f"Error during streaming: {str(e)}")
    finally:
        # Clean up the playback stream on every exit path
        if stream is not None:
            stream.stop()
            stream.close()
def main():
    """Read the bundled H. G. Wells sample text and stream it through the TTS API.

    Resolves the sample text and the output WAV path relative to this
    script's directory, joins the first two blank-line-separated paragraphs
    with a space, prints a short preview, and hands the excerpt to
    ``play_streaming_tts``.
    """
    here = os.path.dirname(os.path.abspath(__file__))
    source_path = os.path.join(here, "assorted_checks/benchmarks/the_time_machine_hg_wells.txt")
    wav_path = os.path.join(here, "output.wav")

    # Load sample text from HG Wells
    with open(source_path, "r", encoding="utf-8") as handle:
        book = handle.read()

    # Keep only the first two paragraphs
    paragraphs = book.split("\n\n")
    excerpt = " ".join(paragraphs[:2])

    print("\nStarting TTS stream playback...")
    print(f"Text length: {len(excerpt)} characters")
    print("\nFirst 100 characters:")
    print(excerpt[:100] + "...")

    play_streaming_tts(excerpt, output_file=wav_path)


if __name__ == "__main__":
    main()