"""Benchmark a local OpenAI-compatible TTS endpoint across input sizes.

The script slices a reference text into chunks of increasing token count,
posts each chunk to the speech endpoint, records processing time and the
length of the returned audio, samples system metrics around every request,
and finally writes JSON results, a text summary and PNG plots under
``examples/benchmarks``.
"""
import io
import json
import os
import subprocess
import time
from datetime import datetime
from typing import Optional

import matplotlib.pyplot as plt
import pandas as pd
import psutil
import requests
import scipy.io.wavfile as wavfile
import seaborn as sns
import tiktoken

enc = tiktoken.get_encoding("cl100k_base")

# Locations used by the benchmark. Kept as plain "/"-joined strings so the
# paths printed to the user are identical on every platform.
BENCHMARK_DIR = "examples/benchmarks"
OUTPUT_DIR = "examples/benchmarks/output"
INPUT_TEXT_PATH = "examples/benchmarks/the_time_machine_hg_wells.txt"
TTS_ENDPOINT = "http://localhost:8880/v1/audio/speech"

# Shared colours of the dark plot theme.
BACKGROUND_COLOR = "#1a1a2e"
PRIMARY_COLOR = "#ff2a6d"
SECONDARY_COLOR = "#05d9e8"
FOREGROUND_COLOR = "#ffffff"


def setup_plot(fig, ax, title):
    """Apply the dark benchmark theme to a figure/axes pair.

    Args:
        fig: Matplotlib figure whose background is recoloured.
        ax: Matplotlib axes to style (grid, title, labels, ticks, spines).
        title: Title text to set on the axes.

    Returns:
        The same ``(fig, ax)`` pair, for convenient chaining.
    """
    # Improve grid
    ax.grid(True, linestyle="--", alpha=0.3, color=FOREGROUND_COLOR)

    # Set title and labels with better fonts; the existing label text is kept.
    ax.set_title(
        title, pad=20, fontsize=16, fontweight="bold", color=FOREGROUND_COLOR
    )
    ax.set_xlabel(
        ax.get_xlabel(), fontsize=14, fontweight="medium", color=FOREGROUND_COLOR
    )
    ax.set_ylabel(
        ax.get_ylabel(), fontsize=14, fontweight="medium", color=FOREGROUND_COLOR
    )

    # Improve tick labels
    ax.tick_params(labelsize=12, colors=FOREGROUND_COLOR)

    # Style spines
    for spine in ax.spines.values():
        spine.set_color(FOREGROUND_COLOR)
        spine.set_alpha(0.3)
        spine.set_linewidth(0.5)

    # Set background colors
    ax.set_facecolor(BACKGROUND_COLOR)
    fig.patch.set_facecolor(BACKGROUND_COLOR)

    return fig, ax


def get_text_for_tokens(text: str, num_tokens: int) -> str:
    """Return the prefix of ``text`` covered by its first ``num_tokens`` tokens.

    The text is tokenised with the module-level ``enc`` encoding, truncated
    to ``num_tokens`` tokens and decoded again. Callers that need the exact
    token count of the result should re-encode it (``main`` does so), since
    this function does not verify the count of the decoded slice.

    Args:
        text: Source text to slice.
        num_tokens: Number of leading tokens to keep.

    Returns:
        An empty string when ``num_tokens`` is not positive, the whole text
        when it has at most ``num_tokens`` tokens, otherwise the decoded
        prefix.
    """
    if num_tokens <= 0:
        # A negative value would otherwise slice from the end of the list.
        return ""
    tokens = enc.encode(text)
    if num_tokens >= len(tokens):
        return text
    return enc.decode(tokens[:num_tokens])


def get_audio_length(audio_data: bytes) -> float:
    """Return the duration in seconds of WAV audio held in memory.

    The bytes are parsed directly from an in-memory buffer, so no temporary
    file is created.

    Args:
        audio_data: Complete contents of a WAV file.

    Returns:
        Number of sample frames divided by the sample rate.

    Raises:
        ValueError: If ``scipy.io.wavfile.read`` cannot parse the data.
    """
    rate, samples = wavfile.read(io.BytesIO(audio_data))
    return len(samples) / rate


def get_gpu_memory():
    """Return the GPU memory in use as reported by ``nvidia-smi``.

    ``nvidia-smi`` prints one value per GPU; the values are summed so that
    multi-GPU hosts yield a single number instead of a parse error.

    Returns:
        Used memory as a float in the unit ``nvidia-smi`` reports with
        ``nounits`` (treated as MB by ``plot_system_metrics``), or ``None``
        when ``nvidia-smi`` is unavailable, fails, or prints no numeric
        value.
    """
    try:
        result = subprocess.check_output(
            ["nvidia-smi", "--query-gpu=memory.used", "--format=csv,nounits,noheader"]
        )
        readings = [
            float(line)
            for line in result.decode("utf-8").splitlines()
            if line.strip()
        ]
    except (subprocess.CalledProcessError, OSError, ValueError):
        # Missing binary, non-zero exit status, or a non-numeric reading.
        return None
    if not readings:
        return None
    return sum(readings)


def get_system_metrics():
    """Sample current CPU, RAM and (if available) GPU usage.

    Returns:
        A dict with ``timestamp`` (ISO-8601 string), ``cpu_percent``,
        ``ram_percent`` and ``ram_used_gb``; ``gpu_memory_used`` is added
        only when ``get_gpu_memory`` returns a value.
    """
    memory = psutil.virtual_memory()
    metrics = {
        "timestamp": datetime.now().isoformat(),
        "cpu_percent": psutil.cpu_percent(),
        "ram_percent": memory.percent,
        "ram_used_gb": memory.used / (1024**3),
    }
    gpu_mem = get_gpu_memory()
    if gpu_mem is not None:
        metrics["gpu_memory_used"] = gpu_mem
    return metrics


def make_tts_request(
    text: str, timeout: int = 120
) -> tuple[Optional[float], Optional[float]]:
    """Synthesise ``text`` through the OpenAI-compatible speech endpoint.

    The returned audio is also saved to
    ``examples/benchmarks/output/chunk_<token_count>_tokens.wav``.

    Args:
        text: Text to synthesise.
        timeout: Request timeout in seconds.

    Returns:
        ``(processing_time, audio_length)``, both in seconds, or
        ``(None, None)`` if the request or the processing of its response
        failed. Failures are printed rather than raised so the caller can
        stop the benchmark gracefully.
    """
    try:
        # perf_counter is monotonic, so wall-clock adjustments during the
        # request cannot distort the measurement.
        start_time = time.perf_counter()

        # Make request to OpenAI-compatible endpoint
        response = requests.post(
            TTS_ENDPOINT,
            json={
                "model": "kokoro",
                "input": text,
                "voice": "af",
                "response_format": "wav",
            },
            timeout=timeout,
        )
        response.raise_for_status()

        processing_time = time.perf_counter() - start_time
        audio_length = get_audio_length(response.content)

        # Save the audio file
        token_count = len(enc.encode(text))
        output_file = f"{OUTPUT_DIR}/chunk_{token_count}_tokens.wav"
        os.makedirs(os.path.dirname(output_file), exist_ok=True)
        with open(output_file, "wb") as f:
            f.write(response.content)
        print(f"Saved audio to {output_file}")

        return processing_time, audio_length

    except requests.exceptions.RequestException as e:
        print(f"Error making request for text: {text[:50]}... Error: {e}")
        return None, None
    except Exception as e:
        # Deliberate best-effort boundary: any other failure (e.g. an
        # unparsable WAV body) ends this request but not the whole script.
        print(f"Error processing text: {text[:50]}... Error: {e}")
        return None, None


def _axis_upper_limit(series) -> float:
    """Return a y-axis upper bound 10% above the series maximum.

    Falls back to 1.0 when the maximum is missing or not positive, which
    would otherwise produce a degenerate ``(0, 0)`` axis range.
    """
    peak = series.max()
    if pd.isna(peak) or peak <= 0:
        return 1.0
    return float(peak) * 1.1


def _plot_metric(
    ax, elapsed, series, window, baseline, line_color, baseline_color, ylabel, title
):
    """Draw one smoothed metric with its baseline on ``ax``."""
    # min_periods=1 keeps the first and last samples, which a centered
    # window would otherwise turn into NaN and drop from the line.
    smoothed = series.rolling(window=window, center=True, min_periods=1).mean()
    sns.lineplot(x=elapsed, y=smoothed, ax=ax, color=line_color, linewidth=2)
    ax.axhline(
        y=baseline, color=baseline_color, linestyle="--", alpha=0.5, label="Baseline"
    )
    ax.set_xlabel("Time (seconds)", fontsize=14)
    ax.set_ylabel(ylabel, fontsize=14)
    ax.tick_params(labelsize=12)
    ax.set_title(title, pad=20, fontsize=16, fontweight="bold")
    ax.set_ylim(0, _axis_upper_limit(series))  # Add 10% padding
    ax.legend()


def plot_system_metrics(metrics_data):
    """Plot CPU, RAM and (when present) GPU memory usage over time.

    The figure is saved to ``examples/benchmarks/system_usage.png``. Each
    subplot shows a rolling-mean-smoothed line plus a dashed baseline taken
    from the first sample.

    Args:
        metrics_data: Sequence of dicts as produced by
            ``get_system_metrics``. Nothing is plotted when it is empty.
    """
    df = pd.DataFrame(metrics_data)
    if df.empty:
        print("No system metrics to plot")
        return

    df["timestamp"] = pd.to_datetime(df["timestamp"])
    elapsed_time = (df["timestamp"] - df["timestamp"].iloc[0]).dt.total_seconds()

    has_gpu = "gpu_memory_used" in df.columns
    if has_gpu:
        # Convert MB to GB
        df["gpu_memory_gb"] = df["gpu_memory_used"] / 1024

    # Set plotting style
    plt.style.use("dark_background")

    # Create figure with 3 subplots (or 2 if no GPU)
    num_plots = 3 if has_gpu else 2
    fig, axes = plt.subplots(num_plots, 1, figsize=(15, 5 * num_plots))
    fig.patch.set_facecolor(BACKGROUND_COLOR)

    # Rolling-average window: at most 5 samples, shrunk for short runs so
    # the smoothing never spans more than half of the data, and never
    # below 1 (a window of 0 would average nothing).
    window = max(1, min(5, len(df) // 2))

    # Baselines are the first measurement of each series.
    _plot_metric(
        axes[0],
        elapsed_time,
        df["cpu_percent"],
        window,
        df["cpu_percent"].iloc[0],
        PRIMARY_COLOR,
        SECONDARY_COLOR,
        "CPU Usage (%)",
        "CPU Usage Over Time",
    )
    _plot_metric(
        axes[1],
        elapsed_time,
        df["ram_used_gb"],
        window,
        df["ram_used_gb"].iloc[0],
        SECONDARY_COLOR,
        PRIMARY_COLOR,
        "RAM Usage (GB)",
        "RAM Usage Over Time",
    )
    if has_gpu:
        _plot_metric(
            axes[2],
            elapsed_time,
            df["gpu_memory_gb"],
            window,
            df["gpu_memory_gb"].iloc[0],
            PRIMARY_COLOR,
            SECONDARY_COLOR,
            "GPU Memory (GB)",
            "GPU Memory Usage Over Time",
        )

    # Style all subplots
    for ax in axes:
        ax.grid(True, linestyle="--", alpha=0.3)
        ax.set_facecolor(BACKGROUND_COLOR)
        for spine in ax.spines.values():
            spine.set_color(FOREGROUND_COLOR)
            spine.set_alpha(0.3)

    plt.tight_layout()
    os.makedirs(BENCHMARK_DIR, exist_ok=True)
    plt.savefig(f"{BENCHMARK_DIR}/system_usage.png", dpi=300, bbox_inches="tight")
    plt.close(fig)


def _plot_against_tokens(df, column, title, ylabel, output_path):
    """Save a scatter + regression plot of ``df[column]`` versus token count."""
    fig, ax = plt.subplots(figsize=(12, 8))
    sns.scatterplot(
        data=df, x="tokens", y=column, s=100, alpha=0.6, color=PRIMARY_COLOR, ax=ax
    )
    sns.regplot(
        data=df,
        x="tokens",
        y=column,
        scatter=False,
        color=SECONDARY_COLOR,
        line_kws={"linewidth": 2},
        ax=ax,
    )
    corr = df["tokens"].corr(df[column])
    ax.text(
        0.05,
        0.95,
        f"Correlation: {corr:.2f}",
        transform=ax.transAxes,
        fontsize=10,
        color=FOREGROUND_COLOR,
        bbox=dict(facecolor=BACKGROUND_COLOR, edgecolor=FOREGROUND_COLOR, alpha=0.7),
    )
    setup_plot(fig, ax, title)
    ax.set_xlabel("Number of Input Tokens")
    ax.set_ylabel(ylabel)
    plt.savefig(output_path, dpi=300, bbox_inches="tight")
    plt.close(fig)


def _write_stats(df, path):
    """Write the human-readable benchmark summary for ``df`` to ``path``."""
    lines = [
        "=== Benchmark Statistics ===\n\n",
        "Overall Stats:\n",
        f"Total tokens processed: {df['tokens'].sum()}\n",
        f"Total audio generated: {df['output_length'].sum():.2f}s\n",
        f"Total test duration: {df['elapsed_time'].max():.2f}s\n",
        f"Average processing rate: {df['tokens_per_second'].mean():.2f} tokens/second\n",
        f"Average realtime factor: {df['realtime_factor'].mean():.2f}x\n\n",
        "Per-chunk Stats:\n",
        f"Average chunk size: {df['tokens'].mean():.2f} tokens\n",
        f"Min chunk size: {df['tokens'].min():.2f} tokens\n",
        f"Max chunk size: {df['tokens'].max():.2f} tokens\n",
        f"Average processing time: {df['processing_time'].mean():.2f}s\n",
        f"Average output length: {df['output_length'].mean():.2f}s\n\n",
        "Performance Ranges:\n",
        f"Processing rate range: {df['tokens_per_second'].min():.2f} - {df['tokens_per_second'].max():.2f} tokens/second\n",
        f"Realtime factor range: {df['realtime_factor'].min():.2f}x - {df['realtime_factor'].max():.2f}x\n",
    ]
    with open(path, "w", encoding="utf-8") as f:
        f.writelines(lines)


def main():
    """Run the benchmark end to end and write results, stats and plots."""
    # Create output directory
    os.makedirs(OUTPUT_DIR, exist_ok=True)

    # Read input text
    with open(INPUT_TEXT_PATH, "r", encoding="utf-8") as f:
        text = f.read()

    # Get total tokens in file
    total_tokens = len(enc.encode(text))
    print(f"Total tokens in file: {total_tokens}")
    if total_tokens == 0:
        print("No data to plot")
        return

    # Token sizes: every 100 tokens up to 1000, then every 1000 tokens up to
    # the size of the file. Sizes are clamped to the file's token count so a
    # short input is not sent several times as identical "full text" chunks.
    dense_range = list(range(100, 1001, 100))
    large_range = list(range(max(dense_range), total_tokens + 1, 1000))
    token_sizes = sorted({min(size, total_tokens) for size in dense_range + large_range})
    print(f"Testing sizes: {token_sizes}")

    # psutil reports a meaningless 0.0 on the first cpu_percent() call;
    # prime it here so the first recorded sample (used as the plot
    # baseline) reflects real CPU usage.
    psutil.cpu_percent()

    # Process chunks
    results = []
    system_metrics = []
    test_start_time = time.perf_counter()

    for num_tokens in token_sizes:
        # Get text slice and measure its real token count
        chunk = get_text_for_tokens(text, num_tokens)
        actual_tokens = len(enc.encode(chunk))

        print(f"\nProcessing chunk with {actual_tokens} tokens:")
        print(f"Text preview: {chunk[:100]}...")

        # Collect system metrics before processing
        system_metrics.append(get_system_metrics())

        processing_time, audio_length = make_tts_request(chunk)
        if processing_time is None or audio_length is None:
            print("Breaking loop due to error")
            break

        # Collect system metrics after processing
        system_metrics.append(get_system_metrics())

        results.append(
            {
                "tokens": actual_tokens,
                "processing_time": processing_time,
                "output_length": audio_length,
                "realtime_factor": audio_length / processing_time,
                "elapsed_time": time.perf_counter() - test_start_time,
            }
        )

        # Save intermediate results so a later failure loses nothing
        with open(
            f"{BENCHMARK_DIR}/benchmark_results.json", "w", encoding="utf-8"
        ) as f:
            json.dump(
                {"results": results, "system_metrics": system_metrics}, f, indent=2
            )

    # Create DataFrame and calculate stats
    df = pd.DataFrame(results)
    if df.empty:
        print("No data to plot")
        return

    # Calculate useful metrics
    df["tokens_per_second"] = df["tokens"] / df["processing_time"]

    # Write detailed stats
    _write_stats(df, f"{BENCHMARK_DIR}/benchmark_stats.txt")

    # Set plotting style
    plt.style.use("dark_background")

    # Plot 1: Processing Time vs Token Count
    _plot_against_tokens(
        df,
        "processing_time",
        "Processing Time vs Input Size",
        "Processing Time (seconds)",
        f"{BENCHMARK_DIR}/processing_time.png",
    )

    # Plot 2: Realtime Factor vs Token Count
    _plot_against_tokens(
        df,
        "realtime_factor",
        "Realtime Factor vs Input Size",
        "Realtime Factor (output length / processing time)",
        f"{BENCHMARK_DIR}/realtime_factor.png",
    )

    # Plot system metrics (GPU memory, when sampled, is a subplot of
    # system_usage.png; no separate GPU image is produced).
    plot_system_metrics(system_metrics)

    print("\nResults saved to:")
    print("- examples/benchmarks/benchmark_results.json")
    print("- examples/benchmarks/benchmark_stats.txt")
    print("- examples/benchmarks/processing_time.png")
    print("- examples/benchmarks/realtime_factor.png")
    print("- examples/benchmarks/system_usage.png")
    print("\nAudio files saved in examples/benchmarks/output/")


if __name__ == "__main__":
    main()