"""Benchmark a local Kokoro TTS endpoint: processing time, RTF, and system usage."""

import io
import json
import os
import subprocess
import time
from datetime import datetime

import matplotlib.pyplot as plt
import pandas as pd
import psutil
import requests
import scipy.io.wavfile as wavfile
import seaborn as sns
import tiktoken

# Tokenizer used for all token counting and text slicing in this benchmark.
enc = tiktoken.get_encoding("cl100k_base")


def setup_plot(fig, ax, title):
    """Apply the shared dark-theme styling to a matplotlib figure/axes pair.

    Args:
        fig: Figure whose background is restyled.
        ax: Axes whose grid, labels, ticks, and spines are restyled.
        title: Title text set on the axes.

    Returns:
        The (fig, ax) pair, restyled in place.
    """
    ax.grid(True, linestyle="--", alpha=0.3, color="#ffffff")
    ax.set_title(title, pad=20, fontsize=16, fontweight="bold", color="#ffffff")
    ax.set_xlabel(ax.get_xlabel(), fontsize=14, fontweight="medium", color="#ffffff")
    ax.set_ylabel(ax.get_ylabel(), fontsize=14, fontweight="medium", color="#ffffff")
    ax.tick_params(labelsize=12, colors="#ffffff")
    for spine in ax.spines.values():
        spine.set_color("#ffffff")
        spine.set_alpha(0.3)
        spine.set_linewidth(0.5)
    ax.set_facecolor("#1a1a2e")
    fig.patch.set_facecolor("#1a1a2e")
    return fig, ax


def get_text_for_tokens(text: str, num_tokens: int) -> str:
    """Return a prefix of *text* containing at most *num_tokens* tokens.

    If the text holds fewer tokens than requested, it is returned unchanged.
    """
    tokens = enc.encode(text)
    if num_tokens > len(tokens):
        return text
    return enc.decode(tokens[:num_tokens])


def get_audio_length(audio_data: bytes) -> float:
    """Return the duration in seconds of WAV-encoded *audio_data*.

    Parses the bytes in memory via a BytesIO buffer — no temp file on disk,
    which avoids the race/cleanup issues of a shared fixed path.
    """
    rate, data = wavfile.read(io.BytesIO(audio_data))
    return len(data) / rate


def get_gpu_memory():
    """Return GPU memory used in MiB via nvidia-smi, or None if unavailable.

    nvidia-smi prints one line per GPU; only the first GPU's reading is used
    so multi-GPU hosts do not break float() parsing. Any failure (missing
    binary, failed query, unparseable output) yields None rather than raising.
    """
    try:
        result = subprocess.check_output(
            ["nvidia-smi", "--query-gpu=memory.used", "--format=csv,nounits,noheader"]
        )
        return float(result.decode("utf-8").strip().splitlines()[0])
    except (subprocess.CalledProcessError, FileNotFoundError, ValueError, IndexError):
        return None


def get_system_metrics():
    """Sample current CPU/RAM (and GPU, when present) usage.

    CPU load is sampled three times and the worst-case reading kept, so short
    bursts during inference are not averaged away.

    Returns:
        Dict with ISO timestamp, cpu_percent, ram_percent, ram_used_gb, and
        gpu_memory_used (MiB) when a GPU is detected.
    """
    samples = []
    for _ in range(3):
        # Keep the larger of the overall reading and the per-core average:
        # psutil's two views can disagree, and we want the pessimistic one.
        overall_cpu = psutil.cpu_percent(interval=0.1)
        per_cpu = psutil.cpu_percent(percpu=True)
        avg_per_cpu = sum(per_cpu) / len(per_cpu)
        samples.append(max(overall_cpu, avg_per_cpu))
    cpu_usage = round(max(samples), 2)

    metrics = {
        "timestamp": datetime.now().isoformat(),
        "cpu_percent": cpu_usage,
        "ram_percent": psutil.virtual_memory().percent,
        "ram_used_gb": psutil.virtual_memory().used / (1024**3),
    }
    gpu_mem = get_gpu_memory()
    if gpu_mem is not None:
        metrics["gpu_memory_used"] = gpu_mem
    return metrics


def real_time_factor(processing_time: float, audio_length: float, decimals: int = 2) -> float:
    """Calculate Real-Time Factor (RTF) as processing-time / length-of-audio.

    RTF < 1 means faster than real time.

    Raises:
        ValueError: If *audio_length* is not positive (division would be
            undefined; previously this surfaced as a bare ZeroDivisionError).
    """
    if audio_length <= 0:
        raise ValueError("audio_length must be positive to compute RTF")
    return round(processing_time / audio_length, decimals)


def make_tts_request(text: str, timeout: int = 1800) -> tuple[float | None, float | None]:
    """POST *text* to the local OpenAI-compatible TTS endpoint.

    Saves the returned WAV under examples/benchmarks/output/, named by the
    chunk's token count.

    Returns:
        (processing_time_s, audio_length_s), both rounded to 2 decimals, or
        (None, None) on any request/processing error (logged, not raised).
    """
    try:
        start_time = time.time()
        response = requests.post(
            "http://localhost:8880/v1/audio/speech",
            json={
                "model": "kokoro",
                "input": text,
                "voice": "af",
                "response_format": "wav",
            },
            timeout=timeout,
        )
        response.raise_for_status()

        processing_time = round(time.time() - start_time, 2)
        audio_length = round(get_audio_length(response.content), 2)

        # Save the audio file for later inspection.
        token_count = len(enc.encode(text))
        output_file = f"examples/benchmarks/output/chunk_{token_count}_tokens.wav"
        os.makedirs(os.path.dirname(output_file), exist_ok=True)
        with open(output_file, "wb") as f:
            f.write(response.content)
        print(f"Saved audio to {output_file}")

        return processing_time, audio_length
    except requests.exceptions.RequestException as e:
        print(f"Error making request for text: {text[:50]}... Error: {str(e)}")
        return None, None
    except Exception as e:
        print(f"Error processing text: {text[:50]}... Error: {str(e)}")
        return None, None


def plot_system_metrics(metrics_data):
    """Plot CPU, RAM, and (if recorded) GPU usage over elapsed test time.

    Writes examples/benchmarks/system_usage_rtf.png. Each series is smoothed
    with a centered rolling mean and annotated with its first-sample baseline.
    """
    df = pd.DataFrame(metrics_data)
    df["timestamp"] = pd.to_datetime(df["timestamp"])
    elapsed_time = (df["timestamp"] - df["timestamp"].iloc[0]).dt.total_seconds()

    # First sample is treated as the idle baseline for each series.
    baseline_cpu = df["cpu_percent"].iloc[0]
    baseline_ram = df["ram_used_gb"].iloc[0]
    has_gpu = "gpu_memory_used" in df.columns
    baseline_gpu = df["gpu_memory_used"].iloc[0] / 1024 if has_gpu else None
    if has_gpu:
        df["gpu_memory_gb"] = df["gpu_memory_used"] / 1024

    plt.style.use("dark_background")
    num_plots = 3 if has_gpu else 2
    fig, axes = plt.subplots(num_plots, 1, figsize=(15, 5 * num_plots))
    fig.patch.set_facecolor("#1a1a2e")

    # Clamp to >= 1: rolling(window=0) raises when fewer than 2 samples exist.
    window = max(1, min(5, len(df) // 2))

    # Plot CPU Usage
    smoothed_cpu = df["cpu_percent"].rolling(window=window, center=True).mean()
    sns.lineplot(x=elapsed_time, y=smoothed_cpu, ax=axes[0], color="#ff2a6d", linewidth=2)
    axes[0].axhline(y=baseline_cpu, color="#05d9e8", linestyle="--", alpha=0.5, label="Baseline")
    axes[0].set_xlabel("Time (seconds)")
    axes[0].set_ylabel("CPU Usage (%)")
    axes[0].set_title("CPU Usage Over Time")
    axes[0].set_ylim(0, max(df["cpu_percent"]) * 1.1)
    axes[0].legend()

    # Plot RAM Usage
    smoothed_ram = df["ram_used_gb"].rolling(window=window, center=True).mean()
    sns.lineplot(x=elapsed_time, y=smoothed_ram, ax=axes[1], color="#05d9e8", linewidth=2)
    axes[1].axhline(y=baseline_ram, color="#ff2a6d", linestyle="--", alpha=0.5, label="Baseline")
    axes[1].set_xlabel("Time (seconds)")
    axes[1].set_ylabel("RAM Usage (GB)")
    axes[1].set_title("RAM Usage Over Time")
    axes[1].set_ylim(0, max(df["ram_used_gb"]) * 1.1)
    axes[1].legend()

    # Plot GPU Memory if available
    if has_gpu:
        smoothed_gpu = df["gpu_memory_gb"].rolling(window=window, center=True).mean()
        sns.lineplot(x=elapsed_time, y=smoothed_gpu, ax=axes[2], color="#ff2a6d", linewidth=2)
        axes[2].axhline(y=baseline_gpu, color="#05d9e8", linestyle="--", alpha=0.5, label="Baseline")
        axes[2].set_xlabel("Time (seconds)")
        axes[2].set_ylabel("GPU Memory (GB)")
        axes[2].set_title("GPU Memory Usage Over Time")
        axes[2].set_ylim(0, max(df["gpu_memory_gb"]) * 1.1)
        axes[2].legend()

    for ax in axes:
        ax.grid(True, linestyle="--", alpha=0.3)
        ax.set_facecolor("#1a1a2e")
        for spine in ax.spines.values():
            spine.set_color("#ffffff")
            spine.set_alpha(0.3)

    plt.tight_layout()
    plt.savefig("examples/benchmarks/system_usage_rtf.png", dpi=300, bbox_inches="tight")
    plt.close()


def main():
    """Run the RTF benchmark over increasing token-count chunks of a text file.

    For each chunk size: slice the source text, call the TTS endpoint, record
    processing time / audio length / RTF, and sample system metrics before and
    after. Writes JSON results, a stats text file, and three PNG plots under
    examples/benchmarks/.
    """
    os.makedirs("examples/benchmarks/output", exist_ok=True)

    with open("examples/benchmarks/the_time_machine_hg_wells.txt", "r", encoding="utf-8") as f:
        text = f.read()

    total_tokens = len(enc.encode(text))
    print(f"Total tokens in file: {total_tokens}")

    # Token sizes to test: 100..1000 in steps of 100.
    token_sizes = list(range(100, 1001, 100))
    print(f"Testing sizes: {token_sizes}")

    results = []
    system_metrics = []
    test_start_time = time.time()

    for num_tokens in token_sizes:
        chunk = get_text_for_tokens(text, num_tokens)
        actual_tokens = len(enc.encode(chunk))
        print(f"\nProcessing chunk with {actual_tokens} tokens:")
        print(f"Text preview: {chunk[:100]}...")

        # Sample metrics before and after each request so the plots bracket
        # the load generated by inference.
        system_metrics.append(get_system_metrics())

        processing_time, audio_length = make_tts_request(chunk)
        if processing_time is None or audio_length is None:
            print("Breaking loop due to error")
            break

        system_metrics.append(get_system_metrics())

        rtf = real_time_factor(processing_time, audio_length)
        results.append({
            "tokens": actual_tokens,
            "processing_time": processing_time,
            "output_length": audio_length,
            "rtf": rtf,
            "elapsed_time": round(time.time() - test_start_time, 2),
        })

        # Persist incrementally so a failed run still leaves partial data.
        with open("examples/benchmarks/benchmark_results_rtf.json", "w") as f:
            json.dump({"results": results, "system_metrics": system_metrics}, f, indent=2)

    df = pd.DataFrame(results)
    if df.empty:
        print("No data to plot")
        return

    df["tokens_per_second"] = df["tokens"] / df["processing_time"]

    # Write summary statistics.
    with open("examples/benchmarks/benchmark_stats_rtf.txt", "w") as f:
        f.write("=== Benchmark Statistics (with correct RTF) ===\n\n")
        f.write("Overall Stats:\n")
        f.write(f"Total tokens processed: {df['tokens'].sum()}\n")
        f.write(f"Total audio generated: {df['output_length'].sum():.2f}s\n")
        f.write(f"Total test duration: {df['elapsed_time'].max():.2f}s\n")
        f.write(f"Average processing rate: {df['tokens_per_second'].mean():.2f} tokens/second\n")
        f.write(f"Average RTF: {df['rtf'].mean():.2f}x\n\n")
        f.write("Per-chunk Stats:\n")
        f.write(f"Average chunk size: {df['tokens'].mean():.2f} tokens\n")
        f.write(f"Min chunk size: {df['tokens'].min():.2f} tokens\n")
        f.write(f"Max chunk size: {df['tokens'].max():.2f} tokens\n")
        f.write(f"Average processing time: {df['processing_time'].mean():.2f}s\n")
        f.write(f"Average output length: {df['output_length'].mean():.2f}s\n\n")
        f.write("Performance Ranges:\n")
        f.write(f"Processing rate range: {df['tokens_per_second'].min():.2f} - {df['tokens_per_second'].max():.2f} tokens/second\n")
        f.write(f"RTF range: {df['rtf'].min():.2f}x - {df['rtf'].max():.2f}x\n")

    plt.style.use("dark_background")

    # Plot Processing Time vs Token Count
    fig, ax = plt.subplots(figsize=(12, 8))
    sns.scatterplot(data=df, x="tokens", y="processing_time", s=100, alpha=0.6, color="#ff2a6d")
    sns.regplot(data=df, x="tokens", y="processing_time", scatter=False, color="#05d9e8",
                line_kws={"linewidth": 2})
    corr = df["tokens"].corr(df["processing_time"])
    plt.text(0.05, 0.95, f"Correlation: {corr:.2f}", transform=ax.transAxes, fontsize=10,
             color="#ffffff", bbox=dict(facecolor="#1a1a2e", edgecolor="#ffffff", alpha=0.7))
    setup_plot(fig, ax, "Processing Time vs Input Size")
    ax.set_xlabel("Number of Input Tokens")
    ax.set_ylabel("Processing Time (seconds)")
    plt.savefig("examples/benchmarks/processing_time_rtf.png", dpi=300, bbox_inches="tight")
    plt.close()

    # Plot RTF vs Token Count
    fig, ax = plt.subplots(figsize=(12, 8))
    sns.scatterplot(data=df, x="tokens", y="rtf", s=100, alpha=0.6, color="#ff2a6d")
    sns.regplot(data=df, x="tokens", y="rtf", scatter=False, color="#05d9e8",
                line_kws={"linewidth": 2})
    corr = df["tokens"].corr(df["rtf"])
    plt.text(0.05, 0.95, f"Correlation: {corr:.2f}", transform=ax.transAxes, fontsize=10,
             color="#ffffff", bbox=dict(facecolor="#1a1a2e", edgecolor="#ffffff", alpha=0.7))
    setup_plot(fig, ax, "Real-Time Factor vs Input Size")
    ax.set_xlabel("Number of Input Tokens")
    ax.set_ylabel("Real-Time Factor (processing time / audio length)")
    plt.savefig("examples/benchmarks/realtime_factor_rtf.png", dpi=300, bbox_inches="tight")
    plt.close()

    plot_system_metrics(system_metrics)

    print("\nResults saved to:")
    print("- examples/benchmarks/benchmark_results_rtf.json")
    print("- examples/benchmarks/benchmark_stats_rtf.txt")
    print("- examples/benchmarks/processing_time_rtf.png")
    print("- examples/benchmarks/realtime_factor_rtf.png")
    print("- examples/benchmarks/system_usage_rtf.png")
    print("\nAudio files saved in examples/benchmarks/output/")


if __name__ == "__main__":
    main()