diff --git a/README.md b/README.md
index faf1c0e..263c091 100644
--- a/README.md
+++ b/README.md
@@ -1,30 +1,57 @@
-# Kokoro TTS API
+<div align="center">
+  <img src="githubbanner.png" alt="Kokoro TTS Banner">
+</div>

-FastAPI wrapper for Kokoro TTS with voice cloning. Runs inference on GPU.
+# Kokoro TTS API
+[![Model Commit](https://img.shields.io/badge/model--commit-a67f113-blue)](https://huggingface.co/hexgrad/Kokoro-82M/tree/a67f11354c3e38c58c3327498bc4bd1e57e71c50)
+
+FastAPI wrapper for the [Kokoro-82M](https://huggingface.co/hexgrad/Kokoro-82M) text-to-speech model with voice cloning capabilities.
+
+Dockerized with NVIDIA GPU support, simple queue handling via SQLite, and automatic chunking/stitching of long inputs and outputs.
 
 ## Quick Start
 
 ```bash
-# Start the API (will automatically download model on first run)
+# Start the API (will automatically clone the source HF repo on first run)
 docker compose up --build
 ```
 
+Test it out:
 ```bash
-# From host terminal, test it out with some API calls
+# From host terminal
 python examples/test_tts.py "Hello world" --voice af_bella
 ```
+
+## Performance Benchmarks
+
+Benchmarking covered only generation via the API (no file download), using text lengths from 100 to ~10,000 characters and measuring processing time, token count, and output audio length (a sketch of the request cycle each run exercises follows the list below). Tests were run on:
+- NVIDIA RTX 4060 Ti 16GB, CUDA 12.1
+- 11th Gen Intel i7-11700 @ 2.5GHz
+- 64GB RAM
+- Randomized chunks from H.G. Wells' *The Time Machine*
+
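+Each run drives the same endpoints listed under API Endpoints further down. The snippet below is an illustrative sketch of that request cycle, not the actual benchmark script: the base URL and the request/response field names (`text`, `voice`, `request_id`, `status`) are assumptions, so check `examples/test_tts.py` for the real client.
+
+```python
+import time
+
+import requests
+
+API_URL = "http://localhost:8000"  # assumed default; match your docker compose port mapping
+
+def time_generation(text: str, voice: str = "af_bella") -> float:
+    """Submit one TTS request, poll until it finishes, and return wall-clock processing time."""
+    start = time.time()
+    # Payload/response field names are illustrative, not confirmed by this README.
+    request_id = requests.post(f"{API_URL}/tts", json={"text": text, "voice": voice}).json()["request_id"]
+    while requests.get(f"{API_URL}/tts/{request_id}").json().get("status") not in ("completed", "failed"):
+        time.sleep(0.5)  # the job waits in the SQLite-backed queue until the GPU worker picks it up
+    return time.time() - start
+
+print(f"processing_time: {time_generation('Hello world'):.2f}s")
+```
+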

+<div align="center">
+  <img src="examples/time_vs_output.png" alt="Processing Time vs Output Length">
+  <img src="examples/time_vs_tokens.png" alt="Processing Time vs Token Count">
+</div>

+- Average processing speed: ~3.4 seconds per minute of audio output
+- Efficient token processing: ~0.01 seconds per token
+- Scales well with longer texts while maintaining consistent performance
+
 ## API Endpoints
 
 ```bash
-GET /tts/voices # List voices
+GET /tts/voices # List available voices
 POST /tts # Generate speech
-GET /tts/{request_id} # Check status
-GET /tts/file/{request_id} # Get audio file
+GET /tts/{request_id} # Check generation status
+GET /tts/file/{request_id} # Download audio file
 ```
 
 ## Example Usage
 
-List voices:
+List available voices:
 ```bash
 python examples/test_tts.py
 ```
@@ -37,14 +64,32 @@ python examples/test_tts.py "Your text here"
 
 # Specific voice
 python examples/test_tts.py --voice af_bella "Your text here"
 
-# Just get file path (no download)
+# Get file path without downloading
 python examples/test_tts.py --no-download "Your text here"
 ```
 
-Generated files in `examples/output/` (or in src/output/ of API if --no-download)
+Generated files are saved in:
+- With download: `examples/output/`
+- Without download: `src/output/` (inside the API container)
 
 ## Requirements
 - Docker
 - NVIDIA GPU + CUDA
-- nvidia-container-toolkit
+- nvidia-container-toolkit installed on the host
+
+## Model
+
+This API uses the [Kokoro-82M](https://huggingface.co/hexgrad/Kokoro-82M) model from HuggingFace.
+
+Visit the model page for more details about its training, architecture, and capabilities. I have no affiliation with the model's authors; this wrapper was produced for ease of use and for personal projects.
+
+## License
+
+This project is licensed under the Apache License 2.0 - see below for details:
+
+- The Kokoro model weights are licensed under Apache 2.0 (see the [model page](https://huggingface.co/hexgrad/Kokoro-82M))
+- The FastAPI wrapper code in this repository is licensed under Apache 2.0 to match
+- The inference code adapted from StyleTTS2 is MIT licensed
+
+The full Apache 2.0 license text can be found at: https://www.apache.org/licenses/LICENSE-2.0
diff --git a/examples/plot_benchmark_style.py b/examples/plot_benchmark_style.py
new file mode 100644
index 0000000..f59b1e5
--- /dev/null
+++ b/examples/plot_benchmark_style.py
@@ -0,0 +1,97 @@
+import json
+import pandas as pd
+import seaborn as sns
+import matplotlib.pyplot as plt
+
+def setup_plot(fig, ax, title):
+    """Configure plot styling"""
+    # Improve grid
+    ax.grid(True, linestyle='--', alpha=0.3, color='#ffffff')
+
+    # Set title and labels with better fonts
+    ax.set_title(title, pad=20, fontsize=16, fontweight='bold', color='#ffffff')
+    ax.set_xlabel(ax.get_xlabel(), fontsize=12, fontweight='medium', color='#ffffff')
+    ax.set_ylabel(ax.get_ylabel(), fontsize=12, fontweight='medium', color='#ffffff')
+
+    # Improve tick labels
+    ax.tick_params(labelsize=10, colors='#ffffff')
+
+    # Style spines
+    for spine in ax.spines.values():
+        spine.set_color('#ffffff')
+        spine.set_alpha(0.3)
+        spine.set_linewidth(0.5)
+
+    # Set background colors
+    ax.set_facecolor('#1a1a2e')
+    fig.patch.set_facecolor('#1a1a2e')
+
+    return fig, ax
+
+def main():
+    # Load benchmark results
+    with open('examples/benchmark_results.json', 'r') as f:
+        results = json.load(f)
+
+    # Create DataFrame
+    df = pd.DataFrame(results)
+
+    # Set the style
+    plt.style.use('dark_background')
+
+    # Plot 1: Processing Time vs Output Length
+    fig, ax = plt.subplots(figsize=(12, 8))
+
+    # Create scatter plot with custom styling
+    scatter = sns.scatterplot(data=df, x='output_length', y='processing_time',
+                              s=100, alpha=0.6, color='#ff2a6d')  # Neon pink
+
+    # Add regression line with confidence interval
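+    # (For reference: sns.regplot fits an ordinary least-squares line and, by
+    # default, shades a bootstrapped 95% confidence band around it; scatter=False
+    # avoids re-drawing the points already plotted above.)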
+    sns.regplot(data=df, x='output_length', y='processing_time',
+                scatter=False, color='#05d9e8',  # Neon blue
+                line_kws={'linewidth': 2})
+
+    # Calculate correlation
+    corr = df['output_length'].corr(df['processing_time'])
+
+    # Add correlation annotation
+    plt.text(0.05, 0.95, f'Correlation: {corr:.2f}',
+             transform=ax.transAxes, fontsize=10, color='#ffffff',
+             bbox=dict(facecolor='#1a1a2e', edgecolor='#ffffff', alpha=0.7))
+
+    setup_plot(fig, ax, 'Processing Time vs Output Length')
+    ax.set_xlabel('Output Audio Length (seconds)')
+    ax.set_ylabel('Processing Time (seconds)')
+
+    plt.savefig('examples/time_vs_output.png', dpi=300, bbox_inches='tight')
+    plt.close()
+
+    # Plot 2: Processing Time vs Token Count
+    fig, ax = plt.subplots(figsize=(12, 8))
+
+    # Create scatter plot with custom styling
+    scatter = sns.scatterplot(data=df, x='tokens', y='processing_time',
+                              s=100, alpha=0.6, color='#ff2a6d')  # Neon pink
+
+    # Add regression line with confidence interval
+    sns.regplot(data=df, x='tokens', y='processing_time',
+                scatter=False, color='#05d9e8',  # Neon blue
+                line_kws={'linewidth': 2})
+
+    # Calculate correlation
+    corr = df['tokens'].corr(df['processing_time'])
+
+    # Add correlation annotation
+    plt.text(0.05, 0.95, f'Correlation: {corr:.2f}',
+             transform=ax.transAxes, fontsize=10, color='#ffffff',
+             bbox=dict(facecolor='#1a1a2e', edgecolor='#ffffff', alpha=0.7))
+
+    setup_plot(fig, ax, 'Processing Time vs Token Count')
+    ax.set_xlabel('Number of Input Tokens')
+    ax.set_ylabel('Processing Time (seconds)')
+
+    plt.savefig('examples/time_vs_tokens.png', dpi=300, bbox_inches='tight')
+    plt.close()
+
+if __name__ == '__main__':
+    main()
diff --git a/examples/time_vs_output.png b/examples/time_vs_output.png
index ee7900e..a7da976 100644
Binary files a/examples/time_vs_output.png and b/examples/time_vs_output.png differ
diff --git a/examples/time_vs_tokens.png b/examples/time_vs_tokens.png
index 1348933..8f7808f 100644
Binary files a/examples/time_vs_tokens.png and b/examples/time_vs_tokens.png differ
diff --git a/githubbanner.png b/githubbanner.png
new file mode 100644
index 0000000..692f642
Binary files /dev/null and b/githubbanner.png differ