diff --git a/README.md b/README.md
index faf1c0e..263c091 100644
--- a/README.md
+++ b/README.md
@@ -1,30 +1,57 @@
-# Kokoro TTS API
+
+
+
-FastAPI wrapper for Kokoro TTS with voice cloning. Runs inference on GPU.
+# Kokoro TTS API
+[](https://huggingface.co/hexgrad/Kokoro-82M/tree/a67f11354c3e38c58c3327498bc4bd1e57e71c50)
+
+FastAPI wrapper for [Kokoro-82M](https://huggingface.co/hexgrad/Kokoro-82M) text-to-speech model with voice cloning capabilities.
+
+Dockerized with NVIDIA GPU support, simple queue handling via SQLite, and automatic chunking/stitching on lengthy inputs/outputs.
## Quick Start
```bash
-# Start the API (will automatically download model on first run)
+# Start the API (will automatically clone source HF repo)
docker compose up --build
```
+Test it out:
```bash
-# From host terminal, test it out with some API calls
+# From host terminal
python examples/test_tts.py "Hello world" --voice af_bella
```
+
+## Performance Benchmarks
+
+Benchmarking was performed solely on generation via the API (no download) using various text lengths from 100 to ~10,000 characters, measuring processing time, token count, and output audio length. Tests were run on:
+- NVIDIA 4060Ti 16GB GPU @ CUDA 12.1
+- 11th Gen i7-11700 @ 2.5GHz
+- 64GB RAM
+- Randomized chunks from H.G. Wells - The Time Machine
+
+
+
+
+
+
+
+- Average processing speed: ~3.4 seconds per minute of audio output
+- Efficient token processing: ~0.01 seconds per token
+- Scales well with longer texts, maintains consistent performance
+
## API Endpoints
```bash
-GET /tts/voices # List voices
+GET /tts/voices # List available voices
POST /tts # Generate speech
-GET /tts/{request_id} # Check status
-GET /tts/file/{request_id} # Get audio file
+GET /tts/{request_id} # Check generation status
+GET /tts/file/{request_id} # Download audio file
```
## Example Usage
-List voices:
+List available voices:
```bash
python examples/test_tts.py
```
@@ -37,14 +64,32 @@ python examples/test_tts.py "Your text here"
# Specific voice
python examples/test_tts.py --voice af_bella "Your text here"
-# Just get file path (no download)
+# Get file path without downloading
python examples/test_tts.py --no-download "Your text here"
```
-Generated files in `examples/output/` (or in src/output/ of API if --no-download)
+Generated files are saved in:
+- With download: `examples/output/`
+- Without download: `src/output/` (in API container)
## Requirements
- Docker
- NVIDIA GPU + CUDA
-- nvidia-container-toolkit
+- nvidia-container-toolkit installed on host
+
+## Model
+
+This API uses the [Kokoro-82M](https://huggingface.co/hexgrad/Kokoro-82M) model from HuggingFace.
+
+Visit the model page for more details about training, architecture, and capabilities. I have no affiliation with any of their work, and produced this wrapper for ease of use and personal projects.
+
+## License
+
+This project is licensed under the Apache License 2.0 - see below for details:
+
+- The Kokoro model weights are licensed under Apache 2.0 (see [model page](https://huggingface.co/hexgrad/Kokoro-82M))
+- The FastAPI wrapper code in this repository is licensed under Apache 2.0 to match
+- The inference code adapted from StyleTTS2 is MIT licensed
+
+The full Apache 2.0 license text can be found at: https://www.apache.org/licenses/LICENSE-2.0
diff --git a/examples/plot_benchmark_style.py b/examples/plot_benchmark_style.py
new file mode 100644
index 0000000..f59b1e5
--- /dev/null
+++ b/examples/plot_benchmark_style.py
@@ -0,0 +1,97 @@
+import json
+import pandas as pd
+import seaborn as sns
+import matplotlib.pyplot as plt
+
+def setup_plot(fig, ax, title):
+ """Configure plot styling"""
+ # Improve grid
+ ax.grid(True, linestyle='--', alpha=0.3, color='#ffffff')
+
+ # Set title and labels with better fonts
+ ax.set_title(title, pad=20, fontsize=16, fontweight='bold', color='#ffffff')
+ ax.set_xlabel(ax.get_xlabel(), fontsize=12, fontweight='medium', color='#ffffff')
+ ax.set_ylabel(ax.get_ylabel(), fontsize=12, fontweight='medium', color='#ffffff')
+
+ # Improve tick labels
+ ax.tick_params(labelsize=10, colors='#ffffff')
+
+ # Style spines
+ for spine in ax.spines.values():
+ spine.set_color('#ffffff')
+ spine.set_alpha(0.3)
+ spine.set_linewidth(0.5)
+
+ # Set background colors
+ ax.set_facecolor('#1a1a2e')
+ fig.patch.set_facecolor('#1a1a2e')
+
+ return fig, ax
+
+def main():
+ # Load benchmark results
+ with open('examples/benchmark_results.json', 'r') as f:
+ results = json.load(f)
+
+ # Create DataFrame
+ df = pd.DataFrame(results)
+
+ # Set the style
+ plt.style.use('dark_background')
+
+ # Plot 1: Processing Time vs Output Length
+ fig, ax = plt.subplots(figsize=(12, 8))
+
+ # Create scatter plot with custom styling
+ scatter = sns.scatterplot(data=df, x='output_length', y='processing_time',
+ s=100, alpha=0.6, color='#ff2a6d') # Neon pink
+
+ # Add regression line with confidence interval
+ sns.regplot(data=df, x='output_length', y='processing_time',
+ scatter=False, color='#05d9e8', # Neon blue
+ line_kws={'linewidth': 2})
+
+ # Calculate correlation
+ corr = df['output_length'].corr(df['processing_time'])
+
+ # Add correlation annotation
+ plt.text(0.05, 0.95, f'Correlation: {corr:.2f}',
+ transform=ax.transAxes, fontsize=10, color='#ffffff',
+ bbox=dict(facecolor='#1a1a2e', edgecolor='#ffffff', alpha=0.7))
+
+ setup_plot(fig, ax, 'Processing Time vs Output Length')
+ ax.set_xlabel('Output Audio Length (seconds)')
+ ax.set_ylabel('Processing Time (seconds)')
+
+ plt.savefig('examples/time_vs_output.png', dpi=300, bbox_inches='tight')
+ plt.close()
+
+ # Plot 2: Processing Time vs Token Count
+ fig, ax = plt.subplots(figsize=(12, 8))
+
+ # Create scatter plot with custom styling
+ scatter = sns.scatterplot(data=df, x='tokens', y='processing_time',
+ s=100, alpha=0.6, color='#ff2a6d') # Neon pink
+
+ # Add regression line with confidence interval
+ sns.regplot(data=df, x='tokens', y='processing_time',
+ scatter=False, color='#05d9e8', # Neon blue
+ line_kws={'linewidth': 2})
+
+ # Calculate correlation
+ corr = df['tokens'].corr(df['processing_time'])
+
+ # Add correlation annotation
+ plt.text(0.05, 0.95, f'Correlation: {corr:.2f}',
+ transform=ax.transAxes, fontsize=10, color='#ffffff',
+ bbox=dict(facecolor='#1a1a2e', edgecolor='#ffffff', alpha=0.7))
+
+ setup_plot(fig, ax, 'Processing Time vs Token Count')
+ ax.set_xlabel('Number of Input Tokens')
+ ax.set_ylabel('Processing Time (seconds)')
+
+ plt.savefig('examples/time_vs_tokens.png', dpi=300, bbox_inches='tight')
+ plt.close()
+
+if __name__ == '__main__':
+ main()
diff --git a/examples/time_vs_output.png b/examples/time_vs_output.png
index ee7900e..a7da976 100644
Binary files a/examples/time_vs_output.png and b/examples/time_vs_output.png differ
diff --git a/examples/time_vs_tokens.png b/examples/time_vs_tokens.png
index 1348933..8f7808f 100644
Binary files a/examples/time_vs_tokens.png and b/examples/time_vs_tokens.png differ
diff --git a/githubbanner.png b/githubbanner.png
new file mode 100644
index 0000000..692f642
Binary files /dev/null and b/githubbanner.png differ