Mirror of https://github.com/remsky/Kokoro-FastAPI.git (synced 2025-08-05 16:48:53 +00:00)
Allow ONNX support optimizations for CPU inference and update benchmarking scripts; modify README for clarity on performance metrics
parent 93aa205da9
commit 76e8b07a92

11 changed files with 1169 additions and 510 deletions
.coveragerc

@@ -6,6 +6,7 @@ omit =
     Kokoro-82M/*
     MagicMock/*
     test_*.py
+    examples/*
 
 [report]
 exclude_lines =
CHANGELOG.md (14 additions)

@@ -2,6 +2,20 @@
 
 Notable changes to this project will be documented in this file.
 
+## 2025-01-04
+### Added
+- ONNX Support:
+  - Added single batch ONNX support for CPU inference
+  - Roughly 0.4 RTF (2.4x real-time speed)
+
+### Modified
+- Code Refactoring:
+  - Work on modularizing phonemizer and tokenizer into separate services
+  - Incorporated these services into a dev endpoint
+- Testing and Benchmarking:
+  - Cleaned up benchmarking scripts
+  - Cleaned up test scripts
+  - Added auto-WAV validation scripts
+
 ## 2025-01-02
 - Audio Format Support:
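As context for the changelog's single-batch ONNX CPU entry, a minimal sketch of what such an inference call looks like with onnxruntime; the model path, input name, and shapes here are illustrative assumptions, not the project's actual code:

```python
# Hypothetical sketch of single-batch ONNX CPU inference (assumed model path
# and I/O names, not the actual Kokoro-FastAPI implementation).
import numpy as np
import onnxruntime as ort

opts = ort.SessionOptions()
opts.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_ALL
opts.intra_op_num_threads = 4  # tune to the host's physical cores

# CPU-only session; CPUExecutionProvider is onnxruntime's default CPU backend.
sess = ort.InferenceSession(
    "kokoro.onnx", sess_options=opts, providers=["CPUExecutionProvider"]
)

# Single batch: one token-id sequence in, one waveform out (assumed shapes).
tokens = np.zeros((1, 512), dtype=np.int64)  # placeholder token ids
outputs = sess.run(None, {"tokens": tokens})
audio = outputs[0]
```

Enabling graph optimizations and pinning thread counts, as above, is the standard onnxruntime route for this kind of single-batch CPU serving.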
README.md

@@ -187,15 +187,13 @@ Key Performance Metrics:
 <summary>GPU Vs. CPU</summary>
 
 ```bash
-# GPU: Requires NVIDIA GPU with CUDA 12.1 support
+# GPU: Requires NVIDIA GPU with CUDA 12.1 support (~35x realtime speed)
 docker compose up --build
 
-# CPU: ~10x slower than GPU inference
+# CPU: ONNX optimized inference (~2.4x realtime speed)
 docker compose -f docker-compose.cpu.yml up --build
 ```
 
 *Note: CPU Inference is currently a very basic implementation, and not heavily tested*
 
 </details>
 
 <details>
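For reading these figures: RTF (real-time factor) is processing time divided by the duration of audio produced, and "realtime speed" is its inverse. A small sketch of the arithmetic (the helper names are ours, not the repo's):

```python
# RTF (real-time factor) = processing time / duration of audio produced.
# "Realtime speed" is the inverse ratio.
def rtf(processing_s: float, audio_s: float) -> float:
    return processing_s / audio_s

def realtime_speed(processing_s: float, audio_s: float) -> float:
    return audio_s / processing_s

# The changelog's "roughly 0.4 RTF" corresponds to 1 / 0.4 = 2.5x, quoted
# as ~2.4x realtime; the README's ~35x GPU figure implies an RTF near 0.03.
```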
Both docker-compose files (GPU and CPU variants) receive the same change, enabling the previously commented-out Gradio UI service. First file:

@@ -47,14 +47,14 @@ services:
       model-fetcher:
         condition: service_healthy
 
-  # # Gradio UI service [Comment out everything below if you don't need it]
-  # gradio-ui:
-  #   build:
-  #     context: ./ui
-  #   ports:
-  #     - "7860:7860"
-  #   volumes:
-  #     - ./ui/data:/app/ui/data
-  #     - ./ui/app.py:/app/app.py # Mount app.py for hot reload
-  #   environment:
-  #     - GRADIO_WATCH=True # Enable hot reloading
+  # Gradio UI service [Comment out everything below if you don't need it]
+  gradio-ui:
+    build:
+      context: ./ui
+    ports:
+      - "7860:7860"
+    volumes:
+      - ./ui/data:/app/ui/data
+      - ./ui/app.py:/app/app.py # Mount app.py for hot reload
+    environment:
+      - GRADIO_WATCH=True # Enable hot reloading
Second docker-compose file, with the identical change:

@@ -46,14 +46,14 @@ services:
       model-fetcher:
         condition: service_healthy
 
-  # # Gradio UI service [Comment out everything below if you don't need it]
-  # gradio-ui:
-  #   build:
-  #     context: ./ui
-  #   ports:
-  #     - "7860:7860"
-  #   volumes:
-  #     - ./ui/data:/app/ui/data
-  #     - ./ui/app.py:/app/app.py # Mount app.py for hot reload
-  #   environment:
-  #     - GRADIO_WATCH=True # Enable hot reloading
+  # Gradio UI service [Comment out everything below if you don't need it]
+  gradio-ui:
+    build:
+      context: ./ui
+    ports:
+      - "7860:7860"
+    volumes:
+      - ./ui/data:/app/ui/data
+      - ./ui/app.py:/app/app.py # Mount app.py for hot reload
+    environment:
+      - GRADIO_WATCH=True # Enable hot reloading
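With the UI service active by default in both files, `docker compose up --build` (or the `-f docker-compose.cpu.yml` variant) now also starts the Gradio UI on port 7860 once model-fetcher reports healthy, and `GRADIO_WATCH=True` hot-reloads `app.py` from the mounted volume during development.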
Benchmarking script:

@@ -60,11 +60,11 @@ def main():
     # Initialize system monitor
     monitor = SystemMonitor(interval=1.0)  # 1 second interval
     # Set prefix for output files (e.g. "gpu", "cpu", "onnx", etc.)
-    prefix = "cpu_2_1_seq"
+    prefix = "gpu"
     # Generate token sizes
     if 'gpu' in prefix:
         token_sizes = generate_token_sizes(
-            max_tokens=3000, dense_step=150,
+            max_tokens=5000, dense_step=150,
             dense_max=1000, sparse_step=1000)
     elif 'cpu' in prefix:
         token_sizes = generate_token_sizes(
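The dense/sparse parameters suggest a helper that samples token counts densely at small sizes and sparsely beyond `dense_max`. A plausible reconstruction (an assumption from the call sites above, not the repository's actual code), which happens to reproduce the totals in the stats below exactly:

```python
# Hypothetical reconstruction of generate_token_sizes, inferred from the
# call sites above; not the repository's actual implementation.
def generate_token_sizes(max_tokens: int, dense_step: int,
                         dense_max: int, sparse_step: int) -> list[int]:
    # Dense sampling: dense_step, 2*dense_step, ... up to dense_max.
    dense = list(range(dense_step, dense_max + 1, dense_step))
    # Sparse sampling beyond dense_max: every sparse_step up to max_tokens.
    sparse = list(range(dense_max + sparse_step, max_tokens + 1, sparse_step))
    return dense + sparse

# New GPU settings: [150, 300, ..., 900] + [2000, 3000, 4000, 5000]
# -> 10 chunks, 17150 tokens total, mean 1715 (matches the stats below).
# Old settings (max_tokens=3000): 8 chunks, 8150 tokens (matches the old stats).
```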
(One file's diff is suppressed because it is too large.)
Benchmark statistics file:

@@ -1,23 +1,23 @@
 === Benchmark Statistics (with correct RTF) ===
 
-Total tokens processed: 8150
-Total audio generated (s): 2549.70
-Total test duration (s): 70.70
-Average processing rate (tokens/s): 120.20
+Total tokens processed: 17150
+Total audio generated (s): 5296.38
+Total test duration (s): 155.23
+Average processing rate (tokens/s): 102.86
 Average RTF: 0.03
-Average Real Time Speed: 36.36
+Average Real Time Speed: 31.25
 
 === Per-chunk Stats ===
 
-Average chunk size (tokens): 1018.75
+Average chunk size (tokens): 1715.00
 Min chunk size (tokens): 150
-Max chunk size (tokens): 3000
-Average processing time (s): 8.75
-Average output length (s): 318.71
+Max chunk size (tokens): 5000
+Average processing time (s): 15.39
+Average output length (s): 529.64
 
 === Performance Ranges ===
 
-Processing rate range (tokens/s): 107.14 - 145.63
-RTF range: 0.02x - 0.03x
-Real Time Speed range: 33.33x - 50.00x
+Processing rate range (tokens/s): 80.65 - 125.10
+RTF range: 0.03x - 0.04x
+Real Time Speed range: 25.00x - 33.33x
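As a cross-check on the updated numbers: 17150 tokens over 155.23 s is about 110 tokens/s overall, and 5296.38 s of audio over 155.23 s of processing is about 34x aggregate realtime speed; the reported 102.86 tokens/s and 31.25x figures are presumably per-chunk means (10 chunks of 1715 tokens on average) rather than ratios of the totals.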
Three binary image files changed (not shown): 232 KiB → 238 KiB, 202 KiB → 250 KiB, 448 KiB → 459 KiB.