mirror of
https://github.com/remsky/Kokoro-FastAPI.git
synced 2025-08-05 16:48:53 +00:00
Add ONNX-optimized CPU inference support, update benchmarking scripts, and clarify performance metrics in the README
This commit is contained in:
parent
93aa205da9
commit
76e8b07a92
11 changed files with 1169 additions and 510 deletions
|
@ -6,6 +6,7 @@ omit =
|
||||||
Kokoro-82M/*
|
Kokoro-82M/*
|
||||||
MagicMock/*
|
MagicMock/*
|
||||||
test_*.py
|
test_*.py
|
||||||
|
examples/*
|
||||||
|
|
||||||
[report]
|
[report]
|
||||||
exclude_lines =
|
exclude_lines =
|
||||||
|
|
14
CHANGELOG.md
14
CHANGELOG.md
|
@ -2,6 +2,20 @@
|
||||||
|
|
||||||
Notable changes to this project will be documented in this file.
|
Notable changes to this project will be documented in this file.
|
||||||
|
|
||||||
|
## 2025-01-04
|
||||||
|
### Added
|
||||||
|
- ONNX Support:
|
||||||
|
- Added single batch ONNX support for CPU inference
|
||||||
|
- Roughly 0.4 RTF (2.4x real-time speed)
|
||||||
|
|
||||||
|
### Modified
|
||||||
|
- Code Refactoring:
|
||||||
|
- Work on modularizing phonemizer and tokenizer into separate services
|
||||||
|
- Incorporated these services into a dev endpoint
|
||||||
|
- Testing and Benchmarking:
|
||||||
|
- Cleaned up benchmarking scripts
|
||||||
|
- Cleaned up test scripts
|
||||||
|
- Added automatic WAV validation scripts
|
||||||
|
|
||||||
## 2025-01-02
|
## 2025-01-02
|
||||||
- Audio Format Support:
|
- Audio Format Support:
|
||||||
|
|
|
@ -187,15 +187,13 @@ Key Performance Metrics:
|
||||||
<summary>GPU Vs. CPU</summary>
|
<summary>GPU Vs. CPU</summary>
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
# GPU: Requires NVIDIA GPU with CUDA 12.1 support
|
# GPU: Requires NVIDIA GPU with CUDA 12.1 support (~35x realtime speed)
|
||||||
docker compose up --build
|
docker compose up --build
|
||||||
|
|
||||||
# CPU: ~10x slower than GPU inference
|
# CPU: ONNX optimized inference (~2.4x realtime speed)
|
||||||
docker compose -f docker-compose.cpu.yml up --build
|
docker compose -f docker-compose.cpu.yml up --build
|
||||||
```
|
```
|
||||||
|
|
||||||
*Note: CPU Inference is currently a very basic implementation, and not heavily tested*
|
|
||||||
|
|
||||||
</details>
|
</details>
|
||||||
|
|
||||||
<details>
|
<details>
|
||||||
|
|
|
@ -47,14 +47,14 @@ services:
|
||||||
model-fetcher:
|
model-fetcher:
|
||||||
condition: service_healthy
|
condition: service_healthy
|
||||||
|
|
||||||
# # Gradio UI service [Comment out everything below if you don't need it]
|
# Gradio UI service [Comment out everything below if you don't need it]
|
||||||
# gradio-ui:
|
gradio-ui:
|
||||||
# build:
|
build:
|
||||||
# context: ./ui
|
context: ./ui
|
||||||
# ports:
|
ports:
|
||||||
# - "7860:7860"
|
- "7860:7860"
|
||||||
# volumes:
|
volumes:
|
||||||
# - ./ui/data:/app/ui/data
|
- ./ui/data:/app/ui/data
|
||||||
# - ./ui/app.py:/app/app.py # Mount app.py for hot reload
|
- ./ui/app.py:/app/app.py # Mount app.py for hot reload
|
||||||
# environment:
|
environment:
|
||||||
# - GRADIO_WATCH=True # Enable hot reloading
|
- GRADIO_WATCH=True # Enable hot reloading
|
||||||
|
|
|
@ -46,14 +46,14 @@ services:
|
||||||
model-fetcher:
|
model-fetcher:
|
||||||
condition: service_healthy
|
condition: service_healthy
|
||||||
|
|
||||||
# # Gradio UI service [Comment out everything below if you don't need it]
|
# Gradio UI service [Comment out everything below if you don't need it]
|
||||||
# gradio-ui:
|
gradio-ui:
|
||||||
# build:
|
build:
|
||||||
# context: ./ui
|
context: ./ui
|
||||||
# ports:
|
ports:
|
||||||
# - "7860:7860"
|
- "7860:7860"
|
||||||
# volumes:
|
volumes:
|
||||||
# - ./ui/data:/app/ui/data
|
- ./ui/data:/app/ui/data
|
||||||
# - ./ui/app.py:/app/app.py # Mount app.py for hot reload
|
- ./ui/app.py:/app/app.py # Mount app.py for hot reload
|
||||||
# environment:
|
environment:
|
||||||
# - GRADIO_WATCH=True # Enable hot reloading
|
- GRADIO_WATCH=True # Enable hot reloading
|
||||||
|
|
|
@ -60,11 +60,11 @@ def main():
|
||||||
# Initialize system monitor
|
# Initialize system monitor
|
||||||
monitor = SystemMonitor(interval=1.0) # 1 second interval
|
monitor = SystemMonitor(interval=1.0) # 1 second interval
|
||||||
# Set prefix for output files (e.g. "gpu", "cpu", "onnx", etc.)
|
# Set prefix for output files (e.g. "gpu", "cpu", "onnx", etc.)
|
||||||
prefix = "cpu_2_1_seq"
|
prefix = "gpu"
|
||||||
# Generate token sizes
|
# Generate token sizes
|
||||||
if 'gpu' in prefix:
|
if 'gpu' in prefix:
|
||||||
token_sizes = generate_token_sizes(
|
token_sizes = generate_token_sizes(
|
||||||
max_tokens=3000, dense_step=150,
|
max_tokens=5000, dense_step=150,
|
||||||
dense_max=1000, sparse_step=1000)
|
dense_max=1000, sparse_step=1000)
|
||||||
elif 'cpu' in prefix:
|
elif 'cpu' in prefix:
|
||||||
token_sizes = generate_token_sizes(
|
token_sizes = generate_token_sizes(
|
||||||
|
|
File diff suppressed because it is too large
Load diff
|
@ -1,23 +1,23 @@
|
||||||
=== Benchmark Statistics (with correct RTF) ===
|
=== Benchmark Statistics (with correct RTF) ===
|
||||||
|
|
||||||
Total tokens processed: 8150
|
Total tokens processed: 17150
|
||||||
Total audio generated (s): 2549.70
|
Total audio generated (s): 5296.38
|
||||||
Total test duration (s): 70.70
|
Total test duration (s): 155.23
|
||||||
Average processing rate (tokens/s): 120.20
|
Average processing rate (tokens/s): 102.86
|
||||||
Average RTF: 0.03
|
Average RTF: 0.03
|
||||||
Average Real Time Speed: 36.36
|
Average Real Time Speed: 31.25
|
||||||
|
|
||||||
=== Per-chunk Stats ===
|
=== Per-chunk Stats ===
|
||||||
|
|
||||||
Average chunk size (tokens): 1018.75
|
Average chunk size (tokens): 1715.00
|
||||||
Min chunk size (tokens): 150
|
Min chunk size (tokens): 150
|
||||||
Max chunk size (tokens): 3000
|
Max chunk size (tokens): 5000
|
||||||
Average processing time (s): 8.75
|
Average processing time (s): 15.39
|
||||||
Average output length (s): 318.71
|
Average output length (s): 529.64
|
||||||
|
|
||||||
=== Performance Ranges ===
|
=== Performance Ranges ===
|
||||||
|
|
||||||
Processing rate range (tokens/s): 107.14 - 145.63
|
Processing rate range (tokens/s): 80.65 - 125.10
|
||||||
RTF range: 0.02x - 0.03x
|
RTF range: 0.03x - 0.04x
|
||||||
Real Time Speed range: 33.33x - 50.00x
|
Real Time Speed range: 25.00x - 33.33x
|
||||||
|
|
||||||
|
|
Binary file not shown.
Before Width: | Height: | Size: 232 KiB After Width: | Height: | Size: 238 KiB |
Binary file not shown.
Before Width: | Height: | Size: 202 KiB After Width: | Height: | Size: 250 KiB |
Binary file not shown.
Before Width: | Height: | Size: 448 KiB After Width: | Height: | Size: 459 KiB |
Loading…
Add table
Reference in a new issue