Mirror of https://github.com/remsky/Kokoro-FastAPI.git, synced 2025-04-13 09:39:17 +00:00
feat: merge master into core/uv-management for v0.1.0

Major changes:
- Baked model directly into Dockerfile for improved deployment
- Switched to uv for dependency management
- Restructured Docker files into docker/cpu and docker/gpu directories
- Updated configuration for better ONNX performance
Commit 007b1a35e8
8 changed files with 292 additions and 190 deletions
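The commit message above mentions configuration changes for better ONNX performance, and the CPU compose hunk further down sets ONNX_MEMORY_PATTERN and ONNX_ARENA_EXTEND_STRATEGY. As a rough, hypothetical sketch (not the repository's actual loader code), environment variables like these are commonly mapped onto onnxruntime session and provider options along the following lines; whether the service reads these exact variables this way is an assumption, shown only to make the intent of the compose settings concrete.

```python
# Hypothetical sketch: mapping the ONNX_* environment variables from the compose
# file onto onnxruntime settings. The actual Kokoro-FastAPI loader may differ.
import os
import onnxruntime as ort

sess_options = ort.SessionOptions()
# Memory-pattern optimization reuses the memory allocation plan across runs.
sess_options.enable_mem_pattern = os.getenv("ONNX_MEMORY_PATTERN", "true").lower() == "true"

# The arena extend strategy is a CPU execution provider option
# ("kNextPowerOfTwo" grows the allocation arena in power-of-two steps).
provider_options = [{
    "arena_extend_strategy": os.getenv("ONNX_ARENA_EXTEND_STRATEGY", "kNextPowerOfTwo"),
}]

session = ort.InferenceSession(
    "kokoro.onnx",  # placeholder model path
    sess_options=sess_options,
    providers=["CPUExecutionProvider"],
    provider_options=provider_options,
)
```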

@@ -1,6 +1,5 @@
# Version control
.git
.gitignore

# Python
__pycache__

.github/workflows/sync-develop.yml (vendored, 92 changed lines)
@@ -1,55 +1,55 @@
name: Sync develop with master
# name: Sync develop with master

on:
push:
branches:
- master
# on:
# push:
# branches:
# - master

jobs:
sync-develop:
runs-on: ubuntu-latest
permissions:
contents: write
issues: write
steps:
- name: Checkout repository
uses: actions/checkout@v4
with:
fetch-depth: 0
ref: develop
# jobs:
# sync-develop:
# runs-on: ubuntu-latest
# permissions:
# contents: write
# issues: write
# steps:
# - name: Checkout repository
# uses: actions/checkout@v4
# with:
# fetch-depth: 0
# ref: develop

- name: Configure Git
run: |
git config user.name "GitHub Actions"
git config user.email "actions@github.com"
# - name: Configure Git
# run: |
# git config user.name "GitHub Actions"
# git config user.email "actions@github.com"

- name: Merge master into develop
run: |
git fetch origin master:master
git merge --no-ff origin/master -m "chore: Merge master into develop branch"
# - name: Merge master into develop
# run: |
# git fetch origin master:master
# git merge --no-ff origin/master -m "chore: Merge master into develop branch"

- name: Push changes
run: |
if ! git push origin develop; then
echo "Failed to push to develop branch"
exit 1
fi
# - name: Push changes
# run: |
# if ! git push origin develop; then
# echo "Failed to push to develop branch"
# exit 1
# fi

- name: Handle Failure
if: failure()
uses: actions/github-script@v7
with:
script: |
const issueBody = `Automatic merge from master to develop failed.
# - name: Handle Failure
# if: failure()
# uses: actions/github-script@v7
# with:
# script: |
# const issueBody = `Automatic merge from master to develop failed.

Please resolve this manually
# Please resolve this manually

Workflow run: ${process.env.GITHUB_SERVER_URL}/${process.env.GITHUB_REPOSITORY}/actions/runs/${process.env.GITHUB_RUN_ID}`;
# Workflow run: ${process.env.GITHUB_SERVER_URL}/${process.env.GITHUB_REPOSITORY}/actions/runs/${process.env.GITHUB_RUN_ID}`;

await github.rest.issues.create({
owner: context.repo.owner,
repo: context.repo.repo,
title: '🔄 Automatic master to develop merge failed',
body: issueBody,
labels: ['merge-failed', 'automation']
});
# await github.rest.issues.create({
# owner: context.repo.owner,
# repo: context.repo.repo,
# title: '🔄 Automatic master to develop merge failed',
# body: issueBody,
# labels: ['merge-failed', 'automation']
# });

.gitignore (vendored, 81 changed lines)
@@ -1,33 +1,44 @@
# Version control
.git

output/*
output_audio/*
ui/data/*

*.db
# Python
__pycache__/
*.pyc
*.pyo
*.pyd
*.py[cod]
*$py.class
.Python
.pytest_cache
.coverage
.coveragerc

# Python package build artifacts
*.egg-info/
*.egg
dist/
build/

# Environment
# .env
.venv/
env/
venv/
ENV/

# IDE
.idea/
.vscode/
*.swp
*.swo

# Project specific
# Model files
*.pt
*.pth
*.tar*

Kokoro-82M/*
__pycache__/
.vscode/
env/
.venv/
.Python


.coverage

examples/assorted_checks/benchmarks/output_audio/*
examples/assorted_checks/test_combinations/output/*
examples/assorted_checks/test_openai/output/*

examples/assorted_checks/test_voices/output/*
examples/assorted_checks/test_formats/output/*
examples/assorted_checks/benchmarks/output_audio_stream/*
ui/RepoScreenshot.png
examples/assorted_checks/benchmarks/output_audio_stream_openai/*

# Voice files
api/src/voices/af_bella.pt
api/src/voices/af_nicole.pt
api/src/voices/af_sarah.pt

@@ -39,14 +50,24 @@ api/src/voices/bf_emma.pt
api/src/voices/bf_isabella.pt
api/src/voices/bm_george.pt
api/src/voices/bm_lewis.pt

# Audio files
examples/*.wav
examples/*.pcm
examples/*.mp3
examples/*.flac
examples/*.acc
examples/*.ogg
examples/speech.mp3
examples/phoneme_examples/output/example_1.wav
examples/phoneme_examples/output/example_2.wav
examples/phoneme_examples/output/example_3.wav

# Other project files
Kokoro-82M/
ui/data/
EXTERNAL_UV_DOCUMENTATION*

# Python package build artifacts
*.egg-info/
*.egg
dist/
build/
# Docker
Dockerfile*
docker-compose*

CHANGELOG.md (21 changed lines)
@@ -2,12 +2,23 @@

Notable changes to this project will be documented in this file.

## [v0.0.6] - 2025-01-10
## [v0.1.0] - 2025-01-13
### Changed
- Major Docker improvements:
- Baked model directly into Dockerfile for improved deployment reliability
- Switched to uv for dependency management
- Streamlined container builds and reduced image sizes
- Dependency Management:
- Migrated from pip/poetry to uv for faster, more reliable package management
- Added uv.lock for deterministic builds
- Updated dependency resolution strategy

## [v0.0.5post1] - 2025-01-11
### Fixed
- Fixed dependency issues:
- Let PyTorch manage numpy version
- Pin aiofiles to 23.2.1 for Windows compatibility
- Added CI workflow for testing
- Docker image tagging and versioning improvements (-gpu, -cpu, -ui)
- Minor vram management improvements
- Gradio bugfix causing crashes and errant warnings
- Updated GPU and UI container configurations

## [v0.0.5] - 2025-01-10
### Fixed

README.md (52 changed lines)
@@ -5,7 +5,7 @@
# Kokoro TTS API
[]()
[]()
[](https://huggingface.co/hexgrad/Kokoro-82M/tree/c3b0d86e2a980e027ef71c28819ea02e351c2667) [](https://huggingface.co/spaces/Remsky/Kokoro-TTS-Zero)
[](https://huggingface.co/hexgrad/Kokoro-82M/tree/c3b0d86e2a980e027ef71c28819ea02e351c2667) [](https://huggingface.co/spaces/Remsky/Kokoro-TTS-Zero) [](https://www.buymeacoffee.com/remsky)

Dockerized FastAPI wrapper for [Kokoro-82M](https://huggingface.co/hexgrad/Kokoro-82M) text-to-speech model
- OpenAI-compatible Speech endpoint, with inline voice combination functionality
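The README line above advertises an OpenAI-compatible Speech endpoint. As a minimal illustrative sketch (the request path and payload fields follow the OpenAI speech API, the port comes from the compose hunks below, and the voice name is taken from the voice files listed in .gitignore, so treat the exact values as assumptions rather than this repo's documented API), a client call could look like this:

```python
# Illustrative client call against a locally running instance; endpoint path,
# payload fields, and voice name are assumptions based on the OpenAI speech API
# and the files referenced elsewhere in this commit.
import requests

resp = requests.post(
    "http://localhost:8880/v1/audio/speech",
    json={
        "model": "kokoro",
        "voice": "af_bella",  # single voice; the README also mentions inline voice combination
        "input": "Hello from Kokoro!",
        "response_format": "mp3",
    },
    timeout=120,
)
resp.raise_for_status()
with open("speech.mp3", "wb") as f:
    f.write(resp.content)
```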
@@ -29,7 +29,8 @@ The service can be accessed through either the API endpoints or the Gradio web interface
```bash
git clone https://github.com/remsky/Kokoro-FastAPI.git
cd Kokoro-FastAPI
docker compose up --build
docker compose up --build # for GPU
#docker compose -f docker-compose.cpu.yml up --build # for CPU
```
2. Run locally as an OpenAI-Compatible Speech Endpoint
```python
@@ -317,6 +318,53 @@ with open("speech.wav", "wb") as f:
See `examples/phoneme_examples/generate_phonemes.py` for a sample script.
</details>

## Known Issues

<details>
<summary>Linux GPU Permissions</summary>

Some Linux users may encounter GPU permission issues when running as non-root.
Can't guarantee anything, but here are some common solutions, consider your security requirements carefully

### Option 1: Container Groups (Likely the best option)
```yaml
services:
kokoro-tts:
# ... existing config ...
group_add:
- "video"
- "render"
```

### Option 2: Host System Groups
```yaml
services:
kokoro-tts:
# ... existing config ...
user: "${UID}:${GID}"
group_add:
- "video"
```
Note: May require adding host user to groups: `sudo usermod -aG docker,video $USER` and system restart.

### Option 3: Device Permissions (Use with caution)
```yaml
services:
kokoro-tts:
# ... existing config ...
devices:
- /dev/nvidia0:/dev/nvidia0
- /dev/nvidiactl:/dev/nvidiactl
- /dev/nvidia-uvm:/dev/nvidia-uvm
```
⚠️ Warning: Reduces system security. Use only in development environments.

Prerequisites: NVIDIA GPU, drivers, and container toolkit must be properly configured.

Visit [NVIDIA Container Toolkit installation](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/install-guide.html) for more detailed information

</details>

## Model and License

<details open>

@@ -38,48 +38,64 @@ from .text_processing import tokenize, phonemize
# return model.decoder(asr, F0_pred, N_pred, ref_s[:, :128]).squeeze().cpu().numpy()
@torch.no_grad()
def forward(model, tokens, ref_s, speed):
"""Forward pass through the model with light optimizations that preserve output quality"""
"""Forward pass through the model with moderate memory management"""
device = ref_s.device

# Keep original token handling but optimize device placement
try:
# Initial tensor setup with proper device placement
tokens = torch.LongTensor([[0, *tokens, 0]]).to(device)
input_lengths = torch.LongTensor([tokens.shape[-1]]).to(device)
text_mask = length_to_mask(input_lengths).to(device)

# Split and clone reference signals with explicit device placement
s_content = ref_s[:, 128:].clone().to(device)
s_ref = ref_s[:, :128].clone().to(device)

# BERT and encoder pass
bert_dur = model.bert(tokens, attention_mask=(~text_mask).int())
d_en = model.bert_encoder(bert_dur).transpose(-1, -2)

# Split reference signal once for efficiency
s_content = ref_s[:, 128:]
s_ref = ref_s[:, :128]

# Predictor forward pass
d = model.predictor.text_encoder(d_en, s_content, input_lengths, text_mask)
x, _ = model.predictor.lstm(d)

# Duration prediction - keeping original logic
# Duration prediction
duration = model.predictor.duration_proj(x)
duration = torch.sigmoid(duration).sum(axis=-1) / speed
pred_dur = torch.round(duration).clamp(min=1).long()
# Only cleanup large intermediates
del duration, x

# Alignment matrix construction - keeping original approach for quality
pred_aln_trg = torch.zeros(input_lengths, pred_dur.sum().item(), device=device)
# Alignment matrix construction
pred_aln_trg = torch.zeros(input_lengths.item(), pred_dur.sum().item(), device=device)
c_frame = 0
for i in range(pred_aln_trg.size(0)):
pred_aln_trg[i, c_frame : c_frame + pred_dur[0, i].item()] = 1
c_frame += pred_dur[0, i].item()
pred_aln_trg = pred_aln_trg.unsqueeze(0)

# Matrix multiplications - reuse unsqueezed tensor
pred_aln_trg = pred_aln_trg.unsqueeze(0) # Do unsqueeze once
# Matrix multiplications with selective cleanup
en = d.transpose(-1, -2) @ pred_aln_trg
F0_pred, N_pred = model.predictor.F0Ntrain(en, s_content)
del d # Free large intermediate tensor

# Text encoding and final decoding
F0_pred, N_pred = model.predictor.F0Ntrain(en, s_content)
del en # Free large intermediate tensor

# Final text encoding and decoding
t_en = model.text_encoder(tokens, input_lengths, text_mask)
asr = t_en @ pred_aln_trg
del t_en # Free large intermediate tensor

return model.decoder(asr, F0_pred, N_pred, s_ref).squeeze().cpu().numpy()
# Final decoding and transfer to CPU
output = model.decoder(asr, F0_pred, N_pred, s_ref)
result = output.squeeze().cpu().numpy()

return result

finally:
# Let PyTorch handle most cleanup automatically
# Only explicitly free the largest tensors
del pred_aln_trg, asr


# def length_to_mask(lengths):
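For readers skimming the hunk above: the loop builds a hard alignment matrix that expands each token into a block of output frames equal to its predicted duration. A standalone sketch of that idea (independent of the model code, with made-up durations) is:

```python
# Standalone sketch of the alignment-matrix expansion in the diff above:
# token i "owns" pred_dur[i] consecutive output frames.
import torch

pred_dur = torch.tensor([2, 3, 1])  # made-up per-token durations
num_tokens, num_frames = pred_dur.numel(), int(pred_dur.sum())

pred_aln_trg = torch.zeros(num_tokens, num_frames)
c_frame = 0
for i, dur in enumerate(pred_dur.tolist()):
    pred_aln_trg[i, c_frame:c_frame + dur] = 1
    c_frame += dur

# Equivalent vectorized construction, for comparison.
idx = torch.repeat_interleave(torch.arange(num_tokens), pred_dur)
vectorized = torch.zeros(num_tokens, num_frames)
vectorized[idx, torch.arange(num_frames)] = 1
assert torch.equal(pred_aln_trg, vectorized)
```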
@@ -179,7 +195,7 @@ class TTSGPUModel(TTSBaseModel):
def generate_from_tokens(
cls, tokens: list[int], voicepack: torch.Tensor, speed: float
) -> np.ndarray:
"""Generate audio from tokens
"""Generate audio from tokens with moderate memory management

Args:
tokens: Token IDs
@@ -192,10 +208,55 @@ class TTSGPUModel(TTSBaseModel):
if cls._instance is None:
raise RuntimeError("GPU model not initialized")

# Get reference style
ref_s = voicepack[len(tokens)]
try:
device = cls._device

# Check memory pressure
if torch.cuda.is_available():
memory_allocated = torch.cuda.memory_allocated(device) / 1e9 # Convert to GB
if memory_allocated > 2.0: # 2GB limit
logger.info(
f"Memory usage above 2GB threshold:{memory_allocated:.2f}GB "
f"Clearing cache"
)
torch.cuda.empty_cache()
import gc
gc.collect()

# Get reference style with proper device placement
ref_s = voicepack[len(tokens)].clone().to(device)

# Generate audio
audio = forward(cls._instance, tokens, ref_s, speed)

return audio

except RuntimeError as e:
if "out of memory" in str(e):
# On OOM, do a full cleanup and retry
if torch.cuda.is_available():
logger.warning("Out of memory detected, performing full cleanup")
torch.cuda.synchronize()
torch.cuda.empty_cache()
import gc
gc.collect()

# Log memory stats after cleanup
memory_allocated = torch.cuda.memory_allocated(device)
memory_reserved = torch.cuda.memory_reserved(device)
logger.info(
f"Memory after OOM cleanup: "
f"Allocated: {memory_allocated / 1e9:.2f}GB, "
f"Reserved: {memory_reserved / 1e9:.2f}GB"
)

# Retry generation
ref_s = voicepack[len(tokens)].clone().to(device)
audio = forward(cls._instance, tokens, ref_s, speed)
return audio
raise

finally:
# Only synchronize at the top level, no empty_cache
if torch.cuda.is_available():
torch.cuda.synchronize()

@@ -20,17 +20,17 @@ services:
- ONNX_MEMORY_PATTERN=true
- ONNX_ARENA_EXTEND_STRATEGY=kNextPowerOfTwo

# # Gradio UI service [Comment out everything below if you don't need it]
# gradio-ui:
# # image: ghcr.io/remsky/kokoro-fastapi:latest-ui
# # Uncomment below (and comment out above) to build from source instead of using the released image
# # build:
# # context: ./ui
# ports:
# - "7860:7860"
# volumes:
# - ./ui/data:/app/ui/data
# - ./ui/app.py:/app/app.py # Mount app.py for hot reload
# environment:
# - GRADIO_WATCH=True # Enable hot reloading
# - PYTHONUNBUFFERED=1 # Ensure Python output is not buffered
# Gradio UI service [Comment out everything below if you don't need it]
gradio-ui:
# image: ghcr.io/remsky/kokoro-fastapi:latest-ui
# Uncomment below (and comment out above) to build from source instead of using the released image
# # build:
# # context: ./ui
ports:
- "7860:7860"
volumes:
- ./ui/data:/app/ui/data
- ./ui/app.py:/app/app.py # Mount app.py for hot reload
environment:
- GRADIO_WATCH=True # Enable hot reloading
- PYTHONUNBUFFERED=1 # Ensure Python output is not buffered

@@ -1,45 +1,10 @@
services:
# model-fetcher:
# image: datamachines/git-lfs:latest
# environment:
# - SKIP_MODEL_FETCH=${SKIP_MODEL_FETCH:-false}
# volumes:
# - ./Kokoro-82M:/app/Kokoro-82M
# working_dir: /app/Kokoro-82M
# command: >
# sh -c "
# if [ \"$$SKIP_MODEL_FETCH\" = \"true\" ]; then
# echo 'Skipping model fetch...' && touch .cloned;
# else
# rm -f .git/index.lock;
# if [ -z \"$(ls -A .)\" ]; then
# git clone https://huggingface.co/hexgrad/Kokoro-82M .
# touch .cloned;
# else
# rm -f .git/index.lock && \
# git checkout main && \
# git pull origin main && \
# touch .cloned;
# fi;
# fi;
# tail -f /dev/null
# "
# healthcheck:
# test: ["CMD", "test", "-f", ".cloned"]
# interval: 5s
# timeout: 2s
# retries: 300
# start_period: 1s

kokoro-tts:
# image: ghcr.io/remsky/kokoro-fastapi-gpu:latest
# Uncomment below to build from source instead of using the released image
build:
context: ../..
dockerfile: docker/gpu/Dockerfile
volumes:
- ../../api/src:/app/api/src
- ../../Kokoro-82M:/app/Kokoro-82M
- ../../api/src:/app/api/src # Mount src for development
ports:
- "8880:8880"
environment:

@@ -51,21 +16,18 @@ services:
- driver: nvidia
count: 1
capabilities: [gpu]
# depends_on:
# model-fetcher:
# condition: service_healthy

# Gradio UI service [Comment out everything below if you don't need it]
# gradio-ui:
# Gradio UI service
gradio-ui:
# image: ghcr.io/remsky/kokoro-fastapi-ui:latest
# Uncomment below to build from source instead of using the released image
# build:
# context: ./ui
# ports:
# - "7860:7860"
# volumes:
# - ./ui/data:/app/ui/data
# - ./ui/app.py:/app/app.py # Mount app.py for hot reload
# environment:
# - GRADIO_WATCH=1 # Enable hot reloading
# - PYTHONUNBUFFERED=1 # Ensure Python output is not buffered
build:
context: ../../ui
ports:
- "7860:7860"
volumes:
- ../../ui/data:/app/ui/data
- ../../ui/app.py:/app/app.py # Mount app.py for hot reload
environment:
- GRADIO_WATCH=1 # Enable hot reloading
- PYTHONUNBUFFERED=1 # Ensure Python output is not buffered