feat: merge master into core/uv-management for v0.1.0

Major changes:
- Baked model directly into Dockerfile for improved deployment
- Switched to uv for dependency management
- Restructured Docker files into docker/cpu and docker/gpu directories
- Updated configuration for better ONNX performance
remsky 2025-01-13 19:31:44 -07:00
commit 007b1a35e8
8 changed files with 292 additions and 190 deletions


@@ -1,6 +1,5 @@
# Version control
.git
.gitignore
# Python
__pycache__


@@ -1,55 +1,55 @@
name: Sync develop with master
# name: Sync develop with master
on:
push:
branches:
- master
# on:
# push:
# branches:
# - master
jobs:
sync-develop:
runs-on: ubuntu-latest
permissions:
contents: write
issues: write
steps:
- name: Checkout repository
uses: actions/checkout@v4
with:
fetch-depth: 0
ref: develop
# jobs:
# sync-develop:
# runs-on: ubuntu-latest
# permissions:
# contents: write
# issues: write
# steps:
# - name: Checkout repository
# uses: actions/checkout@v4
# with:
# fetch-depth: 0
# ref: develop
- name: Configure Git
run: |
git config user.name "GitHub Actions"
git config user.email "actions@github.com"
# - name: Configure Git
# run: |
# git config user.name "GitHub Actions"
# git config user.email "actions@github.com"
- name: Merge master into develop
run: |
git fetch origin master:master
git merge --no-ff origin/master -m "chore: Merge master into develop branch"
# - name: Merge master into develop
# run: |
# git fetch origin master:master
# git merge --no-ff origin/master -m "chore: Merge master into develop branch"
- name: Push changes
run: |
if ! git push origin develop; then
echo "Failed to push to develop branch"
exit 1
fi
# - name: Push changes
# run: |
# if ! git push origin develop; then
# echo "Failed to push to develop branch"
# exit 1
# fi
- name: Handle Failure
if: failure()
uses: actions/github-script@v7
with:
script: |
const issueBody = `Automatic merge from master to develop failed.
# - name: Handle Failure
# if: failure()
# uses: actions/github-script@v7
# with:
# script: |
# const issueBody = `Automatic merge from master to develop failed.
Please resolve this manually
# Please resolve this manually
Workflow run: ${process.env.GITHUB_SERVER_URL}/${process.env.GITHUB_REPOSITORY}/actions/runs/${process.env.GITHUB_RUN_ID}`;
# Workflow run: ${process.env.GITHUB_SERVER_URL}/${process.env.GITHUB_REPOSITORY}/actions/runs/${process.env.GITHUB_RUN_ID}`;
await github.rest.issues.create({
owner: context.repo.owner,
repo: context.repo.repo,
title: '🔄 Automatic master to develop merge failed',
body: issueBody,
labels: ['merge-failed', 'automation']
});
# await github.rest.issues.create({
# owner: context.repo.owner,
# repo: context.repo.repo,
# title: '🔄 Automatic master to develop merge failed',
# body: issueBody,
# labels: ['merge-failed', 'automation']
# });

.gitignore

@@ -1,33 +1,44 @@
# Version control
.git
output/*
output_audio/*
ui/data/*
*.db
# Python
__pycache__/
*.pyc
*.pyo
*.pyd
*.py[cod]
*$py.class
.Python
.pytest_cache
.coverage
.coveragerc
# Python package build artifacts
*.egg-info/
*.egg
dist/
build/
# Environment
# .env
.venv/
env/
venv/
ENV/
# IDE
.idea/
.vscode/
*.swp
*.swo
# Project specific
# Model files
*.pt
*.pth
*.tar*
Kokoro-82M/*
__pycache__/
.vscode/
env/
.venv/
.Python
.coverage
examples/assorted_checks/benchmarks/output_audio/*
examples/assorted_checks/test_combinations/output/*
examples/assorted_checks/test_openai/output/*
examples/assorted_checks/test_voices/output/*
examples/assorted_checks/test_formats/output/*
examples/assorted_checks/benchmarks/output_audio_stream/*
ui/RepoScreenshot.png
examples/assorted_checks/benchmarks/output_audio_stream_openai/*
# Voice files
api/src/voices/af_bella.pt
api/src/voices/af_nicole.pt
api/src/voices/af_sarah.pt
@@ -39,14 +50,24 @@ api/src/voices/bf_emma.pt
api/src/voices/bf_isabella.pt
api/src/voices/bm_george.pt
api/src/voices/bm_lewis.pt
# Audio files
examples/*.wav
examples/*.pcm
examples/*.mp3
examples/*.flac
examples/*.acc
examples/*.ogg
examples/speech.mp3
examples/phoneme_examples/output/example_1.wav
examples/phoneme_examples/output/example_2.wav
examples/phoneme_examples/output/example_3.wav
# Other project files
Kokoro-82M/
ui/data/
EXTERNAL_UV_DOCUMENTATION*
# Python package build artifacts
*.egg-info/
*.egg
dist/
build/
# Docker
Dockerfile*
docker-compose*


@@ -2,12 +2,23 @@
Notable changes to this project will be documented in this file.
## [v0.0.6] - 2025-01-10
## [v0.1.0] - 2025-01-13
### Changed
- Major Docker improvements:
- Baked model directly into Dockerfile for improved deployment reliability
- Switched to uv for dependency management
- Streamlined container builds and reduced image sizes
- Dependency Management:
- Migrated from pip/poetry to uv for faster, more reliable package management
- Added uv.lock for deterministic builds
- Updated dependency resolution strategy
## [v0.0.5post1] - 2025-01-11
### Fixed
- Fixed dependency issues:
- Let PyTorch manage numpy version
- Pin aiofiles to 23.2.1 for Windows compatibility
- Added CI workflow for testing
- Docker image tagging and versioning improvements (-gpu, -cpu, -ui)
- Minor vram management improvements
- Gradio bugfix causing crashes and errant warnings
- Updated GPU and UI container configurations
## [v0.0.5] - 2025-01-10
### Fixed


@@ -5,7 +5,7 @@
# Kokoro TTS API
[![Tests](https://img.shields.io/badge/tests-117%20passed-darkgreen)]()
[![Coverage](https://img.shields.io/badge/coverage-75%25-darkgreen)]()
[![Tested at Model Commit](https://img.shields.io/badge/last--tested--model--commit-a67f113-blue)](https://huggingface.co/hexgrad/Kokoro-82M/tree/c3b0d86e2a980e027ef71c28819ea02e351c2667) [![Try on Spaces](https://img.shields.io/badge/%F0%9F%A4%97%20Try%20on-Spaces-blue)](https://huggingface.co/spaces/Remsky/Kokoro-TTS-Zero)
[![Tested at Model Commit](https://img.shields.io/badge/last--tested--model--commit-a67f113-blue)](https://huggingface.co/hexgrad/Kokoro-82M/tree/c3b0d86e2a980e027ef71c28819ea02e351c2667) [![Try on Spaces](https://img.shields.io/badge/%F0%9F%A4%97%20Try%20on-Spaces-blue)](https://huggingface.co/spaces/Remsky/Kokoro-TTS-Zero) [![Buy Me A Coffee](https://img.shields.io/badge/BMC-✨☕-gray?style=flat-square)](https://www.buymeacoffee.com/remsky)
Dockerized FastAPI wrapper for [Kokoro-82M](https://huggingface.co/hexgrad/Kokoro-82M) text-to-speech model
- OpenAI-compatible Speech endpoint, with inline voice combination functionality
@@ -29,7 +29,8 @@ The service can be accessed through either the API endpoints or the Gradio web i
```bash
git clone https://github.com/remsky/Kokoro-FastAPI.git
cd Kokoro-FastAPI
docker compose up --build
docker compose up --build # for GPU
#docker compose -f docker-compose.cpu.yml up --build # for CPU
```
2. Run locally as an OpenAI-Compatible Speech Endpoint
```python
@@ -317,6 +318,53 @@ with open("speech.wav", "wb") as f:
See `examples/phoneme_examples/generate_phonemes.py` for a sample script.
</details>
## Known Issues
<details>
<summary>Linux GPU Permissions</summary>
Some Linux users may encounter GPU permission issues when running the container as a non-root user.
No single option is guaranteed to work, but here are some common solutions; consider your security requirements carefully.
### Option 1: Container Groups (Likely the best option)
```yaml
services:
kokoro-tts:
# ... existing config ...
group_add:
- "video"
- "render"
```
### Option 2: Host System Groups
```yaml
services:
kokoro-tts:
# ... existing config ...
user: "${UID}:${GID}"
group_add:
- "video"
```
Note: This may require adding the host user to the relevant groups (`sudo usermod -aG docker,video $USER`) and restarting the system.
### Option 3: Device Permissions (Use with caution)
```yaml
services:
kokoro-tts:
# ... existing config ...
devices:
- /dev/nvidia0:/dev/nvidia0
- /dev/nvidiactl:/dev/nvidiactl
- /dev/nvidia-uvm:/dev/nvidia-uvm
```
⚠️ Warning: Reduces system security. Use only in development environments.
Prerequisites: NVIDIA GPU, drivers, and container toolkit must be properly configured.
Visit the [NVIDIA Container Toolkit installation guide](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/install-guide.html) for more detailed information.
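To confirm the container actually sees the GPU after applying one of these options, a quick check from inside the container can help. A minimal sketch, assuming PyTorch is installed in the image (as it is for this project):
```python
import torch

# Run inside the container, e.g. via `docker compose exec kokoro-tts python`
if torch.cuda.is_available():
    print(f"GPU visible: {torch.cuda.get_device_name(0)}")
else:
    print("No GPU visible; re-check device permissions and the NVIDIA container runtime")
```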
</details>
## Model and License
<details open>


@@ -38,48 +38,64 @@ from .text_processing import tokenize, phonemize
# return model.decoder(asr, F0_pred, N_pred, ref_s[:, :128]).squeeze().cpu().numpy()
@torch.no_grad()
def forward(model, tokens, ref_s, speed):
"""Forward pass through the model with light optimizations that preserve output quality"""
"""Forward pass through the model with moderate memory management"""
device = ref_s.device
# Keep original token handling but optimize device placement
tokens = torch.LongTensor([[0, *tokens, 0]]).to(device)
input_lengths = torch.LongTensor([tokens.shape[-1]]).to(device)
text_mask = length_to_mask(input_lengths).to(device)
try:
# Initial tensor setup with proper device placement
tokens = torch.LongTensor([[0, *tokens, 0]]).to(device)
input_lengths = torch.LongTensor([tokens.shape[-1]]).to(device)
text_mask = length_to_mask(input_lengths).to(device)
# BERT and encoder pass
bert_dur = model.bert(tokens, attention_mask=(~text_mask).int())
d_en = model.bert_encoder(bert_dur).transpose(-1, -2)
# Split and clone reference signals with explicit device placement
s_content = ref_s[:, 128:].clone().to(device)
s_ref = ref_s[:, :128].clone().to(device)
# Split reference signal once for efficiency
s_content = ref_s[:, 128:]
s_ref = ref_s[:, :128]
# BERT and encoder pass
bert_dur = model.bert(tokens, attention_mask=(~text_mask).int())
d_en = model.bert_encoder(bert_dur).transpose(-1, -2)
# Predictor forward pass
d = model.predictor.text_encoder(d_en, s_content, input_lengths, text_mask)
x, _ = model.predictor.lstm(d)
# Predictor forward pass
d = model.predictor.text_encoder(d_en, s_content, input_lengths, text_mask)
x, _ = model.predictor.lstm(d)
# Duration prediction - keeping original logic
duration = model.predictor.duration_proj(x)
duration = torch.sigmoid(duration).sum(axis=-1) / speed
pred_dur = torch.round(duration).clamp(min=1).long()
# Duration prediction
duration = model.predictor.duration_proj(x)
duration = torch.sigmoid(duration).sum(axis=-1) / speed
pred_dur = torch.round(duration).clamp(min=1).long()
# Only cleanup large intermediates
del duration, x
# Alignment matrix construction - keeping original approach for quality
pred_aln_trg = torch.zeros(input_lengths, pred_dur.sum().item(), device=device)
c_frame = 0
for i in range(pred_aln_trg.size(0)):
pred_aln_trg[i, c_frame : c_frame + pred_dur[0, i].item()] = 1
c_frame += pred_dur[0, i].item()
# Alignment matrix construction
pred_aln_trg = torch.zeros(input_lengths.item(), pred_dur.sum().item(), device=device)
c_frame = 0
for i in range(pred_aln_trg.size(0)):
pred_aln_trg[i, c_frame : c_frame + pred_dur[0, i].item()] = 1
c_frame += pred_dur[0, i].item()
pred_aln_trg = pred_aln_trg.unsqueeze(0)
# Matrix multiplications - reuse unsqueezed tensor
pred_aln_trg = pred_aln_trg.unsqueeze(0) # Do unsqueeze once
en = d.transpose(-1, -2) @ pred_aln_trg
F0_pred, N_pred = model.predictor.F0Ntrain(en, s_content)
# Matrix multiplications with selective cleanup
en = d.transpose(-1, -2) @ pred_aln_trg
del d # Free large intermediate tensor
# Text encoding and final decoding
t_en = model.text_encoder(tokens, input_lengths, text_mask)
asr = t_en @ pred_aln_trg
F0_pred, N_pred = model.predictor.F0Ntrain(en, s_content)
del en # Free large intermediate tensor
return model.decoder(asr, F0_pred, N_pred, s_ref).squeeze().cpu().numpy()
# Final text encoding and decoding
t_en = model.text_encoder(tokens, input_lengths, text_mask)
asr = t_en @ pred_aln_trg
del t_en # Free large intermediate tensor
# Final decoding and transfer to CPU
output = model.decoder(asr, F0_pred, N_pred, s_ref)
result = output.squeeze().cpu().numpy()
return result
finally:
# Let PyTorch handle most cleanup automatically
# Only explicitly free the largest tensors
del pred_aln_trg, asr
# def length_to_mask(lengths):
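The alignment-matrix loop in the hunk above expands each input token into its predicted number of output frames. A tiny standalone illustration of the same idea, with made-up durations and placeholder names rather than the module's actual tensors:
```python
import torch

# Hypothetical per-token frame counts: token 0 -> 2 frames, token 1 -> 3, token 2 -> 1
pred_dur = torch.tensor([2, 3, 1])
total_frames = int(pred_dur.sum())

# Binary token-to-frame alignment matrix, built the same way as pred_aln_trg above
aln = torch.zeros(len(pred_dur), total_frames)
frame = 0
for i, dur in enumerate(pred_dur.tolist()):
    aln[i, frame : frame + dur] = 1
    frame += dur

print(aln)
# tensor([[1., 1., 0., 0., 0., 0.],
#         [0., 0., 1., 1., 1., 0.],
#         [0., 0., 0., 0., 0., 1.]])
```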
@@ -179,7 +195,7 @@ class TTSGPUModel(TTSBaseModel):
def generate_from_tokens(
cls, tokens: list[int], voicepack: torch.Tensor, speed: float
) -> np.ndarray:
"""Generate audio from tokens
"""Generate audio from tokens with moderate memory management
Args:
tokens: Token IDs
@@ -192,10 +208,55 @@ class TTSGPUModel(TTSBaseModel):
if cls._instance is None:
raise RuntimeError("GPU model not initialized")
# Get reference style
ref_s = voicepack[len(tokens)]
try:
device = cls._device
# Generate audio
audio = forward(cls._instance, tokens, ref_s, speed)
# Check memory pressure
if torch.cuda.is_available():
memory_allocated = torch.cuda.memory_allocated(device) / 1e9 # Convert to GB
if memory_allocated > 2.0: # 2GB limit
logger.info(
f"Memory usage above 2GB threshold:{memory_allocated:.2f}GB "
f"Clearing cache"
)
torch.cuda.empty_cache()
import gc
gc.collect()
return audio
# Get reference style with proper device placement
ref_s = voicepack[len(tokens)].clone().to(device)
# Generate audio
audio = forward(cls._instance, tokens, ref_s, speed)
return audio
except RuntimeError as e:
if "out of memory" in str(e):
# On OOM, do a full cleanup and retry
if torch.cuda.is_available():
logger.warning("Out of memory detected, performing full cleanup")
torch.cuda.synchronize()
torch.cuda.empty_cache()
import gc
gc.collect()
# Log memory stats after cleanup
memory_allocated = torch.cuda.memory_allocated(device)
memory_reserved = torch.cuda.memory_reserved(device)
logger.info(
f"Memory after OOM cleanup: "
f"Allocated: {memory_allocated / 1e9:.2f}GB, "
f"Reserved: {memory_reserved / 1e9:.2f}GB"
)
# Retry generation
ref_s = voicepack[len(tokens)].clone().to(device)
audio = forward(cls._instance, tokens, ref_s, speed)
return audio
raise
finally:
# Only synchronize at the top level, no empty_cache
if torch.cuda.is_available():
torch.cuda.synchronize()
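Condensed, the changes above amount to a memory-pressure check plus a single retry after an out-of-memory error. The sketch below captures that pattern rather than the file's exact control flow; `run_forward` is a hypothetical stand-in for the model call, and the 2 GB threshold is taken from the diff:
```python
import gc
import torch

MEMORY_THRESHOLD_GB = 2.0  # threshold used in the diff above

def generate_with_memory_guard(run_forward, device):
    """Run a GPU generation call, clearing the CUDA cache under pressure and retrying once on OOM."""
    if torch.cuda.is_available():
        allocated_gb = torch.cuda.memory_allocated(device) / 1e9
        if allocated_gb > MEMORY_THRESHOLD_GB:
            torch.cuda.empty_cache()
            gc.collect()
    try:
        return run_forward()
    except RuntimeError as e:
        if "out of memory" not in str(e) or not torch.cuda.is_available():
            raise
        # Full cleanup, then one retry
        torch.cuda.synchronize()
        torch.cuda.empty_cache()
        gc.collect()
        return run_forward()
    finally:
        # Mirror the diff: synchronize at the top level, but skip empty_cache here
        if torch.cuda.is_available():
            torch.cuda.synchronize()
```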


@@ -20,17 +20,17 @@ services:
- ONNX_MEMORY_PATTERN=true
- ONNX_ARENA_EXTEND_STRATEGY=kNextPowerOfTwo
# # Gradio UI service [Comment out everything below if you don't need it]
# gradio-ui:
# # image: ghcr.io/remsky/kokoro-fastapi:latest-ui
# # Uncomment below (and comment out above) to build from source instead of using the released image
# # build:
# # context: ./ui
# ports:
# - "7860:7860"
# volumes:
# - ./ui/data:/app/ui/data
# - ./ui/app.py:/app/app.py # Mount app.py for hot reload
# environment:
# - GRADIO_WATCH=True # Enable hot reloading
# - PYTHONUNBUFFERED=1 # Ensure Python output is not buffered
# Gradio UI service [Comment out everything below if you don't need it]
gradio-ui:
# image: ghcr.io/remsky/kokoro-fastapi:latest-ui
# Uncomment below (and comment out above) to build from source instead of using the released image
# build:
# context: ./ui
ports:
- "7860:7860"
volumes:
- ./ui/data:/app/ui/data
- ./ui/app.py:/app/app.py # Mount app.py for hot reload
environment:
- GRADIO_WATCH=True # Enable hot reloading
- PYTHONUNBUFFERED=1 # Ensure Python output is not buffered
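The ONNX_* environment variables in the hunk above presumably feed onnxruntime session and provider settings when the ONNX path builds its inference session. The mapping below is an assumption for illustration, not code from this repository; `model.onnx` is a placeholder path:
```python
import os
import onnxruntime as ort

sess_options = ort.SessionOptions()
# ONNX_MEMORY_PATTERN -> memory-pattern optimization
sess_options.enable_mem_pattern = os.getenv("ONNX_MEMORY_PATTERN", "true").lower() == "true"

# ONNX_ARENA_EXTEND_STRATEGY -> arena_extend_strategy provider option; shown here on the
# CUDA provider, where it is documented. Whether the CPU provider honors it is an assumption.
providers = [
    ("CUDAExecutionProvider",
     {"arena_extend_strategy": os.getenv("ONNX_ARENA_EXTEND_STRATEGY", "kNextPowerOfTwo")}),
    "CPUExecutionProvider",
]

session = ort.InferenceSession("model.onnx", sess_options, providers=providers)
```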


@@ -1,45 +1,10 @@
services:
# model-fetcher:
# image: datamachines/git-lfs:latest
# environment:
# - SKIP_MODEL_FETCH=${SKIP_MODEL_FETCH:-false}
# volumes:
# - ./Kokoro-82M:/app/Kokoro-82M
# working_dir: /app/Kokoro-82M
# command: >
# sh -c "
# if [ \"$$SKIP_MODEL_FETCH\" = \"true\" ]; then
# echo 'Skipping model fetch...' && touch .cloned;
# else
# rm -f .git/index.lock;
# if [ -z \"$(ls -A .)\" ]; then
# git clone https://huggingface.co/hexgrad/Kokoro-82M .
# touch .cloned;
# else
# rm -f .git/index.lock && \
# git checkout main && \
# git pull origin main && \
# touch .cloned;
# fi;
# fi;
# tail -f /dev/null
# "
# healthcheck:
# test: ["CMD", "test", "-f", ".cloned"]
# interval: 5s
# timeout: 2s
# retries: 300
# start_period: 1s
kokoro-tts:
# image: ghcr.io/remsky/kokoro-fastapi-gpu:latest
# Uncomment below to build from source instead of using the released image
build:
context: ../..
dockerfile: docker/gpu/Dockerfile
volumes:
- ../../api/src:/app/api/src
- ../../Kokoro-82M:/app/Kokoro-82M
- ../../api/src:/app/api/src # Mount src for development
ports:
- "8880:8880"
environment:
@@ -51,21 +16,18 @@ services:
- driver: nvidia
count: 1
capabilities: [gpu]
# depends_on:
# model-fetcher:
# condition: service_healthy
# Gradio UI service [Comment out everything below if you don't need it]
# gradio-ui:
# Gradio UI service
gradio-ui:
# image: ghcr.io/remsky/kokoro-fastapi-ui:latest
# Uncomment below to build from source instead of using the released image
# build:
# context: ./ui
# ports:
# - "7860:7860"
# volumes:
# - ./ui/data:/app/ui/data
# - ./ui/app.py:/app/app.py # Mount app.py for hot reload
# environment:
# - GRADIO_WATCH=1 # Enable hot reloading
# - PYTHONUNBUFFERED=1 # Ensure Python output is not buffered
build:
context: ../../ui
ports:
- "7860:7860"
volumes:
- ../../ui/data:/app/ui/data
- ../../ui/app.py:/app/app.py # Mount app.py for hot reload
environment:
- GRADIO_WATCH=1 # Enable hot reloading
- PYTHONUNBUFFERED=1 # Ensure Python output is not buffered