diff --git a/.dockerignore b/.dockerignore
index b456f25..df5f9db 100644
--- a/.dockerignore
+++ b/.dockerignore
@@ -1,6 +1,5 @@
# Version control
.git
-.gitignore
# Python
__pycache__
diff --git a/.github/workflows/sync-develop.yml b/.github/workflows/sync-develop.yml
index 2e0cfd5..56b881f 100644
--- a/.github/workflows/sync-develop.yml
+++ b/.github/workflows/sync-develop.yml
@@ -1,55 +1,55 @@
-name: Sync develop with master
+# name: Sync develop with master
-on:
- push:
- branches:
- - master
+# on:
+# push:
+# branches:
+# - master
-jobs:
- sync-develop:
- runs-on: ubuntu-latest
- permissions:
- contents: write
- issues: write
- steps:
- - name: Checkout repository
- uses: actions/checkout@v4
- with:
- fetch-depth: 0
- ref: develop
+# jobs:
+# sync-develop:
+# runs-on: ubuntu-latest
+# permissions:
+# contents: write
+# issues: write
+# steps:
+# - name: Checkout repository
+# uses: actions/checkout@v4
+# with:
+# fetch-depth: 0
+# ref: develop
- - name: Configure Git
- run: |
- git config user.name "GitHub Actions"
- git config user.email "actions@github.com"
+# - name: Configure Git
+# run: |
+# git config user.name "GitHub Actions"
+# git config user.email "actions@github.com"
- - name: Merge master into develop
- run: |
- git fetch origin master:master
- git merge --no-ff origin/master -m "chore: Merge master into develop branch"
+# - name: Merge master into develop
+# run: |
+# git fetch origin master:master
+# git merge --no-ff origin/master -m "chore: Merge master into develop branch"
- - name: Push changes
- run: |
- if ! git push origin develop; then
- echo "Failed to push to develop branch"
- exit 1
- fi
+# - name: Push changes
+# run: |
+# if ! git push origin develop; then
+# echo "Failed to push to develop branch"
+# exit 1
+# fi
- - name: Handle Failure
- if: failure()
- uses: actions/github-script@v7
- with:
- script: |
- const issueBody = `Automatic merge from master to develop failed.
+# - name: Handle Failure
+# if: failure()
+# uses: actions/github-script@v7
+# with:
+# script: |
+# const issueBody = `Automatic merge from master to develop failed.
- Please resolve this manually
+# Please resolve this manually
- Workflow run: ${process.env.GITHUB_SERVER_URL}/${process.env.GITHUB_REPOSITORY}/actions/runs/${process.env.GITHUB_RUN_ID}`;
+# Workflow run: ${process.env.GITHUB_SERVER_URL}/${process.env.GITHUB_REPOSITORY}/actions/runs/${process.env.GITHUB_RUN_ID}`;
- await github.rest.issues.create({
- owner: context.repo.owner,
- repo: context.repo.repo,
- title: '🔄 Automatic master to develop merge failed',
- body: issueBody,
- labels: ['merge-failed', 'automation']
- });
+# await github.rest.issues.create({
+# owner: context.repo.owner,
+# repo: context.repo.repo,
+# title: '🔄 Automatic master to develop merge failed',
+# body: issueBody,
+# labels: ['merge-failed', 'automation']
+# });
diff --git a/.gitignore b/.gitignore
index 37d236a..4d8d73c 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,33 +1,44 @@
+# Version control
+.git
-output/*
-output_audio/*
-ui/data/*
-
-*.db
+# Python
+__pycache__/
*.pyc
+*.pyo
+*.pyd
+*.py[cod]
+*$py.class
+.Python
+.pytest_cache
+.coverage
+.coveragerc
+
+# Python package build artifacts
+*.egg-info/
+*.egg
+dist/
+build/
+
+# Environment
+# .env
+.venv/
+env/
+venv/
+ENV/
+
+# IDE
+.idea/
+.vscode/
+*.swp
+*.swo
+
+# Project specific
+# Model files
+*.pt
*.pth
*.tar*
-Kokoro-82M/*
-__pycache__/
-.vscode/
-env/
-.venv/
-.Python
-
-
-.coverage
-
-examples/assorted_checks/benchmarks/output_audio/*
-examples/assorted_checks/test_combinations/output/*
-examples/assorted_checks/test_openai/output/*
-
-examples/assorted_checks/test_voices/output/*
-examples/assorted_checks/test_formats/output/*
-examples/assorted_checks/benchmarks/output_audio_stream/*
-ui/RepoScreenshot.png
-examples/assorted_checks/benchmarks/output_audio_stream_openai/*
-
+# Voice files
api/src/voices/af_bella.pt
api/src/voices/af_nicole.pt
api/src/voices/af_sarah.pt
@@ -39,14 +50,24 @@ api/src/voices/bf_emma.pt
api/src/voices/bf_isabella.pt
api/src/voices/bm_george.pt
api/src/voices/bm_lewis.pt
+
+# Audio files
+examples/*.wav
+examples/*.pcm
+examples/*.mp3
+examples/*.flac
+examples/*.aac
+examples/*.ogg
examples/speech.mp3
examples/phoneme_examples/output/example_1.wav
examples/phoneme_examples/output/example_2.wav
examples/phoneme_examples/output/example_3.wav
+
+# Other project files
+Kokoro-82M/
+ui/data/
EXTERNAL_UV_DOCUMENTATION*
-# Python package build artifacts
-*.egg-info/
-*.egg
-dist/
-build/
+# Docker
+Dockerfile*
+docker-compose*
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 4329f75..c3515d2 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -2,12 +2,23 @@
Notable changes to this project will be documented in this file.
-## [v0.0.6] - 2025-01-10
+## [v0.1.0] - 2025-01-13
+### Changed
+- Major Docker improvements:
+ - Baked model directly into Dockerfile for improved deployment reliability
+ - Switched to uv for dependency management
+ - Streamlined container builds and reduced image sizes
+- Dependency Management:
+ - Migrated from pip/poetry to uv for faster, more reliable package management
+ - Added uv.lock for deterministic builds
+ - Updated dependency resolution strategy
+
+## [v0.0.5post1] - 2025-01-11
### Fixed
-- Fixed dependency issues:
- - Let PyTorch manage numpy version
- - Pin aiofiles to 23.2.1 for Windows compatibility
-- Added CI workflow for testing
+- Docker image tagging and versioning improvements (-gpu, -cpu, -ui)
+- Minor VRAM management improvements
+- Fixed a Gradio bug that caused crashes and errant warnings
+- Updated GPU and UI container configurations
## [v0.0.5] - 2025-01-10
### Fixed
diff --git a/README.md b/README.md
index 6a3b673..fddea39 100644
--- a/README.md
+++ b/README.md
@@ -5,7 +5,7 @@
# Kokoro TTS API
[]()
[]()
-[](https://huggingface.co/hexgrad/Kokoro-82M/tree/c3b0d86e2a980e027ef71c28819ea02e351c2667) [](https://huggingface.co/spaces/Remsky/Kokoro-TTS-Zero)
+[](https://huggingface.co/hexgrad/Kokoro-82M/tree/c3b0d86e2a980e027ef71c28819ea02e351c2667) [](https://huggingface.co/spaces/Remsky/Kokoro-TTS-Zero) [](https://www.buymeacoffee.com/remsky)
Dockerized FastAPI wrapper for [Kokoro-82M](https://huggingface.co/hexgrad/Kokoro-82M) text-to-speech model
- OpenAI-compatible Speech endpoint, with inline voice combination functionality
@@ -29,7 +29,8 @@ The service can be accessed through either the API endpoints or the Gradio web i
```bash
git clone https://github.com/remsky/Kokoro-FastAPI.git
cd Kokoro-FastAPI
- docker compose up --build
+ docker compose up --build # for GPU
+ #docker compose -f docker-compose.cpu.yml up --build # for CPU
```
2. Run locally as an OpenAI-Compatible Speech Endpoint
```python
@@ -317,6 +318,53 @@ with open("speech.wav", "wb") as f:
See `examples/phoneme_examples/generate_phonemes.py` for a sample script.
+## Known Issues
+
+**Linux GPU Permissions**
+
+Some Linux users may encounter GPU permission issues when running the container as a non-root user.
+There is no guaranteed fix, but the options below cover the common cases; consider your security requirements carefully before applying any of them.
+
+### Option 1: Container Groups (Likely the best option)
+```yaml
+services:
+ kokoro-tts:
+ # ... existing config ...
+ group_add:
+ - "video"
+ - "render"
+```
+
+### Option 2: Host System Groups
+```yaml
+services:
+ kokoro-tts:
+ # ... existing config ...
+ user: "${UID}:${GID}"
+ group_add:
+ - "video"
+```
+Note: This may require adding the host user to the relevant groups (`sudo usermod -aG docker,video $USER`) followed by logging out and back in, or a system restart.
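+
+Note that `${UID}` and `${GID}` are not automatically visible to Docker Compose in every shell. A minimal sketch for supplying them (assuming Compose reads a `.env` file sitting next to the compose file you run):
+```bash
+# Write the current user/group IDs where Docker Compose will pick them up
+echo "UID=$(id -u)" >> .env
+echo "GID=$(id -g)" >> .env
+docker compose up --build
+```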
+
+### Option 3: Device Permissions (Use with caution)
+```yaml
+services:
+ kokoro-tts:
+ # ... existing config ...
+ devices:
+ - /dev/nvidia0:/dev/nvidia0
+ - /dev/nvidiactl:/dev/nvidiactl
+ - /dev/nvidia-uvm:/dev/nvidia-uvm
+```
+⚠️ Warning: Reduces system security. Use only in development environments.
+
+Prerequisites: NVIDIA GPU, drivers, and container toolkit must be properly configured.
+
+Visit the [NVIDIA Container Toolkit installation guide](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/install-guide.html) for more detailed information.
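+
+To sanity-check that the toolkit itself works before debugging container permissions, a minimal test (the CUDA image tag is only an example; pick one matching your installed driver) is:
+```bash
+# Should print the same GPU table as running nvidia-smi on the host
+docker run --rm --gpus all nvidia/cuda:12.3.1-base-ubuntu22.04 nvidia-smi
+```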
+
## Model and License
diff --git a/api/src/services/tts_gpu.py b/api/src/services/tts_gpu.py
index 87208e5..ca01b84 100644
--- a/api/src/services/tts_gpu.py
+++ b/api/src/services/tts_gpu.py
@@ -38,48 +38,64 @@ from .text_processing import tokenize, phonemize
# return model.decoder(asr, F0_pred, N_pred, ref_s[:, :128]).squeeze().cpu().numpy()
@torch.no_grad()
def forward(model, tokens, ref_s, speed):
- """Forward pass through the model with light optimizations that preserve output quality"""
+ """Forward pass through the model with moderate memory management"""
device = ref_s.device
+
+ try:
+ # Initial tensor setup with proper device placement
+ tokens = torch.LongTensor([[0, *tokens, 0]]).to(device)
+ input_lengths = torch.LongTensor([tokens.shape[-1]]).to(device)
+ text_mask = length_to_mask(input_lengths).to(device)
- # Keep original token handling but optimize device placement
- tokens = torch.LongTensor([[0, *tokens, 0]]).to(device)
- input_lengths = torch.LongTensor([tokens.shape[-1]]).to(device)
- text_mask = length_to_mask(input_lengths).to(device)
+ # Split and clone reference signals with explicit device placement
+ s_content = ref_s[:, 128:].clone().to(device)
+ s_ref = ref_s[:, :128].clone().to(device)
- # BERT and encoder pass
- bert_dur = model.bert(tokens, attention_mask=(~text_mask).int())
- d_en = model.bert_encoder(bert_dur).transpose(-1, -2)
+ # BERT and encoder pass
+ bert_dur = model.bert(tokens, attention_mask=(~text_mask).int())
+ d_en = model.bert_encoder(bert_dur).transpose(-1, -2)
- # Split reference signal once for efficiency
- s_content = ref_s[:, 128:]
- s_ref = ref_s[:, :128]
+ # Predictor forward pass
+ d = model.predictor.text_encoder(d_en, s_content, input_lengths, text_mask)
+ x, _ = model.predictor.lstm(d)
- # Predictor forward pass
- d = model.predictor.text_encoder(d_en, s_content, input_lengths, text_mask)
- x, _ = model.predictor.lstm(d)
+ # Duration prediction
+ duration = model.predictor.duration_proj(x)
+ duration = torch.sigmoid(duration).sum(axis=-1) / speed
+ pred_dur = torch.round(duration).clamp(min=1).long()
+ # Only cleanup large intermediates
+ del duration, x
- # Duration prediction - keeping original logic
- duration = model.predictor.duration_proj(x)
- duration = torch.sigmoid(duration).sum(axis=-1) / speed
- pred_dur = torch.round(duration).clamp(min=1).long()
+ # Alignment matrix construction
+ pred_aln_trg = torch.zeros(input_lengths.item(), pred_dur.sum().item(), device=device)
+ c_frame = 0
+ for i in range(pred_aln_trg.size(0)):
+ pred_aln_trg[i, c_frame : c_frame + pred_dur[0, i].item()] = 1
+ c_frame += pred_dur[0, i].item()
+ pred_aln_trg = pred_aln_trg.unsqueeze(0)
- # Alignment matrix construction - keeping original approach for quality
- pred_aln_trg = torch.zeros(input_lengths, pred_dur.sum().item(), device=device)
- c_frame = 0
- for i in range(pred_aln_trg.size(0)):
- pred_aln_trg[i, c_frame : c_frame + pred_dur[0, i].item()] = 1
- c_frame += pred_dur[0, i].item()
+ # Matrix multiplications with selective cleanup
+ en = d.transpose(-1, -2) @ pred_aln_trg
+ del d # Free large intermediate tensor
+
+ F0_pred, N_pred = model.predictor.F0Ntrain(en, s_content)
+ del en # Free large intermediate tensor
- # Matrix multiplications - reuse unsqueezed tensor
- pred_aln_trg = pred_aln_trg.unsqueeze(0) # Do unsqueeze once
- en = d.transpose(-1, -2) @ pred_aln_trg
- F0_pred, N_pred = model.predictor.F0Ntrain(en, s_content)
+ # Final text encoding and decoding
+ t_en = model.text_encoder(tokens, input_lengths, text_mask)
+ asr = t_en @ pred_aln_trg
+ del t_en # Free large intermediate tensor
- # Text encoding and final decoding
- t_en = model.text_encoder(tokens, input_lengths, text_mask)
- asr = t_en @ pred_aln_trg
-
- return model.decoder(asr, F0_pred, N_pred, s_ref).squeeze().cpu().numpy()
+ # Final decoding and transfer to CPU
+ output = model.decoder(asr, F0_pred, N_pred, s_ref)
+ result = output.squeeze().cpu().numpy()
+
+ return result
+
+    finally:
+        # Let PyTorch handle most cleanup automatically; only explicitly free
+        # the largest tensors, guarded so an early failure in the try block
+        # cannot raise UnboundLocalError and mask the original exception
+        try:
+            del pred_aln_trg, asr
+        except UnboundLocalError:
+            pass
# def length_to_mask(lengths):
@@ -179,7 +195,7 @@ class TTSGPUModel(TTSBaseModel):
def generate_from_tokens(
cls, tokens: list[int], voicepack: torch.Tensor, speed: float
) -> np.ndarray:
- """Generate audio from tokens
+ """Generate audio from tokens with moderate memory management
Args:
tokens: Token IDs
@@ -192,10 +208,55 @@ class TTSGPUModel(TTSBaseModel):
if cls._instance is None:
raise RuntimeError("GPU model not initialized")
- # Get reference style
- ref_s = voicepack[len(tokens)]
-
- # Generate audio
- audio = forward(cls._instance, tokens, ref_s, speed)
-
- return audio
+ try:
+ device = cls._device
+
+ # Check memory pressure
+ if torch.cuda.is_available():
+ memory_allocated = torch.cuda.memory_allocated(device) / 1e9 # Convert to GB
+ if memory_allocated > 2.0: # 2GB limit
+ logger.info(
+                        f"Memory usage above 2GB threshold: {memory_allocated:.2f}GB, "
+                        f"clearing cache"
+ )
+ torch.cuda.empty_cache()
+ import gc
+ gc.collect()
+
+ # Get reference style with proper device placement
+ ref_s = voicepack[len(tokens)].clone().to(device)
+
+ # Generate audio
+ audio = forward(cls._instance, tokens, ref_s, speed)
+
+ return audio
+
+ except RuntimeError as e:
+ if "out of memory" in str(e):
+ # On OOM, do a full cleanup and retry
+ if torch.cuda.is_available():
+ logger.warning("Out of memory detected, performing full cleanup")
+ torch.cuda.synchronize()
+ torch.cuda.empty_cache()
+ import gc
+ gc.collect()
+
+ # Log memory stats after cleanup
+ memory_allocated = torch.cuda.memory_allocated(device)
+ memory_reserved = torch.cuda.memory_reserved(device)
+ logger.info(
+ f"Memory after OOM cleanup: "
+ f"Allocated: {memory_allocated / 1e9:.2f}GB, "
+ f"Reserved: {memory_reserved / 1e9:.2f}GB"
+ )
+
+ # Retry generation
+ ref_s = voicepack[len(tokens)].clone().to(device)
+ audio = forward(cls._instance, tokens, ref_s, speed)
+ return audio
+ raise
+
+ finally:
+ # Only synchronize at the top level, no empty_cache
+ if torch.cuda.is_available():
+ torch.cuda.synchronize()
diff --git a/docker/cpu/docker-compose.yml b/docker/cpu/docker-compose.yml
index f22704e..fbe0436 100644
--- a/docker/cpu/docker-compose.yml
+++ b/docker/cpu/docker-compose.yml
@@ -20,17 +20,17 @@ services:
- ONNX_MEMORY_PATTERN=true
- ONNX_ARENA_EXTEND_STRATEGY=kNextPowerOfTwo
- # # Gradio UI service [Comment out everything below if you don't need it]
- # gradio-ui:
- # # image: ghcr.io/remsky/kokoro-fastapi:latest-ui
- # # Uncomment below (and comment out above) to build from source instead of using the released image
- # # build:
- # # context: ./ui
- # ports:
- # - "7860:7860"
- # volumes:
- # - ./ui/data:/app/ui/data
- # - ./ui/app.py:/app/app.py # Mount app.py for hot reload
- # environment:
- # - GRADIO_WATCH=True # Enable hot reloading
- # - PYTHONUNBUFFERED=1 # Ensure Python output is not buffered
+ # Gradio UI service [Comment out everything below if you don't need it]
+ gradio-ui:
+ # image: ghcr.io/remsky/kokoro-fastapi:latest-ui
+ # Uncomment below (and comment out above) to build from source instead of using the released image
+ # build:
+ # context: ./ui
+ ports:
+ - "7860:7860"
+ volumes:
+ - ./ui/data:/app/ui/data
+ - ./ui/app.py:/app/app.py # Mount app.py for hot reload
+ environment:
+ - GRADIO_WATCH=True # Enable hot reloading
+ - PYTHONUNBUFFERED=1 # Ensure Python output is not buffered
diff --git a/docker/gpu/docker-compose.yml b/docker/gpu/docker-compose.yml
index 7256a5c..61852cd 100644
--- a/docker/gpu/docker-compose.yml
+++ b/docker/gpu/docker-compose.yml
@@ -1,45 +1,10 @@
services:
- # model-fetcher:
- # image: datamachines/git-lfs:latest
- # environment:
- # - SKIP_MODEL_FETCH=${SKIP_MODEL_FETCH:-false}
- # volumes:
- # - ./Kokoro-82M:/app/Kokoro-82M
- # working_dir: /app/Kokoro-82M
- # command: >
- # sh -c "
- # if [ \"$$SKIP_MODEL_FETCH\" = \"true\" ]; then
- # echo 'Skipping model fetch...' && touch .cloned;
- # else
- # rm -f .git/index.lock;
- # if [ -z \"$(ls -A .)\" ]; then
- # git clone https://huggingface.co/hexgrad/Kokoro-82M .
- # touch .cloned;
- # else
- # rm -f .git/index.lock && \
- # git checkout main && \
- # git pull origin main && \
- # touch .cloned;
- # fi;
- # fi;
- # tail -f /dev/null
- # "
- # healthcheck:
- # test: ["CMD", "test", "-f", ".cloned"]
- # interval: 5s
- # timeout: 2s
- # retries: 300
- # start_period: 1s
-
kokoro-tts:
- # image: ghcr.io/remsky/kokoro-fastapi-gpu:latest
- # Uncomment below to build from source instead of using the released image
build:
context: ../..
dockerfile: docker/gpu/Dockerfile
volumes:
- - ../../api/src:/app/api/src
- - ../../Kokoro-82M:/app/Kokoro-82M
+ - ../../api/src:/app/api/src # Mount src for development
ports:
- "8880:8880"
environment:
@@ -51,21 +16,18 @@ services:
- driver: nvidia
count: 1
capabilities: [gpu]
- # depends_on:
- # model-fetcher:
- # condition: service_healthy
- # Gradio UI service [Comment out everything below if you don't need it]
- # gradio-ui:
+ # Gradio UI service
+ gradio-ui:
# image: ghcr.io/remsky/kokoro-fastapi-ui:latest
# Uncomment below to build from source instead of using the released image
- # build:
- # context: ./ui
- # ports:
- # - "7860:7860"
- # volumes:
- # - ./ui/data:/app/ui/data
- # - ./ui/app.py:/app/app.py # Mount app.py for hot reload
- # environment:
- # - GRADIO_WATCH=1 # Enable hot reloading
- # - PYTHONUNBUFFERED=1 # Ensure Python output is not buffered
+ build:
+ context: ../../ui
+ ports:
+ - "7860:7860"
+ volumes:
+ - ../../ui/data:/app/ui/data
+ - ../../ui/app.py:/app/app.py # Mount app.py for hot reload
+ environment:
+ - GRADIO_WATCH=1 # Enable hot reloading
+ - PYTHONUNBUFFERED=1 # Ensure Python output is not buffered