diff --git a/.dockerignore b/.dockerignore
index b456f25..df5f9db 100644
--- a/.dockerignore
+++ b/.dockerignore
@@ -1,6 +1,5 @@
 # Version control
 .git
-.gitignore
 
 # Python
 __pycache__
diff --git a/.github/workflows/sync-develop.yml b/.github/workflows/sync-develop.yml
index 2e0cfd5..56b881f 100644
--- a/.github/workflows/sync-develop.yml
+++ b/.github/workflows/sync-develop.yml
@@ -1,55 +1,55 @@
-name: Sync develop with master
+# name: Sync develop with master
 
-on:
-  push:
-    branches:
-      - master
+# on:
+#   push:
+#     branches:
+#       - master
 
-jobs:
-  sync-develop:
-    runs-on: ubuntu-latest
-    permissions:
-      contents: write
-      issues: write
-    steps:
-      - name: Checkout repository
-        uses: actions/checkout@v4
-        with:
-          fetch-depth: 0
-          ref: develop
+# jobs:
+#   sync-develop:
+#     runs-on: ubuntu-latest
+#     permissions:
+#       contents: write
+#       issues: write
+#     steps:
+#       - name: Checkout repository
+#         uses: actions/checkout@v4
+#         with:
+#           fetch-depth: 0
+#           ref: develop
 
-      - name: Configure Git
-        run: |
-          git config user.name "GitHub Actions"
-          git config user.email "actions@github.com"
+#       - name: Configure Git
+#         run: |
+#           git config user.name "GitHub Actions"
+#           git config user.email "actions@github.com"
 
-      - name: Merge master into develop
-        run: |
-          git fetch origin master:master
-          git merge --no-ff origin/master -m "chore: Merge master into develop branch"
+#       - name: Merge master into develop
+#         run: |
+#           git fetch origin master:master
+#           git merge --no-ff origin/master -m "chore: Merge master into develop branch"
 
-      - name: Push changes
-        run: |
-          if ! git push origin develop; then
-            echo "Failed to push to develop branch"
-            exit 1
-          fi
+#       - name: Push changes
+#         run: |
+#           if ! git push origin develop; then
+#             echo "Failed to push to develop branch"
+#             exit 1
+#           fi
 
-      - name: Handle Failure
-        if: failure()
-        uses: actions/github-script@v7
-        with:
-          script: |
-            const issueBody = `Automatic merge from master to develop failed.
+#       - name: Handle Failure
+#         if: failure()
+#         uses: actions/github-script@v7
+#         with:
+#           script: |
+#             const issueBody = `Automatic merge from master to develop failed.
 
-            Please resolve this manually
+#             Please resolve this manually
 
-            Workflow run: ${process.env.GITHUB_SERVER_URL}/${process.env.GITHUB_REPOSITORY}/actions/runs/${process.env.GITHUB_RUN_ID}`;
+#             Workflow run: ${process.env.GITHUB_SERVER_URL}/${process.env.GITHUB_REPOSITORY}/actions/runs/${process.env.GITHUB_RUN_ID}`;
 
-            await github.rest.issues.create({
-              owner: context.repo.owner,
-              repo: context.repo.repo,
-              title: '🔄 Automatic master to develop merge failed',
-              body: issueBody,
-              labels: ['merge-failed', 'automation']
-            });
+#             await github.rest.issues.create({
+#               owner: context.repo.owner,
+#               repo: context.repo.repo,
+#               title: '🔄 Automatic master to develop merge failed',
+#               body: issueBody,
+#               labels: ['merge-failed', 'automation']
+#             });
diff --git a/.gitignore b/.gitignore
index 37d236a..4d8d73c 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,33 +1,44 @@
+# Version control
+.git
-output/*
-output_audio/*
-ui/data/*
-
-*.db
+# Python
+__pycache__/
 *.pyc
+*.pyo
+*.pyd
+*.py[cod]
+*$py.class
+.Python
+.pytest_cache
+.coverage
+.coveragerc
+
+# Python package build artifacts
+*.egg-info/
+*.egg
+dist/
+build/
+
+# Environment
+# .env
+.venv/
+env/
+venv/
+ENV/
+
+# IDE
+.idea/
+.vscode/
+*.swp
+*.swo
+
+# Project specific
+# Model files
+*.pt
 *.pth
 *.tar*
-Kokoro-82M/*
-__pycache__/
-.vscode/
-env/
-.venv/
-.Python
-
-
-.coverage
-
-examples/assorted_checks/benchmarks/output_audio/*
-examples/assorted_checks/test_combinations/output/*
-examples/assorted_checks/test_openai/output/*
-
-examples/assorted_checks/test_voices/output/*
-examples/assorted_checks/test_formats/output/*
-examples/assorted_checks/benchmarks/output_audio_stream/*
-ui/RepoScreenshot.png
-examples/assorted_checks/benchmarks/output_audio_stream_openai/*
-
+# Voice files
 api/src/voices/af_bella.pt
 api/src/voices/af_nicole.pt
 api/src/voices/af_sarah.pt
@@ -39,14 +50,24 @@ api/src/voices/bf_emma.pt
 api/src/voices/bf_isabella.pt
 api/src/voices/bm_george.pt
 api/src/voices/bm_lewis.pt
+
+# Audio files
+examples/*.wav
+examples/*.pcm
+examples/*.mp3
+examples/*.flac
+examples/*.acc
+examples/*.ogg
 examples/speech.mp3
 examples/phoneme_examples/output/example_1.wav
 examples/phoneme_examples/output/example_2.wav
 examples/phoneme_examples/output/example_3.wav
+
+# Other project files
+Kokoro-82M/
+ui/data/
 EXTERNAL_UV_DOCUMENTATION*
 
-# Python package build artifacts
-*.egg-info/
-*.egg
-dist/
-build/
+# Docker
+Dockerfile*
+docker-compose*
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 4329f75..c3515d2 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -2,12 +2,23 @@
 Notable changes to this project will be documented in this file.
 
-## [v0.0.6] - 2025-01-10
+## [v0.1.0] - 2025-01-13
+### Changed
+- Major Docker improvements:
+  - Baked the model directly into the Dockerfile for improved deployment reliability
+  - Switched to uv for dependency management
+  - Streamlined container builds and reduced image sizes
+- Dependency management:
+  - Migrated from pip/poetry to uv for faster, more reliable package management
+  - Added uv.lock for deterministic builds
+  - Updated dependency resolution strategy
+
+## [v0.0.5post1] - 2025-01-11
 ### Fixed
-- Fixed dependency issues:
-  - Let PyTorch manage numpy version
-  - Pin aiofiles to 23.2.1 for Windows compatibility
-- Added CI workflow for testing
+- Docker image tagging and versioning improvements (-gpu, -cpu, -ui)
+- Minor VRAM management improvements
+- Fixed a Gradio bug that caused crashes and errant warnings
+- Updated GPU and UI container configurations
 
 ## [v0.0.5] - 2025-01-10
 ### Fixed
diff --git a/README.md b/README.md
index 6a3b673..fddea39 100644
--- a/README.md
+++ b/README.md
@@ -5,7 +5,7 @@
 # Kokoro TTS API
 [![Tests](https://img.shields.io/badge/tests-117%20passed-darkgreen)]()
 [![Coverage](https://img.shields.io/badge/coverage-75%25-darkgreen)]()
-[![Tested at Model Commit](https://img.shields.io/badge/last--tested--model--commit-a67f113-blue)](https://huggingface.co/hexgrad/Kokoro-82M/tree/c3b0d86e2a980e027ef71c28819ea02e351c2667) [![Try on Spaces](https://img.shields.io/badge/%F0%9F%A4%97%20Try%20on-Spaces-blue)](https://huggingface.co/spaces/Remsky/Kokoro-TTS-Zero)
+[![Tested at Model Commit](https://img.shields.io/badge/last--tested--model--commit-a67f113-blue)](https://huggingface.co/hexgrad/Kokoro-82M/tree/c3b0d86e2a980e027ef71c28819ea02e351c2667) [![Try on Spaces](https://img.shields.io/badge/%F0%9F%A4%97%20Try%20on-Spaces-blue)](https://huggingface.co/spaces/Remsky/Kokoro-TTS-Zero) [![Buy Me A Coffee](https://img.shields.io/badge/BMC-✨☕-gray?style=flat-square)](https://www.buymeacoffee.com/remsky)
 
 Dockerized FastAPI wrapper for [Kokoro-82M](https://huggingface.co/hexgrad/Kokoro-82M) text-to-speech model - OpenAI-compatible Speech endpoint, with inline voice combination functionality
 
@@ -29,7 +29,8 @@ The service can be accessed through either the API endpoints or the Gradio web i
    ```bash
    git clone https://github.com/remsky/Kokoro-FastAPI.git
    cd Kokoro-FastAPI
-   docker compose up --build
+   docker compose up --build # for GPU
+   # docker compose -f docker-compose.cpu.yml up --build # for CPU
    ```
 2. Run locally as an OpenAI-Compatible Speech Endpoint
    ```python
@@ -317,6 +318,53 @@ with open("speech.wav", "wb") as f:
 
 See `examples/phoneme_examples/generate_phonemes.py` for a sample script.
 
+## Known Issues
+
+<details>
+<summary>Linux GPU Permissions</summary>
+
+Some Linux users may encounter GPU permission issues when running the container as a non-root user.
+The options below are common workarounds rather than guaranteed fixes; weigh your security requirements carefully before applying one.
+
+### Option 1: Container Groups (Likely the best option)
+```yaml
+services:
+  kokoro-tts:
+    # ... existing config ...
+    group_add:
+      - "video"
+      - "render"
+```
+
+### Option 2: Host System Groups
+```yaml
+services:
+  kokoro-tts:
+    # ... existing config ...
+    user: "${UID}:${GID}"
+    group_add:
+      - "video"
+```
+Note: this may require adding the host user to the `docker` and `video` groups (`sudo usermod -aG docker,video $USER`) and restarting the system.
+
+### Option 3: Device Permissions (Use with caution)
+```yaml
+services:
+  kokoro-tts:
+    # ... existing config ...
+    devices:
+      - /dev/nvidia0:/dev/nvidia0
+      - /dev/nvidiactl:/dev/nvidiactl
+      - /dev/nvidia-uvm:/dev/nvidia-uvm
+```
+⚠️ Warning: this reduces system security. Use it only in development environments.
+
+Prerequisites: an NVIDIA GPU, the NVIDIA drivers, and the NVIDIA Container Toolkit must be properly configured.
+
+Visit the [NVIDIA Container Toolkit installation guide](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/install-guide.html) for more detailed information.
+
+</details>
+
 ## Model and License
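One practical follow-up to the Known Issues section above: after applying one of the permission options, it is worth confirming that the container actually sees the GPU before debugging anything else. A minimal sketch, assuming PyTorch is available inside the image and using the `kokoro-tts` service name from the compose examples (the `check_gpu.py` file name is a hypothetical helper, not part of the repository):

```python
# check_gpu.py - hypothetical helper to confirm GPU visibility inside the container.
import torch

if torch.cuda.is_available():
    idx = torch.cuda.current_device()
    name = torch.cuda.get_device_name(idx)
    total_gb = torch.cuda.get_device_properties(idx).total_memory / 1e9
    print(f"CUDA available: {name} ({total_gb:.1f} GB)")
else:
    # Points back to the permission options and NVIDIA Container Toolkit prerequisites above.
    print("CUDA not available inside the container")
```

Running it with something like `docker compose exec kokoro-tts python check_gpu.py` and seeing "CUDA not available" suggests the group/device options or the toolkit installation still need attention.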
diff --git a/api/src/services/tts_gpu.py b/api/src/services/tts_gpu.py
index 87208e5..ca01b84 100644
--- a/api/src/services/tts_gpu.py
+++ b/api/src/services/tts_gpu.py
@@ -38,48 +38,64 @@ from .text_processing import tokenize, phonemize
 # return model.decoder(asr, F0_pred, N_pred, ref_s[:, :128]).squeeze().cpu().numpy()
 
 @torch.no_grad()
 def forward(model, tokens, ref_s, speed):
-    """Forward pass through the model with light optimizations that preserve output quality"""
+    """Forward pass through the model with moderate memory management"""
     device = ref_s.device
+
+    try:
+        # Initial tensor setup with proper device placement
+        tokens = torch.LongTensor([[0, *tokens, 0]]).to(device)
+        input_lengths = torch.LongTensor([tokens.shape[-1]]).to(device)
+        text_mask = length_to_mask(input_lengths).to(device)
 
-    # Keep original token handling but optimize device placement
-    tokens = torch.LongTensor([[0, *tokens, 0]]).to(device)
-    input_lengths = torch.LongTensor([tokens.shape[-1]]).to(device)
-    text_mask = length_to_mask(input_lengths).to(device)
+        # Split and clone reference signals with explicit device placement
+        s_content = ref_s[:, 128:].clone().to(device)
+        s_ref = ref_s[:, :128].clone().to(device)
 
-    # BERT and encoder pass
-    bert_dur = model.bert(tokens, attention_mask=(~text_mask).int())
-    d_en = model.bert_encoder(bert_dur).transpose(-1, -2)
+        # BERT and encoder pass
+        bert_dur = model.bert(tokens, attention_mask=(~text_mask).int())
+        d_en = model.bert_encoder(bert_dur).transpose(-1, -2)
 
-    # Split reference signal once for efficiency
-    s_content = ref_s[:, 128:]
-    s_ref = ref_s[:, :128]
+        # Predictor forward pass
+        d = model.predictor.text_encoder(d_en, s_content, input_lengths, text_mask)
+        x, _ = model.predictor.lstm(d)
 
-    # Predictor forward pass
-    d = model.predictor.text_encoder(d_en, s_content, input_lengths, text_mask)
-    x, _ = model.predictor.lstm(d)
+        # Duration prediction
+        duration = model.predictor.duration_proj(x)
+        duration = torch.sigmoid(duration).sum(axis=-1) / speed
+        pred_dur = torch.round(duration).clamp(min=1).long()
+        # Only cleanup large intermediates
+        del duration, x
 
-    # Duration prediction - keeping original logic
-    duration = model.predictor.duration_proj(x)
-    duration = torch.sigmoid(duration).sum(axis=-1) / speed
-    pred_dur = torch.round(duration).clamp(min=1).long()
+        # Alignment matrix construction
+        pred_aln_trg = torch.zeros(input_lengths.item(), pred_dur.sum().item(), device=device)
+        c_frame = 0
+        for i in range(pred_aln_trg.size(0)):
+            pred_aln_trg[i, c_frame : c_frame + pred_dur[0, i].item()] = 1
+            c_frame += pred_dur[0, i].item()
+        pred_aln_trg = pred_aln_trg.unsqueeze(0)
 
-    # Alignment matrix construction - keeping original approach for quality
-    pred_aln_trg = torch.zeros(input_lengths, pred_dur.sum().item(), device=device)
-    c_frame = 0
-    for i in range(pred_aln_trg.size(0)):
-        pred_aln_trg[i, c_frame : c_frame + pred_dur[0, i].item()] = 1
-        c_frame += pred_dur[0, i].item()
+        # Matrix multiplications with selective cleanup
+        en = d.transpose(-1, -2) @ pred_aln_trg
+        del d  # Free large intermediate tensor
+
+        F0_pred, N_pred = model.predictor.F0Ntrain(en, s_content)
+        del en  # Free large intermediate tensor
 
-    # Matrix multiplications - reuse unsqueezed tensor
-    pred_aln_trg = pred_aln_trg.unsqueeze(0)  # Do unsqueeze once
-    en = d.transpose(-1, -2) @ pred_aln_trg
-    F0_pred, N_pred = model.predictor.F0Ntrain(en, s_content)
+        # Final text encoding and decoding
+        t_en = model.text_encoder(tokens, input_lengths, text_mask)
+        asr = t_en @ pred_aln_trg
+        del t_en  # Free large intermediate tensor
 
-    # Text encoding and final decoding
-    t_en = model.text_encoder(tokens, input_lengths, text_mask)
-    asr = t_en @ pred_aln_trg
-
-    return model.decoder(asr, F0_pred, N_pred, s_ref).squeeze().cpu().numpy()
+        # Final decoding and transfer to CPU
+        output = model.decoder(asr, F0_pred, N_pred, s_ref)
+        result = output.squeeze().cpu().numpy()
+
+        return result
+
+    finally:
+        # Let PyTorch handle most cleanup automatically
+        # Only explicitly free the largest tensors
+        del pred_aln_trg, asr
 
 
 # def length_to_mask(lengths):
@@ -179,7 +195,7 @@ class TTSGPUModel(TTSBaseModel):
     def generate_from_tokens(
         cls, tokens: list[int], voicepack: torch.Tensor, speed: float
    ) -> np.ndarray:
-        """Generate audio from tokens
+        """Generate audio from tokens with moderate memory management
 
         Args:
             tokens: Token IDs
@@ -192,10 +208,55 @@ class TTSGPUModel(TTSBaseModel):
         if cls._instance is None:
             raise RuntimeError("GPU model not initialized")
 
-        # Get reference style
-        ref_s = voicepack[len(tokens)]
-
-        # Generate audio
-        audio = forward(cls._instance, tokens, ref_s, speed)
-
-        return audio
+        try:
+            device = cls._device
+
+            # Check memory pressure
+            if torch.cuda.is_available():
+                memory_allocated = torch.cuda.memory_allocated(device) / 1e9  # Convert to GB
+                if memory_allocated > 2.0:  # 2GB limit
+                    logger.info(
+                        f"Memory usage above 2GB threshold:{memory_allocated:.2f}GB "
+                        f"Clearing cache"
+                    )
+                    torch.cuda.empty_cache()
+                    import gc
+                    gc.collect()
+
+            # Get reference style with proper device placement
+            ref_s = voicepack[len(tokens)].clone().to(device)
+
+            # Generate audio
+            audio = forward(cls._instance, tokens, ref_s, speed)
+
+            return audio
+
+        except RuntimeError as e:
+            if "out of memory" in str(e):
+                # On OOM, do a full cleanup and retry
+                if torch.cuda.is_available():
+                    logger.warning("Out of memory detected, performing full cleanup")
+                    torch.cuda.synchronize()
+                    torch.cuda.empty_cache()
+                    import gc
+                    gc.collect()
+
+                    # Log memory stats after cleanup
+                    memory_allocated = torch.cuda.memory_allocated(device)
+                    memory_reserved = torch.cuda.memory_reserved(device)
+                    logger.info(
+                        f"Memory after OOM cleanup: "
+                        f"Allocated: {memory_allocated / 1e9:.2f}GB, "
+                        f"Reserved: {memory_reserved / 1e9:.2f}GB"
+                    )
+
+                # Retry generation
+                ref_s = voicepack[len(tokens)].clone().to(device)
+                audio = forward(cls._instance, tokens, ref_s, speed)
+                return audio
+            raise
+
+        finally:
+            # Only synchronize at the top level, no empty_cache
+            if torch.cuda.is_available():
+                torch.cuda.synchronize()
diff --git a/docker/cpu/docker-compose.yml b/docker/cpu/docker-compose.yml
index f22704e..fbe0436 100644
--- a/docker/cpu/docker-compose.yml
+++ b/docker/cpu/docker-compose.yml
@@ -20,17 +20,17 @@ services:
       - ONNX_MEMORY_PATTERN=true
       - ONNX_ARENA_EXTEND_STRATEGY=kNextPowerOfTwo
 
-  # # Gradio UI service [Comment out everything below if you don't need it]
-  # gradio-ui:
-  #   # image: ghcr.io/remsky/kokoro-fastapi:latest-ui
-  #   # Uncomment below (and comment out above) to build from source instead of using the released image
-  #   # build:
-  #   #   context: ./ui
-  #   ports:
-  #     - "7860:7860"
-  #   volumes:
-  #     - ./ui/data:/app/ui/data
-  #     - ./ui/app.py:/app/app.py # Mount app.py for hot reload
-  #   environment:
-  #     - GRADIO_WATCH=True # Enable hot reloading
-  #     - PYTHONUNBUFFERED=1 # Ensure Python output is not buffered
+  # Gradio UI service [Comment out everything below if you don't need it]
+  gradio-ui:
+    # image: ghcr.io/remsky/kokoro-fastapi:latest-ui
+    # Uncomment below (and comment out above) to build from source instead of using the released image
+    # build:
+    #   context: ./ui
+    ports:
+      - "7860:7860"
+    volumes:
+      - ./ui/data:/app/ui/data
+      - ./ui/app.py:/app/app.py # Mount app.py for hot reload
+    environment:
+      - GRADIO_WATCH=True # Enable hot reloading
+      - PYTHONUNBUFFERED=1 # Ensure Python output is not buffered
diff --git a/docker/gpu/docker-compose.yml b/docker/gpu/docker-compose.yml
index 7256a5c..61852cd 100644
--- a/docker/gpu/docker-compose.yml
+++ b/docker/gpu/docker-compose.yml
@@ -1,45 +1,10 @@
 services:
-  # model-fetcher:
-  #   image: datamachines/git-lfs:latest
-  #   environment:
-  #     - SKIP_MODEL_FETCH=${SKIP_MODEL_FETCH:-false}
-  #   volumes:
-  #     - ./Kokoro-82M:/app/Kokoro-82M
-  #   working_dir: /app/Kokoro-82M
-  #   command: >
-  #     sh -c "
-  #       if [ \"$$SKIP_MODEL_FETCH\" = \"true\" ]; then
-  #         echo 'Skipping model fetch...' && touch .cloned;
-  #       else
-  #         rm -f .git/index.lock;
-  #         if [ -z \"$(ls -A .)\" ]; then
-  #           git clone https://huggingface.co/hexgrad/Kokoro-82M .
-  #           touch .cloned;
-  #         else
-  #           rm -f .git/index.lock && \
-  #           git checkout main && \
-  #           git pull origin main && \
-  #           touch .cloned;
-  #         fi;
-  #       fi;
-  #       tail -f /dev/null
-  #     "
-  #   healthcheck:
-  #     test: ["CMD", "test", "-f", ".cloned"]
-  #     interval: 5s
-  #     timeout: 2s
-  #     retries: 300
-  #     start_period: 1s
-
   kokoro-tts:
-    # image: ghcr.io/remsky/kokoro-fastapi-gpu:latest
-    # Uncomment below to build from source instead of using the released image
     build:
       context: ../..
       dockerfile: docker/gpu/Dockerfile
     volumes:
-      - ../../api/src:/app/api/src
-      - ../../Kokoro-82M:/app/Kokoro-82M
+      - ../../api/src:/app/api/src # Mount src for development
     ports:
       - "8880:8880"
     environment:
@@ -51,21 +16,18 @@ services:
           - driver: nvidia
             count: 1
             capabilities: [gpu]
-    # depends_on:
-    #   model-fetcher:
-    #     condition: service_healthy
 
-  # Gradio UI service [Comment out everything below if you don't need it]
-  # gradio-ui:
+  # Gradio UI service
+  gradio-ui:
     # image: ghcr.io/remsky/kokoro-fastapi-ui:latest
     # Uncomment below to build from source instead of using the released image
-    # build:
-    #   context: ./ui
-    # ports:
-    #   - "7860:7860"
-    # volumes:
-    #   - ./ui/data:/app/ui/data
-    #   - ./ui/app.py:/app/app.py # Mount app.py for hot reload
-    # environment:
-    #   - GRADIO_WATCH=1 # Enable hot reloading
-    #   - PYTHONUNBUFFERED=1 # Ensure Python output is not buffered
+    build:
+      context: ../../ui
+    ports:
+      - "7860:7860"
+    volumes:
+      - ../../ui/data:/app/ui/data
+      - ../../ui/app.py:/app/app.py # Mount app.py for hot reload
+    environment:
+      - GRADIO_WATCH=1 # Enable hot reloading
+      - PYTHONUNBUFFERED=1 # Ensure Python output is not buffered
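The `tts_gpu.py` changes above combine three ideas: free large intermediate tensors as soon as they are consumed, clear the CUDA cache when allocated memory crosses a roughly 2 GB threshold, and retry generation once after an out-of-memory error. A stripped-down sketch of that threshold-plus-retry pattern, with a hypothetical `run_inference` callable standing in for the model forward pass (this is not the repository's actual API):

```python
import gc
import torch


def generate_with_retry(run_inference, *args, threshold_gb: float = 2.0):
    """Run a CUDA inference callable, clearing memory under pressure and retrying once on OOM."""
    if torch.cuda.is_available() and torch.cuda.memory_allocated() / 1e9 > threshold_gb:
        # Proactive cleanup before the next allocation spike, mirroring the 2GB check above.
        gc.collect()
        torch.cuda.empty_cache()

    try:
        return run_inference(*args)
    except RuntimeError as exc:
        if "out of memory" not in str(exc) or not torch.cuda.is_available():
            raise
        # Full cleanup, then a single retry; a second failure propagates to the caller.
        torch.cuda.synchronize()
        gc.collect()
        torch.cuda.empty_cache()
        return run_inference(*args)
```

One detail worth noting when adapting this: the `finally: del pred_aln_trg, asr` cleanup in `forward()` assumes both names were bound, so an exception raised before the alignment matrix is built would surface as a `NameError` from the cleanup path; guarding the `del` (or keeping cleanup inside the `try`) avoids masking the original error.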