feat: merge master into core/uv-management for v0.1.0

Major changes:
- Baked model directly into Dockerfile for improved deployment
- Switched to uv for dependency management
- Restructured Docker files into docker/cpu and docker/gpu directories
- Updated configuration for better ONNX performance
remsky 2025-01-13 19:31:44 -07:00
commit 007b1a35e8
8 changed files with 292 additions and 190 deletions


@@ -1,6 +1,5 @@
# Version control
.git
.gitignore
# Python
__pycache__


@@ -1,55 +1,55 @@
name: Sync develop with master
# name: Sync develop with master
on:
push:
branches:
- master
# on:
# push:
# branches:
# - master
jobs:
sync-develop:
runs-on: ubuntu-latest
permissions:
contents: write
issues: write
steps:
- name: Checkout repository
uses: actions/checkout@v4
with:
fetch-depth: 0
ref: develop
# jobs:
# sync-develop:
# runs-on: ubuntu-latest
# permissions:
# contents: write
# issues: write
# steps:
# - name: Checkout repository
# uses: actions/checkout@v4
# with:
# fetch-depth: 0
# ref: develop
- name: Configure Git
run: |
git config user.name "GitHub Actions"
git config user.email "actions@github.com"
# - name: Configure Git
# run: |
# git config user.name "GitHub Actions"
# git config user.email "actions@github.com"
- name: Merge master into develop
run: |
git fetch origin master:master
git merge --no-ff origin/master -m "chore: Merge master into develop branch"
# - name: Merge master into develop
# run: |
# git fetch origin master:master
# git merge --no-ff origin/master -m "chore: Merge master into develop branch"
- name: Push changes
run: |
if ! git push origin develop; then
echo "Failed to push to develop branch"
exit 1
fi
# - name: Push changes
# run: |
# if ! git push origin develop; then
# echo "Failed to push to develop branch"
# exit 1
# fi
- name: Handle Failure
if: failure()
uses: actions/github-script@v7
with:
script: |
const issueBody = `Automatic merge from master to develop failed.
# - name: Handle Failure
# if: failure()
# uses: actions/github-script@v7
# with:
# script: |
# const issueBody = `Automatic merge from master to develop failed.
Please resolve this manually
# Please resolve this manually
Workflow run: ${process.env.GITHUB_SERVER_URL}/${process.env.GITHUB_REPOSITORY}/actions/runs/${process.env.GITHUB_RUN_ID}`;
# Workflow run: ${process.env.GITHUB_SERVER_URL}/${process.env.GITHUB_REPOSITORY}/actions/runs/${process.env.GITHUB_RUN_ID}`;
await github.rest.issues.create({
owner: context.repo.owner,
repo: context.repo.repo,
title: '🔄 Automatic master to develop merge failed',
body: issueBody,
labels: ['merge-failed', 'automation']
});
# await github.rest.issues.create({
# owner: context.repo.owner,
# repo: context.repo.repo,
# title: '🔄 Automatic master to develop merge failed',
# body: issueBody,
# labels: ['merge-failed', 'automation']
# });

.gitignore

@@ -1,33 +1,44 @@
# Version control
.git
output/*
output_audio/*
ui/data/*
*.db
# Python
__pycache__/
*.pyc
*.pyo
*.pyd
*.py[cod]
*$py.class
.Python
.pytest_cache
.coverage
.coveragerc
# Python package build artifacts
*.egg-info/
*.egg
dist/
build/
# Environment
# .env
.venv/
env/
venv/
ENV/
# IDE
.idea/
.vscode/
*.swp
*.swo
# Project specific
# Model files
*.pt
*.pth
*.tar*
Kokoro-82M/*
__pycache__/
.vscode/
env/
.venv/
.Python
.coverage
examples/assorted_checks/benchmarks/output_audio/*
examples/assorted_checks/test_combinations/output/*
examples/assorted_checks/test_openai/output/*
examples/assorted_checks/test_voices/output/*
examples/assorted_checks/test_formats/output/*
examples/assorted_checks/benchmarks/output_audio_stream/*
ui/RepoScreenshot.png
examples/assorted_checks/benchmarks/output_audio_stream_openai/*
# Voice files
api/src/voices/af_bella.pt
api/src/voices/af_nicole.pt
api/src/voices/af_sarah.pt
@@ -39,14 +50,24 @@ api/src/voices/bf_emma.pt
api/src/voices/bf_isabella.pt
api/src/voices/bm_george.pt
api/src/voices/bm_lewis.pt
# Audio files
examples/*.wav
examples/*.pcm
examples/*.mp3
examples/*.flac
examples/*.acc
examples/*.ogg
examples/speech.mp3
examples/phoneme_examples/output/example_1.wav
examples/phoneme_examples/output/example_2.wav
examples/phoneme_examples/output/example_3.wav
# Other project files
Kokoro-82M/
ui/data/
EXTERNAL_UV_DOCUMENTATION*
# Python package build artifacts
*.egg-info/
*.egg
dist/
build/
# Docker
Dockerfile*
docker-compose*


@@ -2,12 +2,23 @@
Notable changes to this project will be documented in this file.
## [v0.0.6] - 2025-01-10
## [v0.1.0] - 2025-01-13
### Changed
- Major Docker improvements:
- Baked model directly into Dockerfile for improved deployment reliability
- Switched to uv for dependency management
- Streamlined container builds and reduced image sizes
- Dependency Management:
- Migrated from pip/poetry to uv for faster, more reliable package management
- Added uv.lock for deterministic builds
- Updated dependency resolution strategy
## [v0.0.5post1] - 2025-01-11
### Fixed
- Fixed dependency issues:
- Let PyTorch manage numpy version
- Pin aiofiles to 23.2.1 for Windows compatibility
- Added CI workflow for testing
- Docker image tagging and versioning improvements (-gpu, -cpu, -ui)
- Minor vram management improvements
- Gradio bugfix causing crashes and errant warnings
- Updated GPU and UI container configurations
## [v0.0.5] - 2025-01-10
### Fixed


@@ -5,7 +5,7 @@
# Kokoro TTS API
[![Tests](https://img.shields.io/badge/tests-117%20passed-darkgreen)]()
[![Coverage](https://img.shields.io/badge/coverage-75%25-darkgreen)]()
[![Tested at Model Commit](https://img.shields.io/badge/last--tested--model--commit-a67f113-blue)](https://huggingface.co/hexgrad/Kokoro-82M/tree/c3b0d86e2a980e027ef71c28819ea02e351c2667) [![Try on Spaces](https://img.shields.io/badge/%F0%9F%A4%97%20Try%20on-Spaces-blue)](https://huggingface.co/spaces/Remsky/Kokoro-TTS-Zero)
[![Tested at Model Commit](https://img.shields.io/badge/last--tested--model--commit-a67f113-blue)](https://huggingface.co/hexgrad/Kokoro-82M/tree/c3b0d86e2a980e027ef71c28819ea02e351c2667) [![Try on Spaces](https://img.shields.io/badge/%F0%9F%A4%97%20Try%20on-Spaces-blue)](https://huggingface.co/spaces/Remsky/Kokoro-TTS-Zero) [![Buy Me A Coffee](https://img.shields.io/badge/BMC-✨☕-gray?style=flat-square)](https://www.buymeacoffee.com/remsky)
Dockerized FastAPI wrapper for [Kokoro-82M](https://huggingface.co/hexgrad/Kokoro-82M) text-to-speech model
- OpenAI-compatible Speech endpoint, with inline voice combination functionality
@@ -29,7 +29,8 @@ The service can be accessed through either the API endpoints or the Gradio web i
```bash
git clone https://github.com/remsky/Kokoro-FastAPI.git
cd Kokoro-FastAPI
docker compose up --build
docker compose up --build # for GPU
#docker compose -f docker-compose.cpu.yml up --build # for CPU
```
2. Run locally as an OpenAI-Compatible Speech Endpoint
```python
@@ -317,6 +318,53 @@ with open("speech.wav", "wb") as f:
See `examples/phoneme_examples/generate_phonemes.py` for a sample script.
</details>
## Known Issues
<details>
<summary>Linux GPU Permissions</summary>
Some Linux users may encounter GPU permission issues when running the container as a non-root user.
No single option is guaranteed to work, but here are some common solutions; consider your security requirements carefully.
### Option 1: Container Groups (Likely the best option)
```yaml
services:
kokoro-tts:
# ... existing config ...
group_add:
- "video"
- "render"
```
### Option 2: Host System Groups
```yaml
services:
kokoro-tts:
# ... existing config ...
user: "${UID}:${GID}"
group_add:
- "video"
```
Note: This may require adding the host user to the relevant groups (`sudo usermod -aG docker,video $USER`) and restarting the system.
### Option 3: Device Permissions (Use with caution)
```yaml
services:
kokoro-tts:
# ... existing config ...
devices:
- /dev/nvidia0:/dev/nvidia0
- /dev/nvidiactl:/dev/nvidiactl
- /dev/nvidia-uvm:/dev/nvidia-uvm
```
⚠️ Warning: Reduces system security. Use only in development environments.
Prerequisites: NVIDIA GPU, drivers, and container toolkit must be properly configured.
Visit the [NVIDIA Container Toolkit installation guide](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/install-guide.html) for more detailed information.
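To confirm the container actually sees the GPU after applying one of these options, a quick check from inside the container can help. A minimal sketch, assuming PyTorch is installed in the image (as it is for this project):
```python
import torch

# Run inside the container, e.g. via `docker compose exec kokoro-tts python`
if torch.cuda.is_available():
    print(f"GPU visible: {torch.cuda.get_device_name(0)}")
else:
    print("No GPU visible; re-check device permissions and the NVIDIA container runtime")
```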
</details>
## Model and License
<details open>


@@ -38,48 +38,64 @@ from .text_processing import tokenize, phonemize
# return model.decoder(asr, F0_pred, N_pred, ref_s[:, :128]).squeeze().cpu().numpy()
@torch.no_grad()
def forward(model, tokens, ref_s, speed):
"""Forward pass through the model with light optimizations that preserve output quality"""
"""Forward pass through the model with moderate memory management"""
device = ref_s.device
# Keep original token handling but optimize device placement
tokens = torch.LongTensor([[0, *tokens, 0]]).to(device)
input_lengths = torch.LongTensor([tokens.shape[-1]]).to(device)
text_mask = length_to_mask(input_lengths).to(device)
try:
# Initial tensor setup with proper device placement
tokens = torch.LongTensor([[0, *tokens, 0]]).to(device)
input_lengths = torch.LongTensor([tokens.shape[-1]]).to(device)
text_mask = length_to_mask(input_lengths).to(device)
# BERT and encoder pass
bert_dur = model.bert(tokens, attention_mask=(~text_mask).int())
d_en = model.bert_encoder(bert_dur).transpose(-1, -2)
# Split and clone reference signals with explicit device placement
s_content = ref_s[:, 128:].clone().to(device)
s_ref = ref_s[:, :128].clone().to(device)
# Split reference signal once for efficiency
s_content = ref_s[:, 128:]
s_ref = ref_s[:, :128]
# BERT and encoder pass
bert_dur = model.bert(tokens, attention_mask=(~text_mask).int())
d_en = model.bert_encoder(bert_dur).transpose(-1, -2)
# Predictor forward pass
d = model.predictor.text_encoder(d_en, s_content, input_lengths, text_mask)
x, _ = model.predictor.lstm(d)
# Predictor forward pass
d = model.predictor.text_encoder(d_en, s_content, input_lengths, text_mask)
x, _ = model.predictor.lstm(d)
# Duration prediction - keeping original logic
duration = model.predictor.duration_proj(x)
duration = torch.sigmoid(duration).sum(axis=-1) / speed
pred_dur = torch.round(duration).clamp(min=1).long()
# Duration prediction
duration = model.predictor.duration_proj(x)
duration = torch.sigmoid(duration).sum(axis=-1) / speed
pred_dur = torch.round(duration).clamp(min=1).long()
# Only cleanup large intermediates
del duration, x
# Alignment matrix construction - keeping original approach for quality
pred_aln_trg = torch.zeros(input_lengths, pred_dur.sum().item(), device=device)
c_frame = 0
for i in range(pred_aln_trg.size(0)):
pred_aln_trg[i, c_frame : c_frame + pred_dur[0, i].item()] = 1
c_frame += pred_dur[0, i].item()
# Alignment matrix construction
pred_aln_trg = torch.zeros(input_lengths.item(), pred_dur.sum().item(), device=device)
c_frame = 0
for i in range(pred_aln_trg.size(0)):
pred_aln_trg[i, c_frame : c_frame + pred_dur[0, i].item()] = 1
c_frame += pred_dur[0, i].item()
pred_aln_trg = pred_aln_trg.unsqueeze(0)
# Matrix multiplications - reuse unsqueezed tensor
pred_aln_trg = pred_aln_trg.unsqueeze(0) # Do unsqueeze once
en = d.transpose(-1, -2) @ pred_aln_trg
F0_pred, N_pred = model.predictor.F0Ntrain(en, s_content)
# Matrix multiplications with selective cleanup
en = d.transpose(-1, -2) @ pred_aln_trg
del d # Free large intermediate tensor
# Text encoding and final decoding
t_en = model.text_encoder(tokens, input_lengths, text_mask)
asr = t_en @ pred_aln_trg
F0_pred, N_pred = model.predictor.F0Ntrain(en, s_content)
del en # Free large intermediate tensor
return model.decoder(asr, F0_pred, N_pred, s_ref).squeeze().cpu().numpy()
# Final text encoding and decoding
t_en = model.text_encoder(tokens, input_lengths, text_mask)
asr = t_en @ pred_aln_trg
del t_en # Free large intermediate tensor
# Final decoding and transfer to CPU
output = model.decoder(asr, F0_pred, N_pred, s_ref)
result = output.squeeze().cpu().numpy()
return result
finally:
# Let PyTorch handle most cleanup automatically
# Only explicitly free the largest tensors
del pred_aln_trg, asr
# def length_to_mask(lengths):
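The alignment-matrix loop in the hunk above expands each input token into its predicted number of output frames. A tiny standalone illustration of the same idea, with made-up durations and placeholder names rather than the module's actual tensors:
```python
import torch

# Hypothetical per-token frame counts: token 0 -> 2 frames, token 1 -> 3, token 2 -> 1
pred_dur = torch.tensor([2, 3, 1])
total_frames = int(pred_dur.sum())

# Binary token-to-frame alignment matrix, built the same way as pred_aln_trg above
aln = torch.zeros(len(pred_dur), total_frames)
frame = 0
for i, dur in enumerate(pred_dur.tolist()):
    aln[i, frame : frame + dur] = 1
    frame += dur

print(aln)
# tensor([[1., 1., 0., 0., 0., 0.],
#         [0., 0., 1., 1., 1., 0.],
#         [0., 0., 0., 0., 0., 1.]])
```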
@@ -179,7 +195,7 @@ class TTSGPUModel(TTSBaseModel):
def generate_from_tokens(
cls, tokens: list[int], voicepack: torch.Tensor, speed: float
) -> np.ndarray:
"""Generate audio from tokens
"""Generate audio from tokens with moderate memory management
Args:
tokens: Token IDs
@@ -192,10 +208,55 @@ class TTSGPUModel(TTSBaseModel):
if cls._instance is None:
raise RuntimeError("GPU model not initialized")
# Get reference style
ref_s = voicepack[len(tokens)]
try:
device = cls._device
# Generate audio
audio = forward(cls._instance, tokens, ref_s, speed)
# Check memory pressure
if torch.cuda.is_available():
memory_allocated = torch.cuda.memory_allocated(device) / 1e9 # Convert to GB
if memory_allocated > 2.0: # 2GB limit
logger.info(
f"Memory usage above 2GB threshold:{memory_allocated:.2f}GB "
f"Clearing cache"
)
torch.cuda.empty_cache()
import gc
gc.collect()
return audio
# Get reference style with proper device placement
ref_s = voicepack[len(tokens)].clone().to(device)
# Generate audio
audio = forward(cls._instance, tokens, ref_s, speed)
return audio
except RuntimeError as e:
if "out of memory" in str(e):
# On OOM, do a full cleanup and retry
if torch.cuda.is_available():
logger.warning("Out of memory detected, performing full cleanup")
torch.cuda.synchronize()
torch.cuda.empty_cache()
import gc
gc.collect()
# Log memory stats after cleanup
memory_allocated = torch.cuda.memory_allocated(device)
memory_reserved = torch.cuda.memory_reserved(device)
logger.info(
f"Memory after OOM cleanup: "
f"Allocated: {memory_allocated / 1e9:.2f}GB, "
f"Reserved: {memory_reserved / 1e9:.2f}GB"
)
# Retry generation
ref_s = voicepack[len(tokens)].clone().to(device)
audio = forward(cls._instance, tokens, ref_s, speed)
return audio
raise
finally:
# Only synchronize at the top level, no empty_cache
if torch.cuda.is_available():
torch.cuda.synchronize()
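Condensed, the changes above amount to a memory-pressure check plus a single retry after an out-of-memory error. The sketch below captures that pattern rather than the file's exact control flow; `run_forward` is a hypothetical stand-in for the model call, and the 2 GB threshold is taken from the diff:
```python
import gc
import torch

MEMORY_THRESHOLD_GB = 2.0  # threshold used in the diff above

def generate_with_memory_guard(run_forward, device):
    """Run a GPU generation call, clearing the CUDA cache under pressure and retrying once on OOM."""
    if torch.cuda.is_available():
        allocated_gb = torch.cuda.memory_allocated(device) / 1e9
        if allocated_gb > MEMORY_THRESHOLD_GB:
            torch.cuda.empty_cache()
            gc.collect()
    try:
        return run_forward()
    except RuntimeError as e:
        if "out of memory" not in str(e) or not torch.cuda.is_available():
            raise
        # Full cleanup, then one retry
        torch.cuda.synchronize()
        torch.cuda.empty_cache()
        gc.collect()
        return run_forward()
    finally:
        # Mirror the diff: synchronize at the top level, but skip empty_cache here
        if torch.cuda.is_available():
            torch.cuda.synchronize()
```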


@@ -20,17 +20,17 @@ services:
- ONNX_MEMORY_PATTERN=true
- ONNX_ARENA_EXTEND_STRATEGY=kNextPowerOfTwo
# # Gradio UI service [Comment out everything below if you don't need it]
# gradio-ui:
# # image: ghcr.io/remsky/kokoro-fastapi:latest-ui
# # Uncomment below (and comment out above) to build from source instead of using the released image
# # build:
# # context: ./ui
# ports:
# - "7860:7860"
# volumes:
# - ./ui/data:/app/ui/data
# - ./ui/app.py:/app/app.py # Mount app.py for hot reload
# environment:
# - GRADIO_WATCH=True # Enable hot reloading
# - PYTHONUNBUFFERED=1 # Ensure Python output is not buffered
# Gradio UI service [Comment out everything below if you don't need it]
gradio-ui:
# image: ghcr.io/remsky/kokoro-fastapi:latest-ui
# Uncomment below (and comment out above) to build from source instead of using the released image
# build:
# context: ./ui
ports:
- "7860:7860"
volumes:
- ./ui/data:/app/ui/data
- ./ui/app.py:/app/app.py # Mount app.py for hot reload
environment:
- GRADIO_WATCH=True # Enable hot reloading
- PYTHONUNBUFFERED=1 # Ensure Python output is not buffered
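The ONNX_* environment variables in the hunk above presumably feed onnxruntime session and provider settings when the ONNX path builds its inference session. The mapping below is an assumption for illustration, not code from this repository; `model.onnx` is a placeholder path:
```python
import os
import onnxruntime as ort

sess_options = ort.SessionOptions()
# ONNX_MEMORY_PATTERN -> memory-pattern optimization
sess_options.enable_mem_pattern = os.getenv("ONNX_MEMORY_PATTERN", "true").lower() == "true"

# ONNX_ARENA_EXTEND_STRATEGY -> arena_extend_strategy provider option; shown here on the
# CUDA provider, where it is documented. Whether the CPU provider honors it is an assumption.
providers = [
    ("CUDAExecutionProvider",
     {"arena_extend_strategy": os.getenv("ONNX_ARENA_EXTEND_STRATEGY", "kNextPowerOfTwo")}),
    "CPUExecutionProvider",
]

session = ort.InferenceSession("model.onnx", sess_options, providers=providers)
```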


@@ -1,45 +1,10 @@
services:
# model-fetcher:
# image: datamachines/git-lfs:latest
# environment:
# - SKIP_MODEL_FETCH=${SKIP_MODEL_FETCH:-false}
# volumes:
# - ./Kokoro-82M:/app/Kokoro-82M
# working_dir: /app/Kokoro-82M
# command: >
# sh -c "
# if [ \"$$SKIP_MODEL_FETCH\" = \"true\" ]; then
# echo 'Skipping model fetch...' && touch .cloned;
# else
# rm -f .git/index.lock;
# if [ -z \"$(ls -A .)\" ]; then
# git clone https://huggingface.co/hexgrad/Kokoro-82M .
# touch .cloned;
# else
# rm -f .git/index.lock && \
# git checkout main && \
# git pull origin main && \
# touch .cloned;
# fi;
# fi;
# tail -f /dev/null
# "
# healthcheck:
# test: ["CMD", "test", "-f", ".cloned"]
# interval: 5s
# timeout: 2s
# retries: 300
# start_period: 1s
kokoro-tts:
# image: ghcr.io/remsky/kokoro-fastapi-gpu:latest
# Uncomment below to build from source instead of using the released image
build:
context: ../..
dockerfile: docker/gpu/Dockerfile
volumes:
- ../../api/src:/app/api/src
- ../../Kokoro-82M:/app/Kokoro-82M
- ../../api/src:/app/api/src # Mount src for development
ports:
- "8880:8880"
environment:
@@ -51,21 +16,18 @@ services:
- driver: nvidia
count: 1
capabilities: [gpu]
# depends_on:
# model-fetcher:
# condition: service_healthy
# Gradio UI service [Comment out everything below if you don't need it]
# gradio-ui:
# Gradio UI service
gradio-ui:
# image: ghcr.io/remsky/kokoro-fastapi-ui:latest
# Uncomment below to build from source instead of using the released image
# build:
# context: ./ui
# ports:
# - "7860:7860"
# volumes:
# - ./ui/data:/app/ui/data
# - ./ui/app.py:/app/app.py # Mount app.py for hot reload
# environment:
# - GRADIO_WATCH=1 # Enable hot reloading
# - PYTHONUNBUFFERED=1 # Ensure Python output is not buffered
build:
context: ../../ui
ports:
- "7860:7860"
volumes:
- ../../ui/data:/app/ui/data
- ../../ui/app.py:/app/app.py # Mount app.py for hot reload
environment:
- GRADIO_WATCH=1 # Enable hot reloading
- PYTHONUNBUFFERED=1 # Ensure Python output is not buffered