mirror of https://github.com/remsky/Kokoro-FastAPI.git (synced 2025-04-13 09:39:17 +00:00)

feat: merge master into core/uv-management for v0.1.0

Major changes:
- Baked model directly into Dockerfile for improved deployment
- Switched to uv for dependency management
- Restructured Docker files into docker/cpu and docker/gpu directories
- Updated configuration for better ONNX performance

commit 007b1a35e8
8 changed files with 292 additions and 190 deletions
@@ -1,6 +1,5 @@
 # Version control
 .git
-.gitignore
 
 # Python
 __pycache__
.github/workflows/sync-develop.yml (vendored, 92 lines changed)
@@ -1,55 +1,55 @@
Every line of the workflow is commented out by this change, disabling the automatic sync. The workflow it disables:

name: Sync develop with master

on:
  push:
    branches:
      - master

jobs:
  sync-develop:
    runs-on: ubuntu-latest
    permissions:
      contents: write
      issues: write
    steps:
      - name: Checkout repository
        uses: actions/checkout@v4
        with:
          fetch-depth: 0
          ref: develop

      - name: Configure Git
        run: |
          git config user.name "GitHub Actions"
          git config user.email "actions@github.com"

      - name: Merge master into develop
        run: |
          git fetch origin master:master
          git merge --no-ff origin/master -m "chore: Merge master into develop branch"

      - name: Push changes
        run: |
          if ! git push origin develop; then
            echo "Failed to push to develop branch"
            exit 1
          fi

      - name: Handle Failure
        if: failure()
        uses: actions/github-script@v7
        with:
          script: |
            const issueBody = `Automatic merge from master to develop failed.

            Please resolve this manually

            Workflow run: ${process.env.GITHUB_SERVER_URL}/${process.env.GITHUB_REPOSITORY}/actions/runs/${process.env.GITHUB_RUN_ID}`;

            await github.rest.issues.create({
              owner: context.repo.owner,
              repo: context.repo.repo,
              title: '🔄 Automatic master to develop merge failed',
              body: issueBody,
              labels: ['merge-failed', 'automation']
            });
.gitignore (vendored, 81 lines changed)
@@ -1,33 +1,44 @@
-output/*
-output_audio/*
-ui/data/*
-
-*.db
+# Version control
+.git
+
+# Python
+__pycache__/
+
 *.pyc
+*.pyo
+*.pyd
+*.py[cod]
+*$py.class
+.Python
+.pytest_cache
+.coverage
+.coveragerc
+
+# Python package build artifacts
+*.egg-info/
+*.egg
+dist/
+build/
+
+# Environment
+# .env
+.venv/
+env/
+venv/
+ENV/
+
+# IDE
+.idea/
+.vscode/
+*.swp
+*.swo
+
+# Project specific
+# Model files
+*.pt
 *.pth
 *.tar*
 
-Kokoro-82M/*
-__pycache__/
-.vscode/
-env/
-.venv/
-.Python
-
-.coverage
-
-examples/assorted_checks/benchmarks/output_audio/*
-examples/assorted_checks/test_combinations/output/*
-examples/assorted_checks/test_openai/output/*
-
-examples/assorted_checks/test_voices/output/*
-examples/assorted_checks/test_formats/output/*
-examples/assorted_checks/benchmarks/output_audio_stream/*
-ui/RepoScreenshot.png
-examples/assorted_checks/benchmarks/output_audio_stream_openai/*
-
+# Voice files
 api/src/voices/af_bella.pt
 api/src/voices/af_nicole.pt
 api/src/voices/af_sarah.pt
@@ -39,14 +50,24 @@ api/src/voices/bf_emma.pt
 api/src/voices/bf_isabella.pt
 api/src/voices/bm_george.pt
 api/src/voices/bm_lewis.pt
 
+# Audio files
+examples/*.wav
+examples/*.pcm
+examples/*.mp3
+examples/*.flac
+examples/*.acc
+examples/*.ogg
 examples/speech.mp3
 examples/phoneme_examples/output/example_1.wav
 examples/phoneme_examples/output/example_2.wav
 examples/phoneme_examples/output/example_3.wav
 
+# Other project files
+Kokoro-82M/
+ui/data/
 EXTERNAL_UV_DOCUMENTATION*
 
-# Python package build artifacts
-*.egg-info/
-*.egg
-dist/
-build/
+# Docker
+Dockerfile*
+docker-compose*
CHANGELOG.md (21 lines changed)
@@ -2,12 +2,23 @@
 
 Notable changes to this project will be documented in this file.
 
-## [v0.0.6] - 2025-01-10
+## [v0.1.0] - 2025-01-13
+### Changed
+- Major Docker improvements:
+  - Baked model directly into Dockerfile for improved deployment reliability
+  - Switched to uv for dependency management
+  - Streamlined container builds and reduced image sizes
+- Dependency Management:
+  - Migrated from pip/poetry to uv for faster, more reliable package management
+  - Added uv.lock for deterministic builds
+  - Updated dependency resolution strategy
+
+## [v0.0.5post1] - 2025-01-11
 ### Fixed
-- Fixed dependency issues:
-  - Let PyTorch manage numpy version
-  - Pin aiofiles to 23.2.1 for Windows compatibility
-  - Added CI workflow for testing
+- Docker image tagging and versioning improvements (-gpu, -cpu, -ui)
+- Minor vram management improvements
+- Gradio bugfix causing crashes and errant warnings
+- Updated GPU and UI container configurations
 
 ## [v0.0.5] - 2025-01-10
 ### Fixed
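The dependency-management entries above center on a committed lockfile. A minimal sketch of the uv workflow they imply, assuming a standard pyproject.toml at the repository root (the exact commands used inside the project's Dockerfiles may differ):

```bash
# Resolve dependencies and write uv.lock, the committed lockfile for deterministic builds
uv lock

# Create a virtual environment and install exactly what uv.lock pins,
# without re-resolving or updating the lockfile
uv venv
uv sync --frozen
```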
README.md (52 lines changed)
@@ -5,7 +5,7 @@
 # Kokoro TTS API
 []()
 []()
-[](https://huggingface.co/hexgrad/Kokoro-82M/tree/c3b0d86e2a980e027ef71c28819ea02e351c2667) [](https://huggingface.co/spaces/Remsky/Kokoro-TTS-Zero)
+[](https://huggingface.co/hexgrad/Kokoro-82M/tree/c3b0d86e2a980e027ef71c28819ea02e351c2667) [](https://huggingface.co/spaces/Remsky/Kokoro-TTS-Zero) [](https://www.buymeacoffee.com/remsky)
 
 Dockerized FastAPI wrapper for [Kokoro-82M](https://huggingface.co/hexgrad/Kokoro-82M) text-to-speech model
 - OpenAI-compatible Speech endpoint, with inline voice combination functionality
@@ -29,7 +29,8 @@ The service can be accessed through either the API endpoints or the Gradio web interface.
 ```bash
 git clone https://github.com/remsky/Kokoro-FastAPI.git
 cd Kokoro-FastAPI
-docker compose up --build
+docker compose up --build # for GPU
+#docker compose -f docker-compose.cpu.yml up --build # for CPU
 ```
 2. Run locally as an OpenAI-Compatible Speech Endpoint
 ```python
@@ -317,6 +318,53 @@ with open("speech.wav", "wb") as f:
 See `examples/phoneme_examples/generate_phonemes.py` for a sample script.
 </details>
 
+## Known Issues
+
+<details>
+<summary>Linux GPU Permissions</summary>
+
+Some Linux users may encounter GPU permission issues when running as non-root.
+Can't guarantee anything, but here are some common solutions; consider your security requirements carefully.
+
+### Option 1: Container Groups (Likely the best option)
+```yaml
+services:
+  kokoro-tts:
+    # ... existing config ...
+    group_add:
+      - "video"
+      - "render"
+```
+
+### Option 2: Host System Groups
+```yaml
+services:
+  kokoro-tts:
+    # ... existing config ...
+    user: "${UID}:${GID}"
+    group_add:
+      - "video"
+```
+Note: May require adding the host user to groups (`sudo usermod -aG docker,video $USER`) and a system restart.
+
+### Option 3: Device Permissions (Use with caution)
+```yaml
+services:
+  kokoro-tts:
+    # ... existing config ...
+    devices:
+      - /dev/nvidia0:/dev/nvidia0
+      - /dev/nvidiactl:/dev/nvidiactl
+      - /dev/nvidia-uvm:/dev/nvidia-uvm
+```
+⚠️ Warning: Reduces system security. Use only in development environments.
+
+Prerequisites: NVIDIA GPU, drivers, and container toolkit must be properly configured.
+
+Visit [NVIDIA Container Toolkit installation](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/install-guide.html) for more detailed information.
+
+</details>
+
 ## Model and License
 
 <details open>
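A side note on Option 2 in the Known Issues section above: docker compose only substitutes `${UID}` and `${GID}` when they are set in its environment, and bash treats `UID` as read-only, so exporting it directly can fail. One way to supply both values is a `.env` file next to the compose file; the file name and location here are assumptions, not something this commit adds:

```bash
# Write the current user's IDs where docker compose picks them up for ${UID}:${GID}
echo "UID=$(id -u)" >> .env
echo "GID=$(id -g)" >> .env
docker compose up -d
```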
@@ -38,48 +38,64 @@ from .text_processing import tokenize, phonemize
 # return model.decoder(asr, F0_pred, N_pred, ref_s[:, :128]).squeeze().cpu().numpy()
 @torch.no_grad()
 def forward(model, tokens, ref_s, speed):
-    """Forward pass through the model with light optimizations that preserve output quality"""
+    """Forward pass through the model with moderate memory management"""
     device = ref_s.device
 
-    # Keep original token handling but optimize device placement
+    try:
+        # Initial tensor setup with proper device placement
         tokens = torch.LongTensor([[0, *tokens, 0]]).to(device)
         input_lengths = torch.LongTensor([tokens.shape[-1]]).to(device)
         text_mask = length_to_mask(input_lengths).to(device)
 
+        # Split and clone reference signals with explicit device placement
+        s_content = ref_s[:, 128:].clone().to(device)
+        s_ref = ref_s[:, :128].clone().to(device)
+
         # BERT and encoder pass
         bert_dur = model.bert(tokens, attention_mask=(~text_mask).int())
         d_en = model.bert_encoder(bert_dur).transpose(-1, -2)
 
-    # Split reference signal once for efficiency
-    s_content = ref_s[:, 128:]
-    s_ref = ref_s[:, :128]
-
         # Predictor forward pass
         d = model.predictor.text_encoder(d_en, s_content, input_lengths, text_mask)
         x, _ = model.predictor.lstm(d)
 
-    # Duration prediction - keeping original logic
+        # Duration prediction
         duration = model.predictor.duration_proj(x)
         duration = torch.sigmoid(duration).sum(axis=-1) / speed
         pred_dur = torch.round(duration).clamp(min=1).long()
+        # Only cleanup large intermediates
+        del duration, x
 
-    # Alignment matrix construction - keeping original approach for quality
-    pred_aln_trg = torch.zeros(input_lengths, pred_dur.sum().item(), device=device)
+        # Alignment matrix construction
+        pred_aln_trg = torch.zeros(input_lengths.item(), pred_dur.sum().item(), device=device)
         c_frame = 0
         for i in range(pred_aln_trg.size(0)):
             pred_aln_trg[i, c_frame : c_frame + pred_dur[0, i].item()] = 1
             c_frame += pred_dur[0, i].item()
+        pred_aln_trg = pred_aln_trg.unsqueeze(0)
 
-    # Matrix multiplications - reuse unsqueezed tensor
-    pred_aln_trg = pred_aln_trg.unsqueeze(0)  # Do unsqueeze once
+        # Matrix multiplications with selective cleanup
         en = d.transpose(-1, -2) @ pred_aln_trg
-    F0_pred, N_pred = model.predictor.F0Ntrain(en, s_content)
-
-    # Text encoding and final decoding
+        del d  # Free large intermediate tensor
+
+        F0_pred, N_pred = model.predictor.F0Ntrain(en, s_content)
+        del en  # Free large intermediate tensor
+
+        # Final text encoding and decoding
         t_en = model.text_encoder(tokens, input_lengths, text_mask)
         asr = t_en @ pred_aln_trg
+        del t_en  # Free large intermediate tensor
 
-    return model.decoder(asr, F0_pred, N_pred, s_ref).squeeze().cpu().numpy()
+        # Final decoding and transfer to CPU
+        output = model.decoder(asr, F0_pred, N_pred, s_ref)
+        result = output.squeeze().cpu().numpy()
+
+        return result
+
+    finally:
+        # Let PyTorch handle most cleanup automatically
+        # Only explicitly free the largest tensors
+        del pred_aln_trg, asr
 
 # def length_to_mask(lengths):
@@ -179,7 +195,7 @@ class TTSGPUModel(TTSBaseModel):
     def generate_from_tokens(
         cls, tokens: list[int], voicepack: torch.Tensor, speed: float
     ) -> np.ndarray:
-        """Generate audio from tokens
+        """Generate audio from tokens with moderate memory management
 
         Args:
             tokens: Token IDs
@@ -192,10 +208,55 @@ class TTSGPUModel(TTSBaseModel):
         if cls._instance is None:
             raise RuntimeError("GPU model not initialized")
 
-        # Get reference style
-        ref_s = voicepack[len(tokens)]
+        try:
+            device = cls._device
+
+            # Check memory pressure
+            if torch.cuda.is_available():
+                memory_allocated = torch.cuda.memory_allocated(device) / 1e9  # Convert to GB
+                if memory_allocated > 2.0:  # 2GB limit
+                    logger.info(
+                        f"Memory usage above 2GB threshold:{memory_allocated:.2f}GB "
+                        f"Clearing cache"
+                    )
+                    torch.cuda.empty_cache()
+                    import gc
+                    gc.collect()
+
+            # Get reference style with proper device placement
+            ref_s = voicepack[len(tokens)].clone().to(device)
 
             # Generate audio
             audio = forward(cls._instance, tokens, ref_s, speed)
 
             return audio
 
+        except RuntimeError as e:
+            if "out of memory" in str(e):
+                # On OOM, do a full cleanup and retry
+                if torch.cuda.is_available():
+                    logger.warning("Out of memory detected, performing full cleanup")
+                    torch.cuda.synchronize()
+                    torch.cuda.empty_cache()
+                    import gc
+                    gc.collect()
+
+                    # Log memory stats after cleanup
+                    memory_allocated = torch.cuda.memory_allocated(device)
+                    memory_reserved = torch.cuda.memory_reserved(device)
+                    logger.info(
+                        f"Memory after OOM cleanup: "
+                        f"Allocated: {memory_allocated / 1e9:.2f}GB, "
+                        f"Reserved: {memory_reserved / 1e9:.2f}GB"
+                    )
+
+                # Retry generation
+                ref_s = voicepack[len(tokens)].clone().to(device)
+                audio = forward(cls._instance, tokens, ref_s, speed)
+                return audio
+            raise
+
+        finally:
+            # Only synchronize at the top level, no empty_cache
+            if torch.cuda.is_available():
+                torch.cuda.synchronize()

@@ -20,17 +20,17 @@ services:
       - ONNX_MEMORY_PATTERN=true
       - ONNX_ARENA_EXTEND_STRATEGY=kNextPowerOfTwo
 
-  # # Gradio UI service [Comment out everything below if you don't need it]
-  # gradio-ui:
-  #   # image: ghcr.io/remsky/kokoro-fastapi:latest-ui
-  #   # Uncomment below (and comment out above) to build from source instead of using the released image
-  #   # build:
-  #   #   context: ./ui
-  #   ports:
-  #     - "7860:7860"
-  #   volumes:
-  #     - ./ui/data:/app/ui/data
-  #     - ./ui/app.py:/app/app.py # Mount app.py for hot reload
-  #   environment:
-  #     - GRADIO_WATCH=True # Enable hot reloading
-  #     - PYTHONUNBUFFERED=1 # Ensure Python output is not buffered
+  # Gradio UI service [Comment out everything below if you don't need it]
+  gradio-ui:
+    # image: ghcr.io/remsky/kokoro-fastapi:latest-ui
+    # Uncomment below (and comment out above) to build from source instead of using the released image
+    # build:
+    #   context: ./ui
+    ports:
+      - "7860:7860"
+    volumes:
+      - ./ui/data:/app/ui/data
+      - ./ui/app.py:/app/app.py # Mount app.py for hot reload
+    environment:
+      - GRADIO_WATCH=True # Enable hot reloading
+      - PYTHONUNBUFFERED=1 # Ensure Python output is not buffered

@@ -1,45 +1,10 @@
 services:
-  # model-fetcher:
-  #   image: datamachines/git-lfs:latest
-  #   environment:
-  #     - SKIP_MODEL_FETCH=${SKIP_MODEL_FETCH:-false}
-  #   volumes:
-  #     - ./Kokoro-82M:/app/Kokoro-82M
-  #   working_dir: /app/Kokoro-82M
-  #   command: >
-  #     sh -c "
-  #       if [ \"$$SKIP_MODEL_FETCH\" = \"true\" ]; then
-  #         echo 'Skipping model fetch...' && touch .cloned;
-  #       else
-  #         rm -f .git/index.lock;
-  #         if [ -z \"$(ls -A .)\" ]; then
-  #           git clone https://huggingface.co/hexgrad/Kokoro-82M .
-  #           touch .cloned;
-  #         else
-  #           rm -f .git/index.lock && \
-  #           git checkout main && \
-  #           git pull origin main && \
-  #           touch .cloned;
-  #         fi;
-  #       fi;
-  #       tail -f /dev/null
-  #     "
-  #   healthcheck:
-  #     test: ["CMD", "test", "-f", ".cloned"]
-  #     interval: 5s
-  #     timeout: 2s
-  #     retries: 300
-  #     start_period: 1s
-
   kokoro-tts:
-    # image: ghcr.io/remsky/kokoro-fastapi-gpu:latest
-    # Uncomment below to build from source instead of using the released image
     build:
       context: ../..
       dockerfile: docker/gpu/Dockerfile
     volumes:
-      - ../../api/src:/app/api/src
-      - ../../Kokoro-82M:/app/Kokoro-82M
+      - ../../api/src:/app/api/src # Mount src for development
     ports:
       - "8880:8880"
     environment:
@@ -51,21 +16,18 @@ services:
           - driver: nvidia
             count: 1
             capabilities: [gpu]
-    # depends_on:
-    #   model-fetcher:
-    #     condition: service_healthy
 
-  # Gradio UI service [Comment out everything below if you don't need it]
-  # gradio-ui:
+  # Gradio UI service
+  gradio-ui:
     # image: ghcr.io/remsky/kokoro-fastapi-ui:latest
     # Uncomment below to build from source instead of using the released image
-  # build:
-  #   context: ./ui
-  # ports:
-  #   - "7860:7860"
-  # volumes:
-  #   - ./ui/data:/app/ui/data
-  #   - ./ui/app.py:/app/app.py # Mount app.py for hot reload
-  # environment:
-  #   - GRADIO_WATCH=1 # Enable hot reloading
-  #   - PYTHONUNBUFFERED=1 # Ensure Python output is not buffered
+    build:
+      context: ../../ui
+    ports:
+      - "7860:7860"
+    volumes:
+      - ../../ui/data:/app/ui/data
+      - ../../ui/app.py:/app/app.py # Mount app.py for hot reload
+    environment:
+      - GRADIO_WATCH=1 # Enable hot reloading
+      - PYTHONUNBUFFERED=1 # Ensure Python output is not buffered
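Given the restructuring into docker/cpu and docker/gpu directories and the ../.. build context shown in the GPU compose file above, one plausible way to launch the stack from the new layout is to run compose from inside the matching directory; the exact compose file names under those directories are an assumption, not confirmed by this diff:

```bash
# GPU stack, built from the repo root via the ../.. context
cd docker/gpu
docker compose up --build

# CPU variant (assumed to mirror the layout under docker/cpu)
# cd docker/cpu && docker compose up --build
```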