From 9a9bc4aca91ff876a8711d73ea174a72c86c8ced Mon Sep 17 00:00:00 2001
From: Cong Nguyen <5560025+rampadc@users.noreply.github.com>
Date: Mon, 10 Mar 2025 11:58:45 +1100
Subject: [PATCH] Add support for MPS on Mac with Apple Silicon

---
 .gitignore                         |  3 +++
 api/src/core/config.py             | 20 ++++++++++++++++++-
 api/src/inference/kokoro_v1.py     | 22 +++++++++++++++-----
 api/src/inference/voice_manager.py |  2 +-
 api/src/main.py                    |  7 ++++++-
 api/src/routers/debug.py           | 10 +++++++++-
 start-gpu_mac.sh                   | 30 ++++++++++++++++++++++++++++++
 7 files changed, 84 insertions(+), 10 deletions(-)
 create mode 100755 start-gpu_mac.sh

diff --git a/.gitignore b/.gitignore
index 3a439db..35fa9fb 100644
--- a/.gitignore
+++ b/.gitignore
@@ -70,3 +70,6 @@ examples/speech.mp3
 examples/phoneme_examples/output/*.wav
 examples/assorted_checks/benchmarks/output_audio/*
 uv.lock
+
+# Mac MPS virtualenv for dual testing
+.venv-mps
diff --git a/api/src/core/config.py b/api/src/core/config.py
index f5fd569..bb32ea2 100644
--- a/api/src/core/config.py
+++ b/api/src/core/config.py
@@ -1,4 +1,5 @@
 from pydantic_settings import BaseSettings
+import torch
 
 
 class Settings(BaseSettings):
@@ -15,6 +16,7 @@ class Settings(BaseSettings):
     default_voice: str = "af_heart"
     default_voice_code: str | None = None  # If set, overrides the first letter of voice name, though api call param still takes precedence
     use_gpu: bool = True  # Whether to use GPU acceleration if available
+    device_type: str | None = None  # Auto-detected if None; can be "cuda", "mps", or "cpu"
     allow_local_voice_saving: bool = (
         False  # Whether to allow saving combined voices locally
     )
@@ -29,7 +31,7 @@ class Settings(BaseSettings):
     target_min_tokens: int = 175  # Target minimum tokens per chunk
     target_max_tokens: int = 250  # Target maximum tokens per chunk
     absolute_max_tokens: int = 450  # Absolute maximum tokens per chunk
-    advanced_text_normalization: bool = True # Preproesses the text before misiki which leads 
+    advanced_text_normalization: bool = True  # Preprocesses the text before misaki which leads
     gap_trim_ms: int = 1  # Base amount to trim from streaming chunk ends in milliseconds
 
     dynamic_gap_trim_padding_ms: int = 410  # Padding to add to dynamic gap trim
@@ -50,5 +52,21 @@ class Settings(BaseSettings):
     class Config:
         env_file = ".env"
 
+    def get_device(self) -> str:
+        """Get the appropriate device based on settings and availability."""
+        if not self.use_gpu:
+            return "cpu"
+
+        if self.device_type:
+            return self.device_type
+
+        # Auto-detect device: prefer MPS, then CUDA, then CPU
+        if torch.backends.mps.is_available():
+            return "mps"
+        elif torch.cuda.is_available():
+            return "cuda"
+        return "cpu"
+
+
 settings = Settings()
diff --git a/api/src/inference/kokoro_v1.py b/api/src/inference/kokoro_v1.py
index 419ade7..cff936d 100644
--- a/api/src/inference/kokoro_v1.py
+++ b/api/src/inference/kokoro_v1.py
@@ -21,7 +21,7 @@ class KokoroV1(BaseModelBackend):
         """Initialize backend with environment-based configuration."""
         super().__init__()
         # Strictly respect settings.use_gpu
-        self._device = "cuda" if settings.use_gpu else "cpu"
+        self._device = settings.get_device()
         self._model: Optional[KModel] = None
         self._pipelines: Dict[str, KPipeline] = {}  # Store pipelines by lang_code
 
@@ -48,9 +48,14 @@ class KokoroV1(BaseModelBackend):
             # Load model and let KModel handle device mapping
             self._model = KModel(config=config_path, model=model_path).eval()
 
-            # Move to CUDA if needed
-            if self._device == "cuda":
+            # For MPS, PYTORCH_ENABLE_MPS_FALLBACK runs unsupported ops on CPU
+            if self._device == "mps":
+                logger.info("Moving model to MPS device with CPU fallback for unsupported operations")
+                self._model = self._model.to(torch.device("mps"))
+            elif self._device == "cuda":
                 self._model = self._model.cuda()
+            else:
+                self._model = self._model.cpu()
 
         except FileNotFoundError as e:
             raise e
@@ -277,7 +282,7 @@ class KokoroV1(BaseModelBackend):
                             continue
                         if not token.text or not token.text.strip():
                             continue
-
+
                         start_time = float(token.start_ts) + current_offset
                         end_time = float(token.end_ts) + current_offset
                         word_timestamps.append(
@@ -295,8 +300,8 @@ class KokoroV1(BaseModelBackend):
                                 logger.error(
                                     f"Failed to process timestamps for chunk: {e}"
                                 )
-
-
+
+
                         yield AudioChunk(result.audio.numpy(),word_timestamps=word_timestamps)
                     else:
                         logger.warning("No audio in chunk")
@@ -318,6 +323,7 @@ class KokoroV1(BaseModelBackend):
         if self._device == "cuda":
             memory_gb = torch.cuda.memory_allocated() / 1e9
             return memory_gb > model_config.pytorch_gpu.memory_threshold
+        # MPS does not expose comparable memory stats, so skip the check
         return False
 
     def _clear_memory(self) -> None:
@@ -325,6 +331,10 @@ class KokoroV1(BaseModelBackend):
         """Clear device memory."""
         if self._device == "cuda":
             torch.cuda.empty_cache()
             torch.cuda.synchronize()
+        elif self._device == "mps":
+            # Empty cache if available (future-proofing)
+            if hasattr(torch.mps, 'empty_cache'):
+                torch.mps.empty_cache()
 
     def unload(self) -> None:
         """Unload model and free resources."""
diff --git a/api/src/inference/voice_manager.py b/api/src/inference/voice_manager.py
index 5466fa9..0d82c4f 100644
--- a/api/src/inference/voice_manager.py
+++ b/api/src/inference/voice_manager.py
@@ -19,7 +19,7 @@ class VoiceManager:
     def __init__(self):
         """Initialize voice manager."""
         # Strictly respect settings.use_gpu
-        self._device = "cuda" if settings.use_gpu else "cpu"
+        self._device = settings.get_device()
         self._voices: Dict[str, torch.Tensor] = {}
 
     async def get_voice_path(self, voice_name: str) -> str:
diff --git a/api/src/main.py b/api/src/main.py
index e940193..23299cf 100644
--- a/api/src/main.py
+++ b/api/src/main.py
@@ -85,7 +85,12 @@ async def lifespan(app: FastAPI):
 {boundary}
 """
     startup_msg += f"\nModel warmed up on {device}: {model}"
-    startup_msg += f"CUDA: {torch.cuda.is_available()}"
+    if device == "mps":
+        startup_msg += "\nUsing Apple Metal Performance Shaders (MPS)"
+    elif device == "cuda":
+        startup_msg += f"\nCUDA: {torch.cuda.is_available()}"
+    else:
+        startup_msg += "\nRunning on CPU"
     startup_msg += f"\n{voicepack_count} voice packs loaded"
 
     # Add web player info if enabled
diff --git a/api/src/routers/debug.py b/api/src/routers/debug.py
index 6a65362..9a3ca1d 100644
--- a/api/src/routers/debug.py
+++ b/api/src/routers/debug.py
@@ -4,6 +4,7 @@ from datetime import datetime
 
 import psutil
 from fastapi import APIRouter
+import torch
 
 try:
     import GPUtil
@@ -113,7 +114,14 @@ async def get_system_info():
 
     # GPU Info if available
     gpu_info = None
-    if GPU_AVAILABLE:
+    if torch.backends.mps.is_available():
+        gpu_info = {
+            "type": "MPS",
+            "available": True,
+            "device": "Apple Silicon",
+            "backend": "Metal"
+        }
+    elif GPU_AVAILABLE:
         try:
             gpus = GPUtil.getGPUs()
             gpu_info = [
diff --git a/start-gpu_mac.sh b/start-gpu_mac.sh
new file mode 100755
index 0000000..e0b0196
--- /dev/null
+++ b/start-gpu_mac.sh
@@ -0,0 +1,30 @@
+#!/bin/bash
+
+# Get project root directory
+PROJECT_ROOT=$(pwd)
+
+# Create MPS-specific venv directory
+VENV_DIR="$PROJECT_ROOT/.venv-mps"
+if [ ! -d "$VENV_DIR" ]; then
+    echo "Creating MPS-specific virtual environment..."
+    python3 -m venv "$VENV_DIR"
+fi
+
+# Activate the MPS venv so the editable install below targets it
+source "$VENV_DIR/bin/activate"
+
+# Set environment variables
+export USE_GPU=true
+export USE_ONNX=false
+export PYTHONPATH=$PROJECT_ROOT:$PROJECT_ROOT/api
+export MODEL_DIR=src/models
+export VOICES_DIR=src/voices/v1_0
+export WEB_PLAYER_PATH=$PROJECT_ROOT/web
+
+export DEVICE_TYPE=mps
+# Enable MPS fallback for unsupported operations
+export PYTORCH_ENABLE_MPS_FALLBACK=1
+
+# Run FastAPI with GPU extras using uv run
+uv pip install -e .
+uv run --no-sync uvicorn api.src.main:app --host 0.0.0.0 --port 8881
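
For a quick sanity check outside the server, here is a minimal sketch (not part of the patch) that mirrors the resolution order of Settings.get_device() above -- explicit DEVICE_TYPE override first, then MPS, then CUDA, then CPU -- and confirms that the fallback variable exported by start-gpu_mac.sh is visible:

    import os

    import torch


    def detect_device(use_gpu: bool = True, device_type: str | None = None) -> str:
        """Standalone mirror of Settings.get_device() from the patch above."""
        if not use_gpu:
            return "cpu"
        if device_type:  # e.g. exported as DEVICE_TYPE=mps by start-gpu_mac.sh
            return device_type
        if torch.backends.mps.is_available():
            return "mps"
        if torch.cuda.is_available():
            return "cuda"
        return "cpu"


    if __name__ == "__main__":
        print("resolved device:", detect_device(device_type=os.environ.get("DEVICE_TYPE")))
        # PyTorch reads PYTORCH_ENABLE_MPS_FALLBACK at startup, so it must be
        # exported before the process launches (as start-gpu_mac.sh does).
        print("MPS fallback enabled:", os.environ.get("PYTORCH_ENABLE_MPS_FALLBACK") == "1")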
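
And a standalone smoke test for the MPS path itself, exercising the same move-to-device and guarded cache-clear calls the backend now uses (the tensor shapes are arbitrary):

    import torch

    if torch.backends.mps.is_available():
        device = torch.device("mps")
        x = torch.randn(2, 3, device=device)   # allocate directly on MPS
        y = (x @ x.transpose(0, 1)).cpu()      # compute on MPS, copy back to CPU
        print("MPS matmul OK:", tuple(y.shape))
        if hasattr(torch.mps, "empty_cache"):  # same guard as _clear_memory()
            torch.mps.empty_cache()
    else:
        print("MPS not available; the server would fall back to CUDA or CPU")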