#!/usr/bin/env python3
"""
Download a gated Hugging Face model and serve it with an NVIDIA NIM microservice:

1. Uses huggingface_hub.snapshot_download() with HF_TOKEN to fetch your gated model.
2. Launches the NVIDIA NIM microservice via Docker, mounting the downloaded model.
"""

import os
import subprocess
import sys

from huggingface_hub import snapshot_download

# ——— Configuration ———
HF_TOKEN = os.environ.get("HF_TOKEN")  # expects you to have exported this already
if not HF_TOKEN:
    print("Error: Please set your HF_TOKEN environment variable.", file=sys.stderr)
    sys.exit(1)

# Replace with your HF org/model path
HF_REPO_ID = "yuiuo/alex-65_BATCH-gradient_65"
# Where to store the downloaded model locally
LOCAL_MODEL_DIR = os.path.expanduser("~/nim_models/your-gated-model")

# NIM Docker image for your base model type; adjust if using a different microservice
NIM_IMAGE = "ghcr.io/nvidia/nim/text-generation-base:latest"
# The path inside the container where NIM will look for its model store
NIM_MODEL_PATH = "/models/your-gated-model"

# GPU flags kept as separate argv entries, since subprocess does no shell
# word-splitting (drop them if running CPU-only)
DOCKER_GPU_FLAGS = ["--gpus", "all"]


def download_model():
    """Pull the gated HF model into LOCAL_MODEL_DIR using your read token."""
    print(f"Downloading '{HF_REPO_ID}' to '{LOCAL_MODEL_DIR}' …")
    os.makedirs(LOCAL_MODEL_DIR, exist_ok=True)
    snapshot_download(
        repo_id=HF_REPO_ID,
        local_dir=LOCAL_MODEL_DIR,  # place files directly here so the Docker mount sees them
        library_name="nim-integration",
        token=HF_TOKEN,
        local_files_only=False,  # ensure it pulls from remote
    )
    print("Download complete.\n")


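# A sketch of an optional fail-fast check between download and serve;
# "config.json" is an assumption about the repo layout (typical for HF
# transformer models), so adjust it to a file your model actually ships.
# Hypothetical helper: call it between download_model() and serve_with_nim().
def verify_download():
    """Warn if the downloaded snapshot looks incomplete (hypothetical helper)."""
    expected = os.path.join(LOCAL_MODEL_DIR, "config.json")
    if not os.path.exists(expected):
        print(f"Warning: '{expected}' not found; snapshot may be incomplete.",
              file=sys.stderr)

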
def serve_with_nim():
    """Launch the NIM microservice Docker container, mounting the model dir."""
    cmd = [
        "docker", "run", "-d",
        *DOCKER_GPU_FLAGS,  # unpacked so "--gpus" and "all" are separate argv entries
        "--name", "nim-gated-model",
        "-p", "8000:8000",  # expose port 8000 for inference
        "-e", f"NIM_MODEL_STORE={NIM_MODEL_PATH}",  # tells NIM where to find models
        "-e", "NIM_REFRESH_INTERVAL=3600",  # reload every hour
        "-v", f"{LOCAL_MODEL_DIR}:{NIM_MODEL_PATH}:ro",  # mount your model read-only
        NIM_IMAGE,
    ]
    print("Starting NIM container with command:")
    print("  " + " ".join(cmd))
    subprocess.check_call(cmd)
    print("\nNIM microservice is now running on http://localhost:8000")


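# A minimal readiness probe, as a sketch: many NIM microservices expose a
# /v1/health/ready route once the model has loaded, but that endpoint is an
# assumption here; check your image's docs. Call this after serve_with_nim()
# before sending traffic.
def wait_until_ready(url="http://localhost:8000/v1/health/ready", timeout_s=300):
    """Poll the assumed readiness endpoint until it returns HTTP 200."""
    import time
    import urllib.error
    import urllib.request

    deadline = time.monotonic() + timeout_s
    while time.monotonic() < deadline:
        try:
            with urllib.request.urlopen(url, timeout=5) as resp:
                if resp.status == 200:
                    print("NIM reports ready.")
                    return True
        except (urllib.error.URLError, OSError):
            pass  # container still starting; keep polling
        time.sleep(5)
    print("Timed out waiting for NIM to become ready.", file=sys.stderr)
    return False

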
def main():
    download_model()
    serve_with_nim()


if __name__ == "__main__":
    main()
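
# A hedged usage sketch: NIM LLM microservices generally expose an
# OpenAI-compatible API, but the exact route and served model name depend on
# the image, so verify both against your microservice's docs:
#
#   curl http://localhost:8000/v1/chat/completions \
#     -H "Content-Type: application/json" \
#     -d '{"model": "your-gated-model", "messages": [{"role": "user", "content": "Hello"}]}'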