diff --git a/Dockerfile.cpu b/Dockerfile.cpu
new file mode 100644
index 0000000..959e555
--- /dev/null
+++ b/Dockerfile.cpu
@@ -0,0 +1,43 @@
+FROM ubuntu:22.04
+
+# Install base system dependencies
+RUN apt-get update && apt-get install -y --no-install-recommends \
+    python3-pip \
+    python3-dev \
+    espeak-ng \
+    git \
+    libsndfile1 \
+    && apt-get clean \
+    && rm -rf /var/lib/apt/lists/*
+
+# Install PyTorch CPU version
+RUN pip3 install --no-cache-dir torch==2.5.1 --extra-index-url https://download.pytorch.org/whl/cpu
+
+# Install all other dependencies from requirements.txt
+COPY requirements.txt .
+RUN pip3 install --no-cache-dir -r requirements.txt
+
+# Copy application code and model
+COPY . /app/
+
+# Set working directory
+WORKDIR /app
+
+# Run with Python unbuffered output for live logging
+ENV PYTHONUNBUFFERED=1
+
+# Create non-root user
+RUN useradd -m -u 1000 appuser
+
+# Create directories and set permissions
+RUN mkdir -p /app/Kokoro-82M && \
+    chown -R appuser:appuser /app
+
+# Switch to non-root user
+USER appuser
+
+# Set Python path (app first for our imports, then model dir for model imports)
+ENV PYTHONPATH=/app:/app/Kokoro-82M
+
+# Run FastAPI server with debug logging
+CMD ["uvicorn", "api.src.main:app", "--host", "0.0.0.0", "--port", "8880", "--log-level", "debug"]
diff --git a/README.md b/README.md
index fd8a3a1..be9622c 100644
--- a/README.md
+++ b/README.md
@@ -8,7 +8,7 @@
 [![Coverage](https://img.shields.io/badge/coverage-97%25-darkgreen)]()
 
 FastAPI wrapper for [Kokoro-82M](https://huggingface.co/hexgrad/Kokoro-82M) text-to-speech model, providing an OpenAI-compatible endpoint with:
-- NVIDIA GPU acceleration enabled
+- NVIDIA GPU-accelerated inference, with a CPU-only option
 - automatic chunking/stitching for long texts
 - very fast generation time (~35-49x RTF)
 
@@ -24,10 +24,15 @@ FastAPI wrapper for [Kokoro-82M](https://huggingface.co/hexgrad/Kokoro-82M) text
 git clone https://github.com/remsky/Kokoro-FastAPI.git
 cd Kokoro-FastAPI
 
-# Start the API (will automatically clone source HF repo via git-lfs)
+# For GPU acceleration (requires an NVIDIA GPU):
 docker compose up --build
+
+# For CPU-only deployment (~10x slower, but no NVIDIA GPU required):
+docker compose -f docker-compose.cpu.yml up --build
 ```
+
+
 Test all voices (from another terminal):
 ```bash
 python examples/test_all_voices.py
 ```
@@ -106,11 +111,12 @@ Key Performance Metrics:
 ## Features
 - OpenAI-compatible API endpoints
-- GPU-accelerated inference
+- GPU-accelerated inference (optional)
 - Multiple audio formats: mp3, wav, opus, flac, (aac & pcm not implemented)
 - Natural Boundary Detection:
     - Automatically splits and stitches at sentence boundaries to reduce artifacts and maintain performance
+
+*Note: CPU inference is currently a very basic implementation and has not been heavily tested*
 
 ## Model
@@ -135,4 +141,3 @@ The full Apache 2.0 license text can be found at: https://www.apache.org/license
 
 https://user-images.githubusercontent.com/338912d2-90f3-41fb-bca0-5db7b4e02287.mp4
 
-
diff --git a/docker-compose.cpu.yml b/docker-compose.cpu.yml
new file mode 100644
index 0000000..02aa674
--- /dev/null
+++ b/docker-compose.cpu.yml
@@ -0,0 +1,38 @@
+services:
+  model-fetcher:
+    image: datamachines/git-lfs:latest
+    volumes:
+      - ./Kokoro-82M:/app/Kokoro-82M
+    working_dir: /app/Kokoro-82M
+    command: >
+      sh -c "
+        if [ -z \"$(ls -A .)\" ]; then
+          git clone https://huggingface.co/hexgrad/Kokoro-82M . && \
+          git checkout 8228a351f87c8a6076502c1e3b7e72e821ebec9a;
+          touch .cloned;
+        else
+          touch .cloned;
+        fi;
+        tail -f /dev/null
+      "
+    healthcheck:
+      test: ["CMD", "test", "-f", ".cloned"]
+      interval: 1s
+      timeout: 1s
+      retries: 120
+      start_period: 1s
+
+  kokoro-tts:
+    build:
+      context: .
+      dockerfile: Dockerfile.cpu
+    volumes:
+      - ./api/src:/app/api/src
+      - ./Kokoro-82M:/app/Kokoro-82M
+    ports:
+      - "8880:8880"
+    environment:
+      - PYTHONPATH=/app:/app/Kokoro-82M
+    depends_on:
+      model-fetcher:
+        condition: service_healthy
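
A note on the `model-fetcher` pattern above: `kokoro-tts` is gated on the sentinel file `.cloned`, which the healthcheck polls once per second for up to 120 retries while the git-lfs clone runs. If startup appears to hang, the health state can be watched directly. A minimal sketch, assuming the default compose project name produces the container `kokoro-fastapi-model-fetcher-1` (check `docker ps` for the actual name):

```bash
# Watch the model-fetcher transition from "starting" to "healthy"
docker inspect --format '{{.State.Health.Status}}' kokoro-fastapi-model-fetcher-1
```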
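
Once either stack is up, the service can be smoke-tested through its OpenAI-compatible route. This is a sketch, not taken from the repo's examples: it assumes the `/v1/audio/speech` path, the `kokoro` model name, and an `af` default voice, so adjust the payload to whatever the deployed API actually exposes:

```bash
# Request speech from the local server and save the result as an mp3
curl http://localhost:8880/v1/audio/speech \
  -H "Content-Type: application/json" \
  -d '{"model": "kokoro", "input": "Hello from the CPU container.", "voice": "af", "response_format": "mp3"}' \
  --output hello.mp3
```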
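
Since `Dockerfile.cpu` installs `torch==2.5.1` from the CPU wheel index before the rest of `requirements.txt`, it is worth confirming that a later dependency did not drag in a CUDA build. A quick check, assuming the compose-built image is tagged `kokoro-fastapi-kokoro-tts` (see `docker images` for the real tag):

```bash
# A CPU-only wheel reports a "+cpu" version suffix and no CUDA support
docker run --rm kokoro-fastapi-kokoro-tts \
  python3 -c "import torch; print(torch.__version__, torch.cuda.is_available())"
# Expected output: 2.5.1+cpu False
```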