mirror of
https://github.com/remsky/Kokoro-FastAPI.git
synced 2025-04-13 09:39:17 +00:00

- Introduced v1.0 model build system integration. - Updated imports to reflect new directory structure for versioned models. - Modified environment variables. - Added version selection in the frontend for voice management. - Enhanced Docker build scripts for multi-platform support. - Updated configuration settings for default voice and model paths.
4.9 KiB
4.9 KiB
Kokoro v1.0 Wrapper Technical Specification
Overview
This document details the technical implementation of the KokoroV1Wrapper class that integrates the Kokoro v1.0 KModel/KPipeline architecture with our existing system.
Class Implementation
from pathlib import Path
from kokoro import KModel, KPipeline
class KokoroV1Wrapper:
    """Wrapper for Kokoro v1.0 KModel/KPipeline integration.

    This wrapper manages:
    1. Model initialization and weight loading
    2. Pipeline creation and caching per language
    3. Streaming audio generation
    """

    def __init__(self, config_path: str, model_path: str):
        """Initialize KModel with config and weights.

        Args:
            config_path: Path to config.json in builds/v1_0/
            model_path: Path to model weights in models/v1_0/
        """
        # Forward both paths to KModel; without them the caller-supplied
        # config and weights would be ignored. eval() switches the module
        # to inference mode (KModel is a torch.nn.Module).
        self.model = KModel(config=config_path, model=model_path).eval()
        # lang_code -> KPipeline cache; every pipeline shares self.model.
        self.pipelines = {}

    def get_pipeline(self, lang_code: str) -> KPipeline:
        """Get or create a KPipeline for the given language code.

        Args:
            lang_code: Language code for phoneme processing

        Returns:
            KPipeline instance for the language
        """
        if lang_code not in self.pipelines:
            # Built on first use and cached, so repeated requests for the
            # same language reuse one pipeline around the single KModel.
            self.pipelines[lang_code] = KPipeline(
                lang_code=lang_code,
                model=self.model,
            )
        return self.pipelines[lang_code]

    async def forward(self, text: str, voice: str, lang_code: str):
        """Generate audio using the appropriate pipeline.

        Args:
            text: Input text to synthesize
            voice: Voice ID to use
            lang_code: Language code for phoneme processing

        Yields:
            Audio chunks as torch.FloatTensor
        """
        pipeline = self.get_pipeline(lang_code)
        # Each pipeline result unpacks to (graphemes, phonemes, audio);
        # only the audio is passed on to the caller.
        for _graphemes, _phonemes, audio in pipeline(text, voice=voice):
            yield audio
class ModelManager:
    """Lazily builds and caches one model instance per version string."""

    def __init__(self):
        # version string -> initialized model instance
        self.models = {}

    async def get_model(self, version: str):
        """Return the model for ``version``, building it on first request.

        Args:
            version: Model version ("v0.19" or "v1.0")

        Returns:
            Initialized model instance

        Raises:
            KeyError: If ``version`` is not cached and is not one of the
                versions this manager knows how to build.
        """
        # Cache hit: nothing to build.
        if version in self.models:
            return self.models[version]

        if version == "v1.0":
            from ..builds.v1_0.wrapper import KokoroV1Wrapper

            # NOTE(review): these paths resolve relative to this file's
            # directory, while the import above resolves `builds` from the
            # parent package -- confirm both point at the same tree.
            here = Path(__file__).parent
            # Config lives in the builds directory, weights in models.
            config_file = here / "builds/v1_0/config.json"
            weights_file = here / "models/v1_0/kokoro-v1_0.pth"
            self.models[version] = KokoroV1Wrapper(
                config_path=str(config_file),
                model_path=str(weights_file),
            )
        elif version == "v0.19":
            from ..builds.v0_19.models import build_model

            self.models[version] = await build_model()

        # An unrecognized version was never stored, so this lookup raises
        # KeyError for it.
        return self.models[version]
Key Design Points
- Model Management
  - KModel handles weights and inference
  - Config and weights loaded from separate directories
  - Language-blind design (phoneme focused)
- Pipeline Caching
  - One KPipeline per language code
  - Pipelines created on demand and cached
  - Reuses single KModel instance
- Streaming Integration
  - Maintains compatibility with existing streaming system
  - Yields audio chunks progressively
  - Handles both quiet and loud pipeline modes
- Version Control
  - Clear separation between v0.19 and v1.0
  - Version-specific model initialization
  - Shared model manager interface
Usage Example
import asyncio


async def main():
    """Load the v1.0 model and stream synthesized audio chunks.

    `await` and `async for` are only valid inside a coroutine, so the
    example is wrapped in one and driven by asyncio.run() below.
    """
    # Initialize model manager
    manager = ModelManager()

    # Get v1.0 model
    model = await manager.get_model("v1.0")

    # Generate audio. Kokoro language codes are single letters; "a" is
    # American English, which matches the "af_bella" voice.
    async for audio in model.forward(
        text="Hello world",
        voice="af_bella",
        lang_code="a",
    ):
        # Process audio chunk
        process_audio(audio)


if __name__ == "__main__":
    asyncio.run(main())
Error Handling
- File Access
  - Verify config.json exists in builds/v1_0/
  - Verify model weights exist in models/v1_0/
  - Handle missing or corrupt files
- Pipeline Creation
  - Validate language codes
  - Handle initialization failures
  - Clean up failed pipeline instances
- Voice Loading
  - Verify voice file existence
  - Handle voice format compatibility
  - Manage voice loading failures
Testing Strategy
- Unit Tests
  - Model initialization
  - Pipeline creation and caching
  - Audio generation
  - Error handling
- Integration Tests
  - End-to-end audio generation
  - Streaming performance
  - Memory usage
  - Multi-language support
- Performance Tests
  - Pipeline creation overhead
  - Memory usage patterns
  - Streaming latency
  - Voice loading speed