Kokoro-FastAPI/api/src/inference/base.py

"""Base interface for Kokoro inference."""

from abc import ABC, abstractmethod
from typing import AsyncGenerator, Optional, Tuple, Union, List

import numpy as np
import torch

class AudioChunk:
    """Container for one piece of audio returned by a model backend.

    Attributes:
        audio: Array holding the chunk's audio samples.
        word_timestamps: List of word-level timestamp entries, or ``None``
            when the caller explicitly passed ``None``. The element type is
            not constrained by this class.
        output: Bytes or array payload attached to the chunk (empty bytes by
            default).
    """

    # Sentinel that distinguishes "argument omitted" from an explicit None.
    # A literal ``[]`` default would be a single list shared by every chunk
    # created without timestamps, so one chunk's appends would leak into all
    # the others.
    _UNSET = object()

    def __init__(
        self,
        audio: np.ndarray,
        word_timestamps: Optional[List] = _UNSET,
        output: Optional[Union[bytes, np.ndarray]] = b"",
    ):
        """Create a chunk.

        Args:
            audio: Audio samples for this chunk.
            word_timestamps: Timestamp entries for this chunk. When omitted,
                a fresh empty list is created for this instance. An explicit
                ``None`` is stored as ``None``.
            output: Optional payload stored unchanged on ``self.output``.
        """
        self.audio = audio
        if word_timestamps is AudioChunk._UNSET:
            # Fresh list per instance instead of a shared default object.
            self.word_timestamps = []
        else:
            self.word_timestamps = word_timestamps
        self.output = output

    @staticmethod
    def combine(audio_chunk_list: List) -> "AudioChunk":
        """Merge several chunks into a single new chunk.

        The input chunks are left unmodified.

        Args:
            audio_chunk_list: Non-empty list of ``AudioChunk`` objects, in
                playback order.

        Returns:
            A new ``AudioChunk``. With more than one input chunk the audio
            arrays are concatenated with ``dtype=np.int16``; with exactly one
            chunk its audio array is reused as-is. ``word_timestamps`` is
            ``None`` when the first chunk's is ``None``; otherwise it is a
            new list with the entries of every chunk that has timestamps.
            ``output`` is left at its default.

        Raises:
            IndexError: If ``audio_chunk_list`` is empty.
        """
        if not audio_chunk_list:
            raise IndexError("AudioChunk.combine() requires at least one chunk")

        first = audio_chunk_list[0]

        if len(audio_chunk_list) == 1:
            # Nothing to join: hand back the array untouched (no dtype cast).
            audio = first.audio
        else:
            # One concatenate call over all chunks avoids re-copying the
            # accumulated audio once per chunk.
            audio = np.concatenate(
                [chunk.audio for chunk in audio_chunk_list], dtype=np.int16
            )

        if first.word_timestamps is None:
            # The first chunk decides whether timestamps are tracked at all.
            word_timestamps = None
        else:
            # Copy so that extending the result never mutates the first chunk.
            word_timestamps = list(first.word_timestamps)
            for chunk in audio_chunk_list[1:]:
                # Chunks constructed with word_timestamps=None contribute nothing.
                if chunk.word_timestamps is not None:
                    word_timestamps.extend(chunk.word_timestamps)

        return AudioChunk(audio, word_timestamps)
            
class ModelBackend(ABC):
    """Interface that every model inference backend has to implement."""

    @property
    @abstractmethod
    def is_loaded(self) -> bool:
        """Report whether a model is currently loaded.

        Returns:
            True when a model is loaded, False when it is not
        """
        ...

    @property
    @abstractmethod
    def device(self) -> str:
        """Report the device the model runs on.

        Returns:
            Device string ('cpu' or 'cuda')
        """
        ...

    @abstractmethod
    async def load_model(self, path: str) -> None:
        """Load the model found at the given path.

        Args:
            path: Location of the model file

        Raises:
            RuntimeError: When the model cannot be loaded
        """
        ...

    @abstractmethod
    async def generate(
        self,
        text: str,
        voice: Union[str, Tuple[str, Union[torch.Tensor, str]]],
        speed: float = 1.0,
    ) -> AsyncGenerator[AudioChunk, None]:
        """Synthesize audio for the given text.

        Args:
            text: Text to turn into audio
            voice: A voice path, or a (name, tensor/path) tuple
            speed: Multiplier applied to the speaking speed

        Yields:
            Audio chunks as they are generated

        Raises:
            RuntimeError: When generation fails
        """
        ...

    @abstractmethod
    def unload(self) -> None:
        """Release the model and the resources it holds."""
        ...


class BaseModelBackend(ModelBackend):
    """Common state handling shared by model backends."""

    def __init__(self):
        """Start with no model loaded and the device set to "cpu"."""
        self._device: str = "cpu"
        self._model: Optional[torch.nn.Module] = None

    @property
    def device(self) -> str:
        """Return the device string stored for this backend."""
        return self._device

    @property
    def is_loaded(self) -> bool:
        """Return True when a model object is currently held."""
        return self._model is not None

    def unload(self) -> None:
        """Drop the held model, then clear the CUDA cache when CUDA is available."""
        # Nothing to release when no model is held.
        if self._model is None:
            return

        # Remove the reference, then restore the attribute so later
        # is_loaded checks keep working.
        del self._model
        self._model = None

        if not torch.cuda.is_available():
            return
        torch.cuda.empty_cache()
        torch.cuda.synchronize()
Initial commit of Kokoro V1.0-only integration 2025-02-04 03:37:56 -07:00			`"""Base interface for Kokoro inference."""`
Refactor inference architecture: remove legacy TTS model, add ONNX and PyTorch backends, and introduce model configuration schemas 2025-01-20 22:42:29 -07:00
			`from abc import ABC, abstractmethod`
WIP 2025-02-11 22:32:10 -05:00			`from typing import AsyncGenerator, Optional, Tuple, Union, List`
Refactor inference architecture: remove legacy TTS model, add ONNX and PyTorch backends, and introduce model configuration schemas 2025-01-20 22:42:29 -07:00
			`import numpy as np`
			`import torch`

WIP 2025-02-11 22:32:10 -05:00			`class AudioChunk:`
			`"""Class for audio chunks returned by model backends"""`

			`def __init__(self,`
			`audio: np.ndarray,`
Simplify code so erverything uses AudioChunks 2025-02-16 15:37:01 -05:00			`word_timestamps: Optional[List]=[],`
			`output: Optional[Union[bytes,np.ndarray]]=b""`
WIP 2025-02-11 22:32:10 -05:00			`):`
			`self.audio=audio`
			`self.word_timestamps=word_timestamps`
Simplify code so erverything uses AudioChunks 2025-02-16 15:37:01 -05:00			`self.output=output`
WIP 2025-02-11 22:32:10 -05:00
Simplifed generate_audio in tts_service mostly working (audio conversion does not work) 2025-02-12 22:42:41 -05:00			`@staticmethod`
			`def combine(audio_chunk_list: List):`
			`output=AudioChunk(audio_chunk_list[0].audio,audio_chunk_list[0].word_timestamps)`

			`for audio_chunk in audio_chunk_list[1:]:`
fixed no stream file writing 2025-02-13 16:12:51 -05:00			`output.audio=np.concatenate((output.audio,audio_chunk.audio),dtype=np.int16)`
Simplifed generate_audio in tts_service mostly working (audio conversion does not work) 2025-02-12 22:42:41 -05:00			`if output.word_timestamps is not None:`
Mostly completed work on refractoring a bunch of code as well as streaming word level time stamps 2025-02-14 14:29:47 -05:00			`output.word_timestamps+=audio_chunk.word_timestamps`
Simplifed generate_audio in tts_service mostly working (audio conversion does not work) 2025-02-12 22:42:41 -05:00
			`return output`

Refactor inference architecture: remove legacy TTS model, add ONNX and PyTorch backends, and introduce model configuration schemas 2025-01-20 22:42:29 -07:00			`class ModelBackend(ABC):`
Initial commit of Kokoro V1.0-only integration 2025-02-04 03:37:56 -07:00			`"""Abstract base class for model inference backend."""`
Refactor inference architecture: remove legacy TTS model, add ONNX and PyTorch backends, and introduce model configuration schemas 2025-01-20 22:42:29 -07:00
			`@abstractmethod`
			`async def load_model(self, path: str) -> None:`
			`"""Load model from path.`
Ruff check + formatting 2025-02-09 18:32:17 -07:00
Refactor inference architecture: remove legacy TTS model, add ONNX and PyTorch backends, and introduce model configuration schemas 2025-01-20 22:42:29 -07:00			`Args:`
			`path: Path to model file`
Ruff check + formatting 2025-02-09 18:32:17 -07:00
Refactor inference architecture: remove legacy TTS model, add ONNX and PyTorch backends, and introduce model configuration schemas 2025-01-20 22:42:29 -07:00			`Raises:`
			`RuntimeError: If model loading fails`
			`"""`
			`pass`

			`@abstractmethod`
Initial commit of Kokoro V1.0-only integration 2025-02-04 03:37:56 -07:00			`async def generate(`
Refactor inference architecture: remove legacy TTS model, add ONNX and PyTorch backends, and introduce model configuration schemas 2025-01-20 22:42:29 -07:00			`self,`
Initial commit of Kokoro V1.0-only integration 2025-02-04 03:37:56 -07:00			`text: str,`
			`voice: Union[str, Tuple[str, Union[torch.Tensor, str]]],`
Ruff check + formatting 2025-02-09 18:32:17 -07:00			`speed: float = 1.0,`
WIP 2025-02-11 22:32:10 -05:00			`) -> AsyncGenerator[AudioChunk, None]:`
Initial commit of Kokoro V1.0-only integration 2025-02-04 03:37:56 -07:00			`"""Generate audio from text.`
Ruff check + formatting 2025-02-09 18:32:17 -07:00
Refactor inference architecture: remove legacy TTS model, add ONNX and PyTorch backends, and introduce model configuration schemas 2025-01-20 22:42:29 -07:00			`Args:`
Initial commit of Kokoro V1.0-only integration 2025-02-04 03:37:56 -07:00			`text: Input text to synthesize`
			`voice: Either a voice path or tuple of (name, tensor/path)`
Refactor inference architecture: remove legacy TTS model, add ONNX and PyTorch backends, and introduce model configuration schemas 2025-01-20 22:42:29 -07:00			`speed: Speed multiplier`
Ruff check + formatting 2025-02-09 18:32:17 -07:00
Initial commit of Kokoro V1.0-only integration 2025-02-04 03:37:56 -07:00			`Yields:`
			`Generated audio chunks`
Ruff check + formatting 2025-02-09 18:32:17 -07:00
Refactor inference architecture: remove legacy TTS model, add ONNX and PyTorch backends, and introduce model configuration schemas 2025-01-20 22:42:29 -07:00			`Raises:`
			`RuntimeError: If generation fails`
			`"""`
			`pass`

			`@abstractmethod`
			`def unload(self) -> None:`
			`"""Unload model and free resources."""`
			`pass`

			`@property`
			`@abstractmethod`
			`def is_loaded(self) -> bool:`
			`"""Check if model is loaded.`
Ruff check + formatting 2025-02-09 18:32:17 -07:00
Refactor inference architecture: remove legacy TTS model, add ONNX and PyTorch backends, and introduce model configuration schemas 2025-01-20 22:42:29 -07:00			`Returns:`
			`True if model is loaded, False otherwise`
			`"""`
			`pass`

			`@property`
			`@abstractmethod`
			`def device(self) -> str:`
			`"""Get device model is running on.`
Ruff check + formatting 2025-02-09 18:32:17 -07:00
Refactor inference architecture: remove legacy TTS model, add ONNX and PyTorch backends, and introduce model configuration schemas 2025-01-20 22:42:29 -07:00			`Returns:`
			`Device string ('cpu' or 'cuda')`
			`"""`
			`pass`


			`class BaseModelBackend(ModelBackend):`
			`"""Base implementation of model backend."""`

			`def __init__(self):`
			`"""Initialize base backend."""`
			`self._model: Optional[torch.nn.Module] = None`
			`self._device: str = "cpu"`

			`@property`
			`def is_loaded(self) -> bool:`
			`"""Check if model is loaded."""`
			`return self._model is not None`

			`@property`
			`def device(self) -> str:`
			`"""Get device model is running on."""`
			`return self._device`

			`def unload(self) -> None:`
			`"""Unload model and free resources."""`
			`if self._model is not None:`
			`del self._model`
			`self._model = None`
			`if torch.cuda.is_available():`
Initial commit of Kokoro V1.0-only integration 2025-02-04 03:37:56 -07:00			`torch.cuda.empty_cache()`
Ruff check + formatting 2025-02-09 18:32:17 -07:00			`torch.cuda.synchronize()`