Kokoro-FastAPI/api/src/services/audio.py

"""Audio conversion service"""

from io import BytesIO

import numpy as np
import scipy.io.wavfile as wavfile
import soundfile as sf
from loguru import logger

from ..core.config import settings


class AudioNormalizer:
    """Handles audio normalization state for a single stream"""

    def __init__(self):
        # Largest int16 value; used as the float -> int16 scale factor
        self.int16_max = np.iinfo(np.int16).max
        self.chunk_trim_ms = settings.gap_trim_ms
        self.sample_rate = 24000  # Sample rate of the audio
        # Number of trailing samples dropped from every non-final chunk
        self.samples_to_trim = int(self.chunk_trim_ms * self.sample_rate / 1000)

    def normalize(
        self, audio_data: np.ndarray, is_last_chunk: bool = False
    ) -> np.ndarray:
        """Convert audio data to int16 range and trim chunk boundaries

        Args:
            audio_data: Array of audio samples; it is cast to float32 and
                scaled by ``int16_max``
            is_last_chunk: When False, the trailing ``samples_to_trim`` samples
                are removed (only if the chunk is longer than that)

        Returns:
            int16 array of scaled samples; values that fall outside the int16
            range after scaling are clipped to it

        Raises:
            ValueError: If ``audio_data`` is empty
        """
        if len(audio_data) == 0:
            raise ValueError("Audio data cannot be empty")

        audio_float = audio_data.astype(np.float32)

        # Trim for non-final chunks. The `0 <` guard matters: `x[:-0]` is
        # `x[:0]`, which would silently discard the whole chunk when the
        # configured trim is zero.
        trim = self.samples_to_trim
        if not is_last_chunk and 0 < trim < len(audio_float):
            audio_float = audio_float[:-trim]

        # Scale, then saturate: casting an out-of-range float to int16 wraps
        # around instead of clipping.
        int16_info = np.iinfo(np.int16)
        scaled = np.clip(
            audio_float * self.int16_max, int16_info.min, int16_info.max
        )
        return scaled.astype(np.int16)


class AudioService:
    """Service for audio format conversions"""

    # Default encoder settings per format; all use compression_level 0.0
    DEFAULT_SETTINGS = {
        "mp3": {
            "bitrate_mode": "CONSTANT",  # Faster than variable bitrate
            "compression_level": 0.0,
        },
        "opus": {
            "compression_level": 0.0,
        },
        "flac": {
            "compression_level": 0.0,
        },
    }

    # Fixed soundfile container/subtype arguments for each encoded format
    _CONTAINER_ARGS = {
        "wav": {"format": "WAV", "subtype": "PCM_16"},
        "mp3": {"format": "MP3"},
        "opus": {"format": "OGG", "subtype": "OPUS"},
        "flac": {"format": "FLAC", "subtype": "PCM_16"},
    }

    @staticmethod
    def convert_audio(
        audio_data: np.ndarray,
        sample_rate: int,
        output_format: str,
        is_first_chunk: bool = True,
        is_last_chunk: bool = False,
        normalizer: "AudioNormalizer" = None,
        format_settings: dict = None,
        stream: bool = True,
    ) -> bytes:
        """Convert audio data to specified format

        Args:
            audio_data: Numpy array of audio samples
            sample_rate: Sample rate of the audio
            output_format: Target format (wav, mp3, opus, flac, pcm)
            is_first_chunk: Whether this is the first chunk of a stream
                (only used here to log the start of a FLAC stream)
            is_last_chunk: Whether this is the final chunk of a stream;
                forwarded to the normalizer
            normalizer: Optional AudioNormalizer instance for consistent
                normalization across chunks; a new one is created when omitted
            format_settings: Optional dict of format-specific settings to
                override defaults (applies to mp3, opus and flac)
                Example: {
                    "mp3": {
                        "bitrate_mode": "VARIABLE",
                        "compression_level": 0.8
                    }
                }
                Defaults (see DEFAULT_SETTINGS):
                - MP3: constant bitrate, compression_level 0.0
                - OPUS: compression_level 0.0
                - FLAC: compression_level 0.0
            stream: Accepted for interface compatibility; not read by this
                method

        Returns:
            Bytes of the converted audio

        Raises:
            ValueError: If the format is unsupported, or if normalization or
                encoding fails for any reason (the underlying exception is
                chained as the cause)
        """
        try:
            # Always normalize audio to ensure proper amplitude scaling
            if normalizer is None:
                normalizer = AudioNormalizer()
            normalized_audio = normalizer.normalize(
                audio_data, is_last_chunk=is_last_chunk
            )

            if output_format == "pcm":
                # Raw 16-bit PCM samples, no header
                return normalized_audio.tobytes()

            if output_format == "aac":
                raise ValueError(
                    "Format aac not currently supported. Supported formats are: wav, mp3, opus, flac, pcm."
                )

            container_args = AudioService._CONTAINER_ARGS.get(output_format)
            if container_args is None:
                raise ValueError(
                    f"Format {output_format} not supported. Supported formats are: wav, mp3, opus, flac, pcm."
                )

            if output_format == "flac" and is_first_chunk:
                logger.info("Starting FLAC stream...")

            # Caller overrides are layered on top of the defaults; formats
            # without an entry in DEFAULT_SETTINGS (wav) take no extra args.
            encoder_args = {}
            if output_format in AudioService.DEFAULT_SETTINGS:
                overrides = (
                    format_settings.get(output_format, {}) if format_settings else {}
                )
                encoder_args = {
                    **AudioService.DEFAULT_SETTINGS[output_format],
                    **overrides,
                }

            with BytesIO() as buffer:
                sf.write(
                    buffer,
                    normalized_audio,
                    sample_rate,
                    **container_args,
                    **encoder_args,
                )
                return buffer.getvalue()

        except Exception as e:
            logger.error(f"Error converting audio to {output_format}: {str(e)}")
            # Chain the cause so the original traceback is preserved
            raise ValueError(
                f"Failed to convert audio to {output_format}: {str(e)}"
            ) from e
- Complete TTS endpoint replacement with OpenAI compatible -Removed output directory, and update configuration settings - Added benchmarking for entire novel 2024-12-31 01:52:16 -07:00			`"""Audio conversion service"""`
Enhance TTS API with logging, voice pack loading, and schema updates 2024-12-31 01:57:00 -07:00
- Complete TTS endpoint replacement with OpenAI compatible -Removed output directory, and update configuration settings - Added benchmarking for entire novel 2024-12-31 01:52:16 -07:00			`from io import BytesIO`
Refactor TTS API and enhance testing setup with coverage and logging improvements 2024-12-31 02:55:51 -07:00
- Complete TTS endpoint replacement with OpenAI compatible -Removed output directory, and update configuration settings - Added benchmarking for entire novel 2024-12-31 01:52:16 -07:00			`import numpy as np`
-update soundfile version -alignment with streaming standards -audio processing config settings -more comprehensive model warmup -minor model improvements -enhancing testing, benchmarking -cool ascii logo 2025-01-06 03:32:41 -07:00			`import scipy.io.wavfile as wavfile`
Ruff checks, ci fix 2025-01-13 20:15:46 -07:00			`import soundfile as sf`
Refactor TTS API and enhance testing setup with coverage and logging improvements 2024-12-31 02:55:51 -07:00			`from loguru import logger`
Ruff format + fix 2025-01-09 18:41:44 -07:00
-update soundfile version -alignment with streaming standards -audio processing config settings -more comprehensive model warmup -minor model improvements -enhancing testing, benchmarking -cool ascii logo 2025-01-06 03:32:41 -07:00			`from ..core.config import settings`
Enhance TTS API with logging, voice pack loading, and schema updates 2024-12-31 01:57:00 -07:00
Ruff format + fix 2025-01-09 18:41:44 -07:00
First streaming attempt 2025-01-04 17:54:54 -07:00			`class AudioNormalizer:`
			`"""Handles audio normalization state for a single stream"""`
Ruff format + fix 2025-01-09 18:41:44 -07:00
First streaming attempt 2025-01-04 17:54:54 -07:00			`def __init__(self):`
			`self.int16_max = np.iinfo(np.int16).max`
-update soundfile version -alignment with streaming standards -audio processing config settings -more comprehensive model warmup -minor model improvements -enhancing testing, benchmarking -cool ascii logo 2025-01-06 03:32:41 -07:00			`self.chunk_trim_ms = settings.gap_trim_ms`
			`self.sample_rate = 24000 # Sample rate of the audio`
			`self.samples_to_trim = int(self.chunk_trim_ms * self.sample_rate / 1000)`
Ruff format + fix 2025-01-09 18:41:44 -07:00
			`def normalize(`
			`self, audio_data: np.ndarray, is_last_chunk: bool = False`
			`) -> np.ndarray:`
refactor: streamline audio normalization process and update tests 2025-01-13 18:56:49 -07:00			`"""Convert audio data to int16 range and trim chunk boundaries"""`
Ruff checks, ci fix 2025-01-13 20:15:46 -07:00			`if len(audio_data) == 0:`
			`raise ValueError("Audio data cannot be empty")`

refactor: streamline audio normalization process and update tests 2025-01-13 18:56:49 -07:00			`# Simple float32 to int16 conversion`
WIP: open ai compatible streaming 2025-01-04 17:55:36 -07:00			`audio_float = audio_data.astype(np.float32)`
refactor: streamline audio normalization process and update tests 2025-01-13 18:56:49 -07:00
			`# Trim for non-final chunks`
-update soundfile version -alignment with streaming standards -audio processing config settings -more comprehensive model warmup -minor model improvements -enhancing testing, benchmarking -cool ascii logo 2025-01-06 03:32:41 -07:00			`if not is_last_chunk and len(audio_float) > self.samples_to_trim:`
refactor: streamline audio normalization process and update tests 2025-01-13 18:56:49 -07:00			`audio_float = audio_float[:-self.samples_to_trim]`

			`# Direct scaling like the non-streaming version`
			`return (audio_float * 32767).astype(np.int16)`
First streaming attempt 2025-01-04 17:54:54 -07:00
Ruff format + fix 2025-01-09 18:41:44 -07:00
- Complete TTS endpoint replacement with OpenAI compatible -Removed output directory, and update configuration settings - Added benchmarking for entire novel 2024-12-31 01:52:16 -07:00			`class AudioService:`
			`"""Service for audio format conversions"""`
Ruff format + fix 2025-01-09 18:41:44 -07:00
-update soundfile version -alignment with streaming standards -audio processing config settings -more comprehensive model warmup -minor model improvements -enhancing testing, benchmarking -cool ascii logo 2025-01-06 03:32:41 -07:00			`# Default audio format settings balanced for speed and compression`
			`DEFAULT_SETTINGS = {`
			`"mp3": {`
			`"bitrate_mode": "CONSTANT", # Faster than variable bitrate`
			`"compression_level": 0.0, # Balanced compression`
			`},`
			`"opus": {`
			`"compression_level": 0.0, # Good balance for speech`
			`},`
			`"flac": {`
			`"compression_level": 0.0, # Light compression, still fast`
Ruff format + fix 2025-01-09 18:41:44 -07:00			`},`
-update soundfile version -alignment with streaming standards -audio processing config settings -more comprehensive model warmup -minor model improvements -enhancing testing, benchmarking -cool ascii logo 2025-01-06 03:32:41 -07:00			`}`
Ruff format + fix 2025-01-09 18:41:44 -07:00
- Complete TTS endpoint replacement with OpenAI compatible -Removed output directory, and update configuration settings - Added benchmarking for entire novel 2024-12-31 01:52:16 -07:00			`@staticmethod`
Enhance TTS API with logging, voice pack loading, and schema updates 2024-12-31 01:57:00 -07:00			`def convert_audio(`
Ruff format + fix 2025-01-09 18:41:44 -07:00			`audio_data: np.ndarray,`
			`sample_rate: int,`
			`output_format: str,`
First streaming attempt 2025-01-04 17:54:54 -07:00			`is_first_chunk: bool = True,`
-update soundfile version -alignment with streaming standards -audio processing config settings -more comprehensive model warmup -minor model improvements -enhancing testing, benchmarking -cool ascii logo 2025-01-06 03:32:41 -07:00			`is_last_chunk: bool = False,`
			`normalizer: AudioNormalizer = None,`
			`format_settings: dict = None,`
Ruff format + fix 2025-01-09 18:41:44 -07:00			`stream: bool = True,`
Enhance TTS API with logging, voice pack loading, and schema updates 2024-12-31 01:57:00 -07:00			`) -> bytes:`
- Complete TTS endpoint replacement with OpenAI compatible -Removed output directory, and update configuration settings - Added benchmarking for entire novel 2024-12-31 01:52:16 -07:00			`"""Convert audio data to specified format`
Enhance TTS API with logging, voice pack loading, and schema updates 2024-12-31 01:57:00 -07:00
- Complete TTS endpoint replacement with OpenAI compatible -Removed output directory, and update configuration settings - Added benchmarking for entire novel 2024-12-31 01:52:16 -07:00			`Args:`
			`audio_data: Numpy array of audio samples`
			`sample_rate: Sample rate of the audio`
Update audio.py 2025-01-01 21:11:23 +05:30			`output_format: Target format (wav, mp3, opus, flac, pcm)`
First streaming attempt 2025-01-04 17:54:54 -07:00			`is_first_chunk: Whether this is the first chunk of a stream`
-update soundfile version -alignment with streaming standards -audio processing config settings -more comprehensive model warmup -minor model improvements -enhancing testing, benchmarking -cool ascii logo 2025-01-06 03:32:41 -07:00			`normalizer: Optional AudioNormalizer instance for consistent normalization across chunks`
			`format_settings: Optional dict of format-specific settings to override defaults`
			`Example: {`
			`"mp3": {`
			`"bitrate_mode": "VARIABLE",`
			`"compression_level": 0.8`
			`}`
			`}`
			`Default settings balance speed and compression:`
			`optimized for localhost @ 0.0`
			`- MP3: constant bitrate, no compression (0.0)`
			`- OPUS: no compression (0.0)`
			`- FLAC: no compression (0.0)`
Enhance TTS API with logging, voice pack loading, and schema updates 2024-12-31 01:57:00 -07:00
- Complete TTS endpoint replacement with OpenAI compatible -Removed output directory, and update configuration settings - Added benchmarking for entire novel 2024-12-31 01:52:16 -07:00			`Returns:`
			`Bytes of the converted audio`
			`"""`
			`buffer = BytesIO()`
Enhance TTS API with logging, voice pack loading, and schema updates 2024-12-31 01:57:00 -07:00
- Complete TTS endpoint replacement with OpenAI compatible -Removed output directory, and update configuration settings - Added benchmarking for entire novel 2024-12-31 01:52:16 -07:00			`try:`
WIP: open ai compatible streaming 2025-01-04 17:55:36 -07:00			`# Always normalize audio to ensure proper amplitude scaling`
- Added GenerateFromPhonemesRequest model to text_schemas.py - Refactored TTS model initialization methods in tts_gpu.py and tts_cpu.py - Added custom logger configuration in main.py - Deprecated text_processing router -> development route 2025-01-09 07:20:14 -07:00			`if normalizer is None:`
			`normalizer = AudioNormalizer()`
Ruff format + fix 2025-01-09 18:41:44 -07:00			`normalized_audio = normalizer.normalize(`
			`audio_data, is_last_chunk=is_last_chunk`
			`)`
First streaming attempt 2025-01-04 17:54:54 -07:00
			`if output_format == "pcm":`
			`# Raw 16-bit PCM samples, no header`
			`buffer.write(normalized_audio.tobytes())`
			`elif output_format == "wav":`
Refactor Docker configurations and update test mocks for development routers 2025-01-10 22:03:16 -07:00			`# WAV format with headers`
Ruff format + fix 2025-01-09 18:41:44 -07:00			`sf.write(`
			`buffer,`
			`normalized_audio,`
			`sample_rate,`
			`format="WAV",`
			`subtype="PCM_16",`
			`)`
WIP: basic tests on OpenAI streaming compatibility 2025-01-04 18:09:23 -07:00			`elif output_format == "mp3":`
Refactor Docker configurations and update test mocks for development routers 2025-01-10 22:03:16 -07:00			`# MP3 format with proper framing`
-update soundfile version -alignment with streaming standards -audio processing config settings -more comprehensive model warmup -minor model improvements -enhancing testing, benchmarking -cool ascii logo 2025-01-06 03:32:41 -07:00			`settings = format_settings.get("mp3", {}) if format_settings else {}`
			`settings = {**AudioService.DEFAULT_SETTINGS["mp3"], **settings}`
			`sf.write(`
Ruff format + fix 2025-01-09 18:41:44 -07:00			`buffer, normalized_audio, sample_rate, format="MP3", **settings`
			`)`
Enhance TTS API with logging, voice pack loading, and schema updates 2024-12-31 01:57:00 -07:00			`elif output_format == "opus":`
Refactor Docker configurations and update test mocks for development routers 2025-01-10 22:03:16 -07:00			`# Opus format in OGG container`
-update soundfile version -alignment with streaming standards -audio processing config settings -more comprehensive model warmup -minor model improvements -enhancing testing, benchmarking -cool ascii logo 2025-01-06 03:32:41 -07:00			`settings = format_settings.get("opus", {}) if format_settings else {}`
			`settings = {**AudioService.DEFAULT_SETTINGS["opus"], **settings}`
Ruff format + fix 2025-01-09 18:41:44 -07:00			`sf.write(`
			`buffer,`
			`normalized_audio,`
			`sample_rate,`
			`format="OGG",`
			`subtype="OPUS",`
			`**settings,`
			`)`
Enhance TTS API with logging, voice pack loading, and schema updates 2024-12-31 01:57:00 -07:00			`elif output_format == "flac":`
Refactor Docker configurations and update test mocks for development routers 2025-01-10 22:03:16 -07:00			`# FLAC format with proper framing`
-update soundfile version -alignment with streaming standards -audio processing config settings -more comprehensive model warmup -minor model improvements -enhancing testing, benchmarking -cool ascii logo 2025-01-06 03:32:41 -07:00			`if is_first_chunk:`
			`logger.info("Starting FLAC stream...")`
			`settings = format_settings.get("flac", {}) if format_settings else {}`
			`settings = {**AudioService.DEFAULT_SETTINGS["flac"], **settings}`
Ruff format + fix 2025-01-09 18:41:44 -07:00			`sf.write(`
			`buffer,`
			`normalized_audio,`
			`sample_rate,`
			`format="FLAC",`
			`subtype="PCM_16",`
			`**settings,`
			`)`
Refactor Docker configurations and update test mocks for development routers 2025-01-10 22:03:16 -07:00			`elif output_format == "aac":`
			`raise ValueError(`
			`"Format aac not currently supported. Supported formats are: wav, mp3, opus, flac, pcm."`
			`)`
- Complete TTS endpoint replacement with OpenAI compatible -Removed output directory, and update configuration settings - Added benchmarking for entire novel 2024-12-31 01:52:16 -07:00			`else:`
Refactor Docker configurations and update test mocks for development routers 2025-01-10 22:03:16 -07:00			`raise ValueError(`
			`f"Format {output_format} not supported. Supported formats are: wav, mp3, opus, flac, pcm, aac."`
			`)`
Enhance TTS API with logging, voice pack loading, and schema updates 2024-12-31 01:57:00 -07:00
Update audio.py 2025-01-01 21:11:23 +05:30			`buffer.seek(0)`
			`return buffer.getvalue()`

- Complete TTS endpoint replacement with OpenAI compatible -Removed output directory, and update configuration settings - Added benchmarking for entire novel 2024-12-31 01:52:16 -07:00			`except Exception as e:`
			`logger.error(f"Error converting audio to {output_format}: {str(e)}")`
			`raise ValueError(f"Failed to convert audio to {output_format}: {str(e)}")`