Kokoro-FastAPI/api/src/services/audio.py

"""Audio conversion service"""

from io import BytesIO

import numpy as np
import scipy.io.wavfile as wavfile
import soundfile as sf
from loguru import logger
from pydub import AudioSegment

from ..core.config import settings


class AudioNormalizer:
    """Handles audio normalization state for a single stream.

    The trim length is computed once at construction from
    ``settings.gap_trim_ms`` and a fixed 24 kHz sample rate, then reused for
    every chunk passed to :meth:`normalize`.
    """

    def __init__(self):
        # Largest positive int16 value (32767); used as the scaling factor.
        self.int16_max = np.iinfo(np.int16).max
        # Milliseconds to cut from each end of every chunk.
        self.chunk_trim_ms = settings.gap_trim_ms
        self.sample_rate = 24000  # Sample rate of the audio
        # Same trim expressed as a sample count at ``self.sample_rate``.
        self.samples_to_trim = int(self.chunk_trim_ms * self.sample_rate / 1000)

    async def normalize(self, audio_data: np.ndarray) -> np.ndarray:
        """Trim both ends of a chunk and convert it to int16 samples.

        Samples are multiplied by the int16 maximum, so the input is expected
        to be floating-point audio nominally in [-1.0, 1.0]. Values outside
        that range are clipped so they saturate instead of overflowing int16.

        Args:
            audio_data: Input audio data as numpy array

        Returns:
            Trimmed audio data as an int16 array. The chunk is returned
            untrimmed when trimming is disabled (``samples_to_trim <= 0``) or
            when it is not longer than twice the trim length.

        Raises:
            ValueError: If ``audio_data`` is empty.
        """
        if len(audio_data) == 0:
            raise ValueError("Audio data cannot be empty")

        # Work on a float32 copy so the caller's array is never modified.
        audio_float = audio_data.astype(np.float32)

        # Trim start and end if enough samples. The explicit ``trim > 0`` guard
        # and positive end index matter: ``arr[0:-0]`` is ``arr[0:0]``, which
        # would silently discard the whole chunk when trimming is disabled.
        trim = self.samples_to_trim
        total = len(audio_float)
        if trim > 0 and total > 2 * trim:
            audio_float = audio_float[trim : total - trim]

        # Saturate before scaling; casting an out-of-range float to int16
        # does not clamp and produces garbage samples.
        clipped = np.clip(audio_float, -1.0, 1.0)

        # Scale to int16 range
        return (clipped * self.int16_max).astype(np.int16)


class AudioService:
    """Service for audio format conversions"""

    # Default per-format encoder options, tuned for speed over file size.
    # The mp3/opus/flac entries are forwarded verbatim as keyword arguments to
    # ``soundfile.write``; the aac bitrate is forwarded to pydub's ``export``.
    DEFAULT_SETTINGS = {
        "mp3": {
            "bitrate_mode": "CONSTANT",  # Faster than variable bitrate
            "compression_level": 0.0,  # Minimum compression level (fastest)
        },
        "opus": {
            "compression_level": 0.0,  # Minimum compression level (fastest)
        },
        "flac": {
            "compression_level": 0.0,  # Minimum compression level (fastest)
        },
        "aac": {
            "bitrate": "192k",  # Default AAC bitrate
        },
    }

    @staticmethod
    def _resolve_settings(output_format: str, format_settings: dict = None) -> dict:
        """Return the defaults for ``output_format`` with caller overrides applied."""
        # A missing dict, a missing key and an explicit ``None`` entry all mean
        # "no overrides" for this format.
        overrides = (format_settings or {}).get(output_format) or {}
        return {**AudioService.DEFAULT_SETTINGS.get(output_format, {}), **overrides}

    @staticmethod
    async def convert_audio(
        audio_data: np.ndarray,
        sample_rate: int,
        output_format: str,
        is_first_chunk: bool = True,
        is_last_chunk: bool = False,
        normalizer: AudioNormalizer = None,
        format_settings: dict = None,
        stream: bool = True,
    ) -> bytes:
        """Convert audio data to specified format

        The audio is always passed through an ``AudioNormalizer`` first and
        the resulting int16 samples are then encoded into an in-memory buffer.

        Args:
            audio_data: Numpy array of audio samples
            sample_rate: Sample rate of the audio
            output_format: Target format (wav, mp3, opus, flac, pcm, aac)
            is_first_chunk: Whether this is the first chunk of a stream.
                Currently only used to log the start of a FLAC stream.
            is_last_chunk: Whether this is the last chunk of a stream.
                Accepted for interface compatibility; not used by this method.
            normalizer: Optional AudioNormalizer instance for consistent
                normalization across chunks. A new one is created when omitted.
            format_settings: Optional dict of format-specific settings to override defaults
                Example: {
                    "mp3": {
                        "bitrate_mode": "VARIABLE",
                        "compression_level": 0.8
                    }
                }
                Defaults favour speed (optimized for localhost @ 0.0):
                - MP3: constant bitrate, compression level 0.0
                - OPUS: compression level 0.0
                - FLAC: compression level 0.0
                - AAC: 192k bitrate
            stream: Accepted for interface compatibility; not used by this method.

        Returns:
            Bytes of the converted audio

        Raises:
            ValueError: If the format is unsupported or any step of the
                normalization/encoding fails. The original exception is
                attached as ``__cause__``.
        """
        buffer = BytesIO()

        try:
            # Always normalize audio to ensure proper amplitude scaling
            if normalizer is None:
                normalizer = AudioNormalizer()
            normalized_audio = await normalizer.normalize(audio_data)

            if output_format == "pcm":
                # Raw 16-bit PCM samples, no header
                buffer.write(normalized_audio.tobytes())
            elif output_format == "wav":
                # WAV format with headers
                sf.write(
                    buffer,
                    normalized_audio,
                    sample_rate,
                    format="WAV",
                    subtype="PCM_16",
                )
            elif output_format == "mp3":
                # MP3 format with proper framing
                encoder_opts = AudioService._resolve_settings("mp3", format_settings)
                sf.write(
                    buffer, normalized_audio, sample_rate, format="MP3", **encoder_opts
                )
            elif output_format == "opus":
                # Opus format in OGG container
                encoder_opts = AudioService._resolve_settings("opus", format_settings)
                sf.write(
                    buffer,
                    normalized_audio,
                    sample_rate,
                    format="OGG",
                    subtype="OPUS",
                    **encoder_opts,
                )
            elif output_format == "flac":
                # FLAC format with proper framing
                if is_first_chunk:
                    logger.info("Starting FLAC stream...")
                encoder_opts = AudioService._resolve_settings("flac", format_settings)
                sf.write(
                    buffer,
                    normalized_audio,
                    sample_rate,
                    format="FLAC",
                    subtype="PCM_16",
                    **encoder_opts,
                )
            elif output_format == "aac":
                # Wrap the raw int16 samples in a pydub segment; a 1-D array is
                # mono, otherwise the second axis is taken as the channel count.
                channel_count = (
                    1 if len(normalized_audio.shape) == 1 else normalized_audio.shape[1]
                )
                audio_segment = AudioSegment(
                    normalized_audio.tobytes(),
                    frame_rate=sample_rate,
                    sample_width=normalized_audio.dtype.itemsize,
                    channels=channel_count,
                )

                encoder_opts = AudioService._resolve_settings("aac", format_settings)

                audio_segment.export(
                    buffer,
                    format="adts",  # ADTS is a common AAC container format
                    bitrate=encoder_opts["bitrate"],
                )
            else:
                raise ValueError(
                    f"Format {output_format} not supported. Supported formats are: wav, mp3, opus, flac, pcm, aac."
                )

            # getvalue() returns the full buffer regardless of stream position.
            return buffer.getvalue()

        except Exception as e:
            logger.error(f"Error converting audio to {output_format}: {str(e)}")
            # Chain the cause so the original traceback is preserved for debugging.
            raise ValueError(
                f"Failed to convert audio to {output_format}: {str(e)}"
            ) from e
- Complete TTS endpoint replacement with OpenAI compatible -Removed output directory, and update configuration settings - Added benchmarking for entire novel 2024-12-31 01:52:16 -07:00			`"""Audio conversion service"""`
Enhance TTS API with logging, voice pack loading, and schema updates 2024-12-31 01:57:00 -07:00
- Complete TTS endpoint replacement with OpenAI compatible -Removed output directory, and update configuration settings - Added benchmarking for entire novel 2024-12-31 01:52:16 -07:00			`from io import BytesIO`
Refactor TTS API and enhance testing setup with coverage and logging improvements 2024-12-31 02:55:51 -07:00
- Complete TTS endpoint replacement with OpenAI compatible -Removed output directory, and update configuration settings - Added benchmarking for entire novel 2024-12-31 01:52:16 -07:00			`import numpy as np`
-update soundfile version -alignment with streaming standards -audio processing config settings -more comprehensive model warmup -minor model improvements -enhancing testing, benchmarking -cool ascii logo 2025-01-06 03:32:41 -07:00			`import scipy.io.wavfile as wavfile`
Ruff checks, ci fix 2025-01-13 20:15:46 -07:00			`import soundfile as sf`
Refactor TTS API and enhance testing setup with coverage and logging improvements 2024-12-31 02:55:51 -07:00			`from loguru import logger`
add AAC audio format and test 2025-01-17 21:43:10 -07:00			`from pydub import AudioSegment`
Ruff format + fix 2025-01-09 18:41:44 -07:00
-update soundfile version -alignment with streaming standards -audio processing config settings -more comprehensive model warmup -minor model improvements -enhancing testing, benchmarking -cool ascii logo 2025-01-06 03:32:41 -07:00			`from ..core.config import settings`
Enhance TTS API with logging, voice pack loading, and schema updates 2024-12-31 01:57:00 -07:00
Ruff format + fix 2025-01-09 18:41:44 -07:00
First streaming attempt 2025-01-04 17:54:54 -07:00			`class AudioNormalizer:`
			`"""Handles audio normalization state for a single stream"""`
Ruff format + fix 2025-01-09 18:41:44 -07:00
First streaming attempt 2025-01-04 17:54:54 -07:00			`def __init__(self):`
			`self.int16_max = np.iinfo(np.int16).max`
-update soundfile version -alignment with streaming standards -audio processing config settings -more comprehensive model warmup -minor model improvements -enhancing testing, benchmarking -cool ascii logo 2025-01-06 03:32:41 -07:00			`self.chunk_trim_ms = settings.gap_trim_ms`
			`self.sample_rate = 24000 # Sample rate of the audio`
			`self.samples_to_trim = int(self.chunk_trim_ms * self.sample_rate / 1000)`
Ruff format + fix 2025-01-09 18:41:44 -07:00
Add async audio processing and semantic chunking support; flattened static audio trimming 2025-01-24 04:06:47 -07:00			`async def normalize(self, audio_data: np.ndarray) -> np.ndarray:`
			`"""Convert audio data to int16 range and trim silence from start and end`

			`Args:`
			`audio_data: Input audio data as numpy array`

			`Returns:`
			`Normalized and trimmed audio data`
			`"""`
Ruff checks, ci fix 2025-01-13 20:15:46 -07:00			`if len(audio_data) == 0:`
			`raise ValueError("Audio data cannot be empty")`

Add async audio processing and semantic chunking support; flattened static audio trimming 2025-01-24 04:06:47 -07:00			`# Convert to float32 for processing`
WIP: open ai compatible streaming 2025-01-04 17:55:36 -07:00			`audio_float = audio_data.astype(np.float32)`
refactor: streamline audio normalization process and update tests 2025-01-13 18:56:49 -07:00
Add async audio processing and semantic chunking support; flattened static audio trimming 2025-01-24 04:06:47 -07:00			`# Trim start and end if enough samples`
			`if len(audio_float) > (2 * self.samples_to_trim):`
			`audio_float = audio_float[self.samples_to_trim:-self.samples_to_trim]`
refactor: streamline audio normalization process and update tests 2025-01-13 18:56:49 -07:00
Add async audio processing and semantic chunking support; flattened static audio trimming 2025-01-24 04:06:47 -07:00			`# Scale to int16 range`
refactor: streamline audio normalization process and update tests 2025-01-13 18:56:49 -07:00			`return (audio_float * 32767).astype(np.int16)`
First streaming attempt 2025-01-04 17:54:54 -07:00
Ruff format + fix 2025-01-09 18:41:44 -07:00
- Complete TTS endpoint replacement with OpenAI compatible -Removed output directory, and update configuration settings - Added benchmarking for entire novel 2024-12-31 01:52:16 -07:00			`class AudioService:`
			`"""Service for audio format conversions"""`
Ruff format + fix 2025-01-09 18:41:44 -07:00
-update soundfile version -alignment with streaming standards -audio processing config settings -more comprehensive model warmup -minor model improvements -enhancing testing, benchmarking -cool ascii logo 2025-01-06 03:32:41 -07:00			`# Default audio format settings balanced for speed and compression`
			`DEFAULT_SETTINGS = {`
			`"mp3": {`
			`"bitrate_mode": "CONSTANT", # Faster than variable bitrate`
			`"compression_level": 0.0, # Balanced compression`
			`},`
			`"opus": {`
			`"compression_level": 0.0, # Good balance for speech`
			`},`
			`"flac": {`
			`"compression_level": 0.0, # Light compression, still fast`
Ruff format + fix 2025-01-09 18:41:44 -07:00			`},`
add AAC audio format and test 2025-01-17 21:43:10 -07:00			`"aac": {`
			`"bitrate": "192k", # Default AAC bitrate`
			`},`
-update soundfile version -alignment with streaming standards -audio processing config settings -more comprehensive model warmup -minor model improvements -enhancing testing, benchmarking -cool ascii logo 2025-01-06 03:32:41 -07:00			`}`
Ruff format + fix 2025-01-09 18:41:44 -07:00
- Complete TTS endpoint replacement with OpenAI compatible -Removed output directory, and update configuration settings - Added benchmarking for entire novel 2024-12-31 01:52:16 -07:00			`@staticmethod`
Add async audio processing and semantic chunking support; flattened static audio trimming 2025-01-24 04:06:47 -07:00			`async def convert_audio(`
Ruff format + fix 2025-01-09 18:41:44 -07:00			`audio_data: np.ndarray,`
			`sample_rate: int,`
			`output_format: str,`
First streaming attempt 2025-01-04 17:54:54 -07:00			`is_first_chunk: bool = True,`
-update soundfile version -alignment with streaming standards -audio processing config settings -more comprehensive model warmup -minor model improvements -enhancing testing, benchmarking -cool ascii logo 2025-01-06 03:32:41 -07:00			`is_last_chunk: bool = False,`
			`normalizer: AudioNormalizer = None,`
			`format_settings: dict = None,`
Ruff format + fix 2025-01-09 18:41:44 -07:00			`stream: bool = True,`
Enhance TTS API with logging, voice pack loading, and schema updates 2024-12-31 01:57:00 -07:00			`) -> bytes:`
- Complete TTS endpoint replacement with OpenAI compatible -Removed output directory, and update configuration settings - Added benchmarking for entire novel 2024-12-31 01:52:16 -07:00			`"""Convert audio data to specified format`
Enhance TTS API with logging, voice pack loading, and schema updates 2024-12-31 01:57:00 -07:00
- Complete TTS endpoint replacement with OpenAI compatible -Removed output directory, and update configuration settings - Added benchmarking for entire novel 2024-12-31 01:52:16 -07:00			`Args:`
			`audio_data: Numpy array of audio samples`
			`sample_rate: Sample rate of the audio`
Update audio.py 2025-01-01 21:11:23 +05:30			`output_format: Target format (wav, mp3, opus, flac, pcm)`
First streaming attempt 2025-01-04 17:54:54 -07:00			`is_first_chunk: Whether this is the first chunk of a stream`
-update soundfile version -alignment with streaming standards -audio processing config settings -more comprehensive model warmup -minor model improvements -enhancing testing, benchmarking -cool ascii logo 2025-01-06 03:32:41 -07:00			`normalizer: Optional AudioNormalizer instance for consistent normalization across chunks`
			`format_settings: Optional dict of format-specific settings to override defaults`
			`Example: {`
			`"mp3": {`
			`"bitrate_mode": "VARIABLE",`
			`"compression_level": 0.8`
			`}`
			`}`
			`Default settings balance speed and compression:`
			`optimized for localhost @ 0.0`
			`- MP3: constant bitrate, no compression (0.0)`
			`- OPUS: no compression (0.0)`
			`- FLAC: no compression (0.0)`
Enhance TTS API with logging, voice pack loading, and schema updates 2024-12-31 01:57:00 -07:00
- Complete TTS endpoint replacement with OpenAI compatible -Removed output directory, and update configuration settings - Added benchmarking for entire novel 2024-12-31 01:52:16 -07:00			`Returns:`
			`Bytes of the converted audio`
			`"""`
			`buffer = BytesIO()`
Enhance TTS API with logging, voice pack loading, and schema updates 2024-12-31 01:57:00 -07:00
- Complete TTS endpoint replacement with OpenAI compatible -Removed output directory, and update configuration settings - Added benchmarking for entire novel 2024-12-31 01:52:16 -07:00			`try:`
WIP: open ai compatible streaming 2025-01-04 17:55:36 -07:00			`# Always normalize audio to ensure proper amplitude scaling`
- Added GenerateFromPhonemesRequest model to text_schemas.py - Refactored TTS model initialization methods in tts_gpu.py and tts_cpu.py - Added custom logger configuration in main.py - Deprecated text_processing router -> development route 2025-01-09 07:20:14 -07:00			`if normalizer is None:`
			`normalizer = AudioNormalizer()`
Add async audio processing and semantic chunking support; flattened static audio trimming 2025-01-24 04:06:47 -07:00			`normalized_audio = await normalizer.normalize(audio_data)`
First streaming attempt 2025-01-04 17:54:54 -07:00
			`if output_format == "pcm":`
			`# Raw 16-bit PCM samples, no header`
			`buffer.write(normalized_audio.tobytes())`
			`elif output_format == "wav":`
Refactor Docker configurations and update test mocks for development routers 2025-01-10 22:03:16 -07:00			`# WAV format with headers`
Ruff format + fix 2025-01-09 18:41:44 -07:00			`sf.write(`
			`buffer,`
			`normalized_audio,`
			`sample_rate,`
			`format="WAV",`
			`subtype="PCM_16",`
			`)`
WIP: basic tests on OpenAI streaming compatibility 2025-01-04 18:09:23 -07:00			`elif output_format == "mp3":`
Refactor Docker configurations and update test mocks for development routers 2025-01-10 22:03:16 -07:00			`# MP3 format with proper framing`
-update soundfile version -alignment with streaming standards -audio processing config settings -more comprehensive model warmup -minor model improvements -enhancing testing, benchmarking -cool ascii logo 2025-01-06 03:32:41 -07:00			`settings = format_settings.get("mp3", {}) if format_settings else {}`
			`settings = {**AudioService.DEFAULT_SETTINGS["mp3"], **settings}`
			`sf.write(`
Ruff format + fix 2025-01-09 18:41:44 -07:00			`buffer, normalized_audio, sample_rate, format="MP3", **settings`
			`)`
Enhance TTS API with logging, voice pack loading, and schema updates 2024-12-31 01:57:00 -07:00			`elif output_format == "opus":`
Refactor Docker configurations and update test mocks for development routers 2025-01-10 22:03:16 -07:00			`# Opus format in OGG container`
-update soundfile version -alignment with streaming standards -audio processing config settings -more comprehensive model warmup -minor model improvements -enhancing testing, benchmarking -cool ascii logo 2025-01-06 03:32:41 -07:00			`settings = format_settings.get("opus", {}) if format_settings else {}`
			`settings = {**AudioService.DEFAULT_SETTINGS["opus"], **settings}`
Ruff format + fix 2025-01-09 18:41:44 -07:00			`sf.write(`
			`buffer,`
			`normalized_audio,`
			`sample_rate,`
			`format="OGG",`
			`subtype="OPUS",`
			`**settings,`
			`)`
Enhance TTS API with logging, voice pack loading, and schema updates 2024-12-31 01:57:00 -07:00			`elif output_format == "flac":`
Refactor Docker configurations and update test mocks for development routers 2025-01-10 22:03:16 -07:00			`# FLAC format with proper framing`
-update soundfile version -alignment with streaming standards -audio processing config settings -more comprehensive model warmup -minor model improvements -enhancing testing, benchmarking -cool ascii logo 2025-01-06 03:32:41 -07:00			`if is_first_chunk:`
			`logger.info("Starting FLAC stream...")`
			`settings = format_settings.get("flac", {}) if format_settings else {}`
			`settings = {**AudioService.DEFAULT_SETTINGS["flac"], **settings}`
Ruff format + fix 2025-01-09 18:41:44 -07:00			`sf.write(`
			`buffer,`
			`normalized_audio,`
			`sample_rate,`
			`format="FLAC",`
			`subtype="PCM_16",`
			`**settings,`
			`)`
add AAC audio format and test 2025-01-17 21:43:10 -07:00			`elif output_format == "aac":`
			`# Convert numpy array directly to AAC using pydub`
			`audio_segment = AudioSegment(`
			`normalized_audio.tobytes(),`
			`frame_rate=sample_rate,`
			`sample_width=normalized_audio.dtype.itemsize,`
			`channels=1 if len(normalized_audio.shape) == 1 else normalized_audio.shape[1]`
			`)`

			`settings = format_settings.get("aac", {}) if format_settings else {}`
			`settings = {**AudioService.DEFAULT_SETTINGS["aac"], **settings}`

			`audio_segment.export(`
			`buffer,`
			`format="adts", # ADTS is a common AAC container format`
			`bitrate=settings["bitrate"]`
Refactor Docker configurations and update test mocks for development routers 2025-01-10 22:03:16 -07:00			`)`
- Complete TTS endpoint replacement with OpenAI compatible -Removed output directory, and update configuration settings - Added benchmarking for entire novel 2024-12-31 01:52:16 -07:00			`else:`
Refactor Docker configurations and update test mocks for development routers 2025-01-10 22:03:16 -07:00			`raise ValueError(`
			`f"Format {output_format} not supported. Supported formats are: wav, mp3, opus, flac, pcm, aac."`
			`)`
Enhance TTS API with logging, voice pack loading, and schema updates 2024-12-31 01:57:00 -07:00
Update audio.py 2025-01-01 21:11:23 +05:30			`buffer.seek(0)`
			`return buffer.getvalue()`

- Complete TTS endpoint replacement with OpenAI compatible -Removed output directory, and update configuration settings - Added benchmarking for entire novel 2024-12-31 01:52:16 -07:00			`except Exception as e:`
			`logger.error(f"Error converting audio to {output_format}: {str(e)}")`
			`raise ValueError(f"Failed to convert audio to {output_format}: {str(e)}")`