Kokoro-FastAPI/api/src/services/audio.py

"""Audio conversion service"""

from io import BytesIO
import struct

import numpy as np
import scipy.io.wavfile as wavfile
import soundfile as sf
from loguru import logger
from pydub import AudioSegment

from ..core.config import settings
from .streaming_audio_writer import StreamingAudioWriter

class AudioNormalizer:
    """Handles audio normalization state for a single stream.

    On construction the per-edge trim length is read from
    ``settings.gap_trim_ms`` and converted into a sample count using a fixed
    sample rate of 24000 Hz.
    """

    def __init__(self):
        self.chunk_trim_ms = settings.gap_trim_ms
        self.sample_rate = 24000  # Sample rate of the audio
        # Number of samples dropped from EACH end of a chunk by normalize()
        self.samples_to_trim = int(self.chunk_trim_ms * self.sample_rate / 1000)

    async def normalize(self, audio_data: np.ndarray) -> np.ndarray:
        """Trim a fixed number of samples from both ends and convert to int16.

        Args:
            audio_data: Input audio data as numpy array. Presumably float
                samples nominally in [-1.0, 1.0] (values are multiplied by
                32767 and clipped) - confirm against the caller.

        Returns:
            The trimmed audio as an ``np.int16`` array. Chunks that are not
            longer than twice the trim length are converted without trimming.

        Raises:
            ValueError: If ``audio_data`` has length zero.
        """
        if len(audio_data) == 0:
            raise ValueError("Empty audio data")

        trim = self.samples_to_trim

        # Trim start and end if enough samples. A zero trim must be skipped
        # explicitly: ``audio_data[0:-0]`` is ``audio_data[0:0]``, which would
        # silently discard the whole chunk. Negative values are likewise
        # treated as "no trimming".
        if trim > 0 and len(audio_data) > (2 * trim):
            audio_data = audio_data[trim:-trim]

        # Scale directly to int16 range with clipping
        return np.clip(audio_data * 32767, -32768, 32767).astype(np.int16)


class AudioService:
    """Service for audio format conversions with streaming support.

    Conversion is delegated to ``StreamingAudioWriter`` instances that are kept
    in a class-level registry between calls so that consecutive chunks of one
    stream are fed to the same writer.
    """

    # Supported formats
    SUPPORTED_FORMATS = {"wav", "mp3", "opus", "flac", "aac", "pcm", "ogg"}

    # Default audio format settings balanced for speed and compression
    DEFAULT_SETTINGS = {
        "mp3": {
            "bitrate_mode": "CONSTANT",  # Faster than variable bitrate
            "compression_level": 0.0,  # Balanced compression
        },
        "opus": {
            "compression_level": 0.0,  # Good balance for speech
        },
        "flac": {
            "compression_level": 0.0,  # Light compression, still fast
        },
        "aac": {
            "bitrate": "192k",  # Default AAC bitrate
        },
    }

    # Active writers keyed by "<output_format>_<sample_rate>".
    # NOTE(review): the registry is shared by every caller and the key does not
    # identify a stream, so two interleaved streams with the same format and
    # sample rate would replace/share one writer - confirm callers serialize.
    _writers = {}

    @staticmethod
    async def convert_audio(
        audio_data: np.ndarray,
        sample_rate: int,
        output_format: str,
        is_first_chunk: bool = True,
        is_last_chunk: bool = False,
        normalizer: AudioNormalizer = None,
    ) -> bytes:
        """Convert audio data to specified format with streaming support

        Args:
            audio_data: Numpy array of audio samples. Ignored when
                ``is_last_chunk`` is true, because that call only finalizes
                the writer.
            sample_rate: Sample rate of the audio
            output_format: Target format; one of ``SUPPORTED_FORMATS``
                (wav, mp3, opus, flac, aac, pcm, ogg)
            is_first_chunk: Whether this is the first chunk; forces a fresh
                writer for this format/sample-rate pair
            is_last_chunk: Whether this is the last chunk; finalizes the
                writer and removes it from the registry
            normalizer: Optional AudioNormalizer instance for consistent
                normalization; a new one is created when omitted

        Returns:
            Bytes of the converted audio chunk (``b''`` when the writer
            produced nothing)

        Raises:
            ValueError: If the format is unsupported or any step of the
                conversion fails; the original exception is chained as the
                cause.
        """
        try:
            # Validate format
            if output_format not in AudioService.SUPPORTED_FORMATS:
                raise ValueError(f"Format {output_format} not supported")

            # Normalize only when the samples will actually be written. The
            # finalize call discards audio_data, so normalizing there was
            # wasted work and made an empty final chunk fail with
            # "Empty audio data" before the stream could be finalized.
            normalized_audio = None
            if not is_last_chunk:
                if normalizer is None:
                    normalizer = AudioNormalizer()
                normalized_audio = await normalizer.normalize(audio_data)

            # Get or create format-specific writer
            writer_key = f"{output_format}_{sample_rate}"
            if is_first_chunk or writer_key not in AudioService._writers:
                AudioService._writers[writer_key] = StreamingAudioWriter(
                    output_format, sample_rate
                )
            writer = AudioService._writers[writer_key]

            # Write chunk or finalize
            if is_last_chunk:
                try:
                    chunk_data = writer.write_chunk(finalize=True)
                finally:
                    # Drop the writer even if finalizing failed so a stale
                    # writer is never reused by a later stream.
                    AudioService._writers.pop(writer_key, None)
            else:
                chunk_data = writer.write_chunk(normalized_audio)

            return chunk_data if chunk_data else b''

        except Exception as e:
            logger.error(f"Error converting audio stream to {output_format}: {str(e)}")
            raise ValueError(
                f"Failed to convert audio stream to {output_format}: {str(e)}"
            ) from e
- Complete TTS endpoint replacement with OpenAI compatible -Removed output directory, and update configuration settings - Added benchmarking for entire novel 2024-12-31 01:52:16 -07:00			`"""Audio conversion service"""`
Enhance TTS API with logging, voice pack loading, and schema updates 2024-12-31 01:57:00 -07:00
- Complete TTS endpoint replacement with OpenAI compatible -Removed output directory, and update configuration settings - Added benchmarking for entire novel 2024-12-31 01:52:16 -07:00			`from io import BytesIO`
Fix truncated playback issue in streaming WAV responses. 2025-01-26 12:38:37 -08:00			`import struct`
Refactor TTS API and enhance testing setup with coverage and logging improvements 2024-12-31 02:55:51 -07:00
- Complete TTS endpoint replacement with OpenAI compatible -Removed output directory, and update configuration settings - Added benchmarking for entire novel 2024-12-31 01:52:16 -07:00			`import numpy as np`
-update soundfile version -alignment with streaming standards -audio processing config settings -more comprehensive model warmup -minor model improvements -enhancing testing, benchmarking -cool ascii logo 2025-01-06 03:32:41 -07:00			`import scipy.io.wavfile as wavfile`
Ruff checks, ci fix 2025-01-13 20:15:46 -07:00			`import soundfile as sf`
Refactor TTS API and enhance testing setup with coverage and logging improvements 2024-12-31 02:55:51 -07:00			`from loguru import logger`
add AAC audio format and test 2025-01-17 21:43:10 -07:00			`from pydub import AudioSegment`
Ruff format + fix 2025-01-09 18:41:44 -07:00
-update soundfile version -alignment with streaming standards -audio processing config settings -more comprehensive model warmup -minor model improvements -enhancing testing, benchmarking -cool ascii logo 2025-01-06 03:32:41 -07:00			`from ..core.config import settings`
Add StreamingAudioWriter class for audio format conversions and remove deprecated migration notes 2025-01-27 20:23:35 -07:00			`from .streaming_audio_writer import StreamingAudioWriter`
Ruff format + fix 2025-01-09 18:41:44 -07:00
First streaming attempt 2025-01-04 17:54:54 -07:00			`class AudioNormalizer:`
			`"""Handles audio normalization state for a single stream"""`
Ruff format + fix 2025-01-09 18:41:44 -07:00
First streaming attempt 2025-01-04 17:54:54 -07:00			`def __init__(self):`
-update soundfile version -alignment with streaming standards -audio processing config settings -more comprehensive model warmup -minor model improvements -enhancing testing, benchmarking -cool ascii logo 2025-01-06 03:32:41 -07:00			`self.chunk_trim_ms = settings.gap_trim_ms`
			`self.sample_rate = 24000 # Sample rate of the audio`
			`self.samples_to_trim = int(self.chunk_trim_ms * self.sample_rate / 1000)`
Ruff format + fix 2025-01-09 18:41:44 -07:00
Add async audio processing and semantic chunking support; flattened static audio trimming 2025-01-24 04:06:47 -07:00			`async def normalize(self, audio_data: np.ndarray) -> np.ndarray:`
			`"""Convert audio data to int16 range and trim silence from start and end`

			`Args:`
			`audio_data: Input audio data as numpy array`

			`Returns:`
			`Normalized and trimmed audio data`
			`"""`
WIP: v1_0_0 migration 2025-01-28 13:52:57 -07:00			`if len(audio_data) == 0:`
			`raise ValueError("Empty audio data")`

Add async audio processing and semantic chunking support; flattened static audio trimming 2025-01-24 04:06:47 -07:00			`# Trim start and end if enough samples`
WIP: v1_0_0 migration 2025-01-28 13:52:57 -07:00			`if len(audio_data) > (2 * self.samples_to_trim):`
			`audio_data = audio_data[self.samples_to_trim:-self.samples_to_trim]`
refactor: streamline audio normalization process and update tests 2025-01-13 18:56:49 -07:00
WIP: v1_0_0 migration 2025-01-28 13:52:57 -07:00			`# Scale directly to int16 range with clipping`
			`return np.clip(audio_data * 32767, -32768, 32767).astype(np.int16)`
First streaming attempt 2025-01-04 17:54:54 -07:00
Ruff format + fix 2025-01-09 18:41:44 -07:00
- Complete TTS endpoint replacement with OpenAI compatible -Removed output directory, and update configuration settings - Added benchmarking for entire novel 2024-12-31 01:52:16 -07:00			`class AudioService:`
Add StreamingAudioWriter class for audio format conversions and remove deprecated migration notes 2025-01-27 20:23:35 -07:00			`"""Service for audio format conversions with streaming support"""`
Ruff format + fix 2025-01-09 18:41:44 -07:00
WIP: v1_0_0 migration 2025-01-28 13:52:57 -07:00			`# Supported formats`
			`SUPPORTED_FORMATS = {"wav", "mp3", "opus", "flac", "aac", "pcm", "ogg"}`

-update soundfile version -alignment with streaming standards -audio processing config settings -more comprehensive model warmup -minor model improvements -enhancing testing, benchmarking -cool ascii logo 2025-01-06 03:32:41 -07:00			`# Default audio format settings balanced for speed and compression`
			`DEFAULT_SETTINGS = {`
			`"mp3": {`
			`"bitrate_mode": "CONSTANT", # Faster than variable bitrate`
			`"compression_level": 0.0, # Balanced compression`
			`},`
			`"opus": {`
			`"compression_level": 0.0, # Good balance for speech`
			`},`
			`"flac": {`
			`"compression_level": 0.0, # Light compression, still fast`
Ruff format + fix 2025-01-09 18:41:44 -07:00			`},`
add AAC audio format and test 2025-01-17 21:43:10 -07:00			`"aac": {`
			`"bitrate": "192k", # Default AAC bitrate`
			`},`
-update soundfile version -alignment with streaming standards -audio processing config settings -more comprehensive model warmup -minor model improvements -enhancing testing, benchmarking -cool ascii logo 2025-01-06 03:32:41 -07:00			`}`
Ruff format + fix 2025-01-09 18:41:44 -07:00
Add StreamingAudioWriter class for audio format conversions and remove deprecated migration notes 2025-01-27 20:23:35 -07:00			`_writers = {}`

- Complete TTS endpoint replacement with OpenAI compatible -Removed output directory, and update configuration settings - Added benchmarking for entire novel 2024-12-31 01:52:16 -07:00			`@staticmethod`
Add async audio processing and semantic chunking support; flattened static audio trimming 2025-01-24 04:06:47 -07:00			`async def convert_audio(`
Ruff format + fix 2025-01-09 18:41:44 -07:00			`audio_data: np.ndarray,`
			`sample_rate: int,`
			`output_format: str,`
First streaming attempt 2025-01-04 17:54:54 -07:00			`is_first_chunk: bool = True,`
-update soundfile version -alignment with streaming standards -audio processing config settings -more comprehensive model warmup -minor model improvements -enhancing testing, benchmarking -cool ascii logo 2025-01-06 03:32:41 -07:00			`is_last_chunk: bool = False,`
			`normalizer: AudioNormalizer = None,`
Enhance TTS API with logging, voice pack loading, and schema updates 2024-12-31 01:57:00 -07:00			`) -> bytes:`
Add StreamingAudioWriter class for audio format conversions and remove deprecated migration notes 2025-01-27 20:23:35 -07:00			`"""Convert audio data to specified format with streaming support`
Enhance TTS API with logging, voice pack loading, and schema updates 2024-12-31 01:57:00 -07:00
- Complete TTS endpoint replacement with OpenAI compatible -Removed output directory, and update configuration settings - Added benchmarking for entire novel 2024-12-31 01:52:16 -07:00			`Args:`
			`audio_data: Numpy array of audio samples`
			`sample_rate: Sample rate of the audio`
Add StreamingAudioWriter class for audio format conversions and remove deprecated migration notes 2025-01-27 20:23:35 -07:00			`output_format: Target format (wav, mp3, ogg, pcm)`
			`is_first_chunk: Whether this is the first chunk`
			`is_last_chunk: Whether this is the last chunk`
			`normalizer: Optional AudioNormalizer instance for consistent normalization`
Enhance TTS API with logging, voice pack loading, and schema updates 2024-12-31 01:57:00 -07:00
- Complete TTS endpoint replacement with OpenAI compatible -Removed output directory, and update configuration settings - Added benchmarking for entire novel 2024-12-31 01:52:16 -07:00			`Returns:`
Add StreamingAudioWriter class for audio format conversions and remove deprecated migration notes 2025-01-27 20:23:35 -07:00			`Bytes of the converted audio chunk`
- Complete TTS endpoint replacement with OpenAI compatible -Removed output directory, and update configuration settings - Added benchmarking for entire novel 2024-12-31 01:52:16 -07:00			`"""`
			`try:`
WIP: v1_0_0 migration 2025-01-28 13:52:57 -07:00			`# Validate format`
			`if output_format not in AudioService.SUPPORTED_FORMATS:`
			`raise ValueError(f"Format {output_format} not supported")`

WIP: open ai compatible streaming 2025-01-04 17:55:36 -07:00			`# Always normalize audio to ensure proper amplitude scaling`
- Added GenerateFromPhonemesRequest model to text_schemas.py - Refactored TTS model initialization methods in tts_gpu.py and tts_cpu.py - Added custom logger configuration in main.py - Deprecated text_processing router -> development route 2025-01-09 07:20:14 -07:00			`if normalizer is None:`
			`normalizer = AudioNormalizer()`
Add async audio processing and semantic chunking support; flattened static audio trimming 2025-01-24 04:06:47 -07:00			`normalized_audio = await normalizer.normalize(audio_data)`
First streaming attempt 2025-01-04 17:54:54 -07:00
Add StreamingAudioWriter class for audio format conversions and remove deprecated migration notes 2025-01-27 20:23:35 -07:00			`# Get or create format-specific writer`
			`writer_key = f"{output_format}_{sample_rate}"`
			`if is_first_chunk or writer_key not in AudioService._writers:`
			`AudioService._writers[writer_key] = StreamingAudioWriter(`
			`output_format, sample_rate`
Refactor Docker configurations and update test mocks for development routers 2025-01-10 22:03:16 -07:00			`)`
Add StreamingAudioWriter class for audio format conversions and remove deprecated migration notes 2025-01-27 20:23:35 -07:00			`writer = AudioService._writers[writer_key]`

Refactor audio processing and cleanup: remove unused chunker, enhance StreamingAudioWriter for better MP3 handling, and improve text processing compatibility. 2025-01-27 20:23:42 -07:00			`# Write chunk or finalize`
Add StreamingAudioWriter class for audio format conversions and remove deprecated migration notes 2025-01-27 20:23:35 -07:00			`if is_last_chunk:`
Refactor audio processing and cleanup: remove unused chunker, enhance StreamingAudioWriter for better MP3 handling, and improve text processing compatibility. 2025-01-27 20:23:42 -07:00			`chunk_data = writer.write_chunk(finalize=True)`
Add StreamingAudioWriter class for audio format conversions and remove deprecated migration notes 2025-01-27 20:23:35 -07:00			`del AudioService._writers[writer_key]`
Refactor audio processing and cleanup: remove unused chunker, enhance StreamingAudioWriter for better MP3 handling, and improve text processing compatibility. 2025-01-27 20:23:42 -07:00			`else:`
			`chunk_data = writer.write_chunk(normalized_audio)`

			`return chunk_data if chunk_data else b''`
Update audio.py 2025-01-01 21:11:23 +05:30
- Complete TTS endpoint replacement with OpenAI compatible -Removed output directory, and update configuration settings - Added benchmarking for entire novel 2024-12-31 01:52:16 -07:00			`except Exception as e:`
Add StreamingAudioWriter class for audio format conversions and remove deprecated migration notes 2025-01-27 20:23:35 -07:00			`logger.error(f"Error converting audio stream to {output_format}: {str(e)}")`
			`raise ValueError(f"Failed to convert audio stream to {output_format}: {str(e)}")`