Kokoro-FastAPI/api/src/services/audio.py

162 lines
5.9 KiB
Python
Raw Normal View History

"""Audio conversion service"""
from io import BytesIO
import numpy as np
import scipy.io.wavfile as wavfile
2025-01-13 20:15:46 -07:00
import soundfile as sf
from loguru import logger
2025-01-09 18:41:44 -07:00
from ..core.config import settings
2025-01-09 18:41:44 -07:00
2025-01-04 17:54:54 -07:00
class AudioNormalizer:
    """Handles audio normalization state for a single stream.

    Converts floating-point samples to 16-bit integers and drops a fixed
    number of trailing samples from every non-final chunk.

    Attributes:
        int16_max: Largest value representable as int16; used as the scale
            factor for the float -> int16 conversion.
        chunk_trim_ms: Amount of trailing audio to drop from each non-final
            chunk, in milliseconds (read from ``settings.gap_trim_ms``).
        sample_rate: Sample rate in Hz used to convert the trim to samples.
        samples_to_trim: ``chunk_trim_ms`` expressed as a sample count.
    """

    def __init__(self, sample_rate: int = 24000):
        """Initialise the trim/scale state.

        Args:
            sample_rate: Sample rate of the audio in Hz. Defaults to 24000,
                the value that was previously hard-coded.
        """
        self.int16_max = np.iinfo(np.int16).max
        self.chunk_trim_ms = settings.gap_trim_ms
        self.sample_rate = sample_rate
        self.samples_to_trim = int(self.chunk_trim_ms * self.sample_rate / 1000)

    def normalize(
        self, audio_data: np.ndarray, is_last_chunk: bool = False
    ) -> np.ndarray:
        """Convert audio data to int16 range and trim chunk boundaries.

        Args:
            audio_data: Array of audio samples. Samples are scaled by
                ``int16_max``, so they are expected to be floats nominally
                within [-1.0, 1.0]; anything outside is clipped.
            is_last_chunk: When False, ``samples_to_trim`` trailing samples
                are removed (only if the chunk is longer than the trim).

        Returns:
            The samples as an ``np.int16`` array.

        Raises:
            ValueError: If ``audio_data`` is empty.
        """
        if len(audio_data) == 0:
            raise ValueError("Audio data cannot be empty")

        audio_float = audio_data.astype(np.float32)

        # The lower bound matters: a slice of ``[:-0]`` is ``[:0]`` and would
        # silently discard the whole chunk when the configured trim is zero
        # (and a negative trim would keep only a prefix).
        if not is_last_chunk and 0 < self.samples_to_trim < len(audio_float):
            audio_float = audio_float[: -self.samples_to_trim]

        # Clip before the cast so out-of-range samples saturate instead of
        # wrapping around in int16.
        scaled = np.clip(
            audio_float * self.int16_max,
            np.iinfo(np.int16).min,
            self.int16_max,
        )
        return scaled.astype(np.int16)
2025-01-04 17:54:54 -07:00
2025-01-09 18:41:44 -07:00
class AudioService:
    """Service for audio format conversions."""

    # Default encoder options forwarded to ``soundfile.write`` for each
    # compressed format. A compression level of 0.0 is the lowest setting,
    # chosen for speed rather than output size.
    DEFAULT_SETTINGS = {
        "mp3": {
            "bitrate_mode": "CONSTANT",  # Faster than variable bitrate
            "compression_level": 0.0,  # Lowest compression, fastest
        },
        "opus": {
            "compression_level": 0.0,  # Lowest compression, fastest
        },
        "flac": {
            "compression_level": 0.0,  # Lowest compression, fastest
        },
    }

    # Container/subtype arguments for every format written via soundfile.
    # "pcm" is absent because raw samples are emitted without soundfile.
    _SOUNDFILE_TARGETS = {
        "wav": {"format": "WAV", "subtype": "PCM_16"},
        "mp3": {"format": "MP3"},
        "opus": {"format": "OGG", "subtype": "OPUS"},
        "flac": {"format": "FLAC", "subtype": "PCM_16"},
    }

    @staticmethod
    def _encoder_options(output_format, format_settings):
        """Merge caller overrides for ``output_format`` over the defaults.

        Formats without an entry in ``DEFAULT_SETTINGS`` (e.g. wav) take no
        encoder options, so any overrides supplied for them are ignored.
        """
        defaults = AudioService.DEFAULT_SETTINGS.get(output_format)
        if defaults is None:
            return {}
        overrides = format_settings.get(output_format, {}) if format_settings else {}
        return {**defaults, **overrides}

    @staticmethod
    def convert_audio(
        audio_data: np.ndarray,
        sample_rate: int,
        output_format: str,
        is_first_chunk: bool = True,
        is_last_chunk: bool = False,
        normalizer: "AudioNormalizer | None" = None,
        format_settings: "dict | None" = None,
        stream: bool = True,
    ) -> bytes:
        """Convert audio data to the specified format.

        Args:
            audio_data: Numpy array of audio samples.
            sample_rate: Sample rate of the audio.
            output_format: Target format (wav, mp3, opus, flac, pcm).
            is_first_chunk: Whether this is the first chunk of a stream.
                Only used to log the start of a FLAC stream.
            is_last_chunk: Whether this is the final chunk of a stream;
                forwarded to the normalizer, which trims non-final chunks.
            normalizer: Optional AudioNormalizer instance for consistent
                normalization across chunks. A new one is created if omitted.
            format_settings: Optional dict of format-specific settings that
                override the defaults, keyed by format name. Example::

                    {"mp3": {"bitrate_mode": "VARIABLE",
                             "compression_level": 0.8}}

                Defaults (see ``DEFAULT_SETTINGS``) favour speed:
                    - MP3: constant bitrate, compression level 0.0
                    - OPUS: compression level 0.0
                    - FLAC: compression level 0.0
            stream: Currently unused; accepted so existing callers keep
                working.

        Returns:
            Bytes of the converted audio. For "pcm" these are the raw 16-bit
            samples with no header.

        Raises:
            ValueError: If the format is unsupported or if normalization or
                encoding fails. The original exception is chained as the
                cause.
        """
        try:
            # Always normalize audio to ensure proper amplitude scaling
            if normalizer is None:
                normalizer = AudioNormalizer()
            normalized_audio = normalizer.normalize(
                audio_data, is_last_chunk=is_last_chunk
            )

            if output_format == "pcm":
                # Raw 16-bit PCM samples, no header
                return normalized_audio.tobytes()

            if output_format == "aac":
                raise ValueError(
                    "Format aac not currently supported. Supported formats are: wav, mp3, opus, flac, pcm."
                )

            target = AudioService._SOUNDFILE_TARGETS.get(output_format)
            if target is None:
                raise ValueError(
                    f"Format {output_format} not supported. Supported formats are: wav, mp3, opus, flac, pcm."
                )

            if output_format == "flac" and is_first_chunk:
                logger.info("Starting FLAC stream...")

            encoder_options = AudioService._encoder_options(
                output_format, format_settings
            )
            with BytesIO() as buffer:
                sf.write(
                    buffer,
                    normalized_audio,
                    sample_rate,
                    **target,
                    **encoder_options,
                )
                # getvalue() returns the whole buffer regardless of position.
                return buffer.getvalue()
        except Exception as e:
            # Every failure, including the unsupported-format errors above,
            # is reported to callers as a single ValueError type.
            logger.error(f"Error converting audio to {output_format}: {str(e)}")
            raise ValueError(
                f"Failed to convert audio to {output_format}: {str(e)}"
            ) from e