# Kokoro-FastAPI/api/src/services/audio.py

"""Audio conversion service"""
from io import BytesIO
from typing import Optional

import numpy as np
import scipy.io.wavfile as wavfile
import soundfile as sf
from loguru import logger
from pydub import AudioSegment

from ..core.config import settings


class AudioNormalizer:
    """Handles audio normalization state for a single stream.

    Stores the per-chunk trim length (derived from ``settings.gap_trim_ms``
    and a fixed 24 kHz sample rate) and converts sample arrays to int16.
    """

    def __init__(self):
        self.int16_max = np.iinfo(np.int16).max
        self.chunk_trim_ms = settings.gap_trim_ms
        self.sample_rate = 24000  # Sample rate of the audio
        # Number of samples dropped from EACH end of a chunk.
        self.samples_to_trim = int(self.chunk_trim_ms * self.sample_rate / 1000)

    async def normalize(self, audio_data: np.ndarray) -> np.ndarray:
        """Trim a fixed margin from both ends and scale samples to int16.

        The trim is a fixed number of samples (``self.samples_to_trim``), not
        silence detection. It is only applied when the chunk is longer than
        twice the margin, so short chunks are passed through untrimmed.

        Args:
            audio_data: Input audio samples as a numpy array.
                NOTE(review): the scaling below assumes float samples in the
                nominal [-1.0, 1.0] range - confirm against callers.

        Returns:
            The (possibly trimmed) samples multiplied by the int16 maximum,
            clipped to the int16 range and cast to ``np.int16``.

        Raises:
            ValueError: If ``audio_data`` is empty.
        """
        if len(audio_data) == 0:
            raise ValueError("Audio data cannot be empty")

        # Convert to float32 for processing
        audio_float = audio_data.astype(np.float32)

        # A zero margin must be skipped explicitly: ``x[0:-0]`` is ``x[0:0]``,
        # which would silently discard the whole chunk.
        trim = self.samples_to_trim
        total = len(audio_float)
        if trim > 0 and total > 2 * trim:
            audio_float = audio_float[trim : total - trim]

        # Scale to int16 range; clip first so out-of-range samples saturate
        # instead of wrapping around when cast to a 16-bit integer.
        scaled = audio_float * self.int16_max
        clipped = np.clip(scaled, -self.int16_max - 1, self.int16_max)
        return clipped.astype(np.int16)
2025-01-04 17:54:54 -07:00
2025-01-09 18:41:44 -07:00
class AudioService:
    """Service for audio format conversions"""

    # Default per-format encoder settings. For mp3/opus/flac these are passed
    # as keyword arguments to ``sf.write``; for aac only ``bitrate`` is read
    # and forwarded to pydub's ``export``.
    DEFAULT_SETTINGS = {
        "mp3": {
            "bitrate_mode": "CONSTANT",  # Faster than variable bitrate
            "compression_level": 0.0,
        },
        "opus": {
            "compression_level": 0.0,
        },
        "flac": {
            "compression_level": 0.0,
        },
        "aac": {
            "bitrate": "192k",  # Default AAC bitrate
        },
    }

    @staticmethod
    def _resolve_settings(format_key: str, format_settings: Optional[dict]) -> dict:
        """Return the defaults for ``format_key`` overlaid with caller overrides."""
        overrides = format_settings.get(format_key, {}) if format_settings else {}
        return {**AudioService.DEFAULT_SETTINGS[format_key], **overrides}

    @staticmethod
    async def convert_audio(
        audio_data: np.ndarray,
        sample_rate: int,
        output_format: str,
        is_first_chunk: bool = True,
        is_last_chunk: bool = False,
        normalizer: Optional["AudioNormalizer"] = None,
        format_settings: Optional[dict] = None,
        stream: bool = True,
    ) -> bytes:
        """Convert audio data to specified format

        The samples are always passed through a normalizer first, then encoded
        into an in-memory buffer whose contents are returned.

        Args:
            audio_data: Numpy array of audio samples
            sample_rate: Sample rate of the audio
            output_format: Target format (wav, mp3, opus, flac, pcm, aac)
            is_first_chunk: Whether this is the first chunk of a stream.
                Only used here to log the start of a FLAC stream.
            is_last_chunk: Whether this is the last chunk of a stream.
                Accepted for interface compatibility; not read by this method.
            normalizer: Optional AudioNormalizer instance for consistent
                normalization across chunks. A new one is created when omitted.
            format_settings: Optional dict of format-specific settings to
                override defaults, keyed by format name.
                Example: {
                    "mp3": {
                        "bitrate_mode": "VARIABLE",
                        "compression_level": 0.8
                    }
                }
                Defaults (see ``DEFAULT_SETTINGS``):
                    - MP3: constant bitrate, compression_level 0.0
                    - OPUS: compression_level 0.0
                    - FLAC: compression_level 0.0
                    - AAC: bitrate "192k"
            stream: Accepted for interface compatibility; not read by this
                method.

        Returns:
            Bytes of the converted audio

        Raises:
            ValueError: If the format is unsupported or any step of the
                conversion fails. The original exception is attached as
                ``__cause__``.
        """
        buffer = BytesIO()

        try:
            # Always normalize audio to ensure proper amplitude scaling
            if normalizer is None:
                normalizer = AudioNormalizer()
            normalized_audio = await normalizer.normalize(audio_data)

            if output_format == "pcm":
                # Raw 16-bit PCM samples, no header
                buffer.write(normalized_audio.tobytes())
            elif output_format == "wav":
                # WAV format with headers
                sf.write(
                    buffer,
                    normalized_audio,
                    sample_rate,
                    format="WAV",
                    subtype="PCM_16",
                )
            elif output_format == "mp3":
                # Named ``encoder_opts`` so the imported config ``settings``
                # object is not shadowed inside this function.
                encoder_opts = AudioService._resolve_settings("mp3", format_settings)
                sf.write(
                    buffer, normalized_audio, sample_rate, format="MP3", **encoder_opts
                )
            elif output_format == "opus":
                # Opus format in OGG container
                encoder_opts = AudioService._resolve_settings("opus", format_settings)
                sf.write(
                    buffer,
                    normalized_audio,
                    sample_rate,
                    format="OGG",
                    subtype="OPUS",
                    **encoder_opts,
                )
            elif output_format == "flac":
                if is_first_chunk:
                    logger.info("Starting FLAC stream...")
                encoder_opts = AudioService._resolve_settings("flac", format_settings)
                sf.write(
                    buffer,
                    normalized_audio,
                    sample_rate,
                    format="FLAC",
                    subtype="PCM_16",
                    **encoder_opts,
                )
            elif output_format == "aac":
                # Build a pydub segment straight from the raw int16 bytes.
                channel_count = (
                    1 if len(normalized_audio.shape) == 1 else normalized_audio.shape[1]
                )
                audio_segment = AudioSegment(
                    normalized_audio.tobytes(),
                    frame_rate=sample_rate,
                    sample_width=normalized_audio.dtype.itemsize,
                    channels=channel_count,
                )
                encoder_opts = AudioService._resolve_settings("aac", format_settings)
                audio_segment.export(
                    buffer,
                    format="adts",  # ADTS is a common AAC container format
                    bitrate=encoder_opts["bitrate"],
                )
            else:
                raise ValueError(
                    f"Format {output_format} not supported. Supported formats are: wav, mp3, opus, flac, pcm, aac."
                )

            # getvalue() returns the whole buffer regardless of position.
            return buffer.getvalue()

        except Exception as e:
            logger.error(f"Error converting audio to {output_format}: {str(e)}")
            # Chain the original exception so its traceback is preserved.
            raise ValueError(
                f"Failed to convert audio to {output_format}: {str(e)}"
            ) from e