2024-12-31 01:52:16 -07:00
|
|
|
"""Audio conversion service"""
|
2024-12-31 01:57:00 -07:00
|
|
|
|
2024-12-31 01:52:16 -07:00
|
|
|
from io import BytesIO
|
2024-12-31 02:55:51 -07:00
|
|
|
|
2024-12-31 01:52:16 -07:00
|
|
|
import numpy as np
|
2025-01-06 03:32:41 -07:00
|
|
|
import scipy.io.wavfile as wavfile
|
2025-01-13 20:15:46 -07:00
|
|
|
import soundfile as sf
|
2024-12-31 02:55:51 -07:00
|
|
|
from loguru import logger
|
2025-01-17 21:43:10 -07:00
|
|
|
from pydub import AudioSegment
|
2025-01-09 18:41:44 -07:00
|
|
|
|
2025-01-06 03:32:41 -07:00
|
|
|
from ..core.config import settings
|
2024-12-31 01:57:00 -07:00
|
|
|
|
2025-01-09 18:41:44 -07:00
|
|
|
|
2025-01-04 17:54:54 -07:00
|
|
|
class AudioNormalizer:
    """Handles audio normalization state for a single stream.

    The instance pre-computes how many samples to drop from each end of a
    chunk (derived from ``settings.gap_trim_ms`` and the sample rate) so the
    same trimming is applied to every chunk passed through it.
    """

    def __init__(self, sample_rate: int = 24000):
        """Pre-compute the trim length for the stream.

        Args:
            sample_rate: Sample rate of the audio in Hz. Defaults to 24000,
                the value that was previously hard-coded.
        """
        self.int16_max = np.iinfo(np.int16).max
        self.chunk_trim_ms = settings.gap_trim_ms
        self.sample_rate = sample_rate  # Sample rate of the audio
        self.samples_to_trim = int(self.chunk_trim_ms * self.sample_rate / 1000)

    async def normalize(self, audio_data: np.ndarray) -> np.ndarray:
        """Trim the chunk edges and convert the audio to int16 samples.

        A fixed number of samples (``self.samples_to_trim``) is removed from
        both the start and the end of the chunk; no silence detection is
        performed. The remaining samples are multiplied by the int16 maximum
        and saturated to the int16 range.

        Args:
            audio_data: Input audio data as numpy array. The scaling implies
                floating point samples nominally in [-1.0, 1.0]
                (NOTE(review): confirm against the producer of the audio).

        Returns:
            Trimmed audio as an ``np.int16`` array.

        Raises:
            ValueError: If ``audio_data`` is empty.
        """
        if len(audio_data) == 0:
            raise ValueError("Audio data cannot be empty")

        # Convert to float32 for processing
        audio_float = audio_data.astype(np.float32)

        # Trim start and end if enough samples. The explicit ``trim > 0``
        # guard matters: with a trim of 0 the slice ``[0:-0]`` is ``[0:0]``
        # and would discard the whole chunk.
        trim = self.samples_to_trim
        if trim > 0 and len(audio_float) > 2 * trim:
            audio_float = audio_float[trim:-trim]

        # Scale to int16 range, saturating instead of wrapping around when a
        # sample lies outside the nominal [-1.0, 1.0] range.
        limits = np.iinfo(np.int16)
        scaled = np.clip(audio_float * limits.max, limits.min, limits.max)
        return scaled.astype(np.int16)
|
2025-01-04 17:54:54 -07:00
|
|
|
|
2025-01-09 18:41:44 -07:00
|
|
|
|
2024-12-31 01:52:16 -07:00
|
|
|
class AudioService:
    """Service for audio format conversions"""

    # Default per-format encoder settings, tuned for encoding speed.
    # A compression_level of 0.0 is the lowest value soundfile accepts,
    # i.e. the least compression effort.
    DEFAULT_SETTINGS = {
        "mp3": {
            "bitrate_mode": "CONSTANT",  # Faster than variable bitrate
            "compression_level": 0.0,  # Lowest compression level
        },
        "opus": {
            "compression_level": 0.0,  # Lowest compression level
        },
        "flac": {
            "compression_level": 0.0,  # Lowest compression level
        },
        "aac": {
            "bitrate": "192k",  # Default AAC bitrate
        },
    }

    @staticmethod
    def _resolve_settings(output_format: str, format_settings: "dict | None") -> dict:
        """Return the defaults for ``output_format`` overlaid with caller overrides."""
        overrides = format_settings.get(output_format, {}) if format_settings else {}
        return {**AudioService.DEFAULT_SETTINGS[output_format], **overrides}

    @staticmethod
    async def convert_audio(
        audio_data: np.ndarray,
        sample_rate: int,
        output_format: str,
        is_first_chunk: bool = True,
        is_last_chunk: bool = False,
        normalizer: "AudioNormalizer | None" = None,
        format_settings: "dict | None" = None,
        stream: bool = True,
    ) -> bytes:
        """Convert audio data to specified format

        The audio is always passed through an ``AudioNormalizer`` first and
        the resulting int16 samples are then encoded into an in-memory buffer.

        Args:
            audio_data: Numpy array of audio samples
            sample_rate: Sample rate of the audio
            output_format: Target format (wav, mp3, opus, flac, pcm, aac)
            is_first_chunk: Whether this is the first chunk of a stream.
                Only used to log the start of a FLAC stream.
            is_last_chunk: Whether this is the last chunk of a stream.
                Currently unused by this implementation.
            normalizer: Optional AudioNormalizer instance for consistent
                normalization across chunks. A new one is created when omitted.
            format_settings: Optional dict of format-specific settings to
                override defaults
                Example: {
                    "mp3": {
                        "bitrate_mode": "VARIABLE",
                        "compression_level": 0.8
                    }
                }
                Default settings favour speed (optimized for localhost @ 0.0):
                - MP3: constant bitrate, compression level 0.0
                - OPUS: compression level 0.0
                - FLAC: compression level 0.0
                - AAC: 192k bitrate
            stream: Currently unused by this implementation; accepted for
                interface compatibility.

        Returns:
            Bytes of the converted audio

        Raises:
            ValueError: If the format is unsupported or any step of the
                normalization/encoding fails. The original exception is
                attached as ``__cause__``.
        """
        buffer = BytesIO()

        try:
            # Always normalize audio to ensure proper amplitude scaling
            if normalizer is None:
                normalizer = AudioNormalizer()
            normalized_audio = await normalizer.normalize(audio_data)

            if output_format == "pcm":
                # Raw 16-bit PCM samples, no header
                buffer.write(normalized_audio.tobytes())
            elif output_format == "wav":
                # WAV format with headers
                sf.write(
                    buffer,
                    normalized_audio,
                    sample_rate,
                    format="WAV",
                    subtype="PCM_16",
                )
            elif output_format == "mp3":
                # MP3 format with proper framing
                mp3_settings = AudioService._resolve_settings("mp3", format_settings)
                sf.write(
                    buffer, normalized_audio, sample_rate, format="MP3", **mp3_settings
                )
            elif output_format == "opus":
                # Opus format in OGG container
                opus_settings = AudioService._resolve_settings("opus", format_settings)
                sf.write(
                    buffer,
                    normalized_audio,
                    sample_rate,
                    format="OGG",
                    subtype="OPUS",
                    **opus_settings,
                )
            elif output_format == "flac":
                # FLAC format with proper framing
                if is_first_chunk:
                    logger.info("Starting FLAC stream...")
                flac_settings = AudioService._resolve_settings("flac", format_settings)
                sf.write(
                    buffer,
                    normalized_audio,
                    sample_rate,
                    format="FLAC",
                    subtype="PCM_16",
                    **flac_settings,
                )
            elif output_format == "aac":
                # Convert numpy array directly to AAC using pydub
                channels = (
                    1 if len(normalized_audio.shape) == 1 else normalized_audio.shape[1]
                )
                audio_segment = AudioSegment(
                    normalized_audio.tobytes(),
                    frame_rate=sample_rate,
                    sample_width=normalized_audio.dtype.itemsize,
                    channels=channels,
                )
                aac_settings = AudioService._resolve_settings("aac", format_settings)
                audio_segment.export(
                    buffer,
                    format="adts",  # ADTS is a common AAC container format
                    bitrate=aac_settings["bitrate"],
                )
            else:
                raise ValueError(
                    f"Format {output_format} not supported. Supported formats are: wav, mp3, opus, flac, pcm, aac."
                )

            # getvalue() returns the whole buffer regardless of the current
            # stream position, so no seek is required.
            return buffer.getvalue()

        except Exception as e:
            logger.error(f"Error converting audio to {output_format}: {str(e)}")
            # Chain explicitly so the root cause stays visible in tracebacks.
            raise ValueError(
                f"Failed to convert audio to {output_format}: {str(e)}"
            ) from e
|