2024-12-31 01:52:16 -07:00
|
|
|
"""Audio conversion service"""
|
2024-12-31 01:57:00 -07:00
|
|
|
|
2024-12-31 01:52:16 -07:00
|
|
|
from io import BytesIO
|
2024-12-31 02:55:51 -07:00
|
|
|
|
2024-12-31 01:52:16 -07:00
|
|
|
import numpy as np
|
|
|
|
import soundfile as sf
|
2025-01-06 03:32:41 -07:00
|
|
|
import scipy.io.wavfile as wavfile
|
2024-12-31 02:55:51 -07:00
|
|
|
from loguru import logger
|
2025-01-06 03:32:41 -07:00
|
|
|
from ..core.config import settings
|
2024-12-31 01:57:00 -07:00
|
|
|
|
2025-01-04 17:54:54 -07:00
|
|
|
class AudioNormalizer:
    """Handles audio normalization state for a single stream.

    Each chunk is peak-normalized to the full int16 range and, unless it is
    the final chunk, has a fixed number of trailing samples removed to reduce
    audible gaps between consecutive chunks.
    """

    def __init__(self):
        # Full-scale amplitude of the int16 output.
        self.int16_max = np.iinfo(np.int16).max
        # Milliseconds trimmed from the end of every non-final chunk.
        self.chunk_trim_ms = settings.gap_trim_ms
        self.sample_rate = 24000  # Sample rate of the audio
        # Trim length expressed in samples at `self.sample_rate`.
        self.samples_to_trim = int(self.chunk_trim_ms * self.sample_rate / 1000)

    def normalize(self, audio_data: np.ndarray, is_last_chunk: bool = False) -> np.ndarray:
        """Normalize audio data to int16 range and trim chunk boundaries.

        Args:
            audio_data: Audio samples of any numeric dtype.
            is_last_chunk: When True the chunk tail is kept intact; otherwise
                `self.samples_to_trim` trailing samples are dropped.

        Returns:
            A new int16 array scaled so the loudest sample sits at
            +/- `self.int16_max`. Silent and empty chunks are returned
            unscaled (all zeros / empty).
        """
        audio_float = np.asarray(audio_data, dtype=np.float32)

        # Peak-normalize to [-1, 1]. The size check avoids np.max raising on
        # an empty chunk, and the peak is computed only once.
        peak = np.max(np.abs(audio_float)) if audio_float.size else 0.0
        if peak > 0:
            audio_float = audio_float / peak

        # Trim end of non-final chunks to reduce gaps. The `0 < trim` guard
        # matters: a slice of `[:-0]` is `[:0]` and would discard the whole
        # chunk, and a negative value would keep only the chunk's head.
        trim = self.samples_to_trim
        if not is_last_chunk and 0 < trim < len(audio_float):
            audio_float = audio_float[:-trim]

        # Scale to int16 range
        return (audio_float * self.int16_max).astype(np.int16)
|
2025-01-04 17:54:54 -07:00
|
|
|
|
2024-12-31 01:52:16 -07:00
|
|
|
class AudioService:
    """Service for audio format conversions"""

    # Default per-format encoder settings, forwarded to `sf.write` as keyword
    # arguments. Every compression level is 0.0, the low end of the scale,
    # trading output size for encoding speed.
    DEFAULT_SETTINGS = {
        "mp3": {
            "bitrate_mode": "CONSTANT",  # Faster than variable bitrate
            "compression_level": 0.0,  # Lowest compression setting
        },
        "opus": {
            "compression_level": 0.0,  # Lowest compression setting
        },
        "flac": {
            "compression_level": 0.0,  # Lowest compression setting
        },
    }

    @staticmethod
    def _encoder_settings(output_format: str, format_settings: dict = None) -> dict:
        """Return the defaults for `output_format` overlaid with caller overrides."""
        overrides = format_settings.get(output_format, {}) if format_settings else {}
        return {**AudioService.DEFAULT_SETTINGS[output_format], **overrides}

    @staticmethod
    def convert_audio(
        audio_data: np.ndarray,
        sample_rate: int,
        output_format: str,
        is_first_chunk: bool = True,
        is_last_chunk: bool = False,
        normalizer: "AudioNormalizer | None" = None,
        format_settings: dict = None,
        stream: bool = True,
    ) -> bytes:
        """Convert audio data to specified format

        Args:
            audio_data: Numpy array of audio samples
            sample_rate: Sample rate of the audio
            output_format: Target format (wav, mp3, opus, flac, pcm)
            is_first_chunk: Whether this is the first chunk of a stream
                (only used to log the start of a FLAC stream)
            is_last_chunk: Whether this is the final chunk of a stream;
                forwarded to the normalizer, which skips end-trimming for it
            normalizer: Optional AudioNormalizer instance for consistent normalization across chunks.
                A fresh AudioNormalizer is created when omitted.
            format_settings: Optional dict of format-specific settings to override defaults
                Example: {
                    "mp3": {
                        "bitrate_mode": "VARIABLE",
                        "compression_level": 0.8
                    }
                }
                Default settings balance speed and compression:
                optimized for localhost @ 0.0
                - MP3: constant bitrate, no compression (0.0)
                - OPUS: no compression (0.0)
                - FLAC: no compression (0.0)
            stream: Accepted for interface compatibility; not read by this method

        Returns:
            Bytes of the converted audio

        Raises:
            ValueError: If the format is unsupported or normalization/encoding
                fails; the underlying exception is attached as `__cause__`.
        """
        buffer = BytesIO()

        try:
            # Always normalize audio to ensure proper amplitude scaling
            if normalizer is None:
                normalizer = AudioNormalizer()
            normalized_audio = normalizer.normalize(audio_data, is_last_chunk=is_last_chunk)

            if output_format == "pcm":
                # Raw 16-bit PCM samples, no header
                buffer.write(normalized_audio.tobytes())
            elif output_format == "wav":
                # Always use soundfile for WAV to ensure proper headers and normalization
                sf.write(buffer, normalized_audio, sample_rate, format="WAV", subtype="PCM_16")
            elif output_format == "mp3":
                # Named `encoder_settings` so the module-level `settings`
                # import is not shadowed inside this function.
                encoder_settings = AudioService._encoder_settings("mp3", format_settings)
                sf.write(buffer, normalized_audio, sample_rate, format="MP3", **encoder_settings)
            elif output_format == "opus":
                encoder_settings = AudioService._encoder_settings("opus", format_settings)
                sf.write(
                    buffer, normalized_audio, sample_rate,
                    format="OGG", subtype="OPUS", **encoder_settings
                )
            elif output_format == "flac":
                if is_first_chunk:
                    logger.info("Starting FLAC stream...")
                encoder_settings = AudioService._encoder_settings("flac", format_settings)
                sf.write(
                    buffer, normalized_audio, sample_rate,
                    format="FLAC", subtype="PCM_16", **encoder_settings
                )
            else:
                # Covers "aac" as well: it yields exactly the same message as
                # any other unsupported format, so no dedicated branch is needed.
                raise ValueError(
                    f"Format {output_format} not supported. Supported formats are: wav, mp3, opus, flac, pcm."
                )

            # getvalue() returns the whole buffer regardless of stream position.
            return buffer.getvalue()

        except Exception as e:
            logger.error(f"Error converting audio to {output_format}: {str(e)}")
            # Chain the original exception so the root cause stays in the traceback.
            raise ValueError(f"Failed to convert audio to {output_format}: {str(e)}") from e
|