Add StreamingAudioWriter class for audio format conversions and remove deprecated migration notes

2025-08-05 16:48:53 +00:00 · 2025-01-27 20:23:35 -07:00 · 2025-01-27 20:23:35 -07:00 · 8a60a2b90c
commit 8a60a2b90c
parent 409a9e9af3
4 changed files with 141 additions and 180 deletions
--- a/MigrationWorkingNotes.md
+++ b/MigrationWorkingNotes.md
@ -1,70 +0,0 @@
 # UV Setup
 Deprecated notes for myself
 ## Structure
 ```
 docker/
  ├── cpu/
  │   ├── pyproject.toml     # CPU deps (torch CPU)
  │   └── requirements.lock  # CPU lockfile
  ├── gpu/
  │   ├── pyproject.toml     # GPU deps (torch CUDA)
  │   └── requirements.lock  # GPU lockfile
  └── shared/
      └── pyproject.toml     # Common deps
 ```
 ## Regenerate Lock Files
 ### CPU
 ```bash
 cd docker/cpu
 uv pip compile pyproject.toml ../shared/pyproject.toml --output-file requirements.lock
 ```
 ### GPU
 ```bash
 cd docker/gpu
 uv pip compile pyproject.toml ../shared/pyproject.toml --output-file requirements.lock
 ```
 ## Local Dev Setup
 ### CPU
 ```bash
 cd docker/cpu
 uv venv
 .venv\Scripts\activate  # Windows
 uv pip sync requirements.lock
 ```
 ### GPU
 ```bash
 cd docker/gpu
 uv venv
 .venv\Scripts\activate  # Windows
 uv pip sync requirements.lock --extra-index-url https://download.pytorch.org/whl/cu121 --index-strategy unsafe-best-match
 ```
 ### Run Server
 ```bash
 # From project root with venv active:
 uvicorn api.src.main:app --reload
 ```
 ## Docker
 ### CPU
 ```bash
 cd docker/cpu
 docker compose up
 ```
 ### GPU
 ```bash
 cd docker/gpu
 docker compose up
 ```
 ## Known Issues
 - Module imports: Run server from project root
 - PyTorch CUDA: Always use --extra-index-url and --index-strategy for GPU env
--- a/api/src/services/audio.py
+++ b/api/src/services/audio.py
@ -10,7 +10,7 @@ from loguru import logger
 from pydub import AudioSegment
 from ..core.config import settings
-
+from .streaming_audio_writer import StreamingAudioWriter
 class AudioNormalizer:
    """Handles audio normalization state for a single stream"""
@ -45,7 +45,7 @@ class AudioNormalizer:
 class AudioService:
-    """Service for audio format conversions"""
+    """Service for audio format conversions with streaming support"""
    # Default audio format settings balanced for speed and compression
    DEFAULT_SETTINGS = {
@ -64,6 +64,8 @@ class AudioService:
        },
    }
    _writers = {}
    @staticmethod
    async def convert_audio(
        audio_data: np.ndarray,
@ -72,127 +74,46 @@ class AudioService:
        is_first_chunk: bool = True,
        is_last_chunk: bool = False,
        normalizer: AudioNormalizer = None,
        format_settings: dict = None,
        stream: bool = True,
    ) -> bytes:
-        """Convert audio data to specified format
+        """Convert audio data to specified format with streaming support
        Args:
            audio_data: Numpy array of audio samples
            sample_rate: Sample rate of the audio
-            output_format: Target format (wav, mp3, opus, flac, pcm)
+            output_format: Target format (wav, mp3, ogg, pcm)
-            is_first_chunk: Whether this is the first chunk of a stream
+            is_first_chunk: Whether this is the first chunk
-            normalizer: Optional AudioNormalizer instance for consistent normalization across chunks
+            is_last_chunk: Whether this is the last chunk
-            format_settings: Optional dict of format-specific settings to override defaults
+            normalizer: Optional AudioNormalizer instance for consistent normalization
                Example: {
                    "mp3": {
                        "bitrate_mode": "VARIABLE",
                        "compression_level": 0.8
                    }
                }
                Default settings balance speed and compression:
                optimized for localhost @ 0.0
                - MP3: constant bitrate, no compression (0.0)
                - OPUS: no compression (0.0)
                - FLAC: no compression (0.0)
        Returns:
-            Bytes of the converted audio
+            Bytes of the converted audio chunk
        """
        buffer = BytesIO()
        try:
            # Always normalize audio to ensure proper amplitude scaling
            if normalizer is None:
                normalizer = AudioNormalizer()
            normalized_audio = await normalizer.normalize(audio_data)
-            if output_format == "pcm":
+            # Get or create format-specific writer
-                # Raw 16-bit PCM samples, no header
+            writer_key = f"{output_format}_{sample_rate}"
-                buffer.write(normalized_audio.tobytes())
+            if is_first_chunk or writer_key not in AudioService._writers:
-            elif output_format == "wav":
+                AudioService._writers[writer_key] = StreamingAudioWriter(
-                # Write the WAV header ourselves so that we can specify a "fake" data size.
+                    output_format, sample_rate
                # This is necessary for streaming responses to work properly: if we simply
                # concatenated individual WAV files then the initial chunk's header length
                # would be shorter than the full file length and subsequent chunks' RIFF
                # headers would appear in the middle of the audio data.
                if is_first_chunk:
                    # Modified from Python stdlib's wave.py module:
                    buffer.write(b'RIFF')
                    buffer.write(struct.pack('<L4s4sLHHLLHH4s',
                        0xFFFFFFFF,  # total size (set to max)
                        b'WAVE',
                        b'fmt ',
                        16,
                        1,  # PCM format
                        1,  # channels
                        sample_rate,
                        sample_rate * 2,  # byte rate
                        2,  # block align
                        16,  # bits per sample
                        b'data'
                    ))
                    buffer.write(struct.pack('<L', 0xFFFFFFFF))  # data size (set to max)
                # write raw PCM data
                buffer.write(normalized_audio.tobytes())
            elif output_format == "mp3":
                # MP3 format with proper framing
                settings = format_settings.get("mp3", {}) if format_settings else {}
                settings = {**AudioService.DEFAULT_SETTINGS["mp3"], **settings}
                sf.write(
                    buffer, normalized_audio, sample_rate, format="MP3", **settings
                )
            elif output_format == "opus":
                # Opus format in OGG container
                settings = format_settings.get("opus", {}) if format_settings else {}
                settings = {**AudioService.DEFAULT_SETTINGS["opus"], **settings}
                sf.write(
                    buffer,
                    normalized_audio,
                    sample_rate,
                    format="OGG",
                    subtype="OPUS",
                    **settings,
                )
            elif output_format == "flac":
                # FLAC format with proper framing
                if is_first_chunk:
                    logger.info("Starting FLAC stream...")
                settings = format_settings.get("flac", {}) if format_settings else {}
                settings = {**AudioService.DEFAULT_SETTINGS["flac"], **settings}
                sf.write(
                    buffer,
                    normalized_audio,
                    sample_rate,
                    format="FLAC",
                    subtype="PCM_16",
                    **settings,
                )
            elif output_format == "aac":           
                # Convert numpy array directly to AAC using pydub
                audio_segment = AudioSegment(
                    normalized_audio.tobytes(), 
                    frame_rate=sample_rate,
                    sample_width=normalized_audio.dtype.itemsize,
                    channels=1 if len(normalized_audio.shape) == 1 else normalized_audio.shape[1]
                )
            writer = AudioService._writers[writer_key]
-                settings = format_settings.get("aac", {}) if format_settings else {}
+            # Write the current chunk
-                settings = {**AudioService.DEFAULT_SETTINGS["aac"], **settings}
+            chunk_data = writer.write_chunk(normalized_audio)
-                audio_segment.export(
+            # Handle last chunk and cleanup
-                    buffer,
+            if is_last_chunk:
-                    format="adts",  # ADTS is a common AAC container format
+                final_data = writer.close()
-                    bitrate=settings["bitrate"]
+                if final_data:
-                )
+                    chunk_data += final_data
-            else:
+                del AudioService._writers[writer_key]
                raise ValueError(
                    f"Format {output_format} not supported. Supported formats are: wav, mp3, opus, flac, pcm, aac."
                )
-            buffer.seek(0)
+            return chunk_data
            return buffer.getvalue()
        except Exception as e:
-            logger.error(f"Error converting audio to {output_format}: {str(e)}")
+            logger.error(f"Error converting audio stream to {output_format}: {str(e)}")
-            raise ValueError(f"Failed to convert audio to {output_format}: {str(e)}")
+            raise ValueError(f"Failed to convert audio stream to {output_format}: {str(e)}")
--- a/api/src/services/streaming_audio_writer.py
+++ b/api/src/services/streaming_audio_writer.py
@ -0,0 +1,111 @@
 """Audio conversion service with proper streaming support"""
 from io import BytesIO
 import struct
 from typing import Generator, Optional
 import numpy as np
 import soundfile as sf
 from loguru import logger
 from pydub import AudioSegment
 class StreamingAudioWriter:
    """Handles streaming audio format conversions"""
    def __init__(self, format: str, sample_rate: int, channels: int = 1):
        self.format = format.lower()
        self.sample_rate = sample_rate
        self.channels = channels
        self.bytes_written = 0
        # Format-specific setup
        if self.format == "wav":
            self._write_wav_header()
        elif self.format == "ogg":
            self.writer = sf.SoundFile(
                file=BytesIO(),
                mode='w',
                samplerate=sample_rate,
                channels=channels,
                format='OGG',
                subtype='VORBIS'
            )
        elif self.format == "mp3":
            # For MP3, we'll use pydub's incremental writer
            self.buffer = BytesIO()
            self.encoder = AudioSegment.from_mono_audiosegments()
    def _write_wav_header(self) -> bytes:
        """Write WAV header with correct streaming format"""
        header = BytesIO()
        header.write(b'RIFF')
        header.write(struct.pack('<L', 0))  # Placeholder for file size
        header.write(b'WAVE')
        header.write(b'fmt ')
        header.write(struct.pack('<L', 16))  # fmt chunk size
        header.write(struct.pack('<H', 1))   # PCM format
        header.write(struct.pack('<H', self.channels))
        header.write(struct.pack('<L', self.sample_rate))
        header.write(struct.pack('<L', self.sample_rate * self.channels * 2))  # Byte rate
        header.write(struct.pack('<H', self.channels * 2))  # Block align
        header.write(struct.pack('<H', 16))  # Bits per sample
        header.write(b'data')
        header.write(struct.pack('<L', 0))  # Placeholder for data size
        return header.getvalue()
    def write_chunk(self, audio_data: np.ndarray) -> bytes:
        """Write a chunk of audio data and return bytes in the target format"""
        buffer = BytesIO()
        if self.format == "wav":
            # For WAV, we write raw PCM after the first chunk
            if self.bytes_written == 0:
                buffer.write(self._write_wav_header())
            buffer.write(audio_data.tobytes())
            self.bytes_written += len(audio_data.tobytes())
        elif self.format == "ogg":
            # OGG/Vorbis handles streaming naturally
            self.writer.write(audio_data)
            self.writer.flush()
            buffer = self.writer.file
            buffer.seek(0, 2)  # Seek to end
            chunk = buffer.getvalue()
            buffer.seek(0)
            buffer.truncate()
            return chunk
        elif self.format == "mp3":
            # Convert chunk to AudioSegment and encode
            segment = AudioSegment(
                audio_data.tobytes(),
                frame_rate=self.sample_rate,
                sample_width=audio_data.dtype.itemsize,
                channels=self.channels
            )
            self.encoder += segment
            self.encoder.export(buffer, format="mp3")
        return buffer.getvalue()
    def close(self) -> Optional[bytes]:
        """Finish the audio file and return any remaining data"""
        if self.format == "wav":
            # Update WAV header with final file size
            buffer = BytesIO()
            buffer.write(b'RIFF')
            buffer.write(struct.pack('<L', self.bytes_written + 36))  # File size
            buffer.write(b'WAVE')
            # ... rest of header ...
            buffer.write(struct.pack('<L', self.bytes_written))  # Data size
            return buffer.getvalue()
        elif self.format == "ogg":
            self.writer.close()
            return None
        elif self.format == "mp3":
            # Flush any remaining MP3 frames
            buffer = BytesIO()
            self.encoder.export(buffer, format="mp3")
            return buffer.getvalue()
--- a/api/src/services/tts_service.py
+++ b/api/src/services/tts_service.py
@ -73,8 +73,7 @@ class TTSService:
                        output_format,
                        is_first_chunk=is_first,
                        normalizer=normalizer,
-                        is_last_chunk=is_last,
+                        is_last_chunk=is_last
                        stream=True
                    )
                return chunk_audio