From 8a60a2b90c9ec28fc3921e3de144e092001cb801 Mon Sep 17 00:00:00 2001 From: remsky Date: Mon, 27 Jan 2025 20:23:35 -0700 Subject: [PATCH] Add StreamingAudioWriter class for audio format conversions and remove deprecated migration notes --- MigrationWorkingNotes.md | 70 ----------- api/src/services/audio.py | 137 +++++---------------- api/src/services/streaming_audio_writer.py | 111 +++++++++++++++++ api/src/services/tts_service.py | 3 +- 4 files changed, 141 insertions(+), 180 deletions(-) delete mode 100644 MigrationWorkingNotes.md create mode 100644 api/src/services/streaming_audio_writer.py diff --git a/MigrationWorkingNotes.md b/MigrationWorkingNotes.md deleted file mode 100644 index 99adf5d..0000000 --- a/MigrationWorkingNotes.md +++ /dev/null @@ -1,70 +0,0 @@ -# UV Setup -Deprecated notes for myself -## Structure -``` -docker/ - ├── cpu/ - │ ├── pyproject.toml # CPU deps (torch CPU) - │ └── requirements.lock # CPU lockfile - ├── gpu/ - │ ├── pyproject.toml # GPU deps (torch CUDA) - │ └── requirements.lock # GPU lockfile - └── shared/ - └── pyproject.toml # Common deps -``` - -## Regenerate Lock Files - -### CPU -```bash -cd docker/cpu -uv pip compile pyproject.toml ../shared/pyproject.toml --output-file requirements.lock -``` - -### GPU -```bash -cd docker/gpu -uv pip compile pyproject.toml ../shared/pyproject.toml --output-file requirements.lock -``` - -## Local Dev Setup - -### CPU -```bash -cd docker/cpu -uv venv -.venv\Scripts\activate # Windows -uv pip sync requirements.lock -``` - -### GPU -```bash -cd docker/gpu -uv venv -.venv\Scripts\activate # Windows -uv pip sync requirements.lock --extra-index-url https://download.pytorch.org/whl/cu121 --index-strategy unsafe-best-match -``` - -### Run Server -```bash -# From project root with venv active: -uvicorn api.src.main:app --reload -``` - -## Docker - -### CPU -```bash -cd docker/cpu -docker compose up -``` - -### GPU -```bash -cd docker/gpu -docker compose up -``` - -## Known Issues -- Module imports: Run server from project root -- PyTorch CUDA: Always use --extra-index-url and --index-strategy for GPU env diff --git a/api/src/services/audio.py b/api/src/services/audio.py index 6c9578b..89bed40 100644 --- a/api/src/services/audio.py +++ b/api/src/services/audio.py @@ -10,7 +10,7 @@ from loguru import logger from pydub import AudioSegment from ..core.config import settings - +from .streaming_audio_writer import StreamingAudioWriter class AudioNormalizer: """Handles audio normalization state for a single stream""" @@ -45,7 +45,7 @@ class AudioNormalizer: class AudioService: - """Service for audio format conversions""" + """Service for audio format conversions with streaming support""" # Default audio format settings balanced for speed and compression DEFAULT_SETTINGS = { @@ -64,6 +64,8 @@ class AudioService: }, } + _writers = {} + @staticmethod async def convert_audio( audio_data: np.ndarray, @@ -72,127 +74,46 @@ class AudioService: is_first_chunk: bool = True, is_last_chunk: bool = False, normalizer: AudioNormalizer = None, - format_settings: dict = None, - stream: bool = True, ) -> bytes: - """Convert audio data to specified format + """Convert audio data to specified format with streaming support Args: audio_data: Numpy array of audio samples sample_rate: Sample rate of the audio - output_format: Target format (wav, mp3, opus, flac, pcm) - is_first_chunk: Whether this is the first chunk of a stream - normalizer: Optional AudioNormalizer instance for consistent normalization across chunks - format_settings: Optional dict of format-specific settings to override defaults - Example: { - "mp3": { - "bitrate_mode": "VARIABLE", - "compression_level": 0.8 - } - } - Default settings balance speed and compression: - optimized for localhost @ 0.0 - - MP3: constant bitrate, no compression (0.0) - - OPUS: no compression (0.0) - - FLAC: no compression (0.0) + output_format: Target format (wav, mp3, ogg, pcm) + is_first_chunk: Whether this is the first chunk + is_last_chunk: Whether this is the last chunk + normalizer: Optional AudioNormalizer instance for consistent normalization Returns: - Bytes of the converted audio + Bytes of the converted audio chunk """ - buffer = BytesIO() - try: # Always normalize audio to ensure proper amplitude scaling if normalizer is None: normalizer = AudioNormalizer() normalized_audio = await normalizer.normalize(audio_data) - if output_format == "pcm": - # Raw 16-bit PCM samples, no header - buffer.write(normalized_audio.tobytes()) - elif output_format == "wav": - # Write the WAV header ourselves so that we can specify a "fake" data size. - # This is necessary for streaming responses to work properly: if we simply - # concatenated individual WAV files then the initial chunk's header length - # would be shorter than the full file length and subsequent chunks' RIFF - # headers would appear in the middle of the audio data. - if is_first_chunk: - # Modified from Python stdlib's wave.py module: - buffer.write(b'RIFF') - buffer.write(struct.pack(' bytes: + """Write WAV header with correct streaming format""" + header = BytesIO() + header.write(b'RIFF') + header.write(struct.pack(' bytes: + """Write a chunk of audio data and return bytes in the target format""" + buffer = BytesIO() + + if self.format == "wav": + # For WAV, we write raw PCM after the first chunk + if self.bytes_written == 0: + buffer.write(self._write_wav_header()) + buffer.write(audio_data.tobytes()) + self.bytes_written += len(audio_data.tobytes()) + + elif self.format == "ogg": + # OGG/Vorbis handles streaming naturally + self.writer.write(audio_data) + self.writer.flush() + buffer = self.writer.file + buffer.seek(0, 2) # Seek to end + chunk = buffer.getvalue() + buffer.seek(0) + buffer.truncate() + return chunk + + elif self.format == "mp3": + # Convert chunk to AudioSegment and encode + segment = AudioSegment( + audio_data.tobytes(), + frame_rate=self.sample_rate, + sample_width=audio_data.dtype.itemsize, + channels=self.channels + ) + self.encoder += segment + self.encoder.export(buffer, format="mp3") + + return buffer.getvalue() + + def close(self) -> Optional[bytes]: + """Finish the audio file and return any remaining data""" + if self.format == "wav": + # Update WAV header with final file size + buffer = BytesIO() + buffer.write(b'RIFF') + buffer.write(struct.pack('