Merge pull request #94 from JoshRosen/fix-wav-header-in-streaming-responses

2025-04-13 09:39:17 +00:00 · 2025-01-26 16:03:06 -07:00 · 2025-01-26 16:03:06 -07:00 · 0de22ada38
commit 0de22ada38
parent 55ce88bfb6 b8d592081e
1 changed file with 25 additions and 8 deletions
--- a/api/src/services/audio.py
+++ b/api/src/services/audio.py
@@ -1,6 +1,7 @@
 """Audio conversion service"""
 from io import BytesIO
 import struct
 import numpy as np
 import scipy.io.wavfile as wavfile
@@ -107,14 +108,30 @@ class AudioService:
                # Raw 16-bit PCM samples, no header
                buffer.write(normalized_audio.tobytes())
            elif output_format == "wav":
-                # WAV format with headers
+                # Write the WAV header ourselves so that we can specify a "fake" data size.
-                sf.write(
+                # This is necessary for streaming responses to work properly: if we simply
-                    buffer,
+                # concatenated individual WAV files then the initial chunk's header length
-                    normalized_audio,
+                # would be shorter than the full file length and subsequent chunks' RIFF
                # headers would appear in the middle of the audio data.
                if is_first_chunk:
                    # Modified from Python stdlib's wave.py module:
                    buffer.write(b'RIFF')
                    buffer.write(struct.pack('<L4s4sLHHLLHH4s',
                        0xFFFFFFFF,  # total size (set to max)
                        b'WAVE',
                        b'fmt ',
                        16,
                        1,  # PCM format
                        1,  # channels
                        sample_rate,
-                    format="WAV",
+                        sample_rate * 2,  # byte rate
-                    subtype="PCM_16",
+                        2,  # block align
-                )
+                        16,  # bits per sample
                        b'data'
                    ))
                    buffer.write(struct.pack('<L', 0xFFFFFFFF))  # data size (set to max)
                # write raw PCM data
                buffer.write(normalized_audio.tobytes())
            elif output_format == "mp3":
                # MP3 format with proper framing
                settings = format_settings.get("mp3", {}) if format_settings else {}