Merge pull request #94 from JoshRosen/fix-wav-header-in-streaming-responses

This commit is contained in:
remsky 2025-01-26 16:03:06 -07:00 committed by GitHub
commit 0de22ada38
No known key found for this signature in database
GPG key ID: B5690EEEBB952194

View file

@@ -1,6 +1,7 @@
"""Audio conversion service""" """Audio conversion service"""
from io import BytesIO from io import BytesIO
import struct
import numpy as np import numpy as np
import scipy.io.wavfile as wavfile import scipy.io.wavfile as wavfile
@@ -107,14 +108,30 @@ class AudioService:
# Raw 16-bit PCM samples, no header # Raw 16-bit PCM samples, no header
buffer.write(normalized_audio.tobytes()) buffer.write(normalized_audio.tobytes())
elif output_format == "wav": elif output_format == "wav":
# WAV format with headers # Write the WAV header ourselves so that we can specify a "fake" data size.
sf.write( # This is necessary for streaming responses to work properly: if we simply
buffer, # concatenated individual WAV files then the initial chunk's header length
normalized_audio, # would be shorter than the full file length and subsequent chunks' RIFF
# headers would appear in the middle of the audio data.
if is_first_chunk:
# Modified from Python stdlib's wave.py module:
buffer.write(b'RIFF')
buffer.write(struct.pack('<L4s4sLHHLLHH4s',
0xFFFFFFFF, # total size (set to max)
b'WAVE',
b'fmt ',
16,
1, # PCM format
1, # channels
sample_rate, sample_rate,
format="WAV", sample_rate * 2, # byte rate
subtype="PCM_16", 2, # block align
) 16, # bits per sample
b'data'
))
buffer.write(struct.pack('<L', 0xFFFFFFFF)) # data size (set to max)
# write raw PCM data
buffer.write(normalized_audio.tobytes())
elif output_format == "mp3": elif output_format == "mp3":
# MP3 format with proper framing # MP3 format with proper framing
settings = format_settings.get("mp3", {}) if format_settings else {} settings = format_settings.get("mp3", {}) if format_settings else {}