Kokoro-FastAPI/api/src/services/streaming_audio_writer.py

117 lines
4.1 KiB
Python
Raw Normal View History

"""Audio conversion service with proper streaming support"""
import struct
2025-02-09 18:32:17 -07:00
from io import BytesIO
from typing import Optional
2025-04-04 16:50:46 -06:00
import av
import numpy as np
import soundfile as sf
from loguru import logger
from pydub import AudioSegment
2025-04-04 16:50:46 -06:00
2025-02-09 18:32:17 -07:00
class StreamingAudioWriter:
    """Convert chunks of raw audio into a target format, one chunk at a time.

    ``pcm`` is a pass-through: chunks are returned as their raw bytes. Every
    other supported format is encoded through ``av`` into an in-memory buffer
    that is emptied after each chunk, so the bytes returned by successive
    ``write_chunk`` calls can be concatenated or streamed as they are produced.
    """

    # Encoder used for each container format; "pcm" needs no encoder.
    _CODEC_MAP = {
        "wav": "pcm_s16le",
        "mp3": "mp3",
        "opus": "libopus",
        "flac": "flac",
        "aac": "aac",
    }
    _SUPPORTED_FORMATS = ("wav", "flac", "mp3", "pcm", "aac", "opus")
    # Formats whose stream gets an explicit bit rate.
    _BIT_RATE_FORMATS = ("mp3", "aac", "opus")
    _BIT_RATE = 128000

    def __init__(self, format: str, sample_rate: int, channels: int = 1):
        """Set up the writer and, for encoded formats, its container and stream.

        Args:
            format: Target format name, case-insensitive. One of wav, flac,
                mp3, pcm, aac or opus.
            sample_rate: Sample rate given to the encoder stream and stamped
                on every frame.
            channels: 1 selects a mono layout; any other value selects stereo.

        Raises:
            ValueError: If ``format`` is not a supported format.
        """
        self.format = format.lower()
        self.sample_rate = sample_rate
        self.channels = channels
        self.bytes_written = 0
        self.pts = 0  # running presentation timestamp, advanced per frame

        if self.format not in self._SUPPORTED_FORMATS:
            raise ValueError(f"Unsupported format: {self.format}")
        if self.format == "pcm":
            # Raw pass-through: no buffer, container or stream is created.
            return

        self.output_buffer = BytesIO()
        container_options = {}
        if self.format == "mp3":
            # Try disabling the Xing VBR header to fix iOS timeline reading issues.
            container_options = {"write_xing": "0"}
            logger.debug("Disabling Xing VBR header for MP3 encoding.")

        try:
            self.container = av.open(
                self.output_buffer,
                mode="w",
                # AAC is written into an ADTS container.
                format="adts" if self.format == "aac" else self.format,
                options=container_options,
            )
            self.stream = self.container.add_stream(
                self._CODEC_MAP[self.format],
                rate=self.sample_rate,
                layout=self._layout(),
            )
            if self.format in self._BIT_RATE_FORMATS:
                self.stream.bit_rate = self._BIT_RATE
        except Exception:
            # Do not leave a half-built container/buffer behind on failure.
            self.close()
            raise

    def _layout(self) -> str:
        """Return the channel layout name used for both stream and frames."""
        return "mono" if self.channels == 1 else "stereo"

    def close(self):
        """Close the container and the output buffer, if they were created.

        The buffer is closed even when closing the container raises.
        """
        try:
            if hasattr(self, "container"):
                self.container.close()
        finally:
            if hasattr(self, "output_buffer"):
                self.output_buffer.close()

    def _encode(self, audio_data: np.ndarray) -> None:
        """Encode one chunk and mux the resulting packets into the buffer."""
        # The frame is declared as "s16", so audio_data presumably holds int16
        # samples -- TODO confirm against callers.
        frame = av.AudioFrame.from_ndarray(
            audio_data.reshape(1, -1),
            format="s16",
            layout=self._layout(),
        )
        frame.sample_rate = self.sample_rate
        frame.pts = self.pts
        self.pts += frame.samples
        for packet in self.stream.encode(frame):
            self.container.mux(packet)

    def _drain(self) -> bytes:
        """Return everything currently in the buffer and empty it."""
        data = self.output_buffer.getvalue()
        self.output_buffer.seek(0)
        self.output_buffer.truncate(0)
        return data

    def write_chunk(
        self, audio_data: Optional[np.ndarray] = None, finalize: bool = False
    ) -> bytes:
        """Write a chunk of audio data and return bytes in the target format.

        Args:
            audio_data: Audio data to write, or None if there is nothing to add.
            finalize: Whether this is the final write. For encoded formats the
                encoder is flushed and the container and buffer are closed, so
                the writer must not be used afterwards.

        Returns:
            The bytes produced by this call; ``b""`` when there is nothing to
            emit.
        """
        # len() rather than truthiness: the truth value of a multi-element
        # ndarray is ambiguous.
        has_audio = audio_data is not None and len(audio_data) > 0

        if self.format == "pcm":
            # Nothing to flush or close for raw output, finalizing or not.
            return audio_data.tobytes() if has_audio else b""

        if not finalize:
            if not has_audio:
                return b""
            self._encode(audio_data)
            return self._drain()

        try:
            if has_audio:
                # Audio handed over together with finalize=True is encoded
                # before the flush instead of being dropped.
                self._encode(audio_data)
            # Flush the stream encoder.
            for packet in self.stream.encode(None):
                self.container.mux(packet)
            logger.debug("Muxed final packets.")
            # NOTE(review): the bytes are read *before* the container is
            # closed, so anything the muxer writes while closing is not part
            # of the returned data -- confirm this is intended for every format.
            return self.output_buffer.getvalue()
        finally:
            # Release the container and buffer even if flushing failed.
            self.close()