Kokoro-FastAPI/api/src/services/streaming_audio_writer.py
CodePothunter e67264f789 Fix BUGs of streaming non-wav format audio; improve robustness of releasing audio container
Refactor StreamingAudioWriter to improve audio encoding reliability

- Restructure audio encoding logic for better error handling
- Create a new method `_create_container()` to manage container creation
- Improve handling of different audio formats and encoding scenarios
- Add error logging for audio chunk encoding failures
- Simplify container and stream management in write_chunk method
2025-03-10 13:26:55 +08:00

106 lines
3.4 KiB
Python

"""Audio conversion service with proper streaming support"""
import struct
from io import BytesIO
from typing import Optional, Dict
import numpy as np
import soundfile as sf
from loguru import logger
from pydub import AudioSegment
import av
class StreamingAudioWriter:
    """Handles streaming audio format conversions.

    Raw ``pcm`` chunks are passed through as bytes. Every other supported
    format is encoded with PyAV: each call to :meth:`write_chunk` builds a
    fresh in-memory container, encodes the chunk, flushes the encoder,
    closes the container and returns the bytes that were written.
    """

    # Formats whose FFmpeg muxer name differs from the user-facing format
    # name. FFmpeg has no "aac" muxer; raw AAC is written via "adts".
    _MUXER_NAMES = {"aac": "adts"}

    def __init__(self, format: str, sample_rate: int, channels: int = 1):
        """Store the stream parameters and validate the target format.

        Args:
            format: Target format name (case-insensitive). One of ``wav``,
                ``opus``, ``flac``, ``mp3``, ``aac`` or ``pcm``.
            sample_rate: Sample rate applied to the encoder stream and to
                every audio frame.
            channels: ``1`` selects the ``mono`` layout; any other value
                selects ``stereo``.

        Raises:
            ValueError: If ``format`` is not one of the supported names.
        """
        self.format = format.lower()
        self.sample_rate = sample_rate
        self.channels = channels
        self.bytes_written = 0
        # Running presentation timestamp, advanced by the number of samples
        # in every frame that gets encoded.
        self.pts = 0

        # Format-specific setup
        if self.format not in ["wav", "opus", "flac", "mp3", "aac", "pcm"]:
            raise ValueError(f"Unsupported format: {format}")

        # Codec mapping ("pcm" has no entry because it bypasses PyAV)
        self.codec_map = {
            "wav": "pcm_s16le",
            "mp3": "mp3",
            "opus": "libopus",
            "flac": "flac",
            "aac": "aac",
        }

    def _create_container(self):
        """Create a new container for each write operation.

        Returns:
            A ``(container, buffer)`` pair: the opened PyAV output container
            with a single audio stream already added, and the ``BytesIO``
            it writes into. ``(None, None)`` for the ``pcm`` format.
        """
        if self.format == "pcm":
            return None, None

        buffer = BytesIO()
        muxer_name = self._MUXER_NAMES.get(self.format, self.format)
        container = av.open(buffer, mode="w", format=muxer_name)
        try:
            stream = container.add_stream(
                self.codec_map[self.format],
                sample_rate=self.sample_rate,
                layout='mono' if self.channels == 1 else 'stereo',
            )
            stream.bit_rate = 96000
        except Exception:
            # Do not leak the half-configured container to the caller.
            container.close()
            raise
        return container, buffer

    def write_chunk(
        self, audio_data: Optional[np.ndarray] = None, finalize: bool = False
    ) -> bytes:
        """Write a chunk of audio data and return bytes in the target format.

        Args:
            audio_data: Audio data to write, or None if finalizing. For the
                encoded formats it is wrapped in an ``s16`` audio frame.
            finalize: Whether this is the final write to close the stream.
                Every chunk is flushed and closed on its own, so finalizing
                has nothing left to emit and returns empty bytes.

        Returns:
            The raw bytes of ``audio_data`` for ``pcm``; otherwise the
            encoded bytes for this chunk. Empty bytes when finalizing, when
            there is no input, or when encoding fails (the error is logged).
        """
        # Nothing to emit: finalize never produces data in this design, and
        # empty input has nothing to encode. Checked before any container is
        # opened so no resources are acquired on this path.
        if finalize or audio_data is None or len(audio_data) == 0:
            return b""

        # Handle PCM format separately as it doesn't use PyAV
        if self.format == "pcm":
            return audio_data.tobytes()

        container = None
        try:
            # Create a new container for this operation
            container, buffer = self._create_container()
            stream = container.streams[0]

            # Create audio frame
            frame = av.AudioFrame.from_ndarray(
                audio_data.reshape(1, -1),
                format='s16',
                layout='mono' if self.channels == 1 else 'stereo',
            )
            frame.sample_rate = self.sample_rate
            frame.pts = self.pts
            self.pts += frame.samples

            # Encode the frame
            for packet in stream.encode(frame):
                container.mux(packet)

            # Flush any remaining packets
            for packet in stream.encode(None):
                container.mux(packet)

            # Close the container before reading the buffer so everything
            # the muxer writes on close is included in the returned data.
            container.close()
            container = None
            return buffer.getvalue()
        except Exception as e:
            logger.error(f"Error encoding audio chunk: {e}")
            return b""
        finally:
            # Only reached with a live container when encoding failed:
            # release it so a bad chunk does not leak the muxer.
            if container is not None:
                try:
                    container.close()
                except Exception as close_error:
                    logger.warning(
                        f"Error closing audio container: {close_error}"
                    )