Fixed audio file writing for non-streaming requests

This commit is contained in:
Fireblade 2025-02-13 16:12:51 -05:00
parent dbf2b99026
commit 7772dbc2e4
4 changed files with 28 additions and 28 deletions

View file

@ -21,7 +21,7 @@ class AudioChunk:
output=AudioChunk(audio_chunk_list[0].audio,audio_chunk_list[0].word_timestamps) output=AudioChunk(audio_chunk_list[0].audio,audio_chunk_list[0].word_timestamps)
for audio_chunk in audio_chunk_list[1:]: for audio_chunk in audio_chunk_list[1:]:
output.audio=np.concatenate((output.audio,audio_chunk.audio)) output.audio=np.concatenate((output.audio,audio_chunk.audio),dtype=np.int16)
if output.word_timestamps is not None: if output.word_timestamps is not None:
output.word_timestamps+=output.word_timestamps output.word_timestamps+=output.word_timestamps

View file

@ -7,6 +7,7 @@ import re
import tempfile import tempfile
from typing import AsyncGenerator, Dict, List, Union, Tuple from typing import AsyncGenerator, Dict, List, Union, Tuple
from urllib import response from urllib import response
import numpy as np
import aiofiles import aiofiles
from ..inference.base import AudioChunk from ..inference.base import AudioChunk
@ -259,24 +260,31 @@ async def create_speech(
) )
else: else:
# Generate complete audio using public interface # Generate complete audio using public interface
audio, audio_data = await tts_service.generate_audio( _, audio_data = await tts_service.generate_audio(
text=request.input, text=request.input,
voice=voice_name, voice=voice_name,
speed=request.speed, speed=request.speed,
lang_code=request.lang_code, lang_code=request.lang_code,
) )
# Convert to requested format with proper finalization
content, audio_data = await AudioService.convert_audio( content, audio_data = await AudioService.convert_audio(
audio_data, audio_data,
24000, 24000,
request.response_format, request.response_format,
is_first_chunk=True, is_first_chunk=True,
is_last_chunk=False,
)
# Convert to requested format with proper finalization
final, _ = await AudioService.convert_audio(
AudioChunk(np.array([], dtype=np.int16)),
24000,
request.response_format,
is_first_chunk=False,
is_last_chunk=True, is_last_chunk=True,
) )
print(content,request.response_format) output=content+final
return Response( return Response(
content=content, content=output,
media_type=content_type, media_type=content_type,
headers={ headers={
"Content-Disposition": f"attachment; filename=speech.{request.response_format}", "Content-Disposition": f"attachment; filename=speech.{request.response_format}",

View file

@ -72,7 +72,7 @@ class AudioNormalizer:
return max(non_silent_index_start - self.samples_to_pad_start,0), min(non_silent_index_end + math.ceil(samples_to_pad_end / speed),len(audio_data)) return max(non_silent_index_start - self.samples_to_pad_start,0), min(non_silent_index_end + math.ceil(samples_to_pad_end / speed),len(audio_data))
async def normalize(self, audio_data: np.ndarray) -> np.ndarray: def normalize(self, audio_data: np.ndarray) -> np.ndarray:
"""Convert audio data to int16 range """Convert audio data to int16 range
Args: Args:
@ -80,12 +80,10 @@ class AudioNormalizer:
Returns: Returns:
Normalized audio data Normalized audio data
""" """
if len(audio_data) == 0: if audio_data.dtype != np.int16:
raise ValueError("Empty audio data") # Scale directly to int16 range with clipping
return np.clip(audio_data * 32767, -32768, 32767).astype(np.int16)
# Scale directly to int16 range with clipping return audio_data
return np.clip(audio_data * 32767, -32768, 32767).astype(np.int16)
class AudioService: class AudioService:
"""Service for audio format conversions with streaming support""" """Service for audio format conversions with streaming support"""
@ -148,11 +146,9 @@ class AudioService:
if normalizer is None: if normalizer is None:
normalizer = AudioNormalizer() normalizer = AudioNormalizer()
print(len(audio_chunk.audio),"1") audio_chunk.audio = normalizer.normalize(audio_chunk.audio)
audio_chunk.audio = await normalizer.normalize(audio_chunk.audio)
print(len(audio_chunk.audio),"2")
audio_chunk = AudioService.trim_audio(audio_chunk,chunk_text,speed,is_last_chunk,normalizer) audio_chunk = AudioService.trim_audio(audio_chunk,chunk_text,speed,is_last_chunk,normalizer)
print(len(audio_chunk.audio),"3")
# Get or create format-specific writer # Get or create format-specific writer
writer_key = f"{output_format}_{sample_rate}" writer_key = f"{output_format}_{sample_rate}"
if is_first_chunk or writer_key not in AudioService._writers: if is_first_chunk or writer_key not in AudioService._writers:
@ -169,7 +165,6 @@ class AudioService:
if is_last_chunk: if is_last_chunk:
final_data = writer.write_chunk(finalize=True) final_data = writer.write_chunk(finalize=True)
del AudioService._writers[writer_key] del AudioService._writers[writer_key]
print(audio_chunk.audio)
return final_data if final_data else b"", audio_chunk return final_data if final_data else b"", audio_chunk
return chunk_data if chunk_data else b"", audio_chunk return chunk_data if chunk_data else b"", audio_chunk
@ -196,6 +191,7 @@ class AudioService:
if normalizer is None: if normalizer is None:
normalizer = AudioNormalizer() normalizer = AudioNormalizer()
audio_chunk.audio=normalizer.normalize(audio_chunk.audio)
# Trim start and end if enough samples # Trim start and end if enough samples
if len(audio_chunk.audio) > (2 * normalizer.samples_to_trim): if len(audio_chunk.audio) > (2 * normalizer.samples_to_trim):
audio_chunk.audio = audio_chunk.audio[normalizer.samples_to_trim : -normalizer.samples_to_trim] audio_chunk.audio = audio_chunk.audio[normalizer.samples_to_trim : -normalizer.samples_to_trim]

View file

@ -62,7 +62,7 @@ class TTSService:
if is_last: if is_last:
# Skip format conversion for raw audio mode # Skip format conversion for raw audio mode
if not output_format: if not output_format:
yield np.array([], dtype=np.float32) yield np.array([], dtype=np.int16), AudioChunk(np.array([], dtype=np.int16))
return return
result, chunk_data = await AudioService.convert_audio( result, chunk_data = await AudioService.convert_audio(
AudioChunk(np.array([0], dtype=np.float32)), # Dummy data for type checking AudioChunk(np.array([0], dtype=np.float32)), # Dummy data for type checking
@ -111,7 +111,7 @@ class TTSService:
except Exception as e: except Exception as e:
logger.error(f"Failed to convert audio: {str(e)}") logger.error(f"Failed to convert audio: {str(e)}")
else: else:
chunk_data = await AudioService.trim_audio(chunk_data, chunk_data = AudioService.trim_audio(chunk_data,
chunk_text, chunk_text,
speed, speed,
is_last, is_last,
@ -152,7 +152,7 @@ class TTSService:
except Exception as e: except Exception as e:
logger.error(f"Failed to convert audio: {str(e)}") logger.error(f"Failed to convert audio: {str(e)}")
else: else:
trimmed = await AudioService.trim_audio(chunk_data, trimmed = AudioService.trim_audio(chunk_data,
chunk_text, chunk_text,
speed, speed,
is_last, is_last,
@ -288,7 +288,6 @@ class TTSService:
current_offset+=len(chunk_data.audio) / 24000 current_offset+=len(chunk_data.audio) / 24000
if result is not None: if result is not None:
print(chunk_data.word_timestamps)
yield result,chunk_data yield result,chunk_data
chunk_index += 1 chunk_index += 1
else: else:
@ -342,17 +341,14 @@ class TTSService:
audio_data_chunks=[] audio_data_chunks=[]
try: try:
async for audio_stream,audio_stream_data in self.generate_audio_stream(text,voice,speed=speed,return_timestamps=return_timestamps,lang_code=lang_code): async for audio_stream,audio_stream_data in self.generate_audio_stream(text,voice,speed=speed,return_timestamps=return_timestamps,lang_code=lang_code,output_format=None):
audio_chunks.append(audio_stream_data.audio) audio_chunks.append(audio_stream_data.audio)
audio_data_chunks.append(audio_stream_data) audio_data_chunks.append(audio_stream_data)
print(audio_data_chunks[0].audio.shape)
combined_audio=np.concatenate(audio_chunks) combined_audio=np.concatenate(audio_chunks,dtype=np.int16)
print("1")
combined_audio_data=AudioChunk.combine(audio_data_chunks) combined_audio_data=AudioChunk.combine(audio_data_chunks)
print("2")
print(len(combined_audio_data.audio))
return combined_audio,combined_audio_data return combined_audio,combined_audio_data
""" """
# Get backend and voice path # Get backend and voice path