Fixes a couple of issues with audio trimming and prevents errors with single-voice weights

Fireblade 2025-02-18 18:12:49 -05:00
parent f2b2f41412
commit 4ee4d36822
4 changed files with 11 additions and 3 deletions

View file

@@ -294,6 +294,7 @@ async def create_captioned_speech(
request.response_format,
is_first_chunk=True,
is_last_chunk=False,
trim_audio=False,
)
# Convert to requested format with proper finalization

View file

@@ -294,6 +294,7 @@ async def create_speech(
request.response_format,
is_first_chunk=True,
is_last_chunk=False,
trim_audio=False
)
# Convert to requested format with proper finalization
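Both endpoints pass the new trim_audio=False flag when converting the initial placeholder chunk: that chunk exists only to emit the container header and carries no speech, so there is nothing to trim. A minimal sketch of the idea (AudioChunk and AudioService are this project's classes; the leading positional arguments are borrowed from the TTSService hunk further down, and the real call sites in these routers have more arguments than shown):

    import numpy as np

    async def first_header_chunk(output_format: str, speed: float):
        # Sketch only: build the header-only first chunk and skip trimming it.
        placeholder = AudioChunk(np.array([], dtype=np.float32))  # no samples
        return await AudioService.convert_audio(
            placeholder,
            24000,            # sample rate used by the service
            output_format,    # request.response_format at the endpoints
            speed,
            is_first_chunk=True,
            is_last_chunk=False,
            trim_audio=False,  # new flag: never trim the empty placeholder
        )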

View file

@@ -119,6 +119,7 @@ class AudioService:
chunk_text: str = "",
is_first_chunk: bool = True,
is_last_chunk: bool = False,
trim_audio: bool = True,
normalizer: AudioNormalizer = None,
) -> Tuple[AudioChunk]:
"""Convert audio data to specified format with streaming support
@@ -147,7 +148,9 @@ class AudioService:
normalizer = AudioNormalizer()
audio_chunk.audio = normalizer.normalize(audio_chunk.audio)
audio_chunk = AudioService.trim_audio(audio_chunk,chunk_text,speed,is_last_chunk,normalizer)
if trim_audio == True:
audio_chunk = AudioService.trim_audio(audio_chunk,chunk_text,speed,is_last_chunk,normalizer)
# Get or create format-specific writer
writer_key = f"{output_format}_{sample_rate}"
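Inside AudioService.convert_audio the trim step is now guarded by the new keyword, so streaming callers can opt out for the header-only chunk while every existing caller keeps the old behaviour through the True default. Written as a plain truthiness check (equivalent to the committed == True comparison), the changed region boils down to:

    if trim_audio:  # new keyword argument, default True
        audio_chunk = AudioService.trim_audio(
            audio_chunk, chunk_text, speed, is_last_chunk, normalizer
        )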

View file

@@ -65,7 +65,7 @@ class TTSService:
yield AudioChunk(np.array([], dtype=np.int16),output=b'')
return
chunk_data = await AudioService.convert_audio(
AudioChunk(np.array([0], dtype=np.float32)), # Dummy data for type checking
AudioChunk(np.array([], dtype=np.float32)), # Dummy data for type checking
24000,
output_format,
speed,
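The placeholder passed "for type checking" is now a genuinely empty array rather than a single zero sample, so, as I read the change, nothing stray gets normalized or encoded while producing the header output. For illustration:

    import numpy as np

    old_placeholder = np.array([0], dtype=np.float32)  # before: one silent sample
    new_placeholder = np.array([], dtype=np.float32)   # after: zero samples
    assert old_placeholder.size == 1 and new_placeholder.size == 0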
@@ -225,6 +225,8 @@ class TTSService:
return voice, combined_path
else:
# Single voice
if "(" in voice and ")" in voice:
voice = voice.split("(")[0].strip()
path = await self._voice_manager.get_voice_path(voice)
if not path:
raise RuntimeError(f"Voice not found: {voice}")
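A single voice given with a weight, e.g. af_bella(0.7), previously reached get_voice_path verbatim and tripped the "Voice not found" error; the new lines strip the parenthesised weight before the lookup. The behaviour in isolation (the voice name is only an example):

    def strip_single_voice_weight(voice: str) -> str:
        # Mirrors the new single-voice branch: "af_bella(0.7)" -> "af_bella".
        if "(" in voice and ")" in voice:
            voice = voice.split("(")[0].strip()
        return voice

    assert strip_single_voice_weight("af_bella(0.7)") == "af_bella"
    assert strip_single_voice_weight("af_bella") == "af_bella"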
@@ -341,8 +343,9 @@
try:
async for audio_stream_data in self.generate_audio_stream(text,voice,speed=speed,normalization_options=normalization_options,return_timestamps=return_timestamps,lang_code=lang_code,output_format=None):
if len(audio_stream_data.audio) > 0:
audio_data_chunks.append(audio_stream_data)
audio_data_chunks.append(audio_stream_data)
combined_audio_data=AudioChunk.combine(audio_data_chunks)
return combined_audio_data
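The last hunk appears to change only the indentation of the append line (leading whitespace is lost in this view, so the before/after lines read identically). The surrounding logic collects every non-empty chunk from the stream and merges them, roughly:

    # Rough, simplified sketch of generate_audio's collection step.
    audio_data_chunks = []
    async for audio_stream_data in self.generate_audio_stream(
        text, voice, speed=speed, lang_code=lang_code, output_format=None
    ):
        if len(audio_stream_data.audio) > 0:  # skip header/placeholder chunks
            audio_data_chunks.append(audio_stream_data)
    combined_audio_data = AudioChunk.combine(audio_data_chunks)
    return combined_audio_data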