Fixed not returning enough values

Fireblade2534 2025-02-12 15:06:11 +00:00
parent 5cc9d140fe
commit 51b6b01589
3 changed files with 16 additions and 18 deletions

View file

@@ -11,7 +11,7 @@ class AudioChunk:
     def __init__(self,
                  audio: np.ndarray,
-                 word_timestamps: Optional[List]=None
+                 word_timestamps: Optional[List]=[]
                  ):
         self.audio=audio
         self.word_timestamps=word_timestamps
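For context, the chunk container after this hunk is just an audio array plus optional word timestamps, matching the dummy chunk built in the tts_service hunk further down. The sketch below is illustrative only; note that the committed default of an empty list is a mutable default shared by every call that omits the argument, so the None-plus-fallback shown here is the conventional safe variant, not what the diff itself does.

# Minimal sketch of the container as it reads after this hunk (not the full class).
import numpy as np
from typing import List, Optional

class AudioChunk:
    def __init__(self, audio: np.ndarray, word_timestamps: Optional[List] = None):
        self.audio = audio
        # The commit itself uses "=[]" as the default; a mutable default is shared
        # across every call that omits the argument, so this None-plus-fallback is
        # the usual safe alternative, not what the diff does.
        self.word_timestamps = word_timestamps if word_timestamps is not None else []

chunk = AudioChunk(np.array([0], dtype=np.float32))  # mirrors the dummy chunk used below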

View file

@@ -57,7 +57,6 @@ class AudioNormalizer:
         non_silent_index_start, non_silent_index_end = None,None
         for X in range(0,len(audio_data)):
-            #print(audio_data[X])
             if audio_data[X] > amplitude_threshold:
                 non_silent_index_start=X
                 break
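The loop in this hunk scans forward for the first sample above the amplitude threshold (a mirrored scan presumably finds the last one). Below is a vectorized sketch of the same idea; only audio_data and amplitude_threshold come from the surrounding code, while the function name and return convention are illustrative.

import numpy as np

def first_last_above_threshold(audio_data: np.ndarray, amplitude_threshold: float):
    # Indices of all samples whose value exceeds the threshold.
    above = np.nonzero(audio_data > amplitude_threshold)[0]
    if above.size == 0:
        # Nothing clears the threshold: the chunk is treated as all silence.
        return None, None
    # First and last non-silent sample indices, like the forward/backward scans.
    return int(above[0]), int(above[-1])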
@@ -149,11 +148,9 @@ class AudioService:
         if normalizer is None:
             normalizer = AudioNormalizer()
-        print("1")
         audio_chunk.audio = await normalizer.normalize(audio_chunk.audio)
-        print("2")
         audio_chunk = AudioService.trim_audio(audio_chunk,chunk_text,speed,is_last_chunk,normalizer)
-        print("3")
         # Get or create format-specific writer
         writer_key = f"{output_format}_{sample_rate}"
         if is_first_chunk or writer_key not in AudioService._writers:
@@ -161,18 +158,16 @@ class AudioService:
                 output_format, sample_rate
             )
         writer = AudioService._writers[writer_key]
-        print("4")
         # Write audio data first
         if len(audio_chunk.audio) > 0:
             chunk_data = writer.write_chunk(audio_chunk.audio)
-        print("5")
         # Then finalize if this is the last chunk
         if is_last_chunk:
-            print("6")
             final_data = writer.write_chunk(finalize=True)
-            print("7")
             del AudioService._writers[writer_key]
-            return final_data if final_data else b""
+            return final_data if final_data else b"", audio_chunk
         return chunk_data if chunk_data else b"", audio_chunk
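This is the fix named in the commit title: both return paths of convert_audio now yield a (bytes, AudioChunk) pair, so callers can always unpack two values. A caller-side sketch under that assumption follows; AudioService and AudioChunk come from this repo, while the wrapper and any parameter order beyond what the diff shows are assumptions.

# Caller-side sketch of the contract after this hunk: convert_audio returns an
# (encoded_bytes, audio_chunk) pair on every path, including the final chunk.
async def encode_chunk(audio_chunk, output_format, speed, is_last_chunk):
    chunk_bytes, audio_chunk = await AudioService.convert_audio(
        audio_chunk,   # AudioChunk being encoded
        24000,         # sample rate used throughout this diff
        output_format,
        speed,
        is_last_chunk, # exact parameter order/keywords beyond the diff are assumed
    )
    return chunk_bytes if chunk_bytes else b"", audio_chunk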
@@ -206,8 +201,10 @@ class AudioService:
         start_index,end_index=normalizer.find_first_last_non_silent(audio_chunk.audio,chunk_text,speed,is_last_chunk=is_last_chunk)
         audio_chunk.audio=audio_chunk.audio[start_index:end_index]
-        for timestamp in audio_chunk.word_timestamps:
-            timestamp["start_time"]-=start_index * 24000
-            timestamp["end_time"]-=start_index * 24000
+        if audio_chunk.word_timestamps is not None:
+            for timestamp in audio_chunk.word_timestamps:
+                timestamp["start_time"]-=start_index / 24000
+                timestamp["end_time"]-=start_index / 24000
         return audio_chunk
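The other half of the fix: trimming drops start_index samples from the front of the chunk, and the word timestamps appear to be stored in seconds, so the offset to subtract is start_index divided by the 24 kHz sample rate, not multiplied by it. A quick worked example with illustrative numbers:

# Worked example of the samples-to-seconds correction (values are hypothetical).
SAMPLE_RATE = 24000              # 24 kHz, the rate used throughout this diff
start_index = 12000              # e.g. half a second of leading silence trimmed

offset_seconds = start_index / SAMPLE_RATE   # 0.5 s -- the corrected formula
# The old formula, start_index * SAMPLE_RATE, would have shifted by 288,000,000.

word = {"start_time": 0.75, "end_time": 1.10}  # hypothetical word timing in seconds
word["start_time"] -= offset_seconds           # 0.25
word["end_time"] -= offset_seconds             # 0.60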

View file

@@ -6,6 +6,7 @@ import tempfile
 import time
 from typing import AsyncGenerator, List, Optional, Tuple, Union
+from ..inference.base import AudioChunk
 import numpy as np
 import torch
 from kokoro import KPipeline
@@ -62,9 +63,8 @@ class TTSService:
         if not output_format:
             yield np.array([], dtype=np.float32)
             return
-        result = await AudioService.convert_audio(
-            np.array([0], dtype=np.float32),  # Dummy data for type checking
+        result, _ = await AudioService.convert_audio(
+            AudioChunk(np.array([0], dtype=np.float32)),  # Dummy data for type checking
             24000,
             output_format,
             speed,
@@ -119,7 +119,7 @@ class TTSService:
                         print(chunk_data.word_timestamps)
                         yield chunk_data.audio
                     else:
-                        print("old backend")
                         # For legacy backends, load voice tensor
                         voice_tensor = await self._voice_manager.load_voice(
                             voice_name, device=backend.device
@@ -315,7 +315,8 @@ class TTSService:
         except Exception as e:
             logger.error(f"Error in phoneme audio generation: {str(e)}")
-            raise
+            raise e
+
     async def generate_audio(
         self,