More work on timestamps (Does not maintain accuracy over multiple chunks)

Fireblade 2025-02-12 21:36:35 -05:00
parent 6985f6ef99
commit 5b20602b8e
5 changed files with 22 additions and 7 deletions

View file

@@ -295,7 +295,8 @@ class KokoroV1(BaseModelBackend):
logger.error(
f"Failed to process timestamps for chunk: {e}"
)
yield AudioChunk(result.audio.numpy(),word_timestamps=word_timestamps)
else:
logger.warning("No audio in chunk")
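
For reference, the backend hunk above now attaches per-word timing data to each yielded chunk. A minimal sketch of such a chunk container, assuming only the `audio` and `word_timestamps` fields visible in this diff plus the `start_time`/`end_time` keys used by the offset logic later in this commit:

```python
# Hedged sketch of the AudioChunk container yielded above; only `audio` and
# `word_timestamps` appear in the diff, everything else here is an assumption.
from dataclasses import dataclass
from typing import Optional

import numpy as np


@dataclass
class AudioChunk:
    audio: np.ndarray                       # samples for this chunk (24 kHz assumed, per the divisor later in this commit)
    word_timestamps: Optional[list] = None  # e.g. [{"word": "...", "start_time": 0.0, "end_time": 0.2}, ...]
```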

View file

@@ -128,7 +128,7 @@ async def process_voices(
async def stream_audio_chunks(
tts_service: TTSService, request: OpenAISpeechRequest, client_request: Request
) -> AsyncGenerator[Tuple[bytes,AudioChunk], None]:
) -> AsyncGenerator[list, None]:
"""Stream audio chunks as they're generated with client disconnect handling"""
voice_name = await process_voices(request.voice, tts_service)
@@ -140,8 +140,10 @@ async def stream_audio_chunks(
speed=request.speed,
output_format=request.response_format,
lang_code=request.lang_code or request.voice[0],
normalization_options=request.normalization_options
normalization_options=request.normalization_options,
return_timestamps=True,
):
# Check if client is still connected
is_disconnected = client_request.is_disconnected
if callable(is_disconnected):
@@ -149,7 +151,8 @@ async def stream_audio_chunks(
if is_disconnected:
logger.info("Client disconnected, stopping audio generation")
break
yield chunk, chunk_data
yield chunk
except Exception as e:
logger.error(f"Error in audio streaming: {str(e)}")
# Let the exception propagate to trigger cleanup
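
The streaming hunks above poll the client connection before yielding each chunk. A self-contained sketch of that disconnect check, assuming a Starlette/FastAPI `Request` whose `is_disconnected()` coroutine reports the client state (the `callable()` guard mirrors the diff and also tolerates a plain boolean, e.g. in tests):

```python
# Hedged sketch of the per-chunk disconnect check used in stream_audio_chunks.
from typing import AsyncGenerator

from fastapi import Request
from loguru import logger


async def stream_until_disconnect(
    client_request: Request,
    chunks: AsyncGenerator[bytes, None],
) -> AsyncGenerator[bytes, None]:
    async for chunk in chunks:
        # Starlette exposes Request.is_disconnected() as a coroutine; the
        # callable() guard lets a pre-computed bool pass through unchanged.
        is_disconnected = client_request.is_disconnected
        if callable(is_disconnected):
            is_disconnected = await is_disconnected()
        if is_disconnected:
            logger.info("Client disconnected, stopping audio generation")
            break
        yield chunk
```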
@@ -158,6 +161,7 @@ async def stream_audio_chunks(
@router.post("/audio/speech")
async def create_speech(
request: OpenAISpeechRequest,
client_request: Request,
x_raw_response: str = Header(None, alias="x-raw-response"),
@@ -217,7 +221,7 @@ async def create_speech(
async def dual_output():
try:
# Write chunks to temp file and stream
async for chunk, chunk_data in generator:
async for chunk in generator:
if chunk: # Skip empty chunks
await temp_writer.write(chunk)
#if return_json:
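
`dual_output` above both persists and streams each generated chunk. A rough sketch of that pattern, assuming `temp_writer` exposes an async `write()` as in the hunk; finalization and the commented-out JSON path are omitted:

```python
# Hedged sketch of the dual-output loop in create_speech: write each chunk to a
# temp file for later download while also streaming it to the client.
from typing import AsyncGenerator


async def dual_output(
    generator: AsyncGenerator[bytes, None],
    temp_writer,  # assumed to expose `async def write(data: bytes)`
) -> AsyncGenerator[bytes, None]:
    async for chunk in generator:
        if chunk:                            # skip empty chunks
            await temp_writer.write(chunk)   # persist a copy on disk
            yield chunk                      # and stream it to the client
```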

View file

@@ -247,7 +247,7 @@ class TTSService:
"""Generate and stream audio chunks."""
stream_normalizer = AudioNormalizer()
chunk_index = 0
current_offset=0.0
try:
# Get backend
backend = self.model_manager.get_backend()
@@ -261,7 +261,8 @@
logger.info(
f"Using lang_code '{pipeline_lang_code}' for voice '{voice_name}' in audio stream"
)
# Process text in chunks with smart splitting
async for chunk_text, tokens in smart_split(text,normalization_options=normalization_options):
try:
@@ -277,8 +278,17 @@
is_last=False, # We'll update the last chunk later
normalizer=stream_normalizer,
lang_code=pipeline_lang_code, # Pass lang_code
return_timestamps=return_timestamps,
):
if chunk_data.word_timestamps is not None:
for timestamp in chunk_data.word_timestamps:
timestamp["start_time"]+=current_offset
timestamp["end_time"]+=current_offset
current_offset+=len(chunk_data.audio) / 24000
if result is not None:
print(chunk_data.word_timestamps)
yield result,chunk_data
chunk_index += 1
else:
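
The hunk above is the core of this commit: each chunk's word timestamps are chunk-relative, so they are shifted by a running `current_offset`, which then advances by the chunk's duration in seconds (`len(chunk_data.audio) / 24000`, i.e. samples over the 24 kHz sample rate). A compact sketch of that bookkeeping:

```python
# Hedged sketch of the cross-chunk timestamp offsetting added in this hunk.
SAMPLE_RATE = 24000  # matches the 24000 divisor used above


def shift_word_timestamps(word_timestamps, audio_len_samples: int,
                          current_offset: float) -> float:
    """Shift chunk-relative word timestamps to stream-absolute times.

    Returns the offset to use for the next chunk.
    """
    for ts in word_timestamps or []:
        ts["start_time"] += current_offset
        ts["end_time"] += current_offset
    return current_offset + audio_len_samples / SAMPLE_RATE
```

As the commit message warns, accuracy is not maintained over multiple chunks; one hedged guess is that the offset advances by the raw chunk length while the audio actually emitted may be trimmed or padded downstream, so the running offset can drift from the true playback position.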

Binary file not shown.

BIN  peaks/output.mp3.reapeaks (Normal file)

Binary file not shown.