mirror of
https://github.com/remsky/Kokoro-FastAPI.git
synced 2025-04-13 09:39:17 +00:00
Fix streaming a wav file with captions not returning any captions (this is only a problem because wav streaming does not actually work)
This commit is contained in:
parent
e3dc959775
commit
cb22aab239
1 changed files with 22 additions and 1 deletions
|
@ -210,12 +210,22 @@ async def create_captioned_speech(
|
|||
try:
|
||||
# Write chunks to temp file and stream
|
||||
async for chunk_data in generator:
|
||||
# The timestamp acumulator is only used when word level time stamps are generated but no audio is returned.
|
||||
timestamp_acumulator=[]
|
||||
|
||||
if chunk_data.output: # Skip empty chunks
|
||||
await temp_writer.write(chunk_data.output)
|
||||
base64_chunk= base64.b64encode(chunk_data.output).decode("utf-8")
|
||||
|
||||
# Add any chunks that may be in the acumulator into the return word_timestamps
|
||||
chunk_data.word_timestamps=timestamp_acumulator + chunk_data.word_timestamps
|
||||
timestamp_acumulator=[]
|
||||
|
||||
yield CaptionedSpeechResponse(audio=base64_chunk,audio_format=content_type,timestamps=chunk_data.word_timestamps)
|
||||
|
||||
else:
|
||||
if chunk_data.word_timestamps is not None and len(chunk_data.word_timestamps) > 0:
|
||||
timestamp_acumulator+=chunk_data.word_timestamps
|
||||
|
||||
# Finalize the temp file
|
||||
await temp_writer.finalize()
|
||||
except Exception as e:
|
||||
|
@ -234,13 +244,24 @@ async def create_captioned_speech(
|
|||
|
||||
async def single_output():
|
||||
try:
|
||||
# The timestamp acumulator is only used when word level time stamps are generated but no audio is returned.
|
||||
timestamp_acumulator=[]
|
||||
|
||||
# Stream chunks
|
||||
async for chunk_data in generator:
|
||||
if chunk_data.output: # Skip empty chunks
|
||||
# Encode the chunk bytes into base 64
|
||||
base64_chunk= base64.b64encode(chunk_data.output).decode("utf-8")
|
||||
|
||||
# Add any chunks that may be in the acumulator into the return word_timestamps
|
||||
chunk_data.word_timestamps=timestamp_acumulator + chunk_data.word_timestamps
|
||||
timestamp_acumulator=[]
|
||||
|
||||
yield CaptionedSpeechResponse(audio=base64_chunk,audio_format=content_type,timestamps=chunk_data.word_timestamps)
|
||||
else:
|
||||
if chunk_data.word_timestamps is not None and len(chunk_data.word_timestamps) > 0:
|
||||
timestamp_acumulator+=chunk_data.word_timestamps
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Error in single output streaming: {e}")
|
||||
raise
|
||||
|
|
Loading…
Add table
Reference in a new issue