mirror of
https://github.com/remsky/Kokoro-FastAPI.git
synced 2025-08-05 16:48:53 +00:00
Fix streaming a wav file with captions not returning any captions (This is only a problem because wav streaming does not actually work)
This commit is contained in:
parent
e3dc959775
commit
cb22aab239
1 changed files with 22 additions and 1 deletions
|
@ -210,11 +210,21 @@ async def create_captioned_speech(
|
||||||
try:
|
try:
|
||||||
# Write chunks to temp file and stream
|
# Write chunks to temp file and stream
|
||||||
async for chunk_data in generator:
|
async for chunk_data in generator:
|
||||||
|
# The timestamp accumulator is only used when word-level timestamps are generated but no audio is returned.
|
||||||
|
timestamp_acumulator=[]
|
||||||
|
|
||||||
if chunk_data.output: # Skip empty chunks
|
if chunk_data.output: # Skip empty chunks
|
||||||
await temp_writer.write(chunk_data.output)
|
await temp_writer.write(chunk_data.output)
|
||||||
base64_chunk= base64.b64encode(chunk_data.output).decode("utf-8")
|
base64_chunk= base64.b64encode(chunk_data.output).decode("utf-8")
|
||||||
|
|
||||||
|
# Add any timestamps that may be in the accumulator into the returned word_timestamps
|
||||||
|
chunk_data.word_timestamps=timestamp_acumulator + chunk_data.word_timestamps
|
||||||
|
timestamp_acumulator=[]
|
||||||
|
|
||||||
yield CaptionedSpeechResponse(audio=base64_chunk,audio_format=content_type,timestamps=chunk_data.word_timestamps)
|
yield CaptionedSpeechResponse(audio=base64_chunk,audio_format=content_type,timestamps=chunk_data.word_timestamps)
|
||||||
|
else:
|
||||||
|
if chunk_data.word_timestamps is not None and len(chunk_data.word_timestamps) > 0:
|
||||||
|
timestamp_acumulator+=chunk_data.word_timestamps
|
||||||
|
|
||||||
# Finalize the temp file
|
# Finalize the temp file
|
||||||
await temp_writer.finalize()
|
await temp_writer.finalize()
|
||||||
|
@ -234,13 +244,24 @@ async def create_captioned_speech(
|
||||||
|
|
||||||
async def single_output():
|
async def single_output():
|
||||||
try:
|
try:
|
||||||
|
# The timestamp accumulator is only used when word-level timestamps are generated but no audio is returned.
|
||||||
|
timestamp_acumulator=[]
|
||||||
|
|
||||||
# Stream chunks
|
# Stream chunks
|
||||||
async for chunk_data in generator:
|
async for chunk_data in generator:
|
||||||
if chunk_data.output: # Skip empty chunks
|
if chunk_data.output: # Skip empty chunks
|
||||||
# Encode the chunk bytes into base 64
|
# Encode the chunk bytes into base 64
|
||||||
base64_chunk= base64.b64encode(chunk_data.output).decode("utf-8")
|
base64_chunk= base64.b64encode(chunk_data.output).decode("utf-8")
|
||||||
|
|
||||||
|
# Add any timestamps that may be in the accumulator into the returned word_timestamps
|
||||||
|
chunk_data.word_timestamps=timestamp_acumulator + chunk_data.word_timestamps
|
||||||
|
timestamp_acumulator=[]
|
||||||
|
|
||||||
yield CaptionedSpeechResponse(audio=base64_chunk,audio_format=content_type,timestamps=chunk_data.word_timestamps)
|
yield CaptionedSpeechResponse(audio=base64_chunk,audio_format=content_type,timestamps=chunk_data.word_timestamps)
|
||||||
|
else:
|
||||||
|
if chunk_data.word_timestamps is not None and len(chunk_data.word_timestamps) > 0:
|
||||||
|
timestamp_acumulator+=chunk_data.word_timestamps
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.error(f"Error in single output streaming: {e}")
|
logger.error(f"Error in single output streaming: {e}")
|
||||||
raise
|
raise
|
||||||
|
|
Loading…
Add table
Reference in a new issue