mirror of
https://github.com/remsky/Kokoro-FastAPI.git
synced 2025-08-05 16:48:53 +00:00
More work on timestamps (timestamp accuracy is not yet maintained across multiple chunks)
This commit is contained in:
parent
6985f6ef99
commit
5b20602b8e
5 changed files with 22 additions and 7 deletions
|
@ -295,7 +295,8 @@ class KokoroV1(BaseModelBackend):
|
|||
logger.error(
|
||||
f"Failed to process timestamps for chunk: {e}"
|
||||
)
|
||||
|
||||
|
||||
|
||||
yield AudioChunk(result.audio.numpy(),word_timestamps=word_timestamps)
|
||||
else:
|
||||
logger.warning("No audio in chunk")
|
||||
|
|
|
@ -128,7 +128,7 @@ async def process_voices(
|
|||
|
||||
async def stream_audio_chunks(
|
||||
tts_service: TTSService, request: OpenAISpeechRequest, client_request: Request
|
||||
) -> AsyncGenerator[Tuple[bytes,AudioChunk], None]:
|
||||
) -> AsyncGenerator[list, None]:
|
||||
"""Stream audio chunks as they're generated with client disconnect handling"""
|
||||
voice_name = await process_voices(request.voice, tts_service)
|
||||
|
||||
|
@ -140,8 +140,10 @@ async def stream_audio_chunks(
|
|||
speed=request.speed,
|
||||
output_format=request.response_format,
|
||||
lang_code=request.lang_code or request.voice[0],
|
||||
normalization_options=request.normalization_options
|
||||
normalization_options=request.normalization_options,
|
||||
return_timestamps=True,
|
||||
):
|
||||
|
||||
# Check if client is still connected
|
||||
is_disconnected = client_request.is_disconnected
|
||||
if callable(is_disconnected):
|
||||
|
@ -149,7 +151,8 @@ async def stream_audio_chunks(
|
|||
if is_disconnected:
|
||||
logger.info("Client disconnected, stopping audio generation")
|
||||
break
|
||||
yield chunk, chunk_data
|
||||
|
||||
yield chunk
|
||||
except Exception as e:
|
||||
logger.error(f"Error in audio streaming: {str(e)}")
|
||||
# Let the exception propagate to trigger cleanup
|
||||
|
@ -158,6 +161,7 @@ async def stream_audio_chunks(
|
|||
|
||||
@router.post("/audio/speech")
|
||||
async def create_speech(
|
||||
|
||||
request: OpenAISpeechRequest,
|
||||
client_request: Request,
|
||||
x_raw_response: str = Header(None, alias="x-raw-response"),
|
||||
|
@ -217,7 +221,7 @@ async def create_speech(
|
|||
async def dual_output():
|
||||
try:
|
||||
# Write chunks to temp file and stream
|
||||
async for chunk, chunk_data in generator:
|
||||
async for chunk in generator:
|
||||
if chunk: # Skip empty chunks
|
||||
await temp_writer.write(chunk)
|
||||
#if return_json:
|
||||
|
|
|
@ -247,7 +247,7 @@ class TTSService:
|
|||
"""Generate and stream audio chunks."""
|
||||
stream_normalizer = AudioNormalizer()
|
||||
chunk_index = 0
|
||||
|
||||
current_offset=0.0
|
||||
try:
|
||||
# Get backend
|
||||
backend = self.model_manager.get_backend()
|
||||
|
@ -261,7 +261,8 @@ class TTSService:
|
|||
logger.info(
|
||||
f"Using lang_code '{pipeline_lang_code}' for voice '{voice_name}' in audio stream"
|
||||
)
|
||||
|
||||
|
||||
|
||||
# Process text in chunks with smart splitting
|
||||
async for chunk_text, tokens in smart_split(text,normalization_options=normalization_options):
|
||||
try:
|
||||
|
@ -277,8 +278,17 @@ class TTSService:
|
|||
is_last=False, # We'll update the last chunk later
|
||||
normalizer=stream_normalizer,
|
||||
lang_code=pipeline_lang_code, # Pass lang_code
|
||||
return_timestamps=return_timestamps,
|
||||
):
|
||||
if chunk_data.word_timestamps is not None:
|
||||
for timestamp in chunk_data.word_timestamps:
|
||||
timestamp["start_time"]+=current_offset
|
||||
timestamp["end_time"]+=current_offset
|
||||
|
||||
current_offset+=len(chunk_data.audio) / 24000
|
||||
|
||||
if result is not None:
|
||||
print(chunk_data.word_timestamps)
|
||||
yield result,chunk_data
|
||||
chunk_index += 1
|
||||
else:
|
||||
|
|
BIN
output.mp3
BIN
output.mp3
Binary file not shown.
BIN
peaks/output.mp3.reapeaks
Normal file
BIN
peaks/output.mp3.reapeaks
Normal file
Binary file not shown.
Loading…
Add table
Reference in a new issue