More work on streaming timestamps (not working yet — hitting a weird error) :(

This commit is contained in:
Fireblade2534 2025-02-12 20:34:55 +00:00
parent 91d370d97f
commit 6985f6ef99
5 changed files with 21 additions and 23 deletions

View file

@@ -12,10 +12,7 @@ response = requests.post(
"input": "http://localhost:8880/web/",
"voice": "af_heart",
"response_format": "mp3", # Supported: mp3, wav, opus, flac
"speed": 1.0,
"normalization_options": {
"normalize": True
}
"speed": 1.0
}
)

View file

@@ -291,17 +291,6 @@ class KokoroV1(BaseModelBackend):
f"Added timestamp for word '{token.text}': {start_time:.3f}s - {end_time:.3f}s"
)
# Update offset for next chunk based on pred_dur
chunk_duration = (
float(result.pred_dur.sum()) / 80
) # Convert frames to seconds
current_offset = max(
current_offset + chunk_duration, end_time
)
logger.debug(
f"Updated time offset to {current_offset:.3f}s"
)
except Exception as e:
logger.error(
f"Failed to process timestamps for chunk: {e}"

View file

@@ -8,7 +8,7 @@ import tempfile
from typing import AsyncGenerator, Dict, List, Union, Tuple
import aiofiles
from inference.base import AudioChunk
from ..inference.base import AudioChunk
import torch
from fastapi import APIRouter, Depends, Header, HTTPException, Request, Response
from fastapi.responses import FileResponse, StreamingResponse
@@ -214,15 +214,15 @@ async def create_speech(
}
# Create async generator for streaming
async def dual_output(return_json:bool=False):
async def dual_output():
try:
# Write chunks to temp file and stream
async for chunk, chunk_data in generator:
if chunk: # Skip empty chunks
await temp_writer.write(chunk)
if return_json:
yield chunk, chunk_data
else:
#if return_json:
# yield chunk, chunk_data
#else:
yield chunk
# Finalize the temp file

View file

@@ -328,10 +328,21 @@ class TTSService:
) -> Tuple[Tuple[np.ndarray,AudioChunk]]:
"""Generate complete audio for text using streaming internally."""
start_time = time.time()
chunks = []
audio_chunks = []
audio_data_chunks=[]
word_timestamps = []
start_time = time.time()
chunks = []
word_timestamps = []
try:
async for audio_stream,audio_stream_data in self.generate_audio_stream(text,voice,speed=speed,return_timestamps=return_timestamps,lang_code=lang_code):
print("common")
audio_chunks.append(audio_stream_data.audio)
audio_data_chunks.append(audio_stream_data)
print(audio_data_chunks)
"""
# Get backend and voice path
backend = self.model_manager.get_backend()
voice_name, voice_path = await self._get_voice_path(voice)
@@ -574,11 +585,12 @@ class TTSService:
[],
) # Empty timestamps for legacy backends
return audio, processing_time
"""
except Exception as e:
logger.error(f"Error in audio generation: {str(e)}")
raise
async def combine_voices(self, voices: List[str]) -> torch.Tensor:
"""Combine multiple voices.

Binary file not shown.