mirror of
https://github.com/remsky/Kokoro-FastAPI.git
synced 2025-04-13 09:39:17 +00:00
More work on streaming timestamps (not working yet; fails with a weird error) :(
This commit is contained in:
parent
91d370d97f
commit
6985f6ef99
5 changed files with 21 additions and 23 deletions
5
Test.py
5
Test.py
|
@ -12,10 +12,7 @@ response = requests.post(
|
||||||
"input": "http://localhost:8880/web/",
|
"input": "http://localhost:8880/web/",
|
||||||
"voice": "af_heart",
|
"voice": "af_heart",
|
||||||
"response_format": "mp3", # Supported: mp3, wav, opus, flac
|
"response_format": "mp3", # Supported: mp3, wav, opus, flac
|
||||||
"speed": 1.0,
|
"speed": 1.0
|
||||||
"normalization_options": {
|
|
||||||
"normalize": True
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
|
@ -291,17 +291,6 @@ class KokoroV1(BaseModelBackend):
|
||||||
f"Added timestamp for word '{token.text}': {start_time:.3f}s - {end_time:.3f}s"
|
f"Added timestamp for word '{token.text}': {start_time:.3f}s - {end_time:.3f}s"
|
||||||
)
|
)
|
||||||
|
|
||||||
# Update offset for next chunk based on pred_dur
|
|
||||||
chunk_duration = (
|
|
||||||
float(result.pred_dur.sum()) / 80
|
|
||||||
) # Convert frames to seconds
|
|
||||||
current_offset = max(
|
|
||||||
current_offset + chunk_duration, end_time
|
|
||||||
)
|
|
||||||
logger.debug(
|
|
||||||
f"Updated time offset to {current_offset:.3f}s"
|
|
||||||
)
|
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.error(
|
logger.error(
|
||||||
f"Failed to process timestamps for chunk: {e}"
|
f"Failed to process timestamps for chunk: {e}"
|
||||||
|
|
|
@ -8,7 +8,7 @@ import tempfile
|
||||||
from typing import AsyncGenerator, Dict, List, Union, Tuple
|
from typing import AsyncGenerator, Dict, List, Union, Tuple
|
||||||
|
|
||||||
import aiofiles
|
import aiofiles
|
||||||
from inference.base import AudioChunk
|
from ..inference.base import AudioChunk
|
||||||
import torch
|
import torch
|
||||||
from fastapi import APIRouter, Depends, Header, HTTPException, Request, Response
|
from fastapi import APIRouter, Depends, Header, HTTPException, Request, Response
|
||||||
from fastapi.responses import FileResponse, StreamingResponse
|
from fastapi.responses import FileResponse, StreamingResponse
|
||||||
|
@ -214,16 +214,16 @@ async def create_speech(
|
||||||
}
|
}
|
||||||
|
|
||||||
# Create async generator for streaming
|
# Create async generator for streaming
|
||||||
async def dual_output(return_json:bool=False):
|
async def dual_output():
|
||||||
try:
|
try:
|
||||||
# Write chunks to temp file and stream
|
# Write chunks to temp file and stream
|
||||||
async for chunk, chunk_data in generator:
|
async for chunk, chunk_data in generator:
|
||||||
if chunk: # Skip empty chunks
|
if chunk: # Skip empty chunks
|
||||||
await temp_writer.write(chunk)
|
await temp_writer.write(chunk)
|
||||||
if return_json:
|
#if return_json:
|
||||||
yield chunk, chunk_data
|
# yield chunk, chunk_data
|
||||||
else:
|
#else:
|
||||||
yield chunk
|
yield chunk
|
||||||
|
|
||||||
# Finalize the temp file
|
# Finalize the temp file
|
||||||
await temp_writer.finalize()
|
await temp_writer.finalize()
|
||||||
|
|
|
@ -327,11 +327,22 @@ class TTSService:
|
||||||
lang_code: Optional[str] = None,
|
lang_code: Optional[str] = None,
|
||||||
) -> Tuple[Tuple[np.ndarray,AudioChunk]]:
|
) -> Tuple[Tuple[np.ndarray,AudioChunk]]:
|
||||||
"""Generate complete audio for text using streaming internally."""
|
"""Generate complete audio for text using streaming internally."""
|
||||||
|
start_time = time.time()
|
||||||
|
audio_chunks = []
|
||||||
|
audio_data_chunks=[]
|
||||||
|
word_timestamps = []
|
||||||
|
|
||||||
start_time = time.time()
|
start_time = time.time()
|
||||||
chunks = []
|
chunks = []
|
||||||
word_timestamps = []
|
word_timestamps = []
|
||||||
|
|
||||||
try:
|
try:
|
||||||
|
async for audio_stream,audio_stream_data in self.generate_audio_stream(text,voice,speed=speed,return_timestamps=return_timestamps,lang_code=lang_code):
|
||||||
|
print("common")
|
||||||
|
audio_chunks.append(audio_stream_data.audio)
|
||||||
|
audio_data_chunks.append(audio_stream_data)
|
||||||
|
|
||||||
|
print(audio_data_chunks)
|
||||||
|
"""
|
||||||
# Get backend and voice path
|
# Get backend and voice path
|
||||||
backend = self.model_manager.get_backend()
|
backend = self.model_manager.get_backend()
|
||||||
voice_name, voice_path = await self._get_voice_path(voice)
|
voice_name, voice_path = await self._get_voice_path(voice)
|
||||||
|
@ -574,10 +585,11 @@ class TTSService:
|
||||||
[],
|
[],
|
||||||
) # Empty timestamps for legacy backends
|
) # Empty timestamps for legacy backends
|
||||||
return audio, processing_time
|
return audio, processing_time
|
||||||
|
"""
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.error(f"Error in audio generation: {str(e)}")
|
logger.error(f"Error in audio generation: {str(e)}")
|
||||||
raise
|
raise
|
||||||
|
|
||||||
|
|
||||||
async def combine_voices(self, voices: List[str]) -> torch.Tensor:
|
async def combine_voices(self, voices: List[str]) -> torch.Tensor:
|
||||||
"""Combine multiple voices.
|
"""Combine multiple voices.
|
||||||
|
|
BIN
output.mp3
BIN
output.mp3
Binary file not shown.
Loading…
Add table
Reference in a new issue