diff --git a/Test.py b/Test.py index 0944861..fd51b5a 100644 --- a/Test.py +++ b/Test.py @@ -12,10 +12,7 @@ response = requests.post( "input": "http://localhost:8880/web/", "voice": "af_heart", "response_format": "mp3", # Supported: mp3, wav, opus, flac - "speed": 1.0, - "normalization_options": { - "normalize": True - } + "speed": 1.0 } ) diff --git a/api/src/inference/kokoro_v1.py b/api/src/inference/kokoro_v1.py index 91fb44d..84f4f79 100644 --- a/api/src/inference/kokoro_v1.py +++ b/api/src/inference/kokoro_v1.py @@ -291,17 +291,6 @@ class KokoroV1(BaseModelBackend): f"Added timestamp for word '{token.text}': {start_time:.3f}s - {end_time:.3f}s" ) - # Update offset for next chunk based on pred_dur - chunk_duration = ( - float(result.pred_dur.sum()) / 80 - ) # Convert frames to seconds - current_offset = max( - current_offset + chunk_duration, end_time - ) - logger.debug( - f"Updated time offset to {current_offset:.3f}s" - ) - except Exception as e: logger.error( f"Failed to process timestamps for chunk: {e}" diff --git a/api/src/routers/openai_compatible.py b/api/src/routers/openai_compatible.py index 5090a4b..e285234 100644 --- a/api/src/routers/openai_compatible.py +++ b/api/src/routers/openai_compatible.py @@ -8,7 +8,7 @@ import tempfile from typing import AsyncGenerator, Dict, List, Union, Tuple import aiofiles -from inference.base import AudioChunk +from ..inference.base import AudioChunk import torch from fastapi import APIRouter, Depends, Header, HTTPException, Request, Response from fastapi.responses import FileResponse, StreamingResponse @@ -214,16 +214,16 @@ async def create_speech( } # Create async generator for streaming - async def dual_output(return_json:bool=False): + async def dual_output(): try: # Write chunks to temp file and stream async for chunk, chunk_data in generator: if chunk: # Skip empty chunks await temp_writer.write(chunk) - if return_json: - yield chunk, chunk_data - else: - yield chunk + #if return_json: + # yield chunk, chunk_data + #else: + yield chunk # Finalize the temp file await temp_writer.finalize() diff --git a/api/src/services/tts_service.py b/api/src/services/tts_service.py index 7f2dcaa..470523e 100644 --- a/api/src/services/tts_service.py +++ b/api/src/services/tts_service.py @@ -327,11 +327,22 @@ class TTSService: lang_code: Optional[str] = None, ) -> Tuple[Tuple[np.ndarray,AudioChunk]]: """Generate complete audio for text using streaming internally.""" + start_time = time.time() + audio_chunks = [] + audio_data_chunks=[] + word_timestamps = [] + start_time = time.time() chunks = [] word_timestamps = [] - try: + async for audio_stream,audio_stream_data in self.generate_audio_stream(text,voice,speed=speed,return_timestamps=return_timestamps,lang_code=lang_code): + print("common") + audio_chunks.append(audio_stream_data.audio) + audio_data_chunks.append(audio_stream_data) + + print(audio_data_chunks) + """ # Get backend and voice path backend = self.model_manager.get_backend() voice_name, voice_path = await self._get_voice_path(voice) @@ -574,10 +585,11 @@ class TTSService: [], ) # Empty timestamps for legacy backends return audio, processing_time - + """ except Exception as e: logger.error(f"Error in audio generation: {str(e)}") raise + async def combine_voices(self, voices: List[str]) -> torch.Tensor: """Combine multiple voices. diff --git a/output.mp3 b/output.mp3 index b6d7635..580f1b9 100644 Binary files a/output.mp3 and b/output.mp3 differ