Mirror of https://github.com/remsky/Kokoro-FastAPI.git (synced 2025-04-13 09:39:17 +00:00)
Fixes a couple of issues with audio trimming and prevents errors with single voice weights
parent f2b2f41412 · commit 4ee4d36822
4 changed files with 11 additions and 3 deletions
@@ -294,6 +294,7 @@ async def create_captioned_speech(
             request.response_format,
             is_first_chunk=True,
             is_last_chunk=False,
+            trim_audio=False,
         )

         # Convert to requested format with proper finalization

@@ -294,6 +294,7 @@ async def create_speech(
             request.response_format,
             is_first_chunk=True,
             is_last_chunk=False,
+            trim_audio=False
         )

         # Convert to requested format with proper finalization

@@ -119,6 +119,7 @@ class AudioService:
         chunk_text: str = "",
         is_first_chunk: bool = True,
         is_last_chunk: bool = False,
+        trim_audio: bool = True,
         normalizer: AudioNormalizer = None,
     ) -> Tuple[AudioChunk]:
         """Convert audio data to specified format with streaming support

@@ -147,7 +148,9 @@ class AudioService:
             normalizer = AudioNormalizer()

         audio_chunk.audio = normalizer.normalize(audio_chunk.audio)
-        audio_chunk = AudioService.trim_audio(audio_chunk,chunk_text,speed,is_last_chunk,normalizer)
+
+        if trim_audio == True:
+            audio_chunk = AudioService.trim_audio(audio_chunk,chunk_text,speed,is_last_chunk,normalizer)

         # Get or create format-specific writer
         writer_key = f"{output_format}_{sample_rate}"

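In short, convert_audio gains a trim_audio flag (default True) and only calls AudioService.trim_audio when it is set, so the two endpoints above can pass trim_audio=False and leave intermediate streaming chunks untouched. Below is a minimal standalone sketch of that gating pattern; trim_silence, convert_chunk and the threshold are illustrative stand-ins, not the project's AudioService/AudioNormalizer API.

import numpy as np

def trim_silence(audio: np.ndarray, threshold: float = 1e-3) -> np.ndarray:
    # Illustrative trim: drop leading/trailing samples below the threshold.
    loud = np.flatnonzero(np.abs(audio) > threshold)
    if loud.size == 0:
        return audio
    return audio[loud[0] : loud[-1] + 1]

def convert_chunk(audio: np.ndarray, trim_audio: bool = True) -> np.ndarray:
    # Mirror of the new behaviour: trim only when the caller asks for it.
    if trim_audio:
        audio = trim_silence(audio)
    return audio

chunk = np.concatenate([np.zeros(100), np.ones(50), np.zeros(100)]).astype(np.float32)
print(convert_chunk(chunk, trim_audio=False).shape)  # (250,)  mid-stream chunk left untouched
print(convert_chunk(chunk, trim_audio=True).shape)   # (50,)   trimmed when requested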
@@ -65,7 +65,7 @@ class TTSService:
                 yield AudioChunk(np.array([], dtype=np.int16),output=b'')
                 return
             chunk_data = await AudioService.convert_audio(
-                AudioChunk(np.array([0], dtype=np.float32)), # Dummy data for type checking
+                AudioChunk(np.array([], dtype=np.float32)), # Dummy data for type checking
                 24000,
                 output_format,
                 speed,

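The dummy finalization chunk changes from a one-sample array to an empty one, presumably so the placeholder passed into convert_audio for the final flush no longer contributes a stray sample to the finished stream. A quick numpy illustration of the difference:

import numpy as np

stream = np.ones(2400, dtype=np.float32)  # audio produced so far

with_stray = np.concatenate([stream, np.array([0], dtype=np.float32)])
clean = np.concatenate([stream, np.array([], dtype=np.float32)])

print(with_stray.shape)  # (2401,)  a stray sample sneaks into the output
print(clean.shape)       # (2400,)  finalization adds nothing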
@@ -225,6 +225,8 @@ class TTSService:
                 return voice, combined_path
             else:
                 # Single voice
+                if "(" in voice and ")" in voice:
+                    voice = voice.split("(")[0].strip()
                 path = await self._voice_manager.get_voice_path(voice)
                 if not path:
                     raise RuntimeError(f"Voice not found: {voice}")

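For a single voice there is no combining step, so a weighted request such as "af_sky(1.2)" previously went straight into the path lookup and raised "Voice not found". The two added lines strip the parenthesized weight first. A minimal sketch of just that parsing (the example name and weight are illustrative):

def normalize_single_voice(voice: str) -> str:
    # Same split/strip as the two added lines: drop an inline "(weight)" suffix.
    if "(" in voice and ")" in voice:
        voice = voice.split("(")[0].strip()
    return voice

print(normalize_single_voice("af_sky(1.2)"))  # af_sky  (weight no longer breaks the path lookup)
print(normalize_single_voice("af_sky"))       # af_sky  (plain names are untouched)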
@@ -341,8 +343,9 @@ class TTSService:

         try:
             async for audio_stream_data in self.generate_audio_stream(text,voice,speed=speed,normalization_options=normalization_options,return_timestamps=return_timestamps,lang_code=lang_code,output_format=None):
-                audio_data_chunks.append(audio_stream_data)
+                if len(audio_stream_data.audio) > 0:
+                    audio_data_chunks.append(audio_stream_data)

            combined_audio_data=AudioChunk.combine(audio_data_chunks)
            return combined_audio_data

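generate_audio_stream can legitimately yield empty chunks (the finalization path above yields AudioChunk(np.array([], dtype=np.int16), output=b'')), and those were previously appended and fed into AudioChunk.combine. The new length check keeps only chunks that actually carry samples. A standalone sketch of the same guard, with plain numpy arrays standing in for AudioChunk:

import numpy as np

chunks = [
    np.ones(100, dtype=np.int16),
    np.array([], dtype=np.int16),   # e.g. the empty end-of-stream chunk
    np.ones(50, dtype=np.int16),
]

# Mirror of the added guard: skip zero-length chunks before combining.
kept = [c for c in chunks if len(c) > 0]
combined = np.concatenate(kept)
print(len(kept), combined.shape)  # 2 (150,)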