Mirror of https://github.com/remsky/Kokoro-FastAPI.git (synced 2025-08-05 16:48:53 +00:00)
Fixes a couple of issues with audio trimming and prevents errors with single voice weights
This commit is contained in:
parent
f2b2f41412
commit
4ee4d36822
4 changed files with 11 additions and 3 deletions
@@ -294,6 +294,7 @@ async def create_captioned_speech(
                 request.response_format,
                 is_first_chunk=True,
                 is_last_chunk=False,
+                trim_audio=False,
             )

             # Convert to requested format with proper finalization
@@ -294,6 +294,7 @@ async def create_speech(
                 request.response_format,
                 is_first_chunk=True,
                 is_last_chunk=False,
+                trim_audio=False
             )

             # Convert to requested format with proper finalization
@@ -119,6 +119,7 @@ class AudioService:
         chunk_text: str = "",
         is_first_chunk: bool = True,
         is_last_chunk: bool = False,
+        trim_audio: bool = True,
         normalizer: AudioNormalizer = None,
     ) -> Tuple[AudioChunk]:
         """Convert audio data to specified format with streaming support
@@ -147,6 +148,8 @@ class AudioService:
                 normalizer = AudioNormalizer()

             audio_chunk.audio = normalizer.normalize(audio_chunk.audio)
-            audio_chunk = AudioService.trim_audio(audio_chunk,chunk_text,speed,is_last_chunk,normalizer)
+
+            if trim_audio == True:
+                audio_chunk = AudioService.trim_audio(audio_chunk,chunk_text,speed,is_last_chunk,normalizer)

             # Get or create format-specific writer
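The net effect of the AudioService changes is that trimming becomes opt-out per convert_audio call. Below is a minimal usage sketch assuming the signature from the hunk above; the import path, the flush_writer name, and the output_format/speed arguments are illustrative assumptions, not code from this commit.

import numpy as np

# Import path is an assumption for illustration only.
from api.src.services.audio import AudioChunk, AudioService


async def flush_writer(output_format: str, speed: float) -> AudioChunk:
    # Mirrors the router calls changed above: the empty chunk carries no
    # speech, so trim_audio=False skips AudioService.trim_audio entirely,
    # while ordinary speech chunks keep the default trim_audio=True.
    return await AudioService.convert_audio(
        AudioChunk(np.array([], dtype=np.float32)),  # dummy data, nothing to trim
        24000,
        output_format,
        speed,
        is_first_chunk=True,
        is_last_chunk=False,
        trim_audio=False,
    )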
@@ -65,7 +65,7 @@ class TTSService:
                     yield AudioChunk(np.array([], dtype=np.int16),output=b'')
                     return
                 chunk_data = await AudioService.convert_audio(
-                    AudioChunk(np.array([0], dtype=np.float32)),  # Dummy data for type checking
+                    AudioChunk(np.array([], dtype=np.float32)),  # Dummy data for type checking
                     24000,
                     output_format,
                     speed,
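One plausible reading of the dummy-array change, stated here as an assumption rather than taken from the commit itself: an empty array lets the length guard added later in this commit filter the dummy chunk out, whereas the old single-sample array would have passed it.

import numpy as np

# Old vs. new dummy payloads against the `len(...) > 0` guard added below.
dummy_old = np.array([0], dtype=np.float32)
dummy_new = np.array([], dtype=np.float32)

assert len(dummy_old) > 0        # old dummy would have been kept
assert not (len(dummy_new) > 0)  # new dummy is skipped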
@@ -225,6 +225,8 @@ class TTSService:
                 return voice, combined_path
             else:
                 # Single voice
+                if "(" in voice and ")" in voice:
+                    voice = voice.split("(")[0].strip()
                 path = await self._voice_manager.get_voice_path(voice)
                 if not path:
                     raise RuntimeError(f"Voice not found: {voice}")
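The single-voice branch now tolerates a weight suffix. A standalone sketch of that parsing, assuming the name(weight) syntax used for weighted voices; the helper name is hypothetical, and only the split/strip logic comes from the diff.

def strip_voice_weight(voice: str) -> str:
    # "af_sky(1.5)" -> "af_sky"; names without a weight pass through unchanged.
    if "(" in voice and ")" in voice:
        voice = voice.split("(")[0].strip()
    return voice


assert strip_voice_weight("af_sky(1.5)") == "af_sky"
assert strip_voice_weight("af_sky") == "af_sky"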
@@ -341,9 +343,10 @@ class TTSService:

         try:
             async for audio_stream_data in self.generate_audio_stream(text,voice,speed=speed,normalization_options=normalization_options,return_timestamps=return_timestamps,lang_code=lang_code,output_format=None):
-                audio_data_chunks.append(audio_stream_data)
+                if len(audio_stream_data.audio) > 0:
+                    audio_data_chunks.append(audio_stream_data)


             combined_audio_data=AudioChunk.combine(audio_data_chunks)
             return combined_audio_data
         except Exception as e:
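Reduced to plain NumPy, the new guard simply drops empty chunks before they are combined; AudioChunk.combine is assumed to concatenate the per-chunk audio arrays, so np.concatenate stands in for it here.

import numpy as np

chunks = [
    np.array([1, 2, 3], dtype=np.int16),
    np.array([], dtype=np.int16),   # e.g. an empty finalization chunk
    np.array([4, 5], dtype=np.int16),
]

# Matches `if len(audio_stream_data.audio) > 0`: only non-empty chunks are kept.
kept = [c for c in chunks if len(c) > 0]
combined = np.concatenate(kept)
assert combined.tolist() == [1, 2, 3, 4, 5]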