mirror of
https://github.com/remsky/Kokoro-FastAPI.git
synced 2025-08-21 05:44:06 +00:00
Fixed not returning enough values
This commit is contained in:
parent
5cc9d140fe
commit
51b6b01589
3 changed files with 16 additions and 18 deletions
|
@ -11,7 +11,7 @@ class AudioChunk:
|
||||||
|
|
||||||
def __init__(self,
|
def __init__(self,
|
||||||
audio: np.ndarray,
|
audio: np.ndarray,
|
||||||
word_timestamps: Optional[List]=None
|
word_timestamps: Optional[List]=[]
|
||||||
):
|
):
|
||||||
self.audio=audio
|
self.audio=audio
|
||||||
self.word_timestamps=word_timestamps
|
self.word_timestamps=word_timestamps
|
||||||
|
|
|
@ -57,7 +57,6 @@ class AudioNormalizer:
|
||||||
non_silent_index_start, non_silent_index_end = None,None
|
non_silent_index_start, non_silent_index_end = None,None
|
||||||
|
|
||||||
for X in range(0,len(audio_data)):
|
for X in range(0,len(audio_data)):
|
||||||
#print(audio_data[X])
|
|
||||||
if audio_data[X] > amplitude_threshold:
|
if audio_data[X] > amplitude_threshold:
|
||||||
non_silent_index_start=X
|
non_silent_index_start=X
|
||||||
break
|
break
|
||||||
|
@ -149,11 +148,9 @@ class AudioService:
|
||||||
if normalizer is None:
|
if normalizer is None:
|
||||||
normalizer = AudioNormalizer()
|
normalizer = AudioNormalizer()
|
||||||
|
|
||||||
print("1")
|
|
||||||
audio_chunk.audio = await normalizer.normalize(audio_chunk.audio)
|
audio_chunk.audio = await normalizer.normalize(audio_chunk.audio)
|
||||||
print("2")
|
|
||||||
audio_chunk = AudioService.trim_audio(audio_chunk,chunk_text,speed,is_last_chunk,normalizer)
|
audio_chunk = AudioService.trim_audio(audio_chunk,chunk_text,speed,is_last_chunk,normalizer)
|
||||||
print("3")
|
|
||||||
# Get or create format-specific writer
|
# Get or create format-specific writer
|
||||||
writer_key = f"{output_format}_{sample_rate}"
|
writer_key = f"{output_format}_{sample_rate}"
|
||||||
if is_first_chunk or writer_key not in AudioService._writers:
|
if is_first_chunk or writer_key not in AudioService._writers:
|
||||||
|
@ -161,18 +158,16 @@ class AudioService:
|
||||||
output_format, sample_rate
|
output_format, sample_rate
|
||||||
)
|
)
|
||||||
writer = AudioService._writers[writer_key]
|
writer = AudioService._writers[writer_key]
|
||||||
print("4")
|
|
||||||
# Write audio data first
|
# Write audio data first
|
||||||
if len(audio_chunk.audio) > 0:
|
if len(audio_chunk.audio) > 0:
|
||||||
chunk_data = writer.write_chunk(audio_chunk.audio)
|
chunk_data = writer.write_chunk(audio_chunk.audio)
|
||||||
print("5")
|
|
||||||
# Then finalize if this is the last chunk
|
# Then finalize if this is the last chunk
|
||||||
if is_last_chunk:
|
if is_last_chunk:
|
||||||
print("6")
|
|
||||||
final_data = writer.write_chunk(finalize=True)
|
final_data = writer.write_chunk(finalize=True)
|
||||||
print("7")
|
|
||||||
del AudioService._writers[writer_key]
|
del AudioService._writers[writer_key]
|
||||||
return final_data if final_data else b""
|
return final_data if final_data else b"", audio_chunk
|
||||||
|
|
||||||
return chunk_data if chunk_data else b"", audio_chunk
|
return chunk_data if chunk_data else b"", audio_chunk
|
||||||
|
|
||||||
|
@ -206,8 +201,10 @@ class AudioService:
|
||||||
start_index,end_index=normalizer.find_first_last_non_silent(audio_chunk.audio,chunk_text,speed,is_last_chunk=is_last_chunk)
|
start_index,end_index=normalizer.find_first_last_non_silent(audio_chunk.audio,chunk_text,speed,is_last_chunk=is_last_chunk)
|
||||||
|
|
||||||
audio_chunk.audio=audio_chunk.audio[start_index:end_index]
|
audio_chunk.audio=audio_chunk.audio[start_index:end_index]
|
||||||
for timestamp in audio_chunk.word_timestamps:
|
|
||||||
timestamp["start_time"]-=start_index * 24000
|
if audio_chunk.word_timestamps is not None:
|
||||||
timestamp["end_time"]-=start_index * 24000
|
for timestamp in audio_chunk.word_timestamps:
|
||||||
|
timestamp["start_time"]-=start_index / 24000
|
||||||
|
timestamp["end_time"]-=start_index / 24000
|
||||||
return audio_chunk
|
return audio_chunk
|
||||||
|
|
|
@ -6,6 +6,7 @@ import tempfile
|
||||||
import time
|
import time
|
||||||
from typing import AsyncGenerator, List, Optional, Tuple, Union
|
from typing import AsyncGenerator, List, Optional, Tuple, Union
|
||||||
|
|
||||||
|
from ..inference.base import AudioChunk
|
||||||
import numpy as np
|
import numpy as np
|
||||||
import torch
|
import torch
|
||||||
from kokoro import KPipeline
|
from kokoro import KPipeline
|
||||||
|
@ -62,9 +63,8 @@ class TTSService:
|
||||||
if not output_format:
|
if not output_format:
|
||||||
yield np.array([], dtype=np.float32)
|
yield np.array([], dtype=np.float32)
|
||||||
return
|
return
|
||||||
|
result, _ = await AudioService.convert_audio(
|
||||||
result = await AudioService.convert_audio(
|
AudioChunk(np.array([0], dtype=np.float32)), # Dummy data for type checking
|
||||||
np.array([0], dtype=np.float32), # Dummy data for type checking
|
|
||||||
24000,
|
24000,
|
||||||
output_format,
|
output_format,
|
||||||
speed,
|
speed,
|
||||||
|
@ -119,7 +119,7 @@ class TTSService:
|
||||||
print(chunk_data.word_timestamps)
|
print(chunk_data.word_timestamps)
|
||||||
yield chunk_data.audio
|
yield chunk_data.audio
|
||||||
else:
|
else:
|
||||||
print("old backend")
|
|
||||||
# For legacy backends, load voice tensor
|
# For legacy backends, load voice tensor
|
||||||
voice_tensor = await self._voice_manager.load_voice(
|
voice_tensor = await self._voice_manager.load_voice(
|
||||||
voice_name, device=backend.device
|
voice_name, device=backend.device
|
||||||
|
@ -315,7 +315,8 @@ class TTSService:
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.error(f"Error in phoneme audio generation: {str(e)}")
|
logger.error(f"Error in phoneme audio generation: {str(e)}")
|
||||||
raise
|
raise e
|
||||||
|
|
||||||
|
|
||||||
async def generate_audio(
|
async def generate_audio(
|
||||||
self,
|
self,
|
||||||
|
|
Loading…
Add table
Reference in a new issue