Mirror of https://github.com/remsky/Kokoro-FastAPI.git
Simplified generate_audio in tts_service; mostly working (audio conversion does not work)
This commit is contained in:
parent 5b20602b8e
commit dbf2b99026

7 changed files with 34 additions and 16 deletions
Test.py (3 changes)
@@ -12,7 +12,8 @@ response = requests.post(
         "input": "http://localhost:8880/web/",
         "voice": "af_heart",
         "response_format": "mp3", # Supported: mp3, wav, opus, flac
-        "speed": 1.0
+        "speed": 1.0,
+        "stream":False,
     }
 )
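Setting "stream": False points Test.py at the non-streaming create_speech path that the rest of this commit reworks. For context, a minimal sketch of the whole script; the endpoint URL and the file write are assumptions, not shown in the hunk (the commit also updates output.mp3, which suggests the script saves the response there):

    import requests

    response = requests.post(
        "http://localhost:8880/v1/audio/speech",  # assumed endpoint, not shown in the hunk
        json={
            "input": "http://localhost:8880/web/",
            "voice": "af_heart",
            "response_format": "mp3",  # Supported: mp3, wav, opus, flac
            "speed": 1.0,
            "stream": False,  # exercise the reworked non-streaming path
        },
    )

    # Presumed sink: the commit also touches output.mp3
    with open("output.mp3", "wb") as f:
        f.write(response.content)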
inference/base.py (file header lost in extraction; inferred from the class AudioChunk context here and the import from ..inference.base import AudioChunk below)

@@ -16,6 +16,17 @@ class AudioChunk:
         self.audio=audio
         self.word_timestamps=word_timestamps
 
+    @staticmethod
+    def combine(audio_chunk_list: List):
+        output=AudioChunk(audio_chunk_list[0].audio,audio_chunk_list[0].word_timestamps)
+
+        for audio_chunk in audio_chunk_list[1:]:
+            output.audio=np.concatenate((output.audio,audio_chunk.audio))
+            if output.word_timestamps is not None:
+                output.word_timestamps+=output.word_timestamps
+
+        return output
+
 class ModelBackend(ABC):
     """Abstract base class for model inference backend."""
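One bug worth flagging in the new combine(): the timestamp branch adds output.word_timestamps to itself, which doubles the first chunk's timestamps and silently drops every later chunk's. A corrected sketch, not the commit's code, assuming np and List are already imported in this module (the np.concatenate call above implies they are):

    @staticmethod
    def combine(audio_chunk_list: List) -> "AudioChunk":
        output = AudioChunk(audio_chunk_list[0].audio,
                            audio_chunk_list[0].word_timestamps)
        for audio_chunk in audio_chunk_list[1:]:
            output.audio = np.concatenate((output.audio, audio_chunk.audio))
            if output.word_timestamps is not None:
                # accumulate the incoming chunk's timestamps, not our own
                output.word_timestamps += audio_chunk.word_timestamps
        return output

Note that both versions alias the first chunk's timestamp list; copying it with list(...) before extending would avoid mutating the caller's data.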
speech router (file header lost in extraction; the hunks cover its imports, stream_audio_chunks, and create_speech)

@@ -6,6 +6,7 @@ import os
 import re
 import tempfile
 from typing import AsyncGenerator, Dict, List, Union, Tuple
+from urllib import response
 
 import aiofiles
 from ..inference.base import AudioChunk

The added from urllib import response is unused in these hunks and looks like an accidental editor auto-import.
@@ -141,7 +142,7 @@ async def stream_audio_chunks(
         output_format=request.response_format,
         lang_code=request.lang_code or request.voice[0],
         normalization_options=request.normalization_options,
-        return_timestamps=True,
+        return_timestamps=False,
     ):
 
         # Check if client is still connected
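With return_timestamps=False, the streaming route now skips timestamp bookkeeping; timestamps ride along only when a caller asks for them. A hypothetical consumption sketch of the generator, assuming it yields (encoded_bytes, AudioChunk) pairs as the generate_audio loop further down implies (collect_stream and its arguments are stand-ins):

    async def collect_stream(tts_service) -> bytes:
        buffer = bytearray()
        async for encoded_bytes, audio_chunk in tts_service.generate_audio_stream(
            "Hello world.",
            "af_heart",
            speed=1.0,
            return_timestamps=False,  # as stream_audio_chunks now passes
            lang_code="a",            # the route falls back to voice[0]
        ):
            buffer.extend(encoded_bytes)  # audio_chunk carries the raw samples
        return bytes(buffer)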
@@ -258,7 +259,7 @@ async def create_speech(
             )
         else:
             # Generate complete audio using public interface
-            audio, _ = await tts_service.generate_audio(
+            audio, audio_data = await tts_service.generate_audio(
                 text=request.input,
                 voice=voice_name,
                 speed=request.speed,
@@ -266,14 +267,14 @@ async def create_speech(
             )
 
             # Convert to requested format with proper finalization
-            content = await AudioService.convert_audio(
-                audio,
+            content, audio_data = await AudioService.convert_audio(
+                audio_data,
                 24000,
                 request.response_format,
                 is_first_chunk=True,
                 is_last_chunk=True,
             )
+            print(content,request.response_format)
             return Response(
                 content=content,
                 media_type=content_type,
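Taken together, the non-streaming branch now threads an AudioChunk from generate_audio into convert_audio, whose return value becomes a (bytes, AudioChunk) pair instead of bare bytes. The assembled flow as a sketch, with the debug print dropped; content_type is set earlier in the route, and any kwargs between the two hunks are not shown in the diff:

    audio, audio_data = await tts_service.generate_audio(
        text=request.input,
        voice=voice_name,
        speed=request.speed,
        # ...any remaining kwargs fall between the two hunks above
    )
    content, audio_data = await AudioService.convert_audio(
        audio_data,               # the AudioChunk, not the bare ndarray
        24000,                    # fixed 24 kHz sample rate
        request.response_format,
        is_first_chunk=True,
        is_last_chunk=True,       # single shot: open and finalize one writer
    )
    return Response(content=content, media_type=content_type)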
audio service (file header lost in extraction; both hunks sit inside class AudioService)

@@ -148,9 +148,11 @@ class AudioService:
         if normalizer is None:
             normalizer = AudioNormalizer()
 
+        print(len(audio_chunk.audio),"1")
         audio_chunk.audio = await normalizer.normalize(audio_chunk.audio)
+        print(len(audio_chunk.audio),"2")
         audio_chunk = AudioService.trim_audio(audio_chunk,chunk_text,speed,is_last_chunk,normalizer)
+        print(len(audio_chunk.audio),"3")
         # Get or create format-specific writer
         writer_key = f"{output_format}_{sample_rate}"
         if is_first_chunk or writer_key not in AudioService._writers:
@@ -167,6 +169,7 @@ class AudioService:
         if is_last_chunk:
             final_data = writer.write_chunk(finalize=True)
             del AudioService._writers[writer_key]
+            print(audio_chunk.audio)
             return final_data if final_data else b"", audio_chunk
 
         return chunk_data if chunk_data else b"", audio_chunk
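The numbered prints bracket the convert_audio stages (normalize, trim, write) so the sample count can be compared stage by stage, the obvious way to chase the "audio conversion does not work" symptom from the commit message. In the same spirit, a hypothetical standalone reproduction through the public call create_speech uses; the float32 dtype and the None second constructor argument for AudioChunk are assumptions, since the diff shows neither:

    import asyncio
    import numpy as np

    async def repro() -> None:
        # One second of a 440 Hz sine at 24 kHz, float32 in [-1, 1]
        t = np.arange(24000, dtype=np.float32) / 24000.0
        samples = 0.5 * np.sin(2.0 * np.pi * 440.0 * t)

        data, chunk = await AudioService.convert_audio(
            AudioChunk(samples, None),
            24000,
            "mp3",
            is_first_chunk=True,
            is_last_chunk=True,
        )
        # If conversion is broken, it shows up here as empty output
        assert len(data) > 0, "conversion produced no bytes"

    asyncio.run(repro())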
TTS service (file header lost in extraction; the hunk rewrites TTSService.generate_audio, per the commit message)

@@ -333,25 +333,27 @@ class TTSService:
         text: str,
         voice: str,
         speed: float = 1.0,
-        return_timestamps: bool = False,
+        return_timestamps: bool = True,
         lang_code: Optional[str] = None,
     ) -> Tuple[Tuple[np.ndarray,AudioChunk]]:
         """Generate complete audio for text using streaming internally."""
         start_time = time.time()
         audio_chunks = []
         audio_data_chunks=[]
-        word_timestamps = []
 
         start_time = time.time()
-        chunks = []
-        word_timestamps = []
         try:
             async for audio_stream,audio_stream_data in self.generate_audio_stream(text,voice,speed=speed,return_timestamps=return_timestamps,lang_code=lang_code):
-                print("common")
                 audio_chunks.append(audio_stream_data.audio)
                 audio_data_chunks.append(audio_stream_data)
 
-            print(audio_data_chunks)
+            print(audio_data_chunks[0].audio.shape)
+            combined_audio=np.concatenate(audio_chunks)
+            print("1")
+            combined_audio_data=AudioChunk.combine(audio_data_chunks)
+            print("2")
+            print(len(combined_audio_data.audio))
+            return combined_audio,combined_audio_data
         """
         # Get backend and voice path
         backend = self.model_manager.get_backend()

The unchanged """ in the closing context opens the string literal that, now that the new body returns early, leaves the old backend-based implementation below it as dead code.
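The committed body still carries debug prints, a duplicate start_time assignment, and an extra Tuple nesting in the return annotation; with those removed, the new design is simply "drain the streaming generator, then combine". A cleaned-up sketch, not the commit's code, assuming the stream yields (encoded_bytes, AudioChunk) pairs as the loop above implies:

    async def generate_audio(
        self,
        text: str,
        voice: str,
        speed: float = 1.0,
        return_timestamps: bool = True,
        lang_code: Optional[str] = None,
    ) -> Tuple[np.ndarray, AudioChunk]:
        """Generate complete audio for text by draining the streaming path."""
        audio_arrays = []
        audio_data_chunks = []
        async for _encoded, audio_chunk in self.generate_audio_stream(
            text, voice, speed=speed,
            return_timestamps=return_timestamps, lang_code=lang_code,
        ):
            audio_arrays.append(audio_chunk.audio)   # raw samples per chunk
            audio_data_chunks.append(audio_chunk)    # keep timestamps attached
        return np.concatenate(audio_arrays), AudioChunk.combine(audio_data_chunks)

This keeps the (ndarray, AudioChunk) pair that create_speech now unpacks.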
output.mp3 (binary file not shown)