Simplified generate_audio in tts_service; mostly working (audio conversion does not work yet)

This commit is contained in:
Fireblade 2025-02-12 22:42:41 -05:00
parent 5b20602b8e
commit dbf2b99026
7 changed files with 34 additions and 16 deletions

View file

@ -12,7 +12,8 @@ response = requests.post(
"input": "http://localhost:8880/web/", "input": "http://localhost:8880/web/",
"voice": "af_heart", "voice": "af_heart",
"response_format": "mp3", # Supported: mp3, wav, opus, flac "response_format": "mp3", # Supported: mp3, wav, opus, flac
"speed": 1.0 "speed": 1.0,
"stream":False,
} }
) )

View file

@ -16,6 +16,17 @@ class AudioChunk:
self.audio=audio self.audio=audio
self.word_timestamps=word_timestamps self.word_timestamps=word_timestamps
@staticmethod
def combine(audio_chunk_list: List):
output=AudioChunk(audio_chunk_list[0].audio,audio_chunk_list[0].word_timestamps)
for audio_chunk in audio_chunk_list[1:]:
output.audio=np.concatenate((output.audio,audio_chunk.audio))
if output.word_timestamps is not None:
output.word_timestamps+=output.word_timestamps
return output
class ModelBackend(ABC): class ModelBackend(ABC):
"""Abstract base class for model inference backend.""" """Abstract base class for model inference backend."""

View file

@ -6,6 +6,7 @@ import os
import re import re
import tempfile import tempfile
from typing import AsyncGenerator, Dict, List, Union, Tuple from typing import AsyncGenerator, Dict, List, Union, Tuple
from urllib import response
import aiofiles import aiofiles
from ..inference.base import AudioChunk from ..inference.base import AudioChunk
@ -141,7 +142,7 @@ async def stream_audio_chunks(
output_format=request.response_format, output_format=request.response_format,
lang_code=request.lang_code or request.voice[0], lang_code=request.lang_code or request.voice[0],
normalization_options=request.normalization_options, normalization_options=request.normalization_options,
return_timestamps=True, return_timestamps=False,
): ):
# Check if client is still connected # Check if client is still connected
@ -258,7 +259,7 @@ async def create_speech(
) )
else: else:
# Generate complete audio using public interface # Generate complete audio using public interface
audio, _ = await tts_service.generate_audio( audio, audio_data = await tts_service.generate_audio(
text=request.input, text=request.input,
voice=voice_name, voice=voice_name,
speed=request.speed, speed=request.speed,
@ -266,14 +267,14 @@ async def create_speech(
) )
# Convert to requested format with proper finalization # Convert to requested format with proper finalization
content = await AudioService.convert_audio( content, audio_data = await AudioService.convert_audio(
audio, audio_data,
24000, 24000,
request.response_format, request.response_format,
is_first_chunk=True, is_first_chunk=True,
is_last_chunk=True, is_last_chunk=True,
) )
print(content,request.response_format)
return Response( return Response(
content=content, content=content,
media_type=content_type, media_type=content_type,

View file

@ -148,9 +148,11 @@ class AudioService:
if normalizer is None: if normalizer is None:
normalizer = AudioNormalizer() normalizer = AudioNormalizer()
print(len(audio_chunk.audio),"1")
audio_chunk.audio = await normalizer.normalize(audio_chunk.audio) audio_chunk.audio = await normalizer.normalize(audio_chunk.audio)
print(len(audio_chunk.audio),"2")
audio_chunk = AudioService.trim_audio(audio_chunk,chunk_text,speed,is_last_chunk,normalizer) audio_chunk = AudioService.trim_audio(audio_chunk,chunk_text,speed,is_last_chunk,normalizer)
print(len(audio_chunk.audio),"3")
# Get or create format-specific writer # Get or create format-specific writer
writer_key = f"{output_format}_{sample_rate}" writer_key = f"{output_format}_{sample_rate}"
if is_first_chunk or writer_key not in AudioService._writers: if is_first_chunk or writer_key not in AudioService._writers:
@ -167,6 +169,7 @@ class AudioService:
if is_last_chunk: if is_last_chunk:
final_data = writer.write_chunk(finalize=True) final_data = writer.write_chunk(finalize=True)
del AudioService._writers[writer_key] del AudioService._writers[writer_key]
print(audio_chunk.audio)
return final_data if final_data else b"", audio_chunk return final_data if final_data else b"", audio_chunk
return chunk_data if chunk_data else b"", audio_chunk return chunk_data if chunk_data else b"", audio_chunk

View file

@ -333,25 +333,27 @@ class TTSService:
text: str, text: str,
voice: str, voice: str,
speed: float = 1.0, speed: float = 1.0,
return_timestamps: bool = False, return_timestamps: bool = True,
lang_code: Optional[str] = None, lang_code: Optional[str] = None,
) -> Tuple[Tuple[np.ndarray,AudioChunk]]: ) -> Tuple[Tuple[np.ndarray,AudioChunk]]:
"""Generate complete audio for text using streaming internally.""" """Generate complete audio for text using streaming internally."""
start_time = time.time() start_time = time.time()
audio_chunks = [] audio_chunks = []
audio_data_chunks=[] audio_data_chunks=[]
word_timestamps = []
start_time = time.time()
chunks = []
word_timestamps = []
try: try:
async for audio_stream,audio_stream_data in self.generate_audio_stream(text,voice,speed=speed,return_timestamps=return_timestamps,lang_code=lang_code): async for audio_stream,audio_stream_data in self.generate_audio_stream(text,voice,speed=speed,return_timestamps=return_timestamps,lang_code=lang_code):
print("common")
audio_chunks.append(audio_stream_data.audio) audio_chunks.append(audio_stream_data.audio)
audio_data_chunks.append(audio_stream_data) audio_data_chunks.append(audio_stream_data)
print(audio_data_chunks) print(audio_data_chunks[0].audio.shape)
combined_audio=np.concatenate(audio_chunks)
print("1")
combined_audio_data=AudioChunk.combine(audio_data_chunks)
print("2")
print(len(combined_audio_data.audio))
return combined_audio,combined_audio_data
""" """
# Get backend and voice path # Get backend and voice path
backend = self.model_manager.get_backend() backend = self.model_manager.get_backend()

Binary file not shown.

Binary file not shown.