"""TTS service using model and voice managers."""

import asyncio
import io
import time
from typing import AsyncGenerator, List, Optional, Tuple

import numpy as np
import scipy.io.wavfile as wavfile
import torch
from loguru import logger

from ..core.config import settings
from ..inference.model_manager import get_manager as get_model_manager
from ..inference.voice_manager import get_manager as get_voice_manager
from .audio import AudioNormalizer, AudioService
from .text_processing import chunker, normalize_text, process_text


class TTSService:
    """Text-to-speech service."""

    # Limit concurrent chunk processing
    _chunk_semaphore = asyncio.Semaphore(4)
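    # Note: the semaphore is created at class level, so it is shared by every
    # TTSService instance in the process and caps concurrent model calls at 4.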

    def __init__(self, output_dir: Optional[str] = None):
        """Initialize service.

        Args:
            output_dir: Optional output directory for saving audio
        """
        self.output_dir = output_dir
        self.model_manager = None
        self._voice_manager = None

    @classmethod
    async def create(cls, output_dir: Optional[str] = None) -> 'TTSService':
        """Create and initialize TTSService instance.

        Args:
            output_dir: Optional output directory for saving audio

        Returns:
            Initialized TTSService instance
        """
        service = cls(output_dir)

        # Initialize managers
        service.model_manager = await get_model_manager()
        service._voice_manager = await get_voice_manager()
        return service
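
    # Minimal usage sketch (hedged): the managers are awaited, so the service
    # must be built through this factory rather than by calling TTSService()
    # directly. Voice names are illustrative, not guaranteed to exist:
    #
    #     service = await TTSService.create(output_dir="/tmp/tts")
    #     voices = await service.list_voices()
    #     audio, took = await service.generate_audio("Hello!", voices[0])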

    async def generate_audio(
        self, text: str, voice: str, speed: float = 1.0, stitch_long_output: bool = True
    ) -> Tuple[np.ndarray, float]:
        """Generate audio for text.

        Args:
            text: Input text
            voice: Voice name
            speed: Speed multiplier
            stitch_long_output: Whether to stitch together long outputs

        Returns:
            Audio samples and processing time

        Raises:
            ValueError: If text is empty after preprocessing or no chunks generated
            RuntimeError: If audio generation fails
        """
        start_time = time.time()
        voice_tensor = None

        try:
            # Normalize text
            normalized = normalize_text(text)
            if not normalized:
                raise ValueError("Text is empty after preprocessing")
            text = str(normalized)

            # Get backend and load voice
            backend = self.model_manager.get_backend()
            voice_tensor = await self._voice_manager.load_voice(voice, device=backend.device)

            # Get chunks using async generator
            chunks = []
            async for chunk in chunker.split_text(text):
                chunks.append(chunk)

            if not chunks:
                raise ValueError("No text chunks to process")

            # Process a single chunk under the shared concurrency limit
            async def process_chunk(chunk: str) -> Optional[np.ndarray]:
                async with self._chunk_semaphore:
                    try:
                        tokens = process_text(chunk)
                        if not tokens:
                            return None

                        # Generate audio
                        return await self.model_manager.generate(
                            tokens,
                            voice_tensor,
                            speed=speed,
                        )
                    except Exception as e:
                        logger.error(f"Failed to process chunk: '{chunk}'. Error: {e}")
                        return None

            # Process all chunks concurrently
            chunk_results = await asyncio.gather(
                *[process_chunk(chunk) for chunk in chunks]
            )

            # Filter out None results and combine
            audio_chunks = [chunk for chunk in chunk_results if chunk is not None]
            if not audio_chunks:
                raise ValueError("No audio chunks were generated successfully")

            # Combine chunks
            audio = np.concatenate(audio_chunks) if len(audio_chunks) > 1 else audio_chunks[0]
            processing_time = time.time() - start_time
            return audio, processing_time

        except Exception as e:
            logger.error(f"Error in audio generation: {e}")
            raise
        finally:
            # Always clean up the voice tensor and release cached GPU memory
            if voice_tensor is not None:
                del voice_tensor
                if torch.cuda.is_available():
                    torch.cuda.empty_cache()
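
    # Hedged consumption sketch: the combined output is assumed to be a 1-D
    # numpy array sampled at 24 kHz (the rate used by _audio_to_bytes below),
    # so it can be written straight to disk:
    #
    #     audio, took = await service.generate_audio(text, voice)
    #     wavfile.write("output.wav", 24000, audio)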

    async def generate_audio_stream(
        self,
        text: str,
        voice: str,
        speed: float = 1.0,
        output_format: str = "wav",
    ) -> AsyncGenerator[bytes, None]:
        """Generate and stream audio chunks.

        Args:
            text: Input text
            voice: Voice name
            speed: Speed multiplier
            output_format: Output audio format

        Yields:
            Audio chunks as bytes
        """
        # Set up audio processing
        stream_normalizer = AudioNormalizer()
        voice_tensor = None

        try:
            # Normalize text
            normalized = normalize_text(text)
            if not normalized:
                raise ValueError("Text is empty after preprocessing")
            text = str(normalized)

            # Get backend and load voice
            backend = self.model_manager.get_backend()
            voice_tensor = await self._voice_manager.load_voice(voice, device=backend.device)

            # Get chunks using async generator
            chunks = []
            async for chunk in chunker.split_text(text):
                chunks.append(chunk)

            if not chunks:
                raise ValueError("No text chunks to process")

            # Process a single chunk under the shared concurrency limit
            async def process_chunk(chunk: str, is_first: bool, is_last: bool) -> Optional[bytes]:
                async with self._chunk_semaphore:
                    try:
                        tokens = process_text(chunk)
                        if not tokens:
                            return None

                        # Generate audio
                        chunk_audio = await self.model_manager.generate(
                            tokens,
                            voice_tensor,
                            speed=speed,
                        )
                        if chunk_audio is None:
                            return None

                        # Convert to bytes
                        return await AudioService.convert_audio(
                            chunk_audio,
                            24000,  # sample rate (Hz)
                            output_format,
                            is_first_chunk=is_first,
                            is_last_chunk=is_last,
                            normalizer=stream_normalizer,
                            stream=True,
                        )
                    except Exception as e:
                        logger.error(f"Failed to generate audio for chunk: '{chunk}'. Error: {e}")
                        return None

            # Create tasks for all chunks
            tasks = [
                process_chunk(chunk, i == 0, i == len(chunks) - 1)
                for i, chunk in enumerate(chunks)
            ]

            # Process chunks concurrently and yield results in order
            for chunk_bytes in await asyncio.gather(*tasks):
                if chunk_bytes is not None:
                    yield chunk_bytes

        except Exception as e:
            logger.error(f"Error in audio generation stream: {e}")
            raise
        finally:
            # Always clean up the voice tensor and release cached GPU memory
            if voice_tensor is not None:
                del voice_tensor
                if torch.cuda.is_available():
                    torch.cuda.empty_cache()
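
    # Hedged streaming sketch: chunks are yielded in text order as encoded
    # bytes, so a caller can forward them as they arrive:
    #
    #     async for chunk in service.generate_audio_stream(text, voice):
    #         await send_bytes(chunk)  # hypothetical transport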

    async def combine_voices(self, voices: List[str]) -> str:
        """Combine multiple voices.

        Args:
            voices: List of voice names

        Returns:
            Name of combined voice
        """
        return await self._voice_manager.combine_voices(voices)

    async def list_voices(self) -> List[str]:
        """List available voices.

        Returns:
            List of voice names
        """
        return await self._voice_manager.list_voices()

    def _audio_to_bytes(self, audio: np.ndarray) -> bytes:
        """Convert audio to WAV bytes.

        Args:
            audio: Audio samples

        Returns:
            WAV bytes
        """
        buffer = io.BytesIO()
        wavfile.write(buffer, 24000, audio)
        return buffer.getvalue()
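

# Minimal end-to-end sketch, not part of the service API. It assumes the
# package context required by the relative imports above (e.g. run with
# `python -m <package>.tts_service`) and that at least one voice is installed.
if __name__ == "__main__":

    async def _demo() -> None:
        service = await TTSService.create()
        voices = await service.list_voices()
        if not voices:
            raise SystemExit("No voices installed")
        audio, took = await service.generate_audio("Hello from the TTS service.", voices[0])
        logger.info(f"Generated {len(audio)} samples in {took:.2f}s")
        with open("demo.wav", "wb") as out:
            out.write(service._audio_to_bytes(audio))

    asyncio.run(_demo())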