"""TTS service using model and voice managers."""

import asyncio
import os
import tempfile
import time
from typing import AsyncGenerator, List, Optional, Tuple, Union

import numpy as np
import torch
from kokoro import KPipeline
from loguru import logger

from ..core.config import settings
from ..inference.base import AudioChunk
from ..inference.kokoro_v1 import KokoroV1
from ..inference.model_manager import get_manager as get_model_manager
from ..inference.voice_manager import get_manager as get_voice_manager
from ..structures.schemas import NormalizationOptions
from .audio import AudioNormalizer, AudioService
from .text_processing import tokenize
from .text_processing.text_processor import process_text_chunk, smart_split


class TTSService:
    """Text-to-speech service."""

    # Limit concurrent chunk processing
    _chunk_semaphore = asyncio.Semaphore(4)

    def __init__(self, output_dir: Optional[str] = None):
        """Initialize service."""
        self.output_dir = output_dir
        self.model_manager = None
        self._voice_manager = None

    @classmethod
    async def create(cls, output_dir: Optional[str] = None) -> "TTSService":
        """Create and initialize TTSService instance."""
        service = cls(output_dir)
        service.model_manager = await get_model_manager()
        service._voice_manager = await get_voice_manager()
        return service
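
    # Minimal usage sketch (hypothetical caller code; assumes the managers
    # returned by get_model_manager()/get_voice_manager() are configured
    # for this deployment):
    #
    #     service = await TTSService.create()
    #     audio = await service.generate_audio("Hello!", voice="af_jadzia")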
    async def _process_chunk(
        self,
        chunk_text: str,
        tokens: List[int],
        voice_name: str,
        voice_path: str,
        speed: float,
        output_format: Optional[str] = None,
        is_first: bool = False,
        is_last: bool = False,
        normalizer: Optional[AudioNormalizer] = None,
        lang_code: Optional[str] = None,
        return_timestamps: Optional[bool] = False,
    ) -> AsyncGenerator[AudioChunk, None]:
        """Process tokens into audio."""
        async with self._chunk_semaphore:
            try:
                # Handle stream finalization
                if is_last:
                    # Skip format conversion for raw audio mode
                    if not output_format:
                        yield AudioChunk(np.array([], dtype=np.int16), output=b"")
                        return
                    chunk_data = await AudioService.convert_audio(
                        AudioChunk(np.array([], dtype=np.float32)),  # Dummy data for type checking
                        24000,
                        output_format,
                        speed,
                        "",
                        is_first_chunk=False,
                        normalizer=normalizer,
                        is_last_chunk=True,
                    )
                    yield chunk_data
                    return

                # Skip empty chunks
                if not tokens and not chunk_text:
                    return

                # Get backend
                backend = self.model_manager.get_backend()

                # Generate audio using pre-warmed model
                if isinstance(backend, KokoroV1):
                    chunk_index = 0
                    # For Kokoro V1, pass text and voice info with lang_code
                    async for chunk_data in self.model_manager.generate(
                        chunk_text,
                        (voice_name, voice_path),
                        speed=speed,
                        lang_code=lang_code,
                        return_timestamps=return_timestamps,
                    ):
                        # For streaming, convert to bytes
                        if output_format:
                            try:
                                chunk_data = await AudioService.convert_audio(
                                    chunk_data,
                                    24000,
                                    output_format,
                                    speed,
                                    chunk_text,
                                    is_first_chunk=is_first and chunk_index == 0,
                                    is_last_chunk=is_last,
                                    normalizer=normalizer,
                                )
                                yield chunk_data
                            except Exception as e:
                                logger.error(f"Failed to convert audio: {str(e)}")
                        else:
                            chunk_data = AudioService.trim_audio(
                                chunk_data, chunk_text, speed, is_last, normalizer
                            )
                            yield chunk_data
                        chunk_index += 1
                else:
                    # For legacy backends, load voice tensor
                    voice_tensor = await self._voice_manager.load_voice(
                        voice_name, device=backend.device
                    )
                    chunk_data = await self.model_manager.generate(
                        tokens,
                        voice_tensor,
                        speed=speed,
                        return_timestamps=return_timestamps,
                    )

                    if chunk_data.audio is None:
                        logger.error("Model generated None for audio chunk")
                        return

                    if len(chunk_data.audio) == 0:
                        logger.error("Model generated empty audio chunk")
                        return

                    # For streaming, convert to bytes
                    if output_format:
                        try:
                            chunk_data = await AudioService.convert_audio(
                                chunk_data,
                                24000,
                                output_format,
                                speed,
                                chunk_text,
                                is_first_chunk=is_first,
                                normalizer=normalizer,
                                is_last_chunk=is_last,
                            )
                            yield chunk_data
                        except Exception as e:
                            logger.error(f"Failed to convert audio: {str(e)}")
                    else:
                        trimmed = AudioService.trim_audio(
                            chunk_data, chunk_text, speed, is_last, normalizer
                        )
                        yield trimmed
            except Exception as e:
                logger.error(f"Failed to process tokens: {str(e)}")
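
    # Note on the contract above: callers end a stream by calling
    # _process_chunk() once more with empty text/tokens and is_last=True,
    # which yields a final (possibly empty) chunk so AudioService can
    # finalize the encoded output (see the "Empty tokens list to finalize
    # audio" call in generate_audio_stream below).
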
    async def _get_voice_path(self, voice: str) -> Tuple[str, str]:
        """Get voice path, handling combined voices.

        Args:
            voice: Voice name or combined voice names (e.g., 'af_jadzia+af_jessica')

        Returns:
            Tuple of (voice name to use, voice path to use)

        Raises:
            RuntimeError: If voice not found
        """
        try:
            # Check if it's a combined voice
            if "+" in voice:
                # Split on + but preserve any parentheses
                voice_parts = []
                weights = []
                for part in voice.split("+"):
                    part = part.strip()
                    if not part:
                        continue
                    # Extract voice name and weight if present
                    if "(" in part and ")" in part:
                        voice_name = part.split("(")[0].strip()
                        weight = float(part.split("(")[1].split(")")[0])
                    else:
                        voice_name = part
                        weight = 1.0
                    voice_parts.append(voice_name)
                    weights.append(weight)

                if len(voice_parts) < 2:
                    raise RuntimeError(f"Invalid combined voice name: {voice}")

                # Normalize weights to sum to 1
                total_weight = sum(weights)
                weights = [w / total_weight for w in weights]

                # Load and combine voices
                voice_tensors = []
                for v, w in zip(voice_parts, weights):
                    path = await self._voice_manager.get_voice_path(v)
                    if not path:
                        raise RuntimeError(f"Voice not found: {v}")
                    logger.debug(f"Loading voice tensor from: {path}")
                    voice_tensor = torch.load(path, map_location="cpu")
                    voice_tensors.append(voice_tensor * w)

                # Sum the weighted voice tensors
                logger.debug(
                    f"Combining {len(voice_tensors)} voice tensors with weights {weights}"
                )
                combined = torch.sum(torch.stack(voice_tensors), dim=0)

                # Save combined tensor
                temp_dir = tempfile.gettempdir()
                combined_path = os.path.join(temp_dir, f"{voice}.pt")
                logger.debug(f"Saving combined voice to: {combined_path}")
                torch.save(combined, combined_path)

                return voice, combined_path
            else:
                # Single voice
                if "(" in voice and ")" in voice:
                    voice = voice.split("(")[0].strip()
                path = await self._voice_manager.get_voice_path(voice)
                if not path:
                    raise RuntimeError(f"Voice not found: {voice}")
                logger.debug(f"Using single voice path: {path}")
                return voice, path
        except Exception as e:
            logger.error(f"Failed to get voice path: {e}")
            raise
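
    # Voice strings accepted by _get_voice_path, per the parsing above
    # (a sketch of the grammar, not an exhaustive spec):
    #
    #     "af_jadzia"                   -> single voice
    #     "af_jadzia+af_jessica"        -> equal-weight blend
    #     "af_jadzia(2)+af_jessica(1)"  -> 2:1 blend, weights normalized to sum to 1
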
    async def generate_audio_stream(
        self,
        text: str,
        voice: str,
        speed: float = 1.0,
        output_format: str = "wav",
        lang_code: Optional[str] = None,
        normalization_options: Optional[NormalizationOptions] = NormalizationOptions(),
        return_timestamps: Optional[bool] = False,
    ) -> AsyncGenerator[AudioChunk, None]:
        """Generate and stream audio chunks."""
        stream_normalizer = AudioNormalizer()
        chunk_index = 0
        current_offset = 0.0
        try:
            # Get backend
            backend = self.model_manager.get_backend()

            # Get voice path, handling combined voices
            voice_name, voice_path = await self._get_voice_path(voice)
            logger.debug(f"Using voice path: {voice_path}")

            # Use provided lang_code or determine from voice name
            pipeline_lang_code = lang_code if lang_code else voice[:1].lower()
            logger.info(
                f"Using lang_code '{pipeline_lang_code}' for voice '{voice_name}' in audio stream"
            )

            # Process text in chunks with smart splitting
            async for chunk_text, tokens in smart_split(
                text,
                lang_code=lang_code,
                normalization_options=normalization_options,
            ):
                try:
                    # Process audio for chunk
                    async for chunk_data in self._process_chunk(
                        chunk_text,  # Pass text for Kokoro V1
                        tokens,  # Pass tokens for legacy backends
                        voice_name,  # Pass voice name
                        voice_path,  # Pass voice path
                        speed,
                        output_format,
                        is_first=(chunk_index == 0),
                        is_last=False,  # We'll update the last chunk later
                        normalizer=stream_normalizer,
                        lang_code=pipeline_lang_code,  # Pass lang_code
                        return_timestamps=return_timestamps,
                    ):
                        # Shift word timestamps by the audio already emitted
                        if chunk_data.word_timestamps is not None:
                            for timestamp in chunk_data.word_timestamps:
                                timestamp.start_time += current_offset
                                timestamp.end_time += current_offset

                        current_offset += len(chunk_data.audio) / 24000

                        if chunk_data.output is not None:
                            yield chunk_data
                        else:
                            logger.warning(
                                f"No audio generated for chunk: '{chunk_text[:100]}...'"
                            )
                        chunk_index += 1
                except Exception as e:
                    logger.error(
                        f"Failed to process audio for chunk: '{chunk_text[:100]}...'. Error: {str(e)}"
                    )
                    continue

            # Only finalize if we successfully processed at least one chunk
            if chunk_index > 0:
                try:
                    # Empty tokens list to finalize audio
                    async for chunk_data in self._process_chunk(
                        "",  # Empty text
                        [],  # Empty tokens
                        voice_name,
                        voice_path,
                        speed,
                        output_format,
                        is_first=False,
                        is_last=True,  # Signal this is the last chunk
                        normalizer=stream_normalizer,
                        lang_code=pipeline_lang_code,  # Pass lang_code
                    ):
                        if chunk_data.output is not None:
                            yield chunk_data
                except Exception as e:
                    logger.error(f"Failed to finalize audio stream: {str(e)}")

        except Exception as e:
            logger.error(f"Error in audio stream generation: {str(e)}")
            raise e
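
    # Streaming usage sketch (hypothetical caller; assumes "mp3" is among
    # the formats AudioService.convert_audio supports). Word timestamps,
    # when requested, are shifted by current_offset above, so they are
    # absolute positions within the whole stream rather than per-chunk:
    #
    #     async for chunk in service.generate_audio_stream(
    #         "Some long text...", "af_jadzia", output_format="mp3"
    #     ):
    #         buffer.write(chunk.output)
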
    async def generate_audio(
        self,
        text: str,
        voice: str,
        speed: float = 1.0,
        return_timestamps: bool = False,
        normalization_options: Optional[NormalizationOptions] = NormalizationOptions(),
        lang_code: Optional[str] = None,
    ) -> AudioChunk:
        """Generate complete audio for text using streaming internally."""
        audio_data_chunks = []

        try:
            async for audio_stream_data in self.generate_audio_stream(
                text,
                voice,
                speed=speed,
                normalization_options=normalization_options,
                return_timestamps=return_timestamps,
                lang_code=lang_code,
                output_format=None,
            ):
                if len(audio_stream_data.audio) > 0:
                    audio_data_chunks.append(audio_stream_data)

            combined_audio_data = AudioChunk.combine(audio_data_chunks)
            return combined_audio_data
        except Exception as e:
            logger.error(f"Error in audio generation: {str(e)}")
            raise

    async def combine_voices(self, voices: List[str]) -> torch.Tensor:
        """Combine multiple voices.

        Returns:
            Combined voice tensor
        """
        return await self._voice_manager.combine_voices(voices)

    async def list_voices(self) -> List[str]:
        """List available voices."""
        return await self._voice_manager.list_voices()

    async def generate_from_phonemes(
        self,
        phonemes: str,
        voice: str,
        speed: float = 1.0,
        lang_code: Optional[str] = None,
    ) -> Tuple[np.ndarray, float]:
        """Generate audio directly from phonemes.

        Args:
            phonemes: Phonemes in Kokoro format
            voice: Voice name
            speed: Speed multiplier
            lang_code: Optional language code override

        Returns:
            Tuple of (audio array, processing time)
        """
        start_time = time.time()
        try:
            # Get backend and voice path
            backend = self.model_manager.get_backend()
            voice_name, voice_path = await self._get_voice_path(voice)

            if isinstance(backend, KokoroV1):
                # For Kokoro V1, use generate_from_tokens with raw phonemes
                result = None
                # Use provided lang_code or determine from voice name
                pipeline_lang_code = lang_code if lang_code else voice[:1].lower()
                logger.info(
                    f"Using lang_code '{pipeline_lang_code}' for voice '{voice_name}' in phoneme pipeline"
                )

                try:
                    # Use backend's pipeline management
                    for r in backend._get_pipeline(
                        pipeline_lang_code
                    ).generate_from_tokens(
                        tokens=phonemes,  # Pass raw phonemes string
                        voice=voice_path,
                        speed=speed,
                    ):
                        if r.audio is not None:
                            result = r
                            break
                except Exception as e:
                    logger.error(f"Failed to generate from phonemes: {e}")
                    raise RuntimeError(f"Phoneme generation failed: {e}")

                if result is None or result.audio is None:
                    raise ValueError("No audio generated")

                processing_time = time.time() - start_time
                return result.audio.numpy(), processing_time
            else:
                raise ValueError(
                    "Phoneme generation only supported with Kokoro V1 backend"
                )

        except Exception as e:
            logger.error(f"Error in phoneme audio generation: {str(e)}")
            raise
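
    # Phoneme-input usage sketch (the phoneme string is a hypothetical
    # example of Kokoro-format phonemes, per the docstring above):
    #
    #     audio, elapsed = await service.generate_from_phonemes(
    #         "həlˈoʊ wˈɜɹld", voice="af_jadzia"
    #     )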