2025-01-09 07:20:14 -07:00
|
|
|
from typing import List
|
2025-01-09 18:41:44 -07:00
|
|
|
|
|
|
|
import numpy as np
|
2025-01-22 17:43:38 -07:00
|
|
|
import torch
|
2025-01-22 02:33:29 -07:00
|
|
|
from fastapi import APIRouter, Depends, HTTPException, Request, Response
|
2025-01-30 04:44:04 -07:00
|
|
|
from fastapi.responses import StreamingResponse
|
2025-01-09 07:20:14 -07:00
|
|
|
from loguru import logger
|
2025-01-09 18:41:44 -07:00
|
|
|
|
2025-01-30 04:44:04 -07:00
|
|
|
from ..services.audio import AudioService, AudioNormalizer
|
|
|
|
from ..services.streaming_audio_writer import StreamingAudioWriter
|
|
|
|
from ..services.text_processing import phonemize, smart_split
|
|
|
|
from ..services.text_processing.vocabulary import tokenize
|
2025-01-09 18:41:44 -07:00
|
|
|
from ..services.tts_service import TTSService
|
|
|
|
from ..structures.text_schemas import (
|
2025-01-13 20:15:46 -07:00
|
|
|
GenerateFromPhonemesRequest,
|
2025-01-09 18:41:44 -07:00
|
|
|
PhonemeRequest,
|
|
|
|
PhonemeResponse,
|
|
|
|
)
|
2025-01-09 07:20:14 -07:00
|
|
|
|
|
|
|
# Router that collects the endpoints registered in this module; they are
# grouped under the "text processing" tag.
router = APIRouter(tags=["text processing"])
|
|
|
|
|
2025-01-09 18:41:44 -07:00
|
|
|
|
2025-01-22 02:33:29 -07:00
|
|
|
async def get_tts_service() -> TTSService:
    """FastAPI dependency that supplies a TTSService to route handlers.

    Returns:
        The service obtained by awaiting ``TTSService.create()``.
    """
    # Build the service through its async factory (per the original author's
    # note, this is what yields properly initialized managers).
    service = await TTSService.create()
    return service
|
2025-01-09 07:20:14 -07:00
|
|
|
|
|
|
|
@router.post("/dev/phonemize", response_model=PhonemeResponse)
async def phonemize_text(request: PhonemeRequest) -> PhonemeResponse:
    """Convert text to its phoneme string and the corresponding token IDs.

    Args:
        request: Request containing text and language

    Returns:
        Phonemes and token IDs

    Raises:
        HTTPException: Status 500 when the text is empty, when phonemization
            yields nothing, or when any other error occurs along the way.
    """
    try:
        text = request.text
        if not text:
            raise ValueError("Text cannot be empty")

        # Text -> phonemes
        phoneme_string = phonemize(text, request.language)
        if not phoneme_string:
            raise ValueError("Failed to generate phonemes")

        # Phonemes -> tokens (without adding start/end tokens to match
        # process_text behavior)
        token_ids = tokenize(phoneme_string)
        return PhonemeResponse(phonemes=phoneme_string, tokens=token_ids)
    except Exception as e:
        # Validation failures (ValueError) and unexpected errors are reported
        # to the client in exactly the same way, so one handler covers both.
        message = str(e)
        logger.error(f"Error in phoneme generation: {message}")
        raise HTTPException(
            status_code=500, detail={"error": "Server error", "message": message}
        )
|
|
|
|
@router.post("/dev/generate_from_phonemes")
async def generate_from_phonemes(
    request: GenerateFromPhonemesRequest,
    client_request: Request,
    tts_service: TTSService = Depends(get_tts_service),
) -> StreamingResponse:
    """Generate audio directly from phonemes and stream it back as WAV.

    Args:
        request: Payload providing the phoneme string (``phonemes``) and the
            voice to synthesize with (``voice``).
        client_request: The incoming HTTP request; polled between chunks so
            generation stops once the client has gone away.
        tts_service: Injected TTSService instance.

    Returns:
        A StreamingResponse (``audio/wav``) whose body is produced chunk by
        chunk as the phonemes are synthesized.

    Raises:
        HTTPException: Status 400 if ``phonemes`` is not a non-empty string;
            status 500 for any other error raised while setting up the stream.
    """
    try:
        # Basic validation
        if not isinstance(request.phonemes, str):
            raise ValueError("Phonemes must be a string")
        if not request.phonemes:
            raise ValueError("Phonemes cannot be empty")

        # Create streaming audio writer and normalizer
        writer = StreamingAudioWriter(format="wav", sample_rate=24000, channels=1)
        normalizer = AudioNormalizer()

        async def client_disconnected() -> bool:
            """Report whether the client has dropped the connection.

            ``is_disconnected`` is looked up fresh on every call and awaited
            when it is callable; a plain (non-callable) value is used as-is.
            Keeping the probe in its own helper avoids overwriting the method
            reference with its boolean result.
            """
            probe = client_request.is_disconnected
            if callable(probe):
                probe = await probe()
            return bool(probe)

        async def generate_chunks():
            # NOTE: this body runs while the response is being streamed, i.e.
            # after the enclosing handler has already returned, so exceptions
            # raised here are not converted by the HTTPException handlers below.
            try:
                has_data = False
                disconnected = False

                # Process phonemes in chunks
                async for chunk_text, _ in smart_split(request.phonemes):
                    # Check if client is still connected
                    if await client_disconnected():
                        disconnected = True
                        logger.info("Client disconnected, stopping audio generation")
                        break

                    chunk_audio, _ = await tts_service.generate_from_phonemes(
                        phonemes=chunk_text,
                        voice=request.voice,
                        speed=1.0,
                    )
                    if chunk_audio is not None:
                        has_data = True
                        # Normalize audio before writing
                        normalized_audio = await normalizer.normalize(chunk_audio)
                        # Write chunk and yield bytes
                        chunk_bytes = writer.write_chunk(normalized_audio)
                        if chunk_bytes:
                            yield chunk_bytes

                if disconnected:
                    # Nobody is listening any more: finalize the writer (the
                    # same call the error path uses for cleanup) and end the
                    # stream quietly instead of reporting a bogus
                    # "no audio data" failure or sending trailing bytes.
                    writer.write_chunk(finalize=True)
                    return

                if not has_data:
                    raise ValueError("Failed to generate any audio data")

                # Finalize and yield remaining bytes if we still have a connection
                final_bytes = writer.write_chunk(finalize=True)
                if final_bytes and not await client_disconnected():
                    yield final_bytes
            except Exception as e:
                logger.error(f"Error in audio chunk generation: {str(e)}")
                # Clean up writer on error; a failure during cleanup must not
                # replace the exception that brought us here.
                try:
                    writer.write_chunk(finalize=True)
                except Exception as cleanup_error:
                    logger.error(
                        f"Error finalizing audio writer: {str(cleanup_error)}"
                    )
                # Re-raise the original exception
                raise

        return StreamingResponse(
            generate_chunks(),
            media_type="audio/wav",
            headers={
                "Content-Disposition": "attachment; filename=speech.wav",
                "X-Accel-Buffering": "no",
                "Cache-Control": "no-cache",
                "Transfer-Encoding": "chunked",
            },
        )

    except ValueError as e:
        logger.error(f"Error generating audio: {str(e)}")
        raise HTTPException(
            status_code=400,
            detail={
                "error": "validation_error",
                "message": str(e),
                "type": "invalid_request_error",
            },
        )
    except Exception as e:
        logger.error(f"Error generating audio: {str(e)}")
        raise HTTPException(
            status_code=500,
            detail={
                "error": "processing_error",
                "message": str(e),
                "type": "server_error",
            },
        )
|