From 6c234a3b67352351a2956d1d844959b627789f1b Mon Sep 17 00:00:00 2001 From: remsky Date: Tue, 4 Feb 2025 19:41:41 -0700 Subject: [PATCH] Update dependencies, enhance voice management, and add captioned speech support --- .gitignore | 1 + api/src/core/model_config.py | 37 +-- api/src/core/paths.py | 4 +- api/src/inference/voice_manager.py | 10 +- api/src/routers/development.py | 172 ++++++++--- api/src/routers/openai_compatible.py | 123 +++++--- api/src/services/tts_service.py | 273 ++++++++++++++++-- api/src/structures/__init__.py | 18 +- api/src/structures/schemas.py | 38 +++ api/src/voices/v1_0/{v0_af.pt => af_v0.pt} | Bin 524355 -> 524370 bytes .../v1_0/{v0_af_bella.pt => af_v0bella.pt} | Bin 524449 -> 524464 bytes .../v1_0/{af_irulan.pt => af_v0irulan.pt} | Bin 524484 -> 524469 bytes .../v1_0/{v0_af_nicole.pt => af_v0nicole.pt} | Bin 524454 -> 524469 bytes .../v1_0/{v0_af_sarah.pt => af_v0sarah.pt} | Bin 524449 -> 524464 bytes .../voices/v1_0/{v0_af_sky.pt => af_v0sky.pt} | Bin 524375 -> 524454 bytes .../v1_0/{v0_am_adam.pt => am_v0adam.pt} | Bin 524444 -> 524459 bytes .../v1_0/{v0_am_gurney.pt => am_v0gurney.pt} | Bin 524474 -> 524469 bytes .../{v0_am_michael.pt => am_v0michael.pt} | Bin 524459 -> 524474 bytes .../v1_0/{v0_bf_emma.pt => bf_v0emma.pt} | Bin 524365 -> 524459 bytes .../{v0_bf_isabella.pt => bf_v0isabella.pt} | Bin 524365 -> 524479 bytes .../v1_0/{v0_bm_george.pt => bm_v0george.pt} | Bin 524464 -> 524469 bytes .../v1_0/{v0_bm_lewis.pt => bm_v0lewis.pt} | Bin 524449 -> 524464 bytes api/src/voices/v1_0/v0_af_irulan.pt | Bin 524484 -> 0 bytes .../test_analyze_combined_voices.py | 98 ++++++- .../test_combinations/test_download_voice.py | 74 +++++ .../test_voices/analyze_voice_dimensions.py | 54 ++++ .../test_voices/trim_voice_dimensions.py | 85 ++++++ examples/captioned_speech_example.py | 98 +++++++ pyproject.toml | 4 +- web/src/components/VoiceSelector.js | 32 +- web/src/services/VoiceService.js | 27 +- 31 files changed, 979 insertions(+), 169 deletions(-) rename api/src/voices/v1_0/{v0_af.pt => af_v0.pt} (99%) rename api/src/voices/v1_0/{v0_af_bella.pt => af_v0bella.pt} (99%) rename api/src/voices/v1_0/{af_irulan.pt => af_v0irulan.pt} (99%) rename api/src/voices/v1_0/{v0_af_nicole.pt => af_v0nicole.pt} (99%) rename api/src/voices/v1_0/{v0_af_sarah.pt => af_v0sarah.pt} (99%) rename api/src/voices/v1_0/{v0_af_sky.pt => af_v0sky.pt} (99%) rename api/src/voices/v1_0/{v0_am_adam.pt => am_v0adam.pt} (99%) rename api/src/voices/v1_0/{v0_am_gurney.pt => am_v0gurney.pt} (99%) rename api/src/voices/v1_0/{v0_am_michael.pt => am_v0michael.pt} (99%) rename api/src/voices/v1_0/{v0_bf_emma.pt => bf_v0emma.pt} (99%) rename api/src/voices/v1_0/{v0_bf_isabella.pt => bf_v0isabella.pt} (99%) rename api/src/voices/v1_0/{v0_bm_george.pt => bm_v0george.pt} (99%) rename api/src/voices/v1_0/{v0_bm_lewis.pt => bm_v0lewis.pt} (99%) delete mode 100644 api/src/voices/v1_0/v0_af_irulan.pt create mode 100644 examples/assorted_checks/test_combinations/test_download_voice.py create mode 100644 examples/assorted_checks/test_voices/analyze_voice_dimensions.py create mode 100644 examples/assorted_checks/test_voices/trim_voice_dimensions.py create mode 100644 examples/captioned_speech_example.py diff --git a/.gitignore b/.gitignore index 56669e5..3a439db 100644 --- a/.gitignore +++ b/.gitignore @@ -69,3 +69,4 @@ examples/*.ogg examples/speech.mp3 examples/phoneme_examples/output/*.wav examples/assorted_checks/benchmarks/output_audio/* +uv.lock diff --git a/api/src/core/model_config.py 
b/api/src/core/model_config.py index 893c045..76c7578 100644 --- a/api/src/core/model_config.py +++ b/api/src/core/model_config.py @@ -1,4 +1,9 @@ -"""Model configuration for Kokoro V1.""" +"""Model configuration for Kokoro V1. + +This module provides model-specific configuration settings that complement the application-level +settings in config.py. While config.py handles general application settings (API, paths, etc.), +this module focuses on memory management and model file paths. +""" from pydantic import BaseModel, Field @@ -9,51 +14,29 @@ class KokoroV1Config(BaseModel): class Config: frozen = True -class PyTorchCPUConfig(BaseModel): - """PyTorch CPU backend configuration.""" - +class PyTorchConfig(BaseModel): + """PyTorch backend configuration.""" memory_threshold: float = Field(0.8, description="Memory threshold for cleanup") retry_on_oom: bool = Field(True, description="Whether to retry on OOM errors") - num_threads: int = Field(8, description="Number of threads for parallel operations") - pin_memory: bool = Field(True, description="Whether to pin memory for faster CPU-GPU transfer") class Config: frozen = True - -class PyTorchGPUConfig(BaseModel): - """PyTorch GPU backend configuration.""" - - device_id: int = Field(0, description="CUDA device ID") - use_triton: bool = Field(True, description="Whether to use Triton for CUDA kernels") - memory_threshold: float = Field(0.8, description="Memory threshold for cleanup") - retry_on_oom: bool = Field(True, description="Whether to retry on OOM errors") - sync_cuda: bool = Field(True, description="Whether to synchronize CUDA operations") - cuda_streams: int = Field(2, description="Number of CUDA streams for inference") - stream_timeout: int = Field(60, description="Stream timeout in seconds") - - class Config: - frozen = True - - class ModelConfig(BaseModel): """Kokoro V1 model configuration.""" # General settings - device_type: str = Field("cpu", description="Device type ('cpu' or 'gpu')") cache_voices: bool = Field(True, description="Whether to cache voice tensors") voice_cache_size: int = Field(2, description="Maximum number of cached voices") # Model filename pytorch_kokoro_v1_file: str = Field("v1_0/kokoro-v1_0.pth", description="PyTorch Kokoro V1 model filename") - # Backend configs - pytorch_cpu: PyTorchCPUConfig = Field(default_factory=PyTorchCPUConfig) - pytorch_gpu: PyTorchGPUConfig = Field(default_factory=PyTorchGPUConfig) + # Backend config + pytorch_gpu: PyTorchConfig = Field(default_factory=PyTorchConfig) class Config: frozen = True - # Global instance model_config = ModelConfig() \ No newline at end of file diff --git a/api/src/core/paths.py b/api/src/core/paths.py index dddf225..a534764 100644 --- a/api/src/core/paths.py +++ b/api/src/core/paths.py @@ -160,7 +160,7 @@ async def list_voices() -> List[str]: return sorted([name[:-3] for name in voices]) # Remove .pt extension -async def load_voice_tensor(voice_path: str, device: str = "cpu") -> torch.Tensor: +async def load_voice_tensor(voice_path: str, device: str = "cpu", weights_only=False) -> torch.Tensor: """Load voice tensor from file. 
Args: @@ -179,7 +179,7 @@ async def load_voice_tensor(voice_path: str, device: str = "cpu") -> torch.Tenso return torch.load( io.BytesIO(data), map_location=device, - weights_only=True + weights_only=weights_only ) except Exception as e: raise RuntimeError(f"Failed to load voice tensor from {voice_path}: {e}") diff --git a/api/src/inference/voice_manager.py b/api/src/inference/voice_manager.py index f4797d9..2b77be5 100644 --- a/api/src/inference/voice_manager.py +++ b/api/src/inference/voice_manager.py @@ -3,6 +3,7 @@ from typing import Dict, List, Optional import torch +import aiofiles from loguru import logger from ..core import paths @@ -57,7 +58,7 @@ class VoiceManager: except Exception as e: raise RuntimeError(f"Failed to load voice {voice_name}: {e}") - async def combine_voices(self, voices: List[str], device: Optional[str] = None) -> str: + async def combine_voices(self, voices: List[str], device: Optional[str] = None) -> torch.Tensor: """Combine multiple voices. Args: @@ -65,7 +66,7 @@ class VoiceManager: device: Optional override for target device Returns: - Name of combined voice + Combined voice tensor Raises: RuntimeError: If any voice not found @@ -80,10 +81,7 @@ class VoiceManager: voice_tensors.append(voice) combined = torch.mean(torch.stack(voice_tensors), dim=0) - combined_name = "+".join(voices) - self._voices[combined_name] = combined - - return combined_name + return combined async def list_voices(self) -> List[str]: """List available voice names. diff --git a/api/src/routers/development.py b/api/src/routers/development.py index 8c7a927..6b91746 100644 --- a/api/src/routers/development.py +++ b/api/src/routers/development.py @@ -8,14 +8,19 @@ from loguru import logger from ..services.audio import AudioService, AudioNormalizer from ..services.streaming_audio_writer import StreamingAudioWriter -from ..services.text_processing import phonemize, smart_split -from ..services.text_processing.vocabulary import tokenize +from ..services.text_processing import smart_split +from kokoro import KPipeline from ..services.tts_service import TTSService from ..structures.text_schemas import ( GenerateFromPhonemesRequest, PhonemeRequest, PhonemeResponse, ) +from ..structures import ( + CaptionedSpeechRequest, + CaptionedSpeechResponse, + WordTimestamp +) router = APIRouter(tags=["text processing"]) @@ -26,11 +31,10 @@ async def get_tts_service() -> TTSService: @router.post("/dev/phonemize", response_model=PhonemeResponse) async def phonemize_text(request: PhonemeRequest) -> PhonemeResponse: - """Convert text to phonemes and tokens + """Convert text to phonemes using Kokoro's quiet mode. 
Args: request: Request containing text and language - tts_service: Injected TTSService instance Returns: Phonemes and token IDs @@ -39,14 +43,17 @@ async def phonemize_text(request: PhonemeRequest) -> PhonemeResponse: if not request.text: raise ValueError("Text cannot be empty") - # Get phonemes - phonemes = phonemize(request.text, request.language) - if not phonemes: - raise ValueError("Failed to generate phonemes") + # Initialize Kokoro pipeline in quiet mode (no model) + pipeline = KPipeline(lang_code=request.language, model=False) + + # Get first result from pipeline (we only need one since we're not chunking) + for result in pipeline(request.text): + # result.graphemes = original text + # result.phonemes = phonemized text + # result.tokens = token objects (if available) + return PhonemeResponse(phonemes=result.phonemes, tokens=[]) - # Get tokens (without adding start/end tokens to match process_text behavior) - tokens = tokenize(phonemes) - return PhonemeResponse(phonemes=phonemes, tokens=tokens) + raise ValueError("Failed to generate phonemes") except ValueError as e: logger.error(f"Error in phoneme generation: {str(e)}") raise HTTPException( @@ -63,7 +70,7 @@ async def generate_from_phonemes( client_request: Request, tts_service: TTSService = Depends(get_tts_service), ) -> StreamingResponse: - """Generate audio directly from phonemes with proper streaming""" + """Generate audio directly from phonemes using Kokoro's phoneme format""" try: # Basic validation if not isinstance(request.phonemes, str): @@ -77,41 +84,30 @@ async def generate_from_phonemes( async def generate_chunks(): try: - has_data = False - # Process phonemes in chunks - async for chunk_text, _ in smart_split(request.phonemes): - # Check if client is still connected - is_disconnected = client_request.is_disconnected - if callable(is_disconnected): - is_disconnected = await is_disconnected() - if is_disconnected: - logger.info("Client disconnected, stopping audio generation") - break - - chunk_audio, _ = await tts_service.generate_from_phonemes( - phonemes=chunk_text, - voice=request.voice, - speed=1.0 - ) - if chunk_audio is not None: - has_data = True - # Normalize audio before writing - normalized_audio = await normalizer.normalize(chunk_audio) - # Write chunk and yield bytes - chunk_bytes = writer.write_chunk(normalized_audio) - if chunk_bytes: - yield chunk_bytes - - if not has_data: - raise ValueError("Failed to generate any audio data") - - # Finalize and yield remaining bytes if we still have a connection - if not (callable(is_disconnected) and await is_disconnected()): + # Generate audio from phonemes + chunk_audio, _ = await tts_service.generate_from_phonemes( + phonemes=request.phonemes, # Pass complete phoneme string + voice=request.voice, + speed=1.0 + ) + + if chunk_audio is not None: + # Normalize audio before writing + normalized_audio = await normalizer.normalize(chunk_audio) + # Write chunk and yield bytes + chunk_bytes = writer.write_chunk(normalized_audio) + if chunk_bytes: + yield chunk_bytes + + # Finalize and yield remaining bytes final_bytes = writer.write_chunk(finalize=True) if final_bytes: yield final_bytes + else: + raise ValueError("Failed to generate audio data") + except Exception as e: - logger.error(f"Error in audio chunk generation: {str(e)}") + logger.error(f"Error in audio generation: {str(e)}") # Clean up writer on error writer.write_chunk(finalize=True) # Re-raise the original exception @@ -128,7 +124,6 @@ async def generate_from_phonemes( } ) - except ValueError as e: 
logger.error(f"Error generating audio: {str(e)}") raise HTTPException( @@ -149,3 +144,92 @@ async def generate_from_phonemes( "type": "server_error" } ) + +@router.post("/dev/captioned_speech") +async def create_captioned_speech( + request: CaptionedSpeechRequest, + tts_service: TTSService = Depends(get_tts_service), +) -> StreamingResponse: + """Generate audio with word-level timestamps using Kokoro's output""" + try: + # Get voice path + voice_name, voice_path = await tts_service._get_voice_path(request.voice) + + # Generate audio with timestamps + audio, _, word_timestamps = await tts_service.generate_audio( + text=request.input, + voice=voice_name, + speed=request.speed, + return_timestamps=True + ) + + # Create streaming audio writer + writer = StreamingAudioWriter(format=request.response_format, sample_rate=24000, channels=1) + normalizer = AudioNormalizer() + + async def generate_chunks(): + try: + if audio is not None: + # Normalize audio before writing + normalized_audio = await normalizer.normalize(audio) + # Write chunk and yield bytes + chunk_bytes = writer.write_chunk(normalized_audio) + if chunk_bytes: + yield chunk_bytes + + # Finalize and yield remaining bytes + final_bytes = writer.write_chunk(finalize=True) + if final_bytes: + yield final_bytes + else: + raise ValueError("Failed to generate audio data") + + except Exception as e: + logger.error(f"Error in audio generation: {str(e)}") + # Clean up writer on error + writer.write_chunk(finalize=True) + # Re-raise the original exception + raise + + # Convert timestamps to JSON and add as header + import json + logger.debug(f"Processing {len(word_timestamps)} word timestamps") + timestamps_json = json.dumps([{ + 'word': str(ts['word']), # Ensure string for text + 'start_time': float(ts['start_time']), # Ensure float for timestamps + 'end_time': float(ts['end_time']) + } for ts in word_timestamps]) + logger.debug(f"Generated timestamps JSON: {timestamps_json}") + + return StreamingResponse( + generate_chunks(), + media_type=f"audio/{request.response_format}", + headers={ + "Content-Disposition": f"attachment; filename=speech.{request.response_format}", + "X-Accel-Buffering": "no", + "Cache-Control": "no-cache", + "Transfer-Encoding": "chunked", + "X-Word-Timestamps": timestamps_json + } + ) + + except ValueError as e: + logger.error(f"Error in captioned speech generation: {str(e)}") + raise HTTPException( + status_code=400, + detail={ + "error": "validation_error", + "message": str(e), + "type": "invalid_request_error" + } + ) + except Exception as e: + logger.error(f"Error in captioned speech generation: {str(e)}") + raise HTTPException( + status_code=500, + detail={ + "error": "processing_error", + "message": str(e), + "type": "server_error" + } + ) diff --git a/api/src/routers/openai_compatible.py b/api/src/routers/openai_compatible.py index cd4234e..1f2f92f 100644 --- a/api/src/routers/openai_compatible.py +++ b/api/src/routers/openai_compatible.py @@ -2,15 +2,19 @@ import json import os +import io +import tempfile from typing import AsyncGenerator, Dict, List, Union +import torch +import aiofiles from fastapi import APIRouter, Depends, Header, HTTPException, Request, Response from fastapi.responses import StreamingResponse, FileResponse from loguru import logger from ..services.audio import AudioService from ..services.tts_service import TTSService -from ..structures.schemas import OpenAISpeechRequest +from ..structures import OpenAISpeechRequest from ..core.config import settings # Load OpenAI mappings @@ -72,55 +76,65 @@ 
def get_model_name(model: str) -> str: async def process_voices( voice_input: Union[str, List[str]], tts_service: TTSService ) -> str: - """Process voice input into a combined voice, handling both string and list formats""" + """Process voice input, handling both string and list formats + + Returns: + Voice name to use (with weights if specified) + """ # Convert input to list of voices if isinstance(voice_input, str): # Check if it's an OpenAI voice name mapped_voice = _openai_mappings["voices"].get(voice_input) if mapped_voice: voice_input = mapped_voice - voices = [v.strip() for v in voice_input.split("+") if v.strip()] + # Split on + but preserve any parentheses + voices = [] + for part in voice_input.split("+"): + part = part.strip() + if not part: + continue + # Extract voice name without weight + voice_name = part.split("(")[0].strip() + # Check if it's a valid voice + available_voices = await tts_service.list_voices() + if voice_name not in available_voices: + raise ValueError( + f"Voice '{voice_name}' not found. Available voices: {', '.join(sorted(available_voices))}" + ) + voices.append(part) else: # For list input, map each voice if it's an OpenAI voice name - voices = [_openai_mappings["voices"].get(v, v) for v in voice_input] - voices = [v.strip() for v in voices if v.strip()] + voices = [] + for v in voice_input: + mapped = _openai_mappings["voices"].get(v, v) + voice_name = mapped.split("(")[0].strip() + # Check if it's a valid voice + available_voices = await tts_service.list_voices() + if voice_name not in available_voices: + raise ValueError( + f"Voice '{voice_name}' not found. Available voices: {', '.join(sorted(available_voices))}" + ) + voices.append(mapped) if not voices: raise ValueError("No voices provided") - # If single voice, validate and return it - if len(voices) == 1: - available_voices = await tts_service.list_voices() - if voices[0] not in available_voices: - raise ValueError( - f"Voice '{voices[0]}' not found. Available voices: {', '.join(sorted(available_voices))}" - ) - return voices[0] - - # For multiple voices, validate base voices exist - available_voices = await tts_service.list_voices() - for voice in voices: - if voice not in available_voices: - raise ValueError( - f"Base voice '{voice}' not found. 
Available voices: {', '.join(sorted(available_voices))}" - ) - - # Combine voices - return await tts_service.combine_voices(voices=voices) + # For multiple voices, combine them with + + return "+".join(voices) async def stream_audio_chunks( - tts_service: TTSService, + tts_service: TTSService, request: OpenAISpeechRequest, client_request: Request ) -> AsyncGenerator[bytes, None]: """Stream audio chunks as they're generated with client disconnect handling""" - voice_to_use = await process_voices(request.voice, tts_service) + voice_name = await process_voices(request.voice, tts_service) try: async for chunk in tts_service.generate_audio_stream( text=request.input, - voice=voice_to_use, + voice=voice_name, speed=request.speed, output_format=request.response_format, ): @@ -159,7 +173,7 @@ async def create_speech( try: # model_name = get_model_name(request.model) tts_service = await get_tts_service() - voice_to_use = await process_voices(request.voice, tts_service) + voice_name = await process_voices(request.voice, tts_service) # Set content type based on format content_type = { @@ -237,7 +251,7 @@ async def create_speech( # Generate complete audio using public interface audio, _ = await tts_service.generate_audio( text=request.input, - voice=voice_to_use, + voice=voice_name, speed=request.speed ) @@ -350,14 +364,14 @@ async def list_voices(): @router.post("/audio/voices/combine") async def combine_voices(request: Union[str, List[str]]): - """Combine multiple voices into a new voice. + """Combine multiple voices into a new voice and return the .pt file. Args: request: Either a string with voices separated by + (e.g. "voice1+voice2") or a list of voice names to combine Returns: - Dict with combined voice name and list of all available voices + FileResponse with the combined voice .pt file Raises: HTTPException: @@ -365,10 +379,51 @@ async def combine_voices(request: Union[str, List[str]]): - 500: Server error (file system issues, combination failed) """ try: + # Convert input to list of voices + if isinstance(request, str): + # Check if it's an OpenAI voice name + mapped_voice = _openai_mappings["voices"].get(request) + if mapped_voice: + request = mapped_voice + voices = [v.strip() for v in request.split("+") if v.strip()] + else: + # For list input, map each voice if it's an OpenAI voice name + voices = [_openai_mappings["voices"].get(v, v) for v in request] + voices = [v.strip() for v in voices if v.strip()] + + if not voices: + raise ValueError("No voices provided") + + # For multiple voices, validate base voices exist tts_service = await get_tts_service() - combined_voice = await process_voices(request, tts_service) - voices = await tts_service.list_voices() - return {"voices": voices, "voice": combined_voice} + available_voices = await tts_service.list_voices() + for voice in voices: + if voice not in available_voices: + raise ValueError( + f"Base voice '{voice}' not found. 
Available voices: {', '.join(sorted(available_voices))}" + ) + + # Combine voices + combined_tensor = await tts_service.combine_voices(voices=voices) + combined_name = "+".join(voices) + + # Save to temp file + temp_dir = tempfile.gettempdir() + voice_path = os.path.join(temp_dir, f"{combined_name}.pt") + buffer = io.BytesIO() + torch.save(combined_tensor, buffer) + async with aiofiles.open(voice_path, 'wb') as f: + await f.write(buffer.getvalue()) + + return FileResponse( + voice_path, + media_type="application/octet-stream", + filename=f"{combined_name}.pt", + headers={ + "Content-Disposition": f"attachment; filename={combined_name}.pt", + "Cache-Control": "no-cache" + } + ) except ValueError as e: logger.warning(f"Invalid voice combination request: {str(e)}") diff --git a/api/src/services/tts_service.py b/api/src/services/tts_service.py index 078314f..8497cc6 100644 --- a/api/src/services/tts_service.py +++ b/api/src/services/tts_service.py @@ -17,6 +17,7 @@ from .audio import AudioNormalizer, AudioService from .text_processing.text_processor import process_text_chunk, smart_split from .text_processing import tokenize from ..inference.kokoro_v1 import KokoroV1 +from kokoro import KPipeline class TTSService: @@ -154,23 +155,43 @@ class TTSService: try: # Check if it's a combined voice if "+" in voice: - voices = [v.strip() for v in voice.split("+") if v.strip()] - if len(voices) < 2: + # Split on + but preserve any parentheses + voice_parts = [] + weights = [] + for part in voice.split("+"): + part = part.strip() + if not part: + continue + # Extract voice name and weight if present + if "(" in part and ")" in part: + voice_name = part.split("(")[0].strip() + weight = float(part.split("(")[1].split(")")[0]) + else: + voice_name = part + weight = 1.0 + voice_parts.append(voice_name) + weights.append(weight) + + if len(voice_parts) < 2: raise RuntimeError(f"Invalid combined voice name: {voice}") + # Normalize weights to sum to 1 + total_weight = sum(weights) + weights = [w/total_weight for w in weights] + # Load and combine voices voice_tensors = [] - for v in voices: + for v, w in zip(voice_parts, weights): path = await self._voice_manager.get_voice_path(v) if not path: raise RuntimeError(f"Voice not found: {v}") logger.debug(f"Loading voice tensor from: {path}") voice_tensor = torch.load(path, map_location="cpu") - voice_tensors.append(voice_tensor) + voice_tensors.append(voice_tensor * w) - # Average the voice tensors - logger.debug(f"Combining {len(voice_tensors)} voice tensors") - combined = torch.mean(torch.stack(voice_tensors), dim=0) + # Sum the weighted voice tensors + logger.debug(f"Combining {len(voice_tensors)} voice tensors with weights {weights}") + combined = torch.sum(torch.stack(voice_tensors), dim=0) # Save combined tensor temp_dir = tempfile.gettempdir() @@ -259,43 +280,237 @@ class TTSService: raise async def generate_audio( - self, text: str, voice: str, speed: float = 1.0 - ) -> Tuple[np.ndarray, float]: + self, text: str, voice: str, speed: float = 1.0, return_timestamps: bool = False + ) -> Union[Tuple[np.ndarray, float], Tuple[np.ndarray, float, List[dict]]]: """Generate complete audio for text using streaming internally.""" start_time = time.time() chunks = [] + word_timestamps = [] try: - # Use streaming generator but collect all valid chunks - async for chunk in self.generate_audio_stream( - text, voice, speed, # Default to WAV for raw audio - ): - if chunk is not None: - chunks.append(chunk) + # Get backend and voice path + backend = 
self.model_manager.get_backend() + voice_name, voice_path = await self._get_voice_path(voice) - if not chunks: - raise ValueError("No audio chunks were generated successfully") + if isinstance(backend, KokoroV1): + # Initialize quiet pipeline for text chunking + quiet_pipeline = KPipeline(lang_code='a', model=False) + + # Split text into chunks and get initial tokens + text_chunks = [] + current_offset = 0.0 # Track time offset for timestamps + + logger.debug("Splitting text into chunks...") + for result in quiet_pipeline(text): + if result.graphemes and result.phonemes: + text_chunks.append((result.graphemes, result.phonemes)) + logger.debug(f"Split text into {len(text_chunks)} chunks") + + # Process each chunk + for chunk_idx, (chunk_text, chunk_phonemes) in enumerate(text_chunks): + logger.debug(f"Processing chunk {chunk_idx + 1}/{len(text_chunks)}: '{chunk_text[:50]}...'") + + # Generate audio and timestamps for this chunk + for result in backend._pipeline( + chunk_text, + voice=voice_path, + speed=speed, + model=backend._model + ): + # Collect audio chunks + if result.audio is not None: + chunks.append(result.audio.numpy()) + + # Process timestamps for this chunk + if return_timestamps and hasattr(result, 'tokens') and result.tokens: + logger.debug(f"Processing chunk timestamps with {len(result.tokens)} tokens") + if result.pred_dur is not None: + try: + # Join timestamps for this chunk's tokens + KPipeline.join_timestamps(result.tokens, result.pred_dur) + + # Add timestamps with offset + for token in result.tokens: + if not all(hasattr(token, attr) for attr in ['text', 'start_ts', 'end_ts']): + continue + if not token.text or not token.text.strip(): + continue + + # Apply offset to timestamps + start_time = float(token.start_ts) + current_offset + end_time = float(token.end_ts) + current_offset + + word_timestamps.append({ + 'word': str(token.text).strip(), + 'start_time': start_time, + 'end_time': end_time + }) + logger.debug(f"Added timestamp for word '{token.text}': {start_time:.3f}s - {end_time:.3f}s") + + # Update offset for next chunk based on pred_dur + chunk_duration = float(result.pred_dur.sum()) / 80 # Convert frames to seconds + current_offset = max(current_offset + chunk_duration, end_time) + logger.debug(f"Updated time offset to {current_offset:.3f}s") + + except Exception as e: + logger.error(f"Failed to process timestamps for chunk: {e}") + logger.debug(f"Processing timestamps with pred_dur shape: {result.pred_dur.shape}") + try: + # Join timestamps for this chunk's tokens + KPipeline.join_timestamps(result.tokens, result.pred_dur) + logger.debug("Successfully joined timestamps for chunk") + except Exception as e: + logger.error(f"Failed to join timestamps for chunk: {e}") + continue + + # Convert tokens to timestamps + for token in result.tokens: + try: + # Skip tokens without required attributes + if not all(hasattr(token, attr) for attr in ['text', 'start_ts', 'end_ts']): + logger.debug(f"Skipping token missing attributes: {dir(token)}") + continue + + # Get and validate text + text = str(token.text).strip() if token.text is not None else '' + if not text: + logger.debug("Skipping empty token") + continue + + # Get and validate timestamps + start_ts = getattr(token, 'start_ts', None) + end_ts = getattr(token, 'end_ts', None) + if start_ts is None or end_ts is None: + logger.debug(f"Skipping token with None timestamps: {text}") + continue + + # Convert timestamps to float + try: + start_time = float(start_ts) + end_time = float(end_ts) + except (TypeError, 
ValueError): + logger.debug(f"Skipping token with invalid timestamps: {text}") + continue + + # Add timestamp + word_timestamps.append({ + 'word': text, + 'start_time': start_time, + 'end_time': end_time + }) + logger.debug(f"Added timestamp for word '{text}': {start_time:.3f}s - {end_time:.3f}s") + except Exception as e: + logger.warning(f"Error processing token: {e}") + continue + + if not chunks: + raise ValueError("No audio chunks were generated successfully") + + # Combine chunks + audio = np.concatenate(chunks) if len(chunks) > 1 else chunks[0] + processing_time = time.time() - start_time + + if return_timestamps: + # Validate timestamps before returning + if not word_timestamps: + logger.warning("No valid timestamps were generated") + else: + # Sort timestamps by start time to ensure proper order + word_timestamps.sort(key=lambda x: x['start_time']) + # Validate timestamp sequence + for i in range(1, len(word_timestamps)): + prev = word_timestamps[i-1] + curr = word_timestamps[i] + if curr['start_time'] < prev['end_time']: + logger.warning(f"Overlapping timestamps detected: '{prev['word']}' ({prev['start_time']:.3f}-{prev['end_time']:.3f}) and '{curr['word']}' ({curr['start_time']:.3f}-{curr['end_time']:.3f})") + + logger.debug(f"Returning {len(word_timestamps)} word timestamps") + logger.debug(f"First timestamp: {word_timestamps[0]['word']} at {word_timestamps[0]['start_time']:.3f}s") + logger.debug(f"Last timestamp: {word_timestamps[-1]['word']} at {word_timestamps[-1]['end_time']:.3f}s") + + return audio, processing_time, word_timestamps + return audio, processing_time - # Combine chunks, ensuring we have valid arrays - if len(chunks) == 1: - audio = chunks[0] else: - # Filter out any zero-dimensional arrays - valid_chunks = [c for c in chunks if c.ndim > 0] - if not valid_chunks: - raise ValueError("No valid audio chunks to concatenate") - audio = np.concatenate(valid_chunks) - processing_time = time.time() - start_time - return audio, processing_time + # For legacy backends + async for chunk in self.generate_audio_stream( + text, voice, speed, # Default to WAV for raw audio + ): + if chunk is not None: + chunks.append(chunk) + + if not chunks: + raise ValueError("No audio chunks were generated successfully") + + # Combine chunks + audio = np.concatenate(chunks) if len(chunks) > 1 else chunks[0] + processing_time = time.time() - start_time + + if return_timestamps: + return audio, processing_time, [] # Empty timestamps for legacy backends + return audio, processing_time except Exception as e: logger.error(f"Error in audio generation: {str(e)}") raise - async def combine_voices(self, voices: List[str]) -> str: - """Combine multiple voices.""" + async def combine_voices(self, voices: List[str]) -> torch.Tensor: + """Combine multiple voices. + + Returns: + Combined voice tensor + """ return await self._voice_manager.combine_voices(voices) async def list_voices(self) -> List[str]: """List available voices.""" return await self._voice_manager.list_voices() + + async def generate_from_phonemes( + self, + phonemes: str, + voice: str, + speed: float = 1.0 + ) -> Tuple[np.ndarray, float]: + """Generate audio directly from phonemes. 
+ + Args: + phonemes: Phonemes in Kokoro format + voice: Voice name + speed: Speed multiplier + + Returns: + Tuple of (audio array, processing time) + """ + start_time = time.time() + try: + # Get backend and voice path + raise ValueError("Not yet implemented") + # linked to https://github.com/hexgrad/kokoro/pull/53 or similar + backend = self.model_manager.get_backend() + voice_name, voice_path = await self._get_voice_path(voice) + + # if isinstance(backend, KokoroV1): + # # For Kokoro V1, pass phonemes directly to pipeline + # result = None + # for r in backend._pipeline( + # phonemes, + # voice=voice_path, + # speed=speed, + # model=backend._model + # ): + # if r.audio is not None: + # result = r + # break + + # if result is None or result.audio is None: + # raise ValueError("No audio generated") + + # processing_time = time.time() - start_time + # return result.audio.numpy(), processing_time + # else: + pass + + except Exception as e: + logger.error(f"Error in phoneme audio generation: {str(e)}") + raise diff --git a/api/src/structures/__init__.py b/api/src/structures/__init__.py index e4933b4..327da94 100644 --- a/api/src/structures/__init__.py +++ b/api/src/structures/__init__.py @@ -1,3 +1,17 @@ -from .schemas import OpenAISpeechRequest +from .schemas import ( + OpenAISpeechRequest, + CaptionedSpeechRequest, + CaptionedSpeechResponse, + WordTimestamp, + TTSStatus, + VoiceCombineRequest +) -__all__ = ["OpenAISpeechRequest"] +__all__ = [ + "OpenAISpeechRequest", + "CaptionedSpeechRequest", + "CaptionedSpeechResponse", + "WordTimestamp", + "TTSStatus", + "VoiceCombineRequest" +] diff --git a/api/src/structures/schemas.py b/api/src/structures/schemas.py index 0ae50ce..1d69412 100644 --- a/api/src/structures/schemas.py +++ b/api/src/structures/schemas.py @@ -22,7 +22,19 @@ class TTSStatus(str, Enum): # OpenAI-compatible schemas +class WordTimestamp(BaseModel): + """Word-level timestamp information""" + word: str = Field(..., description="The word or token") + start_time: float = Field(..., description="Start time in seconds") + end_time: float = Field(..., description="End time in seconds") + +class CaptionedSpeechResponse(BaseModel): + """Response schema for captioned speech endpoint""" + audio: bytes = Field(..., description="The generated audio data") + words: List[WordTimestamp] = Field(..., description="Word-level timestamps") + class OpenAISpeechRequest(BaseModel): + """Request schema for OpenAI-compatible speech endpoint""" model: str = Field( default="kokoro", description="The model to use for generation. Supported models: tts-1, tts-1-hd, kokoro" ) @@ -50,3 +62,29 @@ class OpenAISpeechRequest(BaseModel): default=False, description="If true, returns a download link in X-Download-Path header after streaming completes", ) + +class CaptionedSpeechRequest(BaseModel): + """Request schema for captioned speech endpoint""" + model: str = Field( + default="kokoro", + description="The model to use for generation. Supported models: tts-1, tts-1-hd, kokoro" + ) + input: str = Field(..., description="The text to generate audio for") + voice: str = Field( + default="af", + description="The voice to use for generation. Can be a base voice or a combined voice name.", + ) + response_format: Literal["mp3", "opus", "aac", "flac", "wav", "pcm"] = Field( + default="mp3", + description="The format to return audio in. Supported formats: mp3, opus, flac, wav, pcm. PCM format returns raw 16-bit samples without headers.
AAC is not currently supported.", + ) + speed: float = Field( + default=1.0, + ge=0.25, + le=4.0, + description="The speed of the generated audio. Select a value from 0.25 to 4.0.", + ) + return_timestamps: bool = Field( + default=True, + description="If true (default), returns word-level timestamps in the response", + ) diff --git a/api/src/voices/v1_0/v0_af.pt b/api/src/voices/v1_0/af_v0.pt similarity index 99% rename from api/src/voices/v1_0/v0_af.pt rename to api/src/voices/v1_0/af_v0.pt index c7373936296af8bd031085dbbf3e5fec941b1321..6702c91af5ccd17fadd33d333b92b54a7020020f 100644 GIT binary patch delta 458 zcmX@ypm3={VSy&Be5)kJ z&mad?mQ-1inqQQXTIA*=HMxONda?tffiMq)1ClBF25wF^Ksnpy1B~ql7(ti`h?#+y z1&CRJm~Hz3Ms}?~qPz^OP>p4&Ma7xzeSN zT}lz;0u7)n2+JU=Vi5WVQ3mvy}pVW`e*DpbQ8Ec(Z{SAO1u5An5>aRyL3v LD-eRzL(~EQ7vyLK delta 444 zcmccApm4ZBVS?)b#&<*>x2q8O%XadLXyy z7pE3wCgx;TC6)k{#b>6tIT^!jVKFc;FflhYGB!6iv@`=^b5jc=GoYY>p}Dbvp`p2% zp^34fk&&TUfHylwU(m@J(>MHMmyiQ_Z7Bl-0|;{iF)En;@*lgp5D$vrS0Hb?>VI|( zAx;$WFh-zw(tmbUAubg03xC-erqB7$E+q%DRRbso!jdRzg#H25T>sDRD5(iF1q5~g WML;0Hn+?o(IMt9toC_oikpTb*TW(wc diff --git a/api/src/voices/v1_0/v0_af_bella.pt b/api/src/voices/v1_0/af_v0bella.pt similarity index 99% rename from api/src/voices/v1_0/v0_af_bella.pt rename to api/src/voices/v1_0/af_v0bella.pt index 83f79c22a282436ba25dbb1d35ad096da143e429..daed550f127831c5cde0e1bb9680aeacec5848cb 100644 GIT binary patch delta 491 zcmZ43sIZ|?VS<#52m>EOnL&JFT6|J!PEMkJN@7W(UO{$_n-dqqM6+VXe-l^Pv!uq# zDo=c?BqPcofucC6vLrRXC?&PX%}HcB<3D!M$qtMLl0pntD5^k4>KnK@nSqp;Hy>bZ zKfnmWOhC*G#4JF}3dC&N4=}Q8{gD!8ut(8fmReMtnV;w8WDPRFX1c>ab{%#l20f6X z=>`AT6&bZa650j^rsifArWO{a=EfGLmX>Bl<`zZ0?W@=_^8sN>& zQJ5O=efo!g>=KF~&n;zOU;tqeAVvisJJkNO>xzn^h<^q0KvJpy+4V$)P^7{bAyP~K zvulS5qexx&%Z|-w0p5&EAaxuZ5Y-^JX#klZtOOB)l0pB#2254paF+zx0|GyQVjvLU R%?4)pO>N*1=K{$>WB>$mcRBz7 delta 432 zcmdncsIah6VS*HkAcG*oL`7j%UWTZssEKC9jQ=ODvS)d9`P|EiZ#)c(7&1)N z=a6S{bJ7J#=^7Xqn3x+H8Jn9MTABf|xv7Pb8Box`(A?O-(8R#h+{o12%)rz#z?+@p zPDaeO=^y^FOUQwIwv>T^0fYsC7!^#{{Lii{B!nXP708>O{hwV=h#y5fj1efl?mxS> zkN}GKg}>|!)8GAPSC9kQssWS(VR;laLI2o6zHygy2O0tbKY#)t5a7)QX4p^d=Md)t H$wFiRE|X|h diff --git a/api/src/voices/v1_0/af_irulan.pt b/api/src/voices/v1_0/af_v0irulan.pt similarity index 99% rename from api/src/voices/v1_0/af_irulan.pt rename to api/src/voices/v1_0/af_v0irulan.pt index c9e1b7108bfbefd6d69258c48a3ce1a816767529..3d3ccc2ac46361c9043434a5e2dcdfc08181a253 100644 GIT binary patch delta 580 zcmX@osIawBVSF;vr%DoaxHi&9dH z+?<3!#zjR8RGgWg=jLPuGlhG)!#{Q%eq{z-WR=tF{;^9jX@V4KPCxLE zT~gf8*wVz<$il?b(9GP@*et-Cox?Qu{I}^J{;^AF!u<&JrzjAk0;Is3Zt|a9O;!v= z{u_{otfcBcySl6}ijpu!R3!)hv#Z35peVWUmmS%ZJbkEF1H2iTKngiHAj(0G)&Md= zSQ#P&C4>HHLDB)wexo5A=0#MSnk^(ntARsa>OnC}0NT~<+&K@h{dq{@=i{Gyc9 zA~z>qhRMc^wsK+&W*DkKw(1+WIT6clgJyBc#Tlg{F9V^*?qgCN+>cwdn``u}g{@ znVVZ0T9}wvnwlA#m|FyRvvU;8F3O(%;UBw%4&0kSuSx(hDnJdv>4yK=)f6OAl>7kl z&{UQDXIEDcLs1pRh@oo7e|D8raTHY-{<5Q)m8TE&d4M+~6G$xw2Sf+R0~$ak2&+Ma xpk&ZLbVH_Eb2wOK0bKwBKY-F85a7)QW+X8{Oaw^>c(a0%1OqD&g49FQ0szXnksSa4 diff --git a/api/src/voices/v1_0/v0_af_nicole.pt b/api/src/voices/v1_0/af_v0nicole.pt similarity index 99% rename from api/src/voices/v1_0/v0_af_nicole.pt rename to api/src/voices/v1_0/af_v0nicole.pt index c218f7d1eb757fd4c3b09b8090aae1963a93d18a..914738c6ef9697b536fa61f8bf32d8e4c97c79b7 100644 GIT binary patch delta 482 zcmZ41sIawBVS<#bC<8A;nL&JFT6|t+a(+&#eoA6VqFzCEj++xF!$k8U#(xu6*|Vg^ z$|_HMt0X7JAdad!sj?(BzbGZO$jwQZVe&*q;mHn+2GYU|mZ-`=hUy!*Ihlgg=uck2 z$ku#-vHbue2r~gOGZ3=?F)I+WZ9l-suJuPoguxEgkh0XG;>`R!HzzBYDcsW?{;}(@ 
[GIT binary patch payloads for the api/src/voices/v1_0 voice tensor renames listed in the diffstat above, and the literal dump for the deleted api/src/voices/v1_0/v0_af_irulan.pt (base85 binary data, not human-readable).]
z{x|=d|IPpAfAhcj-~4a>_ole!+C|lKrWXeOH~*Xe&Hv_q^S}Au{BQm@|C|5K|K@-5 zzxm(%Z~iy`oBz%K=700Q`QQ9+{x|=d|IPpAfAhcj-~4a>H~*Xe&Hv_q^S}Au{BQm@ z|C|5K|K@-5zxm(%Z~iy`oBz%K=700Q`QQ9+{x|=d|IPpAfAhcj-~4a>H~*Xe&Hv_q z^S}Au{BQm@|C|5K|K@-5zxm(%Z~iy`oBz%K=700Q`QQ9+{x|=d|IPpAfAhcj-~4a> zH~*Xe&Hv_q^S}Au{BQm@|C|5K|K@-5zxm(%Z~iy`oBz%K=700Q`QQ9+{x|=d|IPod z4#&l%QwPTGXZhd!Z~iy`oBz%K=700Q`QQ9+{x|=d|IPpAfAhcj-~4a>H~*Xe&Hv_q z^S}Au{BQm@|C|5K|K@-5zxm(%Z~iy`oBz%K=700Q`QQ9+{x|=d|IPpAfAhcj-~4a> zH~*Xe&Hv_q^S}Au{BQm@|C|5K|K@-5zxm(%Z~iy`oBz%K=700Q`QQ9+{x|=d|IPpA zfAhcj-~4a>H~*Xe&Hv_q^S}Au{BQm@|C|5K|K@-5zxm(%Z~iy`oBz%K=700Q`QQ9+ z{x|=d|IPpAfAhcj-~4a>H~*Xe&Hv_q^S}Au{BQm@|NB6wy!p*!P|>Q$|K@-5zxm(% zZ~iy`oBz%K=700Q`QQ9+{x|=d|IPpAfAhcj-~4a>H~*Xe&Hv_q^S}Au{BQm@|C|5K z|K@-5zxm(%Z~iy``!CP`uB$C69sKgSm8pXzsr+15+p8#*DlJVNGYv^iOf70_Zp_Rb zQ&H8LyBXU$YEh=6v$?IcX7c#d>_2Zhvn*eN=hw`?xG?vL`T3JeFd}vOF(s&IYUpYh z)tTvNZfI#<($JMF&D`dun#m*n!~%^TJAPc%*s7|rSrf*NJM*mZRryk0vASd7$Xp5Yb6ssgzOaATQr9&dxcY(I^Za~#{)Ho^qqkeW z&=prKY&km0%JeHki*xVH&qI%hus`?g=qU3RcV*f-nlc?nhxzyy`k5obTvBjsm=Cw$ z=%ZtN{3{u9M2zQtUvf-Q4*w$@9pvL%Y{U^k>OVL($p8Oc*3}jlpK1-VtrPy2I`^Hc79A3m6uD*fYe+^FaOc9{|gb2)Sv(W diff --git a/examples/assorted_checks/test_combinations/test_analyze_combined_voices.py b/examples/assorted_checks/test_combinations/test_analyze_combined_voices.py index c3496b6..11c3142 100644 --- a/examples/assorted_checks/test_combinations/test_analyze_combined_voices.py +++ b/examples/assorted_checks/test_combinations/test_analyze_combined_voices.py @@ -92,15 +92,16 @@ def analyze_audio(filepath: str): 'dominant_frequencies': dominant_freqs } -def plot_comparison(analyses, output_path): - """Create comparison plot of the audio analyses.""" +def plot_comparison(analyses, output_dir): + """Create detailed comparison plots of the audio analyses.""" plt.style.use('dark_background') - fig = plt.figure(figsize=(15, 10)) - fig.patch.set_facecolor('#1a1a2e') # Plot waveforms + fig_wave = plt.figure(figsize=(15, 10)) + fig_wave.patch.set_facecolor('#1a1a2e') + for i, (name, data) in enumerate(analyses.items()): - ax = plt.subplot(3, 1, i+1) + ax = plt.subplot(len(analyses), 1, i+1) samples = data['samples'] time = np.arange(len(samples)) / data['sample_rate'] plt.plot(time, samples / data['max_amplitude'], linewidth=0.5, color='#ff2a6d') @@ -112,27 +113,96 @@ def plot_comparison(analyses, output_path): plt.ylim(-1.1, 1.1) plt.tight_layout() - plt.savefig(output_path, dpi=300, bbox_inches='tight') - print(f"\nSaved comparison plot to {output_path}") + plt.savefig(output_dir / 'waveforms.png', dpi=300, bbox_inches='tight') + plt.close() + + # Plot spectral characteristics + fig_spec = plt.figure(figsize=(15, 10)) + fig_spec.patch.set_facecolor('#1a1a2e') + + for i, (name, data) in enumerate(analyses.items()): + # Calculate spectrogram + samples = data['samples'] + sample_rate = data['sample_rate'] + nperseg = 2048 + f, t, Sxx = plt.mlab.specgram(samples, NFFT=2048, Fs=sample_rate, + noverlap=nperseg//2, scale='dB') + + ax = plt.subplot(len(analyses), 1, i+1) + plt.pcolormesh(t, f, Sxx, shading='gouraud', cmap='magma') + plt.title(f"Spectrogram: {name}", color='white', pad=20) + plt.ylabel('Frequency [Hz]', color='white') + plt.xlabel('Time [sec]', color='white') + plt.colorbar(label='Intensity [dB]') + ax.set_facecolor('#1a1a2e') + + plt.tight_layout() + plt.savefig(output_dir / 'spectrograms.png', dpi=300, bbox_inches='tight') + plt.close() + + # Plot voice characteristics comparison + fig_chars = plt.figure(figsize=(15, 8)) + fig_chars.patch.set_facecolor('#1a1a2e') + + # Extract 
characteristics + names = list(analyses.keys()) + rms_values = [data['rms'] for data in analyses.values()] + centroids = [data['spectral_centroid'] for data in analyses.values()] + max_amps = [data['max_amplitude'] for data in analyses.values()] + + # Plot characteristics + x = np.arange(len(names)) + width = 0.25 + + ax = plt.subplot(111) + ax.bar(x - width, rms_values, width, label='RMS (Texture)', color='#ff2a6d') + ax.bar(x, [c/1000 for c in centroids], width, label='Spectral Centroid/1000 (Brightness)', color='#05d9e8') + ax.bar(x + width, max_amps, width, label='Max Amplitude', color='#ff65bd') + + ax.set_xticks(x) + ax.set_xticklabels(names, rotation=45, ha='right') + ax.legend() + ax.set_title('Voice Characteristics Comparison', color='white', pad=20) + ax.set_facecolor('#1a1a2e') + + plt.tight_layout() + plt.savefig(output_dir / 'characteristics.png', dpi=300, bbox_inches='tight') + plt.close() + + print(f"\nSaved comparison plots to {output_dir}") def main(): - # Generate audio for each voice + # Test different voice combinations with weights voices = { 'af_bella': output_dir / 'af_bella.wav', - 'af_irulan': output_dir / 'af_irulan.wav', - 'af_bella+af_irulan': output_dir / 'af_bella+af_irulan.wav' + 'af_kore': output_dir / 'af_kore.wav', + 'af_bella(0.2)+af_kore(0.8)': output_dir / 'af_bella_20_af_kore_80.wav', + 'af_bella(0.8)+af_kore(0.2)': output_dir / 'af_bella_80_af_kore_20.wav', + 'af_bella(0.5)+af_kore(0.5)': output_dir / 'af_bella_50_af_kore_50.wav' } + # Generate audio for each voice/combination for voice, path in voices.items(): - generate_and_save_audio(voice, str(path)) + try: + generate_and_save_audio(voice, str(path)) + except Exception as e: + print(f"Error generating audio for {voice}: {e}") + continue # Analyze each audio file analyses = {} for name, path in voices.items(): - analyses[name] = analyze_audio(str(path)) + try: + analyses[name] = analyze_audio(str(path)) + except Exception as e: + print(f"Error analyzing {name}: {e}") + continue - # Create comparison plot - plot_comparison(analyses, output_dir / 'voice_comparison.png') + # Create comparison plots + if analyses: + plot_comparison(analyses, output_dir) + else: + print("No analyses to plot") if __name__ == "__main__": main() diff --git a/examples/assorted_checks/test_combinations/test_download_voice.py b/examples/assorted_checks/test_combinations/test_download_voice.py new file mode 100644 index 0000000..d2b847c --- /dev/null +++ b/examples/assorted_checks/test_combinations/test_download_voice.py @@ -0,0 +1,74 @@ +#!/usr/bin/env python3 +import os +from pathlib import Path +import requests + +# Create output directory +output_dir = Path(__file__).parent / "output" +output_dir.mkdir(exist_ok=True) + +def download_combined_voice(voice1: str, voice2: str, weights: tuple[float, float] = None) -> str: + """Download a combined voice file. + + Args: + voice1: First voice name + voice2: Second voice name + weights: Optional tuple of weights (w1, w2). If not provided, uses equal weights. 
+ + Returns: + Path to downloaded .pt file + """ + print(f"\nDownloading combined voice: {voice1} + {voice2}") + + # Construct voice string with optional weights + if weights: + voice_str = f"{voice1}({weights[0]})+{voice2}({weights[1]})" + else: + voice_str = f"{voice1}+{voice2}" + + # Make the request to combine voices + response = requests.post( + "http://localhost:8880/v1/audio/voices/combine", + json=voice_str + ) + + if response.status_code != 200: + raise Exception(f"Failed to combine voices: {response.text}") + + # Save the .pt file + output_path = output_dir / f"{voice_str}.pt" + with open(output_path, "wb") as f: + f.write(response.content) + + print(f"Saved combined voice to {output_path}") + return str(output_path) + +def main(): + # Test downloading various voice combinations + combinations = [ + # Equal weights (default) + ("af_bella", "af_kore"), + + # Different weight combinations + ("af_bella", "af_kore", (0.2, 0.8)), + ("af_bella", "af_kore", (0.8, 0.2)), + ("af_bella", "af_kore", (0.5, 0.5)), + + # Test with different voices + ("af_bella", "af_jadzia"), + ("af_bella", "af_jadzia", (0.3, 0.7)) + ] + + for combo in combinations: + try: + if len(combo) == 3: + voice1, voice2, weights = combo + download_combined_voice(voice1, voice2, weights) + else: + voice1, voice2 = combo + download_combined_voice(voice1, voice2) + except Exception as e: + print(f"Error downloading combination {combo}: {e}") + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/examples/assorted_checks/test_voices/analyze_voice_dimensions.py b/examples/assorted_checks/test_voices/analyze_voice_dimensions.py new file mode 100644 index 0000000..6c7a7f8 --- /dev/null +++ b/examples/assorted_checks/test_voices/analyze_voice_dimensions.py @@ -0,0 +1,54 @@ +import os +import torch +from loguru import logger + +def analyze_voice_file(file_path): + """Analyze dimensions and statistics of a voice tensor.""" + try: + tensor = torch.load(file_path, map_location="cpu") + logger.info(f"\nAnalyzing {os.path.basename(file_path)}:") + logger.info(f"Shape: {tensor.shape}") + logger.info(f"Mean: {tensor.mean().item():.4f}") + logger.info(f"Std: {tensor.std().item():.4f}") + logger.info(f"Min: {tensor.min().item():.4f}") + logger.info(f"Max: {tensor.max().item():.4f}") + return tensor.shape + except Exception as e: + logger.error(f"Error analyzing {file_path}: {e}") + return None + +def main(): + """Analyze voice files in the voices directory.""" + # Get the project root directory + current_dir = os.path.dirname(os.path.abspath(__file__)) + project_root = os.path.dirname(os.path.dirname(os.path.dirname(current_dir))) + voices_dir = os.path.join(project_root, "api", "src", "voices", "v1_0") + + logger.info(f"Scanning voices in: {voices_dir}") + + # Track shapes for comparison + shapes = {} + + # Analyze each .pt file + for file in os.listdir(voices_dir): + if file.endswith('.pt'): + file_path = os.path.join(voices_dir, file) + shape = analyze_voice_file(file_path) + if shape: + shapes[file] = shape + + # Report findings + logger.info("\nShape Analysis:") + shape_groups = {} + for file, shape in shapes.items(): + if shape not in shape_groups: + shape_groups[shape] = [] + shape_groups[shape].append(file) + + for shape, files in shape_groups.items(): + logger.info(f"\nShape {shape}:") + for file in files: + logger.info(f" - {file}") + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/examples/assorted_checks/test_voices/trim_voice_dimensions.py 
b/examples/assorted_checks/test_voices/trim_voice_dimensions.py new file mode 100644 index 0000000..90f7013 --- /dev/null +++ b/examples/assorted_checks/test_voices/trim_voice_dimensions.py @@ -0,0 +1,85 @@ +import os +import torch +from loguru import logger + +def analyze_voice_content(tensor): + """Analyze the content distribution in the voice tensor.""" + # Look at the variance along the first dimension to see where the information is concentrated + variance = torch.var(tensor, dim=(1,2)) # Variance across features + logger.info(f"Variance distribution:") + logger.info(f"First 5 rows variance: {variance[:5].mean().item():.6f}") + logger.info(f"Last 5 rows variance: {variance[-5:].mean().item():.6f}") + return variance + +def trim_voice_tensor(tensor): + """Trim a 511x1x256 tensor to 510x1x256 by removing the row with least impact.""" + if tensor.shape[0] != 511: + raise ValueError(f"Expected tensor with first dimension 511, got {tensor.shape[0]}") + + # Analyze variance contribution of each row + variance = analyze_voice_content(tensor) + + # Determine which end has lower variance (less information) + start_var = variance[:5].mean().item() + end_var = variance[-5:].mean().item() + + # Remove from the end with lower variance + if end_var < start_var: + logger.info("Trimming last row (lower variance at end)") + return tensor[:-1] + else: + logger.info("Trimming first row (lower variance at start)") + return tensor[1:] + +def process_voice_file(file_path): + """Process a single voice file.""" + try: + tensor = torch.load(file_path, map_location="cpu") + if tensor.shape[0] != 511: + logger.info(f"Skipping {os.path.basename(file_path)} - already correct shape {tensor.shape}") + return False + + logger.info(f"\nProcessing {os.path.basename(file_path)}:") + logger.info(f"Original shape: {tensor.shape}") + + # Create backup + backup_path = file_path + ".backup" + if not os.path.exists(backup_path): + torch.save(tensor, backup_path) + logger.info(f"Created backup at {backup_path}") + + # Trim tensor + trimmed = trim_voice_tensor(tensor) + logger.info(f"New shape: {trimmed.shape}") + + # Save trimmed tensor + torch.save(trimmed, file_path) + logger.info(f"Saved trimmed tensor to {file_path}") + + return True + except Exception as e: + logger.error(f"Error processing {file_path}: {e}") + return False + +def main(): + """Process voice files in the voices directory.""" + # Get the project root directory + current_dir = os.path.dirname(os.path.abspath(__file__)) + project_root = os.path.dirname(os.path.dirname(os.path.dirname(current_dir))) + voices_dir = os.path.join(project_root, "api", "src", "voices", "v1_0") + + logger.info(f"Processing voices in: {voices_dir}") + + processed = 0 + for file in os.listdir(voices_dir): + if file.endswith('.pt') and not file.endswith('.backup'): + file_path = os.path.join(voices_dir, file) + if process_voice_file(file_path): + processed += 1 + + logger.info(f"\nProcessed {processed} voice files") + logger.info("Backups created with .backup extension") + logger.info("To restore backups if needed, remove .backup extension to replace trimmed files") + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/examples/captioned_speech_example.py b/examples/captioned_speech_example.py new file mode 100644 index 0000000..df208ab --- /dev/null +++ b/examples/captioned_speech_example.py @@ -0,0 +1,98 @@ +import json +from typing import Tuple, Optional, Dict, List +from pathlib import Path + +import requests + +# Get the directory this script is in 
+SCRIPT_DIR = Path(__file__).absolute().parent + +def generate_captioned_speech( + text: str, + voice: str = "af_bella", + speed: float = 1.0, + response_format: str = "wav" +) -> Tuple[Optional[bytes], Optional[List[Dict]]]: + """Generate audio with word-level timestamps.""" + response = requests.post( + "http://localhost:8880/dev/captioned_speech", + json={ + "model": "kokoro", + "input": text, + "voice": voice, + "speed": speed, + "response_format": response_format + } + ) + + print(f"Response status: {response.status_code}") + print(f"Response headers: {dict(response.headers)}") + + if response.status_code != 200: + print(f"Error response: {response.text}") + return None, None + + try: + # Get timestamps from header + timestamps_json = response.headers.get('X-Word-Timestamps', '[]') + word_timestamps = json.loads(timestamps_json) + + # Get audio bytes from content + audio_bytes = response.content + + if not audio_bytes: + print("Error: Empty audio content") + return None, None + + return audio_bytes, word_timestamps + except json.JSONDecodeError as e: + print(f"Error parsing timestamps: {e}") + return None, None + +def main(): + # Example texts to convert + examples = [ + "Hello world! Welcome to the captioned speech system.", + "The quick brown fox jumps over the lazy dog.", + """If you have access to a room where gasoline is stored, remember that gas vapor accumulating in a closed room will explode after a time if you leave a candle burning in the room. A good deal of evaporation, however, must occur from the gasoline tins into the air of the room. If removal of the tops of the tins does not expose enough gasoline to the air to ensure copious evaporation, you can open lightly constructed tins further with a knife, ice pick or sharpened nail file. Or puncture a tiny hole in the tank which will permit gasoline to leak out on the floor. This will greatly increase the rate of evaporation. Before you light your candle, be sure that windows are closed and the room is as air-tight as you can make it. 
If you can see that windows in a neighboring room are opened wide, you have a chance of setting a large fire which will not only destroy the gasoline but anything else nearby; when the gasoline explodes, the doors of the storage room will be blown open, a draft to the neighboring windows will be created which will whip up a fine conflagration""" + ] + + print("Generating captioned speech for example texts...\n") + + # Create output directory in same directory as script + output_dir = SCRIPT_DIR / "output" + output_dir.mkdir(exist_ok=True) + + for i, text in enumerate(examples): + print(f"\nExample {i+1}:") + print(f"Input text: {text}") + try: + # Generate audio and get timestamps + audio_bytes, word_timestamps = generate_captioned_speech(text) + + if not audio_bytes or not word_timestamps: + print("Error: No audio data or timestamps generated") + continue + + # Save audio file + audio_path = output_dir / f"captioned_example_{i+1}.wav" + with audio_path.open("wb") as f: + f.write(audio_bytes) + print(f"Audio saved to: {audio_path}") + + # Save timestamps to JSON + timestamps_path = output_dir / f"captioned_example_{i+1}_timestamps.json" + with timestamps_path.open("w") as f: + json.dump(word_timestamps, f, indent=2) + print(f"Timestamps saved to: {timestamps_path}") + + # Print timestamps + print("\nWord-level timestamps:") + for ts in word_timestamps: + print(f"{ts['word']}: {ts['start_time']:.3f}s - {ts['end_time']:.3f}s") + + except requests.RequestException as e: + print(f"Error: {e}\n") + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/pyproject.toml b/pyproject.toml index 571acd1..2f318de 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -37,8 +37,8 @@ dependencies = [ "semchunk>=3.0.1", "mutagen>=1.47.0", "psutil>=6.1.1", - "kokoro==0.3.5", - 'misaki[en,ja,ko,zh,vi]==0.6.7', + "kokoro==0.7.4", + 'misaki[en,ja,ko,zh,vi]==0.7.4', "spacy>=3.7.6", "en-core-web-sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl" ] diff --git a/web/src/components/VoiceSelector.js b/web/src/components/VoiceSelector.js index 4209378..4499d2f 100644 --- a/web/src/components/VoiceSelector.js +++ b/web/src/components/VoiceSelector.js @@ -30,6 +30,20 @@ export class VoiceSelector { } }); + // Weight adjustment and voice removal + this.elements.selectedVoices.addEventListener('input', (e) => { + if (e.target.type === 'number') { + const voice = e.target.dataset.voice; + let weight = parseFloat(e.target.value); + + // Ensure weight is between 0.1 and 10 + weight = Math.max(0.1, Math.min(10, weight)); + e.target.value = weight; + + this.voiceService.updateWeight(voice, weight); + } + }); + // Remove selected voice this.elements.selectedVoices.addEventListener('click', (e) => { if (e.target.classList.contains('remove-voice')) { @@ -73,12 +87,22 @@ export class VoiceSelector { } updateSelectedVoicesDisplay() { - const selectedVoices = this.voiceService.getSelectedVoices(); + const selectedVoices = this.voiceService.getSelectedVoiceWeights(); this.elements.selectedVoices.innerHTML = selectedVoices - .map(voice => ` + .map(({voice, weight}) => ` - ${voice} - × + ${voice} + + + + × `) .join(''); diff --git a/web/src/services/VoiceService.js b/web/src/services/VoiceService.js index 92cd4db..93a31a8 100644 --- a/web/src/services/VoiceService.js +++ b/web/src/services/VoiceService.js @@ -1,7 +1,7 @@ export class VoiceService { constructor() { this.availableVoices = []; - this.selectedVoices = new Set(); + 
this.selectedVoices = new Map(); // Changed to Map to store voice:weight pairs } async loadVoices() { @@ -39,16 +39,33 @@ export class VoiceService { } getSelectedVoices() { - return Array.from(this.selectedVoices); + return Array.from(this.selectedVoices.keys()); + } + + getSelectedVoiceWeights() { + return Array.from(this.selectedVoices.entries()).map(([voice, weight]) => ({ + voice, + weight + })); } getSelectedVoiceString() { - return Array.from(this.selectedVoices).join('+'); + return Array.from(this.selectedVoices.entries()) + .map(([voice, weight]) => `${voice}(${weight})`) + .join('+'); } - addVoice(voice) { + addVoice(voice, weight = 1) { if (this.availableVoices.includes(voice)) { - this.selectedVoices.add(voice); + this.selectedVoices.set(voice, parseFloat(weight) || 1); + return true; + } + return false; + } + + updateWeight(voice, weight) { + if (this.selectedVoices.has(voice)) { + this.selectedVoices.set(voice, parseFloat(weight) || 1); return true; } return false;
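The weighted combination syntax exercised by the test scripts above (for example af_bella(0.2)+af_kore(0.8)) can also be passed straight through as the voice field of a normal speech request, not only to the voices/combine endpoint. A minimal sketch in the style of the example scripts, assuming the OpenAI-compatible speech route is served at http://localhost:8880/v1/audio/speech and using an illustrative output filename:

import requests

# Request speech from a weighted blend of two voices.
# The endpoint path and output filename here are assumptions for illustration;
# the model name, port, and weight syntax follow the example scripts above.
response = requests.post(
    "http://localhost:8880/v1/audio/speech",
    json={
        "model": "kokoro",
        "input": "Testing a weighted blend of two voices.",
        "voice": "af_bella(0.2)+af_kore(0.8)",  # same weight syntax as the test scripts
        "speed": 1.0,
        "response_format": "wav",
    },
)
response.raise_for_status()

with open("weighted_blend.wav", "wb") as f:
    f.write(response.content)  # raw audio bytes returned by the server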