Merge branch 'master' into master

remsky 2025-02-12 23:31:13 -07:00 committed by GitHub
commit 694b7435f1
23 changed files with 693 additions and 85 deletions


@@ -29,8 +29,11 @@ class Settings(BaseSettings):
     target_min_tokens: int = 175  # Target minimum tokens per chunk
     target_max_tokens: int = 250  # Target maximum tokens per chunk
     absolute_max_tokens: int = 450  # Absolute maximum tokens per chunk
+    advanced_text_normalization: bool = True  # Preprocesses the text before misaki, which leads to more accurate results
-    gap_trim_ms: int = 250  # Amount to trim from streaming chunk ends in milliseconds
+    gap_trim_ms: int = 1  # Base amount to trim from streaming chunk ends in milliseconds
+    dynamic_gap_trim_padding_ms: int = 410  # Padding to add to dynamic gap trim
+    dynamic_gap_trim_padding_char_multiplier: dict[str, float] = {".": 1, "!": 0.9, "?": 1, ",": 0.8}

     # Web Player Settings
     enable_web_player: bool = True  # Whether to serve the web player UI
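The three new settings drive the dynamic gap-trim behaviour introduced in this commit: a base padding window (dynamic_gap_trim_padding_ms) is scaled by a per-character multiplier keyed on the chunk's final punctuation mark. A minimal standalone sketch of that arithmetic, using the default values above (the real logic lives in AudioNormalizer.find_first_last_non_silent further down):

    SAMPLE_RATE = 24000  # fixed output rate used throughout the service
    dynamic_gap_trim_padding_ms = 410
    char_multiplier = {".": 1, "!": 0.9, "?": 1, ",": 0.8}

    def pad_samples_for(chunk_text: str) -> int:
        """Padding in samples appended after a chunk, scaled by its last character."""
        multiplier = char_multiplier.get(chunk_text.strip()[-1:], 1)
        return int(dynamic_gap_trim_padding_ms * SAMPLE_RATE * multiplier / 1000)

    print(pad_samples_for("Hello!"))  # 410 ms * 0.9 -> 8856 samples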


@@ -150,7 +150,7 @@ class KokoroV1(BaseModelBackend):
             pipeline = self._get_pipeline(pipeline_lang_code)
             logger.debug(
-                f"Generating audio from tokens with lang_code '{pipeline_lang_code}': '{tokens[:100]}...'"
+                f"Generating audio from tokens with lang_code '{pipeline_lang_code}': '{tokens[:100]}{'...' if len(tokens) > 100 else ''}'"
             )
             for result in pipeline.generate_from_tokens(
                 tokens=tokens, voice=voice_path, speed=speed, model=self._model
@@ -198,7 +198,6 @@ class KokoroV1(BaseModelBackend):
         """
         if not self.is_loaded:
             raise RuntimeError("Model not loaded")
-
         try:
             # Memory management for GPU
             if self._device == "cuda":
@@ -243,7 +242,7 @@ class KokoroV1(BaseModelBackend):
             pipeline = self._get_pipeline(pipeline_lang_code)
             logger.debug(
-                f"Generating audio for text with lang_code '{pipeline_lang_code}': '{text[:100]}...'"
+                f"Generating audio for text with lang_code '{pipeline_lang_code}': '{text[:100]}{'...' if len(text) > 100 else ''}'"
             )
             for result in pipeline(
                 text, voice=voice_path, speed=speed, model=self._model


@@ -3,6 +3,7 @@
 import io
 import json
 import os
+import re
 import tempfile
 from typing import AsyncGenerator, Dict, List, Union
@@ -137,7 +138,8 @@ async def stream_audio_chunks(
         voice=voice_name,
         speed=request.speed,
         output_format=request.response_format,
-        lang_code=request.lang_code if request.lang_code else (settings.default_voice_code if settings.default_voice_code else voice_name[0].lower()),
+        lang_code=request.lang_code or settings.default_voice_code or voice_name[0].lower(),
+        normalization_options=request.normalization_options
     ):
         # Check if client is still connected
         is_disconnected = client_request.is_disconnected
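The rewritten lang_code line leans on Python's `or` returning its first truthy operand, which collapses the nested conditionals into a single fall-through chain. Illustrated with plain stand-in variables (the real values come from the request and settings objects; the voice name is hypothetical):

    request_lang_code = None      # client omitted the field
    default_voice_code = None     # no server-wide default configured
    voice_name = "af_bella"       # hypothetical voice name

    lang_code = request_lang_code or default_voice_code or voice_name[0].lower()
    print(lang_code)  # "a" -- falls through to the voice name's first letter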


@@ -4,10 +4,12 @@ import struct
 from io import BytesIO

 import numpy as np
+import math
 import scipy.io.wavfile as wavfile
 import soundfile as sf
 from loguru import logger
 from pydub import AudioSegment
+from torch import norm

 from ..core.config import settings
 from .streaming_audio_writer import StreamingAudioWriter
@@ -20,23 +22,66 @@ class AudioNormalizer:
         self.chunk_trim_ms = settings.gap_trim_ms
         self.sample_rate = 24000  # Sample rate of the audio
         self.samples_to_trim = int(self.chunk_trim_ms * self.sample_rate / 1000)
+        self.samples_to_pad_start = int(50 * self.sample_rate / 1000)
+
+    def find_first_last_non_silent(self, audio_data: np.ndarray, chunk_text: str, speed: float, silence_threshold_db: int = -45, is_last_chunk: bool = False) -> tuple[int, int]:
+        """Finds the indices of the first and last non-silent samples in audio data.
+
+        Args:
+            audio_data: Input audio data as numpy array
+            chunk_text: The text sent to the model to generate the resulting speech
+            speed: The speaking speed of the voice
+            silence_threshold_db: How quiet audio has to be to be considered silent
+            is_last_chunk: Whether this is the last chunk
+
+        Returns:
+            A tuple with the start and the end of the non-silent portion
+        """
+        pad_multiplier = 1
+        split_character = chunk_text.strip()
+        if len(split_character) > 0:
+            split_character = split_character[-1]
+            if split_character in settings.dynamic_gap_trim_padding_char_multiplier:
+                pad_multiplier = settings.dynamic_gap_trim_padding_char_multiplier[split_character]
+
+        if not is_last_chunk:
+            samples_to_pad_end = max(int((settings.dynamic_gap_trim_padding_ms * self.sample_rate * pad_multiplier) / 1000) - self.samples_to_pad_start, 0)
+        else:
+            samples_to_pad_end = self.samples_to_pad_start
+
+        # Convert dBFS threshold to amplitude
+        amplitude_threshold = np.iinfo(audio_data.dtype).max * (10 ** (silence_threshold_db / 20))
+
+        # Find the first samples above the silence threshold at the start and end of the audio
+        non_silent_index_start, non_silent_index_end = None, None
+        for i in range(len(audio_data)):
+            if audio_data[i] > amplitude_threshold:
+                non_silent_index_start = i
+                break
+        for i in range(len(audio_data) - 1, -1, -1):
+            if audio_data[i] > amplitude_threshold:
+                non_silent_index_end = i
+                break
+
+        # Handle the case where the entire audio is silent
+        if non_silent_index_start is None or non_silent_index_end is None:
+            return 0, len(audio_data)
+
+        return max(non_silent_index_start - self.samples_to_pad_start, 0), min(non_silent_index_end + math.ceil(samples_to_pad_end / speed), len(audio_data))
+
     async def normalize(self, audio_data: np.ndarray) -> np.ndarray:
-        """Convert audio data to int16 range and trim silence from start and end
+        """Convert audio data to int16 range

         Args:
             audio_data: Input audio data as numpy array

         Returns:
-            Normalized and trimmed audio data
+            Normalized audio data
         """
         if len(audio_data) == 0:
             raise ValueError("Empty audio data")

-        # Trim start and end if enough samples
-        if len(audio_data) > (2 * self.samples_to_trim):
-            audio_data = audio_data[self.samples_to_trim : -self.samples_to_trim]
-
         # Scale directly to int16 range with clipping
         return np.clip(audio_data * 32767, -32768, 32767).astype(np.int16)
@@ -71,6 +116,8 @@ class AudioService:
         audio_data: np.ndarray,
         sample_rate: int,
         output_format: str,
+        speed: float = 1,
+        chunk_text: str = "",
         is_first_chunk: bool = True,
         is_last_chunk: bool = False,
         normalizer: AudioNormalizer = None,
@@ -81,6 +128,8 @@ class AudioService:
             audio_data: Numpy array of audio samples
             sample_rate: Sample rate of the audio
             output_format: Target format (wav, mp3, ogg, pcm)
+            speed: The speaking speed of the voice
+            chunk_text: The text sent to the model to generate the resulting speech
             is_first_chunk: Whether this is the first chunk
             is_last_chunk: Whether this is the last chunk
             normalizer: Optional AudioNormalizer instance for consistent normalization
@@ -96,8 +145,10 @@ class AudioService:
             # Always normalize audio to ensure proper amplitude scaling
             if normalizer is None:
                 normalizer = AudioNormalizer()
+
             normalized_audio = await normalizer.normalize(audio_data)
+            normalized_audio = AudioService.trim_audio(normalized_audio, chunk_text, speed, is_last_chunk, normalizer)

             # Get or create format-specific writer
             writer_key = f"{output_format}_{sample_rate}"
             if is_first_chunk or writer_key not in AudioService._writers:
@@ -123,3 +174,27 @@ class AudioService:
             raise ValueError(
                 f"Failed to convert audio stream to {output_format}: {str(e)}"
             )
+
+    @staticmethod
+    def trim_audio(audio_data: np.ndarray, chunk_text: str = "", speed: float = 1, is_last_chunk: bool = False, normalizer: AudioNormalizer = None) -> np.ndarray:
+        """Trim silence from start and end
+
+        Args:
+            audio_data: Input audio data as numpy array
+            chunk_text: The text sent to the model to generate the resulting speech
+            speed: The speaking speed of the voice
+            is_last_chunk: Whether this is the last chunk
+            normalizer: Optional AudioNormalizer instance for consistent normalization
+
+        Returns:
+            Trimmed audio data
+        """
+        if normalizer is None:
+            normalizer = AudioNormalizer()
+
+        # Trim start and end if enough samples
+        if len(audio_data) > (2 * normalizer.samples_to_trim):
+            audio_data = audio_data[normalizer.samples_to_trim : -normalizer.samples_to_trim]
+
+        # Find non-silent portion and trim
+        start_index, end_index = normalizer.find_first_last_non_silent(audio_data, chunk_text, speed, is_last_chunk=is_last_chunk)
+        return audio_data[start_index:end_index]
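A usage sketch of the new two-stage path (normalize to int16 first, then trim), assuming a 24 kHz float32 chunk like the model backends produce; the numbers are illustrative and the calls are assumed to run inside an async context:

    import numpy as np

    chunk = np.concatenate([
        np.zeros(2400, dtype=np.float32),        # 100 ms of leading silence
        0.5 * np.ones(24000, dtype=np.float32),  # 1 s of "speech"
        np.zeros(2400, dtype=np.float32),        # 100 ms of trailing silence
    ])

    normalizer = AudioNormalizer()
    int16_audio = await normalizer.normalize(chunk)  # scaling only; trimming moved out
    trimmed = AudioService.trim_audio(
        int16_audio, chunk_text="Hello.", speed=1.0,
        is_last_chunk=False, normalizer=normalizer,
    )
    # Leading silence is cut back to ~50 ms of padding; trailing padding is
    # scaled by the "." multiplier from the new settings.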


@@ -6,6 +6,9 @@ Converts them into a format suitable for text-to-speech processing.
 import re
 from functools import lru_cache

+import inflect
+
+from ...structures.schemas import NormalizationOptions

 # Constants
 VALID_TLDS = [
@@ -50,6 +53,27 @@ VALID_TLDS = [
     "io",
 ]
+VALID_UNITS = {
+    "m": "meter", "cm": "centimeter", "mm": "millimeter", "km": "kilometer", "in": "inch", "ft": "foot", "yd": "yard", "mi": "mile",  # Length
+    "g": "gram", "kg": "kilogram", "mg": "milligram",  # Mass
+    "s": "second", "ms": "millisecond", "min": "minute", "h": "hour",  # Time
+    "l": "liter", "ml": "milliliter", "cl": "centiliter", "dl": "deciliter",  # Volume
+    "kph": "kilometer per hour", "mph": "mile per hour", "mi/h": "mile per hour", "m/s": "meter per second", "km/h": "kilometer per hour", "mm/s": "millimeter per second", "cm/s": "centimeter per second", "ft/s": "foot per second", "cm/h": "centimeter per hour",  # Speed
+    "°c": "degree celsius", "c": "degree celsius", "°f": "degree fahrenheit", "f": "degree fahrenheit", "k": "kelvin",  # Temperature
+    "pa": "pascal", "kpa": "kilopascal", "mpa": "megapascal", "atm": "atmosphere",  # Pressure
+    "hz": "hertz", "khz": "kilohertz", "mhz": "megahertz", "ghz": "gigahertz",  # Frequency
+    "v": "volt", "kv": "kilovolt", "mv": "megavolt",  # Voltage
+    "a": "amp", "ma": "megaamp", "ka": "kiloamp",  # Current
+    "w": "watt", "kw": "kilowatt", "mw": "megawatt",  # Power
+    "j": "joule", "kj": "kilojoule", "mj": "megajoule",  # Energy
+    "Ω": "ohm", "kΩ": "kiloohm", "MΩ": "megaohm",  # Resistance (Ohm)
+    "f": "farad", "µf": "microfarad", "nf": "nanofarad", "pf": "picofarad",  # Capacitance
+    "b": "bit", "kb": "kilobit", "mb": "megabit", "gb": "gigabit", "tb": "terabit", "pb": "petabit",  # Data size
+    "kbps": "kilobit per second", "mbps": "megabit per second", "gbps": "gigabit per second", "tbps": "terabit per second",
+    "px": "pixel",  # CSS units
+}
 # Pre-compiled regex patterns for performance
 EMAIL_PATTERN = re.compile(
     r"\b[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-z]{2,}\b", re.IGNORECASE
@@ -61,6 +85,9 @@ URL_PATTERN = re.compile(
     re.IGNORECASE,
 )

+UNIT_PATTERN = re.compile(r"((?<!\w)([+-]?)(\d{1,3}(,\d{3})*|\d+)(\.\d+)?)\s*(" + "|".join(sorted(list(VALID_UNITS.keys()), reverse=True)) + r"""){1}(?=[^\w\d]{1}|\b)""", re.IGNORECASE)
+
+INFLECT_ENGINE = inflect.engine()
 def split_num(num: re.Match[str]) -> str:
     """Handle number splitting for various formats"""
@@ -86,6 +113,23 @@ def split_num(num: re.Match[str]) -> str:
         return f"{left} oh {right}{s}"
     return f"{left} {right}{s}"

+def handle_units(u: re.Match[str]) -> str:
+    """Converts units to their full form"""
+    unit_string = u.group(6).strip()
+    unit = unit_string
+    if unit_string.lower() in VALID_UNITS:
+        unit = VALID_UNITS[unit_string.lower()].split(" ")
+
+        # Handles the B vs b case (bytes vs bits)
+        if unit[0].endswith("bit"):
+            b_case = unit_string[min(1, len(unit_string) - 1)]
+            if b_case == "B":
+                unit[0] = unit[0][:-3] + "byte"
+
+        number = u.group(1).strip()
+        unit[0] = INFLECT_ENGINE.no(unit[0], number)
+    return " ".join(unit)
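What the unit pass produces once wired up, sketched with a few inputs (outputs assume inflect's no() pluralizes as documented; note how an uppercase B flips the data-size words from bits to bytes):

    examples = [
        "The download is 10MB",   # "B" after the prefix -> bytes, not bits
        "It draws 2a at 5v",      # lookups are case-insensitive via .lower()
        "Top speed is 88mph",
    ]
    for text in examples:
        print(UNIT_PATTERN.sub(handle_units, text))
    # The download is 10 megabytes
    # It draws 2 amps at 5 volts
    # Top speed is 88 miles per hour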
 def handle_money(m: re.Match[str]) -> str:
     """Convert money expressions to spoken form"""

@@ -171,30 +215,32 @@ def handle_url(u: re.Match[str]) -> str:
     return re.sub(r"\s+", " ", url).strip()

-def normalize_urls(text: str) -> str:
-    """Pre-process URLs before other text normalization"""
-    # Handle email addresses first
-    text = EMAIL_PATTERN.sub(handle_email, text)
-
-    # Handle URLs
-    text = URL_PATTERN.sub(handle_url, text)
-
-    return text
-
-def normalize_text(text: str) -> str:
+def normalize_text(text: str, normalization_options: NormalizationOptions) -> str:
     """Normalize text for TTS processing"""
-    # Pre-process URLs first
-    text = normalize_urls(text)
+    # Handle email addresses first if enabled
+    if normalization_options.email_normalization:
+        text = EMAIL_PATTERN.sub(handle_email, text)
+
+    # Handle URLs if enabled
+    if normalization_options.url_normalization:
+        text = URL_PATTERN.sub(handle_url, text)
+
+    # Pre-process numbers with units if enabled
+    if normalization_options.unit_normalization:
+        text = UNIT_PATTERN.sub(handle_units, text)
+
+    # Replace optional pluralization
+    if normalization_options.optional_pluralization_normalization:
+        text = re.sub(r"\(s\)", "s", text)

     # Replace quotes and brackets
     text = text.replace(chr(8216), "'").replace(chr(8217), "'")
     text = text.replace("«", chr(8220)).replace("»", chr(8221))
     text = text.replace(chr(8220), '"').replace(chr(8221), '"')
     text = text.replace("(", "«").replace(")", "»")

-    # Handle CJK punctuation
-    for a, b in zip("、。!,:;?", ",.!,:;?"):
+    # Handle CJK punctuation and some non-standard chars
+    for a, b in zip("、。!,:;?", ",.!,:;?-"):
         text = text.replace(a, b + " ")

     # Clean up whitespace
@@ -216,12 +262,14 @@ def normalize_text(text: str, normalization_options: NormalizationOptions) -> str:
     text = re.sub(
         r"\d*\.\d+|\b\d{4}s?\b|(?<!:)\b(?:[1-9]|1[0-2]):[0-5]\d\b(?!:)", split_num, text
     )
+
     text = re.sub(r"(?<=\d),(?=\d)", "", text)
+
     text = re.sub(
         r"(?i)[$£]\d+(?:\.\d+)?(?: hundred| thousand| (?:[bm]|tr)illion)*\b|[$£]\d+\.\d\d?\b",
         handle_money,
         text,
     )
     text = re.sub(r"\d*\.\d+", handle_decimal, text)

     # Handle various formatting
@@ -232,6 +280,6 @@ def normalize_text(text: str, normalization_options: NormalizationOptions) -> str:
     text = re.sub(
         r"(?:[A-Za-z]\.){2,} [a-z]", lambda m: m.group().replace(".", "-"), text
     )
     text = re.sub(r"(?i)(?<=[A-Z])\.(?=[A-Z])", "-", text)

     return text.strip()
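End-to-end, the reworked entry point takes the options object, so individual passes can be toggled per request. A small sketch of the expected behaviour (output shown is approximate):

    opts = NormalizationOptions(unit_normalization=True)
    print(normalize_text("Email user@example.com about the 10MB file", opts))
    # -> "Email user at example dot com about the 10 megabytes file"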


@@ -10,7 +10,7 @@ from ...core.config import settings
 from .normalizer import normalize_text
 from .phonemizer import phonemize
 from .vocabulary import tokenize
+from ...structures.schemas import NormalizationOptions

 def process_text_chunk(
     text: str, language: str = "a", skip_phonemize: bool = False
@@ -26,7 +26,7 @@ def process_text_chunk(
         List of token IDs
     """
     start_time = time.time()

     if skip_phonemize:
         # Input is already phonemes, just tokenize
         t0 = time.time()
@@ -35,12 +35,11 @@ def process_text_chunk(
     else:
         # Normal text processing pipeline
         t0 = time.time()
-        normalized = normalize_text(text)
         t1 = time.time()

         t0 = time.time()
         phonemes = phonemize(
-            normalized, language, normalize=False
+            text, language, normalize=False
         )  # Already normalized
         t1 = time.time()
@@ -50,7 +49,7 @@ def process_text_chunk(
     total_time = time.time() - start_time
     logger.debug(
-        f"Total processing took {total_time * 1000:.2f}ms for chunk: '{text[:50]}...'"
+        f"Total processing took {total_time * 1000:.2f}ms for chunk: '{text[:50]}{'...' if len(text) > 50 else ''}'"
     )

     return tokens
@@ -61,7 +60,7 @@ async def yield_chunk(
 ) -> Tuple[str, List[int]]:
     """Yield a chunk with consistent logging."""
     logger.debug(
-        f"Yielding chunk {chunk_count}: '{text[:50]}...' ({len(tokens)} tokens)"
+        f"Yielding chunk {chunk_count}: '{text[:50]}{'...' if len(text) > 50 else ''}' ({len(tokens)} tokens)"
     )
     return text, tokens
@@ -88,9 +87,8 @@ def process_text(text: str, language: str = "a") -> List[int]:

 def get_sentence_info(text: str) -> List[Tuple[str, List[int], int]]:
     """Process all sentences and return info."""
-    sentences = re.split(r"([.!?;:])", text)
+    sentences = re.split(r"([.!?;:])(?=\s|$)", text)
     results = []
-
     for i in range(0, len(sentences), 2):
         sentence = sentences[i].strip()
         punct = sentences[i + 1] if i + 1 < len(sentences) else ""
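The added (?=\s|$) lookahead only treats punctuation as a sentence boundary when whitespace or end-of-string follows, which stops decimals and dotted domains from being split mid-token. A quick comparison:

    import re

    sample = "Pi is 3.14. Check example.com."
    print(re.split(r"([.!?;:])", sample))          # old: splits inside 3.14 and example.com
    print(re.split(r"([.!?;:])(?=\s|$)", sample))  # new: ['Pi is 3.14', '.', ' Check example.com', '.', '']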
@@ -106,13 +104,19 @@ def get_sentence_info(text: str) -> List[Tuple[str, List[int], int]]:

 async def smart_split(
-    text: str, max_tokens: int = settings.absolute_max_tokens
+    text: str,
+    max_tokens: int = settings.absolute_max_tokens,
+    normalization_options: NormalizationOptions = NormalizationOptions()
 ) -> AsyncGenerator[Tuple[str, List[int]], None]:
     """Build optimal chunks targeting 300-400 tokens, never exceeding max_tokens."""
     start_time = time.time()
     chunk_count = 0
     logger.info(f"Starting smart split for {len(text)} chars")

+    # Normalize text
+    if settings.advanced_text_normalization and normalization_options.normalize:
+        text = normalize_text(text, normalization_options)
+
     # Process all sentences
     sentences = get_sentence_info(text)
@@ -128,7 +132,7 @@ async def smart_split(
             chunk_text = " ".join(current_chunk)
             chunk_count += 1
             logger.debug(
-                f"Yielding chunk {chunk_count}: '{chunk_text[:50]}...' ({current_count} tokens)"
+                f"Yielding chunk {chunk_count}: '{chunk_text[:50]}{'...' if len(chunk_text) > 50 else ''}' ({current_count} tokens)"
             )
             yield chunk_text, current_tokens
             current_chunk = []
@@ -149,6 +153,7 @@ async def smart_split(
                     continue

                 full_clause = clause + comma
+
                 tokens = process_text_chunk(full_clause)
                 count = len(tokens)
@@ -166,7 +171,7 @@ async def smart_split(
                         chunk_text = " ".join(clause_chunk)
                         chunk_count += 1
                         logger.debug(
-                            f"Yielding clause chunk {chunk_count}: '{chunk_text[:50]}...' ({clause_count} tokens)"
+                            f"Yielding clause chunk {chunk_count}: '{chunk_text[:50]}{'...' if len(chunk_text) > 50 else ''}' ({clause_count} tokens)"
                         )
                         yield chunk_text, clause_tokens
                         clause_chunk = [full_clause]
@@ -178,7 +183,7 @@ async def smart_split(
                     chunk_text = " ".join(clause_chunk)
                     chunk_count += 1
                     logger.debug(
-                        f"Yielding final clause chunk {chunk_count}: '{chunk_text[:50]}...' ({clause_count} tokens)"
+                        f"Yielding final clause chunk {chunk_count}: '{chunk_text[:50]}{'...' if len(chunk_text) > 50 else ''}' ({clause_count} tokens)"
                     )
                     yield chunk_text, clause_tokens
@@ -192,7 +197,7 @@ async def smart_split(
             chunk_text = " ".join(current_chunk)
             chunk_count += 1
             logger.info(
-                f"Yielding chunk {chunk_count}: '{chunk_text[:50]}...' ({current_count} tokens)"
+                f"Yielding chunk {chunk_count}: '{chunk_text[:50]}{'...' if len(chunk_text) > 50 else ''}' ({current_count} tokens)"
             )
             yield chunk_text, current_tokens
             current_chunk = [sentence]
@@ -217,7 +222,7 @@ async def smart_split(
             chunk_text = " ".join(current_chunk)
             chunk_count += 1
             logger.info(
-                f"Yielding chunk {chunk_count}: '{chunk_text[:50]}...' ({current_count} tokens)"
+                f"Yielding chunk {chunk_count}: '{chunk_text[:50]}{'...' if len(chunk_text) > 50 else ''}' ({current_count} tokens)"
             )
             yield chunk_text, current_tokens
             current_chunk = [sentence]
@@ -229,7 +234,7 @@ async def smart_split(
         chunk_text = " ".join(current_chunk)
         chunk_count += 1
         logger.info(
-            f"Yielding final chunk {chunk_count}: '{chunk_text[:50]}...' ({current_count} tokens)"
+            f"Yielding final chunk {chunk_count}: '{chunk_text[:50]}{'...' if len(chunk_text) > 50 else ''}' ({current_count} tokens)"
         )
         yield chunk_text, current_tokens
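A hypothetical driver for the new keyword path through smart_split (the surrounding event loop and input text are assumptions; normalization now happens once up front rather than per chunk inside process_text_chunk):

    import asyncio

    async def demo(long_text: str) -> None:
        opts = NormalizationOptions(normalize=True, unit_normalization=True)
        async for chunk_text, tokens in smart_split(long_text, normalization_options=opts):
            print(f"{len(tokens):4d} tokens: {chunk_text[:60]!r}")

    asyncio.run(demo("Visit example.com to grab the 10MB build. " * 50))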


@@ -18,7 +18,7 @@ from ..inference.voice_manager import get_manager as get_voice_manager
 from .audio import AudioNormalizer, AudioService
 from .text_processing import tokenize
 from .text_processing.text_processor import process_text_chunk, smart_split
+from ..structures.schemas import NormalizationOptions

 class TTSService:
     """Text-to-speech service."""
@@ -67,6 +67,8 @@ class TTSService:
                     np.array([0], dtype=np.float32),  # Dummy data for type checking
                     24000,
                     output_format,
+                    speed,
+                    "",
                     is_first_chunk=False,
                     normalizer=normalizer,
                     is_last_chunk=True,
@@ -97,15 +99,22 @@ class TTSService:
                             chunk_audio,
                             24000,
                             output_format,
+                            speed,
+                            chunk_text,
                             is_first_chunk=is_first,
-                            normalizer=normalizer,
                             is_last_chunk=is_last,
+                            normalizer=normalizer,
                         )
                         yield converted
                     except Exception as e:
                         logger.error(f"Failed to convert audio: {str(e)}")
                 else:
-                    yield chunk_audio
+                    trimmed = AudioService.trim_audio(
+                        chunk_audio, chunk_text, speed, is_last, normalizer
+                    )
+                    yield trimmed
             else:
                 # For legacy backends, load voice tensor
                 voice_tensor = await self._voice_manager.load_voice(
@@ -130,6 +139,8 @@ class TTSService:
                             chunk_audio,
                             24000,
                             output_format,
+                            speed,
+                            chunk_text,
                             is_first_chunk=is_first,
                             normalizer=normalizer,
                             is_last_chunk=is_last,
@@ -138,7 +149,12 @@ class TTSService:
                     except Exception as e:
                         logger.error(f"Failed to convert audio: {str(e)}")
                 else:
-                    yield chunk_audio
+                    trimmed = AudioService.trim_audio(
+                        chunk_audio, chunk_text, speed, is_last, normalizer
+                    )
+                    yield trimmed

         except Exception as e:
             logger.error(f"Failed to process tokens: {str(e)}")
@@ -222,6 +238,7 @@ class TTSService:
         speed: float = 1.0,
         output_format: str = "wav",
         lang_code: Optional[str] = None,
+        normalization_options: Optional[NormalizationOptions] = NormalizationOptions()
     ) -> AsyncGenerator[bytes, None]:
         """Generate and stream audio chunks."""
         stream_normalizer = AudioNormalizer()
@@ -242,7 +259,7 @@ class TTSService:
             )

             # Process text in chunks with smart splitting
-            async for chunk_text, tokens in smart_split(text):
+            async for chunk_text, tokens in smart_split(text, normalization_options=normalization_options):
                 try:
                     # Process audio for chunk
                     async for result in self._process_chunk(


@@ -36,7 +36,14 @@ class CaptionedSpeechResponse(BaseModel):
     audio: bytes = Field(..., description="The generated audio data")
     words: List[WordTimestamp] = Field(..., description="Word-level timestamps")

+class NormalizationOptions(BaseModel):
+    """Options for the normalization system"""
+    normalize: bool = Field(default=True, description="Normalizes input text to make it easier for the model to say")
+    unit_normalization: bool = Field(default=False, description="Transforms units like 10KB to 10 kilobytes")
+    url_normalization: bool = Field(default=True, description="Changes URLs so they can be properly pronounced by kokoro")
+    email_normalization: bool = Field(default=True, description="Changes emails so they can be properly pronounced by kokoro")
+    optional_pluralization_normalization: bool = Field(default=True, description="Replaces (s) with s so some words get pronounced correctly")

 class OpenAISpeechRequest(BaseModel):
     """Request schema for OpenAI-compatible speech endpoint"""
@@ -71,6 +78,10 @@ class OpenAISpeechRequest(BaseModel):
         default=None,
         description="Optional language code to use for text processing. If not provided, will use first letter of voice name.",
     )
+    normalization_options: Optional[NormalizationOptions] = Field(
+        default=NormalizationOptions(),
+        description="Options for the normalization system",
+    )

 class CaptionedSpeechRequest(BaseModel):
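For clients, the new field rides along in the OpenAI-compatible request body. A sketch of a request exercising it (the endpoint path and voice name follow this project's conventions but are assumptions here):

    payload = {
        "model": "kokoro",
        "input": "Visit example.com(s) to download the 10MB file.",
        "voice": "af_bella",
        "response_format": "mp3",
        "normalization_options": {
            "unit_normalization": True,
            "optional_pluralization_normalization": True,
        },
    }
    # e.g. requests.post("http://localhost:8880/v1/audio/speech", json=payload)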


@@ -3,29 +3,29 @@
 import pytest

 from api.src.services.text_processing.normalizer import normalize_text
+from api.src.structures.schemas import NormalizationOptions

 def test_url_protocols():
     """Test URL protocol handling"""
     assert (
-        normalize_text("Check out https://example.com")
+        normalize_text("Check out https://example.com", normalization_options=NormalizationOptions())
         == "Check out https example dot com"
     )
-    assert normalize_text("Visit http://site.com") == "Visit http site dot com"
+    assert normalize_text("Visit http://site.com", normalization_options=NormalizationOptions()) == "Visit http site dot com"
     assert (
-        normalize_text("Go to https://test.org/path")
+        normalize_text("Go to https://test.org/path", normalization_options=NormalizationOptions())
         == "Go to https test dot org slash path"
     )

 def test_url_www():
     """Test www prefix handling"""
-    assert normalize_text("Go to www.example.com") == "Go to www example dot com"
+    assert normalize_text("Go to www.example.com", normalization_options=NormalizationOptions()) == "Go to www example dot com"
     assert (
-        normalize_text("Visit www.test.org/docs") == "Visit www test dot org slash docs"
+        normalize_text("Visit www.test.org/docs", normalization_options=NormalizationOptions()) == "Visit www test dot org slash docs"
     )
     assert (
-        normalize_text("Check www.site.com?q=test")
+        normalize_text("Check www.site.com?q=test", normalization_options=NormalizationOptions())
         == "Check www site dot com question-mark q equals test"
     )

@@ -33,15 +33,15 @@ def test_url_www():
 def test_url_localhost():
     """Test localhost URL handling"""
     assert (
-        normalize_text("Running on localhost:7860")
+        normalize_text("Running on localhost:7860", normalization_options=NormalizationOptions())
         == "Running on localhost colon 78 60"
     )
     assert (
-        normalize_text("Server at localhost:8080/api")
+        normalize_text("Server at localhost:8080/api", normalization_options=NormalizationOptions())
         == "Server at localhost colon 80 80 slash api"
     )
     assert (
-        normalize_text("Test localhost:3000/test?v=1")
+        normalize_text("Test localhost:3000/test?v=1", normalization_options=NormalizationOptions())
         == "Test localhost colon 3000 slash test question-mark v equals 1"
     )

@@ -49,43 +49,43 @@ def test_url_localhost():
 def test_url_ip_addresses():
     """Test IP address URL handling"""
     assert (
-        normalize_text("Access 0.0.0.0:9090/test")
+        normalize_text("Access 0.0.0.0:9090/test", normalization_options=NormalizationOptions())
         == "Access 0 dot 0 dot 0 dot 0 colon 90 90 slash test"
     )
     assert (
-        normalize_text("API at 192.168.1.1:8000")
+        normalize_text("API at 192.168.1.1:8000", normalization_options=NormalizationOptions())
         == "API at 192 dot 168 dot 1 dot 1 colon 8000"
     )
-    assert normalize_text("Server 127.0.0.1") == "Server 127 dot 0 dot 0 dot 1"
+    assert normalize_text("Server 127.0.0.1", normalization_options=NormalizationOptions()) == "Server 127 dot 0 dot 0 dot 1"

 def test_url_raw_domains():
     """Test raw domain handling"""
     assert (
-        normalize_text("Visit google.com/search") == "Visit google dot com slash search"
+        normalize_text("Visit google.com/search", normalization_options=NormalizationOptions()) == "Visit google dot com slash search"
     )
     assert (
-        normalize_text("Go to example.com/path?q=test")
+        normalize_text("Go to example.com/path?q=test", normalization_options=NormalizationOptions())
         == "Go to example dot com slash path question-mark q equals test"
     )
-    assert normalize_text("Check docs.test.com") == "Check docs dot test dot com"
+    assert normalize_text("Check docs.test.com", normalization_options=NormalizationOptions()) == "Check docs dot test dot com"

 def test_url_email_addresses():
     """Test email address handling"""
     assert (
-        normalize_text("Email me at user@example.com")
+        normalize_text("Email me at user@example.com", normalization_options=NormalizationOptions())
         == "Email me at user at example dot com"
     )
-    assert normalize_text("Contact admin@test.org") == "Contact admin at test dot org"
+    assert normalize_text("Contact admin@test.org", normalization_options=NormalizationOptions()) == "Contact admin at test dot org"
     assert (
-        normalize_text("Send to test.user@site.com")
+        normalize_text("Send to test.user@site.com", normalization_options=NormalizationOptions())
         == "Send to test dot user at site dot com"
     )

 def test_non_url_text():
     """Test that non-URL text is unaffected"""
-    assert normalize_text("This is not.a.url text") == "This is not-a-url text"
-    assert normalize_text("Hello, how are you today?") == "Hello, how are you today?"
-    assert normalize_text("It costs $50.") == "It costs 50 dollars."
+    assert normalize_text("This is not.a.url text", normalization_options=NormalizationOptions()) == "This is not-a-url text"
+    assert normalize_text("Hello, how are you today?", normalization_options=NormalizationOptions()) == "Hello, how are you today?"
+    assert normalize_text("It costs $50.", normalization_options=NormalizationOptions()) == "It costs 50 dollars."


@ -0,0 +1,23 @@
# Patterns to ignore when building packages.
# This supports shell glob matching, relative path matching, and
# negation (prefixed with !). Only one pattern per line.
.DS_Store
# Common VCS dirs
.git/
.gitignore
.bzr/
.bzrignore
.hg/
.hgignore
.svn/
# Common backup files
*.swp
*.bak
*.tmp
*.orig
*~
# Various IDEs
.project
.idea/
*.tmproj
.vscode/


@ -0,0 +1,24 @@
apiVersion: v2
name: kokoro-fastapi
description: A Helm chart for kokoro-fastapi
# A chart can be either an 'application' or a 'library' chart.
#
# Application charts are a collection of templates that can be packaged into versioned archives
# to be deployed.
#
# Library charts provide useful utilities or functions for the chart developer. They're included as
# a dependency of application charts to inject those utilities and functions into the rendering
# pipeline. Library charts do not define any templates and therefore cannot be deployed.
type: application
# This is the chart version. This version number should be incremented each time you make changes
# to the chart and its templates, including the app version.
# Versions are expected to follow Semantic Versioning (https://semver.org/)
version: 0.1.0
# This is the version number of the application being deployed. This version number should be
# incremented each time you make changes to the application. Versions are not expected to
# follow Semantic Versioning. They should reflect the version the application is using.
# It is recommended to use it with quotes.
appVersion: "1.16.0"


@ -0,0 +1,22 @@
1. Get the application URL by running these commands:
{{- if .Values.ingress.enabled }}
{{- range $host := .Values.ingress.hosts }}
{{- range .paths }}
http{{ if $.Values.ingress.tls }}s{{ end }}://{{ $host.host }}{{ .path }}
{{- end }}
{{- end }}
{{- else if contains "NodePort" .Values.service.type }}
export NODE_PORT=$(kubectl get --namespace {{ .Release.Namespace }} -o jsonpath="{.spec.ports[0].nodePort}" services {{ include "kokoro-fastapi.fullname" . }})
export NODE_IP=$(kubectl get nodes --namespace {{ .Release.Namespace }} -o jsonpath="{.items[0].status.addresses[0].address}")
echo http://$NODE_IP:$NODE_PORT
{{- else if contains "LoadBalancer" .Values.service.type }}
NOTE: It may take a few minutes for the LoadBalancer IP to be available.
You can watch the status by running 'kubectl get --namespace {{ .Release.Namespace }} svc -w {{ include "kokoro-fastapi.fullname" . }}'
export SERVICE_IP=$(kubectl get svc --namespace {{ .Release.Namespace }} {{ include "kokoro-fastapi.fullname" . }} --template "{{"{{ range (index .status.loadBalancer.ingress 0) }}{{.}}{{ end }}"}}")
echo http://$SERVICE_IP:{{ .Values.service.port }}
{{- else if contains "ClusterIP" .Values.service.type }}
export POD_NAME=$(kubectl get pods --namespace {{ .Release.Namespace }} -l "app.kubernetes.io/name={{ include "kokoro-fastapi.name" . }},app.kubernetes.io/instance={{ .Release.Name }}" -o jsonpath="{.items[0].metadata.name}")
export CONTAINER_PORT=$(kubectl get pod --namespace {{ .Release.Namespace }} $POD_NAME -o jsonpath="{.spec.containers[0].ports[0].containerPort}")
echo "Visit http://127.0.0.1:8080 to use your application"
kubectl --namespace {{ .Release.Namespace }} port-forward $POD_NAME 8080:$CONTAINER_PORT
{{- end }}


@ -0,0 +1,62 @@
{{/*
Expand the name of the chart.
*/}}
{{- define "kokoro-fastapi.name" -}}
{{- default .Chart.Name .Values.nameOverride | trunc 63 | trimSuffix "-" }}
{{- end }}
{{/*
Create a default fully qualified app name.
We truncate at 63 chars because some Kubernetes name fields are limited to this (by the DNS naming spec).
If release name contains chart name it will be used as a full name.
*/}}
{{- define "kokoro-fastapi.fullname" -}}
{{- if .Values.fullnameOverride }}
{{- .Values.fullnameOverride | trunc 63 | trimSuffix "-" }}
{{- else }}
{{- $name := default .Chart.Name .Values.nameOverride }}
{{- if contains $name .Release.Name }}
{{- .Release.Name | trunc 63 | trimSuffix "-" }}
{{- else }}
{{- printf "%s-%s" .Release.Name $name | trunc 63 | trimSuffix "-" }}
{{- end }}
{{- end }}
{{- end }}
{{/*
Create chart name and version as used by the chart label.
*/}}
{{- define "kokoro-fastapi.chart" -}}
{{- printf "%s-%s" .Chart.Name .Chart.Version | replace "+" "_" | trunc 63 | trimSuffix "-" }}
{{- end }}
{{/*
Common labels
*/}}
{{- define "kokoro-fastapi.labels" -}}
helm.sh/chart: {{ include "kokoro-fastapi.chart" . }}
{{ include "kokoro-fastapi.selectorLabels" . }}
{{- if .Chart.AppVersion }}
app.kubernetes.io/version: {{ .Chart.AppVersion | quote }}
{{- end }}
app.kubernetes.io/managed-by: {{ .Release.Service }}
{{- end }}
{{/*
Selector labels
*/}}
{{- define "kokoro-fastapi.selectorLabels" -}}
app.kubernetes.io/name: {{ include "kokoro-fastapi.name" . }}
app.kubernetes.io/instance: {{ .Release.Name }}
{{- end }}
{{/*
Create the name of the service account to use
*/}}
{{- define "kokoro-fastapi.serviceAccountName" -}}
{{- if .Values.serviceAccount.create }}
{{- default (include "kokoro-fastapi.fullname" .) .Values.serviceAccount.name }}
{{- else }}
{{- default "default" .Values.serviceAccount.name }}
{{- end }}
{{- end }}


@ -0,0 +1,28 @@
{{- if .Values.autoscaling.enabled }}
apiVersion: autoscaling/v2beta1
kind: HorizontalPodAutoscaler
metadata:
name: {{ include "kokoro-fastapi.fullname" . }}
labels:
{{- include "kokoro-fastapi.labels" . | nindent 4 }}
spec:
scaleTargetRef:
apiVersion: apps/v1
kind: Deployment
name: {{ include "kokoro-fastapi.fullname" . }}
minReplicas: {{ .Values.autoscaling.minReplicas }}
maxReplicas: {{ .Values.autoscaling.maxReplicas }}
metrics:
{{- if .Values.autoscaling.targetCPUUtilizationPercentage }}
- type: Resource
resource:
name: cpu
targetAverageUtilization: {{ .Values.autoscaling.targetCPUUtilizationPercentage }}
{{- end }}
{{- if .Values.autoscaling.targetMemoryUtilizationPercentage }}
- type: Resource
resource:
name: memory
targetAverageUtilization: {{ .Values.autoscaling.targetMemoryUtilizationPercentage }}
{{- end }}
{{- end }}


@ -0,0 +1,82 @@
{{- if .Values.ingress.enabled -}}
{{- $fullName := include "kokoro-fastapi.fullname" . -}}
{{- $svcPort := .Values.service.port -}}
{{- $rewriteTargets := (list) -}}
{{- with .Values.ingress.host }}
{{- range .endpoints }}
{{- $serviceName := default $fullName .serviceName -}}
{{- $rewrite := .rewrite | default "none" -}}
{{- if not (has $rewrite $rewriteTargets ) -}}
{{- $rewriteTargets = append $rewriteTargets $rewrite -}}
{{- end -}}
{{- end}}
{{- end }}
{{- range $key := $rewriteTargets }}
{{- $expandedRewrite := regexReplaceAll "/(.*)$" $key "slash${1}" -}}
apiVersion: networking.k8s.io/v1
kind: Ingress
metadata:
{{- if eq $key "none" }}
name: {{ $fullName }}
{{- else }}
name: {{ $fullName }}-{{ $expandedRewrite }}
{{- end }}
labels:
{{- include "kokoro-fastapi.labels" $ | nindent 4 }}
{{- if ne $key "none" }}
annotations:
nginx.ingress.kubernetes.io/rewrite-target: {{ regexReplaceAll "/$" $key "" }}/$2
{{- end }}
spec:
{{- if $.Values.ingress.tls }}
tls:
{{- range $.Values.ingress.tls }}
- hosts:
{{- range .hosts }}
- {{ . | quote }}
{{- end }}
secretName: {{ .secretName }}
{{- end }}
{{- end }}
rules:
{{- with $.Values.ingress.host }}
- host: {{ .name | quote }}
http:
paths:
{{- range .endpoints }}
{{- $serviceName := default $fullName .serviceName -}}
{{- $servicePort := default (print "http") .servicePort -}}
{{- if eq ( .rewrite | default "none" ) $key }}
{{- range .paths }}
{{- if not (contains "@" .) }}
{{- if eq $key "none" }}
- path: {{ . }}
{{- else }}
- path: {{ regexReplaceAll "(.*)/$" . "${1}" }}(/|$)(.*)
{{- end }}
pathType: Prefix
backend:
service:
name: "{{ $fullName }}-{{ $serviceName }}"
port:
number: {{ $servicePort }}
{{- else }}
{{- $path := . -}}
{{- $replicaCount := include "getServiceNameReplicaCount" (dict "global" $.Values "serviceName" $serviceName ) -}}
{{- range $count, $e := until ($replicaCount|int) }}
- path: {{ $path | replace "@" ( . | toString ) }}(/|$)(.*)
pathType: Prefix
backend:
service:
name: "{{ $fullName }}-{{ $serviceName }}-{{ . }}"
port:
number: {{ $servicePort }}
{{- end }}
{{- end }}
{{- end }}
{{- end }}
{{- end }}
{{- end }}
---
{{- end }}
{{- end }}


@ -0,0 +1,71 @@
apiVersion: apps/v1
kind: Deployment
metadata:
name: {{ include "kokoro-fastapi.fullname" . }}-kokoro-tts
labels:
{{- include "kokoro-fastapi.labels" . | nindent 4 }}
spec:
{{- if not .Values.autoscaling.enabled }}
replicas: {{ .Values.kokoroTTS.replicaCount }}
{{- end }}
selector:
matchLabels:
{{- include "kokoro-fastapi.selectorLabels" . | nindent 6 }}
template:
metadata:
{{- with .Values.podAnnotations }}
annotations:
{{- toYaml . | nindent 8 }}
{{- end }}
labels:
{{- include "kokoro-fastapi.selectorLabels" . | nindent 8 }}
spec:
{{- with .Values.images.imagePullSecrets }}
imagePullSecrets:
{{- toYaml . | nindent 8 }}
{{- end }}
serviceAccountName: {{ include "kokoro-fastapi.serviceAccountName" . }}
securityContext:
{{- toYaml .Values.podSecurityContext | nindent 8 }}
initContainers: []
containers:
- name: kokoro-tts
securityContext:
{{- toYaml .Values.securityContext | nindent 12 }}
image: "{{ .Values.kokoroTTS.repository }}:{{ .Values.kokoroTTS.tag | default .Chart.AppVersion }}"
imagePullPolicy: {{ .Values.kokoroTTS.pullPolicy }}
env:
- name: PYTHONPATH
value: "/app:/app/api"
- name: USE_GPU
value: "true"
- name: PYTHONUNBUFFERED
value: "1"
ports:
- name: kokoro-tts-http
containerPort: {{ .Values.kokoroTTS.port | default 8880 }}
protocol: TCP
livenessProbe:
httpGet:
path: /health
port: kokoro-tts-http
readinessProbe:
httpGet:
path: /health
port: kokoro-tts-http
resources:
{{- toYaml .Values.kokoroTTS.resources | nindent 12 }}
volumeMounts: []
volumes: []
{{- with .Values.nodeSelector }}
nodeSelector:
{{- toYaml . | nindent 8 }}
{{- end }}
{{- with .Values.affinity }}
affinity:
{{- toYaml . | nindent 8 }}
{{- end }}
{{- with .Values.tolerations }}
tolerations:
{{- toYaml . | nindent 8 }}
{{- end }}


@ -0,0 +1,15 @@
apiVersion: v1
kind: Service
metadata:
name: {{ include "kokoro-fastapi.fullname" . }}-kokoro-tts-service
labels:
{{- include "kokoro-fastapi.labels" . | nindent 4 }}
spec:
type: {{ .Values.service.type }}
ports:
- port: {{ .Values.kokoroTTS.port }}
targetPort: kokoro-tts-http
protocol: TCP
name: kokoro-tts-http
selector:
{{- include "kokoro-fastapi.selectorLabels" . | nindent 4 }}


@ -0,0 +1,12 @@
{{- if .Values.serviceAccount.create -}}
apiVersion: v1
kind: ServiceAccount
metadata:
name: {{ include "kokoro-fastapi.serviceAccountName" . }}
labels:
{{- include "kokoro-fastapi.labels" . | nindent 4 }}
{{- with .Values.serviceAccount.annotations }}
annotations:
{{- toYaml . | nindent 4 }}
{{- end }}
{{- end }}


@ -0,0 +1,15 @@
apiVersion: v1
kind: Pod
metadata:
name: "{{ include "kokoro-fastapi.fullname" . }}-test-connection"
labels:
{{- include "kokoro-fastapi.labels" . | nindent 4 }}
annotations:
"helm.sh/hook": test
spec:
containers:
- name: wget
image: busybox
command: ['wget']
args: ['{{ include "kokoro-fastapi.fullname" . }}:{{ .Values.service.port }}']
restartPolicy: Never


@ -0,0 +1,94 @@
# Default values for kokoro-fastapi.
# This is a YAML-formatted file.
# Declare variables to be passed into your templates.
replicaCount: 1
images:
pullPolicy: "Always"
imagePullSecrets: [ ]
nameOverride: ""
fullnameOverride: ""
serviceAccount:
# Specifies whether a service account should be created
create: true
# Annotations to add to the service account
annotations: {}
# The name of the service account to use.
# If not set and create is true, a name is generated using the fullname template
name: ""
podAnnotations: {}
podSecurityContext: {}
# fsGroup: 2000
securityContext: {}
# capabilities:
# drop:
# - ALL
# readOnlyRootFilesystem: true
# runAsNonRoot: true
# runAsUser: 1000
service:
type: ClusterIP
ingress:
enabled: false
className: ""
annotations: {}
# kubernetes.io/ingress.class: nginx
# kubernetes.io/tls-acme: "true"
host:
name: kokoro.example.com
endpoints:
- paths:
- "/"
serviceName: "fastapi"
servicePort: 8880
tls: []
# - secretName: chart-example-tls
# hosts:
# - chart-example.local
kokoroTTS:
repository: "ghcr.io/remsky/kokoro-fastapi-gpu"
tag: "latest"
pullPolicy: Always
serviceName: "fastapi"
port: 8880
replicaCount: 1
resources:
limits:
nvidia.com/gpu: 1
requests:
nvidia.com/gpu: 1
# We usually recommend not to specify default resources and to leave this as a conscious
# choice for the user. This also increases chances charts run on environments with little
# resources, such as Minikube. If you do want to specify resources, uncomment the following
# lines, adjust them as necessary, and remove the curly braces after 'resources:'.
# limits:
# cpu: 100m
# memory: 128Mi
# requests:
# cpu: 100m
# memory: 128Mi
autoscaling:
enabled: false
minReplicas: 1
maxReplicas: 100
targetCPUUtilizationPercentage: 80
# targetMemoryUtilizationPercentage: 80
nodeSelector: {}
tolerations: []
affinity: {}


@@ -9,10 +9,10 @@ RUN apt-get update && apt-get install -y \
     curl \
     ffmpeg \
     g++ \
     && apt-get clean \
     && rm -rf /var/lib/apt/lists/* \
     && mkdir -p /usr/share/espeak-ng-data \
     && ln -s /usr/lib/*/espeak-ng-data/* /usr/share/espeak-ng-data/

 # Install UV using the installer script
 RUN curl -LsSf https://astral.sh/uv/install.sh | sh && \
@@ -32,7 +32,7 @@ COPY --chown=appuser:appuser pyproject.toml ./pyproject.toml

 # Install dependencies
 RUN --mount=type=cache,target=/root/.cache/uv \
-    uv venv && \
+    uv venv --python 3.10 && \
     uv sync --extra cpu

 # Copy project files including models


@@ -1,11 +1,11 @@
-FROM --platform=$BUILDPLATFORM nvidia/cuda:12.4.1-cudnn-runtime-ubuntu22.04
+FROM --platform=$BUILDPLATFORM nvidia/cuda:12.8.0-cudnn-runtime-ubuntu24.04

 # Set non-interactive frontend
 ENV DEBIAN_FRONTEND=noninteractive

 # Install Python and other dependencies
 RUN apt-get update && apt-get install -y \
     python3.10 \
-    python3.10-venv \
+    python3-venv \
     espeak-ng \
     espeak-ng-data \
     git \
@@ -23,7 +23,7 @@ RUN curl -LsSf https://astral.sh/uv/install.sh | sh && \
     mv /root/.local/bin/uvx /usr/local/bin/

 # Create non-root user and set up directories and permissions
-RUN useradd -m -u 1000 appuser && \
+RUN useradd -m -u 1001 appuser && \
     mkdir -p /app/api/src/models/v1_0 && \
     chown -R appuser:appuser /app
@@ -39,7 +39,7 @@ ENV PHONEMIZER_ESPEAK_PATH=/usr/bin \

 # Install dependencies with GPU extras (using cache mounts)
 RUN --mount=type=cache,target=/root/.cache/uv \
-    uv venv && \
+    uv venv --python 3.10 && \
     uv sync --extra gpu

 # Copy project files including models


@@ -18,8 +18,6 @@ dependencies = [
     "scipy==1.14.1",
     # Audio processing
     "soundfile==0.13.0",
-    # Text processing
-    "phonemizer==3.3.0",
     "regex==2024.11.6",
     # Utilities
     "aiofiles==23.2.1",
@@ -36,7 +34,9 @@ dependencies = [
     "kokoro @ git+https://github.com/hexgrad/kokoro.git@31a2b6337b8c1b1418ef68c48142328f640da938",
     'misaki[en,ja,ko,zh] @ git+https://github.com/hexgrad/misaki.git@ebc76c21b66c5fc4866ed0ec234047177b396170',
     "spacy==3.7.2",
-    "en-core-web-sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl"
+    "en-core-web-sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl",
+    "inflect>=7.5.0",
+    "phonemizer-fork>=3.3.2",
 ]

[project.optional-dependencies]