Made the API use the normalizer, fixed the wrong espeak version, added better normalization, improved the sentence splitting, fixed some formatting

Fireblade 2025-02-10 21:45:05 -05:00
parent 9b76ce2071
commit ab1c21130e
10 changed files with 187 additions and 43 deletions


@@ -28,8 +28,11 @@ class Settings(BaseSettings):
     target_min_tokens: int = 175  # Target minimum tokens per chunk
     target_max_tokens: int = 250  # Target maximum tokens per chunk
     absolute_max_tokens: int = 450  # Absolute maximum tokens per chunk
+    advanced_text_normalization: bool = True  # Preprocesses the text before misaki, which leads to more accurate results
-    gap_trim_ms: int = 250  # Amount to trim from streaming chunk ends in milliseconds
+    gap_trim_ms: int = 1  # Base amount to trim from streaming chunk ends in milliseconds
+    dynamic_gap_trim_padding_ms: int = 410  # Padding to add to dynamic gap trim
+    dynamic_gap_trim_padding_char_multiplier: dict[str, float] = {".": 1, "!": 0.9, "?": 1, ",": 0.8}

     # Web Player Settings
     enable_web_player: bool = True  # Whether to serve the web player UI
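
For context on how these new settings interact, a minimal sketch (the helper name is made up; the math mirrors find_first_last_non_silent later in this diff): padding starts at dynamic_gap_trim_padding_ms, is scaled by the multiplier for the chunk's final punctuation character, and is converted to samples at 24 kHz.

    # Hypothetical helper illustrating the padding math introduced above
    SAMPLE_RATE = 24000  # Hz, matching AudioNormalizer
    MULTIPLIERS = {".": 1, "!": 0.9, "?": 1, ",": 0.8}  # from the setting above

    def end_padding_samples(chunk_text: str, padding_ms: int = 410) -> int:
        """Silence (in samples) to leave after a chunk, scaled by its final punctuation."""
        stripped = chunk_text.strip()
        multiplier = MULTIPLIERS.get(stripped[-1], 1) if stripped else 1
        return int(padding_ms * SAMPLE_RATE * multiplier / 1000)

    print(end_padding_samples("Hello, world!"))  # 410 ms * 0.9 * 24 kHz -> 8856 samples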


@@ -144,7 +144,7 @@ class KokoroV1(BaseModelBackend):
             pipeline = self._get_pipeline(pipeline_lang_code)
             logger.debug(
-                f"Generating audio from tokens with lang_code '{pipeline_lang_code}': '{tokens[:100]}...'"
+                f"Generating audio from tokens with lang_code '{pipeline_lang_code}': '{tokens[:100]}{'...' if len(tokens) > 100 else ''}'"
             )
             for result in pipeline.generate_from_tokens(
                 tokens=tokens, voice=voice_path, speed=speed, model=self._model
@@ -192,7 +192,6 @@ class KokoroV1(BaseModelBackend):
         """
         if not self.is_loaded:
             raise RuntimeError("Model not loaded")
-
         try:
             # Memory management for GPU
             if self._device == "cuda":
@@ -237,7 +236,7 @@ class KokoroV1(BaseModelBackend):
             pipeline = self._get_pipeline(pipeline_lang_code)
             logger.debug(
-                f"Generating audio for text with lang_code '{pipeline_lang_code}': '{text[:100]}...'"
+                f"Generating audio for text with lang_code '{pipeline_lang_code}': '{text[:100]}{'...' if len(text) > 100 else ''}'"
             )
             for result in pipeline(
                 text, voice=voice_path, speed=speed, model=self._model
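
The recurring logging tweak in this commit follows one small pattern: append an ellipsis only when the preview actually truncated something. A minimal sketch of that pattern (the preview helper is hypothetical, not part of the commit):

    def preview(s: str, limit: int = 100) -> str:
        """Truncate for logs, appending '...' only when something was cut."""
        return f"{s[:limit]}{'...' if len(s) > limit else ''}"

    print(preview("short"))    # short
    print(preview("x" * 150))  # first 100 chars followed by "..."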


@@ -4,10 +4,12 @@ import struct
 from io import BytesIO

 import numpy as np
+import math
 import scipy.io.wavfile as wavfile
 import soundfile as sf
 from loguru import logger
 from pydub import AudioSegment
+from torch import norm

 from ..core.config import settings
 from .streaming_audio_writer import StreamingAudioWriter
@@ -20,23 +22,66 @@ class AudioNormalizer:
         self.chunk_trim_ms = settings.gap_trim_ms
         self.sample_rate = 24000  # Sample rate of the audio
         self.samples_to_trim = int(self.chunk_trim_ms * self.sample_rate / 1000)
+        self.samples_to_pad_start = int(50 * self.sample_rate / 1000)
+
+    def find_first_last_non_silent(
+        self,
+        audio_data: np.ndarray,
+        chunk_text: str,
+        speed: float,
+        silence_threshold_db: int = -45,
+        is_last_chunk: bool = False,
+    ) -> tuple[int, int]:
+        """Finds the indices of the first and last non-silent samples in audio data.
+
+        Args:
+            audio_data: Input audio data as numpy array
+            chunk_text: The text sent to the model to generate the resulting speech
+            speed: The speaking speed of the voice
+            silence_threshold_db: How quiet audio has to be to be considered silent
+            is_last_chunk: Whether this is the last chunk
+
+        Returns:
+            A tuple with the start and the end of the non-silent portion
+        """
+        pad_multiplier = 1
+        split_character = chunk_text.strip()
+        if len(split_character) > 0:
+            split_character = split_character[-1]
+            if split_character in settings.dynamic_gap_trim_padding_char_multiplier:
+                pad_multiplier = settings.dynamic_gap_trim_padding_char_multiplier[split_character]
+
+        if not is_last_chunk:
+            samples_to_pad_end = max(
+                int((settings.dynamic_gap_trim_padding_ms * self.sample_rate * pad_multiplier) / 1000)
+                - self.samples_to_pad_start,
+                0,
+            )
+        else:
+            samples_to_pad_end = self.samples_to_pad_start
+
+        # Convert dBFS threshold to amplitude
+        amplitude_threshold = np.iinfo(audio_data.dtype).max * (10 ** (silence_threshold_db / 20))
+
+        # Find the first samples above the silence threshold at the start and end of the audio
+        non_silent_index_start, non_silent_index_end = None, None
+        for i in range(len(audio_data)):
+            if audio_data[i] > amplitude_threshold:
+                non_silent_index_start = i
+                break
+        for i in range(len(audio_data) - 1, -1, -1):
+            if audio_data[i] > amplitude_threshold:
+                non_silent_index_end = i
+                break
+
+        # Handle the case where the entire audio is silent
+        if non_silent_index_start is None or non_silent_index_end is None:
+            return 0, len(audio_data)
+
+        return (
+            max(non_silent_index_start - self.samples_to_pad_start, 0),
+            min(non_silent_index_end + math.ceil(samples_to_pad_end / speed), len(audio_data)),
+        )

     async def normalize(self, audio_data: np.ndarray) -> np.ndarray:
-        """Convert audio data to int16 range and trim silence from start and end
+        """Convert audio data to int16 range

         Args:
             audio_data: Input audio data as numpy array

         Returns:
-            Normalized and trimmed audio data
+            Normalized audio data
         """
         if len(audio_data) == 0:
             raise ValueError("Empty audio data")
-
-        # Trim start and end if enough samples
-        if len(audio_data) > (2 * self.samples_to_trim):
-            audio_data = audio_data[self.samples_to_trim : -self.samples_to_trim]
-
         # Scale directly to int16 range with clipping
         return np.clip(audio_data * 32767, -32768, 32767).astype(np.int16)
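
The dBFS-to-amplitude conversion above is easy to sanity-check in isolation; a minimal sketch assuming int16 audio:

    import numpy as np

    # -45 dBFS relative to int16 full scale (32767), as in find_first_last_non_silent
    threshold = np.iinfo(np.int16).max * (10 ** (-45 / 20))
    print(round(threshold, 1))  # ~184.3 -> samples below this amplitude count as silence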
@@ -71,6 +116,8 @@ class AudioService:
         audio_data: np.ndarray,
         sample_rate: int,
         output_format: str,
+        speed: float = 1,
+        chunk_text: str = "",
         is_first_chunk: bool = True,
         is_last_chunk: bool = False,
         normalizer: AudioNormalizer = None,
@@ -81,6 +128,8 @@ class AudioService:
             audio_data: Numpy array of audio samples
             sample_rate: Sample rate of the audio
             output_format: Target format (wav, mp3, ogg, pcm)
+            speed: The speaking speed of the voice
+            chunk_text: The text sent to the model to generate the resulting speech
             is_first_chunk: Whether this is the first chunk
             is_last_chunk: Whether this is the last chunk
             normalizer: Optional AudioNormalizer instance for consistent normalization
@@ -96,8 +145,10 @@ class AudioService:
             # Always normalize audio to ensure proper amplitude scaling
             if normalizer is None:
                 normalizer = AudioNormalizer()
             normalized_audio = await normalizer.normalize(audio_data)
+            normalized_audio = AudioService.trim_audio(normalized_audio, chunk_text, speed, is_last_chunk, normalizer)

             # Get or create format-specific writer
             writer_key = f"{output_format}_{sample_rate}"
             if is_first_chunk or writer_key not in AudioService._writers:
@@ -123,3 +174,27 @@ class AudioService:
             raise ValueError(
                 f"Failed to convert audio stream to {output_format}: {str(e)}"
             )
+
+    @staticmethod
+    def trim_audio(
+        audio_data: np.ndarray,
+        chunk_text: str = "",
+        speed: float = 1,
+        is_last_chunk: bool = False,
+        normalizer: AudioNormalizer = None,
+    ) -> np.ndarray:
+        """Trim silence from start and end
+
+        Args:
+            audio_data: Input audio data as numpy array
+            chunk_text: The text sent to the model to generate the resulting speech
+            speed: The speaking speed of the voice
+            is_last_chunk: Whether this is the last chunk
+            normalizer: Optional AudioNormalizer instance for consistent normalization
+
+        Returns:
+            Trimmed audio data
+        """
+        if normalizer is None:
+            normalizer = AudioNormalizer()
+        # Trim start and end if enough samples
+        if len(audio_data) > (2 * normalizer.samples_to_trim):
+            audio_data = audio_data[normalizer.samples_to_trim : -normalizer.samples_to_trim]
+        # Find the non-silent portion and trim
+        start_index, end_index = normalizer.find_first_last_non_silent(
+            audio_data, chunk_text, speed, is_last_chunk=is_last_chunk
+        )
+        return audio_data[start_index:end_index]
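
A hedged usage sketch of the new static method; the synthetic buffer and the import path are assumptions for illustration:

    import numpy as np
    from api.src.services.audio import AudioService  # path is an assumption

    # Synthetic int16 buffer: half a second of silence, one second of a
    # 440 Hz tone, then silence again (made up for illustration)
    sr = 24000
    silence = np.zeros(sr // 2, dtype=np.int16)
    t = np.arange(sr) / sr
    tone = (0.5 * 32767 * np.sin(2 * np.pi * 440 * t)).astype(np.int16)
    audio = np.concatenate([silence, tone, silence])

    trimmed = AudioService.trim_audio(audio, chunk_text="Hello there.", speed=1.0, is_last_chunk=False)
    print(len(audio), "->", len(trimmed))  # leading/trailing silence trimmed, padding re-added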


@@ -6,6 +6,7 @@ Converts them into a format suitable for text-to-speech processing.
 import re
 from functools import lru_cache
+import inflect

 # Constants
 VALID_TLDS = [
@@ -50,6 +51,26 @@ VALID_TLDS = [
     "io",
 ]
+VALID_UNITS = {
+    "m": "meter", "cm": "centimeter", "mm": "millimeter", "km": "kilometer", "in": "inch", "ft": "foot", "yd": "yard", "mi": "mile",  # Length
+    "g": "gram", "kg": "kilogram", "mg": "milligram",  # Mass
+    "s": "second", "ms": "millisecond", "min": "minute", "h": "hour",  # Time
+    "l": "liter", "ml": "milliliter", "cl": "centiliter", "dl": "deciliter",  # Volume
+    "kph": "kilometer per hour", "mph": "mile per hour", "mi/h": "mile per hour", "m/s": "meter per second", "km/h": "kilometer per hour", "mm/s": "millimeter per second", "cm/s": "centimeter per second", "ft/s": "foot per second",  # Speed
+    "°c": "degree celsius", "c": "degree celsius", "°f": "degree fahrenheit", "f": "degree fahrenheit", "k": "kelvin",  # Temperature
+    "pa": "pascal", "kpa": "kilopascal", "mpa": "megapascal", "atm": "atmosphere",  # Pressure
+    "hz": "hertz", "khz": "kilohertz", "mhz": "megahertz", "ghz": "gigahertz",  # Frequency
+    "v": "volt", "kv": "kilovolt", "mv": "megavolt",  # Voltage
+    "a": "amp", "ma": "megaamp", "ka": "kiloamp",  # Current
+    "w": "watt", "kw": "kilowatt", "mw": "megawatt",  # Power
+    "j": "joule", "kj": "kilojoule", "mj": "megajoule",  # Energy
+    "Ω": "ohm", "kΩ": "kiloohm", "MΩ": "megaohm",  # Resistance (Ohm)
+    "f": "farad", "µf": "microfarad", "nf": "nanofarad", "pf": "picofarad",  # Capacitance
+    "b": "byte", "kb": "kilobyte", "mb": "megabyte", "gb": "gigabyte", "tb": "terabyte", "pb": "petabyte",  # Data size
+    "kbps": "kilobyte per second", "mbps": "megabyte per second", "gbps": "gigabyte per second",
+    "px": "pixel",  # CSS units
+}
 # Pre-compiled regex patterns for performance
 EMAIL_PATTERN = re.compile(
     r"\b[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-z]{2,}\b", re.IGNORECASE
@@ -61,6 +82,9 @@ URL_PATTERN = re.compile(
     re.IGNORECASE,
 )

+UNIT_PATTERN = re.compile(
+    r"((?<!\w)([+-]?)(\d{1,3}(,\d{3})*|\d+)(\.\d+)?)\s*("
+    + "|".join(sorted(list(VALID_UNITS.keys()), reverse=True))
+    + r"""){1}(?=[!"#$%&'()*+,-./:;<=>?@\[\\\]^_`{\|}~ \n]{1})""",
+    re.IGNORECASE,
+)
+
+INFLECT_ENGINE = inflect.engine()

 def split_num(num: re.Match[str]) -> str:
     """Handle number splitting for various formats"""
@@ -86,6 +110,13 @@ def split_num(num: re.Match[str]) -> str:
             return f"{left} oh {right}{s}"
         return f"{left} {right}{s}"

+def handle_units(u: re.Match[str]) -> str:
+    unit = u.group(6).strip()
+    if unit.lower() in VALID_UNITS:
+        unit = VALID_UNITS[unit.lower()].split(" ")
+        number = u.group(1).strip()
+        unit[0] = INFLECT_ENGINE.no(unit[0], number)
+    return " ".join(unit)
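
A quick illustration of the unit handling; the two-unit dict and looser regex here are simplifications made for a compact demo, not the pattern above:

    import re
    import inflect

    INFLECT_ENGINE = inflect.engine()
    UNITS = {"mb": "megabyte", "km/h": "kilometer per hour"}
    PATTERN = re.compile(
        r"(\d+(\.\d+)?)\s*(" + "|".join(sorted(UNITS, reverse=True)) + r")\b",
        re.IGNORECASE,
    )

    def speak_units(m: re.Match) -> str:
        words = UNITS[m.group(3).lower()].split(" ")
        # inflect prefixes the count and pluralizes: no("megabyte", "3") -> "3 megabytes"
        words[0] = INFLECT_ENGINE.no(words[0], m.group(1))
        return " ".join(words)

    print(PATTERN.sub(speak_units, "Download 3mb at 100 km/h."))
    # Download 3 megabytes at 100 kilometers per hour.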
 def handle_money(m: re.Match[str]) -> str:
     """Convert money expressions to spoken form"""
@@ -187,14 +218,17 @@ def normalize_text(text: str) -> str:
     # Pre-process URLs first
     text = normalize_urls(text)

+    # Pre-process numbers with units
+    text = UNIT_PATTERN.sub(handle_units, text)
+
     # Replace quotes and brackets
     text = text.replace(chr(8216), "'").replace(chr(8217), "'")
     text = text.replace("«", chr(8220)).replace("»", chr(8221))
     text = text.replace(chr(8220), '"').replace(chr(8221), '"')
     text = text.replace("(", "«").replace(")", "»")

-    # Handle CJK punctuation
-    for a, b in zip("、。!,:;?", ",.!,:;?"):
+    # Handle CJK punctuation and some non-standard chars
+    for a, b in zip("、。!,:;?", ",.!,:;?-"):
         text = text.replace(a, b + " ")

     # Clean up whitespace


@@ -26,7 +26,7 @@ def process_text_chunk(
         List of token IDs
     """
     start_time = time.time()

     if skip_phonemize:
         # Input is already phonemes, just tokenize
         t0 = time.time()
@@ -35,12 +35,11 @@ def process_text_chunk(
     else:
         # Normal text processing pipeline
         t0 = time.time()
-        normalized = normalize_text(text)
         t1 = time.time()

         t0 = time.time()
         phonemes = phonemize(
-            normalized, language, normalize=False
+            text, language, normalize=False
         )  # Already normalized
         t1 = time.time()
@@ -50,7 +49,7 @@ def process_text_chunk(
     total_time = time.time() - start_time
     logger.debug(
-        f"Total processing took {total_time * 1000:.2f}ms for chunk: '{text[:50]}...'"
+        f"Total processing took {total_time * 1000:.2f}ms for chunk: '{text[:50]}{'...' if len(text) > 50 else ''}'"
     )
     return tokens
@@ -61,7 +60,7 @@ async def yield_chunk(
 ) -> Tuple[str, List[int]]:
     """Yield a chunk with consistent logging."""
     logger.debug(
-        f"Yielding chunk {chunk_count}: '{text[:50]}...' ({len(tokens)} tokens)"
+        f"Yielding chunk {chunk_count}: '{text[:50]}{'...' if len(text) > 50 else ''}' ({len(tokens)} tokens)"
     )
     return text, tokens
@@ -88,9 +87,10 @@ def process_text(text: str, language: str = "a") -> List[int]:
 def get_sentence_info(text: str) -> List[Tuple[str, List[int], int]]:
     """Process all sentences and return info."""
-    sentences = re.split(r"([.!?;:])", text)
+    if settings.advanced_text_normalization:
+        text = normalize_text(text)
+    sentences = re.split(r"([.!?;:])(?=\s|$)", text)
     results = []
     for i in range(0, len(sentences), 2):
         sentence = sentences[i].strip()
         punct = sentences[i + 1] if i + 1 < len(sentences) else ""
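
The lookahead (?=\s|$) is the substance of this change: punctuation now only ends a sentence when followed by whitespace or end-of-string, so decimals and dotted version numbers survive splitting. A small self-contained check:

    import re

    text = "Pi is 3.14! Version 2.0.1 works; mostly."
    old = re.split(r"([.!?;:])", text)          # splits inside "3.14" and "2.0.1"
    new = re.split(r"([.!?;:])(?=\s|$)", text)  # only at sentence boundaries
    print(new)
    # ['Pi is 3.14', '!', ' Version 2.0.1 works', ';', ' mostly', '.', '']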
@@ -128,7 +128,7 @@ async def smart_split(
                 chunk_text = " ".join(current_chunk)
                 chunk_count += 1
                 logger.debug(
-                    f"Yielding chunk {chunk_count}: '{chunk_text[:50]}...' ({current_count} tokens)"
+                    f"Yielding chunk {chunk_count}: '{chunk_text[:50]}{'...' if len(chunk_text) > 50 else ''}' ({current_count} tokens)"
                 )
                 yield chunk_text, current_tokens
                 current_chunk = []
@@ -149,6 +149,7 @@ async def smart_split(
                     continue

                 full_clause = clause + comma
+
                 tokens = process_text_chunk(full_clause)
                 count = len(tokens)
@@ -166,7 +167,7 @@ async def smart_split(
                         chunk_text = " ".join(clause_chunk)
                         chunk_count += 1
                         logger.debug(
-                            f"Yielding clause chunk {chunk_count}: '{chunk_text[:50]}...' ({clause_count} tokens)"
+                            f"Yielding clause chunk {chunk_count}: '{chunk_text[:50]}{'...' if len(chunk_text) > 50 else ''}' ({clause_count} tokens)"
                         )
                         yield chunk_text, clause_tokens
                         clause_chunk = [full_clause]
@@ -178,7 +179,7 @@ async def smart_split(
                     chunk_text = " ".join(clause_chunk)
                     chunk_count += 1
                     logger.debug(
-                        f"Yielding final clause chunk {chunk_count}: '{chunk_text[:50]}...' ({clause_count} tokens)"
+                        f"Yielding final clause chunk {chunk_count}: '{chunk_text[:50]}{'...' if len(chunk_text) > 50 else ''}' ({clause_count} tokens)"
                     )
                     yield chunk_text, clause_tokens
@@ -192,7 +193,7 @@ async def smart_split(
                 chunk_text = " ".join(current_chunk)
                 chunk_count += 1
                 logger.info(
-                    f"Yielding chunk {chunk_count}: '{chunk_text[:50]}...' ({current_count} tokens)"
+                    f"Yielding chunk {chunk_count}: '{chunk_text[:50]}{'...' if len(chunk_text) > 50 else ''}' ({current_count} tokens)"
                 )
                 yield chunk_text, current_tokens
                 current_chunk = [sentence]
@@ -217,7 +218,7 @@ async def smart_split(
                 chunk_text = " ".join(current_chunk)
                 chunk_count += 1
                 logger.info(
-                    f"Yielding chunk {chunk_count}: '{chunk_text[:50]}...' ({current_count} tokens)"
+                    f"Yielding chunk {chunk_count}: '{chunk_text[:50]}{'...' if len(chunk_text) > 50 else ''}' ({current_count} tokens)"
                 )
                 yield chunk_text, current_tokens
                 current_chunk = [sentence]
@@ -229,7 +230,7 @@ async def smart_split(
         chunk_text = " ".join(current_chunk)
         chunk_count += 1
         logger.info(
-            f"Yielding final chunk {chunk_count}: '{chunk_text[:50]}...' ({current_count} tokens)"
+            f"Yielding final chunk {chunk_count}: '{chunk_text[:50]}{'...' if len(chunk_text) > 50 else ''}' ({current_count} tokens)"
         )
         yield chunk_text, current_tokens


@@ -67,6 +67,8 @@ class TTSService:
                     np.array([0], dtype=np.float32),  # Dummy data for type checking
                     24000,
                     output_format,
+                    speed,
+                    "",
                     is_first_chunk=False,
                     normalizer=normalizer,
                     is_last_chunk=True,
@@ -97,15 +99,22 @@ class TTSService:
                             chunk_audio,
                             24000,
                             output_format,
+                            speed,
+                            chunk_text,
                             is_first_chunk=is_first,
-                            normalizer=normalizer,
                             is_last_chunk=is_last,
+                            normalizer=normalizer,
                         )
                         yield converted
                     except Exception as e:
                         logger.error(f"Failed to convert audio: {str(e)}")
                 else:
-                    yield chunk_audio
+                    trimmed = AudioService.trim_audio(
+                        chunk_audio, chunk_text, speed, is_last, normalizer
+                    )
+                    yield trimmed
             else:
                 # For legacy backends, load voice tensor
                 voice_tensor = await self._voice_manager.load_voice(
@@ -130,6 +139,8 @@ class TTSService:
                             chunk_audio,
                             24000,
                             output_format,
+                            speed,
+                            chunk_text,
                             is_first_chunk=is_first,
                             normalizer=normalizer,
                             is_last_chunk=is_last,
@@ -138,7 +149,12 @@ class TTSService:
                     except Exception as e:
                         logger.error(f"Failed to convert audio: {str(e)}")
                 else:
-                    yield chunk_audio
+                    trimmed = AudioService.trim_audio(
+                        chunk_audio, chunk_text, speed, is_last, normalizer
+                    )
+                    yield trimmed
         except Exception as e:
             logger.error(f"Failed to process tokens: {str(e)}")


@@ -9,10 +9,10 @@ RUN apt-get update && apt-get install -y \
     curl \
     ffmpeg \
     g++ \
     && apt-get clean \
     && rm -rf /var/lib/apt/lists/* \
     && mkdir -p /usr/share/espeak-ng-data \
     && ln -s /usr/lib/*/espeak-ng-data/* /usr/share/espeak-ng-data/

 # Install UV using the installer script
 RUN curl -LsSf https://astral.sh/uv/install.sh | sh && \
@@ -20,7 +20,7 @@ RUN curl -LsSf https://astral.sh/uv/install.sh | sh && \
     mv /root/.local/bin/uvx /usr/local/bin/

 # Create non-root user and set up directories and permissions
-RUN useradd -m -u 1000 appuser && \
+RUN useradd -m -u 1001 appuser && \
     mkdir -p /app/api/src/models/v1_0 && \
     chown -R appuser:appuser /app
@@ -32,7 +32,7 @@ COPY --chown=appuser:appuser pyproject.toml ./pyproject.toml
 # Install dependencies
 RUN --mount=type=cache,target=/root/.cache/uv \
-    uv venv && \
+    uv venv --python 3.11 && \
     uv sync --extra cpu

 # Copy project files including models
@@ -40,6 +40,7 @@ COPY --chown=appuser:appuser api ./api
 COPY --chown=appuser:appuser web ./web
 COPY --chown=appuser:appuser docker/scripts/ ./
 RUN chmod +x ./entrypoint.sh
+RUN sed -i 's/\r$//' ./entrypoint.sh

 # Set environment variables
 ENV PYTHONUNBUFFERED=1 \


@@ -1,26 +1,29 @@
-FROM --platform=$BUILDPLATFORM nvidia/cuda:12.4.1-cudnn-runtime-ubuntu22.04
+FROM --platform=$BUILDPLATFORM nvidia/cuda:12.8.0-cudnn-runtime-ubuntu24.04

 # Set non-interactive frontend
 ENV DEBIAN_FRONTEND=noninteractive

 # Install Python and other dependencies
 RUN apt-get update && apt-get install -y \
     python3.10 \
-    python3.10-venv \
+    python3-venv \
     espeak-ng \
     espeak-ng-data \
     git \
     libsndfile1 \
     curl \
     ffmpeg \
-    && apt-get clean && rm -rf /var/lib/apt/lists/* \
+    && apt-get clean \
+    && rm -rf /var/lib/apt/lists/* \
     && mkdir -p /usr/share/espeak-ng-data \
     && ln -s /usr/lib/*/espeak-ng-data/* /usr/share/espeak-ng-data/

 # Install UV using the installer script
 RUN curl -LsSf https://astral.sh/uv/install.sh | sh && \
     mv /root/.local/bin/uv /usr/local/bin/ && \
-    mv /root/.local/bin/uvx /usr/local/bin/ && \
-    useradd -m -u 1000 appuser && \
+    mv /root/.local/bin/uvx /usr/local/bin/
+
+# Create non-root user and set up directories and permissions
+RUN useradd -m -u 1001 appuser && \
     mkdir -p /app/api/src/models/v1_0 && \
     chown -R appuser:appuser /app
@@ -32,7 +35,7 @@ COPY --chown=appuser:appuser pyproject.toml ./pyproject.toml
 # Install dependencies with GPU extras (using cache mounts)
 RUN --mount=type=cache,target=/root/.cache/uv \
-    uv venv && \
+    uv venv --python 3.11 && \
     uv sync --extra gpu

 # Copy project files including models and sync again
@@ -40,6 +43,7 @@ COPY --chown=appuser:appuser api ./api
 COPY --chown=appuser:appuser web ./web
 COPY --chown=appuser:appuser docker/scripts/ ./
 RUN chmod +x ./entrypoint.sh
+RUN sed -i 's/\r$//' ./entrypoint.sh

 RUN --mount=type=cache,target=/root/.cache/uv \
     uv sync --extra gpu


@@ -36,7 +36,8 @@ dependencies = [
     "kokoro @ git+https://github.com/hexgrad/kokoro.git@31a2b6337b8c1b1418ef68c48142328f640da938",
     'misaki[en,ja,ko,zh] @ git+https://github.com/hexgrad/misaki.git@ebc76c21b66c5fc4866ed0ec234047177b396170',
     "spacy==3.7.2",
-    "en-core-web-sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl"
+    "en-core-web-sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl",
+    "inflect>=7.5.0",
 ]

 [project.optional-dependencies]

start-gpu.bat Normal file

@@ -0,0 +1,10 @@
+set PYTHONUTF8=1
+set USE_GPU=true
+set USE_ONNX=false
+set PYTHONPATH=%PROJECT_ROOT%;%PROJECT_ROOT%\api
+set MODEL_DIR=src\models
+set VOICES_DIR=src\voices\v1_0
+set WEB_PLAYER_PATH=%PROJECT_ROOT%\web
+
+call uv pip install -e ".[gpu]"
+call uv run uvicorn api.src.main:app --reload --host 0.0.0.0 --port 8880