Mirror of https://github.com/remsky/Kokoro-FastAPI.git (synced 2025-04-13 09:39:17 +00:00)
Made the API use the normalizer, fixed the wrong version of espeak, added better normalization, improved the sentence splitting, fixed some formatting
This commit is contained in:
parent
9b76ce2071
commit
ab1c21130e
10 changed files with 187 additions and 43 deletions

@@ -28,8 +28,11 @@ class Settings(BaseSettings):
     target_min_tokens: int = 175  # Target minimum tokens per chunk
     target_max_tokens: int = 250  # Target maximum tokens per chunk
     absolute_max_tokens: int = 450  # Absolute maximum tokens per chunk
+    advanced_text_normalization: bool = True  # Preprocesses the text before misaki which leads
 
-    gap_trim_ms: int = 250  # Amount to trim from streaming chunk ends in milliseconds
+    gap_trim_ms: int = 1  # Base amount to trim from streaming chunk ends in milliseconds
+    dynamic_gap_trim_padding_ms: int = 410  # Padding to add to dynamic gap trim
+    dynamic_gap_trim_padding_char_multiplier: dict[str, float] = {".": 1, "!": 0.9, "?": 1, ",": 0.8}
 
     # Web Player Settings
     enable_web_player: bool = True  # Whether to serve the web player UI
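
Taken together, these settings drive the dynamic gap trimming added to the audio service below: each chunk keeps a padding tail whose length scales with the chunk's final punctuation mark. A minimal sketch of that arithmetic using the defaults above (the helper name pad_end_samples and the printed values are illustrative, not part of the API):

    # Rough sketch: padding kept after a chunk, per the settings above
    sample_rate = 24000
    dynamic_gap_trim_padding_ms = 410
    char_multiplier = {".": 1, "!": 0.9, "?": 1, ",": 0.8}
    samples_to_pad_start = int(50 * sample_rate / 1000)  # 1200 samples = 50 ms

    def pad_end_samples(chunk_text: str, is_last_chunk: bool = False) -> int:
        """Samples of padding kept after the last non-silent sample of a chunk."""
        last_char = chunk_text.strip()[-1:]       # final punctuation, if any
        mult = char_multiplier.get(last_char, 1)  # default multiplier is 1
        if is_last_chunk:
            return samples_to_pad_start
        pad = int(dynamic_gap_trim_padding_ms * sample_rate * mult / 1000)
        return max(pad - samples_to_pad_start, 0)

    print(pad_end_samples("Hello, world."))  # 8640 samples (~360 ms, '.' multiplier 1)
    print(pad_end_samples("first clause,"))  # 6672 samples (~278 ms, ',' multiplier 0.8)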

@@ -144,7 +144,7 @@ class KokoroV1(BaseModelBackend):
             pipeline = self._get_pipeline(pipeline_lang_code)
 
             logger.debug(
-                f"Generating audio from tokens with lang_code '{pipeline_lang_code}': '{tokens[:100]}...'"
+                f"Generating audio from tokens with lang_code '{pipeline_lang_code}': '{tokens[:100]}{'...' if len(tokens) > 100 else ''}'"
             )
             for result in pipeline.generate_from_tokens(
                 tokens=tokens, voice=voice_path, speed=speed, model=self._model

@@ -192,7 +192,6 @@ class KokoroV1(BaseModelBackend):
         """
         if not self.is_loaded:
             raise RuntimeError("Model not loaded")
 
         try:
             # Memory management for GPU
             if self._device == "cuda":

@@ -237,7 +236,7 @@ class KokoroV1(BaseModelBackend):
             pipeline = self._get_pipeline(pipeline_lang_code)
 
             logger.debug(
-                f"Generating audio for text with lang_code '{pipeline_lang_code}': '{text[:100]}...'"
+                f"Generating audio for text with lang_code '{pipeline_lang_code}': '{text[:100]}{'...' if len(text) > 100 else ''}'"
             )
             for result in pipeline(
                 text, voice=voice_path, speed=speed, model=self._model

@@ -4,10 +4,12 @@ import struct
 from io import BytesIO
 
 import numpy as np
+import math
 import scipy.io.wavfile as wavfile
 import soundfile as sf
 from loguru import logger
 from pydub import AudioSegment
+from torch import norm
 
 from ..core.config import settings
 from .streaming_audio_writer import StreamingAudioWriter

@@ -20,23 +22,66 @@ class AudioNormalizer:
         self.chunk_trim_ms = settings.gap_trim_ms
         self.sample_rate = 24000  # Sample rate of the audio
         self.samples_to_trim = int(self.chunk_trim_ms * self.sample_rate / 1000)
+        self.samples_to_pad_start = int(50 * self.sample_rate / 1000)
+
+    def find_first_last_non_silent(self, audio_data: np.ndarray, chunk_text: str, speed: float, silence_threshold_db: int = -45, is_last_chunk: bool = False) -> tuple[int, int]:
+        """Finds the indices of the first and last non-silent samples in audio data.
+
+        Args:
+            audio_data: Input audio data as numpy array
+            chunk_text: The text sent to the model to generate the resulting speech
+            speed: The speaking speed of the voice
+            silence_threshold_db: How quiet audio has to be to be considered silent
+            is_last_chunk: Whether this is the last chunk
+
+        Returns:
+            A tuple with the start and end indices of the non-silent portion
+        """
+
+        pad_multiplier = 1
+        split_character = chunk_text.strip()
+        if len(split_character) > 0:
+            split_character = split_character[-1]
+            if split_character in settings.dynamic_gap_trim_padding_char_multiplier:
+                pad_multiplier = settings.dynamic_gap_trim_padding_char_multiplier[split_character]
+
+        if not is_last_chunk:
+            samples_to_pad_end = max(int((settings.dynamic_gap_trim_padding_ms * self.sample_rate * pad_multiplier) / 1000) - self.samples_to_pad_start, 0)
+        else:
+            samples_to_pad_end = self.samples_to_pad_start
+        # Convert dBFS threshold to amplitude
+        amplitude_threshold = np.iinfo(audio_data.dtype).max * (10 ** (silence_threshold_db / 20))
+        # Find the first samples above the silence threshold at the start and end of the audio
+        non_silent_index_start, non_silent_index_end = None, None
+
+        for X in range(0, len(audio_data)):
+            if audio_data[X] > amplitude_threshold:
+                non_silent_index_start = X
+                break
+
+        for X in range(len(audio_data) - 1, -1, -1):
+            if audio_data[X] > amplitude_threshold:
+                non_silent_index_end = X
+                break
+
+        # Handle the case where the entire audio is silent
+        if non_silent_index_start is None or non_silent_index_end is None:
+            return 0, len(audio_data)
+
+        return max(non_silent_index_start - self.samples_to_pad_start, 0), min(non_silent_index_end + math.ceil(samples_to_pad_end / speed), len(audio_data))
+
     async def normalize(self, audio_data: np.ndarray) -> np.ndarray:
-        """Convert audio data to int16 range and trim silence from start and end
+        """Convert audio data to int16 range
 
         Args:
             audio_data: Input audio data as numpy array
 
         Returns:
-            Normalized and trimmed audio data
+            Normalized audio data
         """
         if len(audio_data) == 0:
             raise ValueError("Empty audio data")
 
-        # Trim start and end if enough samples
-        if len(audio_data) > (2 * self.samples_to_trim):
-            audio_data = audio_data[self.samples_to_trim : -self.samples_to_trim]
-
         # Scale directly to int16 range with clipping
         return np.clip(audio_data * 32767, -32768, 32767).astype(np.int16)

@@ -71,6 +116,8 @@ class AudioService:
         audio_data: np.ndarray,
         sample_rate: int,
         output_format: str,
+        speed: float = 1,
+        chunk_text: str = "",
         is_first_chunk: bool = True,
         is_last_chunk: bool = False,
         normalizer: AudioNormalizer = None,

@@ -81,6 +128,8 @@ class AudioService:
             audio_data: Numpy array of audio samples
             sample_rate: Sample rate of the audio
             output_format: Target format (wav, mp3, ogg, pcm)
+            speed: The speaking speed of the voice
+            chunk_text: The text sent to the model to generate the resulting speech
             is_first_chunk: Whether this is the first chunk
             is_last_chunk: Whether this is the last chunk
             normalizer: Optional AudioNormalizer instance for consistent normalization

@@ -96,8 +145,10 @@ class AudioService:
         # Always normalize audio to ensure proper amplitude scaling
         if normalizer is None:
             normalizer = AudioNormalizer()
 
         normalized_audio = await normalizer.normalize(audio_data)
+
+        normalized_audio = AudioService.trim_audio(normalized_audio, chunk_text, speed, is_last_chunk, normalizer)
 
         # Get or create format-specific writer
         writer_key = f"{output_format}_{sample_rate}"
         if is_first_chunk or writer_key not in AudioService._writers:

@@ -123,3 +174,27 @@ class AudioService:
             raise ValueError(
                 f"Failed to convert audio stream to {output_format}: {str(e)}"
             )
+
+    @staticmethod
+    def trim_audio(audio_data: np.ndarray, chunk_text: str = "", speed: float = 1, is_last_chunk: bool = False, normalizer: AudioNormalizer = None) -> np.ndarray:
+        """Trim silence from the start and end of audio data
+
+        Args:
+            audio_data: Input audio data as numpy array
+            chunk_text: The text sent to the model to generate the resulting speech
+            speed: The speaking speed of the voice
+            is_last_chunk: Whether this is the last chunk
+            normalizer: Optional AudioNormalizer instance for consistent normalization
+
+        Returns:
+            Trimmed audio data
+        """
+        if normalizer is None:
+            normalizer = AudioNormalizer()
+
+        # Trim start and end if enough samples
+        if len(audio_data) > (2 * normalizer.samples_to_trim):
+            audio_data = audio_data[normalizer.samples_to_trim : -normalizer.samples_to_trim]
+
+        # Find the non-silent portion and trim to it
+        start_index, end_index = normalizer.find_first_last_non_silent(audio_data, chunk_text, speed, is_last_chunk=is_last_chunk)
+        return audio_data[start_index:end_index]
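
The scan in find_first_last_non_silent hinges on one conversion: a dBFS threshold becomes a linear amplitude via max_amplitude * 10^(dB/20), so -45 dBFS on int16 audio is roughly 184 out of 32767. A self-contained sketch of the same idea on synthetic data (variable names are illustrative, not the service's API):

    import numpy as np

    silence_threshold_db = -45
    # dBFS -> linear amplitude: 32767 * 10 ** (-45 / 20) is roughly 184
    amplitude_threshold = np.iinfo(np.int16).max * (10 ** (silence_threshold_db / 20))

    # Synthetic chunk: 100 silent samples, a loud burst, 100 silent samples
    audio = np.zeros(300, dtype=np.int16)
    audio[100:200] = 5000

    start = next(i for i in range(len(audio)) if audio[i] > amplitude_threshold)
    end = next(i for i in range(len(audio) - 1, -1, -1) if audio[i] > amplitude_threshold)
    print(start, end)  # 100 199; padding is then applied around these indices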

@@ -6,6 +6,7 @@ Converts them into a format suitable for text-to-speech processing.
 
 import re
 from functools import lru_cache
+import inflect
 
 # Constants
 VALID_TLDS = [

@@ -50,6 +51,26 @@
     "io",
 ]
 
+VALID_UNITS = {
+    "m": "meter", "cm": "centimeter", "mm": "millimeter", "km": "kilometer", "in": "inch", "ft": "foot", "yd": "yard", "mi": "mile",  # Length
+    "g": "gram", "kg": "kilogram", "mg": "milligram",  # Mass
+    "s": "second", "ms": "millisecond", "min": "minutes", "h": "hour",  # Time
+    "l": "liter", "ml": "milliliter", "cl": "centiliter", "dl": "deciliter",  # Volume
+    "kph": "kilometer per hour", "mph": "mile per hour", "mi/h": "mile per hour", "m/s": "meter per second", "km/h": "kilometer per hour", "mm/s": "millimeter per second", "cm/s": "centimeter per second", "ft/s": "feet per second",  # Speed
+    "°c": "degree celsius", "c": "degree celsius", "°f": "degree fahrenheit", "f": "degree fahrenheit", "k": "kelvin",  # Temperature
+    "pa": "pascal", "kpa": "kilopascal", "mpa": "megapascal", "atm": "atmosphere",  # Pressure
+    "hz": "hertz", "khz": "kilohertz", "mhz": "megahertz", "ghz": "gigahertz",  # Frequency
+    "v": "volt", "kv": "kilovolt", "mv": "megavolt",  # Voltage
+    "a": "amp", "ma": "megaamp", "ka": "kiloamp",  # Current
+    "w": "watt", "kw": "kilowatt", "mw": "megawatt",  # Power
+    "j": "joule", "kj": "kilojoule", "mj": "megajoule",  # Energy
+    "Ω": "ohm", "kΩ": "kiloohm", "mΩ": "megaohm",  # Resistance (Ohm)
+    "f": "farad", "µf": "microfarad", "nf": "nanofarad", "pf": "picofarad",  # Capacitance
+    "b": "byte", "kb": "kilobyte", "mb": "megabyte", "gb": "gigabyte", "tb": "terabyte", "pb": "petabyte",  # Data size
+    "kbps": "kilobyte per second", "mbps": "megabyte per second", "gbps": "gigabyte per second",
+    "px": "pixel",  # CSS units
+}
+
 # Pre-compiled regex patterns for performance
 EMAIL_PATTERN = re.compile(
     r"\b[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-z]{2,}\b", re.IGNORECASE

@@ -61,6 +82,9 @@ URL_PATTERN = re.compile(
     re.IGNORECASE,
 )
 
+UNIT_PATTERN = re.compile(r"((?<!\w)([+-]?)(\d{1,3}(,\d{3})*|\d+)(\.\d+)?)\s*(" + "|".join(sorted(list(VALID_UNITS.keys()), reverse=True)) + r"""){1}(?=[!"#$%&'()*+,-./:;<=>?@\[\\\]^_`{\|}~ \n]{1})""", re.IGNORECASE)
+
+INFLECT_ENGINE = inflect.engine()
 
 def split_num(num: re.Match[str]) -> str:
     """Handle number splitting for various formats"""

@@ -86,6 +110,13 @@ def split_num(num: re.Match[str]) -> str:
         return f"{left} oh {right}{s}"
     return f"{left} {right}{s}"
 
+def handle_units(u: re.Match[str]) -> str:
+    """Convert a number-with-unit match to its spoken form"""
+    unit = u.group(6).strip()
+    if unit.lower() in VALID_UNITS:
+        unit = VALID_UNITS[unit.lower()].split(" ")
+        number = u.group(1).strip()
+        unit[0] = INFLECT_ENGINE.no(unit[0], number)
+    return " ".join(unit)
+
 def handle_money(m: re.Match[str]) -> str:
     """Convert money expressions to spoken form"""

@@ -187,14 +218,17 @@ def normalize_text(text: str) -> str:
     # Pre-process URLs first
     text = normalize_urls(text)
 
+    # Pre-process numbers with units
+    text = UNIT_PATTERN.sub(handle_units, text)
+
     # Replace quotes and brackets
     text = text.replace(chr(8216), "'").replace(chr(8217), "'")
     text = text.replace("«", chr(8220)).replace("»", chr(8221))
     text = text.replace(chr(8220), '"').replace(chr(8221), '"')
     text = text.replace("(", "«").replace(")", "»")
 
-    # Handle CJK punctuation
-    for a, b in zip("、。!,:;?", ",.!,:;?"):
+    # Handle CJK punctuation and some non-standard chars
+    for a, b in zip("、。!,:;?–", ",.!,:;?-"):
         text = text.replace(a, b + " ")
 
     # Clean up whitespace
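
UNIT_PATTERN and handle_units cooperate: the regex captures a signed number and a known unit, and inflect's no() prefixes the count and pluralizes the unit's first word. A simplified, hypothetical round trip with a two-entry unit table (the real pattern above is stricter about number grouping and trailing boundaries):

    import re
    import inflect

    engine = inflect.engine()
    units = {"km": "kilometer", "mb": "megabyte"}
    pattern = re.compile(
        r"((?<!\w)([+-]?)(\d+(\.\d+)?))\s*(" + "|".join(units) + r")(?=\W|$)",
        re.IGNORECASE,
    )

    def speak_units(m: re.Match) -> str:
        # inflect: engine.no("kilometer", "5") -> "5 kilometers"
        return engine.no(units[m.group(5).lower()], m.group(1).strip())

    print(pattern.sub(speak_units, "We ran 5km and downloaded 1 mb."))
    # -> We ran 5 kilometers and downloaded 1 megabyte.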

@@ -26,7 +26,7 @@ def process_text_chunk(
         List of token IDs
     """
     start_time = time.time()
 
     if skip_phonemize:
         # Input is already phonemes, just tokenize
         t0 = time.time()

@@ -35,12 +35,11 @@ def process_text_chunk(
     else:
         # Normal text processing pipeline
         t0 = time.time()
-        normalized = normalize_text(text)
         t1 = time.time()
 
         t0 = time.time()
         phonemes = phonemize(
-            normalized, language, normalize=False
+            text, language, normalize=False
         )  # Already normalized
         t1 = time.time()
 

@@ -50,7 +49,7 @@ def process_text_chunk(
 
     total_time = time.time() - start_time
     logger.debug(
-        f"Total processing took {total_time * 1000:.2f}ms for chunk: '{text[:50]}...'"
+        f"Total processing took {total_time * 1000:.2f}ms for chunk: '{text[:50]}{'...' if len(text) > 50 else ''}'"
     )
 
     return tokens

@@ -61,7 +60,7 @@ async def yield_chunk(
 ) -> Tuple[str, List[int]]:
     """Yield a chunk with consistent logging."""
     logger.debug(
-        f"Yielding chunk {chunk_count}: '{text[:50]}...' ({len(tokens)} tokens)"
+        f"Yielding chunk {chunk_count}: '{text[:50]}{'...' if len(text) > 50 else ''}' ({len(tokens)} tokens)"
     )
     return text, tokens
 

@@ -88,9 +87,10 @@ def process_text(text: str, language: str = "a") -> List[int]:
 
 def get_sentence_info(text: str) -> List[Tuple[str, List[int], int]]:
     """Process all sentences and return info."""
-    sentences = re.split(r"([.!?;:])", text)
+    if settings.advanced_text_normalization:
+        text = normalize_text(text)
+    sentences = re.split(r"([.!?;:])(?=\s|$)", text)
     results = []
 
     for i in range(0, len(sentences), 2):
         sentence = sentences[i].strip()
         punct = sentences[i + 1] if i + 1 < len(sentences) else ""

@@ -128,7 +128,7 @@ async def smart_split(
                 chunk_text = " ".join(current_chunk)
                 chunk_count += 1
                 logger.debug(
-                    f"Yielding chunk {chunk_count}: '{chunk_text[:50]}...' ({current_count} tokens)"
+                    f"Yielding chunk {chunk_count}: '{chunk_text[:50]}{'...' if len(text) > 50 else ''}' ({current_count} tokens)"
                 )
                 yield chunk_text, current_tokens
                 current_chunk = []

@@ -149,6 +149,7 @@ async def smart_split(
                     continue
 
                 full_clause = clause + comma
+
                 tokens = process_text_chunk(full_clause)
                 count = len(tokens)

@@ -166,7 +167,7 @@ async def smart_split(
                         chunk_text = " ".join(clause_chunk)
                         chunk_count += 1
                         logger.debug(
-                            f"Yielding clause chunk {chunk_count}: '{chunk_text[:50]}...' ({clause_count} tokens)"
+                            f"Yielding clause chunk {chunk_count}: '{chunk_text[:50]}{'...' if len(text) > 50 else ''}' ({clause_count} tokens)"
                         )
                         yield chunk_text, clause_tokens
                         clause_chunk = [full_clause]

@@ -178,7 +179,7 @@ async def smart_split(
                     chunk_text = " ".join(clause_chunk)
                     chunk_count += 1
                     logger.debug(
-                        f"Yielding final clause chunk {chunk_count}: '{chunk_text[:50]}...' ({clause_count} tokens)"
+                        f"Yielding final clause chunk {chunk_count}: '{chunk_text[:50]}{'...' if len(text) > 50 else ''}' ({clause_count} tokens)"
                     )
                     yield chunk_text, clause_tokens

@@ -192,7 +193,7 @@ async def smart_split(
                 chunk_text = " ".join(current_chunk)
                 chunk_count += 1
                 logger.info(
-                    f"Yielding chunk {chunk_count}: '{chunk_text[:50]}...' ({current_count} tokens)"
+                    f"Yielding chunk {chunk_count}: '{chunk_text[:50]}{'...' if len(text) > 50 else ''}' ({current_count} tokens)"
                 )
                 yield chunk_text, current_tokens
                 current_chunk = [sentence]

@@ -217,7 +218,7 @@ async def smart_split(
                 chunk_text = " ".join(current_chunk)
                 chunk_count += 1
                 logger.info(
-                    f"Yielding chunk {chunk_count}: '{chunk_text[:50]}...' ({current_count} tokens)"
+                    f"Yielding chunk {chunk_count}: '{chunk_text[:50]}{'...' if len(text) > 50 else ''}' ({current_count} tokens)"
                 )
                 yield chunk_text, current_tokens
                 current_chunk = [sentence]

@@ -229,7 +230,7 @@ async def smart_split(
         chunk_text = " ".join(current_chunk)
         chunk_count += 1
         logger.info(
-            f"Yielding final chunk {chunk_count}: '{chunk_text[:50]}...' ({current_count} tokens)"
+            f"Yielding final chunk {chunk_count}: '{chunk_text[:50]}{'...' if len(text) > 50 else ''}' ({current_count} tokens)"
         )
         yield chunk_text, current_tokens
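
The lookahead added to the sentence splitter is the key change here: punctuation only splits when followed by whitespace or end-of-string, so decimals and similar in-token punctuation survive intact. A quick illustrative comparison:

    import re

    text = "The total is $5.50. Ship it!"
    old = re.split(r"([.!?;:])", text)
    new = re.split(r"([.!?;:])(?=\s|$)", text)
    print(old)  # ['The total is $5', '.', '50', '.', ' Ship it', '!', '']
    print(new)  # ['The total is $5.50', '.', ' Ship it', '!', '']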

@@ -67,6 +67,8 @@ class TTSService:
                     np.array([0], dtype=np.float32),  # Dummy data for type checking
                     24000,
                     output_format,
+                    speed,
+                    "",
                     is_first_chunk=False,
                     normalizer=normalizer,
                     is_last_chunk=True,

@@ -97,15 +99,22 @@ class TTSService:
                             chunk_audio,
                             24000,
                             output_format,
+                            speed,
+                            chunk_text,
                             is_first_chunk=is_first,
-                            normalizer=normalizer,
                             is_last_chunk=is_last,
+                            normalizer=normalizer,
                         )
                         yield converted
                     except Exception as e:
                         logger.error(f"Failed to convert audio: {str(e)}")
                 else:
-                    yield chunk_audio
+                    trimmed = await AudioService.trim_audio(chunk_audio,
+                        chunk_text,
+                        speed,
+                        is_last,
+                        normalizer)
+                    yield trimmed
             else:
                 # For legacy backends, load voice tensor
                 voice_tensor = await self._voice_manager.load_voice(

@@ -130,6 +139,8 @@ class TTSService:
                             chunk_audio,
                             24000,
                             output_format,
+                            speed,
+                            chunk_text,
                             is_first_chunk=is_first,
                             normalizer=normalizer,
                             is_last_chunk=is_last,

@@ -138,7 +149,12 @@ class TTSService:
                     except Exception as e:
                         logger.error(f"Failed to convert audio: {str(e)}")
                 else:
-                    yield chunk_audio
+                    trimmed = await AudioService.trim_audio(chunk_audio,
+                        chunk_text,
+                        speed,
+                        is_last,
+                        normalizer)
+                    yield trimmed
         except Exception as e:
             logger.error(f"Failed to process tokens: {str(e)}")
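
Net effect in the streaming path: encoded formats are trimmed inside convert_audio (which now receives speed and chunk_text), while the raw-audio branch trims explicitly before yielding. A condensed sketch of the per-chunk decision, with control flow simplified from the diff (emit_chunk is a hypothetical name; AudioService is as defined in the audio service diff above):

    # Simplified per-chunk flow after this commit (not the full service method)
    async def emit_chunk(chunk_audio, chunk_text, speed, is_first, is_last,
                         output_format, normalizer):
        if output_format:
            # Encoded path: convert_audio normalizes, trims, then encodes
            return await AudioService.convert_audio(
                chunk_audio, 24000, output_format, speed, chunk_text,
                is_first_chunk=is_first, is_last_chunk=is_last,
                normalizer=normalizer,
            )
        # Raw path: trim leading/trailing silence, then yield the bare samples
        return AudioService.trim_audio(chunk_audio, chunk_text, speed,
                                       is_last, normalizer)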

@@ -9,10 +9,10 @@ RUN apt-get update && apt-get install -y \
     curl \
     ffmpeg \
     g++ \
-    && apt-get clean \
-    && rm -rf /var/lib/apt/lists/* \
-    && mkdir -p /usr/share/espeak-ng-data \
-    && ln -s /usr/lib/*/espeak-ng-data/* /usr/share/espeak-ng-data/
+    && apt-get clean \
+    && rm -rf /var/lib/apt/lists/* \
+    && mkdir -p /usr/share/espeak-ng-data \
+    && ln -s /usr/lib/*/espeak-ng-data/* /usr/share/espeak-ng-data/
 
 # Install UV using the installer script
 RUN curl -LsSf https://astral.sh/uv/install.sh | sh && \

@@ -20,7 +20,7 @@ RUN curl -LsSf https://astral.sh/uv/install.sh | sh && \
     mv /root/.local/bin/uvx /usr/local/bin/
 
 # Create non-root user and set up directories and permissions
-RUN useradd -m -u 1000 appuser && \
+RUN useradd -m -u 1001 appuser && \
     mkdir -p /app/api/src/models/v1_0 && \
     chown -R appuser:appuser /app

@@ -32,7 +32,7 @@ COPY --chown=appuser:appuser pyproject.toml ./pyproject.toml
 
 # Install dependencies
 RUN --mount=type=cache,target=/root/.cache/uv \
-    uv venv && \
+    uv venv --python 3.11 && \
     uv sync --extra cpu
 
 # Copy project files including models

@@ -40,6 +40,7 @@ COPY --chown=appuser:appuser api ./api
 COPY --chown=appuser:appuser web ./web
 COPY --chown=appuser:appuser docker/scripts/ ./
 RUN chmod +x ./entrypoint.sh
+RUN sed -i 's/\r$//' ./entrypoint.sh
 
 # Set environment variables
 ENV PYTHONUNBUFFERED=1 \

@@ -1,26 +1,29 @@
-FROM --platform=$BUILDPLATFORM nvidia/cuda:12.4.1-cudnn-runtime-ubuntu22.04
+FROM --platform=$BUILDPLATFORM nvidia/cuda:12.8.0-cudnn-runtime-ubuntu24.04
 # Set non-interactive frontend
 ENV DEBIAN_FRONTEND=noninteractive
 
 # Install Python and other dependencies
 RUN apt-get update && apt-get install -y \
-    python3.10 \
-    python3.10-venv \
+    python3-venv \
     espeak-ng \
     espeak-ng-data \
     git \
     libsndfile1 \
     curl \
     ffmpeg \
-    && apt-get clean && rm -rf /var/lib/apt/lists/* \
+    && apt-get clean \
+    && rm -rf /var/lib/apt/lists/* \
+    && mkdir -p /usr/share/espeak-ng-data \
+    && ln -s /usr/lib/*/espeak-ng-data/* /usr/share/espeak-ng-data/
 
 # Install UV using the installer script
 RUN curl -LsSf https://astral.sh/uv/install.sh | sh && \
     mv /root/.local/bin/uv /usr/local/bin/ && \
-    mv /root/.local/bin/uvx /usr/local/bin/ && \
-    useradd -m -u 1000 appuser && \
+    mv /root/.local/bin/uvx /usr/local/bin/
+
 # Create non-root user and set up directories and permissions
+RUN useradd -m -u 1001 appuser && \
     mkdir -p /app/api/src/models/v1_0 && \
     chown -R appuser:appuser /app

@@ -32,7 +35,7 @@ COPY --chown=appuser:appuser pyproject.toml ./pyproject.toml
 
 # Install dependencies with GPU extras (using cache mounts)
 RUN --mount=type=cache,target=/root/.cache/uv \
-    uv venv && \
+    uv venv --python 3.11 && \
     uv sync --extra gpu
 
 # Copy project files including models and sync again

@@ -40,6 +43,7 @@ COPY --chown=appuser:appuser api ./api
 COPY --chown=appuser:appuser web ./web
 COPY --chown=appuser:appuser docker/scripts/ ./
 RUN chmod +x ./entrypoint.sh
+RUN sed -i 's/\r$//' ./entrypoint.sh
 RUN --mount=type=cache,target=/root/.cache/uv \
     uv sync --extra gpu

@@ -36,7 +36,8 @@ dependencies = [
     "kokoro @ git+https://github.com/hexgrad/kokoro.git@31a2b6337b8c1b1418ef68c48142328f640da938",
     'misaki[en,ja,ko,zh] @ git+https://github.com/hexgrad/misaki.git@ebc76c21b66c5fc4866ed0ec234047177b396170',
     "spacy==3.7.2",
-    "en-core-web-sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl"
+    "en-core-web-sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl",
+    "inflect>=7.5.0",
 ]
 
 [project.optional-dependencies]

start-gpu.bat (new file, 10 lines added)

@@ -0,0 +1,10 @@
+set PYTHONUTF8=1
+set USE_GPU=true
+set USE_ONNX=false
+set PYTHONPATH=%PROJECT_ROOT%;%PROJECT_ROOT%\api
+set MODEL_DIR=src\models
+set VOICES_DIR=src\voices\v1_0
+set WEB_PLAYER_PATH=%PROJECT_ROOT%\web
+
+call uv pip install -e ".[gpu]"
+call uv run uvicorn api.src.main:app --reload --host 0.0.0.0 --port 8880