diff --git a/api/src/core/config.py b/api/src/core/config.py
index d2e369b..d361a5c 100644
--- a/api/src/core/config.py
+++ b/api/src/core/config.py
@@ -28,8 +28,11 @@ class Settings(BaseSettings):
     target_min_tokens: int = 175  # Target minimum tokens per chunk
     target_max_tokens: int = 250  # Target maximum tokens per chunk
     absolute_max_tokens: int = 450  # Absolute maximum tokens per chunk
+    advanced_text_normalization: bool = True  # Preprocesses the text before misaki, which leads to more accurate pronunciation
-    gap_trim_ms: int = 250  # Amount to trim from streaming chunk ends in milliseconds
+    gap_trim_ms: int = 1  # Base amount to trim from streaming chunk ends in milliseconds
+    dynamic_gap_trim_padding_ms: int = 410  # Padding to add to dynamic gap trim
+    dynamic_gap_trim_padding_char_multiplier: dict[str, float] = {".": 1, "!": 0.9, "?": 1, ",": 0.8}

     # Web Player Settings
     enable_web_player: bool = True  # Whether to serve the web player UI
diff --git a/api/src/inference/kokoro_v1.py b/api/src/inference/kokoro_v1.py
index 9f5e206..248593e 100644
--- a/api/src/inference/kokoro_v1.py
+++ b/api/src/inference/kokoro_v1.py
@@ -144,7 +144,7 @@ class KokoroV1(BaseModelBackend):
             pipeline = self._get_pipeline(pipeline_lang_code)

             logger.debug(
-                f"Generating audio from tokens with lang_code '{pipeline_lang_code}': '{tokens[:100]}...'"
+                f"Generating audio from tokens with lang_code '{pipeline_lang_code}': '{tokens[:100]}{'...' if len(tokens) > 100 else ''}'"
             )
             for result in pipeline.generate_from_tokens(
                 tokens=tokens, voice=voice_path, speed=speed, model=self._model
@@ -192,7 +192,6 @@ class KokoroV1(BaseModelBackend):
         """
         if not self.is_loaded:
            raise RuntimeError("Model not loaded")
-
        try:
            # Memory management for GPU
            if self._device == "cuda":
@@ -237,7 +236,7 @@ class KokoroV1(BaseModelBackend):
             pipeline = self._get_pipeline(pipeline_lang_code)

             logger.debug(
-                f"Generating audio for text with lang_code '{pipeline_lang_code}': '{text[:100]}...'"
+                f"Generating audio for text with lang_code '{pipeline_lang_code}': '{text[:100]}{'...' if len(text) > 100 else ''}'"
             )
             for result in pipeline(
                 text, voice=voice_path, speed=speed, model=self._model
diff --git a/api/src/services/audio.py b/api/src/services/audio.py
index 2055c9f..64062b8 100644
--- a/api/src/services/audio.py
+++ b/api/src/services/audio.py
@@ -4,10 +4,12 @@ import struct
 from io import BytesIO

 import numpy as np
+import math
 import scipy.io.wavfile as wavfile
 import soundfile as sf
 from loguru import logger
 from pydub import AudioSegment
+from torch import norm

 from ..core.config import settings
 from .streaming_audio_writer import StreamingAudioWriter
@@ -20,23 +22,66 @@ class AudioNormalizer:
         self.chunk_trim_ms = settings.gap_trim_ms
         self.sample_rate = 24000  # Sample rate of the audio
         self.samples_to_trim = int(self.chunk_trim_ms * self.sample_rate / 1000)
+        self.samples_to_pad_start = int(50 * self.sample_rate / 1000)
+
+    def find_first_last_non_silent(self, audio_data: np.ndarray, chunk_text: str, speed: float, silence_threshold_db: int = -45, is_last_chunk: bool = False) -> tuple[int, int]:
+        """Finds the indices of the first and last non-silent samples in audio data.
+
+        Args:
+            audio_data: Input audio data as numpy array
+            chunk_text: The text sent to the model to generate the resulting speech
+            speed: The speaking speed of the voice
+            silence_threshold_db: How quiet audio has to be to be considered silent
+            is_last_chunk: Whether this is the last chunk
+
+        Returns:
+            A tuple with the start and the end of the non-silent portion
+        """
+
+        pad_multiplier = 1
+        split_character = chunk_text.strip()
+        if len(split_character) > 0:
+            split_character = split_character[-1]
+            if split_character in settings.dynamic_gap_trim_padding_char_multiplier:
+                pad_multiplier = settings.dynamic_gap_trim_padding_char_multiplier[split_character]
+
+        if not is_last_chunk:
+            samples_to_pad_end = max(int((settings.dynamic_gap_trim_padding_ms * self.sample_rate * pad_multiplier) / 1000) - self.samples_to_pad_start, 0)
+        else:
+            samples_to_pad_end = self.samples_to_pad_start
+
+        # Convert dBFS threshold to amplitude
+        amplitude_threshold = np.iinfo(audio_data.dtype).max * (10 ** (silence_threshold_db / 20))
+
+        # Find the first samples above the silence threshold at the start and end of the audio
+        non_silent_index_start, non_silent_index_end = None, None
+
+        for X in range(0, len(audio_data)):
+            if audio_data[X] > amplitude_threshold:
+                non_silent_index_start = X
+                break
+
+        for X in range(len(audio_data) - 1, -1, -1):
+            if audio_data[X] > amplitude_threshold:
+                non_silent_index_end = X
+                break
+
+        # Handle the case where the entire audio is silent
+        if non_silent_index_start is None or non_silent_index_end is None:
+            return 0, len(audio_data)
+
+        return max(non_silent_index_start - self.samples_to_pad_start, 0), min(non_silent_index_end + math.ceil(samples_to_pad_end / speed), len(audio_data))

     async def normalize(self, audio_data: np.ndarray) -> np.ndarray:
-        """Convert audio data to int16 range and trim silence from start and end
+        """Convert audio data to int16 range

         Args:
             audio_data: Input audio data as numpy array
-
         Returns:
-            Normalized and trimmed audio data
+            Normalized audio data
         """
         if len(audio_data) == 0:
             raise ValueError("Empty audio data")

-        # Trim start and end if enough samples
-        if len(audio_data) > (2 * self.samples_to_trim):
-            audio_data = audio_data[self.samples_to_trim : -self.samples_to_trim]
-
         # Scale directly to int16 range with clipping
         return np.clip(audio_data * 32767, -32768, 32767).astype(np.int16)
@@ -71,6 +116,8 @@ class AudioService:
         audio_data: np.ndarray,
         sample_rate: int,
         output_format: str,
+        speed: float = 1,
+        chunk_text: str = "",
         is_first_chunk: bool = True,
         is_last_chunk: bool = False,
         normalizer: AudioNormalizer = None,
@@ -81,6 +128,8 @@
             audio_data: Numpy array of audio samples
             sample_rate: Sample rate of the audio
             output_format: Target format (wav, mp3, ogg, pcm)
+            speed: The speaking speed of the voice
+            chunk_text: The text sent to the model to generate the resulting speech
             is_first_chunk: Whether this is the first chunk
             is_last_chunk: Whether this is the last chunk
             normalizer: Optional AudioNormalizer instance for consistent normalization
@@ -96,8 +145,10 @@
             # Always normalize audio to ensure proper amplitude scaling
             if normalizer is None:
                 normalizer = AudioNormalizer()
+
             normalized_audio = await normalizer.normalize(audio_data)
-
+            normalized_audio = AudioService.trim_audio(normalized_audio, chunk_text, speed, is_last_chunk, normalizer)
+
             # Get or create format-specific writer
             writer_key = f"{output_format}_{sample_rate}"
             if is_first_chunk or writer_key not in AudioService._writers:
@@ -123,3 +174,27 @@ class AudioService:
             raise ValueError(
                 f"Failed to convert audio stream to {output_format}: {str(e)}"
             )
+    @staticmethod
+    def trim_audio(audio_data: np.ndarray, chunk_text: str = "", speed: float = 1, is_last_chunk: bool = False, normalizer: AudioNormalizer = None) -> np.ndarray:
+        """Trim silence from start and end
+
+        Args:
+            audio_data: Input audio data as numpy array
+            chunk_text: The text sent to the model to generate the resulting speech
+            speed: The speaking speed of the voice
+            is_last_chunk: Whether this is the last chunk
+            normalizer: Optional AudioNormalizer instance for consistent normalization
+
+        Returns:
+            Trimmed audio data
+        """
+        if normalizer is None:
+            normalizer = AudioNormalizer()
+
+        # Trim start and end if enough samples
+        if len(audio_data) > (2 * normalizer.samples_to_trim):
+            audio_data = audio_data[normalizer.samples_to_trim : -normalizer.samples_to_trim]
+
+        # Find non silent portion and trim
+        start_index, end_index = normalizer.find_first_last_non_silent(audio_data, chunk_text, speed, is_last_chunk=is_last_chunk)
+        return audio_data[start_index:end_index]
\ No newline at end of file
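Note on the trimming math above (illustrative sketch, not part of the patch): `find_first_last_non_silent` converts the dBFS threshold to an int16 amplitude with `iinfo(int16).max * 10**(dB/20)`, so -45 dBFS works out to roughly 184, then scans for the first and last samples above that level. A self-contained restatement of that scan follows; the function name is hypothetical and it uses `abs()` over vectorized numpy, while the patch compares raw sample values in a loop.

```python
import numpy as np

def first_last_above_threshold(audio: np.ndarray, threshold_db: float = -45.0) -> tuple[int, int]:
    """Standalone restatement of the silence scan in find_first_last_non_silent."""
    # -45 dBFS relative to int16 full scale: 32767 * 10**(-45/20) ~= 184
    amplitude_threshold = np.iinfo(np.int16).max * (10 ** (threshold_db / 20))
    above = np.nonzero(np.abs(audio.astype(np.int32)) > amplitude_threshold)[0]
    if above.size == 0:
        # Entirely silent chunk: keep everything, as the patch does
        return 0, len(audio)
    return int(above[0]), int(above[-1])

# 100 ms of silence, a 440 Hz tone, then 100 ms of silence at 24 kHz, int16
sr = 24000
tone = (0.5 * 32767 * np.sin(2 * np.pi * 440 * np.arange(sr // 10) / sr)).astype(np.int16)
audio = np.concatenate([np.zeros(sr // 10, dtype=np.int16), tone, np.zeros(sr // 10, dtype=np.int16)])
print(first_last_above_threshold(audio))  # indices near 2400 and 4800, i.e. the tone boundaries
```

The start and end padding (`samples_to_pad_start`, `samples_to_pad_end`) is then applied around these indices before slicing.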
diff --git a/api/src/services/text_processing/normalizer.py b/api/src/services/text_processing/normalizer.py
index 383abbd..7c728fb 100644
--- a/api/src/services/text_processing/normalizer.py
+++ b/api/src/services/text_processing/normalizer.py
@@ -6,6 +6,7 @@ Converts them into a format suitable for text-to-speech processing.

 import re
 from functools import lru_cache
+import inflect

 # Constants
 VALID_TLDS = [
@@ -50,6 +51,26 @@ VALID_TLDS = [
     "io",
 ]
+VALID_UNITS = {
+    "m": "meter", "cm": "centimeter", "mm": "millimeter", "km": "kilometer", "in": "inch", "ft": "foot", "yd": "yard", "mi": "mile",  # Length
+    "g": "gram", "kg": "kilogram", "mg": "milligram",  # Mass
+    "s": "second", "ms": "millisecond", "min": "minute", "h": "hour",  # Time
+    "l": "liter", "ml": "milliliter", "cl": "centiliter", "dl": "deciliter",  # Volume
+    "kph": "kilometer per hour", "mph": "mile per hour", "mi/h": "mile per hour", "m/s": "meter per second", "km/h": "kilometer per hour", "mm/s": "millimeter per second", "cm/s": "centimeter per second", "ft/s": "foot per second",  # Speed
+    "°c": "degree celsius", "c": "degree celsius", "°f": "degree fahrenheit", "f": "degree fahrenheit", "k": "kelvin",  # Temperature
+    "pa": "pascal", "kpa": "kilopascal", "mpa": "megapascal", "atm": "atmosphere",  # Pressure
+    "hz": "hertz", "khz": "kilohertz", "mhz": "megahertz", "ghz": "gigahertz",  # Frequency
+    "v": "volt", "kv": "kilovolt", "mv": "megavolt",  # Voltage
+    "a": "amp", "ma": "megaamp", "ka": "kiloamp",  # Current
+    "w": "watt", "kw": "kilowatt", "mw": "megawatt",  # Power
+    "j": "joule", "kj": "kilojoule", "mj": "megajoule",  # Energy
+    "Ω": "ohm", "kΩ": "kiloohm", "mΩ": "megaohm",  # Resistance (Ohm)
+    "f": "farad", "µf": "microfarad", "nf": "nanofarad", "pf": "picofarad",  # Capacitance
+    "b": "byte", "kb": "kilobyte", "mb": "megabyte", "gb": "gigabyte", "tb": "terabyte", "pb": "petabyte",  # Data size
+    "kbps": "kilobyte per second", "mbps": "megabyte per second", "gbps": "gigabyte per second",
+    "px": "pixel"  # CSS units
+}
+
 # Pre-compiled regex patterns for performance
 EMAIL_PATTERN = re.compile(
     r"\b[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-z]{2,}\b", re.IGNORECASE
 )
@@ -61,6 +82,9 @@ URL_PATTERN = re.compile(
     re.IGNORECASE,
 )

+UNIT_PATTERN = re.compile(r"((?<!\w)([+-]?)(\d{1,3}(,\d{3})*|\d+)(\.\d+)?)\s*(" + "|".join(sorted(list(VALID_UNITS.keys()), reverse=True)) + r""")(?=[!"#$%&'()*+,\-./:;<=>?@\[\\\]^_`{\|}~ \n]{1})""", re.IGNORECASE)
+
+INFLECT_ENGINE = inflect.engine()

 def split_num(num: re.Match[str]) -> str:
     """Handle number splitting for various formats"""
@@ -86,6 +110,13 @@ def split_num(num: re.Match[str]) -> str:
         return f"{left} oh {right}{s}"
     return f"{left} {right}{s}"

+def handle_units(u: re.Match[str]) -> str:
+    unit = u.group(6).strip()
+    if unit.lower() in VALID_UNITS:
+        unit = VALID_UNITS[unit.lower()].split(" ")
+        number = u.group(1).strip()
+        unit[0] = INFLECT_ENGINE.no(unit[0], number)
+    return " ".join(unit)

 def handle_money(m: re.Match[str]) -> str:
     """Convert money expressions to spoken form"""
@@ -187,14 +218,17 @@ def normalize_text(text: str) -> str:
     # Pre-process URLs first
     text = normalize_urls(text)

+    # Pre-process numbers with units
+    text = UNIT_PATTERN.sub(handle_units, text)
+
     # Replace quotes and brackets
     text = text.replace(chr(8216), "'").replace(chr(8217), "'")
     text = text.replace("«", chr(8220)).replace("»", chr(8221))
     text = text.replace(chr(8220), '"').replace(chr(8221), '"')
     text = text.replace("(", "«").replace(")", "»")

-    # Handle CJK punctuation
-    for a, b in zip("、。!,:;?", ",.!,:;?"):
+    # Handle CJK punctuation and some non-standard chars
+    for a, b in zip("、。!,:;?–", ",.!,:;?-"):
         text = text.replace(a, b + " ")

     # Clean up whitespace
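Note on the unit expansion above (illustrative sketch, not part of the patch): `handle_units` looks the matched unit (regex group 6) up in `VALID_UNITS`, splits the spoken form into words, and lets inflect pluralize only the first word against the matched number (group 1). A standalone restatement, where `spell_out` is a hypothetical helper name:

```python
import inflect

# Subset of the VALID_UNITS table added by this patch
VALID_UNITS = {"mb": "megabyte", "km/h": "kilometer per hour"}
engine = inflect.engine()

def spell_out(number: str, unit: str) -> str:
    """Expand a number/unit pair the way handle_units does: pluralize only the first word."""
    words = VALID_UNITS[unit.lower()].split(" ")
    words[0] = engine.no(words[0], number)  # e.g. "24 megabytes", "1 megabyte"
    return " ".join(words)

print(spell_out("24", "MB"))    # 24 megabytes
print(spell_out("1", "km/h"))   # 1 kilometer per hour
print(spell_out("80", "km/h"))  # 80 kilometers per hour
```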
diff --git a/api/src/services/text_processing/text_processor.py b/api/src/services/text_processing/text_processor.py
index 924b014..0e3cfcf 100644
--- a/api/src/services/text_processing/text_processor.py
+++ b/api/src/services/text_processing/text_processor.py
@@ -26,7 +26,7 @@ def process_text_chunk(
         List of token IDs
     """
     start_time = time.time()
-
+
     if skip_phonemize:
         # Input is already phonemes, just tokenize
         t0 = time.time()
@@ -35,12 +35,11 @@
     else:
         # Normal text processing pipeline
         t0 = time.time()
-        normalized = normalize_text(text)
         t1 = time.time()

         t0 = time.time()
         phonemes = phonemize(
-            normalized, language, normalize=False
+            text, language, normalize=False
         )  # Already normalized
         t1 = time.time()

@@ -50,7 +49,7 @@
     total_time = time.time() - start_time
     logger.debug(
-        f"Total processing took {total_time * 1000:.2f}ms for chunk: '{text[:50]}...'"
+        f"Total processing took {total_time * 1000:.2f}ms for chunk: '{text[:50]}{'...' if len(text) > 50 else ''}'"
     )

     return tokens
@@ -61,7 +60,7 @@ async def yield_chunk(
 ) -> Tuple[str, List[int]]:
     """Yield a chunk with consistent logging."""
     logger.debug(
-        f"Yielding chunk {chunk_count}: '{text[:50]}...' ({len(tokens)} tokens)"
+        f"Yielding chunk {chunk_count}: '{text[:50]}{'...' if len(text) > 50 else ''}' ({len(tokens)} tokens)"
     )
     return text, tokens

@@ -88,9 +87,10 @@ def process_text(text: str, language: str = "a") -> List[int]:

 def get_sentence_info(text: str) -> List[Tuple[str, List[int], int]]:
     """Process all sentences and return info."""
-    sentences = re.split(r"([.!?;:])", text)
+    if settings.advanced_text_normalization:
+        text = normalize_text(text)
+    sentences = re.split(r"([.!?;:])(?=\s|$)", text)
     results = []
-
     for i in range(0, len(sentences), 2):
         sentence = sentences[i].strip()
         punct = sentences[i + 1] if i + 1 < len(sentences) else ""
@@ -128,7 +128,7 @@ async def smart_split(
                 chunk_text = " ".join(current_chunk)
                 chunk_count += 1
                 logger.debug(
-                    f"Yielding chunk {chunk_count}: '{chunk_text[:50]}...' ({current_count} tokens)"
+                    f"Yielding chunk {chunk_count}: '{chunk_text[:50]}{'...' if len(chunk_text) > 50 else ''}' ({current_count} tokens)"
                 )
                 yield chunk_text, current_tokens
                 current_chunk = []
@@ -149,6 +149,7 @@
                     continue

                 full_clause = clause + comma
+
                 tokens = process_text_chunk(full_clause)
                 count = len(tokens)

@@ -166,7 +167,7 @@
                         chunk_text = " ".join(clause_chunk)
                         chunk_count += 1
                         logger.debug(
-                            f"Yielding clause chunk {chunk_count}: '{chunk_text[:50]}...' ({clause_count} tokens)"
+                            f"Yielding clause chunk {chunk_count}: '{chunk_text[:50]}{'...' if len(chunk_text) > 50 else ''}' ({clause_count} tokens)"
                         )
                         yield chunk_text, clause_tokens
                         clause_chunk = [full_clause]
@@ -178,7 +179,7 @@
                     chunk_text = " ".join(clause_chunk)
                     chunk_count += 1
                     logger.debug(
-                        f"Yielding final clause chunk {chunk_count}: '{chunk_text[:50]}...' ({clause_count} tokens)"
+                        f"Yielding final clause chunk {chunk_count}: '{chunk_text[:50]}{'...' if len(chunk_text) > 50 else ''}' ({clause_count} tokens)"
                     )
                     yield chunk_text, clause_tokens
@@ -192,7 +193,7 @@
                 chunk_text = " ".join(current_chunk)
                 chunk_count += 1
                 logger.info(
-                    f"Yielding chunk {chunk_count}: '{chunk_text[:50]}...' ({current_count} tokens)"
+                    f"Yielding chunk {chunk_count}: '{chunk_text[:50]}{'...' if len(chunk_text) > 50 else ''}' ({current_count} tokens)"
                 )
                 yield chunk_text, current_tokens
                 current_chunk = [sentence]
@@ -217,7 +218,7 @@
                 chunk_text = " ".join(current_chunk)
                 chunk_count += 1
                 logger.info(
-                    f"Yielding chunk {chunk_count}: '{chunk_text[:50]}...' ({current_count} tokens)"
+                    f"Yielding chunk {chunk_count}: '{chunk_text[:50]}{'...' if len(chunk_text) > 50 else ''}' ({current_count} tokens)"
                 )
                 yield chunk_text, current_tokens
                 current_chunk = [sentence]
@@ -229,7 +230,7 @@
         chunk_text = " ".join(current_chunk)
         chunk_count += 1
         logger.info(
-            f"Yielding final chunk {chunk_count}: '{chunk_text[:50]}...' ({current_count} tokens)"
+            f"Yielding final chunk {chunk_count}: '{chunk_text[:50]}{'...' if len(chunk_text) > 50 else ''}' ({current_count} tokens)"
         )
         yield chunk_text, current_tokens
diff --git a/api/src/services/tts_service.py b/api/src/services/tts_service.py
index 44fb709..3d533d9 100644
--- a/api/src/services/tts_service.py
+++ b/api/src/services/tts_service.py
@@ -67,6 +67,8 @@ class TTSService:
                     np.array([0], dtype=np.float32),  # Dummy data for type checking
                     24000,
                     output_format,
+                    speed,
+                    "",
                     is_first_chunk=False,
                     normalizer=normalizer,
                     is_last_chunk=True,
@@ -97,15 +99,22 @@ class TTSService:
                             chunk_audio,
                             24000,
                             output_format,
+                            speed,
+                            chunk_text,
                             is_first_chunk=is_first,
-                            normalizer=normalizer,
                             is_last_chunk=is_last,
+                            normalizer=normalizer,
                         )
                         yield converted
                     except Exception as e:
                         logger.error(f"Failed to convert audio: {str(e)}")
                 else:
-                    yield chunk_audio
+                    trimmed = AudioService.trim_audio(chunk_audio,
+                                                      chunk_text,
+                                                      speed,
+                                                      is_last,
+                                                      normalizer)
+                    yield trimmed
             else:
                 # For legacy backends, load voice tensor
                 voice_tensor = await self._voice_manager.load_voice(
@@ -130,6 +139,8 @@
                             chunk_audio,
                             24000,
                             output_format,
+                            speed,
+                            chunk_text,
                             is_first_chunk=is_first,
                             normalizer=normalizer,
                             is_last_chunk=is_last,
@@ -138,7 +149,12 @@
                     except Exception as e:
                         logger.error(f"Failed to convert audio: {str(e)}")
                 else:
-                    yield chunk_audio
+                    trimmed = AudioService.trim_audio(chunk_audio,
+                                                      chunk_text,
+                                                      speed,
+                                                      is_last,
+                                                      normalizer)
+                    yield trimmed

         except Exception as e:
             logger.error(f"Failed to process tokens: {str(e)}")
diff --git a/docker/cpu/Dockerfile b/docker/cpu/Dockerfile
index 5d6f99e..369d008 100644
--- a/docker/cpu/Dockerfile
+++ b/docker/cpu/Dockerfile
@@ -9,10 +9,10 @@ RUN apt-get update && apt-get install -y \
     curl \
     ffmpeg \
     g++ \
-    && apt-get clean \
-    && rm -rf /var/lib/apt/lists/* \
-    && mkdir -p /usr/share/espeak-ng-data \
-    && ln -s /usr/lib/*/espeak-ng-data/* /usr/share/espeak-ng-data/
+&& apt-get clean \
+&& rm -rf /var/lib/apt/lists/* \
+&& mkdir -p /usr/share/espeak-ng-data \
+&& ln -s /usr/lib/*/espeak-ng-data/* /usr/share/espeak-ng-data/

 # Install UV using the installer script
 RUN curl -LsSf https://astral.sh/uv/install.sh | sh && \
@@ -20,7 +20,7 @@ RUN curl -LsSf https://astral.sh/uv/install.sh | sh && \
     mv /root/.local/bin/uvx /usr/local/bin/

 # Create non-root user and set up directories and permissions
-RUN useradd -m -u 1000 appuser && \
+RUN useradd -m -u 1001 appuser && \
     mkdir -p /app/api/src/models/v1_0 && \
     chown -R appuser:appuser /app

@@ -32,7 +32,7 @@ COPY --chown=appuser:appuser pyproject.toml ./pyproject.toml

 # Install dependencies
 RUN --mount=type=cache,target=/root/.cache/uv \
-    uv venv && \
+    uv venv --python 3.11 && \
     uv sync --extra cpu

 # Copy project files including models
@@ -40,6 +40,7 @@ COPY --chown=appuser:appuser api ./api
 COPY --chown=appuser:appuser web ./web
 COPY --chown=appuser:appuser docker/scripts/ ./
 RUN chmod +x ./entrypoint.sh
+RUN sed -i 's/\r$//' ./entrypoint.sh

 # Set environment variables
 ENV PYTHONUNBUFFERED=1 \
diff --git a/docker/gpu/Dockerfile b/docker/gpu/Dockerfile
index ce0f646..b19cf80 100644
--- a/docker/gpu/Dockerfile
+++ b/docker/gpu/Dockerfile
@@ -1,26 +1,29 @@
-FROM --platform=$BUILDPLATFORM nvidia/cuda:12.4.1-cudnn-runtime-ubuntu22.04
+FROM --platform=$BUILDPLATFORM nvidia/cuda:12.8.0-cudnn-runtime-ubuntu24.04

 # Set non-interactive frontend
 ENV DEBIAN_FRONTEND=noninteractive

 # Install Python and other dependencies
 RUN apt-get update && apt-get install -y \
     python3.10 \
-    python3.10-venv \
+    python3-venv \
     espeak-ng \
     espeak-ng-data \
     git \
     libsndfile1 \
     curl \
     ffmpeg \
-    && apt-get clean && rm -rf /var/lib/apt/lists/* \
+    && apt-get clean \
+    && rm -rf /var/lib/apt/lists/* \
     && mkdir -p /usr/share/espeak-ng-data \
     && ln -s /usr/lib/*/espeak-ng-data/* /usr/share/espeak-ng-data/

 # Install UV using the installer script
 RUN curl -LsSf https://astral.sh/uv/install.sh | sh && \
     mv /root/.local/bin/uv /usr/local/bin/ && \
-    mv /root/.local/bin/uvx /usr/local/bin/ && \
-    useradd -m -u 1000 appuser && \
+    mv /root/.local/bin/uvx /usr/local/bin/
+
+# Create non-root user and set up directories and permissions
+RUN useradd -m -u 1001 appuser && \
     mkdir -p /app/api/src/models/v1_0 && \
     chown -R appuser:appuser /app
@@ -32,7 +35,7 @@ COPY --chown=appuser:appuser pyproject.toml ./pyproject.toml

 # Install dependencies with GPU extras (using cache mounts)
 RUN --mount=type=cache,target=/root/.cache/uv \
-    uv venv && \
+    uv venv --python 3.11 && \
     uv sync --extra gpu

 # Copy project files including models and sync again
@@ -40,6 +43,7 @@ COPY --chown=appuser:appuser api ./api
 COPY --chown=appuser:appuser web ./web
 COPY --chown=appuser:appuser docker/scripts/ ./
 RUN chmod +x ./entrypoint.sh
+RUN sed -i 's/\r$//' ./entrypoint.sh

 RUN --mount=type=cache,target=/root/.cache/uv \
     uv sync --extra gpu
diff --git a/pyproject.toml b/pyproject.toml
index 336519e..4205fea 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -36,7 +36,8 @@ dependencies = [
     "kokoro @ git+https://github.com/hexgrad/kokoro.git@31a2b6337b8c1b1418ef68c48142328f640da938",
     'misaki[en,ja,ko,zh] @ git+https://github.com/hexgrad/misaki.git@ebc76c21b66c5fc4866ed0ec234047177b396170',
     "spacy==3.7.2",
-    "en-core-web-sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl"
+    "en-core-web-sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl",
+    "inflect>=7.5.0",
 ]

 [project.optional-dependencies]
diff --git a/start-gpu.bat b/start-gpu.bat
new file mode 100644
index 0000000..d7067e4
--- /dev/null
+++ b/start-gpu.bat
@@ -0,0 +1,10 @@
+set PYTHONUTF8=1
+set USE_GPU=true
+set USE_ONNX=false
+set PYTHONPATH=%PROJECT_ROOT%;%PROJECT_ROOT%\api
+set MODEL_DIR=src\models
+set VOICES_DIR=src\voices\v1_0
+set WEB_PLAYER_PATH=%PROJECT_ROOT%\web
+
+call uv pip install -e ".[gpu]"
+call uv run uvicorn api.src.main:app --reload --host 0.0.0.0 --port 8880
\ No newline at end of file
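End-of-patch note: the net effect of the new gap-trim defaults (`gap_trim_ms = 1`, `dynamic_gap_trim_padding_ms = 410`, the per-punctuation multipliers) on each chunk can be sanity-checked with a few lines of arithmetic. Illustrative sketch only; `end_padding_samples` is a hypothetical helper restating the padding computation from audio.py:

```python
import math

SAMPLE_RATE = 24000
PAD_START = int(50 * SAMPLE_RATE / 1000)  # 1200 samples kept before speech onset
PADDING_MS = 410                          # dynamic_gap_trim_padding_ms default
MULTIPLIER = {".": 1, "!": 0.9, "?": 1, ",": 0.8}

def end_padding_samples(last_char: str, speed: float, is_last_chunk: bool) -> int:
    """Samples kept after the last non-silent sample, per find_first_last_non_silent."""
    if is_last_chunk:
        return math.ceil(PAD_START / speed)
    mult = MULTIPLIER.get(last_char, 1)
    pad = max(int(PADDING_MS * SAMPLE_RATE * mult / 1000) - PAD_START, 0)
    return math.ceil(pad / speed)

print(end_padding_samples(".", 1.0, False))  # 9840 - 1200 = 8640 samples (~360 ms)
print(end_padding_samples(",", 1.0, False))  # 7872 - 1200 = 6672 samples (~278 ms)
print(end_padding_samples(".", 1.0, True))   # final chunk keeps just the 1200-sample pad
```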