Made the API use the normalizer, fixed the wrong version of espeak, added better normalization, improved the sentence splitting, fixed some formatting

This commit is contained in:
Fireblade 2025-02-10 21:45:05 -05:00
parent 9b76ce2071
commit ab1c21130e
10 changed files with 187 additions and 43 deletions

View file

@ -28,8 +28,11 @@ class Settings(BaseSettings):
target_min_tokens: int = 175 # Target minimum tokens per chunk
target_max_tokens: int = 250 # Target maximum tokens per chunk
absolute_max_tokens: int = 450 # Absolute maximum tokens per chunk
advanced_text_normalization: bool = True # Preprocesses the text before misaki, which leads to more accurate handling of numbers, units, and URLs
gap_trim_ms: int = 250 # Amount to trim from streaming chunk ends in milliseconds
gap_trim_ms: int = 1 # Base amount to trim from streaming chunk ends in milliseconds
dynamic_gap_trim_padding_ms: int = 410 # Padding to add to dynamic gap trim
dynamic_gap_trim_padding_char_multiplier: dict[str,float] = {".":1,"!":0.9,"?":1,",":0.8}
# Web Player Settings
enable_web_player: bool = True # Whether to serve the web player UI
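
A quick sketch of how the new gap-trim settings combine (illustrative values only; this mirrors the AudioNormalizer logic later in this commit and is not part of the diff):

# Sketch: end-of-chunk padding scaled by the punctuation multiplier.
# Values come from the defaults above; chunk_text is a made-up example.
sample_rate = 24000
chunk_text = "Hello there!"
multiplier = {".": 1, "!": 0.9, "?": 1, ",": 0.8}.get(chunk_text.strip()[-1], 1)
samples_to_pad_end = int((410 * sample_rate * multiplier) / 1000)
print(samples_to_pad_end)  # 8856 samples, i.e. 410 ms scaled by 0.9 for "!"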

View file

@ -144,7 +144,7 @@ class KokoroV1(BaseModelBackend):
pipeline = self._get_pipeline(pipeline_lang_code)
logger.debug(
f"Generating audio from tokens with lang_code '{pipeline_lang_code}': '{tokens[:100]}...'"
f"Generating audio from tokens with lang_code '{pipeline_lang_code}': '{tokens[:100]}{'...' if len(tokens) > 100 else ''}'"
)
for result in pipeline.generate_from_tokens(
tokens=tokens, voice=voice_path, speed=speed, model=self._model
@ -192,7 +192,6 @@ class KokoroV1(BaseModelBackend):
"""
if not self.is_loaded:
raise RuntimeError("Model not loaded")
try:
# Memory management for GPU
if self._device == "cuda":
@ -237,7 +236,7 @@ class KokoroV1(BaseModelBackend):
pipeline = self._get_pipeline(pipeline_lang_code)
logger.debug(
f"Generating audio for text with lang_code '{pipeline_lang_code}': '{text[:100]}...'"
f"Generating audio for text with lang_code '{pipeline_lang_code}': '{text[:100]}{'...' if len(text) > 100 else ''}'"
)
for result in pipeline(
text, voice=voice_path, speed=speed, model=self._model

View file

@ -4,10 +4,12 @@ import struct
from io import BytesIO
import numpy as np
import math
import scipy.io.wavfile as wavfile
import soundfile as sf
from loguru import logger
from pydub import AudioSegment
from torch import norm
from ..core.config import settings
from .streaming_audio_writer import StreamingAudioWriter
@ -20,23 +22,66 @@ class AudioNormalizer:
self.chunk_trim_ms = settings.gap_trim_ms
self.sample_rate = 24000 # Sample rate of the audio
self.samples_to_trim = int(self.chunk_trim_ms * self.sample_rate / 1000)
self.samples_to_pad_start = int(50 * self.sample_rate / 1000) # 50 ms of padding at the start
async def normalize(self, audio_data: np.ndarray) -> np.ndarray:
"""Convert audio data to int16 range and trim silence from start and end
def find_first_last_non_silent(self, audio_data: np.ndarray, chunk_text: str, speed: float, silence_threshold_db: int = -45, is_last_chunk: bool = False) -> tuple[int, int]:
"""Finds the indices of the first and last non-silent samples in audio data.
Args:
audio_data: Input audio data as numpy array
chunk_text: The text sent to the model to generate the resulting speech
speed: The speaking speed of the voice
silence_threshold_db: How quiet audio has to be to be considered silent
is_last_chunk: Whether this is the last chunk
Returns:
Normalized and trimmed audio data
A tuple containing the indices of the first and last non-silent samples
"""
pad_multiplier = 1
split_character = chunk_text.strip()
if len(split_character) > 0:
split_character = split_character[-1]
if split_character in settings.dynamic_gap_trim_padding_char_multiplier:
pad_multiplier = settings.dynamic_gap_trim_padding_char_multiplier[split_character]
if not is_last_chunk:
samples_to_pad_end = max(int((settings.dynamic_gap_trim_padding_ms * self.sample_rate * pad_multiplier) / 1000) - self.samples_to_pad_start, 0)
else:
samples_to_pad_end = self.samples_to_pad_start
# Convert dBFS threshold to amplitude
amplitude_threshold = np.iinfo(audio_data.dtype).max * (10 ** (silence_threshold_db / 20))
# Find the first samples above the silence threshold at the start and end of the audio
non_silent_index_start, non_silent_index_end = None, None
for x in range(len(audio_data)):
if audio_data[x] > amplitude_threshold:
non_silent_index_start = x
break
for x in range(len(audio_data) - 1, -1, -1):
if audio_data[x] > amplitude_threshold:
non_silent_index_end = x
break
# Handle the case where the entire audio is silent
if non_silent_index_start is None or non_silent_index_end is None:
return 0, len(audio_data)
return max(non_silent_index_start - self.samples_to_pad_start, 0), min(non_silent_index_end + math.ceil(samples_to_pad_end / speed), len(audio_data))
async def normalize(self, audio_data: np.ndarray) -> np.ndarray:
"""Convert audio data to int16 range
Args:
audio_data: Input audio data as numpy array
Returns:
Normalized audio data
"""
if len(audio_data) == 0:
raise ValueError("Empty audio data")
# Trim start and end if enough samples
if len(audio_data) > (2 * self.samples_to_trim):
audio_data = audio_data[self.samples_to_trim : -self.samples_to_trim]
# Scale directly to int16 range with clipping
return np.clip(audio_data * 32767, -32768, 32767).astype(np.int16)
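
For reference, the -45 dBFS threshold in find_first_last_non_silent converts to an int16 amplitude roughly as follows (a minimal standalone sketch, not part of the diff):

import numpy as np

# -45 dBFS relative to int16 full scale; samples at or below this count as silence
amplitude_threshold = np.iinfo(np.int16).max * (10 ** (-45 / 20))
print(round(amplitude_threshold))  # ~184 out of 32767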
@ -71,6 +116,8 @@ class AudioService:
audio_data: np.ndarray,
sample_rate: int,
output_format: str,
speed: float = 1,
chunk_text: str = "",
is_first_chunk: bool = True,
is_last_chunk: bool = False,
normalizer: AudioNormalizer = None,
@ -81,6 +128,8 @@ class AudioService:
audio_data: Numpy array of audio samples
sample_rate: Sample rate of the audio
output_format: Target format (wav, mp3, ogg, pcm)
speed: The speaking speed of the voice
chunk_text: The text sent to the model to generate the resulting speech
is_first_chunk: Whether this is the first chunk
is_last_chunk: Whether this is the last chunk
normalizer: Optional AudioNormalizer instance for consistent normalization
@ -96,7 +145,9 @@ class AudioService:
# Always normalize audio to ensure proper amplitude scaling
if normalizer is None:
normalizer = AudioNormalizer()
normalized_audio = await normalizer.normalize(audio_data)
normalized_audio = AudioService.trim_audio(normalized_audio, chunk_text, speed, is_last_chunk, normalizer)
# Get or create format-specific writer
writer_key = f"{output_format}_{sample_rate}"
@ -123,3 +174,27 @@ class AudioService:
raise ValueError(
f"Failed to convert audio stream to {output_format}: {str(e)}"
)
@staticmethod
def trim_audio(audio_data: np.ndarray, chunk_text: str = "", speed: float = 1, is_last_chunk: bool = False, normalizer: AudioNormalizer = None) -> np.ndarray:
"""Trim silence from start and end
Args:
audio_data: Input audio data as numpy array
chunk_text: The text sent to the model to generate the resulting speech
speed: The speaking speed of the voice
is_last_chunk: Whether this is the last chunk
normalizer: Optional AudioNormalizer instance for consistent normalization
Returns:
Trimmed audio data
"""
if normalizer is None:
normalizer = AudioNormalizer()
# Trim start and end if enough samples
if len(audio_data) > (2 * normalizer.samples_to_trim):
audio_data = audio_data[normalizer.samples_to_trim : -normalizer.samples_to_trim]
# Find non silent portion and trim
start_index, end_index = normalizer.find_first_last_non_silent(audio_data, chunk_text, speed, is_last_chunk=is_last_chunk)
return audio_data[start_index:end_index]
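
A usage sketch for the new trim path (synthetic audio, default settings assumed; the import path is hypothetical):

import numpy as np
from api.src.services.audio import AudioService  # assumed module path

# Half a second of silence around a 1 s tone; trim_audio should cut most of it.
silence = np.zeros(12000, dtype=np.int16)
tone = (np.sin(np.linspace(0, 440 * 2 * np.pi, 24000)) * 16000).astype(np.int16)
audio = np.concatenate([silence, tone, silence])
trimmed = AudioService.trim_audio(audio, chunk_text="Hello.", speed=1.0)
assert len(trimmed) < len(audio)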

View file

@ -6,6 +6,7 @@ Converts them into a format suitable for text-to-speech processing.
import re
from functools import lru_cache
import inflect
# Constants
VALID_TLDS = [
@ -50,6 +51,26 @@ VALID_TLDS = [
"io",
]
VALID_UNITS = {
"m":"meter", "cm":"centimeter", "mm":"millimeter", "km":"kilometer", "in":"inch", "ft":"foot", "yd":"yard", "mi":"mile", # Length
"g":"gram", "kg":"kilogram", "mg":"miligram", # Mass
"s":"second", "ms":"milisecond", "min":"minutes", "h":"hour", # Time
"l":"liter", "ml":"mililiter", "cl":"centiliter", "dl":"deciliter", # Volume
"kph":"kilometer per hour", "mph":"mile per hour","mi/h":"mile per hour", "m/s":"meter per second", "km/h":"kilometer per hour", "mm/s":"milimeter per second","cm/s":"centimeter per second", "ft/s":"feet per second", # Speed
"°c":"degree celsius","c":"degree celsius", "°f":"degree fahrenheit","f":"degree fahrenheit", "k":"kelvin", # Temperature
"pa":"pascal", "kpa":"kilopascal", "mpa":"megapascal", "atm":"atmosphere", # Pressure
"hz":"hertz", "khz":"kilohertz", "mhz":"megahertz", "ghz":"gigahertz", # Frequency
"v":"volt", "kv":"kilovolt", "mv":"mergavolt", # Voltage
"a":"amp", "ma":"megaamp", "ka":"kiloamp", # Current
"w":"watt", "kw":"kilowatt", "mw":"megawatt", # Power
"j":"joule", "kj":"kilojoule", "mj":"megajoule", # Energy
"Ω":"ohm", "":"kiloohm", "":"megaohm", # Resistance (Ohm)
"f":"farad", "µf":"microfarad", "nf":"nanofarad", "pf":"picofarad", # Capacitance
"b":"byte", "kb":"kilobyte", "mb":"megabyte", "gb":"gigabyte", "tb":"terabyte", "pb":"petabyte", # Data size
"kbps":"kilobyte per second","mbps":"megabyte per second","gbps":"gigabyte per second",
"px":"pixel" # CSS units
}
# Pre-compiled regex patterns for performance
EMAIL_PATTERN = re.compile(
r"\b[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-z]{2,}\b", re.IGNORECASE
@ -61,6 +82,9 @@ URL_PATTERN = re.compile(
re.IGNORECASE,
)
UNIT_PATTERN = re.compile(r"((?<!\w)([+-]?)(\d{1,3}(,\d{3})*|\d+)(\.\d+)?)\s*(" + "|".join(sorted(list(VALID_UNITS.keys()),reverse=True)) + r"""){1}(?=[!"#$%&'()*+,-./:;<=>?@\[\\\]^_`{\|}~ \n]{1})""",re.IGNORECASE)
INFLECT_ENGINE = inflect.engine()
def split_num(num: re.Match[str]) -> str:
"""Handle number splitting for various formats"""
@ -86,6 +110,13 @@ def split_num(num: re.Match[str]) -> str:
return f"{left} oh {right}{s}"
return f"{left} {right}{s}"
def handle_units(u: re.Match[str]) -> str:
"""Convert a matched number-plus-unit expression to its spoken form"""
unit = u.group(6).strip()
if unit.lower() in VALID_UNITS:
unit = VALID_UNITS[unit.lower()].split(" ")
number = u.group(1).strip()
unit[0] = INFLECT_ENGINE.no(unit[0], number)
return " ".join(unit)
def handle_money(m: re.Match[str]) -> str:
"""Convert money expressions to spoken form"""
@ -187,14 +218,17 @@ def normalize_text(text: str) -> str:
# Pre-process URLs first
text = normalize_urls(text)
# Pre-process numbers with units
text = UNIT_PATTERN.sub(handle_units, text)
# Replace quotes and brackets
text = text.replace(chr(8216), "'").replace(chr(8217), "'")
text = text.replace("«", chr(8220)).replace("»", chr(8221))
text = text.replace(chr(8220), '"').replace(chr(8221), '"')
text = text.replace("(", "«").replace(")", "»")
# Handle CJK punctuation
for a, b in zip("、。!,:;?", ",.!,:;?"):
# Handle CJK punctuation and some non-standard chars
for a, b in zip("、。!,:;?—", ",.!,:;?-"):
text = text.replace(a, b + " ")
# Clean up whitespace

View file

@ -35,12 +35,11 @@ def process_text_chunk(
else:
# Normal text processing pipeline
t0 = time.time()
normalized = normalize_text(text)
t1 = time.time()
t0 = time.time()
phonemes = phonemize(
normalized, language, normalize=False
text, language, normalize=False
) # Already normalized
t1 = time.time()
@ -50,7 +49,7 @@ def process_text_chunk(
total_time = time.time() - start_time
logger.debug(
f"Total processing took {total_time * 1000:.2f}ms for chunk: '{text[:50]}...'"
f"Total processing took {total_time * 1000:.2f}ms for chunk: '{text[:50]}{'...' if len(text) > 50 else ''}'"
)
return tokens
@ -61,7 +60,7 @@ async def yield_chunk(
) -> Tuple[str, List[int]]:
"""Yield a chunk with consistent logging."""
logger.debug(
f"Yielding chunk {chunk_count}: '{text[:50]}...' ({len(tokens)} tokens)"
f"Yielding chunk {chunk_count}: '{text[:50]}{'...' if len(text) > 50 else ''}' ({len(tokens)} tokens)"
)
return text, tokens
@ -88,9 +87,10 @@ def process_text(text: str, language: str = "a") -> List[int]:
def get_sentence_info(text: str) -> List[Tuple[str, List[int], int]]:
"""Process all sentences and return info."""
sentences = re.split(r"([.!?;:])", text)
if settings.advanced_text_normalization:
text = normalize_text(text)
sentences = re.split(r"([.!?;:])(?=\s|$)", text)
results = []
for i in range(0, len(sentences), 2):
sentence = sentences[i].strip()
punct = sentences[i + 1] if i + 1 < len(sentences) else ""
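
The added lookahead stops the splitter from breaking on punctuation inside tokens such as decimals; a quick standalone illustration:

import re

text = "Version 2.5 is out! Try it."
print(re.split(r"([.!?;:])", text))
# -> ['Version 2', '.', '5 is out', '!', ' Try it', '.', '']
print(re.split(r"([.!?;:])(?=\s|$)", text))
# -> ['Version 2.5 is out', '!', ' Try it', '.', '']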
@ -128,7 +128,7 @@ async def smart_split(
chunk_text = " ".join(current_chunk)
chunk_count += 1
logger.debug(
f"Yielding chunk {chunk_count}: '{chunk_text[:50]}...' ({current_count} tokens)"
f"Yielding chunk {chunk_count}: '{chunk_text[:50]}{'...' if len(text) > 50 else ''}' ({current_count} tokens)"
)
yield chunk_text, current_tokens
current_chunk = []
@ -149,6 +149,7 @@ async def smart_split(
continue
full_clause = clause + comma
tokens = process_text_chunk(full_clause)
count = len(tokens)
@ -166,7 +167,7 @@ async def smart_split(
chunk_text = " ".join(clause_chunk)
chunk_count += 1
logger.debug(
f"Yielding clause chunk {chunk_count}: '{chunk_text[:50]}...' ({clause_count} tokens)"
f"Yielding clause chunk {chunk_count}: '{chunk_text[:50]}{'...' if len(text) > 50 else ''}' ({clause_count} tokens)"
)
yield chunk_text, clause_tokens
clause_chunk = [full_clause]
@ -178,7 +179,7 @@ async def smart_split(
chunk_text = " ".join(clause_chunk)
chunk_count += 1
logger.debug(
f"Yielding final clause chunk {chunk_count}: '{chunk_text[:50]}...' ({clause_count} tokens)"
f"Yielding final clause chunk {chunk_count}: '{chunk_text[:50]}{'...' if len(text) > 50 else ''}' ({clause_count} tokens)"
)
yield chunk_text, clause_tokens
@ -192,7 +193,7 @@ async def smart_split(
chunk_text = " ".join(current_chunk)
chunk_count += 1
logger.info(
f"Yielding chunk {chunk_count}: '{chunk_text[:50]}...' ({current_count} tokens)"
f"Yielding chunk {chunk_count}: '{chunk_text[:50]}{'...' if len(text) > 50 else ''}' ({current_count} tokens)"
)
yield chunk_text, current_tokens
current_chunk = [sentence]
@ -217,7 +218,7 @@ async def smart_split(
chunk_text = " ".join(current_chunk)
chunk_count += 1
logger.info(
f"Yielding chunk {chunk_count}: '{chunk_text[:50]}...' ({current_count} tokens)"
f"Yielding chunk {chunk_count}: '{chunk_text[:50]}{'...' if len(text) > 50 else ''}' ({current_count} tokens)"
)
yield chunk_text, current_tokens
current_chunk = [sentence]
@ -229,7 +230,7 @@ async def smart_split(
chunk_text = " ".join(current_chunk)
chunk_count += 1
logger.info(
f"Yielding final chunk {chunk_count}: '{chunk_text[:50]}...' ({current_count} tokens)"
f"Yielding final chunk {chunk_count}: '{chunk_text[:50]}{'...' if len(text) > 50 else ''}' ({current_count} tokens)"
)
yield chunk_text, current_tokens

View file

@ -67,6 +67,8 @@ class TTSService:
np.array([0], dtype=np.float32), # Dummy data for type checking
24000,
output_format,
speed,
"",
is_first_chunk=False,
normalizer=normalizer,
is_last_chunk=True,
@ -97,15 +99,22 @@ class TTSService:
chunk_audio,
24000,
output_format,
speed,
chunk_text,
is_first_chunk=is_first,
normalizer=normalizer,
is_last_chunk=is_last,
normalizer=normalizer,
)
yield converted
except Exception as e:
logger.error(f"Failed to convert audio: {str(e)}")
else:
yield chunk_audio
trimmed = AudioService.trim_audio(
chunk_audio, chunk_text, speed, is_last, normalizer
)
yield trimmed
else:
# For legacy backends, load voice tensor
voice_tensor = await self._voice_manager.load_voice(
@ -130,6 +139,8 @@ class TTSService:
chunk_audio,
24000,
output_format,
speed,
chunk_text,
is_first_chunk=is_first,
normalizer=normalizer,
is_last_chunk=is_last,
@ -138,7 +149,12 @@ class TTSService:
except Exception as e:
logger.error(f"Failed to convert audio: {str(e)}")
else:
yield chunk_audio
trimmed = AudioService.trim_audio(
chunk_audio, chunk_text, speed, is_last, normalizer
)
yield trimmed
except Exception as e:
logger.error(f"Failed to process tokens: {str(e)}")

View file

@ -20,7 +20,7 @@ RUN curl -LsSf https://astral.sh/uv/install.sh | sh && \
mv /root/.local/bin/uvx /usr/local/bin/
# Create non-root user and set up directories and permissions
RUN useradd -m -u 1000 appuser && \
RUN useradd -m -u 1001 appuser && \
mkdir -p /app/api/src/models/v1_0 && \
chown -R appuser:appuser /app
@ -32,7 +32,7 @@ COPY --chown=appuser:appuser pyproject.toml ./pyproject.toml
# Install dependencies
RUN --mount=type=cache,target=/root/.cache/uv \
uv venv && \
uv venv --python 3.11 && \
uv sync --extra cpu
# Copy project files including models
@ -40,6 +40,7 @@ COPY --chown=appuser:appuser api ./api
COPY --chown=appuser:appuser web ./web
COPY --chown=appuser:appuser docker/scripts/ ./
RUN chmod +x ./entrypoint.sh
RUN sed -i 's/\r$//' ./entrypoint.sh
# Set environment variables
ENV PYTHONUNBUFFERED=1 \

View file

@ -1,26 +1,29 @@
FROM --platform=$BUILDPLATFORM nvidia/cuda:12.4.1-cudnn-runtime-ubuntu22.04
FROM --platform=$BUILDPLATFORM nvidia/cuda:12.8.0-cudnn-runtime-ubuntu24.04
# Set non-interactive frontend
ENV DEBIAN_FRONTEND=noninteractive
# Install Python and other dependencies
RUN apt-get update && apt-get install -y \
python3.10 \
python3.10-venv \
python3-venv \
espeak-ng \
espeak-ng-data \
git \
libsndfile1 \
curl \
ffmpeg \
&& apt-get clean && rm -rf /var/lib/apt/lists/* \
&& apt-get clean \
&& rm -rf /var/lib/apt/lists/* \
&& mkdir -p /usr/share/espeak-ng-data \
&& ln -s /usr/lib/*/espeak-ng-data/* /usr/share/espeak-ng-data/
# Install UV using the installer script
RUN curl -LsSf https://astral.sh/uv/install.sh | sh && \
mv /root/.local/bin/uv /usr/local/bin/ && \
mv /root/.local/bin/uvx /usr/local/bin/ && \
useradd -m -u 1000 appuser && \
mv /root/.local/bin/uvx /usr/local/bin/
# Create non-root user and set up directories and permissions
RUN useradd -m -u 1001 appuser && \
mkdir -p /app/api/src/models/v1_0 && \
chown -R appuser:appuser /app
@ -32,7 +35,7 @@ COPY --chown=appuser:appuser pyproject.toml ./pyproject.toml
# Install dependencies with GPU extras (using cache mounts)
RUN --mount=type=cache,target=/root/.cache/uv \
uv venv && \
uv venv --python 3.11 && \
uv sync --extra gpu
# Copy project files including models and sync again
@ -40,6 +43,7 @@ COPY --chown=appuser:appuser api ./api
COPY --chown=appuser:appuser web ./web
COPY --chown=appuser:appuser docker/scripts/ ./
RUN chmod +x ./entrypoint.sh
RUN sed -i 's/\r$//' ./entrypoint.sh
RUN --mount=type=cache,target=/root/.cache/uv \
uv sync --extra gpu

View file

@ -36,7 +36,8 @@ dependencies = [
"kokoro @ git+https://github.com/hexgrad/kokoro.git@31a2b6337b8c1b1418ef68c48142328f640da938",
'misaki[en,ja,ko,zh] @ git+https://github.com/hexgrad/misaki.git@ebc76c21b66c5fc4866ed0ec234047177b396170',
"spacy==3.7.2",
"en-core-web-sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl"
"en-core-web-sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl",
"inflect>=7.5.0",
]
[project.optional-dependencies]

start-gpu.bat Normal file
View file

@ -0,0 +1,10 @@
set PYTHONUTF8=1
set USE_GPU=true
set USE_ONNX=false
set PYTHONPATH=%PROJECT_ROOT%;%PROJECT_ROOT%\api
set MODEL_DIR=src\models
set VOICES_DIR=src\voices\v1_0
set WEB_PLAYER_PATH=%PROJECT_ROOT%\web
call uv pip install -e ".[gpu]"
call uv run uvicorn api.src.main:app --reload --host 0.0.0.0 --port 8880