Made the API use the normalizer, fixed the wrong espeak version, added better normalization, improved the sentence splitting, fixed some formatting

Fireblade 2025-02-10 21:45:05 -05:00
parent 9b76ce2071
commit ab1c21130e
10 changed files with 187 additions and 43 deletions


@@ -28,8 +28,11 @@ class Settings(BaseSettings):
     target_min_tokens: int = 175  # Target minimum tokens per chunk
     target_max_tokens: int = 250  # Target maximum tokens per chunk
     absolute_max_tokens: int = 450  # Absolute maximum tokens per chunk
+    advanced_text_normalization: bool = True  # Preprocesses the text before misaki, which leads to more accurate results
-    gap_trim_ms: int = 250  # Amount to trim from streaming chunk ends in milliseconds
+    gap_trim_ms: int = 1  # Base amount to trim from streaming chunk ends in milliseconds
+    dynamic_gap_trim_padding_ms: int = 410  # Padding to add to dynamic gap trim
+    dynamic_gap_trim_padding_char_multiplier: dict[str, float] = {".": 1, "!": 0.9, "?": 1, ",": 0.8}

     # Web Player Settings
     enable_web_player: bool = True  # Whether to serve the web player UI
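
For context on how these new settings interact, a minimal sketch (the helper name is made up; the math mirrors find_first_last_non_silent later in this diff): padding starts at dynamic_gap_trim_padding_ms, is scaled by the multiplier for the chunk's final punctuation character, and is converted to samples at 24 kHz.

    # Hypothetical helper illustrating the padding math introduced above
    SAMPLE_RATE = 24000  # Hz, matching AudioNormalizer
    MULTIPLIERS = {".": 1, "!": 0.9, "?": 1, ",": 0.8}  # from the setting above

    def end_padding_samples(chunk_text: str, padding_ms: int = 410) -> int:
        """Silence (in samples) to leave after a chunk, scaled by its final punctuation."""
        stripped = chunk_text.strip()
        multiplier = MULTIPLIERS.get(stripped[-1], 1) if stripped else 1
        return int(padding_ms * SAMPLE_RATE * multiplier / 1000)

    print(end_padding_samples("Hello, world!"))  # 410 ms * 0.9 * 24 kHz -> 8856 samples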


@@ -144,7 +144,7 @@ class KokoroV1(BaseModelBackend):
             pipeline = self._get_pipeline(pipeline_lang_code)
             logger.debug(
-                f"Generating audio from tokens with lang_code '{pipeline_lang_code}': '{tokens[:100]}...'"
+                f"Generating audio from tokens with lang_code '{pipeline_lang_code}': '{tokens[:100]}{'...' if len(tokens) > 100 else ''}'"
             )
             for result in pipeline.generate_from_tokens(
                 tokens=tokens, voice=voice_path, speed=speed, model=self._model
@@ -192,7 +192,6 @@ class KokoroV1(BaseModelBackend):
         """
         if not self.is_loaded:
             raise RuntimeError("Model not loaded")
-
         try:
             # Memory management for GPU
             if self._device == "cuda":
@@ -237,7 +236,7 @@ class KokoroV1(BaseModelBackend):
             pipeline = self._get_pipeline(pipeline_lang_code)
             logger.debug(
-                f"Generating audio for text with lang_code '{pipeline_lang_code}': '{text[:100]}...'"
+                f"Generating audio for text with lang_code '{pipeline_lang_code}': '{text[:100]}{'...' if len(text) > 100 else ''}'"
             )
             for result in pipeline(
                 text, voice=voice_path, speed=speed, model=self._model
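
The recurring logging tweak in this commit follows one small pattern: append an ellipsis only when the preview actually truncated something. A minimal sketch of that pattern (the preview helper is hypothetical, not part of the commit):

    def preview(s: str, limit: int = 100) -> str:
        """Truncate for logs, appending '...' only when something was cut."""
        return f"{s[:limit]}{'...' if len(s) > limit else ''}"

    print(preview("short"))    # short
    print(preview("x" * 150))  # first 100 chars followed by "..."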


@@ -4,10 +4,12 @@ import struct
 from io import BytesIO

 import numpy as np
+import math
 import scipy.io.wavfile as wavfile
 import soundfile as sf
 from loguru import logger
 from pydub import AudioSegment
+from torch import norm

 from ..core.config import settings
 from .streaming_audio_writer import StreamingAudioWriter
@@ -20,23 +22,66 @@ class AudioNormalizer:
         self.chunk_trim_ms = settings.gap_trim_ms
         self.sample_rate = 24000  # Sample rate of the audio
         self.samples_to_trim = int(self.chunk_trim_ms * self.sample_rate / 1000)
+        self.samples_to_pad_start = int(50 * self.sample_rate / 1000)
+
+    def find_first_last_non_silent(
+        self,
+        audio_data: np.ndarray,
+        chunk_text: str,
+        speed: float,
+        silence_threshold_db: int = -45,
+        is_last_chunk: bool = False,
+    ) -> tuple[int, int]:
+        """Finds the indices of the first and last non-silent samples in audio data.
+
+        Args:
+            audio_data: Input audio data as numpy array
+            chunk_text: The text sent to the model to generate the resulting speech
+            speed: The speaking speed of the voice
+            silence_threshold_db: How quiet audio has to be to be considered silent
+            is_last_chunk: Whether this is the last chunk
+
+        Returns:
+            A tuple with the start and the end of the non-silent portion
+        """
+        pad_multiplier = 1
+        split_character = chunk_text.strip()
+        if len(split_character) > 0:
+            split_character = split_character[-1]
+            if split_character in settings.dynamic_gap_trim_padding_char_multiplier:
+                pad_multiplier = settings.dynamic_gap_trim_padding_char_multiplier[split_character]
+
+        if not is_last_chunk:
+            samples_to_pad_end = max(
+                int((settings.dynamic_gap_trim_padding_ms * self.sample_rate * pad_multiplier) / 1000)
+                - self.samples_to_pad_start,
+                0,
+            )
+        else:
+            samples_to_pad_end = self.samples_to_pad_start
+
+        # Convert dBFS threshold to amplitude
+        amplitude_threshold = np.iinfo(audio_data.dtype).max * (10 ** (silence_threshold_db / 20))
+
+        # Find the first samples above the silence threshold at the start and end of the audio
+        non_silent_index_start, non_silent_index_end = None, None
+        for i in range(len(audio_data)):
+            if audio_data[i] > amplitude_threshold:
+                non_silent_index_start = i
+                break
+        for i in range(len(audio_data) - 1, -1, -1):
+            if audio_data[i] > amplitude_threshold:
+                non_silent_index_end = i
+                break
+
+        # Handle the case where the entire audio is silent
+        if non_silent_index_start is None or non_silent_index_end is None:
+            return 0, len(audio_data)
+
+        return (
+            max(non_silent_index_start - self.samples_to_pad_start, 0),
+            min(non_silent_index_end + math.ceil(samples_to_pad_end / speed), len(audio_data)),
+        )

     async def normalize(self, audio_data: np.ndarray) -> np.ndarray:
-        """Convert audio data to int16 range and trim silence from start and end
+        """Convert audio data to int16 range

         Args:
             audio_data: Input audio data as numpy array

         Returns:
-            Normalized and trimmed audio data
+            Normalized audio data
         """
         if len(audio_data) == 0:
             raise ValueError("Empty audio data")
-
-        # Trim start and end if enough samples
-        if len(audio_data) > (2 * self.samples_to_trim):
-            audio_data = audio_data[self.samples_to_trim : -self.samples_to_trim]
-
         # Scale directly to int16 range with clipping
         return np.clip(audio_data * 32767, -32768, 32767).astype(np.int16)
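
The dBFS-to-amplitude conversion above is easy to sanity-check in isolation; a minimal sketch assuming int16 audio:

    import numpy as np

    # -45 dBFS relative to int16 full scale (32767), as in find_first_last_non_silent
    threshold = np.iinfo(np.int16).max * (10 ** (-45 / 20))
    print(round(threshold, 1))  # ~184.3 -> samples below this amplitude count as silence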
@@ -71,6 +116,8 @@ class AudioService:
         audio_data: np.ndarray,
         sample_rate: int,
         output_format: str,
+        speed: float = 1,
+        chunk_text: str = "",
         is_first_chunk: bool = True,
         is_last_chunk: bool = False,
         normalizer: AudioNormalizer = None,
@@ -81,6 +128,8 @@ class AudioService:
             audio_data: Numpy array of audio samples
             sample_rate: Sample rate of the audio
             output_format: Target format (wav, mp3, ogg, pcm)
+            speed: The speaking speed of the voice
+            chunk_text: The text sent to the model to generate the resulting speech
             is_first_chunk: Whether this is the first chunk
             is_last_chunk: Whether this is the last chunk
             normalizer: Optional AudioNormalizer instance for consistent normalization
@@ -96,8 +145,10 @@ class AudioService:
             # Always normalize audio to ensure proper amplitude scaling
             if normalizer is None:
                 normalizer = AudioNormalizer()
             normalized_audio = await normalizer.normalize(audio_data)
+            normalized_audio = AudioService.trim_audio(normalized_audio, chunk_text, speed, is_last_chunk, normalizer)

             # Get or create format-specific writer
             writer_key = f"{output_format}_{sample_rate}"
             if is_first_chunk or writer_key not in AudioService._writers:
@@ -123,3 +174,27 @@ class AudioService:
             raise ValueError(
                 f"Failed to convert audio stream to {output_format}: {str(e)}"
             )
+
+    @staticmethod
+    def trim_audio(
+        audio_data: np.ndarray,
+        chunk_text: str = "",
+        speed: float = 1,
+        is_last_chunk: bool = False,
+        normalizer: AudioNormalizer = None,
+    ) -> np.ndarray:
+        """Trim silence from start and end
+
+        Args:
+            audio_data: Input audio data as numpy array
+            chunk_text: The text sent to the model to generate the resulting speech
+            speed: The speaking speed of the voice
+            is_last_chunk: Whether this is the last chunk
+            normalizer: Optional AudioNormalizer instance for consistent normalization
+
+        Returns:
+            Trimmed audio data
+        """
+        if normalizer is None:
+            normalizer = AudioNormalizer()
+        # Trim start and end if enough samples
+        if len(audio_data) > (2 * normalizer.samples_to_trim):
+            audio_data = audio_data[normalizer.samples_to_trim : -normalizer.samples_to_trim]
+        # Find the non-silent portion and trim
+        start_index, end_index = normalizer.find_first_last_non_silent(
+            audio_data, chunk_text, speed, is_last_chunk=is_last_chunk
+        )
+        return audio_data[start_index:end_index]
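
A hedged usage sketch of the new static method; the synthetic buffer and the import path are assumptions for illustration:

    import numpy as np
    from api.src.services.audio import AudioService  # path is an assumption

    # Synthetic int16 buffer: half a second of silence, one second of a
    # 440 Hz tone, then silence again (made up for illustration)
    sr = 24000
    silence = np.zeros(sr // 2, dtype=np.int16)
    t = np.arange(sr) / sr
    tone = (0.5 * 32767 * np.sin(2 * np.pi * 440 * t)).astype(np.int16)
    audio = np.concatenate([silence, tone, silence])

    trimmed = AudioService.trim_audio(audio, chunk_text="Hello there.", speed=1.0, is_last_chunk=False)
    print(len(audio), "->", len(trimmed))  # leading/trailing silence trimmed, padding re-added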


@@ -6,6 +6,7 @@ Converts them into a format suitable for text-to-speech processing.
 import re
 from functools import lru_cache
+import inflect

 # Constants
 VALID_TLDS = [
@@ -50,6 +51,26 @@ VALID_TLDS = [
     "io",
 ]
+VALID_UNITS = {
+    "m": "meter", "cm": "centimeter", "mm": "millimeter", "km": "kilometer", "in": "inch", "ft": "foot", "yd": "yard", "mi": "mile",  # Length
+    "g": "gram", "kg": "kilogram", "mg": "milligram",  # Mass
+    "s": "second", "ms": "millisecond", "min": "minute", "h": "hour",  # Time
+    "l": "liter", "ml": "milliliter", "cl": "centiliter", "dl": "deciliter",  # Volume
+    "kph": "kilometer per hour", "mph": "mile per hour", "mi/h": "mile per hour", "m/s": "meter per second", "km/h": "kilometer per hour", "mm/s": "millimeter per second", "cm/s": "centimeter per second", "ft/s": "foot per second",  # Speed
+    "°c": "degree celsius", "c": "degree celsius", "°f": "degree fahrenheit", "f": "degree fahrenheit", "k": "kelvin",  # Temperature
+    "pa": "pascal", "kpa": "kilopascal", "mpa": "megapascal", "atm": "atmosphere",  # Pressure
+    "hz": "hertz", "khz": "kilohertz", "mhz": "megahertz", "ghz": "gigahertz",  # Frequency
+    "v": "volt", "kv": "kilovolt", "mv": "megavolt",  # Voltage
+    "a": "amp", "ma": "megaamp", "ka": "kiloamp",  # Current
+    "w": "watt", "kw": "kilowatt", "mw": "megawatt",  # Power
+    "j": "joule", "kj": "kilojoule", "mj": "megajoule",  # Energy
+    "Ω": "ohm", "kΩ": "kiloohm", "MΩ": "megaohm",  # Resistance (Ohm)
+    "f": "farad", "µf": "microfarad", "nf": "nanofarad", "pf": "picofarad",  # Capacitance
+    "b": "byte", "kb": "kilobyte", "mb": "megabyte", "gb": "gigabyte", "tb": "terabyte", "pb": "petabyte",  # Data size
+    "kbps": "kilobyte per second", "mbps": "megabyte per second", "gbps": "gigabyte per second",
+    "px": "pixel",  # CSS units
+}
 # Pre-compiled regex patterns for performance
 EMAIL_PATTERN = re.compile(
     r"\b[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-z]{2,}\b", re.IGNORECASE
@@ -61,6 +82,9 @@ URL_PATTERN = re.compile(
     re.IGNORECASE,
 )

+UNIT_PATTERN = re.compile(
+    r"((?<!\w)([+-]?)(\d{1,3}(,\d{3})*|\d+)(\.\d+)?)\s*("
+    + "|".join(sorted(list(VALID_UNITS.keys()), reverse=True))
+    + r"""){1}(?=[!"#$%&'()*+,-./:;<=>?@\[\\\]^_`{\|}~ \n]{1})""",
+    re.IGNORECASE,
+)
+
+INFLECT_ENGINE = inflect.engine()

 def split_num(num: re.Match[str]) -> str:
     """Handle number splitting for various formats"""
@@ -86,6 +110,13 @@ def split_num(num: re.Match[str]) -> str:
             return f"{left} oh {right}{s}"
         return f"{left} {right}{s}"

+def handle_units(u: re.Match[str]) -> str:
+    unit = u.group(6).strip()
+    if unit.lower() in VALID_UNITS:
+        unit = VALID_UNITS[unit.lower()].split(" ")
+        number = u.group(1).strip()
+        unit[0] = INFLECT_ENGINE.no(unit[0], number)
+    return " ".join(unit)
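
A quick illustration of the unit handling; the two-unit dict and looser regex here are simplifications made for a compact demo, not the pattern above:

    import re
    import inflect

    INFLECT_ENGINE = inflect.engine()
    UNITS = {"mb": "megabyte", "km/h": "kilometer per hour"}
    PATTERN = re.compile(
        r"(\d+(\.\d+)?)\s*(" + "|".join(sorted(UNITS, reverse=True)) + r")\b",
        re.IGNORECASE,
    )

    def speak_units(m: re.Match) -> str:
        words = UNITS[m.group(3).lower()].split(" ")
        # inflect prefixes the count and pluralizes: no("megabyte", "3") -> "3 megabytes"
        words[0] = INFLECT_ENGINE.no(words[0], m.group(1))
        return " ".join(words)

    print(PATTERN.sub(speak_units, "Download 3mb at 100 km/h."))
    # Download 3 megabytes at 100 kilometers per hour.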
 def handle_money(m: re.Match[str]) -> str:
     """Convert money expressions to spoken form"""
@@ -187,14 +218,17 @@ def normalize_text(text: str) -> str:
     # Pre-process URLs first
     text = normalize_urls(text)

+    # Pre-process numbers with units
+    text = UNIT_PATTERN.sub(handle_units, text)
+
     # Replace quotes and brackets
     text = text.replace(chr(8216), "'").replace(chr(8217), "'")
     text = text.replace("«", chr(8220)).replace("»", chr(8221))
     text = text.replace(chr(8220), '"').replace(chr(8221), '"')
     text = text.replace("(", "«").replace(")", "»")

-    # Handle CJK punctuation
-    for a, b in zip("、。!,:;?", ",.!,:;?"):
+    # Handle CJK punctuation and some non-standard chars
+    for a, b in zip("、。!,:;?", ",.!,:;?-"):
         text = text.replace(a, b + " ")

     # Clean up whitespace


@@ -26,7 +26,7 @@ def process_text_chunk(
         List of token IDs
     """
     start_time = time.time()

     if skip_phonemize:
         # Input is already phonemes, just tokenize
         t0 = time.time()
@@ -35,12 +35,11 @@ def process_text_chunk(
     else:
         # Normal text processing pipeline
         t0 = time.time()
-        normalized = normalize_text(text)
         t1 = time.time()

         t0 = time.time()
         phonemes = phonemize(
-            normalized, language, normalize=False
+            text, language, normalize=False
         )  # Already normalized
         t1 = time.time()
@@ -50,7 +49,7 @@ def process_text_chunk(
     total_time = time.time() - start_time
     logger.debug(
-        f"Total processing took {total_time * 1000:.2f}ms for chunk: '{text[:50]}...'"
+        f"Total processing took {total_time * 1000:.2f}ms for chunk: '{text[:50]}{'...' if len(text) > 50 else ''}'"
     )
     return tokens
@@ -61,7 +60,7 @@ async def yield_chunk(
 ) -> Tuple[str, List[int]]:
     """Yield a chunk with consistent logging."""
     logger.debug(
-        f"Yielding chunk {chunk_count}: '{text[:50]}...' ({len(tokens)} tokens)"
+        f"Yielding chunk {chunk_count}: '{text[:50]}{'...' if len(text) > 50 else ''}' ({len(tokens)} tokens)"
     )
     return text, tokens
@@ -88,9 +87,10 @@ def process_text(text: str, language: str = "a") -> List[int]:
 def get_sentence_info(text: str) -> List[Tuple[str, List[int], int]]:
     """Process all sentences and return info."""
-    sentences = re.split(r"([.!?;:])", text)
+    if settings.advanced_text_normalization:
+        text = normalize_text(text)
+    sentences = re.split(r"([.!?;:])(?=\s|$)", text)
     results = []
     for i in range(0, len(sentences), 2):
         sentence = sentences[i].strip()
         punct = sentences[i + 1] if i + 1 < len(sentences) else ""
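
The lookahead (?=\s|$) is the substance of this change: punctuation now only ends a sentence when followed by whitespace or end-of-string, so decimals and dotted version numbers survive splitting. A small self-contained check:

    import re

    text = "Pi is 3.14! Version 2.0.1 works; mostly."
    old = re.split(r"([.!?;:])", text)          # splits inside "3.14" and "2.0.1"
    new = re.split(r"([.!?;:])(?=\s|$)", text)  # only at sentence boundaries
    print(new)
    # ['Pi is 3.14', '!', ' Version 2.0.1 works', ';', ' mostly', '.', '']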
@@ -128,7 +128,7 @@ async def smart_split(
                 chunk_text = " ".join(current_chunk)
                 chunk_count += 1
                 logger.debug(
-                    f"Yielding chunk {chunk_count}: '{chunk_text[:50]}...' ({current_count} tokens)"
+                    f"Yielding chunk {chunk_count}: '{chunk_text[:50]}{'...' if len(chunk_text) > 50 else ''}' ({current_count} tokens)"
                 )
                 yield chunk_text, current_tokens
                 current_chunk = []
@@ -149,6 +149,7 @@ async def smart_split(
                     continue

                 full_clause = clause + comma
+
                 tokens = process_text_chunk(full_clause)
                 count = len(tokens)
@@ -166,7 +167,7 @@ async def smart_split(
                         chunk_text = " ".join(clause_chunk)
                         chunk_count += 1
                         logger.debug(
-                            f"Yielding clause chunk {chunk_count}: '{chunk_text[:50]}...' ({clause_count} tokens)"
+                            f"Yielding clause chunk {chunk_count}: '{chunk_text[:50]}{'...' if len(chunk_text) > 50 else ''}' ({clause_count} tokens)"
                         )
                         yield chunk_text, clause_tokens
                         clause_chunk = [full_clause]
@@ -178,7 +179,7 @@ async def smart_split(
                     chunk_text = " ".join(clause_chunk)
                     chunk_count += 1
                     logger.debug(
-                        f"Yielding final clause chunk {chunk_count}: '{chunk_text[:50]}...' ({clause_count} tokens)"
+                        f"Yielding final clause chunk {chunk_count}: '{chunk_text[:50]}{'...' if len(chunk_text) > 50 else ''}' ({clause_count} tokens)"
                     )
                     yield chunk_text, clause_tokens
@@ -192,7 +193,7 @@ async def smart_split(
                 chunk_text = " ".join(current_chunk)
                 chunk_count += 1
                 logger.info(
-                    f"Yielding chunk {chunk_count}: '{chunk_text[:50]}...' ({current_count} tokens)"
+                    f"Yielding chunk {chunk_count}: '{chunk_text[:50]}{'...' if len(chunk_text) > 50 else ''}' ({current_count} tokens)"
                 )
                 yield chunk_text, current_tokens
                 current_chunk = [sentence]
@@ -217,7 +218,7 @@ async def smart_split(
                 chunk_text = " ".join(current_chunk)
                 chunk_count += 1
                 logger.info(
-                    f"Yielding chunk {chunk_count}: '{chunk_text[:50]}...' ({current_count} tokens)"
+                    f"Yielding chunk {chunk_count}: '{chunk_text[:50]}{'...' if len(chunk_text) > 50 else ''}' ({current_count} tokens)"
                 )
                 yield chunk_text, current_tokens
                 current_chunk = [sentence]
@@ -229,7 +230,7 @@ async def smart_split(
         chunk_text = " ".join(current_chunk)
         chunk_count += 1
         logger.info(
-            f"Yielding final chunk {chunk_count}: '{chunk_text[:50]}...' ({current_count} tokens)"
+            f"Yielding final chunk {chunk_count}: '{chunk_text[:50]}{'...' if len(chunk_text) > 50 else ''}' ({current_count} tokens)"
         )
         yield chunk_text, current_tokens


@@ -67,6 +67,8 @@ class TTSService:
                     np.array([0], dtype=np.float32),  # Dummy data for type checking
                     24000,
                     output_format,
+                    speed,
+                    "",
                     is_first_chunk=False,
                     normalizer=normalizer,
                     is_last_chunk=True,
@@ -97,15 +99,22 @@ class TTSService:
                             chunk_audio,
                             24000,
                             output_format,
+                            speed,
+                            chunk_text,
                             is_first_chunk=is_first,
-                            normalizer=normalizer,
                             is_last_chunk=is_last,
+                            normalizer=normalizer,
                         )
                         yield converted
                     except Exception as e:
                         logger.error(f"Failed to convert audio: {str(e)}")
                 else:
-                    yield chunk_audio
+                    trimmed = AudioService.trim_audio(
+                        chunk_audio, chunk_text, speed, is_last, normalizer
+                    )
+                    yield trimmed
             else:
                 # For legacy backends, load voice tensor
                 voice_tensor = await self._voice_manager.load_voice(
@@ -130,6 +139,8 @@ class TTSService:
                             chunk_audio,
                             24000,
                             output_format,
+                            speed,
+                            chunk_text,
                             is_first_chunk=is_first,
                             normalizer=normalizer,
                             is_last_chunk=is_last,
@@ -138,7 +149,12 @@ class TTSService:
                     except Exception as e:
                         logger.error(f"Failed to convert audio: {str(e)}")
                 else:
-                    yield chunk_audio
+                    trimmed = AudioService.trim_audio(
+                        chunk_audio, chunk_text, speed, is_last, normalizer
+                    )
+                    yield trimmed
         except Exception as e:
             logger.error(f"Failed to process tokens: {str(e)}")


@@ -9,10 +9,10 @@ RUN apt-get update && apt-get install -y \
     curl \
     ffmpeg \
     g++ \
     && apt-get clean \
     && rm -rf /var/lib/apt/lists/* \
     && mkdir -p /usr/share/espeak-ng-data \
     && ln -s /usr/lib/*/espeak-ng-data/* /usr/share/espeak-ng-data/

 # Install UV using the installer script
 RUN curl -LsSf https://astral.sh/uv/install.sh | sh && \
@@ -20,7 +20,7 @@ RUN curl -LsSf https://astral.sh/uv/install.sh | sh && \
     mv /root/.local/bin/uvx /usr/local/bin/

 # Create non-root user and set up directories and permissions
-RUN useradd -m -u 1000 appuser && \
+RUN useradd -m -u 1001 appuser && \
     mkdir -p /app/api/src/models/v1_0 && \
     chown -R appuser:appuser /app
@@ -32,7 +32,7 @@ COPY --chown=appuser:appuser pyproject.toml ./pyproject.toml
 # Install dependencies
 RUN --mount=type=cache,target=/root/.cache/uv \
-    uv venv && \
+    uv venv --python 3.11 && \
     uv sync --extra cpu

 # Copy project files including models
@@ -40,6 +40,7 @@ COPY --chown=appuser:appuser api ./api
 COPY --chown=appuser:appuser web ./web
 COPY --chown=appuser:appuser docker/scripts/ ./
 RUN chmod +x ./entrypoint.sh
+RUN sed -i 's/\r$//' ./entrypoint.sh

 # Set environment variables
 ENV PYTHONUNBUFFERED=1 \


@@ -1,26 +1,29 @@
-FROM --platform=$BUILDPLATFORM nvidia/cuda:12.4.1-cudnn-runtime-ubuntu22.04
+FROM --platform=$BUILDPLATFORM nvidia/cuda:12.8.0-cudnn-runtime-ubuntu24.04

 # Set non-interactive frontend
 ENV DEBIAN_FRONTEND=noninteractive

 # Install Python and other dependencies
 RUN apt-get update && apt-get install -y \
     python3.10 \
-    python3.10-venv \
+    python3-venv \
     espeak-ng \
     espeak-ng-data \
     git \
     libsndfile1 \
     curl \
     ffmpeg \
-    && apt-get clean && rm -rf /var/lib/apt/lists/* \
+    && apt-get clean \
+    && rm -rf /var/lib/apt/lists/* \
     && mkdir -p /usr/share/espeak-ng-data \
     && ln -s /usr/lib/*/espeak-ng-data/* /usr/share/espeak-ng-data/

 # Install UV using the installer script
 RUN curl -LsSf https://astral.sh/uv/install.sh | sh && \
     mv /root/.local/bin/uv /usr/local/bin/ && \
-    mv /root/.local/bin/uvx /usr/local/bin/ && \
-    useradd -m -u 1000 appuser && \
+    mv /root/.local/bin/uvx /usr/local/bin/
+
+# Create non-root user and set up directories and permissions
+RUN useradd -m -u 1001 appuser && \
     mkdir -p /app/api/src/models/v1_0 && \
     chown -R appuser:appuser /app
@@ -32,7 +35,7 @@ COPY --chown=appuser:appuser pyproject.toml ./pyproject.toml
 # Install dependencies with GPU extras (using cache mounts)
 RUN --mount=type=cache,target=/root/.cache/uv \
-    uv venv && \
+    uv venv --python 3.11 && \
     uv sync --extra gpu

 # Copy project files including models and sync again
@@ -40,6 +43,7 @@ COPY --chown=appuser:appuser api ./api
 COPY --chown=appuser:appuser web ./web
 COPY --chown=appuser:appuser docker/scripts/ ./
 RUN chmod +x ./entrypoint.sh
+RUN sed -i 's/\r$//' ./entrypoint.sh

 RUN --mount=type=cache,target=/root/.cache/uv \
     uv sync --extra gpu


@@ -36,7 +36,8 @@ dependencies = [
     "kokoro @ git+https://github.com/hexgrad/kokoro.git@31a2b6337b8c1b1418ef68c48142328f640da938",
     'misaki[en,ja,ko,zh] @ git+https://github.com/hexgrad/misaki.git@ebc76c21b66c5fc4866ed0ec234047177b396170',
     "spacy==3.7.2",
-    "en-core-web-sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl"
+    "en-core-web-sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl",
+    "inflect>=7.5.0",
 ]

 [project.optional-dependencies]

start-gpu.bat Normal file

@@ -0,0 +1,10 @@
+set PYTHONUTF8=1
+set USE_GPU=true
+set USE_ONNX=false
+set PYTHONPATH=%PROJECT_ROOT%;%PROJECT_ROOT%\api
+set MODEL_DIR=src\models
+set VOICES_DIR=src\voices\v1_0
+set WEB_PLAYER_PATH=%PROJECT_ROOT%\web
+
+call uv pip install -e ".[gpu]"
+call uv run uvicorn api.src.main:app --reload --host 0.0.0.0 --port 8880