Mirror of https://github.com/remsky/Kokoro-FastAPI.git (synced 2025-04-13 09:39:17 +00:00)
Made the API use the normalizer, fixed the wrong version of espeak, added better normalization, improved the sentence splitting, fixed some formatting
This commit is contained in:
parent
9b76ce2071
commit
ab1c21130e
10 changed files with 187 additions and 43 deletions

@@ -28,8 +28,11 @@ class Settings(BaseSettings):
     target_min_tokens: int = 175  # Target minimum tokens per chunk
     target_max_tokens: int = 250  # Target maximum tokens per chunk
     absolute_max_tokens: int = 450  # Absolute maximum tokens per chunk
+    advanced_text_normalization: bool = True  # Preprocesses the text before misaki which leads
 
-    gap_trim_ms: int = 250  # Amount to trim from streaming chunk ends in milliseconds
+    gap_trim_ms: int = 1  # Base amount to trim from streaming chunk ends in milliseconds
+    dynamic_gap_trim_padding_ms: int = 410  # Padding to add to dynamic gap trim
+    dynamic_gap_trim_padding_char_multiplier: dict[str, float] = {".": 1, "!": 0.9, "?": 1, ",": 0.8}
 
     # Web Player Settings
     enable_web_player: bool = True  # Whether to serve the web player UI
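
Taken together, these settings drive the dynamic gap trimming added to the audio service below: each chunk keeps a padding tail whose length scales with the chunk's final punctuation mark. A minimal sketch of that arithmetic using the defaults above (the helper name pad_end_samples and the printed values are illustrative, not part of the API):

    # Rough sketch: padding kept after a chunk, per the settings above
    sample_rate = 24000
    dynamic_gap_trim_padding_ms = 410
    char_multiplier = {".": 1, "!": 0.9, "?": 1, ",": 0.8}
    samples_to_pad_start = int(50 * sample_rate / 1000)  # 1200 samples = 50 ms

    def pad_end_samples(chunk_text: str, is_last_chunk: bool = False) -> int:
        """Samples of padding kept after the last non-silent sample of a chunk."""
        last_char = chunk_text.strip()[-1:]       # final punctuation, if any
        mult = char_multiplier.get(last_char, 1)  # default multiplier is 1
        if is_last_chunk:
            return samples_to_pad_start
        pad = int(dynamic_gap_trim_padding_ms * sample_rate * mult / 1000)
        return max(pad - samples_to_pad_start, 0)

    print(pad_end_samples("Hello, world."))  # 8640 samples (~360 ms, '.' multiplier 1)
    print(pad_end_samples("first clause,"))  # 6672 samples (~278 ms, ',' multiplier 0.8)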

@@ -144,7 +144,7 @@ class KokoroV1(BaseModelBackend):
             pipeline = self._get_pipeline(pipeline_lang_code)
 
             logger.debug(
-                f"Generating audio from tokens with lang_code '{pipeline_lang_code}': '{tokens[:100]}...'"
+                f"Generating audio from tokens with lang_code '{pipeline_lang_code}': '{tokens[:100]}{'...' if len(tokens) > 100 else ''}'"
             )
             for result in pipeline.generate_from_tokens(
                 tokens=tokens, voice=voice_path, speed=speed, model=self._model

@@ -192,7 +192,6 @@ class KokoroV1(BaseModelBackend):
         """
         if not self.is_loaded:
             raise RuntimeError("Model not loaded")
 
         try:
             # Memory management for GPU
             if self._device == "cuda":

@@ -237,7 +236,7 @@ class KokoroV1(BaseModelBackend):
             pipeline = self._get_pipeline(pipeline_lang_code)
 
             logger.debug(
-                f"Generating audio for text with lang_code '{pipeline_lang_code}': '{text[:100]}...'"
+                f"Generating audio for text with lang_code '{pipeline_lang_code}': '{text[:100]}{'...' if len(text) > 100 else ''}'"
             )
             for result in pipeline(
                 text, voice=voice_path, speed=speed, model=self._model

@@ -4,10 +4,12 @@ import struct
 from io import BytesIO
 
 import numpy as np
+import math
 import scipy.io.wavfile as wavfile
 import soundfile as sf
 from loguru import logger
 from pydub import AudioSegment
+from torch import norm
 
 from ..core.config import settings
 from .streaming_audio_writer import StreamingAudioWriter

@@ -20,23 +22,66 @@ class AudioNormalizer:
         self.chunk_trim_ms = settings.gap_trim_ms
         self.sample_rate = 24000  # Sample rate of the audio
         self.samples_to_trim = int(self.chunk_trim_ms * self.sample_rate / 1000)
+        self.samples_to_pad_start = int(50 * self.sample_rate / 1000)
+
+    def find_first_last_non_silent(self, audio_data: np.ndarray, chunk_text: str, speed: float, silence_threshold_db: int = -45, is_last_chunk: bool = False) -> tuple[int, int]:
+        """Finds the indices of the first and last non-silent samples in audio data.
+
+        Args:
+            audio_data: Input audio data as numpy array
+            chunk_text: The text sent to the model to generate the resulting speech
+            speed: The speaking speed of the voice
+            silence_threshold_db: How quiet audio has to be to be considered silent
+            is_last_chunk: Whether this is the last chunk
+
+        Returns:
+            A tuple with the start and end indices of the non-silent portion
+        """
+
+        pad_multiplier = 1
+        split_character = chunk_text.strip()
+        if len(split_character) > 0:
+            split_character = split_character[-1]
+            if split_character in settings.dynamic_gap_trim_padding_char_multiplier:
+                pad_multiplier = settings.dynamic_gap_trim_padding_char_multiplier[split_character]
+
+        if not is_last_chunk:
+            samples_to_pad_end = max(int((settings.dynamic_gap_trim_padding_ms * self.sample_rate * pad_multiplier) / 1000) - self.samples_to_pad_start, 0)
+        else:
+            samples_to_pad_end = self.samples_to_pad_start
+        # Convert dBFS threshold to amplitude
+        amplitude_threshold = np.iinfo(audio_data.dtype).max * (10 ** (silence_threshold_db / 20))
+        # Find the first samples above the silence threshold at the start and end of the audio
+        non_silent_index_start, non_silent_index_end = None, None
+
+        for X in range(0, len(audio_data)):
+            if audio_data[X] > amplitude_threshold:
+                non_silent_index_start = X
+                break
+
+        for X in range(len(audio_data) - 1, -1, -1):
+            if audio_data[X] > amplitude_threshold:
+                non_silent_index_end = X
+                break
+
+        # Handle the case where the entire audio is silent
+        if non_silent_index_start is None or non_silent_index_end is None:
+            return 0, len(audio_data)
+
+        return max(non_silent_index_start - self.samples_to_pad_start, 0), min(non_silent_index_end + math.ceil(samples_to_pad_end / speed), len(audio_data))
+
     async def normalize(self, audio_data: np.ndarray) -> np.ndarray:
-        """Convert audio data to int16 range and trim silence from start and end
+        """Convert audio data to int16 range
 
         Args:
             audio_data: Input audio data as numpy array
 
         Returns:
-            Normalized and trimmed audio data
+            Normalized audio data
         """
         if len(audio_data) == 0:
             raise ValueError("Empty audio data")
 
-        # Trim start and end if enough samples
-        if len(audio_data) > (2 * self.samples_to_trim):
-            audio_data = audio_data[self.samples_to_trim : -self.samples_to_trim]
-
         # Scale directly to int16 range with clipping
         return np.clip(audio_data * 32767, -32768, 32767).astype(np.int16)

@@ -71,6 +116,8 @@ class AudioService:
         audio_data: np.ndarray,
         sample_rate: int,
         output_format: str,
+        speed: float = 1,
+        chunk_text: str = "",
         is_first_chunk: bool = True,
         is_last_chunk: bool = False,
         normalizer: AudioNormalizer = None,

@@ -81,6 +128,8 @@ class AudioService:
             audio_data: Numpy array of audio samples
             sample_rate: Sample rate of the audio
             output_format: Target format (wav, mp3, ogg, pcm)
+            speed: The speaking speed of the voice
+            chunk_text: The text sent to the model to generate the resulting speech
             is_first_chunk: Whether this is the first chunk
             is_last_chunk: Whether this is the last chunk
             normalizer: Optional AudioNormalizer instance for consistent normalization

@@ -96,8 +145,10 @@ class AudioService:
         # Always normalize audio to ensure proper amplitude scaling
         if normalizer is None:
             normalizer = AudioNormalizer()
 
         normalized_audio = await normalizer.normalize(audio_data)
+
+        normalized_audio = AudioService.trim_audio(normalized_audio, chunk_text, speed, is_last_chunk, normalizer)
 
         # Get or create format-specific writer
         writer_key = f"{output_format}_{sample_rate}"
         if is_first_chunk or writer_key not in AudioService._writers:

@@ -123,3 +174,27 @@ class AudioService:
             raise ValueError(
                 f"Failed to convert audio stream to {output_format}: {str(e)}"
             )
+
+    @staticmethod
+    def trim_audio(audio_data: np.ndarray, chunk_text: str = "", speed: float = 1, is_last_chunk: bool = False, normalizer: AudioNormalizer = None) -> np.ndarray:
+        """Trim silence from the start and end of audio data
+
+        Args:
+            audio_data: Input audio data as numpy array
+            chunk_text: The text sent to the model to generate the resulting speech
+            speed: The speaking speed of the voice
+            is_last_chunk: Whether this is the last chunk
+            normalizer: Optional AudioNormalizer instance for consistent normalization
+
+        Returns:
+            Trimmed audio data
+        """
+        if normalizer is None:
+            normalizer = AudioNormalizer()
+
+        # Trim start and end if enough samples
+        if len(audio_data) > (2 * normalizer.samples_to_trim):
+            audio_data = audio_data[normalizer.samples_to_trim : -normalizer.samples_to_trim]
+
+        # Find the non-silent portion and trim to it
+        start_index, end_index = normalizer.find_first_last_non_silent(audio_data, chunk_text, speed, is_last_chunk=is_last_chunk)
+        return audio_data[start_index:end_index]
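
The scan in find_first_last_non_silent hinges on one conversion: a dBFS threshold becomes a linear amplitude via max_amplitude * 10^(dB/20), so -45 dBFS on int16 audio is roughly 184 out of 32767. A self-contained sketch of the same idea on synthetic data (variable names are illustrative, not the service's API):

    import numpy as np

    silence_threshold_db = -45
    # dBFS -> linear amplitude: 32767 * 10 ** (-45 / 20) is roughly 184
    amplitude_threshold = np.iinfo(np.int16).max * (10 ** (silence_threshold_db / 20))

    # Synthetic chunk: 100 silent samples, a loud burst, 100 silent samples
    audio = np.zeros(300, dtype=np.int16)
    audio[100:200] = 5000

    start = next(i for i in range(len(audio)) if audio[i] > amplitude_threshold)
    end = next(i for i in range(len(audio) - 1, -1, -1) if audio[i] > amplitude_threshold)
    print(start, end)  # 100 199; padding is then applied around these indices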

@@ -6,6 +6,7 @@ Converts them into a format suitable for text-to-speech processing.
 
 import re
 from functools import lru_cache
+import inflect
 
 # Constants
 VALID_TLDS = [

@@ -50,6 +51,26 @@
     "io",
 ]
 
+VALID_UNITS = {
+    "m": "meter", "cm": "centimeter", "mm": "millimeter", "km": "kilometer", "in": "inch", "ft": "foot", "yd": "yard", "mi": "mile",  # Length
+    "g": "gram", "kg": "kilogram", "mg": "milligram",  # Mass
+    "s": "second", "ms": "millisecond", "min": "minutes", "h": "hour",  # Time
+    "l": "liter", "ml": "milliliter", "cl": "centiliter", "dl": "deciliter",  # Volume
+    "kph": "kilometer per hour", "mph": "mile per hour", "mi/h": "mile per hour", "m/s": "meter per second", "km/h": "kilometer per hour", "mm/s": "millimeter per second", "cm/s": "centimeter per second", "ft/s": "feet per second",  # Speed
+    "°c": "degree celsius", "c": "degree celsius", "°f": "degree fahrenheit", "f": "degree fahrenheit", "k": "kelvin",  # Temperature
+    "pa": "pascal", "kpa": "kilopascal", "mpa": "megapascal", "atm": "atmosphere",  # Pressure
+    "hz": "hertz", "khz": "kilohertz", "mhz": "megahertz", "ghz": "gigahertz",  # Frequency
+    "v": "volt", "kv": "kilovolt", "mv": "megavolt",  # Voltage
+    "a": "amp", "ma": "megaamp", "ka": "kiloamp",  # Current
+    "w": "watt", "kw": "kilowatt", "mw": "megawatt",  # Power
+    "j": "joule", "kj": "kilojoule", "mj": "megajoule",  # Energy
+    "Ω": "ohm", "kΩ": "kiloohm", "mΩ": "megaohm",  # Resistance (Ohm)
+    "f": "farad", "µf": "microfarad", "nf": "nanofarad", "pf": "picofarad",  # Capacitance
+    "b": "byte", "kb": "kilobyte", "mb": "megabyte", "gb": "gigabyte", "tb": "terabyte", "pb": "petabyte",  # Data size
+    "kbps": "kilobyte per second", "mbps": "megabyte per second", "gbps": "gigabyte per second",
+    "px": "pixel",  # CSS units
+}
+
 # Pre-compiled regex patterns for performance
 EMAIL_PATTERN = re.compile(
     r"\b[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-z]{2,}\b", re.IGNORECASE

@@ -61,6 +82,9 @@ URL_PATTERN = re.compile(
     re.IGNORECASE,
 )
 
+UNIT_PATTERN = re.compile(r"((?<!\w)([+-]?)(\d{1,3}(,\d{3})*|\d+)(\.\d+)?)\s*(" + "|".join(sorted(list(VALID_UNITS.keys()), reverse=True)) + r"""){1}(?=[!"#$%&'()*+,-./:;<=>?@\[\\\]^_`{\|}~ \n]{1})""", re.IGNORECASE)
+
+INFLECT_ENGINE = inflect.engine()
 
 def split_num(num: re.Match[str]) -> str:
     """Handle number splitting for various formats"""

@@ -86,6 +110,13 @@ def split_num(num: re.Match[str]) -> str:
         return f"{left} oh {right}{s}"
     return f"{left} {right}{s}"
 
+def handle_units(u: re.Match[str]) -> str:
+    """Convert a number-with-unit match to its spoken form"""
+    unit = u.group(6).strip()
+    if unit.lower() in VALID_UNITS:
+        unit = VALID_UNITS[unit.lower()].split(" ")
+        number = u.group(1).strip()
+        unit[0] = INFLECT_ENGINE.no(unit[0], number)
+    return " ".join(unit)
+
 def handle_money(m: re.Match[str]) -> str:
     """Convert money expressions to spoken form"""

@@ -187,14 +218,17 @@ def normalize_text(text: str) -> str:
     # Pre-process URLs first
     text = normalize_urls(text)
 
+    # Pre-process numbers with units
+    text = UNIT_PATTERN.sub(handle_units, text)
+
     # Replace quotes and brackets
     text = text.replace(chr(8216), "'").replace(chr(8217), "'")
     text = text.replace("«", chr(8220)).replace("»", chr(8221))
     text = text.replace(chr(8220), '"').replace(chr(8221), '"')
     text = text.replace("(", "«").replace(")", "»")
 
-    # Handle CJK punctuation
-    for a, b in zip("、。!,:;?", ",.!,:;?"):
+    # Handle CJK punctuation and some non-standard chars
+    for a, b in zip("、。!,:;?–", ",.!,:;?-"):
         text = text.replace(a, b + " ")
 
     # Clean up whitespace
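
UNIT_PATTERN and handle_units cooperate: the regex captures a signed number and a known unit, and inflect's no() prefixes the count and pluralizes the unit's first word. A simplified, hypothetical round trip with a two-entry unit table (the real pattern above is stricter about number grouping and trailing boundaries):

    import re
    import inflect

    engine = inflect.engine()
    units = {"km": "kilometer", "mb": "megabyte"}
    pattern = re.compile(
        r"((?<!\w)([+-]?)(\d+(\.\d+)?))\s*(" + "|".join(units) + r")(?=\W|$)",
        re.IGNORECASE,
    )

    def speak_units(m: re.Match) -> str:
        # inflect: engine.no("kilometer", "5") -> "5 kilometers"
        return engine.no(units[m.group(5).lower()], m.group(1).strip())

    print(pattern.sub(speak_units, "We ran 5km and downloaded 1 mb."))
    # -> We ran 5 kilometers and downloaded 1 megabyte.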

@@ -26,7 +26,7 @@ def process_text_chunk(
         List of token IDs
     """
     start_time = time.time()
 
     if skip_phonemize:
         # Input is already phonemes, just tokenize
         t0 = time.time()

@@ -35,12 +35,11 @@ def process_text_chunk(
     else:
         # Normal text processing pipeline
         t0 = time.time()
-        normalized = normalize_text(text)
         t1 = time.time()
 
         t0 = time.time()
         phonemes = phonemize(
-            normalized, language, normalize=False
+            text, language, normalize=False
         )  # Already normalized
         t1 = time.time()
 

@@ -50,7 +49,7 @@ def process_text_chunk(
 
     total_time = time.time() - start_time
     logger.debug(
-        f"Total processing took {total_time * 1000:.2f}ms for chunk: '{text[:50]}...'"
+        f"Total processing took {total_time * 1000:.2f}ms for chunk: '{text[:50]}{'...' if len(text) > 50 else ''}'"
     )
 
     return tokens

@@ -61,7 +60,7 @@ async def yield_chunk(
 ) -> Tuple[str, List[int]]:
     """Yield a chunk with consistent logging."""
     logger.debug(
-        f"Yielding chunk {chunk_count}: '{text[:50]}...' ({len(tokens)} tokens)"
+        f"Yielding chunk {chunk_count}: '{text[:50]}{'...' if len(text) > 50 else ''}' ({len(tokens)} tokens)"
     )
     return text, tokens
 

@@ -88,9 +87,10 @@ def process_text(text: str, language: str = "a") -> List[int]:
 
 def get_sentence_info(text: str) -> List[Tuple[str, List[int], int]]:
     """Process all sentences and return info."""
-    sentences = re.split(r"([.!?;:])", text)
+    if settings.advanced_text_normalization:
+        text = normalize_text(text)
+    sentences = re.split(r"([.!?;:])(?=\s|$)", text)
     results = []
 
     for i in range(0, len(sentences), 2):
         sentence = sentences[i].strip()
         punct = sentences[i + 1] if i + 1 < len(sentences) else ""

@@ -128,7 +128,7 @@ async def smart_split(
                 chunk_text = " ".join(current_chunk)
                 chunk_count += 1
                 logger.debug(
-                    f"Yielding chunk {chunk_count}: '{chunk_text[:50]}...' ({current_count} tokens)"
+                    f"Yielding chunk {chunk_count}: '{chunk_text[:50]}{'...' if len(text) > 50 else ''}' ({current_count} tokens)"
                 )
                 yield chunk_text, current_tokens
                 current_chunk = []

@@ -149,6 +149,7 @@ async def smart_split(
                     continue
 
                 full_clause = clause + comma
+
                 tokens = process_text_chunk(full_clause)
                 count = len(tokens)

@@ -166,7 +167,7 @@ async def smart_split(
                         chunk_text = " ".join(clause_chunk)
                         chunk_count += 1
                         logger.debug(
-                            f"Yielding clause chunk {chunk_count}: '{chunk_text[:50]}...' ({clause_count} tokens)"
+                            f"Yielding clause chunk {chunk_count}: '{chunk_text[:50]}{'...' if len(text) > 50 else ''}' ({clause_count} tokens)"
                         )
                         yield chunk_text, clause_tokens
                         clause_chunk = [full_clause]

@@ -178,7 +179,7 @@ async def smart_split(
                     chunk_text = " ".join(clause_chunk)
                     chunk_count += 1
                     logger.debug(
-                        f"Yielding final clause chunk {chunk_count}: '{chunk_text[:50]}...' ({clause_count} tokens)"
+                        f"Yielding final clause chunk {chunk_count}: '{chunk_text[:50]}{'...' if len(text) > 50 else ''}' ({clause_count} tokens)"
                     )
                     yield chunk_text, clause_tokens

@@ -192,7 +193,7 @@ async def smart_split(
                 chunk_text = " ".join(current_chunk)
                 chunk_count += 1
                 logger.info(
-                    f"Yielding chunk {chunk_count}: '{chunk_text[:50]}...' ({current_count} tokens)"
+                    f"Yielding chunk {chunk_count}: '{chunk_text[:50]}{'...' if len(text) > 50 else ''}' ({current_count} tokens)"
                 )
                 yield chunk_text, current_tokens
                 current_chunk = [sentence]

@@ -217,7 +218,7 @@ async def smart_split(
                 chunk_text = " ".join(current_chunk)
                 chunk_count += 1
                 logger.info(
-                    f"Yielding chunk {chunk_count}: '{chunk_text[:50]}...' ({current_count} tokens)"
+                    f"Yielding chunk {chunk_count}: '{chunk_text[:50]}{'...' if len(text) > 50 else ''}' ({current_count} tokens)"
                 )
                 yield chunk_text, current_tokens
                 current_chunk = [sentence]

@@ -229,7 +230,7 @@ async def smart_split(
         chunk_text = " ".join(current_chunk)
         chunk_count += 1
         logger.info(
-            f"Yielding final chunk {chunk_count}: '{chunk_text[:50]}...' ({current_count} tokens)"
+            f"Yielding final chunk {chunk_count}: '{chunk_text[:50]}{'...' if len(text) > 50 else ''}' ({current_count} tokens)"
         )
         yield chunk_text, current_tokens
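
The lookahead added to the sentence splitter is the key change here: punctuation only splits when followed by whitespace or end-of-string, so decimals and similar in-token punctuation survive intact. A quick illustrative comparison:

    import re

    text = "The total is $5.50. Ship it!"
    old = re.split(r"([.!?;:])", text)
    new = re.split(r"([.!?;:])(?=\s|$)", text)
    print(old)  # ['The total is $5', '.', '50', '.', ' Ship it', '!', '']
    print(new)  # ['The total is $5.50', '.', ' Ship it', '!', '']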

@@ -67,6 +67,8 @@ class TTSService:
                     np.array([0], dtype=np.float32),  # Dummy data for type checking
                     24000,
                     output_format,
+                    speed,
+                    "",
                     is_first_chunk=False,
                     normalizer=normalizer,
                     is_last_chunk=True,

@@ -97,15 +99,22 @@ class TTSService:
                             chunk_audio,
                             24000,
                             output_format,
+                            speed,
+                            chunk_text,
                             is_first_chunk=is_first,
-                            normalizer=normalizer,
                             is_last_chunk=is_last,
+                            normalizer=normalizer,
                         )
                         yield converted
                     except Exception as e:
                         logger.error(f"Failed to convert audio: {str(e)}")
                 else:
-                    yield chunk_audio
+                    trimmed = await AudioService.trim_audio(chunk_audio,
+                        chunk_text,
+                        speed,
+                        is_last,
+                        normalizer)
+                    yield trimmed
             else:
                 # For legacy backends, load voice tensor
                 voice_tensor = await self._voice_manager.load_voice(

@@ -130,6 +139,8 @@ class TTSService:
                             chunk_audio,
                             24000,
                             output_format,
+                            speed,
+                            chunk_text,
                             is_first_chunk=is_first,
                             normalizer=normalizer,
                             is_last_chunk=is_last,

@@ -138,7 +149,12 @@ class TTSService:
                     except Exception as e:
                         logger.error(f"Failed to convert audio: {str(e)}")
                 else:
-                    yield chunk_audio
+                    trimmed = await AudioService.trim_audio(chunk_audio,
+                        chunk_text,
+                        speed,
+                        is_last,
+                        normalizer)
+                    yield trimmed
         except Exception as e:
             logger.error(f"Failed to process tokens: {str(e)}")
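
Net effect in the streaming path: encoded formats are trimmed inside convert_audio (which now receives speed and chunk_text), while the raw-audio branch trims explicitly before yielding. A condensed sketch of the per-chunk decision, with control flow simplified from the diff (emit_chunk is a hypothetical name; AudioService is as defined in the audio service diff above):

    # Simplified per-chunk flow after this commit (not the full service method)
    async def emit_chunk(chunk_audio, chunk_text, speed, is_first, is_last,
                         output_format, normalizer):
        if output_format:
            # Encoded path: convert_audio normalizes, trims, then encodes
            return await AudioService.convert_audio(
                chunk_audio, 24000, output_format, speed, chunk_text,
                is_first_chunk=is_first, is_last_chunk=is_last,
                normalizer=normalizer,
            )
        # Raw path: trim leading/trailing silence, then yield the bare samples
        return AudioService.trim_audio(chunk_audio, chunk_text, speed,
                                       is_last, normalizer)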

@@ -9,10 +9,10 @@ RUN apt-get update && apt-get install -y \
     curl \
     ffmpeg \
     g++ \
-    && apt-get clean \
-    && rm -rf /var/lib/apt/lists/* \
-    && mkdir -p /usr/share/espeak-ng-data \
-    && ln -s /usr/lib/*/espeak-ng-data/* /usr/share/espeak-ng-data/
+    && apt-get clean \
+    && rm -rf /var/lib/apt/lists/* \
+    && mkdir -p /usr/share/espeak-ng-data \
+    && ln -s /usr/lib/*/espeak-ng-data/* /usr/share/espeak-ng-data/
 
 # Install UV using the installer script
 RUN curl -LsSf https://astral.sh/uv/install.sh | sh && \

@@ -20,7 +20,7 @@ RUN curl -LsSf https://astral.sh/uv/install.sh | sh && \
     mv /root/.local/bin/uvx /usr/local/bin/
 
 # Create non-root user and set up directories and permissions
-RUN useradd -m -u 1000 appuser && \
+RUN useradd -m -u 1001 appuser && \
     mkdir -p /app/api/src/models/v1_0 && \
     chown -R appuser:appuser /app

@@ -32,7 +32,7 @@ COPY --chown=appuser:appuser pyproject.toml ./pyproject.toml
 
 # Install dependencies
 RUN --mount=type=cache,target=/root/.cache/uv \
-    uv venv && \
+    uv venv --python 3.11 && \
     uv sync --extra cpu
 
 # Copy project files including models

@@ -40,6 +40,7 @@ COPY --chown=appuser:appuser api ./api
 COPY --chown=appuser:appuser web ./web
 COPY --chown=appuser:appuser docker/scripts/ ./
 RUN chmod +x ./entrypoint.sh
+RUN sed -i 's/\r$//' ./entrypoint.sh
 
 # Set environment variables
 ENV PYTHONUNBUFFERED=1 \

@@ -1,26 +1,29 @@
-FROM --platform=$BUILDPLATFORM nvidia/cuda:12.4.1-cudnn-runtime-ubuntu22.04
+FROM --platform=$BUILDPLATFORM nvidia/cuda:12.8.0-cudnn-runtime-ubuntu24.04
 # Set non-interactive frontend
 ENV DEBIAN_FRONTEND=noninteractive
 
 # Install Python and other dependencies
 RUN apt-get update && apt-get install -y \
-    python3.10 \
-    python3.10-venv \
+    python3-venv \
     espeak-ng \
     espeak-ng-data \
     git \
     libsndfile1 \
     curl \
     ffmpeg \
-    && apt-get clean && rm -rf /var/lib/apt/lists/* \
+    && apt-get clean \
+    && rm -rf /var/lib/apt/lists/* \
+    && mkdir -p /usr/share/espeak-ng-data \
+    && ln -s /usr/lib/*/espeak-ng-data/* /usr/share/espeak-ng-data/
 
 # Install UV using the installer script
 RUN curl -LsSf https://astral.sh/uv/install.sh | sh && \
     mv /root/.local/bin/uv /usr/local/bin/ && \
-    mv /root/.local/bin/uvx /usr/local/bin/ && \
-    useradd -m -u 1000 appuser && \
+    mv /root/.local/bin/uvx /usr/local/bin/
+
 # Create non-root user and set up directories and permissions
+RUN useradd -m -u 1001 appuser && \
     mkdir -p /app/api/src/models/v1_0 && \
     chown -R appuser:appuser /app

@@ -32,7 +35,7 @@ COPY --chown=appuser:appuser pyproject.toml ./pyproject.toml
 
 # Install dependencies with GPU extras (using cache mounts)
 RUN --mount=type=cache,target=/root/.cache/uv \
-    uv venv && \
+    uv venv --python 3.11 && \
     uv sync --extra gpu
 
 # Copy project files including models and sync again

@@ -40,6 +43,7 @@ COPY --chown=appuser:appuser api ./api
 COPY --chown=appuser:appuser web ./web
 COPY --chown=appuser:appuser docker/scripts/ ./
 RUN chmod +x ./entrypoint.sh
+RUN sed -i 's/\r$//' ./entrypoint.sh
 RUN --mount=type=cache,target=/root/.cache/uv \
     uv sync --extra gpu

@@ -36,7 +36,8 @@ dependencies = [
     "kokoro @ git+https://github.com/hexgrad/kokoro.git@31a2b6337b8c1b1418ef68c48142328f640da938",
     'misaki[en,ja,ko,zh] @ git+https://github.com/hexgrad/misaki.git@ebc76c21b66c5fc4866ed0ec234047177b396170',
     "spacy==3.7.2",
-    "en-core-web-sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl"
+    "en-core-web-sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl",
+    "inflect>=7.5.0",
 ]
 
 [project.optional-dependencies]

start-gpu.bat (new file, 10 lines added)

@@ -0,0 +1,10 @@
+set PYTHONUTF8=1
+set USE_GPU=true
+set USE_ONNX=false
+set PYTHONPATH=%PROJECT_ROOT%;%PROJECT_ROOT%\api
+set MODEL_DIR=src\models
+set VOICES_DIR=src\voices\v1_0
+set WEB_PLAYER_PATH=%PROJECT_ROOT%\web
+
+call uv pip install -e ".[gpu]"
+call uv run uvicorn api.src.main:app --reload --host 0.0.0.0 --port 8880