Refactor configuration and enhance web interface: update GPU settings, add speed control, and improve input handling for audio generation

This commit is contained in:
remsky 2025-01-23 04:54:55 -07:00
parent ba577d348e
commit 8eb3525382
9 changed files with 104 additions and 30 deletions

@@ -1 +0,0 @@
Subproject commit c97b7bbc3e60f447383c79b2f94fee861ff156ac

@@ -11,7 +11,7 @@
Dockerized FastAPI wrapper for [Kokoro-82M](https://huggingface.co/hexgrad/Kokoro-82M) text-to-speech model
- OpenAI-compatible Speech endpoint, with inline voice combination, and mapped naming/models for strict systems
- NVIDIA GPU accelerated or CPU inference (ONNX, Pytorch) (~80-300mb modelfile)
- NVIDIA GPU accelerated or CPU inference (ONNX, Pytorch)
- very fast generation time
- 35x-100x+ real time speed via 4060Ti+
- 5x+ real time speed via M3 Pro CPU

@@ -13,8 +13,8 @@ class Settings(BaseSettings):
output_dir: str = "output"
output_dir_size_limit_mb: float = 500.0 # Maximum size of output directory in MB
default_voice: str = "af"
use_gpu: bool = False # Whether to use GPU acceleration if available
use_onnx: bool = True # Whether to use ONNX runtime
use_gpu: bool = True # Whether to use GPU acceleration if available
use_onnx: bool = False # Whether to use ONNX runtime
allow_local_voice_saving: bool = False # Whether to allow saving combined voices locally
# Container absolute paths
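With the defaults flipped to GPU + PyTorch, CPU-only deployments now need to override these settings rather than relying on the shipped values. A minimal sketch of that override, assuming the Settings class follows pydantic-settings conventions and therefore picks up matching environment variables (the field subset below mirrors the diff; it is illustrative, not the repository's full config):

```python
from pydantic_settings import BaseSettings

class Settings(BaseSettings):
    # Illustrative subset of the fields changed in this commit
    use_gpu: bool = True    # use GPU acceleration if available
    use_onnx: bool = False  # use ONNX runtime instead of PyTorch

# Launching with `USE_GPU=false USE_ONNX=true` in the environment restores
# the previous CPU/ONNX behaviour without editing the config file.
settings = Settings()
print(settings.use_gpu, settings.use_onnx)
```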

@@ -11,7 +11,7 @@ class ONNXCPUConfig(BaseModel):
instance_timeout: int = Field(300, description="Session timeout in seconds")
# Runtime settings
num_threads: int = Field(4, description="Number of threads for parallel operations")
num_threads: int = Field(8, description="Number of threads for parallel operations")
inter_op_threads: int = Field(4, description="Number of threads for operator parallelism")
execution_mode: str = Field("parallel", description="ONNX execution mode")
optimization_level: str = Field("all", description="ONNX optimization level")
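For reference, fields like these usually map onto onnxruntime's SessionOptions. A sketch of how the raised thread count and the "parallel"/"all" settings would be applied when building a CPU session; the mapping is an assumption for illustration, not the repository's loader code (the model filename is taken from the download script further down):

```python
import onnxruntime as ort

opts = ort.SessionOptions()
opts.intra_op_num_threads = 8   # num_threads: parallelism inside a single operator
opts.inter_op_num_threads = 4   # inter_op_threads: parallelism across operators
opts.execution_mode = ort.ExecutionMode.ORT_PARALLEL                       # execution_mode="parallel"
opts.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_ALL  # optimization_level="all"

session = ort.InferenceSession(
    "kokoro-v0_19.onnx",
    sess_options=opts,
    providers=["CPUExecutionProvider"],
)
```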
@@ -55,7 +55,6 @@ class PyTorchGPUConfig(BaseModel):
"""PyTorch GPU backend configuration."""
device_id: int = Field(0, description="CUDA device ID")
use_fp16: bool = Field(True, description="Whether to use FP16 precision")
use_triton: bool = Field(True, description="Whether to use Triton for CUDA kernels")
memory_threshold: float = Field(0.8, description="Memory threshold for cleanup")
retry_on_oom: bool = Field(True, description="Whether to retry on OOM errors")
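A minimal sketch of how a memory_threshold / retry_on_oom policy like the one configured above could wrap an inference call, assuming CUDA is available; this is illustrative only and not the repository's actual implementation:

```python
import torch

def run_with_oom_retry(infer, *args, memory_threshold: float = 0.8, retries: int = 1):
    """Call `infer`, freeing CUDA cache when usage crosses the threshold or OOM is hit."""
    total = torch.cuda.get_device_properties(0).total_memory
    for attempt in range(retries + 1):
        if torch.cuda.memory_allocated(0) / total > memory_threshold:
            torch.cuda.empty_cache()  # cleanup once above the configured threshold
        try:
            return infer(*args)
        except torch.cuda.OutOfMemoryError:
            if attempt == retries:
                raise
            torch.cuda.empty_cache()  # retry_on_oom: clear cache, then try again
```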

@@ -85,17 +85,23 @@ class ONNXCPUBackend(BaseModelBackend):
style_input = voice[len(tokens) + 2].numpy() # Adjust index for start/end tokens
speed_input = np.full(1, speed, dtype=np.float32)
# Run inference
result = self._session.run(
None,
{
"tokens": tokens_input,
"style": style_input,
"speed": speed_input
}
)
# Build base inputs
inputs = {
"style": style_input,
"speed": speed_input
}
return result[0]
# Try both possible token input names #TODO:
for token_name in ["tokens", "input_ids"]:
try:
inputs[token_name] = tokens_input
result = self._session.run(None, inputs)
return result[0]
except Exception:
del inputs[token_name]
continue
raise RuntimeError("Model does not accept either 'tokens' or 'input_ids' as input name")
except Exception as e:
raise RuntimeError(f"Generation failed: {e}")

@@ -37,7 +37,7 @@ def main(custom_models: List[str] = None):
# Default ONNX model if no arguments provided
default_models = [
"https://github.com/remsky/Kokoro-FastAPI/releases/download/v0.1.0/kokoro-v0_19.onnx",
"https://github.com/remsky/Kokoro-FastAPI/releases/download/v0.1.0/kokoro-v0_19_fp16.onnx"
# "https://github.com/remsky/Kokoro-FastAPI/releases/download/v0.1.0/kokoro-v0_19_fp16.onnx"
]
# Use provided models or default
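With the fp16 ONNX model commented out of the defaults, only the full-precision model is fetched automatically; the fp16 file can still be downloaded on demand by passing its URL explicitly. A hypothetical invocation, assuming main() accepts the list of URLs shown in its signature above:

```python
# Hypothetical usage of the download helper after this commit
main([
    "https://github.com/remsky/Kokoro-FastAPI/releases/download/v0.1.0/kokoro-v0_19_fp16.onnx",
])
```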

@@ -14,7 +14,9 @@ class KokoroPlayer {
waveContainer: document.getElementById('wave-container'),
timeDisplay: document.getElementById('time-display'),
downloadBtn: document.getElementById('download-btn'),
status: document.getElementById('status')
status: document.getElementById('status'),
speedSlider: document.getElementById('speed-slider'),
speedValue: document.getElementById('speed-value')
};
this.isGenerating = false;
@@ -201,6 +203,11 @@ class KokoroPlayer {
this.elements.playPauseBtn.addEventListener('click', () => this.togglePlayPause());
this.elements.downloadBtn.addEventListener('click', () => this.downloadAudio());
this.elements.speedSlider.addEventListener('input', (e) => {
const speed = parseFloat(e.target.value);
this.elements.speedValue.textContent = speed.toFixed(1);
});
document.addEventListener('click', (e) => {
if (!this.elements.voiceSearch.contains(e.target) &&
!this.elements.voiceDropdown.contains(e.target)) {
@@ -329,7 +336,8 @@ class KokoroPlayer {
input: text,
voice: voice,
response_format: 'mp3',
stream: true
stream: true,
speed: parseFloat(this.elements.speedSlider.value)
}),
signal: this.currentController.signal
});
@@ -418,11 +426,13 @@ class KokoroPlayer {
if (this.audioChunks.length === 0) return;
const format = this.elements.formatSelect.value;
const voice = Array.from(this.selectedVoiceSet).join('+');
const timestamp = new Date().toISOString().replace(/[:.]/g, '-');
const blob = new Blob(this.audioChunks, { type: `audio/${format}` });
const url = URL.createObjectURL(blob);
const a = document.createElement('a');
a.href = url;
a.download = `generated-speech.${format}`;
a.download = `${voice}_${timestamp}.${format}`;
document.body.appendChild(a);
a.click();
document.body.removeChild(a);
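The new speed control is just an extra field in the request body, so non-browser clients can use it the same way. A sketch of an equivalent streaming request from Python, mirroring the payload built by the web player above; the host, port, and /v1/audio/speech path are assumptions, as the exact route is not shown in this diff:

```python
import requests

resp = requests.post(
    "http://localhost:8880/v1/audio/speech",   # host/port/path are assumptions
    json={
        "input": "Hello from the speed slider.",
        "voice": "af",                          # default voice from the settings above
        "response_format": "mp3",
        "speed": 1.5,                           # mirrors the new slider value
        "stream": True,
    },
    stream=True,
)
resp.raise_for_status()
with open("af_speed-1.5.mp3", "wb") as f:
    for chunk in resp.iter_content(chunk_size=8192):
        f.write(chunk)
```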

@@ -26,7 +26,7 @@
<div class="overlay"></div>
<div class="badges-container">
<a href="https://huggingface.co/hexgrad/Kokoro-82M" target="_blank" class="badge">
<img src="https://img.shields.io/badge/HexGrad%2FKokoro--82M-black?logo=huggingface&logoColor=white&labelColor=black&style=for-the-badge" alt="HexGrad/Kokoro-82M on Hugging Face">
<img src="https://img.shields.io/badge/Powered--by--HexGrad%2FKokoro--82M-black?logo=huggingface&logoColor=white&labelColor=black&style=for-the-badge" alt="HexGrad/Kokoro-82M on Hugging Face">
</a>
<div class="badge">
<a class="github-button" href="https://github.com/remsky/Kokoro-FastAPI" data-color-scheme="dark" data-size="large" data-show-count="true" aria-label="Star remsky/Kokoro-FastAPI on GitHub">Kokoro-FastAPI</a>
@@ -68,15 +68,21 @@
</div>
</div>
<div class="options">
<label>
<input type="checkbox" id="autoplay-toggle" checked>
Auto-play
</label>
<select id="format-select" class="format-select">
<option value="mp3">MP3</option>
<option value="wav">WAV</option>
<option value="pcm">PCM</option>
</select>
<div class="option-group">
<label>
<input type="checkbox" id="autoplay-toggle" checked>
Auto-play
</label>
<select id="format-select" class="format-select">
<option value="mp3">MP3</option>
<option value="wav">WAV</option>
<option value="pcm">PCM</option>
</select>
</div>
<div class="speed-control">
<label for="speed-slider">Speed: <span id="speed-value">1.0</span>x</label>
<input type="range" id="speed-slider" min="0.1" max="4" step="0.1" value="1.0">
</div>
</div>
<div class="button-group">
<button id="generate-btn">

@@ -129,6 +129,60 @@ textarea::placeholder {
flex-wrap: wrap;
}
.option-group {
display: flex;
align-items: center;
gap: 2rem;
}
.speed-control {
display: flex;
flex-direction: column;
gap: 0.5rem;
}
.speed-control label {
color: var(--text-light);
font-size: 0.875rem;
}
.speed-control input[type="range"] {
width: 150px;
height: 4px;
-webkit-appearance: none;
background: rgba(99, 102, 241, 0.2);
border-radius: 2px;
outline: none;
}
.speed-control input[type="range"]::-webkit-slider-thumb {
-webkit-appearance: none;
width: 16px;
height: 16px;
background: var(--fg-color);
border-radius: 50%;
cursor: pointer;
transition: transform 0.2s ease;
}
.speed-control input[type="range"]::-webkit-slider-thumb:hover {
transform: scale(1.1);
}
.speed-control input[type="range"]::-moz-range-thumb {
width: 16px;
height: 16px;
background: var(--fg-color);
border: none;
border-radius: 50%;
cursor: pointer;
transition: transform 0.2s ease;
}
.speed-control input[type="range"]::-moz-range-thumb:hover {
transform: scale(1.1);
}
.options label {
display: flex;
align-items: center;