mirror of
https://github.com/remsky/Kokoro-FastAPI.git
synced 2025-08-05 16:48:53 +00:00
Refactor configuration and enhance web interface: update GPU settings, add speed control, and improve input handling for audio generation
This commit is contained in:
parent
ba577d348e
commit
8eb3525382
9 changed files with 104 additions and 30 deletions
|
@ -1 +0,0 @@
|
|||
Subproject commit c97b7bbc3e60f447383c79b2f94fee861ff156ac
|
|
@ -11,7 +11,7 @@
|
|||
|
||||
Dockerized FastAPI wrapper for [Kokoro-82M](https://huggingface.co/hexgrad/Kokoro-82M) text-to-speech model
|
||||
- OpenAI-compatible Speech endpoint, with inline voice combination, and mapped naming/models for strict systems
|
||||
- NVIDIA GPU accelerated or CPU inference (ONNX, Pytorch) (~80-300mb modelfile)
|
||||
- NVIDIA GPU accelerated or CPU inference (ONNX, PyTorch)
|
||||
- Very fast generation time
|
||||
- 35x-100x+ real time speed via 4060Ti+
|
||||
- 5x+ real time speed via M3 Pro CPU
|
||||
|
|
|
@ -13,8 +13,8 @@ class Settings(BaseSettings):
|
|||
output_dir: str = "output"
|
||||
output_dir_size_limit_mb: float = 500.0 # Maximum size of output directory in MB
|
||||
default_voice: str = "af"
|
||||
use_gpu: bool = False # Whether to use GPU acceleration if available
|
||||
use_onnx: bool = True # Whether to use ONNX runtime
|
||||
use_gpu: bool = True # Whether to use GPU acceleration if available
|
||||
use_onnx: bool = False # Whether to use ONNX runtime
|
||||
allow_local_voice_saving: bool = False # Whether to allow saving combined voices locally
|
||||
|
||||
# Container absolute paths
|
||||
|
|
|
@ -11,7 +11,7 @@ class ONNXCPUConfig(BaseModel):
|
|||
instance_timeout: int = Field(300, description="Session timeout in seconds")
|
||||
|
||||
# Runtime settings
|
||||
num_threads: int = Field(4, description="Number of threads for parallel operations")
|
||||
num_threads: int = Field(8, description="Number of threads for parallel operations")
|
||||
inter_op_threads: int = Field(4, description="Number of threads for operator parallelism")
|
||||
execution_mode: str = Field("parallel", description="ONNX execution mode")
|
||||
optimization_level: str = Field("all", description="ONNX optimization level")
|
||||
|
@ -55,7 +55,6 @@ class PyTorchGPUConfig(BaseModel):
|
|||
"""PyTorch GPU backend configuration."""
|
||||
|
||||
device_id: int = Field(0, description="CUDA device ID")
|
||||
use_fp16: bool = Field(True, description="Whether to use FP16 precision")
|
||||
use_triton: bool = Field(True, description="Whether to use Triton for CUDA kernels")
|
||||
memory_threshold: float = Field(0.8, description="Memory threshold for cleanup")
|
||||
retry_on_oom: bool = Field(True, description="Whether to retry on OOM errors")
|
||||
|
|
|
@ -85,17 +85,23 @@ class ONNXCPUBackend(BaseModelBackend):
|
|||
style_input = voice[len(tokens) + 2].numpy() # Adjust index for start/end tokens
|
||||
speed_input = np.full(1, speed, dtype=np.float32)
|
||||
|
||||
# Run inference
|
||||
result = self._session.run(
|
||||
None,
|
||||
{
|
||||
"tokens": tokens_input,
|
||||
"style": style_input,
|
||||
"speed": speed_input
|
||||
}
|
||||
)
|
||||
# Build base inputs
|
||||
inputs = {
|
||||
"style": style_input,
|
||||
"speed": speed_input
|
||||
}
|
||||
|
||||
return result[0]
|
||||
# Try both possible token input names. TODO: query the session's declared input names instead of probing by trial and error.
|
||||
for token_name in ["tokens", "input_ids"]:
|
||||
try:
|
||||
inputs[token_name] = tokens_input
|
||||
result = self._session.run(None, inputs)
|
||||
return result[0]
|
||||
except Exception:
|
||||
del inputs[token_name]
|
||||
continue
|
||||
|
||||
raise RuntimeError("Model does not accept either 'tokens' or 'input_ids' as input name")
|
||||
|
||||
except Exception as e:
|
||||
raise RuntimeError(f"Generation failed: {e}")
|
||||
|
|
|
@ -37,7 +37,7 @@ def main(custom_models: List[str] = None):
|
|||
# Default ONNX model if no arguments provided
|
||||
default_models = [
|
||||
"https://github.com/remsky/Kokoro-FastAPI/releases/download/v0.1.0/kokoro-v0_19.onnx",
|
||||
"https://github.com/remsky/Kokoro-FastAPI/releases/download/v0.1.0/kokoro-v0_19_fp16.onnx"
|
||||
# "https://github.com/remsky/Kokoro-FastAPI/releases/download/v0.1.0/kokoro-v0_19_fp16.onnx"
|
||||
]
|
||||
|
||||
# Use provided models or default
|
||||
|
|
16
web/app.js
16
web/app.js
|
@ -14,7 +14,9 @@ class KokoroPlayer {
|
|||
waveContainer: document.getElementById('wave-container'),
|
||||
timeDisplay: document.getElementById('time-display'),
|
||||
downloadBtn: document.getElementById('download-btn'),
|
||||
status: document.getElementById('status')
|
||||
status: document.getElementById('status'),
|
||||
speedSlider: document.getElementById('speed-slider'),
|
||||
speedValue: document.getElementById('speed-value')
|
||||
};
|
||||
|
||||
this.isGenerating = false;
|
||||
|
@ -201,6 +203,11 @@ class KokoroPlayer {
|
|||
this.elements.playPauseBtn.addEventListener('click', () => this.togglePlayPause());
|
||||
this.elements.downloadBtn.addEventListener('click', () => this.downloadAudio());
|
||||
|
||||
this.elements.speedSlider.addEventListener('input', (e) => {
|
||||
const speed = parseFloat(e.target.value);
|
||||
this.elements.speedValue.textContent = speed.toFixed(1);
|
||||
});
|
||||
|
||||
document.addEventListener('click', (e) => {
|
||||
if (!this.elements.voiceSearch.contains(e.target) &&
|
||||
!this.elements.voiceDropdown.contains(e.target)) {
|
||||
|
@ -329,7 +336,8 @@ class KokoroPlayer {
|
|||
input: text,
|
||||
voice: voice,
|
||||
response_format: 'mp3',
|
||||
stream: true
|
||||
stream: true,
|
||||
speed: parseFloat(this.elements.speedSlider.value)
|
||||
}),
|
||||
signal: this.currentController.signal
|
||||
});
|
||||
|
@ -418,11 +426,13 @@ class KokoroPlayer {
|
|||
if (this.audioChunks.length === 0) return;
|
||||
|
||||
const format = this.elements.formatSelect.value;
|
||||
const voice = Array.from(this.selectedVoiceSet).join('+');
|
||||
const timestamp = new Date().toISOString().replace(/[:.]/g, '-');
|
||||
const blob = new Blob(this.audioChunks, { type: `audio/${format}` });
|
||||
const url = URL.createObjectURL(blob);
|
||||
const a = document.createElement('a');
|
||||
a.href = url;
|
||||
a.download = `generated-speech.${format}`;
|
||||
a.download = `${voice}_${timestamp}.${format}`;
|
||||
document.body.appendChild(a);
|
||||
a.click();
|
||||
document.body.removeChild(a);
|
||||
|
|
|
@ -26,7 +26,7 @@
|
|||
<div class="overlay"></div>
|
||||
<div class="badges-container">
|
||||
<a href="https://huggingface.co/hexgrad/Kokoro-82M" target="_blank" class="badge">
|
||||
<img src="https://img.shields.io/badge/HexGrad%2FKokoro--82M-black?logo=huggingface&logoColor=white&labelColor=black&style=for-the-badge" alt="HexGrad/Kokoro-82M on Hugging Face">
|
||||
<img src="https://img.shields.io/badge/Powered--by--HexGrad%2FKokoro--82M-black?logo=huggingface&logoColor=white&labelColor=black&style=for-the-badge" alt="HexGrad/Kokoro-82M on Hugging Face">
|
||||
</a>
|
||||
<div class="badge">
|
||||
<a class="github-button" href="https://github.com/remsky/Kokoro-FastAPI" data-color-scheme="dark" data-size="large" data-show-count="true" aria-label="Star remsky/Kokoro-FastAPI on GitHub">Kokoro-FastAPI</a>
|
||||
|
@ -68,15 +68,21 @@
|
|||
</div>
|
||||
</div>
|
||||
<div class="options">
|
||||
<label>
|
||||
<input type="checkbox" id="autoplay-toggle" checked>
|
||||
Auto-play
|
||||
</label>
|
||||
<select id="format-select" class="format-select">
|
||||
<option value="mp3">MP3</option>
|
||||
<option value="wav">WAV</option>
|
||||
<option value="pcm">PCM</option>
|
||||
</select>
|
||||
<div class="option-group">
|
||||
<label>
|
||||
<input type="checkbox" id="autoplay-toggle" checked>
|
||||
Auto-play
|
||||
</label>
|
||||
<select id="format-select" class="format-select">
|
||||
<option value="mp3">MP3</option>
|
||||
<option value="wav">WAV</option>
|
||||
<option value="pcm">PCM</option>
|
||||
</select>
|
||||
</div>
|
||||
<div class="speed-control">
|
||||
<label for="speed-slider">Speed: <span id="speed-value">1.0</span>x</label>
|
||||
<input type="range" id="speed-slider" min="0.1" max="4" step="0.1" value="1.0">
|
||||
</div>
|
||||
</div>
|
||||
<div class="button-group">
|
||||
<button id="generate-btn">
|
||||
|
|
|
@ -129,6 +129,60 @@ textarea::placeholder {
|
|||
flex-wrap: wrap;
|
||||
}
|
||||
|
||||
.option-group {
|
||||
display: flex;
|
||||
align-items: center;
|
||||
gap: 2rem;
|
||||
}
|
||||
|
||||
.speed-control {
|
||||
display: flex;
|
||||
flex-direction: column;
|
||||
gap: 0.5rem;
|
||||
}
|
||||
|
||||
.speed-control label {
|
||||
color: var(--text-light);
|
||||
font-size: 0.875rem;
|
||||
}
|
||||
|
||||
.speed-control input[type="range"] {
|
||||
width: 150px;
|
||||
height: 4px;
|
||||
-webkit-appearance: none;
|
||||
background: rgba(99, 102, 241, 0.2);
|
||||
border-radius: 2px;
|
||||
outline: none;
|
||||
}
|
||||
|
||||
.speed-control input[type="range"]::-webkit-slider-thumb {
|
||||
-webkit-appearance: none;
|
||||
width: 16px;
|
||||
height: 16px;
|
||||
background: var(--fg-color);
|
||||
border-radius: 50%;
|
||||
cursor: pointer;
|
||||
transition: transform 0.2s ease;
|
||||
}
|
||||
|
||||
.speed-control input[type="range"]::-webkit-slider-thumb:hover {
|
||||
transform: scale(1.1);
|
||||
}
|
||||
|
||||
.speed-control input[type="range"]::-moz-range-thumb {
|
||||
width: 16px;
|
||||
height: 16px;
|
||||
background: var(--fg-color);
|
||||
border: none;
|
||||
border-radius: 50%;
|
||||
cursor: pointer;
|
||||
transition: transform 0.2s ease;
|
||||
}
|
||||
|
||||
.speed-control input[type="range"]::-moz-range-thumb:hover {
|
||||
transform: scale(1.1);
|
||||
}
|
||||
|
||||
.options label {
|
||||
display: flex;
|
||||
align-items: center;
|
||||
|
|
Loading…
Add table
Reference in a new issue