mirror of
https://github.com/remsky/Kokoro-FastAPI.git
synced 2025-04-13 09:39:17 +00:00
Merge branch 'streaming-word-timestamps' of https://github.com/fireblade2534/Kokoro-FastAPI into streaming-word-timestamps
This commit is contained in:
commit
842d056552
6 changed files with 31 additions and 18 deletions
20
README.md
20
README.md
|
@ -12,15 +12,20 @@
|
||||||
|
|
||||||
[](https://huggingface.co/hexgrad/Kokoro-82M/commit/9901c2b79161b6e898b7ea857ae5298f47b8b0d6)
|
[](https://huggingface.co/hexgrad/Kokoro-82M/commit/9901c2b79161b6e898b7ea857ae5298f47b8b0d6)
|
||||||
|
|
||||||
|
|
||||||
Dockerized FastAPI wrapper for [Kokoro-82M](https://huggingface.co/hexgrad/Kokoro-82M) text-to-speech model
|
Dockerized FastAPI wrapper for [Kokoro-82M](https://huggingface.co/hexgrad/Kokoro-82M) text-to-speech model
|
||||||
- Multi-language support (English, Japanese, Korean, Chinese, Vietnamese)
|
- Multi-language support (English, Japanese, Korean, Chinese, _Vietnamese soon_)
|
||||||
- OpenAI-compatible Speech endpoint, NVIDIA GPU accelerated or CPU inference with PyTorch
|
- OpenAI-compatible Speech endpoint, NVIDIA GPU accelerated or CPU inference with PyTorch
|
||||||
- ONNX support coming soon, see v0.1.5 and earlier for legacy ONNX support in the interim
|
- ONNX support coming soon, see v0.1.5 and earlier for legacy ONNX support in the interim
|
||||||
- Debug endpoints for monitoring system stats, integrated web UI on localhost:8880/web
|
- Debug endpoints for monitoring system stats, integrated web UI on localhost:8880/web
|
||||||
- Phoneme-based audio generation, phoneme generation
|
- Phoneme-based audio generation, phoneme generation
|
||||||
- (new) Per-word timestamped caption generation
|
- Per-word timestamped caption generation
|
||||||
- (new) Voice mixing with weighted combinations
|
- Voice mixing with weighted combinations
|
||||||
|
|
||||||
|
### Integration Guides
|
||||||
|
[](https://github.com/remsky/Kokoro-FastAPI/wiki/Setup-Kubernetes) [](https://github.com/remsky/Kokoro-FastAPI/wiki/Integrations-DigitalOcean) [](https://github.com/remsky/Kokoro-FastAPI/wiki/Integrations-SillyTavern)
|
||||||
|
[](https://github.com/remsky/Kokoro-FastAPI/wiki/Integrations-OpenWebUi)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
## Get Started
|
## Get Started
|
||||||
|
@ -36,8 +41,8 @@ Refer to the core/config.py file for a full list of variables which can be manag
|
||||||
# the `latest` tag can be used, but should not be considered stable as it may include `nightly` branch builds
|
# the `latest` tag can be used, but should not be considered stable as it may include `nightly` branch builds
|
||||||
# it may have some bonus features however, and feedback/testing is welcome
|
# it may have some bonus features however, and feedback/testing is welcome
|
||||||
|
|
||||||
docker run -p 8880:8880 ghcr.io/remsky/kokoro-fastapi-cpu:v0.2.1 # CPU, or:
|
docker run -p 8880:8880 ghcr.io/remsky/kokoro-fastapi-cpu:v0.2.2 # CPU, or:
|
||||||
docker run --gpus all -p 8880:8880 ghcr.io/remsky/kokoro-fastapi-gpu:v0.2.1 #NVIDIA GPU
|
docker run --gpus all -p 8880:8880 ghcr.io/remsky/kokoro-fastapi-gpu:v0.2.2 #NVIDIA GPU
|
||||||
```
|
```
|
||||||
|
|
||||||
|
|
||||||
|
@ -121,6 +126,7 @@ with client.audio.speech.with_streaming_response.create(
|
||||||
|
|
||||||
</details>
|
</details>
|
||||||
|
|
||||||
|
|
||||||
## Features
|
## Features
|
||||||
<details>
|
<details>
|
||||||
<summary>OpenAI-Compatible Speech Endpoint</summary>
|
<summary>OpenAI-Compatible Speech Endpoint</summary>
|
||||||
|
@ -351,7 +357,7 @@ cd docker/cpu
|
||||||
docker compose up --build
|
docker compose up --build
|
||||||
|
|
||||||
```
|
```
|
||||||
*Note: Overall speed may have reduced somewhat with the structural changes to accomodate streaming. Looking into it*
|
*Note: Overall speed may have reduced somewhat with the structural changes to accommodate streaming. Looking into it*
|
||||||
</details>
|
</details>
|
||||||
|
|
||||||
<details>
|
<details>
|
||||||
|
|
|
@ -149,8 +149,8 @@ async def stream_audio_chunks(
|
||||||
voice=voice_name,
|
voice=voice_name,
|
||||||
speed=request.speed,
|
speed=request.speed,
|
||||||
output_format=request.response_format,
|
output_format=request.response_format,
|
||||||
lang_code=request.lang_code or request.voice[0],
|
lang_code=request.lang_code or settings.default_voice_code or voice_name[0].lower(),
|
||||||
normalization_options=request.normalization_options,
|
normalization_options=request.normalization_options
|
||||||
return_timestamps=unique_properties["return_timestamps"],
|
return_timestamps=unique_properties["return_timestamps"],
|
||||||
):
|
):
|
||||||
|
|
||||||
|
@ -212,7 +212,9 @@ async def create_speech(
|
||||||
if request.return_download_link:
|
if request.return_download_link:
|
||||||
from ..services.temp_manager import TempFileWriter
|
from ..services.temp_manager import TempFileWriter
|
||||||
|
|
||||||
temp_writer = TempFileWriter(request.response_format)
|
# Use download_format if specified, otherwise use response_format
|
||||||
|
output_format = request.download_format or request.response_format
|
||||||
|
temp_writer = TempFileWriter(output_format)
|
||||||
await temp_writer.__aenter__() # Initialize temp file
|
await temp_writer.__aenter__() # Initialize temp file
|
||||||
|
|
||||||
# Get download path immediately after temp file creation
|
# Get download path immediately after temp file creation
|
||||||
|
@ -220,7 +222,7 @@ async def create_speech(
|
||||||
|
|
||||||
# Create response headers with download path
|
# Create response headers with download path
|
||||||
headers = {
|
headers = {
|
||||||
"Content-Disposition": f"attachment; filename=speech.{request.response_format}",
|
"Content-Disposition": f"attachment; filename=speech.{output_format}",
|
||||||
"X-Accel-Buffering": "no",
|
"X-Accel-Buffering": "no",
|
||||||
"Cache-Control": "no-cache",
|
"Cache-Control": "no-cache",
|
||||||
"Transfer-Encoding": "chunked",
|
"Transfer-Encoding": "chunked",
|
||||||
|
|
|
@ -112,7 +112,7 @@ class StreamingAudioWriter:
|
||||||
parameters.extend(
|
parameters.extend(
|
||||||
[
|
[
|
||||||
"-q:a",
|
"-q:a",
|
||||||
"2",
|
"0", # Highest quality
|
||||||
"-write_xing",
|
"-write_xing",
|
||||||
"1", # XING header for MP3
|
"1", # XING header for MP3
|
||||||
"-id3v1",
|
"-id3v1",
|
||||||
|
@ -142,7 +142,7 @@ class StreamingAudioWriter:
|
||||||
self.encoder.export(
|
self.encoder.export(
|
||||||
output_buffer,
|
output_buffer,
|
||||||
**format_args,
|
**format_args,
|
||||||
bitrate="192k",
|
bitrate="192k", # Optimal for 24kHz/16-bit mono source
|
||||||
parameters=parameters,
|
parameters=parameters,
|
||||||
)
|
)
|
||||||
self.encoder = None
|
self.encoder = None
|
||||||
|
@ -189,10 +189,10 @@ class StreamingAudioWriter:
|
||||||
self.encoder.export(
|
self.encoder.export(
|
||||||
output_buffer,
|
output_buffer,
|
||||||
**format_args,
|
**format_args,
|
||||||
bitrate="192k",
|
bitrate="192k", # Optimal for 24kHz/16-bit mono source
|
||||||
parameters=[
|
parameters=[
|
||||||
"-q:a",
|
"-q:a",
|
||||||
"2",
|
"0", # Highest quality for chunks too
|
||||||
"-write_xing",
|
"-write_xing",
|
||||||
"0", # No XING headers for chunks
|
"0", # No XING headers for chunks
|
||||||
],
|
],
|
||||||
|
|
|
@ -61,6 +61,10 @@ class OpenAISpeechRequest(BaseModel):
|
||||||
default="mp3",
|
default="mp3",
|
||||||
description="The format to return audio in. Supported formats: mp3, opus, flac, wav, pcm. PCM format returns raw 16-bit samples without headers. AAC is not currently supported.",
|
description="The format to return audio in. Supported formats: mp3, opus, flac, wav, pcm. PCM format returns raw 16-bit samples without headers. AAC is not currently supported.",
|
||||||
)
|
)
|
||||||
|
download_format: Optional[Literal["mp3", "opus", "aac", "flac", "wav", "pcm"]] = Field(
|
||||||
|
default=None,
|
||||||
|
description="Optional different format for the final download. If not provided, uses response_format.",
|
||||||
|
)
|
||||||
speed: float = Field(
|
speed: float = Field(
|
||||||
default=1.0,
|
default=1.0,
|
||||||
ge=0.25,
|
ge=0.25,
|
||||||
|
|
|
@ -45,8 +45,8 @@ ingress:
|
||||||
host:
|
host:
|
||||||
name: kokoro.example.com
|
name: kokoro.example.com
|
||||||
endpoints:
|
endpoints:
|
||||||
backend:
|
- paths:
|
||||||
path: "/"
|
- "/"
|
||||||
serviceName: "fastapi"
|
serviceName: "fastapi"
|
||||||
servicePort: 8880
|
servicePort: 8880
|
||||||
|
|
||||||
|
|
|
@ -39,7 +39,8 @@ export class AudioService {
|
||||||
body: JSON.stringify({
|
body: JSON.stringify({
|
||||||
input: text,
|
input: text,
|
||||||
voice: voice,
|
voice: voice,
|
||||||
response_format: 'mp3',
|
response_format: 'mp3', // Always use mp3 for streaming playback
|
||||||
|
download_format: document.getElementById('format-select').value || 'mp3', // Format for final download
|
||||||
stream: true,
|
stream: true,
|
||||||
speed: speed,
|
speed: speed,
|
||||||
return_download_link: true,
|
return_download_link: true,
|
||||||
|
|
Loading…
Add table
Reference in a new issue