Merge branch 'streaming-word-timestamps' of https://github.com/fireblade2534/Kokoro-FastAPI into streaming-word-timestamps

This commit is contained in:
Fireblade 2025-02-14 14:36:20 -05:00
commit 842d056552
6 changed files with 31 additions and 18 deletions

View file

@ -12,15 +12,20 @@
[![Tested at Model Commit](https://img.shields.io/badge/last--tested--model--commit-1.0::9901c2b-blue)](https://huggingface.co/hexgrad/Kokoro-82M/commit/9901c2b79161b6e898b7ea857ae5298f47b8b0d6) [![Tested at Model Commit](https://img.shields.io/badge/last--tested--model--commit-1.0::9901c2b-blue)](https://huggingface.co/hexgrad/Kokoro-82M/commit/9901c2b79161b6e898b7ea857ae5298f47b8b0d6)
Dockerized FastAPI wrapper for [Kokoro-82M](https://huggingface.co/hexgrad/Kokoro-82M) text-to-speech model Dockerized FastAPI wrapper for [Kokoro-82M](https://huggingface.co/hexgrad/Kokoro-82M) text-to-speech model
- Multi-language support (English, Japanese, Korean, Chinese, Vietnamese) - Multi-language support (English, Japanese, Korean, Chinese, _Vietnamese soon_)
- OpenAI-compatible Speech endpoint, NVIDIA GPU accelerated or CPU inference with PyTorch - OpenAI-compatible Speech endpoint, NVIDIA GPU accelerated or CPU inference with PyTorch
- ONNX support coming soon, see v0.1.5 and earlier for legacy ONNX support in the interim - ONNX support coming soon, see v0.1.5 and earlier for legacy ONNX support in the interim
- Debug endpoints for monitoring system stats, integrated web UI on localhost:8880/web - Debug endpoints for monitoring system stats, integrated web UI on localhost:8880/web
- Phoneme-based audio generation, phoneme generation - Phoneme-based audio generation, phoneme generation
- (new) Per-word timestamped caption generation - Per-word timestamped caption generation
- (new) Voice mixing with weighted combinations - Voice mixing with weighted combinations
### Integration Guides
[![Helm Chart](https://img.shields.io/badge/Helm%20Chart-black?style=flat&logo=helm&logoColor=white)](https://github.com/remsky/Kokoro-FastAPI/wiki/Setup-Kubernetes) [![DigitalOcean](https://img.shields.io/badge/DigitalOcean-black?style=flat&logo=digitalocean&logoColor=white)](https://github.com/remsky/Kokoro-FastAPI/wiki/Integrations-DigitalOcean) [![SillyTavern](https://img.shields.io/badge/SillyTavern-black?style=flat&color=red)](https://github.com/remsky/Kokoro-FastAPI/wiki/Integrations-SillyTavern)
[![OpenWebUI](https://img.shields.io/badge/OpenWebUI-black?style=flat&color=white)](https://github.com/remsky/Kokoro-FastAPI/wiki/Integrations-OpenWebUi)
## Get Started ## Get Started
@ -36,8 +41,8 @@ Refer to the core/config.py file for a full list of variables which can be manag
# the `latest` tag can be used, but should not be considered stable as it may include `nightly` branch builds # the `latest` tag can be used, but should not be considered stable as it may include `nightly` branch builds
# it may have some bonus features however, and feedback/testing is welcome # it may have some bonus features however, and feedback/testing is welcome
docker run -p 8880:8880 ghcr.io/remsky/kokoro-fastapi-cpu:v0.2.1 # CPU, or: docker run -p 8880:8880 ghcr.io/remsky/kokoro-fastapi-cpu:v0.2.2 # CPU, or:
docker run --gpus all -p 8880:8880 ghcr.io/remsky/kokoro-fastapi-gpu:v0.2.1 #NVIDIA GPU docker run --gpus all -p 8880:8880 ghcr.io/remsky/kokoro-fastapi-gpu:v0.2.2 #NVIDIA GPU
``` ```
@ -121,6 +126,7 @@ with client.audio.speech.with_streaming_response.create(
</details> </details>
## Features ## Features
<details> <details>
<summary>OpenAI-Compatible Speech Endpoint</summary> <summary>OpenAI-Compatible Speech Endpoint</summary>
@ -351,7 +357,7 @@ cd docker/cpu
docker compose up --build docker compose up --build
``` ```
*Note: Overall speed may have reduced somewhat with the structural changes to accomodate streaming. Looking into it* *Note: Overall speed may have reduced somewhat with the structural changes to accommodate streaming. Looking into it*
</details> </details>
<details> <details>

View file

@ -149,8 +149,8 @@ async def stream_audio_chunks(
voice=voice_name, voice=voice_name,
speed=request.speed, speed=request.speed,
output_format=request.response_format, output_format=request.response_format,
lang_code=request.lang_code or request.voice[0], lang_code=request.lang_code or settings.default_voice_code or voice_name[0].lower(),
normalization_options=request.normalization_options, normalization_options=request.normalization_options,
return_timestamps=unique_properties["return_timestamps"], return_timestamps=unique_properties["return_timestamps"],
): ):
@ -212,7 +212,9 @@ async def create_speech(
if request.return_download_link: if request.return_download_link:
from ..services.temp_manager import TempFileWriter from ..services.temp_manager import TempFileWriter
temp_writer = TempFileWriter(request.response_format) # Use download_format if specified, otherwise use response_format
output_format = request.download_format or request.response_format
temp_writer = TempFileWriter(output_format)
await temp_writer.__aenter__() # Initialize temp file await temp_writer.__aenter__() # Initialize temp file
# Get download path immediately after temp file creation # Get download path immediately after temp file creation
@ -220,7 +222,7 @@ async def create_speech(
# Create response headers with download path # Create response headers with download path
headers = { headers = {
"Content-Disposition": f"attachment; filename=speech.{request.response_format}", "Content-Disposition": f"attachment; filename=speech.{output_format}",
"X-Accel-Buffering": "no", "X-Accel-Buffering": "no",
"Cache-Control": "no-cache", "Cache-Control": "no-cache",
"Transfer-Encoding": "chunked", "Transfer-Encoding": "chunked",

View file

@ -112,7 +112,7 @@ class StreamingAudioWriter:
parameters.extend( parameters.extend(
[ [
"-q:a", "-q:a",
"2", "0", # Highest quality
"-write_xing", "-write_xing",
"1", # XING header for MP3 "1", # XING header for MP3
"-id3v1", "-id3v1",
@ -142,7 +142,7 @@ class StreamingAudioWriter:
self.encoder.export( self.encoder.export(
output_buffer, output_buffer,
**format_args, **format_args,
bitrate="192k", bitrate="192k", # Optimal for 24kHz/16-bit mono source
parameters=parameters, parameters=parameters,
) )
self.encoder = None self.encoder = None
@ -189,10 +189,10 @@ class StreamingAudioWriter:
self.encoder.export( self.encoder.export(
output_buffer, output_buffer,
**format_args, **format_args,
bitrate="192k", bitrate="192k", # Optimal for 24kHz/16-bit mono source
parameters=[ parameters=[
"-q:a", "-q:a",
"2", "0", # Highest quality for chunks too
"-write_xing", "-write_xing",
"0", # No XING headers for chunks "0", # No XING headers for chunks
], ],

View file

@ -61,6 +61,10 @@ class OpenAISpeechRequest(BaseModel):
default="mp3", default="mp3",
description="The format to return audio in. Supported formats: mp3, opus, flac, wav, pcm. PCM format returns raw 16-bit samples without headers. AAC is not currently supported.", description="The format to return audio in. Supported formats: mp3, opus, flac, wav, pcm. PCM format returns raw 16-bit samples without headers. AAC is not currently supported.",
) )
download_format: Optional[Literal["mp3", "opus", "aac", "flac", "wav", "pcm"]] = Field(
default=None,
description="Optional different format for the final download. If not provided, uses response_format.",
)
speed: float = Field( speed: float = Field(
default=1.0, default=1.0,
ge=0.25, ge=0.25,

View file

@ -45,8 +45,8 @@ ingress:
host: host:
name: kokoro.example.com name: kokoro.example.com
endpoints: endpoints:
backend: - paths:
path: "/" - "/"
serviceName: "fastapi" serviceName: "fastapi"
servicePort: 8880 servicePort: 8880

View file

@ -39,7 +39,8 @@ export class AudioService {
body: JSON.stringify({ body: JSON.stringify({
input: text, input: text,
voice: voice, voice: voice,
response_format: 'mp3', response_format: 'mp3', // Always use mp3 for streaming playback
download_format: document.getElementById('format-select').value || 'mp3', // Format for final download
stream: true, stream: true,
speed: speed, speed: speed,
return_download_link: true, return_download_link: true,