Merge branch 'streaming-word-timestamps' of https://github.com/fireblade2534/Kokoro-FastAPI into streaming-word-timestamps

This commit is contained in:
Fireblade 2025-02-14 14:36:20 -05:00
commit 842d056552
6 changed files with 31 additions and 18 deletions

View file

@ -12,15 +12,20 @@
[![Tested at Model Commit](https://img.shields.io/badge/last--tested--model--commit-1.0::9901c2b-blue)](https://huggingface.co/hexgrad/Kokoro-82M/commit/9901c2b79161b6e898b7ea857ae5298f47b8b0d6) [![Tested at Model Commit](https://img.shields.io/badge/last--tested--model--commit-1.0::9901c2b-blue)](https://huggingface.co/hexgrad/Kokoro-82M/commit/9901c2b79161b6e898b7ea857ae5298f47b8b0d6)
Dockerized FastAPI wrapper for [Kokoro-82M](https://huggingface.co/hexgrad/Kokoro-82M) text-to-speech model Dockerized FastAPI wrapper for [Kokoro-82M](https://huggingface.co/hexgrad/Kokoro-82M) text-to-speech model
- Multi-language support (English, Japanese, Korean, Chinese, Vietnamese) - Multi-language support (English, Japanese, Korean, Chinese, _Vietnamese soon_)
- OpenAI-compatible Speech endpoint, NVIDIA GPU accelerated or CPU inference with PyTorch - OpenAI-compatible Speech endpoint, NVIDIA GPU accelerated or CPU inference with PyTorch
- ONNX support coming soon, see v0.1.5 and earlier for legacy ONNX support in the interim - ONNX support coming soon, see v0.1.5 and earlier for legacy ONNX support in the interim
- Debug endpoints for monitoring system stats, integrated web UI on localhost:8880/web - Debug endpoints for monitoring system stats, integrated web UI on localhost:8880/web
- Phoneme-based audio generation, phoneme generation - Phoneme-based audio generation, phoneme generation
- (new) Per-word timestamped caption generation - Per-word timestamped caption generation
- (new) Voice mixing with weighted combinations - Voice mixing with weighted combinations
### Integration Guides
[![Helm Chart](https://img.shields.io/badge/Helm%20Chart-black?style=flat&logo=helm&logoColor=white)](https://github.com/remsky/Kokoro-FastAPI/wiki/Setup-Kubernetes) [![DigitalOcean](https://img.shields.io/badge/DigitalOcean-black?style=flat&logo=digitalocean&logoColor=white)](https://github.com/remsky/Kokoro-FastAPI/wiki/Integrations-DigitalOcean) [![SillyTavern](https://img.shields.io/badge/SillyTavern-black?style=flat&color=red)](https://github.com/remsky/Kokoro-FastAPI/wiki/Integrations-SillyTavern)
[![OpenWebUI](https://img.shields.io/badge/OpenWebUI-black?style=flat&color=white)](https://github.com/remsky/Kokoro-FastAPI/wiki/Integrations-OpenWebUi)
## Get Started ## Get Started
@ -36,8 +41,8 @@ Refer to the core/config.py file for a full list of variables which can be manag
# the `latest` tag can be used, but should not be considered stable as it may include `nightly` branch builds # the `latest` tag can be used, but should not be considered stable as it may include `nightly` branch builds
# it may have some bonus features however, and feedback/testing is welcome # it may have some bonus features however, and feedback/testing is welcome
docker run -p 8880:8880 ghcr.io/remsky/kokoro-fastapi-cpu:v0.2.1 # CPU, or: docker run -p 8880:8880 ghcr.io/remsky/kokoro-fastapi-cpu:v0.2.2 # CPU, or:
docker run --gpus all -p 8880:8880 ghcr.io/remsky/kokoro-fastapi-gpu:v0.2.1 #NVIDIA GPU docker run --gpus all -p 8880:8880 ghcr.io/remsky/kokoro-fastapi-gpu:v0.2.2 #NVIDIA GPU
``` ```
@ -121,6 +126,7 @@ with client.audio.speech.with_streaming_response.create(
</details> </details>
## Features ## Features
<details> <details>
<summary>OpenAI-Compatible Speech Endpoint</summary> <summary>OpenAI-Compatible Speech Endpoint</summary>
@ -351,7 +357,7 @@ cd docker/cpu
docker compose up --build docker compose up --build
``` ```
*Note: Overall speed may have reduced somewhat with the structural changes to accomodate streaming. Looking into it* *Note: Overall speed may have reduced somewhat with the structural changes to accommodate streaming. Looking into it*
</details> </details>
<details> <details>

View file

@ -149,8 +149,8 @@ async def stream_audio_chunks(
voice=voice_name, voice=voice_name,
speed=request.speed, speed=request.speed,
output_format=request.response_format, output_format=request.response_format,
lang_code=request.lang_code or request.voice[0], lang_code=request.lang_code or settings.default_voice_code or voice_name[0].lower(),
normalization_options=request.normalization_options, normalization_options=request.normalization_options,
return_timestamps=unique_properties["return_timestamps"], return_timestamps=unique_properties["return_timestamps"],
): ):
@ -212,7 +212,9 @@ async def create_speech(
if request.return_download_link: if request.return_download_link:
from ..services.temp_manager import TempFileWriter from ..services.temp_manager import TempFileWriter
temp_writer = TempFileWriter(request.response_format) # Use download_format if specified, otherwise use response_format
output_format = request.download_format or request.response_format
temp_writer = TempFileWriter(output_format)
await temp_writer.__aenter__() # Initialize temp file await temp_writer.__aenter__() # Initialize temp file
# Get download path immediately after temp file creation # Get download path immediately after temp file creation
@ -220,7 +222,7 @@ async def create_speech(
# Create response headers with download path # Create response headers with download path
headers = { headers = {
"Content-Disposition": f"attachment; filename=speech.{request.response_format}", "Content-Disposition": f"attachment; filename=speech.{output_format}",
"X-Accel-Buffering": "no", "X-Accel-Buffering": "no",
"Cache-Control": "no-cache", "Cache-Control": "no-cache",
"Transfer-Encoding": "chunked", "Transfer-Encoding": "chunked",

View file

@ -112,7 +112,7 @@ class StreamingAudioWriter:
parameters.extend( parameters.extend(
[ [
"-q:a", "-q:a",
"2", "0", # Highest quality
"-write_xing", "-write_xing",
"1", # XING header for MP3 "1", # XING header for MP3
"-id3v1", "-id3v1",
@ -142,7 +142,7 @@ class StreamingAudioWriter:
self.encoder.export( self.encoder.export(
output_buffer, output_buffer,
**format_args, **format_args,
bitrate="192k", bitrate="192k", # Optimal for 24kHz/16-bit mono source
parameters=parameters, parameters=parameters,
) )
self.encoder = None self.encoder = None
@ -189,10 +189,10 @@ class StreamingAudioWriter:
self.encoder.export( self.encoder.export(
output_buffer, output_buffer,
**format_args, **format_args,
bitrate="192k", bitrate="192k", # Optimal for 24kHz/16-bit mono source
parameters=[ parameters=[
"-q:a", "-q:a",
"2", "0", # Highest quality for chunks too
"-write_xing", "-write_xing",
"0", # No XING headers for chunks "0", # No XING headers for chunks
], ],

View file

@ -61,6 +61,10 @@ class OpenAISpeechRequest(BaseModel):
default="mp3", default="mp3",
description="The format to return audio in. Supported formats: mp3, opus, flac, wav, pcm. PCM format returns raw 16-bit samples without headers. AAC is not currently supported.", description="The format to return audio in. Supported formats: mp3, opus, flac, wav, pcm. PCM format returns raw 16-bit samples without headers. AAC is not currently supported.",
) )
download_format: Optional[Literal["mp3", "opus", "aac", "flac", "wav", "pcm"]] = Field(
default=None,
description="Optional different format for the final download. If not provided, uses response_format.",
)
speed: float = Field( speed: float = Field(
default=1.0, default=1.0,
ge=0.25, ge=0.25,

View file

@ -45,8 +45,8 @@ ingress:
host: host:
name: kokoro.example.com name: kokoro.example.com
endpoints: endpoints:
backend: - paths:
path: "/" - "/"
serviceName: "fastapi" serviceName: "fastapi"
servicePort: 8880 servicePort: 8880

View file

@ -39,7 +39,8 @@ export class AudioService {
body: JSON.stringify({ body: JSON.stringify({
input: text, input: text,
voice: voice, voice: voice,
response_format: 'mp3', response_format: 'mp3', // Always use mp3 for streaming playback
download_format: document.getElementById('format-select').value || 'mp3', // Format for final download
stream: true, stream: true,
speed: speed, speed: speed,
return_download_link: true, return_download_link: true,