diff --git a/README.md b/README.md index e7106ec..48aaae6 100644 --- a/README.md +++ b/README.md @@ -12,15 +12,20 @@ [![Tested at Model Commit](https://img.shields.io/badge/last--tested--model--commit-1.0::9901c2b-blue)](https://huggingface.co/hexgrad/Kokoro-82M/commit/9901c2b79161b6e898b7ea857ae5298f47b8b0d6) - Dockerized FastAPI wrapper for [Kokoro-82M](https://huggingface.co/hexgrad/Kokoro-82M) text-to-speech model -- Multi-language support (English, Japanese, Korean, Chinese, Vietnamese) +- Multi-language support (English, Japanese, Korean, Chinese, _Vietnamese soon_) - OpenAI-compatible Speech endpoint, NVIDIA GPU accelerated or CPU inference with PyTorch - ONNX support coming soon, see v0.1.5 and earlier for legacy ONNX support in the interim - Debug endpoints for monitoring system stats, integrated web UI on localhost:8880/web - Phoneme-based audio generation, phoneme generation -- (new) Per-word timestamped caption generation -- (new) Voice mixing with weighted combinations +- Per-word timestamped caption generation +- Voice mixing with weighted combinations + +### Integration Guides + [![Helm Chart](https://img.shields.io/badge/Helm%20Chart-black?style=flat&logo=helm&logoColor=white)](https://github.com/remsky/Kokoro-FastAPI/wiki/Setup-Kubernetes) [![DigitalOcean](https://img.shields.io/badge/DigitalOcean-black?style=flat&logo=digitalocean&logoColor=white)](https://github.com/remsky/Kokoro-FastAPI/wiki/Integrations-DigitalOcean) [![SillyTavern](https://img.shields.io/badge/SillyTavern-black?style=flat&color=red)](https://github.com/remsky/Kokoro-FastAPI/wiki/Integrations-SillyTavern) +[![OpenWebUI](https://img.shields.io/badge/OpenWebUI-black?style=flat&color=white)](https://github.com/remsky/Kokoro-FastAPI/wiki/Integrations-OpenWebUi) + + ## Get Started @@ -36,8 +41,8 @@ Refer to the core/config.py file for a full list of variables which can be manag # the `latest` tag can be used, but should not be considered stable as it may include `nightly` branch 
builds # it may have some bonus features however, and feedback/testing is welcome -docker run -p 8880:8880 ghcr.io/remsky/kokoro-fastapi-cpu:v0.2.1 # CPU, or: -docker run --gpus all -p 8880:8880 ghcr.io/remsky/kokoro-fastapi-gpu:v0.2.1 #NVIDIA GPU +docker run -p 8880:8880 ghcr.io/remsky/kokoro-fastapi-cpu:v0.2.2 # CPU, or: +docker run --gpus all -p 8880:8880 ghcr.io/remsky/kokoro-fastapi-gpu:v0.2.2 #NVIDIA GPU ``` @@ -121,6 +126,7 @@ with client.audio.speech.with_streaming_response.create( + ## Features
OpenAI-Compatible Speech Endpoint @@ -351,7 +357,7 @@ cd docker/cpu docker compose up --build ``` -*Note: Overall speed may have reduced somewhat with the structural changes to accomodate streaming. Looking into it* +*Note: Overall speed may have reduced somewhat with the structural changes to accommodate streaming. Looking into it*
diff --git a/api/src/routers/openai_compatible.py b/api/src/routers/openai_compatible.py index 94bde54..146876d 100644 --- a/api/src/routers/openai_compatible.py +++ b/api/src/routers/openai_compatible.py @@ -149,8 +149,8 @@ async def stream_audio_chunks( voice=voice_name, speed=request.speed, output_format=request.response_format, - lang_code=request.lang_code or request.voice[0], - normalization_options=request.normalization_options, + lang_code=request.lang_code or settings.default_voice_code or voice_name[0].lower(), + normalization_options=request.normalization_options, return_timestamps=unique_properties["return_timestamps"], ): @@ -212,7 +212,9 @@ async def create_speech( if request.return_download_link: from ..services.temp_manager import TempFileWriter - temp_writer = TempFileWriter(request.response_format) + # Use download_format if specified, otherwise use response_format + output_format = request.download_format or request.response_format + temp_writer = TempFileWriter(output_format) await temp_writer.__aenter__() # Initialize temp file # Get download path immediately after temp file creation @@ -220,7 +222,7 @@ async def create_speech( # Create response headers with download path headers = { - "Content-Disposition": f"attachment; filename=speech.{request.response_format}", + "Content-Disposition": f"attachment; filename=speech.{output_format}", "X-Accel-Buffering": "no", "Cache-Control": "no-cache", "Transfer-Encoding": "chunked", diff --git a/api/src/services/streaming_audio_writer.py b/api/src/services/streaming_audio_writer.py index 6c31e47..1a45eec 100644 --- a/api/src/services/streaming_audio_writer.py +++ b/api/src/services/streaming_audio_writer.py @@ -112,7 +112,7 @@ class StreamingAudioWriter: parameters.extend( [ "-q:a", - "2", + "0", # Highest quality "-write_xing", "1", # XING header for MP3 "-id3v1", @@ -142,7 +142,7 @@ class StreamingAudioWriter: self.encoder.export( output_buffer, **format_args, - bitrate="192k", + bitrate="192k", # 
Optimal for 24kHz/16-bit mono source parameters=parameters, ) self.encoder = None @@ -189,10 +189,10 @@ class StreamingAudioWriter: self.encoder.export( output_buffer, **format_args, - bitrate="192k", + bitrate="192k", # Optimal for 24kHz/16-bit mono source parameters=[ "-q:a", - "2", + "0", # Highest quality for chunks too "-write_xing", "0", # No XING headers for chunks ], diff --git a/api/src/structures/schemas.py b/api/src/structures/schemas.py index fb323d5..2608597 100644 --- a/api/src/structures/schemas.py +++ b/api/src/structures/schemas.py @@ -61,6 +61,10 @@ class OpenAISpeechRequest(BaseModel): default="mp3", description="The format to return audio in. Supported formats: mp3, opus, flac, wav, pcm. PCM format returns raw 16-bit samples without headers. AAC is not currently supported.", ) + download_format: Optional[Literal["mp3", "opus", "aac", "flac", "wav", "pcm"]] = Field( + default=None, + description="Optional different format for the final download. If not provided, uses response_format.", + ) speed: float = Field( default=1.0, ge=0.25, diff --git a/charts/kokoro-fastapi/values.yaml b/charts/kokoro-fastapi/values.yaml index 05419d9..0db2f95 100644 --- a/charts/kokoro-fastapi/values.yaml +++ b/charts/kokoro-fastapi/values.yaml @@ -45,8 +45,8 @@ ingress: host: name: kokoro.example.com endpoints: - backend: - path: "/" + - paths: + - "/" serviceName: "fastapi" servicePort: 8880 diff --git a/web/src/services/AudioService.js b/web/src/services/AudioService.js index 44d0727..cee33d4 100644 --- a/web/src/services/AudioService.js +++ b/web/src/services/AudioService.js @@ -39,7 +39,8 @@ export class AudioService { body: JSON.stringify({ input: text, voice: voice, - response_format: 'mp3', + response_format: 'mp3', // Always use mp3 for streaming playback + download_format: document.getElementById('format-select').value || 'mp3', // Format for final download stream: true, speed: speed, return_download_link: true,