From 607df6e03bb196d0a1c036a4d77c033506d45c92 Mon Sep 17 00:00:00 2001 From: remsky Date: Tue, 31 Dec 2024 03:46:31 -0700 Subject: [PATCH] Update README and tests to clarify audio format support and enhance documentation --- README.md | 14 +++++++------- examples/test_openai_tts.py | 2 +- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/README.md b/README.md index a40da19..6022932 100644 --- a/README.md +++ b/README.md @@ -7,9 +7,10 @@ [![Tests](https://img.shields.io/badge/tests-33%20passed-darkgreen)]() [![Coverage](https://img.shields.io/badge/coverage-97%25-darkgreen)]() -FastAPI wrapper for [Kokoro-82M](https://huggingface.co/hexgrad/Kokoro-82M) text-to-speech model. - -OpenAI-compatible API with NVIDIA GPU support, with automatic chunking/stitching for long texts, and very fast generation time (~35-49x RTF) +FastAPI wrapper for [Kokoro-82M](https://huggingface.co/hexgrad/Kokoro-82M) text-to-speech model, providing an OpenAI-compatible endpoint with: +- NVIDIA GPU acceleration enabled +- automatic chunking/stitching for long texts +- very fast generation time (~35-49x RTF) ## Quick Start @@ -57,7 +58,7 @@ response = requests.post( "model": "kokoro", # Not used but required for compatibility "input": "Hello world!", "voice": "af_bella", - "response_format": "mp3", # Supported: mp3, wav, opus, flac, aac + "response_format": "mp3", # Supported: mp3, wav, opus, flac "speed": 1.0 } ) @@ -90,6 +91,7 @@ Benchmarking was performed on generation via the local API using text lengths up - NVIDIA 4060Ti 16gb GPU @ CUDA 12.1 - 11th Gen i7-11700 @ 2.5GHz - 64gb RAM +- WAV native output - H.G. Wells - The Time Machine (full text)

@@ -106,11 +108,9 @@ Key Performance Metrics: ## Features - OpenAI-compatible API endpoints -- Multiple audio formats: mp3, wav, opus, flac, aac +- Multiple audio formats: mp3, wav, opus, flac (aac & pcm not implemented) - Automatic text chunking and audio stitching - GPU-accelerated inference -- Queue handling via SQLite -- Progress tracking for long generations ## Model diff --git a/examples/test_openai_tts.py b/examples/test_openai_tts.py index 932aa11..7cc8104 100644 --- a/examples/test_openai_tts.py +++ b/examples/test_openai_tts.py @@ -60,7 +60,7 @@ def test_speed(speed: float): # Test different formats for format in ["wav", "mp3", "opus", "aac", "flac", "pcm"]: - test_format(format) + test_format(format) # aac and pcm should fail as they are not supported # Test different speeds for speed in [0.25, 1.0, 2.0, 4.0]: # 5.0 should fail as it's out of range