diff --git a/.gitignore b/.gitignore
index cf4c28f..d16b2a0 100644
--- a/.gitignore
+++ b/.gitignore
@@ -6,7 +6,6 @@
 ui/data/*
 *.db
 *.pyc
 *.pth
-*.pt
 Kokoro-82M/*
 __pycache__/
diff --git a/api/src/routers/openai_compatible.py b/api/src/routers/openai_compatible.py
index 8c8a5d5..d86e00a 100644
--- a/api/src/routers/openai_compatible.py
+++ b/api/src/routers/openai_compatible.py
@@ -93,6 +93,7 @@ async def create_speech(
                 "Content-Disposition": f"attachment; filename=speech.{request.response_format}",
                 "X-Accel-Buffering": "no",  # Disable proxy buffering
                 "Cache-Control": "no-cache",  # Prevent caching
+                "Transfer-Encoding": "chunked",  # Enable chunked transfer encoding
             },
         )
     else:
diff --git a/api/src/services/audio.py b/api/src/services/audio.py
index c0aeed0..4c5a415 100644
--- a/api/src/services/audio.py
+++ b/api/src/services/audio.py
@@ -104,7 +104,7 @@ class AudioService:
             # Raw 16-bit PCM samples, no header
             buffer.write(normalized_audio.tobytes())
         elif output_format == "wav":
-            # Always use soundfile for WAV to ensure proper headers and normalization
+            # WAV format with headers
             sf.write(
                 buffer,
                 normalized_audio,
@@ -113,14 +113,14 @@
                 subtype="PCM_16",
             )
         elif output_format == "mp3":
-            # Use format settings or defaults
+            # MP3 format with proper framing
             settings = format_settings.get("mp3", {}) if format_settings else {}
             settings = {**AudioService.DEFAULT_SETTINGS["mp3"], **settings}
             sf.write(
                 buffer, normalized_audio, sample_rate, format="MP3", **settings
             )
         elif output_format == "opus":
+            # Opus format in OGG container
             settings = format_settings.get("opus", {}) if format_settings else {}
             settings = {**AudioService.DEFAULT_SETTINGS["opus"], **settings}
             sf.write(
@@ -131,8 +131,8 @@
                 subtype="OPUS",
                 **settings,
             )
         elif output_format == "flac":
+            # FLAC format with proper framing
             if is_first_chunk:
                 logger.info("Starting FLAC stream...")
             settings = format_settings.get("flac", {}) if format_settings else {}
@@ -145,15 +145,14 @@
                 subtype="PCM_16",
                 **settings,
             )
+        elif output_format == "aac":
+            raise ValueError(
+                "Format aac not currently supported. Supported formats are: wav, mp3, opus, flac, pcm."
+            )
         else:
-            if output_format == "aac":
-                raise ValueError(
-                    "Format aac not supported. Supported formats are: wav, mp3, opus, flac, pcm."
-                )
-            else:
-                raise ValueError(
-                    f"Format {output_format} not supported. Supported formats are: wav, mp3, opus, flac, pcm."
-                )
+            raise ValueError(
+                f"Format {output_format} not supported. Supported formats are: wav, mp3, opus, flac, pcm, aac."
+            )

         buffer.seek(0)
         return buffer.getvalue()
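For reference, a minimal usage sketch of the reworked format dispatch. The convert_audio call shape is taken from the tests below; the exact error text assumes the "Failed to convert audio to ..." wrapping that test_audio_service.py now matches:

    import numpy as np
    from api.src.services.audio import AudioService

    samples = np.zeros(24000, dtype=np.float32)  # one second of silence at 24 kHz
    wav_bytes = AudioService.convert_audio(samples, 24000, "wav")  # header + 16-bit PCM payload
    try:
        AudioService.convert_audio(samples, 24000, "aac")
    except ValueError as err:
        # e.g. "Failed to convert audio to aac: Format aac not currently supported. ..."
        print(err)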
diff --git a/api/src/services/tts_service.py b/api/src/services/tts_service.py
index bbd35b8..61471a8 100644
--- a/api/src/services/tts_service.py
+++ b/api/src/services/tts_service.py
@@ -177,7 +177,7 @@
                     )

                     if chunk_audio is not None:
-                        # Convert chunk with proper header handling
+                        # Convert chunk with proper streaming header handling
                         chunk_bytes = AudioService.convert_audio(
                             chunk_audio,
                             24000,
@@ -185,6 +185,7 @@
                             is_first_chunk=is_first,
                             normalizer=stream_normalizer,
                             is_last_chunk=(next_chunk is None),  # Last if no next chunk
+                            stream=True  # Ensure proper streaming format handling
                         )
                         yield chunk_bytes
diff --git a/api/tests/conftest.py b/api/tests/conftest.py
index 6fdd9ea..900e6ae 100644
--- a/api/tests/conftest.py
+++ b/api/tests/conftest.py
@@ -181,7 +181,7 @@ def mock_tts_service(monkeypatch):
     # Mock TTSModel.generate_from_tokens since we call it directly
     mock_generate = Mock(return_value=np.zeros(48000))
     monkeypatch.setattr(
-        "api.src.routers.text_processing.TTSModel.generate_from_tokens", mock_generate
+        "api.src.routers.development.TTSModel.generate_from_tokens", mock_generate
     )

     return mock_service
@@ -192,5 +192,5 @@ def mock_audio_service(monkeypatch):
     """Mock AudioService"""
     mock_service = Mock()
     mock_service.convert_audio.return_value = b"mock audio data"
-    monkeypatch.setattr("api.src.routers.text_processing.AudioService", mock_service)
+    monkeypatch.setattr("api.src.routers.development.AudioService", mock_service)
     return mock_service
diff --git a/api/tests/test_audio_service.py b/api/tests/test_audio_service.py
index 6a22921..758e4f4 100644
--- a/api/tests/test_audio_service.py
+++ b/api/tests/test_audio_service.py
@@ -63,7 +63,7 @@ def test_convert_to_aac_raises_error(sample_audio):
     audio_data, sample_rate = sample_audio
     with pytest.raises(
         ValueError,
-        match="Format aac not supported. Supported formats are: wav, mp3, opus, flac, pcm.",
+        match="Failed to convert audio to aac: Format aac not currently supported. Supported formats are: wav, mp3, opus, flac, pcm.",
     ):
         AudioService.convert_audio(audio_data, sample_rate, "aac")
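Note the updated match string: the test now expects the aac-specific message wrapped in a "Failed to convert audio to <format>:" prefix, which implies convert_audio re-raises dispatch errors roughly as follows (a sketch of the implied wrapper; _dispatch_format is a hypothetical stand-in for the branch chain shown above, and the actual wrapping code is not part of this diff):

    try:
        _dispatch_format(buffer, normalized_audio, sample_rate, output_format)  # hypothetical helper
    except Exception as e:
        raise ValueError(f"Failed to convert audio to {output_format}: {e}")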
diff --git a/api/tests/test_text_processing.py b/api/tests/test_text_processing.py
index e5d65df..aacb973 100644
--- a/api/tests/test_text_processing.py
+++ b/api/tests/test_text_processing.py
@@ -20,8 +20,8 @@ async def async_client():
 @pytest.mark.asyncio
 async def test_phonemize_endpoint(async_client):
     """Test phoneme generation endpoint"""
-    with patch("api.src.routers.text_processing.phonemize") as mock_phonemize, patch(
-        "api.src.routers.text_processing.tokenize"
+    with patch("api.src.routers.development.phonemize") as mock_phonemize, patch(
+        "api.src.routers.development.tokenize"
     ) as mock_tokenize:
         # Setup mocks
         mock_phonemize.return_value = "həlˈoʊ"
@@ -56,7 +56,7 @@ async def test_generate_from_phonemes(
 ):
     """Test audio generation from phonemes"""
     with patch(
-        "api.src.routers.text_processing.TTSService", return_value=mock_tts_service
+        "api.src.routers.development.TTSService", return_value=mock_tts_service
     ):
         response = await async_client.post(
             "/text/generate_from_phonemes",
@@ -76,7 +76,7 @@ async def test_generate_from_phonemes_invalid_voice(async_client, mock_tts_servi
     """Test audio generation with invalid voice"""
     mock_tts_service._get_voice_path.return_value = None
     with patch(
-        "api.src.routers.text_processing.TTSService", return_value=mock_tts_service
+        "api.src.routers.development.TTSService", return_value=mock_tts_service
     ):
         response = await async_client.post(
             "/text/generate_from_phonemes",
@@ -111,7 +111,7 @@ async def test_generate_from_phonemes_invalid_speed(async_client, monkeypatch):
 async def test_generate_from_phonemes_empty_phonemes(async_client, mock_tts_service):
     """Test audio generation with empty phonemes"""
     with patch(
-        "api.src.routers.text_processing.TTSService", return_value=mock_tts_service
+        "api.src.routers.development.TTSService", return_value=mock_tts_service
     ):
         response = await async_client.post(
             "/text/generate_from_phonemes",
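All of these target changes track the router module move from text_processing to development (the rename itself is not part of this diff). As usual with unittest.mock, the patch path names the module where the object is looked up, not where it is defined; a minimal illustration using the same target:

    from unittest.mock import Mock, patch

    # Replace TTSService as seen by the development router, not at its definition site
    with patch("api.src.routers.development.TTSService", return_value=Mock()) as mock_cls:
        ...  # requests handled inside this block use the mock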
diff --git a/docker-compose.cpu.yml b/docker-compose.cpu.yml
index 01bda87..a78417c 100644
--- a/docker-compose.cpu.yml
+++ b/docker-compose.cpu.yml
@@ -26,11 +26,11 @@ services:
       start_period: 1s

   kokoro-tts:
-    # image: ghcr.io/remsky/kokoro-fastapi:latest-cpu
-    # Uncomment below to build from source instead of using the released image
-    build:
-      context: .
-      dockerfile: Dockerfile.cpu
+    image: ghcr.io/remsky/kokoro-fastapi:latest-cpu
+    # Uncomment below (and comment out above) to build from source instead of using the released image
+    # build:
+    #   context: .
+    #   dockerfile: Dockerfile.cpu
     volumes:
       - ./api/src:/app/api/src
       - ./Kokoro-82M:/app/Kokoro-82M
@@ -52,8 +52,8 @@ services:
   # Gradio UI service [Comment out everything below if you don't need it]
   gradio-ui:
-    # image: ghcr.io/remsky/kokoro-fastapi:latest-ui
-    # Uncomment below to build from source instead of using the released image
+    image: ghcr.io/remsky/kokoro-fastapi:latest-ui
+    # Uncomment below (and comment out above) to build from source instead of using the released image
     build:
       context: ./ui
     ports:
       - "7860:7860"
@@ -63,3 +63,4 @@
       - ./ui/app.py:/app/app.py  # Mount app.py for hot reload
     environment:
       - GRADIO_WATCH=True  # Enable hot reloading
+      - PYTHONUNBUFFERED=1  # Ensure Python output is not buffered
diff --git a/docker-compose.yml b/docker-compose.yml
index bc7e81c..286cd68 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -32,10 +32,10 @@ services:
       start_period: 1s

   kokoro-tts:
-    # image: ghcr.io/remsky/kokoro-fastapi:latest
-    # Uncomment below to build from source instead of using the released image
-    build:
-      context: .
+    image: ghcr.io/remsky/kokoro-fastapi:latest
+    # Uncomment below (and comment out above) to build from source instead of using the released image
+    # build:
+    #   context: .
     volumes:
       - ./api/src:/app/api/src
       - ./Kokoro-82M:/app/Kokoro-82M
@@ -50,16 +50,22 @@
             - driver: nvidia
               count: 1
               capabilities: [gpu]
+    healthcheck:
+      test: ["CMD", "curl", "-f", "http://localhost:8880/v1/audio/voices"]
+      interval: 10s
+      timeout: 5s
+      retries: 30
+      start_period: 30s
     depends_on:
       model-fetcher:
         condition: service_healthy

   # Gradio UI service [Comment out everything below if you don't need it]
   gradio-ui:
-    # image: ghcr.io/remsky/kokoro-fastapi:latest-ui
-    # Uncomment below to build from source instead of using the released image
-    build:
-      context: ./ui
+    image: ghcr.io/remsky/kokoro-fastapi:latest-ui
+    # Uncomment below (and comment out above) to build from source instead of using the released image
+    # build:
+    #   context: ./ui
     ports:
       - "7860:7860"
     volumes:
@@ -67,3 +73,7 @@
       - ./ui/app.py:/app/app.py  # Mount app.py for hot reload
     environment:
       - GRADIO_WATCH=True  # Enable hot reloading
+      - PYTHONUNBUFFERED=1  # Ensure Python output is not buffered
+    depends_on:
+      kokoro-tts:
+        condition: service_healthy
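The new healthcheck is what allows gradio-ui's depends_on condition to wait for a responsive API rather than merely a started container. A rough Python equivalent of the container-side check, handy for verifying a local stack (assumes the requests package; URL and timing values copied from the healthcheck above):

    import time
    import requests

    for _ in range(30):  # retries: 30
        try:
            if requests.get("http://localhost:8880/v1/audio/voices", timeout=5).ok:  # timeout: 5s
                print("kokoro-tts is healthy")
                break
        except requests.RequestException:
            pass
        time.sleep(10)  # interval: 10s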
diff --git a/examples/openai_streaming_audio.py b/examples/openai_streaming_audio.py
index 9e80bbd..35ef58f 100644
--- a/examples/openai_streaming_audio.py
+++ b/examples/openai_streaming_audio.py
@@ -1,23 +1,25 @@
 #!/usr/bin/env rye run python
-# %%
 import time
 from pathlib import Path

 from openai import OpenAI

 # gets OPENAI_API_KEY from your environment variables
-openai = OpenAI(base_url="http://localhost:8000/v1", api_key="not-needed-for-local")
+openai = OpenAI(base_url="http://localhost:8880/v1", api_key="not-needed-for-local")

 speech_file_path = Path(__file__).parent / "speech.mp3"
-
-
-
-

 def main() -> None:
     stream_to_speakers()

+    # Create text-to-speech audio file
+    with openai.audio.speech.with_streaming_response.create(
+        model="kokoro",
+        voice="af_bella",
+        input="the quick brown fox jumped over the lazy dogs",
+    ) as response:
+        response.stream_to_file(speech_file_path)
+

 def stream_to_speakers() -> None:
@@ -31,9 +33,12 @@

     with openai.audio.speech.with_streaming_response.create(
         model="kokoro",
-        voice=VOICE,
-        response_format="mp3",  # similar to WAV, but without a header chunk at the start.
-        input="""My dear sir, that is just where you are wrong. That is just where the whole world has gone wrong. We are always getting away from the present moment. Our mental existences, which are immaterial and have no dimensions, are passing along the Time-Dimension""",
+        voice="af_bella",
+        response_format="pcm",  # similar to WAV, but without a header chunk at the start.
+        input="""I see skies of blue and clouds of white
+            The bright blessed days, the dark sacred nights
+            And I think to myself
+            What a wonderful world""",
     ) as response:
         print(f"Time to first byte: {int((time.time() - start_time) * 1000)}ms")
         for chunk in response.iter_bytes(chunk_size=1024):
@@ -44,5 +49,3 @@

 if __name__ == "__main__":
     main()
-
-# %%
diff --git a/examples/stream_tts_playback.py b/examples/stream_tts_playback.py
index b72a8ee..d231fe7 100644
--- a/examples/stream_tts_playback.py
+++ b/examples/stream_tts_playback.py
@@ -74,7 +74,7 @@ def play_streaming_tts(text: str, output_file: str = None, voice: str = "af"):
                 all_audio_data.extend(chunk)

-                # Log progress every 10 chunks
-                if chunk_count % 10 == 0:
+                # Log progress every 100 chunks
+                if chunk_count % 100 == 0:
                     elapsed = time.time() - start_time
                     print(
                         f"Progress: {chunk_count} chunks, {total_bytes/1024:.1f}KB received, {elapsed:.1f}s elapsed"
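Since the speaker example now requests pcm, the stream carries no header and the player must know the sample format out of band. A minimal playback sketch assuming 24 kHz, 16-bit mono output (the rate passed to convert_audio in tts_service.py above) and the third-party sounddevice package:

    import numpy as np
    import sounddevice as sd

    def play_pcm(chunks, sample_rate=24000):
        # Raw little-endian 16-bit samples, concatenated straight from the stream
        audio = np.frombuffer(b"".join(chunks), dtype=np.int16)
        sd.play(audio, samplerate=sample_rate)
        sd.wait()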