diff --git a/.coveragerc b/.coveragerc index c66579a..bea78e2 100644 --- a/.coveragerc +++ b/.coveragerc @@ -7,6 +7,7 @@ omit = MagicMock/* test_*.py examples/* + src/builds/* [report] exclude_lines = diff --git a/README.md b/README.md index 6f8efda..7695dd2 100644 --- a/README.md +++ b/README.md @@ -2,9 +2,9 @@ Kokoro TTS Banner

-# Kokoro TTS API +# _`FastKoko`_ [![Tests](https://img.shields.io/badge/tests-117%20passed-darkgreen)]() -[![Coverage](https://img.shields.io/badge/coverage-75%25-darkgreen)]() +[![Coverage](https://img.shields.io/badge/coverage-60%25-grey)]() [![Tested at Model Commit](https://img.shields.io/badge/last--tested--model--commit-a67f113-blue)](https://huggingface.co/hexgrad/Kokoro-82M/tree/c3b0d86e2a980e027ef71c28819ea02e351c2667) [![Try on Spaces](https://img.shields.io/badge/%F0%9F%A4%97%20Try%20on-Spaces-blue)](https://huggingface.co/spaces/Remsky/Kokoro-TTS-Zero) [![Buy Me A Coffee](https://img.shields.io/badge/BMC-✨☕-gray?style=flat-square)](https://www.buymeacoffee.com/remsky) Dockerized FastAPI wrapper for [Kokoro-82M](https://huggingface.co/hexgrad/Kokoro-82M) text-to-speech model @@ -35,8 +35,9 @@ The service can be accessed through either the API endpoints or the Gradio web i - Using Docker Compose (Full setup including UI): ```bash - docker compose up --build # for GPU - docker compose -f docker-compose.cpu.yml up --build # for CPU + cd docker/gpu # OR + # cd docker/cpu # Run this or the above + docker compose up --build ``` - OR running the API alone using Docker (model + voice packs baked in): ```bash @@ -350,6 +351,27 @@ See `examples/phoneme_examples/generate_phonemes.py` for a sample script. ## Known Issues +
+Versioning & Development + +I'm doing what I can to keep things stable, but the project is in an early phase of rapid build cycles. +If you run into trouble, you may have to roll back to an earlier version via the release tags, or build from source and/or troubleshoot and submit a PR. I will leave the branch up here for the last known stable point: + +`v0.0.5post1` + +Free and open source is a community effort, and I love working on this project, though there are only so many hours in a day. If you'd like to support the work, feel free to open a PR, buy me a coffee, or report any bugs, feature requests, etc. that you come across during use. + + + Buy Me A Coffee + + + +
+
Linux GPU Permissions diff --git a/docker/cpu/docker-compose.yml b/docker/cpu/docker-compose.yml index beb2a8f..08403a5 100644 --- a/docker/cpu/docker-compose.yml +++ b/docker/cpu/docker-compose.yml @@ -35,3 +35,4 @@ services: environment: - GRADIO_WATCH=True # Enable hot reloading - PYTHONUNBUFFERED=1 # Ensure Python output is not buffered + - DISABLE_LOCAL_SAVING=false # Set to 'true' to disable local saving and hide file view \ No newline at end of file diff --git a/docker/gpu/docker-compose.yml b/docker/gpu/docker-compose.yml index c89c0b2..5775abe 100644 --- a/docker/gpu/docker-compose.yml +++ b/docker/gpu/docker-compose.yml @@ -32,4 +32,4 @@ services: environment: - GRADIO_WATCH=1 # Enable hot reloading - PYTHONUNBUFFERED=1 # Ensure Python output is not buffered - - DISABLE_LOCAL_SAVING=true # Set to 'true' to disable local saving and hide file view + - DISABLE_LOCAL_SAVING=false # Set to 'true' to disable local saving and hide file view diff --git a/ui/lib/api.py b/ui/lib/api.py index 1528656..ca0d7e8 100644 --- a/ui/lib/api.py +++ b/ui/lib/api.py @@ -36,15 +36,18 @@ def check_api_status() -> Tuple[bool, List[str]]: def text_to_speech( - text: str, voice_id: str, format: str, speed: float + text: str, voice_id: str | list, format: str, speed: float ) -> Optional[str]: """Generate speech from text using TTS API.""" if not text.strip(): return None + # Handle multiple voices + voice_str = voice_id if isinstance(voice_id, str) else "+".join(voice_id) + # Create output filename timestamp = datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S") - output_filename = f"output_{timestamp}_voice-{voice_id}_speed-{speed}.{format}" + output_filename = f"output_{timestamp}_voice-{voice_str}_speed-{speed}.{format}" output_path = os.path.join(OUTPUTS_DIR, output_filename) try: @@ -53,7 +56,7 @@ def text_to_speech( json={ "model": "kokoro", "input": text, - "voice": voice_id, + "voice": voice_str, "response_format": format, "speed": float(speed), }, diff --git 
a/ui/lib/components/input.py b/ui/lib/components/input.py index 793a89e..a2c4d33 100644 --- a/ui/lib/components/input.py +++ b/ui/lib/components/input.py @@ -5,54 +5,78 @@ import gradio as gr from .. import files -def create_input_column() -> Tuple[gr.Column, dict]: +def create_input_column(disable_local_saving: bool = False) -> Tuple[gr.Column, dict]: """Create the input column with text input and file handling.""" with gr.Column(scale=1) as col: - with gr.Tabs() as tabs: - # Set first tab as selected by default - tabs.selected = 0 - # Direct Input Tab - with gr.TabItem("Direct Input"): - text_input = gr.Textbox( - label="Text to speak", placeholder="Enter text here...", lines=4 - ) - text_submit = gr.Button("Generate Speech", variant="primary", size="lg") + text_input = gr.Textbox( + label="Text to speak", placeholder="Enter text here...", lines=4 + ) + + # Always show file upload but handle differently based on disable_local_saving + file_upload = gr.File( + label="Upload Text File (.txt)", file_types=[".txt"] + ) + + if not disable_local_saving: + # Show full interface with tabs when saving is enabled + with gr.Tabs() as tabs: + # Set first tab as selected by default + tabs.selected = 0 + # Direct Input Tab + with gr.TabItem("Direct Input"): + text_submit_direct = gr.Button("Generate Speech", variant="primary", size="lg") - # File Input Tab - with gr.TabItem("From File"): - # Existing files dropdown - input_files_list = gr.Dropdown( - label="Select Existing File", - choices=files.list_input_files(), - value=None, - ) - - # Simple file upload - file_upload = gr.File( - label="Upload Text File (.txt)", file_types=[".txt"] - ) - - file_preview = gr.Textbox( - label="File Content Preview", interactive=False, lines=4 - ) - - with gr.Row(): - file_submit = gr.Button( - "Generate Speech", variant="primary", size="lg" - ) - clear_files = gr.Button( - "Clear Files", variant="secondary", size="lg" + # File Input Tab + with gr.TabItem("From File"): + # Existing files 
dropdown + input_files_list = gr.Dropdown( + label="Select Existing File", + choices=files.list_input_files(), + value=None, ) - components = { - "tabs": tabs, - "text_input": text_input, - "file_select": input_files_list, - "file_upload": file_upload, - "file_preview": file_preview, - "text_submit": text_submit, - "file_submit": file_submit, - "clear_files": clear_files, - } + file_preview = gr.Textbox( + label="File Content Preview", interactive=False, lines=4 + ) + + with gr.Row(): + file_submit = gr.Button( + "Generate Speech", variant="primary", size="lg" + ) + clear_files = gr.Button( + "Clear Files", variant="secondary", size="lg" + ) + else: + # Just show the generate button when saving is disabled + text_submit_direct = gr.Button("Generate Speech", variant="primary", size="lg") + tabs = None + input_files_list = None + file_preview = None + file_submit = None + clear_files = None + + # Initialize components based on disable_local_saving + if disable_local_saving: + components = { + "tabs": None, + "text_input": text_input, + "text_submit": text_submit_direct, + "file_select": None, + "file_upload": file_upload, # Keep file upload even when saving is disabled + "file_preview": None, + "file_submit": None, + "clear_files": None, + } + else: + components = { + "tabs": tabs, + "text_input": text_input, + "text_submit": text_submit_direct, + "file_select": input_files_list, + "file_upload": file_upload, + "file_preview": file_preview, + "file_submit": file_submit, + "clear_files": clear_files, + } return col, components diff --git a/ui/lib/components/model.py b/ui/lib/components/model.py index 444d0f8..a659d2c 100644 --- a/ui/lib/components/model.py +++ b/ui/lib/components/model.py @@ -20,9 +20,10 @@ def create_model_column(voice_ids: Optional[list] = None) -> Tuple[gr.Column, di voice_input = gr.Dropdown( choices=voice_ids, - label="Voice", + label="Voice(s)", value=voice_ids[0] if voice_ids else None, interactive=True, + multiselect=True, ) format_input = 
gr.Dropdown( choices=config.AUDIO_FORMATS, label="Audio Format", value="mp3" diff --git a/ui/lib/components/output.py b/ui/lib/components/output.py index 129f06e..083829e 100644 --- a/ui/lib/components/output.py +++ b/ui/lib/components/output.py @@ -9,7 +9,11 @@ def create_output_column(disable_local_saving: bool = False) -> Tuple[gr.Column, """Create the output column with audio player and file list.""" with gr.Column(scale=1) as col: gr.Markdown("### Latest Output") - audio_output = gr.Audio(label="Generated Speech", type="filepath") + audio_output = gr.Audio( + label="Generated Speech", + type="filepath", + waveform_options={"waveform_color": "#4C87AB"} + ) # Create file-related components with visible=False when local saving is disabled gr.Markdown("### Generated Files", visible=not disable_local_saving) @@ -17,7 +21,7 @@ def create_output_column(disable_local_saving: bool = False) -> Tuple[gr.Column, label="Previous Outputs", choices=files.list_output_files() if not disable_local_saving else [], value=None, - allow_custom_value=False, + allow_custom_value=True, visible=not disable_local_saving, ) diff --git a/ui/lib/files.py b/ui/lib/files.py index 867f4f4..1391e0a 100644 --- a/ui/lib/files.py +++ b/ui/lib/files.py @@ -11,12 +11,14 @@ def list_input_files() -> List[str]: def list_output_files() -> List[str]: - """List all output audio files.""" - return [ + """List all output audio files, sorted by most recent first.""" + files = [ os.path.join(OUTPUTS_DIR, f) for f in os.listdir(OUTPUTS_DIR) if any(f.endswith(ext) for ext in AUDIO_FORMATS) ] + # Sort files by modification time, most recent first + return sorted(files, key=os.path.getmtime, reverse=True) def read_text_file(filename: str) -> str: diff --git a/ui/lib/handlers.py b/ui/lib/handlers.py index eba6cda..71b8d9b 100644 --- a/ui/lib/handlers.py +++ b/ui/lib/handlers.py @@ -6,7 +6,7 @@ import gradio as gr from . 
import api, files -def setup_event_handlers(components: dict): +def setup_event_handlers(components: dict, disable_local_saving: bool = False): """Set up all event handlers for the UI components.""" def refresh_status(): @@ -58,27 +58,37 @@ def setup_event_handlers(components: dict): def handle_file_upload(file): if file is None: - return gr.update(choices=files.list_input_files()) + return "" if disable_local_saving else [gr.update(choices=files.list_input_files())] try: - # Copy file to inputs directory - filename = os.path.basename(file.name) - target_path = os.path.join(files.INPUTS_DIR, filename) + # Read the file content + with open(file.name, 'r', encoding='utf-8') as f: + text_content = f.read() - # Handle duplicate filenames - base, ext = os.path.splitext(filename) - counter = 1 - while os.path.exists(target_path): - new_name = f"{base}_{counter}{ext}" - target_path = os.path.join(files.INPUTS_DIR, new_name) - counter += 1 + if disable_local_saving: + # When saving is disabled, put content directly in text input + # Normalize whitespace by replacing newlines with spaces + normalized_text = ' '.join(text_content.split()) + return normalized_text + else: + # When saving is enabled, save file and update dropdown + filename = os.path.basename(file.name) + target_path = os.path.join(files.INPUTS_DIR, filename) - shutil.copy2(file.name, target_path) + # Handle duplicate filenames + base, ext = os.path.splitext(filename) + counter = 1 + while os.path.exists(target_path): + new_name = f"{base}_{counter}{ext}" + target_path = os.path.join(files.INPUTS_DIR, new_name) + counter += 1 + + shutil.copy2(file.name, target_path) + return [gr.update(choices=files.list_input_files())] except Exception as e: - print(f"Error uploading file: {e}") - - return gr.update(choices=files.list_input_files()) + print(f"Error handling file: {e}") + return "" if disable_local_saving else [gr.update(choices=files.list_input_files())] def generate_from_text(text, voice, format, speed): 
"""Generate speech from direct text input""" @@ -91,7 +101,10 @@ def setup_event_handlers(components: dict): gr.Warning("Please enter text in the input box") return [None, gr.update(choices=files.list_output_files())] - files.save_text(text) + # Only save text if local saving is enabled + if not disable_local_saving: + files.save_text(text) + result = api.text_to_speech(text, voice, format, speed) if result is None: gr.Warning("Failed to generate speech. Please try again.") @@ -162,45 +175,7 @@ def setup_event_handlers(components: dict): outputs=[components["model"]["status_btn"], components["model"]["voice"]], ) - components["input"]["file_select"].change( - fn=handle_file_select, - inputs=[components["input"]["file_select"]], - outputs=[components["input"]["file_preview"]], - ) - - components["input"]["file_upload"].upload( - fn=handle_file_upload, - inputs=[components["input"]["file_upload"]], - outputs=[components["input"]["file_select"]], - ) - - components["output"]["play_btn"].click( - fn=play_selected, - inputs=[components["output"]["output_files"]], - outputs=[components["output"]["selected_audio"]], - ) - - # Connect clear files button - components["input"]["clear_files"].click( - fn=clear_files, - inputs=[ - components["model"]["voice"], - components["model"]["format"], - components["model"]["speed"], - ], - outputs=[ - components["input"]["file_select"], - components["input"]["file_upload"], - components["input"]["file_preview"], - components["output"]["audio_output"], - components["output"]["output_files"], - components["model"]["voice"], - components["model"]["format"], - components["model"]["speed"], - ], - ) - - # Connect submit buttons for each tab + # Connect text submit button (always present) components["input"]["text_submit"].click( fn=generate_from_text, inputs=[ @@ -215,26 +190,70 @@ def setup_event_handlers(components: dict): ], ) - # Connect clear outputs button - components["output"]["clear_outputs"].click( - fn=clear_outputs, - outputs=[ 
- components["output"]["audio_output"], - components["output"]["output_files"], - components["output"]["selected_audio"], - ], - ) + # Only connect file-related handlers if components exist + if components["input"]["file_select"] is not None: + components["input"]["file_select"].change( + fn=handle_file_select, + inputs=[components["input"]["file_select"]], + outputs=[components["input"]["file_preview"]], + ) - components["input"]["file_submit"].click( - fn=generate_from_file, - inputs=[ - components["input"]["file_select"], - components["model"]["voice"], - components["model"]["format"], - components["model"]["speed"], - ], - outputs=[ - components["output"]["audio_output"], - components["output"]["output_files"], - ], - ) + if components["input"]["file_upload"] is not None: + # File upload handler - output depends on disable_local_saving + components["input"]["file_upload"].upload( + fn=handle_file_upload, + inputs=[components["input"]["file_upload"]], + outputs=[components["input"]["text_input"] if disable_local_saving else components["input"]["file_select"]], + ) + + if components["output"]["play_btn"] is not None: + components["output"]["play_btn"].click( + fn=play_selected, + inputs=[components["output"]["output_files"]], + outputs=[components["output"]["selected_audio"]], + ) + + if components["input"]["clear_files"] is not None: + components["input"]["clear_files"].click( + fn=clear_files, + inputs=[ + components["model"]["voice"], + components["model"]["format"], + components["model"]["speed"], + ], + outputs=[ + components["input"]["file_select"], + components["input"]["file_upload"], + components["input"]["file_preview"], + components["output"]["audio_output"], + components["output"]["output_files"], + components["model"]["voice"], + components["model"]["format"], + components["model"]["speed"], + ], + ) + + if components["output"]["clear_outputs"] is not None: + components["output"]["clear_outputs"].click( + fn=clear_outputs, + outputs=[ + 
components["output"]["audio_output"], + components["output"]["output_files"], + components["output"]["selected_audio"], + ], + ) + + if components["input"]["file_submit"] is not None: + components["input"]["file_submit"].click( + fn=generate_from_file, + inputs=[ + components["input"]["file_select"], + components["model"]["voice"], + components["model"]["format"], + components["model"]["speed"], + ], + outputs=[ + components["output"]["audio_output"], + components["output"]["output_files"], + ], + ) diff --git a/ui/lib/interface.py b/ui/lib/interface.py index 02d3083..1ae344c 100644 --- a/ui/lib/interface.py +++ b/ui/lib/interface.py @@ -26,7 +26,7 @@ def create_interface(): # Main interface with gr.Row(): # Create columns - input_col, input_components = create_input_column() + input_col, input_components = create_input_column(disable_local_saving) model_col, model_components = create_model_column( available_voices ) # Pass initial voices @@ -40,7 +40,7 @@ def create_interface(): } # Set up event handlers - setup_event_handlers(components) + setup_event_handlers(components, disable_local_saving) # Add periodic status check with Timer def update_status(): diff --git a/ui/tests/test_api.py b/ui/tests/test_api.py index fe5dbe7..d682326 100644 --- a/ui/tests/test_api.py +++ b/ui/tests/test_api.py @@ -106,24 +106,54 @@ def test_get_status_html_unavailable(): def test_text_to_speech_api_params(mock_response, tmp_path): """Test correct API parameters are sent""" - with patch("requests.post") as mock_post, patch( - "ui.lib.api.OUTPUTS_DIR", str(tmp_path) - ), patch("builtins.open", mock_open()): - mock_post.return_value = mock_response({}) - api.text_to_speech("test text", "voice1", "mp3", 1.5) + test_cases = [ + # Single voice as string + ("voice1", "voice1"), + # Multiple voices as list + (["voice1", "voice2"], "voice1+voice2"), + # Single voice as list + (["voice1"], "voice1"), + ] - mock_post.assert_called_once() - args, kwargs = mock_post.call_args + for input_voice, 
expected_voice in test_cases: + with patch("requests.post") as mock_post, patch( + "ui.lib.api.OUTPUTS_DIR", str(tmp_path) + ), patch("builtins.open", mock_open()): + mock_post.return_value = mock_response({}) + api.text_to_speech("test text", input_voice, "mp3", 1.5) - # Check request body - assert kwargs["json"] == { - "model": "kokoro", - "input": "test text", - "voice": "voice1", - "response_format": "mp3", - "speed": 1.5, - } + mock_post.assert_called_once() + args, kwargs = mock_post.call_args - # Check headers and timeout - assert kwargs["headers"] == {"Content-Type": "application/json"} - assert kwargs["timeout"] == 300 + # Check request body + assert kwargs["json"] == { + "model": "kokoro", + "input": "test text", + "voice": expected_voice, + "response_format": "mp3", + "speed": 1.5, + } + + # Check headers and timeout + assert kwargs["headers"] == {"Content-Type": "application/json"} + assert kwargs["timeout"] == 300 + + +def test_text_to_speech_output_filename(mock_response, tmp_path): + """Test output filename contains correct voice identifier""" + test_cases = [ + # Single voice + ("voice1", lambda f: "voice-voice1" in f), + # Multiple voices + (["voice1", "voice2"], lambda f: "voice-voice1+voice2" in f), + ] + + for input_voice, filename_check in test_cases: + with patch("requests.post", return_value=mock_response({})), patch( + "ui.lib.api.OUTPUTS_DIR", str(tmp_path) + ), patch("builtins.open", mock_open()) as mock_file: + result = api.text_to_speech("test text", input_voice, "mp3", 1.0) + + assert result is not None + assert filename_check(result), f"Expected voice pattern not found in filename: {result}" + mock_file.assert_called_once() diff --git a/ui/tests/test_components.py b/ui/tests/test_components.py index d9576c0..9e2b796 100644 --- a/ui/tests/test_components.py +++ b/ui/tests/test_components.py @@ -36,8 +36,10 @@ def test_model_column_default_values(): expected_choices = [(voice_id, voice_id) for voice_id in voice_ids] assert 
components["voice"].choices == expected_choices # Value is not converted to tuple format for the value property - assert components["voice"].value == voice_ids[0] + assert components["voice"].value == [voice_ids[0]] assert components["voice"].interactive is True + assert components["voice"].multiselect is True + assert components["voice"].label == "Voice(s)" # Test format dropdown # Gradio Dropdown converts choices to (value, label) tuples diff --git a/ui/tests/test_interface.py b/ui/tests/test_interface.py index cff4825..15c60ba 100644 --- a/ui/tests/test_interface.py +++ b/ui/tests/test_interface.py @@ -136,7 +136,7 @@ def test_interface_components_presence(): required_components = { "Text to speak", - "Voice", + "Voice(s)", "Audio Format", "Speed", "Generated Speech",