ci: enhance local saving feature, update voice selection to support multiple voices, and improve output filename generation

remsky 2025-01-14 03:47:25 -07:00
parent 9edc7fd7fc
commit 5cc3bacac1
14 changed files with 266 additions and 157 deletions

View file

@@ -7,6 +7,7 @@ omit =
MagicMock/*
test_*.py
examples/*
src/builds/*
[report]
exclude_lines =

View file

@@ -2,9 +2,9 @@
<img src="githubbanner.png" alt="Kokoro TTS Banner">
</p>
# Kokoro TTS API
# <sub><sub>_`FastKoko`_ </sub></sub>
[![Tests](https://img.shields.io/badge/tests-117%20passed-darkgreen)]()
[![Coverage](https://img.shields.io/badge/coverage-75%25-darkgreen)]()
[![Coverage](https://img.shields.io/badge/coverage-60%25-grey)]()
[![Tested at Model Commit](https://img.shields.io/badge/last--tested--model--commit-a67f113-blue)](https://huggingface.co/hexgrad/Kokoro-82M/tree/c3b0d86e2a980e027ef71c28819ea02e351c2667) [![Try on Spaces](https://img.shields.io/badge/%F0%9F%A4%97%20Try%20on-Spaces-blue)](https://huggingface.co/spaces/Remsky/Kokoro-TTS-Zero) [![Buy Me A Coffee](https://img.shields.io/badge/BMC-✨☕-gray?style=flat-square)](https://www.buymeacoffee.com/remsky)
Dockerized FastAPI wrapper for [Kokoro-82M](https://huggingface.co/hexgrad/Kokoro-82M) text-to-speech model
@@ -35,8 +35,9 @@ The service can be accessed through either the API endpoints or the Gradio web i
- Using Docker Compose (Full setup including UI):
```bash
docker compose up --build # for GPU
docker compose -f docker-compose.cpu.yml up --build # for CPU
cd docker/gpu # OR
# cd docker/cpu # Run this or the above
docker compose up --build
```
- OR running the API alone using Docker (model + voice packs baked in):
```bash
@@ -350,6 +351,27 @@ See `examples/phoneme_examples/generate_phonemes.py` for a sample script.
## Known Issues
<details>
<summary>Versioning & Development</summary>
I'm doing what I can to keep things stable, but this project is still early and moving through rapid build cycles.
If you run into trouble, you may need to roll back to an earlier release tag, or build from source, troubleshoot, and submit a PR. The last known stable point is tagged here:
`v0.0.5post1`
Free and open source software is a community effort, and I love working on this project, but there are only so many hours in a day. If you'd like to support the work, feel free to open a PR, buy me a coffee, or report any bugs or feature requests you find along the way.
<a href="https://www.buymeacoffee.com/remsky" target="_blank">
<img
src="https://cdn.buymeacoffee.com/buttons/v2/default-violet.png"
alt="Buy Me A Coffee"
style="height: 30px !important;width: 110px !important;"
>
</a>
</details>
<details>
<summary>Linux GPU Permissions</summary>

View file

@@ -35,3 +35,4 @@ services:
environment:
- GRADIO_WATCH=True # Enable hot reloading
- PYTHONUNBUFFERED=1 # Ensure Python output is not buffered
- DISABLE_LOCAL_SAVING=false # Set to 'true' to disable local saving and hide file view

View file

@@ -32,4 +32,4 @@ services:
environment:
- GRADIO_WATCH=1 # Enable hot reloading
- PYTHONUNBUFFERED=1 # Ensure Python output is not buffered
- DISABLE_LOCAL_SAVING=true # Set to 'true' to disable local saving and hide file view
- DISABLE_LOCAL_SAVING=false # Set to 'true' to disable local saving and hide file view
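For reference, a minimal sketch of how the UI side can read this flag (the exact parsing in the app code may differ, and the helper name here is hypothetical):

```python
import os

def local_saving_disabled() -> bool:
    # Hypothetical helper: any case variant of "true" disables local saving;
    # anything else (including an unset variable) keeps saving enabled.
    return os.getenv("DISABLE_LOCAL_SAVING", "false").strip().lower() == "true"
```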

View file

@@ -36,15 +36,18 @@ def check_api_status() -> Tuple[bool, List[str]]:
def text_to_speech(
text: str, voice_id: str, format: str, speed: float
text: str, voice_id: str | list, format: str, speed: float
) -> Optional[str]:
"""Generate speech from text using TTS API."""
if not text.strip():
return None
# Handle multiple voices
voice_str = voice_id if isinstance(voice_id, str) else "+".join(voice_id)
# Create output filename
timestamp = datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
output_filename = f"output_{timestamp}_voice-{voice_id}_speed-{speed}.{format}"
output_filename = f"output_{timestamp}_voice-{voice_str}_speed-{speed}.{format}"
output_path = os.path.join(OUTPUTS_DIR, output_filename)
try:
@@ -53,7 +56,7 @@ def text_to_speech(
json={
"model": "kokoro",
"input": text,
"voice": voice_id,
"voice": voice_str,
"response_format": format,
"speed": float(speed),
},
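To illustrate the new multi-voice handling, here is a minimal sketch of how a voice selection is normalized and how it ends up in the output filename (the voice names and timestamp below are illustrative, not taken from the repo):

```python
import datetime

def build_output_filename(voice_id, speed, fmt="mp3"):
    # Mirrors the normalization above: a list of voices is joined with "+",
    # while a plain string passes through unchanged.
    voice_str = voice_id if isinstance(voice_id, str) else "+".join(voice_id)
    timestamp = datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
    return f"output_{timestamp}_voice-{voice_str}_speed-{speed}.{fmt}"

# build_output_filename(["af_bella", "af_sky"], 1.0) yields something like
# "output_2025-01-14_03-47-25_voice-af_bella+af_sky_speed-1.0.mp3"
```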

View file

@@ -5,54 +5,78 @@ import gradio as gr
from .. import files
def create_input_column() -> Tuple[gr.Column, dict]:
def create_input_column(disable_local_saving: bool = False) -> Tuple[gr.Column, dict]:
"""Create the input column with text input and file handling."""
with gr.Column(scale=1) as col:
with gr.Tabs() as tabs:
# Set first tab as selected by default
tabs.selected = 0
# Direct Input Tab
with gr.TabItem("Direct Input"):
text_input = gr.Textbox(
label="Text to speak", placeholder="Enter text here...", lines=4
)
text_submit = gr.Button("Generate Speech", variant="primary", size="lg")
text_input = gr.Textbox(
label="Text to speak", placeholder="Enter text here...", lines=4
)
# Always show file upload but handle differently based on disable_local_saving
file_upload = gr.File(
label="Upload Text File (.txt)", file_types=[".txt"]
)
if not disable_local_saving:
# Show full interface with tabs when saving is enabled
with gr.Tabs() as tabs:
# Set first tab as selected by default
tabs.selected = 0
# Direct Input Tab
with gr.TabItem("Direct Input"):
text_submit_direct = gr.Button("Generate Speech", variant="primary", size="lg")
# File Input Tab
with gr.TabItem("From File"):
# Existing files dropdown
input_files_list = gr.Dropdown(
label="Select Existing File",
choices=files.list_input_files(),
value=None,
)
# Simple file upload
file_upload = gr.File(
label="Upload Text File (.txt)", file_types=[".txt"]
)
file_preview = gr.Textbox(
label="File Content Preview", interactive=False, lines=4
)
with gr.Row():
file_submit = gr.Button(
"Generate Speech", variant="primary", size="lg"
)
clear_files = gr.Button(
"Clear Files", variant="secondary", size="lg"
# File Input Tab
with gr.TabItem("From File"):
# Existing files dropdown
input_files_list = gr.Dropdown(
label="Select Existing File",
choices=files.list_input_files(),
value=None,
)
components = {
"tabs": tabs,
"text_input": text_input,
"file_select": input_files_list,
"file_upload": file_upload,
"file_preview": file_preview,
"text_submit": text_submit,
"file_submit": file_submit,
"clear_files": clear_files,
}
file_preview = gr.Textbox(
label="File Content Preview", interactive=False, lines=4
)
with gr.Row():
file_submit = gr.Button(
"Generate Speech", variant="primary", size="lg"
)
clear_files = gr.Button(
"Clear Files", variant="secondary", size="lg"
)
else:
# Just show the generate button when saving is disabled
text_submit_direct = gr.Button("Generate Speech", variant="primary", size="lg")
tabs = None
input_files_list = None
file_preview = None
file_submit = None
clear_files = None
# Initialize components based on disable_local_saving
if disable_local_saving:
components = {
"tabs": None,
"text_input": text_input,
"text_submit": text_submit_direct,
"file_select": None,
"file_upload": file_upload, # Keep file upload even when saving is disabled
"file_preview": None,
"file_submit": None,
"clear_files": None,
}
else:
components = {
"tabs": tabs,
"text_input": text_input,
"text_submit": text_submit_direct,
"file_select": input_files_list,
"file_upload": file_upload,
"file_preview": file_preview,
"file_submit": file_submit,
"clear_files": clear_files,
}
return col, components
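A rough usage sketch of the two shapes this function can return; the import path is an assumption for illustration, not taken from the diff:

```python
import gradio as gr
from ui.lib.components.input import create_input_column  # assumed module path

with gr.Blocks():
    col, components = create_input_column(disable_local_saving=True)

# With saving disabled, the file-management components come back as None,
# so downstream wiring has to check for None before connecting handlers.
assert components["file_select"] is None and components["file_submit"] is None
assert components["file_upload"] is not None  # the upload widget stays available
```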

View file

@@ -20,9 +20,10 @@ def create_model_column(voice_ids: Optional[list] = None) -> Tuple[gr.Column, di
voice_input = gr.Dropdown(
choices=voice_ids,
label="Voice",
label="Voice(s)",
value=voice_ids[0] if voice_ids else None,
interactive=True,
multiselect=True,
)
format_input = gr.Dropdown(
choices=config.AUDIO_FORMATS, label="Audio Format", value="mp3"
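With `multiselect=True`, Gradio represents the dropdown value as a list of the selected choices (even when only one voice is picked), which is what the "+"-join path in the API layer expects. A minimal sketch of the same dropdown in isolation, with assumed voice IDs:

```python
import gradio as gr

with gr.Blocks():
    voice_input = gr.Dropdown(
        choices=["af_bella", "af_sky"],  # assumed voice IDs for the example
        label="Voice(s)",
        value=["af_bella"],              # multiselect values are lists
        interactive=True,
        multiselect=True,
    )
```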

View file

@@ -9,7 +9,11 @@ def create_output_column(disable_local_saving: bool = False) -> Tuple[gr.Column,
"""Create the output column with audio player and file list."""
with gr.Column(scale=1) as col:
gr.Markdown("### Latest Output")
audio_output = gr.Audio(label="Generated Speech", type="filepath")
audio_output = gr.Audio(
label="Generated Speech",
type="filepath",
waveform_options={"waveform_color": "#4C87AB"}
)
# Create file-related components with visible=False when local saving is disabled
gr.Markdown("### Generated Files", visible=not disable_local_saving)
@@ -17,7 +21,7 @@ def create_output_column(disable_local_saving: bool = False) -> Tuple[gr.Column,
label="Previous Outputs",
choices=files.list_output_files() if not disable_local_saving else [],
value=None,
allow_custom_value=False,
allow_custom_value=True,
visible=not disable_local_saving,
)

View file

@@ -11,12 +11,14 @@ def list_input_files() -> List[str]:
def list_output_files() -> List[str]:
"""List all output audio files."""
return [
"""List all output audio files, sorted by most recent first."""
files = [
os.path.join(OUTPUTS_DIR, f)
for f in os.listdir(OUTPUTS_DIR)
if any(f.endswith(ext) for ext in AUDIO_FORMATS)
]
# Sort files by modification time, most recent first
return sorted(files, key=os.path.getmtime, reverse=True)
def read_text_file(filename: str) -> str:

View file

@@ -6,7 +6,7 @@ import gradio as gr
from . import api, files
def setup_event_handlers(components: dict):
def setup_event_handlers(components: dict, disable_local_saving: bool = False):
"""Set up all event handlers for the UI components."""
def refresh_status():
@@ -58,27 +58,37 @@ def setup_event_handlers(components: dict):
def handle_file_upload(file):
if file is None:
return gr.update(choices=files.list_input_files())
return "" if disable_local_saving else [gr.update(choices=files.list_input_files())]
try:
# Copy file to inputs directory
filename = os.path.basename(file.name)
target_path = os.path.join(files.INPUTS_DIR, filename)
# Read the file content
with open(file.name, 'r', encoding='utf-8') as f:
text_content = f.read()
# Handle duplicate filenames
base, ext = os.path.splitext(filename)
counter = 1
while os.path.exists(target_path):
new_name = f"{base}_{counter}{ext}"
target_path = os.path.join(files.INPUTS_DIR, new_name)
counter += 1
if disable_local_saving:
# When saving is disabled, put content directly in text input
# Normalize whitespace by replacing newlines with spaces
normalized_text = ' '.join(text_content.split())
return normalized_text
else:
# When saving is enabled, save file and update dropdown
filename = os.path.basename(file.name)
target_path = os.path.join(files.INPUTS_DIR, filename)
shutil.copy2(file.name, target_path)
# Handle duplicate filenames
base, ext = os.path.splitext(filename)
counter = 1
while os.path.exists(target_path):
new_name = f"{base}_{counter}{ext}"
target_path = os.path.join(files.INPUTS_DIR, new_name)
counter += 1
shutil.copy2(file.name, target_path)
return [gr.update(choices=files.list_input_files())]
except Exception as e:
print(f"Error uploading file: {e}")
return gr.update(choices=files.list_input_files())
print(f"Error handling file: {e}")
return "" if disable_local_saving else [gr.update(choices=files.list_input_files())]
def generate_from_text(text, voice, format, speed):
"""Generate speech from direct text input"""
@@ -91,7 +101,10 @@ def setup_event_handlers(components: dict):
gr.Warning("Please enter text in the input box")
return [None, gr.update(choices=files.list_output_files())]
files.save_text(text)
# Only save text if local saving is enabled
if not disable_local_saving:
files.save_text(text)
result = api.text_to_speech(text, voice, format, speed)
if result is None:
gr.Warning("Failed to generate speech. Please try again.")
@@ -162,45 +175,7 @@ def setup_event_handlers(components: dict):
outputs=[components["model"]["status_btn"], components["model"]["voice"]],
)
components["input"]["file_select"].change(
fn=handle_file_select,
inputs=[components["input"]["file_select"]],
outputs=[components["input"]["file_preview"]],
)
components["input"]["file_upload"].upload(
fn=handle_file_upload,
inputs=[components["input"]["file_upload"]],
outputs=[components["input"]["file_select"]],
)
components["output"]["play_btn"].click(
fn=play_selected,
inputs=[components["output"]["output_files"]],
outputs=[components["output"]["selected_audio"]],
)
# Connect clear files button
components["input"]["clear_files"].click(
fn=clear_files,
inputs=[
components["model"]["voice"],
components["model"]["format"],
components["model"]["speed"],
],
outputs=[
components["input"]["file_select"],
components["input"]["file_upload"],
components["input"]["file_preview"],
components["output"]["audio_output"],
components["output"]["output_files"],
components["model"]["voice"],
components["model"]["format"],
components["model"]["speed"],
],
)
# Connect submit buttons for each tab
# Connect text submit button (always present)
components["input"]["text_submit"].click(
fn=generate_from_text,
inputs=[
@@ -215,26 +190,70 @@ def setup_event_handlers(components: dict):
],
)
# Connect clear outputs button
components["output"]["clear_outputs"].click(
fn=clear_outputs,
outputs=[
components["output"]["audio_output"],
components["output"]["output_files"],
components["output"]["selected_audio"],
],
)
# Only connect file-related handlers if components exist
if components["input"]["file_select"] is not None:
components["input"]["file_select"].change(
fn=handle_file_select,
inputs=[components["input"]["file_select"]],
outputs=[components["input"]["file_preview"]],
)
components["input"]["file_submit"].click(
fn=generate_from_file,
inputs=[
components["input"]["file_select"],
components["model"]["voice"],
components["model"]["format"],
components["model"]["speed"],
],
outputs=[
components["output"]["audio_output"],
components["output"]["output_files"],
],
)
if components["input"]["file_upload"] is not None:
# File upload handler - output depends on disable_local_saving
components["input"]["file_upload"].upload(
fn=handle_file_upload,
inputs=[components["input"]["file_upload"]],
outputs=[components["input"]["text_input"] if disable_local_saving else components["input"]["file_select"]],
)
if components["output"]["play_btn"] is not None:
components["output"]["play_btn"].click(
fn=play_selected,
inputs=[components["output"]["output_files"]],
outputs=[components["output"]["selected_audio"]],
)
if components["input"]["clear_files"] is not None:
components["input"]["clear_files"].click(
fn=clear_files,
inputs=[
components["model"]["voice"],
components["model"]["format"],
components["model"]["speed"],
],
outputs=[
components["input"]["file_select"],
components["input"]["file_upload"],
components["input"]["file_preview"],
components["output"]["audio_output"],
components["output"]["output_files"],
components["model"]["voice"],
components["model"]["format"],
components["model"]["speed"],
],
)
if components["output"]["clear_outputs"] is not None:
components["output"]["clear_outputs"].click(
fn=clear_outputs,
outputs=[
components["output"]["audio_output"],
components["output"]["output_files"],
components["output"]["selected_audio"],
],
)
if components["input"]["file_submit"] is not None:
components["input"]["file_submit"].click(
fn=generate_from_file,
inputs=[
components["input"]["file_select"],
components["model"]["voice"],
components["model"]["format"],
components["model"]["speed"],
],
outputs=[
components["output"]["audio_output"],
components["output"]["output_files"],
],
)
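One detail of the `handle_file_upload` change above: when local saving is disabled, the uploaded file's text is collapsed to a single line before being dropped into the text box. A quick illustration of that normalization (sample text made up):

```python
text_content = "First line.\n\nSecond  line with   extra spaces.\n"
normalized_text = " ".join(text_content.split())
print(normalized_text)  # -> "First line. Second line with extra spaces."
```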

View file

@@ -26,7 +26,7 @@ def create_interface():
# Main interface
with gr.Row():
# Create columns
input_col, input_components = create_input_column()
input_col, input_components = create_input_column(disable_local_saving)
model_col, model_components = create_model_column(
available_voices
) # Pass initial voices
@@ -40,7 +40,7 @@ def create_interface():
}
# Set up event handlers
setup_event_handlers(components)
setup_event_handlers(components, disable_local_saving)
# Add periodic status check with Timer
def update_status():

View file

@@ -106,24 +106,54 @@ def test_get_status_html_unavailable():
def test_text_to_speech_api_params(mock_response, tmp_path):
"""Test correct API parameters are sent"""
with patch("requests.post") as mock_post, patch(
"ui.lib.api.OUTPUTS_DIR", str(tmp_path)
), patch("builtins.open", mock_open()):
mock_post.return_value = mock_response({})
api.text_to_speech("test text", "voice1", "mp3", 1.5)
test_cases = [
# Single voice as string
("voice1", "voice1"),
# Multiple voices as list
(["voice1", "voice2"], "voice1+voice2"),
# Single voice as list
(["voice1"], "voice1"),
]
mock_post.assert_called_once()
args, kwargs = mock_post.call_args
for input_voice, expected_voice in test_cases:
with patch("requests.post") as mock_post, patch(
"ui.lib.api.OUTPUTS_DIR", str(tmp_path)
), patch("builtins.open", mock_open()):
mock_post.return_value = mock_response({})
api.text_to_speech("test text", input_voice, "mp3", 1.5)
# Check request body
assert kwargs["json"] == {
"model": "kokoro",
"input": "test text",
"voice": "voice1",
"response_format": "mp3",
"speed": 1.5,
}
mock_post.assert_called_once()
args, kwargs = mock_post.call_args
# Check headers and timeout
assert kwargs["headers"] == {"Content-Type": "application/json"}
assert kwargs["timeout"] == 300
# Check request body
assert kwargs["json"] == {
"model": "kokoro",
"input": "test text",
"voice": expected_voice,
"response_format": "mp3",
"speed": 1.5,
}
# Check headers and timeout
assert kwargs["headers"] == {"Content-Type": "application/json"}
assert kwargs["timeout"] == 300
def test_text_to_speech_output_filename(mock_response, tmp_path):
"""Test output filename contains correct voice identifier"""
test_cases = [
# Single voice
("voice1", lambda f: "voice-voice1" in f),
# Multiple voices
(["voice1", "voice2"], lambda f: "voice-voice1+voice2" in f),
]
for input_voice, filename_check in test_cases:
with patch("requests.post", return_value=mock_response({})), patch(
"ui.lib.api.OUTPUTS_DIR", str(tmp_path)
), patch("builtins.open", mock_open()) as mock_file:
result = api.text_to_speech("test text", input_voice, "mp3", 1.0)
assert result is not None
assert filename_check(result), f"Expected voice pattern not found in filename: {result}"
mock_file.assert_called_once()

View file

@@ -36,8 +36,10 @@ def test_model_column_default_values():
expected_choices = [(voice_id, voice_id) for voice_id in voice_ids]
assert components["voice"].choices == expected_choices
# Value is not converted to tuple format for the value property
assert components["voice"].value == voice_ids[0]
assert components["voice"].value == [voice_ids[0]]
assert components["voice"].interactive is True
assert components["voice"].multiselect is True
assert components["voice"].label == "Voice(s)"
# Test format dropdown
# Gradio Dropdown converts choices to (value, label) tuples

View file

@@ -136,7 +136,7 @@ def test_interface_components_presence():
required_components = {
"Text to speak",
"Voice",
"Voice(s)",
"Audio Format",
"Speed",
"Generated Speech",