diff --git a/.coveragerc b/.coveragerc
index c66579a..bea78e2 100644
--- a/.coveragerc
+++ b/.coveragerc
@@ -7,6 +7,7 @@ omit =
MagicMock/*
test_*.py
examples/*
+ src/builds/*
[report]
exclude_lines =
diff --git a/README.md b/README.md
index 6f8efda..7695dd2 100644
--- a/README.md
+++ b/README.md
@@ -2,9 +2,9 @@
-# Kokoro TTS API
+# _`FastKoko`_
[]()
-[]()
+[]()
[](https://huggingface.co/hexgrad/Kokoro-82M/tree/c3b0d86e2a980e027ef71c28819ea02e351c2667) [](https://huggingface.co/spaces/Remsky/Kokoro-TTS-Zero) [](https://www.buymeacoffee.com/remsky)
Dockerized FastAPI wrapper for [Kokoro-82M](https://huggingface.co/hexgrad/Kokoro-82M) text-to-speech model
@@ -35,8 +35,9 @@ The service can be accessed through either the API endpoints or the Gradio web i
- Using Docker Compose (Full setup including UI):
```bash
- docker compose up --build # for GPU
- docker compose -f docker-compose.cpu.yml up --build # for CPU
+    cd docker/gpu   # for GPU
+    # cd docker/cpu # for CPU (use this instead of the line above)
+ docker compose up --build
```
- OR running the API alone using Docker (model + voice packs baked in):
```bash
@@ -350,6 +351,27 @@ See `examples/phoneme_examples/generate_phonemes.py` for a sample script.
## Known Issues
+
+Versioning & Development
+
+I'm doing what I can to keep things stable, but this project is still in an early stage of rapid build cycles.
+If you run into trouble, you may need to roll back to an earlier release tag, or build from source and troubleshoot (and ideally submit a PR). I'll keep a branch here marking the last known stable point:
+
+`v0.0.5post1`
+
+Free and open source software is a community effort, and I love working on this project — but there are only so many hours in a day. If you'd like to support the work, feel free to open a PR, buy me a coffee, or report any bugs or feature requests you encounter during use.
+
+
+
+
+
+
+
+
Linux GPU Permissions
diff --git a/docker/cpu/docker-compose.yml b/docker/cpu/docker-compose.yml
index beb2a8f..08403a5 100644
--- a/docker/cpu/docker-compose.yml
+++ b/docker/cpu/docker-compose.yml
@@ -35,3 +35,4 @@ services:
environment:
- GRADIO_WATCH=True # Enable hot reloading
- PYTHONUNBUFFERED=1 # Ensure Python output is not buffered
+ - DISABLE_LOCAL_SAVING=false # Set to 'true' to disable local saving and hide file view
\ No newline at end of file
diff --git a/docker/gpu/docker-compose.yml b/docker/gpu/docker-compose.yml
index c89c0b2..5775abe 100644
--- a/docker/gpu/docker-compose.yml
+++ b/docker/gpu/docker-compose.yml
@@ -32,4 +32,4 @@ services:
environment:
- GRADIO_WATCH=1 # Enable hot reloading
- PYTHONUNBUFFERED=1 # Ensure Python output is not buffered
- - DISABLE_LOCAL_SAVING=true # Set to 'true' to disable local saving and hide file view
+ - DISABLE_LOCAL_SAVING=false # Set to 'true' to disable local saving and hide file view
diff --git a/ui/lib/api.py b/ui/lib/api.py
index 1528656..ca0d7e8 100644
--- a/ui/lib/api.py
+++ b/ui/lib/api.py
@@ -36,15 +36,18 @@ def check_api_status() -> Tuple[bool, List[str]]:
def text_to_speech(
- text: str, voice_id: str, format: str, speed: float
+ text: str, voice_id: str | list, format: str, speed: float
) -> Optional[str]:
"""Generate speech from text using TTS API."""
if not text.strip():
return None
+ # Handle multiple voices
+ voice_str = voice_id if isinstance(voice_id, str) else "+".join(voice_id)
+
# Create output filename
timestamp = datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
- output_filename = f"output_{timestamp}_voice-{voice_id}_speed-{speed}.{format}"
+ output_filename = f"output_{timestamp}_voice-{voice_str}_speed-{speed}.{format}"
output_path = os.path.join(OUTPUTS_DIR, output_filename)
try:
@@ -53,7 +56,7 @@ def text_to_speech(
json={
"model": "kokoro",
"input": text,
- "voice": voice_id,
+ "voice": voice_str,
"response_format": format,
"speed": float(speed),
},
diff --git a/ui/lib/components/input.py b/ui/lib/components/input.py
index 793a89e..a2c4d33 100644
--- a/ui/lib/components/input.py
+++ b/ui/lib/components/input.py
@@ -5,54 +5,78 @@ import gradio as gr
from .. import files
-def create_input_column() -> Tuple[gr.Column, dict]:
+def create_input_column(disable_local_saving: bool = False) -> Tuple[gr.Column, dict]:
"""Create the input column with text input and file handling."""
with gr.Column(scale=1) as col:
- with gr.Tabs() as tabs:
- # Set first tab as selected by default
- tabs.selected = 0
- # Direct Input Tab
- with gr.TabItem("Direct Input"):
- text_input = gr.Textbox(
- label="Text to speak", placeholder="Enter text here...", lines=4
- )
- text_submit = gr.Button("Generate Speech", variant="primary", size="lg")
+ text_input = gr.Textbox(
+ label="Text to speak", placeholder="Enter text here...", lines=4
+ )
+
+ # Always show file upload but handle differently based on disable_local_saving
+ file_upload = gr.File(
+ label="Upload Text File (.txt)", file_types=[".txt"]
+ )
+
+ if not disable_local_saving:
+ # Show full interface with tabs when saving is enabled
+ with gr.Tabs() as tabs:
+ # Set first tab as selected by default
+ tabs.selected = 0
+ # Direct Input Tab
+ with gr.TabItem("Direct Input"):
+ text_submit_direct = gr.Button("Generate Speech", variant="primary", size="lg")
- # File Input Tab
- with gr.TabItem("From File"):
- # Existing files dropdown
- input_files_list = gr.Dropdown(
- label="Select Existing File",
- choices=files.list_input_files(),
- value=None,
- )
-
- # Simple file upload
- file_upload = gr.File(
- label="Upload Text File (.txt)", file_types=[".txt"]
- )
-
- file_preview = gr.Textbox(
- label="File Content Preview", interactive=False, lines=4
- )
-
- with gr.Row():
- file_submit = gr.Button(
- "Generate Speech", variant="primary", size="lg"
- )
- clear_files = gr.Button(
- "Clear Files", variant="secondary", size="lg"
+ # File Input Tab
+ with gr.TabItem("From File"):
+ # Existing files dropdown
+ input_files_list = gr.Dropdown(
+ label="Select Existing File",
+ choices=files.list_input_files(),
+ value=None,
)
- components = {
- "tabs": tabs,
- "text_input": text_input,
- "file_select": input_files_list,
- "file_upload": file_upload,
- "file_preview": file_preview,
- "text_submit": text_submit,
- "file_submit": file_submit,
- "clear_files": clear_files,
- }
+ file_preview = gr.Textbox(
+ label="File Content Preview", interactive=False, lines=4
+ )
+
+ with gr.Row():
+ file_submit = gr.Button(
+ "Generate Speech", variant="primary", size="lg"
+ )
+ clear_files = gr.Button(
+ "Clear Files", variant="secondary", size="lg"
+ )
+ else:
+ # Just show the generate button when saving is disabled
+ text_submit_direct = gr.Button("Generate Speech", variant="primary", size="lg")
+ tabs = None
+ input_files_list = None
+ file_preview = None
+ file_submit = None
+ clear_files = None
+
+ # Initialize components based on disable_local_saving
+ if disable_local_saving:
+ components = {
+ "tabs": None,
+ "text_input": text_input,
+ "text_submit": text_submit_direct,
+ "file_select": None,
+ "file_upload": file_upload, # Keep file upload even when saving is disabled
+ "file_preview": None,
+ "file_submit": None,
+ "clear_files": None,
+ }
+ else:
+ components = {
+ "tabs": tabs,
+ "text_input": text_input,
+ "text_submit": text_submit_direct,
+ "file_select": input_files_list,
+ "file_upload": file_upload,
+ "file_preview": file_preview,
+ "file_submit": file_submit,
+ "clear_files": clear_files,
+ }
return col, components
diff --git a/ui/lib/components/model.py b/ui/lib/components/model.py
index 444d0f8..a659d2c 100644
--- a/ui/lib/components/model.py
+++ b/ui/lib/components/model.py
@@ -20,9 +20,10 @@ def create_model_column(voice_ids: Optional[list] = None) -> Tuple[gr.Column, di
voice_input = gr.Dropdown(
choices=voice_ids,
- label="Voice",
+ label="Voice(s)",
value=voice_ids[0] if voice_ids else None,
interactive=True,
+ multiselect=True,
)
format_input = gr.Dropdown(
choices=config.AUDIO_FORMATS, label="Audio Format", value="mp3"
diff --git a/ui/lib/components/output.py b/ui/lib/components/output.py
index 129f06e..083829e 100644
--- a/ui/lib/components/output.py
+++ b/ui/lib/components/output.py
@@ -9,7 +9,11 @@ def create_output_column(disable_local_saving: bool = False) -> Tuple[gr.Column,
"""Create the output column with audio player and file list."""
with gr.Column(scale=1) as col:
gr.Markdown("### Latest Output")
- audio_output = gr.Audio(label="Generated Speech", type="filepath")
+ audio_output = gr.Audio(
+ label="Generated Speech",
+ type="filepath",
+ waveform_options={"waveform_color": "#4C87AB"}
+ )
# Create file-related components with visible=False when local saving is disabled
gr.Markdown("### Generated Files", visible=not disable_local_saving)
@@ -17,7 +21,7 @@ def create_output_column(disable_local_saving: bool = False) -> Tuple[gr.Column,
label="Previous Outputs",
choices=files.list_output_files() if not disable_local_saving else [],
value=None,
- allow_custom_value=False,
+ allow_custom_value=True,
visible=not disable_local_saving,
)
diff --git a/ui/lib/files.py b/ui/lib/files.py
index 867f4f4..1391e0a 100644
--- a/ui/lib/files.py
+++ b/ui/lib/files.py
@@ -11,12 +11,14 @@ def list_input_files() -> List[str]:
def list_output_files() -> List[str]:
- """List all output audio files."""
- return [
+ """List all output audio files, sorted by most recent first."""
+ files = [
os.path.join(OUTPUTS_DIR, f)
for f in os.listdir(OUTPUTS_DIR)
if any(f.endswith(ext) for ext in AUDIO_FORMATS)
]
+ # Sort files by modification time, most recent first
+ return sorted(files, key=os.path.getmtime, reverse=True)
def read_text_file(filename: str) -> str:
diff --git a/ui/lib/handlers.py b/ui/lib/handlers.py
index eba6cda..71b8d9b 100644
--- a/ui/lib/handlers.py
+++ b/ui/lib/handlers.py
@@ -6,7 +6,7 @@ import gradio as gr
from . import api, files
-def setup_event_handlers(components: dict):
+def setup_event_handlers(components: dict, disable_local_saving: bool = False):
"""Set up all event handlers for the UI components."""
def refresh_status():
@@ -58,27 +58,37 @@ def setup_event_handlers(components: dict):
def handle_file_upload(file):
if file is None:
- return gr.update(choices=files.list_input_files())
+ return "" if disable_local_saving else [gr.update(choices=files.list_input_files())]
try:
- # Copy file to inputs directory
- filename = os.path.basename(file.name)
- target_path = os.path.join(files.INPUTS_DIR, filename)
+ # Read the file content
+ with open(file.name, 'r', encoding='utf-8') as f:
+ text_content = f.read()
- # Handle duplicate filenames
- base, ext = os.path.splitext(filename)
- counter = 1
- while os.path.exists(target_path):
- new_name = f"{base}_{counter}{ext}"
- target_path = os.path.join(files.INPUTS_DIR, new_name)
- counter += 1
+ if disable_local_saving:
+ # When saving is disabled, put content directly in text input
+ # Normalize whitespace by replacing newlines with spaces
+ normalized_text = ' '.join(text_content.split())
+ return normalized_text
+ else:
+ # When saving is enabled, save file and update dropdown
+ filename = os.path.basename(file.name)
+ target_path = os.path.join(files.INPUTS_DIR, filename)
- shutil.copy2(file.name, target_path)
+ # Handle duplicate filenames
+ base, ext = os.path.splitext(filename)
+ counter = 1
+ while os.path.exists(target_path):
+ new_name = f"{base}_{counter}{ext}"
+ target_path = os.path.join(files.INPUTS_DIR, new_name)
+ counter += 1
+
+ shutil.copy2(file.name, target_path)
+ return [gr.update(choices=files.list_input_files())]
except Exception as e:
- print(f"Error uploading file: {e}")
-
- return gr.update(choices=files.list_input_files())
+ print(f"Error handling file: {e}")
+ return "" if disable_local_saving else [gr.update(choices=files.list_input_files())]
def generate_from_text(text, voice, format, speed):
"""Generate speech from direct text input"""
@@ -91,7 +101,10 @@ def setup_event_handlers(components: dict):
gr.Warning("Please enter text in the input box")
return [None, gr.update(choices=files.list_output_files())]
- files.save_text(text)
+ # Only save text if local saving is enabled
+ if not disable_local_saving:
+ files.save_text(text)
+
result = api.text_to_speech(text, voice, format, speed)
if result is None:
gr.Warning("Failed to generate speech. Please try again.")
@@ -162,45 +175,7 @@ def setup_event_handlers(components: dict):
outputs=[components["model"]["status_btn"], components["model"]["voice"]],
)
- components["input"]["file_select"].change(
- fn=handle_file_select,
- inputs=[components["input"]["file_select"]],
- outputs=[components["input"]["file_preview"]],
- )
-
- components["input"]["file_upload"].upload(
- fn=handle_file_upload,
- inputs=[components["input"]["file_upload"]],
- outputs=[components["input"]["file_select"]],
- )
-
- components["output"]["play_btn"].click(
- fn=play_selected,
- inputs=[components["output"]["output_files"]],
- outputs=[components["output"]["selected_audio"]],
- )
-
- # Connect clear files button
- components["input"]["clear_files"].click(
- fn=clear_files,
- inputs=[
- components["model"]["voice"],
- components["model"]["format"],
- components["model"]["speed"],
- ],
- outputs=[
- components["input"]["file_select"],
- components["input"]["file_upload"],
- components["input"]["file_preview"],
- components["output"]["audio_output"],
- components["output"]["output_files"],
- components["model"]["voice"],
- components["model"]["format"],
- components["model"]["speed"],
- ],
- )
-
- # Connect submit buttons for each tab
+ # Connect text submit button (always present)
components["input"]["text_submit"].click(
fn=generate_from_text,
inputs=[
@@ -215,26 +190,70 @@ def setup_event_handlers(components: dict):
],
)
- # Connect clear outputs button
- components["output"]["clear_outputs"].click(
- fn=clear_outputs,
- outputs=[
- components["output"]["audio_output"],
- components["output"]["output_files"],
- components["output"]["selected_audio"],
- ],
- )
+ # Only connect file-related handlers if components exist
+ if components["input"]["file_select"] is not None:
+ components["input"]["file_select"].change(
+ fn=handle_file_select,
+ inputs=[components["input"]["file_select"]],
+ outputs=[components["input"]["file_preview"]],
+ )
- components["input"]["file_submit"].click(
- fn=generate_from_file,
- inputs=[
- components["input"]["file_select"],
- components["model"]["voice"],
- components["model"]["format"],
- components["model"]["speed"],
- ],
- outputs=[
- components["output"]["audio_output"],
- components["output"]["output_files"],
- ],
- )
+ if components["input"]["file_upload"] is not None:
+ # File upload handler - output depends on disable_local_saving
+ components["input"]["file_upload"].upload(
+ fn=handle_file_upload,
+ inputs=[components["input"]["file_upload"]],
+ outputs=[components["input"]["text_input"] if disable_local_saving else components["input"]["file_select"]],
+ )
+
+ if components["output"]["play_btn"] is not None:
+ components["output"]["play_btn"].click(
+ fn=play_selected,
+ inputs=[components["output"]["output_files"]],
+ outputs=[components["output"]["selected_audio"]],
+ )
+
+ if components["input"]["clear_files"] is not None:
+ components["input"]["clear_files"].click(
+ fn=clear_files,
+ inputs=[
+ components["model"]["voice"],
+ components["model"]["format"],
+ components["model"]["speed"],
+ ],
+ outputs=[
+ components["input"]["file_select"],
+ components["input"]["file_upload"],
+ components["input"]["file_preview"],
+ components["output"]["audio_output"],
+ components["output"]["output_files"],
+ components["model"]["voice"],
+ components["model"]["format"],
+ components["model"]["speed"],
+ ],
+ )
+
+ if components["output"]["clear_outputs"] is not None:
+ components["output"]["clear_outputs"].click(
+ fn=clear_outputs,
+ outputs=[
+ components["output"]["audio_output"],
+ components["output"]["output_files"],
+ components["output"]["selected_audio"],
+ ],
+ )
+
+ if components["input"]["file_submit"] is not None:
+ components["input"]["file_submit"].click(
+ fn=generate_from_file,
+ inputs=[
+ components["input"]["file_select"],
+ components["model"]["voice"],
+ components["model"]["format"],
+ components["model"]["speed"],
+ ],
+ outputs=[
+ components["output"]["audio_output"],
+ components["output"]["output_files"],
+ ],
+ )
diff --git a/ui/lib/interface.py b/ui/lib/interface.py
index 02d3083..1ae344c 100644
--- a/ui/lib/interface.py
+++ b/ui/lib/interface.py
@@ -26,7 +26,7 @@ def create_interface():
# Main interface
with gr.Row():
# Create columns
- input_col, input_components = create_input_column()
+ input_col, input_components = create_input_column(disable_local_saving)
model_col, model_components = create_model_column(
available_voices
) # Pass initial voices
@@ -40,7 +40,7 @@ def create_interface():
}
# Set up event handlers
- setup_event_handlers(components)
+ setup_event_handlers(components, disable_local_saving)
# Add periodic status check with Timer
def update_status():
diff --git a/ui/tests/test_api.py b/ui/tests/test_api.py
index fe5dbe7..d682326 100644
--- a/ui/tests/test_api.py
+++ b/ui/tests/test_api.py
@@ -106,24 +106,54 @@ def test_get_status_html_unavailable():
def test_text_to_speech_api_params(mock_response, tmp_path):
"""Test correct API parameters are sent"""
- with patch("requests.post") as mock_post, patch(
- "ui.lib.api.OUTPUTS_DIR", str(tmp_path)
- ), patch("builtins.open", mock_open()):
- mock_post.return_value = mock_response({})
- api.text_to_speech("test text", "voice1", "mp3", 1.5)
+ test_cases = [
+ # Single voice as string
+ ("voice1", "voice1"),
+ # Multiple voices as list
+ (["voice1", "voice2"], "voice1+voice2"),
+ # Single voice as list
+ (["voice1"], "voice1"),
+ ]
- mock_post.assert_called_once()
- args, kwargs = mock_post.call_args
+ for input_voice, expected_voice in test_cases:
+ with patch("requests.post") as mock_post, patch(
+ "ui.lib.api.OUTPUTS_DIR", str(tmp_path)
+ ), patch("builtins.open", mock_open()):
+ mock_post.return_value = mock_response({})
+ api.text_to_speech("test text", input_voice, "mp3", 1.5)
- # Check request body
- assert kwargs["json"] == {
- "model": "kokoro",
- "input": "test text",
- "voice": "voice1",
- "response_format": "mp3",
- "speed": 1.5,
- }
+ mock_post.assert_called_once()
+ args, kwargs = mock_post.call_args
- # Check headers and timeout
- assert kwargs["headers"] == {"Content-Type": "application/json"}
- assert kwargs["timeout"] == 300
+ # Check request body
+ assert kwargs["json"] == {
+ "model": "kokoro",
+ "input": "test text",
+ "voice": expected_voice,
+ "response_format": "mp3",
+ "speed": 1.5,
+ }
+
+ # Check headers and timeout
+ assert kwargs["headers"] == {"Content-Type": "application/json"}
+ assert kwargs["timeout"] == 300
+
+
+def test_text_to_speech_output_filename(mock_response, tmp_path):
+ """Test output filename contains correct voice identifier"""
+ test_cases = [
+ # Single voice
+ ("voice1", lambda f: "voice-voice1" in f),
+ # Multiple voices
+ (["voice1", "voice2"], lambda f: "voice-voice1+voice2" in f),
+ ]
+
+ for input_voice, filename_check in test_cases:
+ with patch("requests.post", return_value=mock_response({})), patch(
+ "ui.lib.api.OUTPUTS_DIR", str(tmp_path)
+ ), patch("builtins.open", mock_open()) as mock_file:
+ result = api.text_to_speech("test text", input_voice, "mp3", 1.0)
+
+ assert result is not None
+ assert filename_check(result), f"Expected voice pattern not found in filename: {result}"
+ mock_file.assert_called_once()
diff --git a/ui/tests/test_components.py b/ui/tests/test_components.py
index d9576c0..9e2b796 100644
--- a/ui/tests/test_components.py
+++ b/ui/tests/test_components.py
@@ -36,8 +36,10 @@ def test_model_column_default_values():
expected_choices = [(voice_id, voice_id) for voice_id in voice_ids]
assert components["voice"].choices == expected_choices
# Value is not converted to tuple format for the value property
- assert components["voice"].value == voice_ids[0]
+ assert components["voice"].value == [voice_ids[0]]
assert components["voice"].interactive is True
+ assert components["voice"].multiselect is True
+ assert components["voice"].label == "Voice(s)"
# Test format dropdown
# Gradio Dropdown converts choices to (value, label) tuples
diff --git a/ui/tests/test_interface.py b/ui/tests/test_interface.py
index cff4825..15c60ba 100644
--- a/ui/tests/test_interface.py
+++ b/ui/tests/test_interface.py
@@ -136,7 +136,7 @@ def test_interface_components_presence():
required_components = {
"Text to speak",
- "Voice",
+ "Voice(s)",
"Audio Format",
"Speed",
"Generated Speech",