diff --git a/Kokoro-82M b/Kokoro-82M index 3095858..c97b7bb 160000 --- a/Kokoro-82M +++ b/Kokoro-82M @@ -1 +1 @@ -Subproject commit 3095858c40fc22e28c46429da9340dfda1f8cf28 +Subproject commit c97b7bbc3e60f447383c79b2f94fee861ff156ac diff --git a/README.md b/README.md index bb96802..c7eefd4 100644 --- a/README.md +++ b/README.md @@ -3,8 +3,8 @@

# Kokoro TTS API -[![Tests](https://img.shields.io/badge/tests-105%20passed-darkgreen)]() -[![Coverage](https://img.shields.io/badge/coverage-74%25-darkgreen)]() +[![Tests](https://img.shields.io/badge/tests-111%20passed-darkgreen)]() +[![Coverage](https://img.shields.io/badge/coverage-75%25-darkgreen)]() [![Tested at Model Commit](https://img.shields.io/badge/last--tested--model--commit-a67f113-blue)](https://huggingface.co/hexgrad/Kokoro-82M/tree/c3b0d86e2a980e027ef71c28819ea02e351c2667) [![Try on Spaces](https://img.shields.io/badge/%F0%9F%A4%97%20Try%20on-Spaces-blue)](https://huggingface.co/spaces/Remsky/Kokoro-TTS-Zero) Dockerized FastAPI wrapper for [Kokoro-82M](https://huggingface.co/hexgrad/Kokoro-82M) text-to-speech model diff --git a/api/src/services/text_processing/normalizer.py b/api/src/services/text_processing/normalizer.py index 0ede610..3cc4cc2 100644 --- a/api/src/services/text_processing/normalizer.py +++ b/api/src/services/text_processing/normalizer.py @@ -1,7 +1,29 @@ +""" +Text normalization module for TTS processing. +Handles various text formats including URLs, emails, numbers, money, and special characters. +Converts them into a format suitable for text-to-speech processing. +""" + import re from functools import lru_cache -def split_num(num: re.Match) -> str: +# Constants +VALID_TLDS = [ + "com", "org", "net", "edu", "gov", "mil", "int", "biz", "info", "name", + "pro", "coop", "museum", "travel", "jobs", "mobi", "tel", "asia", "cat", + "xxx", "aero", "arpa", "bg", "br", "ca", "cn", "de", "es", "eu", "fr", + "in", "it", "jp", "mx", "nl", "ru", "uk", "us", "io" +] + +# Pre-compiled regex patterns for performance +EMAIL_PATTERN = re.compile(r"\b[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-z]{2,}\b", re.IGNORECASE) +URL_PATTERN = re.compile( + r"(https?://|www\.|)+(localhost|[a-zA-Z0-9.-]+(\.(?:" + + "|".join(VALID_TLDS) + "))+|[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3})(:[0-9]+)?([/?][^\s]*)?", + re.IGNORECASE +) + +def split_num(num: re.Match[str]) -> str: """Handle number splitting for various formats""" num = num.group() if "." in num: @@ -25,7 +47,7 @@ def split_num(num: re.Match) -> str: return f"{left} oh {right}{s}" return f"{left} {right}{s}" -def handle_money(m: re.Match) -> str: +def handle_money(m: re.Match[str]) -> str: """Convert money expressions to spoken form""" m = m.group() bill = "dollar" if m[0] == "$" else "pound" @@ -44,28 +66,58 @@ def handle_money(m: re.Match) -> str: ) return f"{b} {bill}{s} and {c} {coins}" -def handle_decimal(num: re.Match) -> str: +def handle_decimal(num: re.Match[str]) -> str: """Convert decimal numbers to spoken form""" a, b = num.group().split(".") return " point ".join([a, " ".join(b)]) -def handle_url(u: re.Match) -> str: +def handle_email(m: re.Match[str]) -> str: + """Convert email addresses into speakable format""" + email = m.group(0) + parts = email.split('@') + if len(parts) == 2: + user, domain = parts + domain = domain.replace('.', ' dot ') + return f"{user} at {domain}" + return email + +def handle_url(u: re.Match[str]) -> str: """Make URLs speakable by converting special characters to spoken words""" if not u: return "" url = u.group(0).strip() - # Handle common URL prefixes - url = re.sub(r'^https?://', 'http ', url, flags=re.IGNORECASE) + + # Handle protocol first + url = re.sub(r'^https?://', lambda a: 'https ' if 'https' in a.group() else 'http ', url, flags=re.IGNORECASE) url = re.sub(r'^www\.', 'www ', url, flags=re.IGNORECASE) - # Replace symbols with words - url = url.replace("/", " slash ") - url = url.replace(".", " dot ") - url = url.replace("@", " at ") - url = url.replace("?", " question mark ") + # Handle port numbers before other replacements + url = re.sub(r':(\d+)(?=/|$)', lambda m: f" colon {m.group(1)}", url) + + # Split into domain and path + parts = url.split('/', 1) + domain = parts[0] + path = parts[1] if len(parts) > 1 else '' + + # Handle dots in domain + domain = domain.replace('.', ' dot ') + + # Reconstruct URL + if path: + url = f"{domain} slash {path}" + else: + url = domain + + # Replace remaining symbols with words + url = url.replace("-", " dash ") + url = url.replace("_", " underscore ") + url = url.replace("?", " question-mark ") url = url.replace("=", " equals ") url = url.replace("&", " ampersand ") + url = url.replace("%", " percent ") + url = url.replace(":", " colon ") # Handle any remaining colons + url = url.replace("/", " slash ") # Handle any remaining slashes # Clean up extra spaces return re.sub(r'\s+', ' ', url).strip() @@ -73,14 +125,11 @@ def handle_url(u: re.Match) -> str: def normalize_urls(text: str) -> str: """Pre-process URLs before other text normalization""" - url_patterns = [ - r"https?://[^\s]+", # URLs with http(s) - r"www\.[^\s]+", # URLs with www - r"\b[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-z]{2,}\b" # Email addresses - ] + # Handle email addresses first + text = EMAIL_PATTERN.sub(handle_email, text) - for pattern in url_patterns: - text = re.sub(pattern, handle_url, text, flags=re.IGNORECASE) + # Handle URLs + text = URL_PATTERN.sub(handle_url, text) return text @@ -88,6 +137,7 @@ def normalize_text(text: str) -> str: """Normalize text for TTS processing""" # Pre-process URLs first text = normalize_urls(text) + # Replace quotes and brackets text = text.replace(chr(8216), "'").replace(chr(8217), "'") text = text.replace("«", chr(8220)).replace("»", chr(8221)) diff --git a/api/tests/test_normalizer.py b/api/tests/test_normalizer.py index c3d91f6..9555e22 100644 --- a/api/tests/test_normalizer.py +++ b/api/tests/test_normalizer.py @@ -3,19 +3,44 @@ import pytest from api.src.services.text_processing.normalizer import normalize_text -def test_urls(): - """Test URL handling""" - # URLs with http/https - assert normalize_text("Check out https://example.com") == "Check out http example dot com" - assert normalize_text("Visit http://site.com/docs") == "Visit http site dot com slash docs" - - # URLs with www +def test_url_protocols(): + """Test URL protocol handling""" + assert normalize_text("Check out https://example.com") == "Check out https example dot com" + assert normalize_text("Visit http://site.com") == "Visit http site dot com" + assert normalize_text("Go to https://test.org/path") == "Go to https test dot org slash path" + +def test_url_www(): + """Test www prefix handling""" assert normalize_text("Go to www.example.com") == "Go to www example dot com" - - # Email addresses + assert normalize_text("Visit www.test.org/docs") == "Visit www test dot org slash docs" + assert normalize_text("Check www.site.com?q=test") == "Check www site dot com question-mark q equals test" + +def test_url_localhost(): + """Test localhost URL handling""" + assert normalize_text("Running on localhost:7860") == "Running on localhost colon 78 60" + assert normalize_text("Server at localhost:8080/api") == "Server at localhost colon 80 80 slash api" + assert normalize_text("Test localhost:3000/test?v=1") == "Test localhost colon 3000 slash test question-mark v equals 1" + +def test_url_ip_addresses(): + """Test IP address URL handling""" + assert normalize_text("Access 0.0.0.0:9090/test") == "Access 0 dot 0 dot 0 dot 0 colon 90 90 slash test" + assert normalize_text("API at 192.168.1.1:8000") == "API at 192 dot 168 dot 1 dot 1 colon 8000" + assert normalize_text("Server 127.0.0.1") == "Server 127 dot 0 dot 0 dot 1" + +def test_url_raw_domains(): + """Test raw domain handling""" + assert normalize_text("Visit google.com/search") == "Visit google dot com slash search" + assert normalize_text("Go to example.com/path?q=test") == "Go to example dot com slash path question-mark q equals test" + assert normalize_text("Check docs.test.com") == "Check docs dot test dot com" + +def test_url_email_addresses(): + """Test email address handling""" assert normalize_text("Email me at user@example.com") == "Email me at user at example dot com" - - # Normal text should be unaffected, other than downstream normalization + assert normalize_text("Contact admin@test.org") == "Contact admin at test dot org" + assert normalize_text("Send to test.user@site.com") == "Send to test dot user at site dot com" + +def test_non_url_text(): + """Test that non-URL text is unaffected""" assert normalize_text("This is not.a.url text") == "This is not-a-url text" assert normalize_text("Hello, how are you today?") == "Hello, how are you today?" assert normalize_text("It costs $50.") == "It costs 50 dollars."