From 162508272449c0b6e782eea10c25b7f9d0978bc8 Mon Sep 17 00:00:00 2001
From: Fireblade <fireblade5234@gmail.com>
Date: Tue, 7 Jan 2025 19:34:38 -0500
Subject: [PATCH 1/4] Fix url parsing for urls without https, http, or www. It
 also allows raw IPs, ports, and dashes

---
 Kokoro-82M                                     |  2 +-
 api/src/services/text_processing/normalizer.py | 18 ++++++++++++++----
 2 files changed, 15 insertions(+), 5 deletions(-)

diff --git a/Kokoro-82M b/Kokoro-82M
index 3095858..c97b7bb 160000
--- a/Kokoro-82M
+++ b/Kokoro-82M
@@ -1 +1 @@
-Subproject commit 3095858c40fc22e28c46429da9340dfda1f8cf28
+Subproject commit c97b7bbc3e60f447383c79b2f94fee861ff156ac
diff --git a/api/src/services/text_processing/normalizer.py b/api/src/services/text_processing/normalizer.py
index 0ede610..799742b 100644
--- a/api/src/services/text_processing/normalizer.py
+++ b/api/src/services/text_processing/normalizer.py
@@ -1,6 +1,11 @@
 import re
 from functools import lru_cache
 
+valid_tlds=["com", "org", "net", "edu", "gov", "mil", "int", "biz", "info", "name",
+    "pro", "coop", "museum", "travel", "jobs", "mobi", "tel", "asia", "cat",
+    "xxx", "aero", "arpa", "bg", "br", "ca", "cn", "de", "es", "eu", "fr",
+    "in", "it", "jp", "mx", "nl", "ru", "uk", "us", "io"]
+
 def split_num(num: re.Match) -> str:
     """Handle number splitting for various formats"""
     num = num.group()
@@ -56,14 +61,18 @@ def handle_url(u: re.Match) -> str:
         
     url = u.group(0).strip()
     # Handle common URL prefixes
-    url = re.sub(r'^https?://', 'http ', url, flags=re.IGNORECASE)
+    url = re.sub(r'^https?://', lambda a : 'https ' if 'https' in a.group() else 'http', url, flags=re.IGNORECASE)
     url = re.sub(r'^www\.', 'www ', url, flags=re.IGNORECASE)
     
     # Replace symbols with words
+    
+    url = url.replace(":", " colon ")
+    url = url.replace("-", " dash ")
+    url = url.replace("_", " underscore ")
     url = url.replace("/", " slash ")
     url = url.replace(".", " dot ")
     url = url.replace("@", " at ")
-    url = url.replace("?", " question mark ")
+    url = url.replace("?", " question-mark ")
     url = url.replace("=", " equals ")
     url = url.replace("&", " ampersand ")
     
@@ -74,8 +83,7 @@ def handle_url(u: re.Match) -> str:
 def normalize_urls(text: str) -> str:
     """Pre-process URLs before other text normalization"""
     url_patterns = [
-        r"https?://[^\s]+",  # URLs with http(s)
-        r"www\.[^\s]+",      # URLs with www
+        r"(https?://|www\.|)+(localhost|[a-zA-Z0-9.-]+(\.(?:" + "|".join(valid_tlds) + "))+|[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3})(:[0-9]+)?([/?][^\s]*)?",  # URLs with http(s), raw ip, www, or domain.tld
         r"\b[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-z]{2,}\b"  # Email addresses
     ]
     
@@ -87,7 +95,9 @@ def normalize_urls(text: str) -> str:
 def normalize_text(text: str) -> str:
     """Normalize text for TTS processing"""
     # Pre-process URLs first
+    
     text = normalize_urls(text)
+    
     # Replace quotes and brackets
     text = text.replace(chr(8216), "'").replace(chr(8217), "'")
     text = text.replace("«", chr(8220)).replace("»", chr(8221))

From a0a85f5ef0ba82285fa0fcfe346ee224d771657d Mon Sep 17 00:00:00 2001
From: remsky <jeremy.braun@ucalgary.ca>
Date: Wed, 8 Jan 2025 03:13:17 -0700
Subject: [PATCH 2/4] Add email handling, minor additional URL processing,
 tests

---
 .../services/text_processing/normalizer.py    | 78 ++++++++++++++-----
 api/tests/test_normalizer.py                  | 47 ++++++++---
 2 files changed, 94 insertions(+), 31 deletions(-)

diff --git a/api/src/services/text_processing/normalizer.py b/api/src/services/text_processing/normalizer.py
index 799742b..6ec3adb 100644
--- a/api/src/services/text_processing/normalizer.py
+++ b/api/src/services/text_processing/normalizer.py
@@ -1,12 +1,29 @@
+"""
+Text normalization module for TTS processing.
+Handles various text formats including URLs, emails, numbers, money, and special characters.
+Converts them into a format suitable for text-to-speech processing.
+"""
+
 import re
 from functools import lru_cache
 
-valid_tlds=["com", "org", "net", "edu", "gov", "mil", "int", "biz", "info", "name",
+# Constants
+VALID_TLDS = [
+    "com", "org", "net", "edu", "gov", "mil", "int", "biz", "info", "name",
     "pro", "coop", "museum", "travel", "jobs", "mobi", "tel", "asia", "cat",
     "xxx", "aero", "arpa", "bg", "br", "ca", "cn", "de", "es", "eu", "fr",
-    "in", "it", "jp", "mx", "nl", "ru", "uk", "us", "io"]
+    "in", "it", "jp", "mx", "nl", "ru", "uk", "us", "io"
+]
 
-def split_num(num: re.Match) -> str:
+# Pre-compiled regex patterns for performance
+EMAIL_PATTERN = re.compile(r"\b[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-z]{2,}\b", re.IGNORECASE)
+URL_PATTERN = re.compile(
+    r"(https?://|www\.|)+(localhost|[a-zA-Z0-9.-]+(\.(?:" + 
+    "|".join(VALID_TLDS) + "))+|[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3})(:[0-9]+)?([/?][^\s]*)?",
+    re.IGNORECASE
+)
+
+def split_num(num: re.Match[str]) -> str:
     """Handle number splitting for various formats"""
     num = num.group()
     if "." in num:
@@ -30,7 +47,7 @@ def split_num(num: re.Match) -> str:
             return f"{left} oh {right}{s}"
     return f"{left} {right}{s}"
 
-def handle_money(m: re.Match) -> str:
+def handle_money(m: re.Match[str]) -> str:
     """Convert money expressions to spoken form"""
     m = m.group()
     bill = "dollar" if m[0] == "$" else "pound"
@@ -49,32 +66,56 @@ def handle_money(m: re.Match) -> str:
     )
     return f"{b} {bill}{s} and {c} {coins}"
 
-def handle_decimal(num: re.Match) -> str:
+def handle_decimal(num: re.Match[str]) -> str:
     """Convert decimal numbers to spoken form"""
     a, b = num.group().split(".")
     return " point ".join([a, " ".join(b)])
 
-def handle_url(u: re.Match) -> str:
+def handle_email(m: re.Match[str]) -> str:
+    """Convert email addresses into speakable format"""
+    email = m.group(0)
+    parts = email.split('@')
+    if len(parts) == 2:
+        user, domain = parts
+        domain = domain.replace('.', ' dot ')
+        return f"{user} at {domain}"
+    return email
+
+def handle_url(u: re.Match[str]) -> str:
     """Make URLs speakable by converting special characters to spoken words"""
     if not u:
         return ""
         
     url = u.group(0).strip()
-    # Handle common URL prefixes
-    url = re.sub(r'^https?://', lambda a : 'https ' if 'https' in a.group() else 'http', url, flags=re.IGNORECASE)
+    
+    # Handle protocol first
+    url = re.sub(r'^https?://', lambda a: 'https ' if 'https' in a.group() else 'http ', url, flags=re.IGNORECASE)
     url = re.sub(r'^www\.', 'www ', url, flags=re.IGNORECASE)
     
-    # Replace symbols with words
+    # Handle port numbers before other replacements
+    url = re.sub(r':(\d+)(?=/|$)', lambda m: f" colon {m.group(1)}", url)
     
-    url = url.replace(":", " colon ")
+    # Split into domain and path
+    parts = url.split('/', 1)
+    domain = parts[0]
+    path = parts[1] if len(parts) > 1 else ''
+    
+    # Handle dots in domain
+    domain = domain.replace('.', ' dot ')
+    
+    # Reconstruct URL
+    if path:
+        url = f"{domain} slash {path}"
+    else:
+        url = domain
+    
+    # Replace remaining symbols with words
     url = url.replace("-", " dash ")
     url = url.replace("_", " underscore ")
-    url = url.replace("/", " slash ")
-    url = url.replace(".", " dot ")
-    url = url.replace("@", " at ")
     url = url.replace("?", " question-mark ")
     url = url.replace("=", " equals ")
     url = url.replace("&", " ampersand ")
+    url = url.replace(":", " colon ")  # Handle any remaining colons
     
     # Clean up extra spaces
     return re.sub(r'\s+', ' ', url).strip()
@@ -82,20 +123,17 @@ def handle_url(u: re.Match) -> str:
 
 def normalize_urls(text: str) -> str:
     """Pre-process URLs before other text normalization"""
-    url_patterns = [
-        r"(https?://|www\.|)+(localhost|[a-zA-Z0-9.-]+(\.(?:" + "|".join(valid_tlds) + "))+|[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3})(:[0-9]+)?([/?][^\s]*)?",  # URLs with http(s), raw ip, www, or domain.tld
-        r"\b[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-z]{2,}\b"  # Email addresses
-    ]
+    # Handle email addresses first
+    text = EMAIL_PATTERN.sub(handle_email, text)
     
-    for pattern in url_patterns:
-        text = re.sub(pattern, handle_url, text, flags=re.IGNORECASE)
+    # Handle URLs
+    text = URL_PATTERN.sub(handle_url, text)
     
     return text
     
 def normalize_text(text: str) -> str:
     """Normalize text for TTS processing"""
     # Pre-process URLs first
-    
     text = normalize_urls(text)
     
     # Replace quotes and brackets
diff --git a/api/tests/test_normalizer.py b/api/tests/test_normalizer.py
index c3d91f6..9555e22 100644
--- a/api/tests/test_normalizer.py
+++ b/api/tests/test_normalizer.py
@@ -3,19 +3,44 @@
 import pytest
 from api.src.services.text_processing.normalizer import normalize_text
 
-def test_urls():
-    """Test URL handling"""
-    # URLs with http/https
-    assert normalize_text("Check out https://example.com") == "Check out http example dot com"
-    assert normalize_text("Visit http://site.com/docs") == "Visit http site dot com slash docs"
-    
-    # URLs with www
+def test_url_protocols():
+    """Test URL protocol handling"""
+    assert normalize_text("Check out https://example.com") == "Check out https example dot com"
+    assert normalize_text("Visit http://site.com") == "Visit http site dot com"
+    assert normalize_text("Go to https://test.org/path") == "Go to https test dot org slash path"
+
+def test_url_www():
+    """Test www prefix handling"""
     assert normalize_text("Go to www.example.com") == "Go to www example dot com"
-    
-    # Email addresses
+    assert normalize_text("Visit www.test.org/docs") == "Visit www test dot org slash docs"
+    assert normalize_text("Check www.site.com?q=test") == "Check www site dot com question-mark q equals test"
+
+def test_url_localhost():
+    """Test localhost URL handling"""
+    assert normalize_text("Running on localhost:7860") == "Running on localhost colon 78 60"
+    assert normalize_text("Server at localhost:8080/api") == "Server at localhost colon 80 80 slash api"
+    assert normalize_text("Test localhost:3000/test?v=1") == "Test localhost colon 3000 slash test question-mark v equals 1"
+
+def test_url_ip_addresses():
+    """Test IP address URL handling"""
+    assert normalize_text("Access 0.0.0.0:9090/test") == "Access 0 dot 0 dot 0 dot 0 colon 90 90 slash test"
+    assert normalize_text("API at 192.168.1.1:8000") == "API at 192 dot 168 dot 1 dot 1 colon 8000"
+    assert normalize_text("Server 127.0.0.1") == "Server 127 dot 0 dot 0 dot 1"
+
+def test_url_raw_domains():
+    """Test raw domain handling"""
+    assert normalize_text("Visit google.com/search") == "Visit google dot com slash search"
+    assert normalize_text("Go to example.com/path?q=test") == "Go to example dot com slash path question-mark q equals test"
+    assert normalize_text("Check docs.test.com") == "Check docs dot test dot com"
+
+def test_url_email_addresses():
+    """Test email address handling"""
     assert normalize_text("Email me at user@example.com") == "Email me at user at example dot com"
-    
-    # Normal text should be unaffected, other than downstream normalization
+    assert normalize_text("Contact admin@test.org") == "Contact admin at test dot org"
+    assert normalize_text("Send to test.user@site.com") == "Send to test dot user at site dot com"
+
+def test_non_url_text():
+    """Test that non-URL text is unaffected"""
     assert normalize_text("This is not.a.url text") == "This is not-a-url text"
     assert normalize_text("Hello, how are you today?") == "Hello, how are you today?"
     assert normalize_text("It costs $50.") == "It costs 50 dollars."

From f1ab1c2b6928a78ee6e8d32b371a8748ebde9565 Mon Sep 17 00:00:00 2001
From: remsky <jeremy.braun@ucalgary.ca>
Date: Wed, 8 Jan 2025 03:13:37 -0700
Subject: [PATCH 3/4] Update README.md to reflect test and coverage
 improvements

---
 README.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index 097685b..743136e 100644
--- a/README.md
+++ b/README.md
@@ -3,8 +3,8 @@
 </p>
 
 # Kokoro TTS API
-[![Tests](https://img.shields.io/badge/tests-105%20passed-darkgreen)]()
-[![Coverage](https://img.shields.io/badge/coverage-74%25-darkgreen)]()
+[![Tests](https://img.shields.io/badge/tests-111%20passed-darkgreen)]()
+[![Coverage](https://img.shields.io/badge/coverage-75%25-darkgreen)]()
 [![Tested at Model Commit](https://img.shields.io/badge/last--tested--model--commit-a67f113-blue)](https://huggingface.co/hexgrad/Kokoro-82M/tree/c3b0d86e2a980e027ef71c28819ea02e351c2667) [![Try on Spaces](https://img.shields.io/badge/%F0%9F%A4%97%20Try%20on-Spaces-blue)](https://huggingface.co/spaces/Remsky/Kokoro-TTS-Zero)
 
 Dockerized FastAPI wrapper for [Kokoro-82M](https://huggingface.co/hexgrad/Kokoro-82M) text-to-speech model

From 1f22cda9be4a9d02af73971aae70d31ff93a0271 Mon Sep 17 00:00:00 2001
From: Fireblade <fireblade5234@gmail.com>
Date: Wed, 8 Jan 2025 08:50:22 -0500
Subject: [PATCH 4/4] Fix remaining slashes not being converted into text and
 also convert % to text

---
 api/src/services/text_processing/normalizer.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/api/src/services/text_processing/normalizer.py b/api/src/services/text_processing/normalizer.py
index 6ec3adb..3cc4cc2 100644
--- a/api/src/services/text_processing/normalizer.py
+++ b/api/src/services/text_processing/normalizer.py
@@ -115,7 +115,9 @@ def handle_url(u: re.Match[str]) -> str:
     url = url.replace("?", " question-mark ")
     url = url.replace("=", " equals ")
     url = url.replace("&", " ampersand ")
+    url = url.replace("%", " percent ")
     url = url.replace(":", " colon ")  # Handle any remaining colons
+    url = url.replace("/", " slash ")  # Handle any remaining slashes
     
     # Clean up extra spaces
     return re.sub(r'\s+', ' ', url).strip()