mirror of
https://github.com/remsky/Kokoro-FastAPI.git
synced 2025-08-05 16:48:53 +00:00
Fix URL parsing for URLs without https, http, or www. It also allows raw IPs, ports, and dashes.
This commit is contained in:
parent
a8402782a7
commit
1625082724
2 changed files with 15 additions and 5 deletions
|
@ -1 +1 @@
|
||||||
Subproject commit 3095858c40fc22e28c46429da9340dfda1f8cf28
|
Subproject commit c97b7bbc3e60f447383c79b2f94fee861ff156ac
|
|
@ -1,6 +1,11 @@
|
||||||
import re
|
import re
|
||||||
from functools import lru_cache
|
from functools import lru_cache
|
||||||
|
|
||||||
|
valid_tlds=["com", "org", "net", "edu", "gov", "mil", "int", "biz", "info", "name",
|
||||||
|
"pro", "coop", "museum", "travel", "jobs", "mobi", "tel", "asia", "cat",
|
||||||
|
"xxx", "aero", "arpa", "bg", "br", "ca", "cn", "de", "es", "eu", "fr",
|
||||||
|
"in", "it", "jp", "mx", "nl", "ru", "uk", "us", "io"]
|
||||||
|
|
||||||
def split_num(num: re.Match) -> str:
|
def split_num(num: re.Match) -> str:
|
||||||
"""Handle number splitting for various formats"""
|
"""Handle number splitting for various formats"""
|
||||||
num = num.group()
|
num = num.group()
|
||||||
|
@ -56,14 +61,18 @@ def handle_url(u: re.Match) -> str:
|
||||||
|
|
||||||
url = u.group(0).strip()
|
url = u.group(0).strip()
|
||||||
# Handle common URL prefixes
|
# Handle common URL prefixes
|
||||||
url = re.sub(r'^https?://', 'http ', url, flags=re.IGNORECASE)
|
url = re.sub(r'^https?://', lambda a : 'https ' if 'https' in a.group() else 'http', url, flags=re.IGNORECASE)
|
||||||
url = re.sub(r'^www\.', 'www ', url, flags=re.IGNORECASE)
|
url = re.sub(r'^www\.', 'www ', url, flags=re.IGNORECASE)
|
||||||
|
|
||||||
# Replace symbols with words
|
# Replace symbols with words
|
||||||
|
|
||||||
|
url = url.replace(":", " colon ")
|
||||||
|
url = url.replace("-", " dash ")
|
||||||
|
url = url.replace("_", " underscore ")
|
||||||
url = url.replace("/", " slash ")
|
url = url.replace("/", " slash ")
|
||||||
url = url.replace(".", " dot ")
|
url = url.replace(".", " dot ")
|
||||||
url = url.replace("@", " at ")
|
url = url.replace("@", " at ")
|
||||||
url = url.replace("?", " question mark ")
|
url = url.replace("?", " question-mark ")
|
||||||
url = url.replace("=", " equals ")
|
url = url.replace("=", " equals ")
|
||||||
url = url.replace("&", " ampersand ")
|
url = url.replace("&", " ampersand ")
|
||||||
|
|
||||||
|
@ -74,8 +83,7 @@ def handle_url(u: re.Match) -> str:
|
||||||
def normalize_urls(text: str) -> str:
|
def normalize_urls(text: str) -> str:
|
||||||
"""Pre-process URLs before other text normalization"""
|
"""Pre-process URLs before other text normalization"""
|
||||||
url_patterns = [
|
url_patterns = [
|
||||||
r"https?://[^\s]+", # URLs with http(s)
|
r"(https?://|www\.|)+(localhost|[a-zA-Z0-9.-]+(\.(?:" + "|".join(valid_tlds) + "))+|[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3})(:[0-9]+)?([/?][^\s]*)?", # URLs with http(s), raw ip, www, or domain.tld
|
||||||
r"www\.[^\s]+", # URLs with www
|
|
||||||
r"\b[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-z]{2,}\b" # Email addresses
|
r"\b[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-z]{2,}\b" # Email addresses
|
||||||
]
|
]
|
||||||
|
|
||||||
|
@ -87,7 +95,9 @@ def normalize_urls(text: str) -> str:
|
||||||
def normalize_text(text: str) -> str:
|
def normalize_text(text: str) -> str:
|
||||||
"""Normalize text for TTS processing"""
|
"""Normalize text for TTS processing"""
|
||||||
# Pre-process URLs first
|
# Pre-process URLs first
|
||||||
|
|
||||||
text = normalize_urls(text)
|
text = normalize_urls(text)
|
||||||
|
|
||||||
# Replace quotes and brackets
|
# Replace quotes and brackets
|
||||||
text = text.replace(chr(8216), "'").replace(chr(8217), "'")
|
text = text.replace(chr(8216), "'").replace(chr(8217), "'")
|
||||||
text = text.replace("«", chr(8220)).replace("»", chr(8221))
|
text = text.replace("«", chr(8220)).replace("»", chr(8221))
|
||||||
|
|
Loading…
Add table
Reference in a new issue