2025-01-08 03:13:17 -07:00
|
|
|
"""
|
|
|
|
Text normalization module for TTS processing.
|
|
|
|
Handles various text formats including URLs, emails, numbers, money, and special characters.
|
|
|
|
Converts them into a format suitable for text-to-speech processing.
|
|
|
|
"""
|
|
|
|
|
2025-01-03 17:54:17 -07:00
|
|
|
import re
|
2025-01-04 17:54:54 -07:00
|
|
|
from functools import lru_cache
|
2025-01-03 17:54:17 -07:00
|
|
|
|
2025-01-08 03:13:17 -07:00
|
|
|
# Constants
|
|
|
|
VALID_TLDS = [
|
|
|
|
"com", "org", "net", "edu", "gov", "mil", "int", "biz", "info", "name",
|
2025-01-07 19:34:38 -05:00
|
|
|
"pro", "coop", "museum", "travel", "jobs", "mobi", "tel", "asia", "cat",
|
|
|
|
"xxx", "aero", "arpa", "bg", "br", "ca", "cn", "de", "es", "eu", "fr",
|
2025-01-08 03:13:17 -07:00
|
|
|
"in", "it", "jp", "mx", "nl", "ru", "uk", "us", "io"
|
|
|
|
]
|
|
|
|
|
|
|
|
# Pre-compiled regex patterns for performance
|
|
|
|
EMAIL_PATTERN = re.compile(r"\b[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-z]{2,}\b", re.IGNORECASE)
|
|
|
|
URL_PATTERN = re.compile(
|
|
|
|
r"(https?://|www\.|)+(localhost|[a-zA-Z0-9.-]+(\.(?:" +
|
|
|
|
"|".join(VALID_TLDS) + "))+|[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3})(:[0-9]+)?([/?][^\s]*)?",
|
|
|
|
re.IGNORECASE
|
|
|
|
)
|
2025-01-07 19:34:38 -05:00
|
|
|
|
2025-01-08 03:13:17 -07:00
|
|
|
def split_num(num: re.Match[str]) -> str:
    """Rewrite a matched clock time or four-digit year into a speakable form.

    Args:
        num: Match whose text is a decimal, an ``H:MM`` time, or a
            four-digit year with an optional trailing ``s``.

    Returns:
        The text unchanged for decimals and for years that are read as plain
        numbers; otherwise the time or year split into spoken parts
        (``"9:05"`` -> ``"9 oh 5"``, ``"1905"`` -> ``"19 oh 5"``).
    """
    token = num.group()

    # Decimals are passed through untouched.
    if "." in token:
        return token

    # Clock times: hour and minute are spoken as separate numbers.
    if ":" in token:
        hours, minutes = (int(piece) for piece in token.split(":"))
        if minutes == 0:
            return f"{hours} o'clock"
        if minutes < 10:
            return f"{hours} oh {minutes}"
        return f"{hours} {minutes}"

    # Years below 1100, and years such as 2000-2009, stay as a single number.
    year = int(token[:4])
    if year < 1100 or year % 1000 < 10:
        return token

    century = token[:2]
    remainder = int(token[2:4])
    plural = "s" if token.endswith("s") else ""

    # "hundred" / "oh" wording only applies when the last three digits are
    # at least 100 (e.g. 1900, 1905); 2010 falls through to "20 10".
    if 100 <= year % 1000 <= 999:
        if remainder == 0:
            return f"{century} hundred{plural}"
        if remainder < 10:
            return f"{century} oh {remainder}{plural}"
    return f"{century} {remainder}{plural}"
|
|
|
|
|
2025-01-08 03:13:17 -07:00
|
|
|
def handle_money(m: re.Match[str]) -> str:
    """Spell out a matched ``$`` or ``£`` amount as words.

    Args:
        m: Match whose text starts with a currency symbol followed by an
            amount, optionally ending in a magnitude word such as "million".

    Returns:
        The amount followed by the currency name, with a separate
        cents/pence part when the amount has a decimal fraction.
    """
    amount = m.group()
    is_dollar = amount[0] == "$"
    unit = "dollar" if is_dollar else "pound"
    figure = amount[1:]

    # A trailing letter means a magnitude word follows the number
    # ("$3 million"); the unit is always plural in that case.
    if amount[-1].isalpha():
        return f"{figure} {unit}s"

    # Whole amounts: singular only for exactly "1".
    if "." not in amount:
        plural = "" if figure == "1" else "s"
        return f"{figure} {unit}{plural}"

    whole, fraction = figure.split(".")
    plural = "" if whole == "1" else "s"
    # A single fractional digit counts tenths: ".5" is 50, not 5.
    minor = int(fraction.ljust(2, "0"))

    if is_dollar:
        minor_name = "cent" if minor == 1 else "cents"
    else:
        minor_name = "penny" if minor == 1 else "pence"

    return f"{whole} {unit}{plural} and {minor} {minor_name}"
|
|
|
|
|
2025-01-08 03:13:17 -07:00
|
|
|
def handle_decimal(num: re.Match[str]) -> str:
    """Read a matched decimal aloud as "<whole> point <digit> <digit> ...".

    Args:
        num: Match whose text contains exactly one ``.``.

    Returns:
        The integer part, the word "point", then each fractional digit
        separated by spaces.
    """
    whole, fraction = num.group().split(".")
    # Joining a string with " " puts a space between its characters.
    spoken_digits = " ".join(fraction)
    return f"{whole} point {spoken_digits}"
|
|
|
|
|
2025-01-08 03:13:17 -07:00
|
|
|
def handle_email(m: re.Match[str]) -> str:
    """Turn a matched email address into words a voice can read.

    Args:
        m: Match whose full text is the email address.

    Returns:
        ``"<user> at <domain>"`` with every dot in the domain spoken as
        "dot"; the original text when it does not contain exactly one ``@``.
    """
    address = m.group(0)
    pieces = address.split("@")

    # Anything that is not exactly user@domain is handed back untouched.
    if len(pieces) != 2:
        return address

    local_part, host = pieces
    spoken_host = host.replace(".", " dot ")
    return f"{local_part} at {spoken_host}"
|
|
|
|
|
|
|
|
# Spoken replacement for each symbol that may remain in a URL once the scheme
# and the host's dots have been handled. Applied in a single pass, so the
# hyphen inside "question-mark" is never itself rewritten.
_URL_SYMBOL_WORDS = str.maketrans({
    "-": " dash ",
    "_": " underscore ",
    "?": " question-mark ",
    "=": " equals ",
    "&": " ampersand ",
    ":": " colon ",
    "/": " slash ",
})


def handle_url(u: re.Match[str]) -> str:
    """Make URLs speakable by converting special characters to spoken words.

    Args:
        u: Match whose full text is the URL. A falsy value (e.g. ``None``)
            yields an empty string.

    Returns:
        The URL with its scheme reduced to "http"/"https", a leading "www."
        spoken as "www", dots in the host spoken as "dot", and ``/ - _ ? = &
        :`` spoken as words, with whitespace collapsed to single spaces.
        Dots after the first slash are left as they are.
    """
    if not u:
        return ""

    url = u.group(0).strip()

    # Handle protocol first. The scheme is matched case-insensitively, so it
    # is lower-cased before deciding between "https" and "http"; otherwise
    # "HTTPS://" would be announced as "http".
    url = re.sub(
        r"^https?://",
        lambda scheme: "https " if "https" in scheme.group().lower() else "http ",
        url,
        flags=re.IGNORECASE,
    )
    url = re.sub(r"^www\.", "www ", url, flags=re.IGNORECASE)

    # Split into host and path at the first slash; only the host has its
    # dots spoken.
    host, _, path = url.partition("/")
    host = host.replace(".", " dot ")

    # A bare trailing slash (empty path) is dropped rather than spoken.
    url = f"{host} slash {path}" if path else host

    # Replace remaining symbols with words. This covers port colons and any
    # further slashes inside the path.
    url = url.translate(_URL_SYMBOL_WORDS)

    # Clean up extra spaces
    return re.sub(r"\s+", " ", url).strip()
|
|
|
|
|
|
|
|
|
|
|
|
def normalize_urls(text: str) -> str:
    """Rewrite email addresses and URLs in *text* into speakable words.

    Args:
        text: Raw input text.

    Returns:
        The text with every ``EMAIL_PATTERN`` match replaced via
        ``handle_email`` and then every ``URL_PATTERN`` match replaced via
        ``handle_url``.
    """
    # Emails are rewritten before URLs, so the URL pass runs on text whose
    # addresses have already been spelled out.
    without_emails = EMAIL_PATTERN.sub(handle_email, text)
    return URL_PATTERN.sub(handle_url, without_emails)
|
2025-01-06 19:40:21 -05:00
|
|
|
|
2025-01-07 00:18:44 -07:00
|
|
|
def normalize_text(text: str) -> str:
    """Normalize text for TTS processing.

    Applies, in order: URL/email verbalisation, quote and bracket
    replacement, CJK punctuation mapping, whitespace clean-up, title and
    abbreviation expansion, then number, money and decimal rewriting, and
    finally a few formatting rules for ranges, plurals and initialisms.

    Args:
        text: Raw input text.

    Returns:
        The normalized text with leading and trailing whitespace removed.
    """
    # Pre-process URLs first
    text = normalize_urls(text)

    # Replace quotes and brackets: curly single quotes become "'", guillemets
    # and curly double quotes become '"', and parentheses become guillemets.
    text = text.replace(chr(8216), "'").replace(chr(8217), "'")
    text = text.replace("«", chr(8220)).replace("»", chr(8221))
    text = text.replace(chr(8220), '"').replace(chr(8221), '"')
    text = text.replace("(", "«").replace(")", "»")

    # Handle CJK punctuation: map the ideographic / full-width marks to their
    # ASCII counterparts plus a space. The source characters are written as
    # escapes so they cannot be folded to ASCII by accident; mapping ASCII
    # "," and ":" here would put a space inside "1,000" and "10:30" and stop
    # the number rules below from matching them.
    for a, b in zip("\u3001\u3002\uff01\uff0c\uff1a\uff1b\uff1f", ",.!,:;?"):
        text = text.replace(a, b + " ")

    # Clean up whitespace: any whitespace other than space/newline becomes a
    # space, runs of spaces collapse, and space-only lines are emptied.
    text = re.sub(r"[^\S \n]", " ", text)
    text = re.sub(r" +", " ", text)
    text = re.sub(r"(?<=\n) +(?=\n)", "", text)

    # Handle titles and abbreviations
    text = re.sub(r"\bD[Rr]\.(?= [A-Z])", "Doctor", text)
    text = re.sub(r"\b(?:Mr\.|MR\.(?= [A-Z]))", "Mister", text)
    text = re.sub(r"\b(?:Ms\.|MS\.(?= [A-Z]))", "Miss", text)
    text = re.sub(r"\b(?:Mrs\.|MRS\.(?= [A-Z]))", "Mrs", text)
    text = re.sub(r"\betc\.(?! [A-Z])", "etc", text)

    # Handle common words
    text = re.sub(r"(?i)\b(y)eah?\b", r"\1e'a", text)

    # Handle numbers and money. Order matters: times and years are split
    # first, thousands separators are removed, money is spelled out, and
    # only then are the remaining decimals read digit by digit.
    text = re.sub(
        r"\d*\.\d+|\b\d{4}s?\b|(?<!:)\b(?:[1-9]|1[0-2]):[0-5]\d\b(?!:)",
        split_num,
        text
    )
    text = re.sub(r"(?<=\d),(?=\d)", "", text)
    text = re.sub(
        r"(?i)[$£]\d+(?:\.\d+)?(?: hundred| thousand| (?:[bm]|tr)illion)*\b|[$£]\d+\.\d\d?\b",
        handle_money,
        text,
    )
    text = re.sub(r"\d*\.\d+", handle_decimal, text)

    # Handle various formatting
    # A hyphen between digits is read as a range ("5-10" -> "5 to 10").
    text = re.sub(r"(?<=\d)-(?=\d)", " to ", text)
    text = re.sub(r"(?<=\d)S", " S", text)
    # Plural/possessive "s" after an upper-case consonant becomes "'S".
    text = re.sub(r"(?<=[BCDFGHJ-NP-TV-Z])'?s\b", "'S", text)
    text = re.sub(r"(?<=X')S\b", "s", text)
    # Dotted initialisms followed by a lower-case word: dots become hyphens.
    text = re.sub(
        r"(?:[A-Za-z]\.){2,} [a-z]",
        lambda m: m.group().replace(".", "-"),
        text
    )
    text = re.sub(r"(?i)(?<=[A-Z])\.(?=[A-Z])", "-", text)

    return text.strip()
|