Added tests, slight changes to regex

This commit is contained in:
remsky 2025-01-07 00:18:44 -07:00
parent db2f3dd323
commit fddf26c905
2 changed files with 57 additions and 21 deletions

View file

@ -50,29 +50,44 @@ def handle_decimal(num: re.Match) -> str:
return " point ".join([a, " ".join(b)])
def handle_url(u: re.Match) -> str:
    """Make a matched URL speakable by spelling out its symbols.

    Args:
        u: Regex match whose full text (group 0) is the URL to convert.

    Returns:
        The matched text with every mapped symbol replaced by its spoken
        word, and runs of spaces collapsed to a single space.
    """
    symbol_to_word = {
        ":": "colon",
        "/": "slash",
        ".": "dot",
        "_": "underscore",
        "-": "dash",
        "?": "question mark",
        "=": "equals",
        "&": "ampersand",
        "%": "percent",
    }
    spoken = u.group(0)
    for symbol, word in symbol_to_word.items():
        # Pad each word with spaces so it never fuses with its neighbours.
        spoken = spoken.replace(symbol, f" {word} ")
    # Adjacent symbols (e.g. "://") leave consecutive spaces behind; the
    # previous single-space-to-single-space replace was a no-op, so collapse
    # every run of spaces explicitly.
    return re.sub(r" {2,}", " ", spoken)
# @lru_cache(maxsize=1000) # Cache normalized text results
def normalize_text(text: str) -> str:
"""Normalize text for TTS processing
Args:
text: Input text to normalize
"""Make URLs speakable by converting special characters to spoken words"""
if not u:
return ""
Returns:
Normalized text
"""
# Handle URL's
text = re.sub(r"(http(s)?:\/\/.)?(www\.)?[-a-zA-Z0-9@:%._\+~#=]{2,256}\.[a-z]{2,6}\b([-a-zA-Z0-9@:%_\+.~#?&//=]*)", handle_url,text)
url = u.group(0).strip()
# Handle common URL prefixes
url = re.sub(r'^https?://', 'http ', url, flags=re.IGNORECASE)
url = re.sub(r'^www\.', 'www ', url, flags=re.IGNORECASE)
# Replace symbols with words
url = url.replace("/", " slash ")
url = url.replace(".", " dot ")
url = url.replace("@", " at ")
url = url.replace("?", " question mark ")
url = url.replace("=", " equals ")
url = url.replace("&", " ampersand ")
# Clean up extra spaces
return re.sub(r'\s+', ' ', url).strip()
def normalize_urls(text: str) -> str:
    """Rewrite URLs and email addresses before other text normalization.

    Each pattern is applied in turn, case-insensitively, and every match is
    handed to ``handle_url`` for conversion.
    """
    matchers = (
        r"https?://[^\s]+",  # anything introduced by http:// or https://
        r"www\.[^\s]+",  # hosts introduced by www.
        r"\b[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-z]{2,}\b",  # email addresses
    )
    result = text
    for expr in matchers:
        result = re.compile(expr, re.IGNORECASE).sub(handle_url, result)
    return result
def normalize_text(text: str) -> str:
"""Normalize text for TTS processing"""
# Pre-process URLs first
text = normalize_urls(text)
# Replace quotes and brackets
text = text.replace(chr(8216), "'").replace(chr(8217), "'")
text = text.replace("«", chr(8220)).replace("»", chr(8221))

View file

@ -0,0 +1,21 @@
"""Tests for text normalization service"""
import pytest
from api.src.services.text_processing.normalizer import normalize_text
def test_urls():
    """Check how normalize_text renders URLs, emails, and ordinary text."""
    expectations = [
        # URLs with http/https
        ("Check out https://example.com", "Check out http example dot com"),
        ("Visit http://site.com/docs", "Visit http site dot com slash docs"),
        # URLs with www
        ("Go to www.example.com", "Go to www example dot com"),
        # Email addresses
        ("Email me at user@example.com", "Email me at user at example dot com"),
        # Normal text should be unaffected, other than downstream normalization
        ("This is not.a.url text", "This is not-a-url text"),
        ("Hello, how are you today?", "Hello, how are you today?"),
        ("It costs $50.", "It costs 50 dollars."),
    ]
    for raw, spoken in expectations:
        assert normalize_text(raw) == spoken