mirror of
https://github.com/remsky/Kokoro-FastAPI.git
synced 2025-04-13 09:39:17 +00:00
Added tests, slight changes to regex
This commit is contained in:
parent
db2f3dd323
commit
fddf26c905
2 changed files with 57 additions and 21 deletions
|
@ -50,29 +50,44 @@ def handle_decimal(num: re.Match) -> str:
|
|||
return " point ".join([a, " ".join(b)])
|
||||
|
||||
def handle_url(u: re.Match) -> str:
    """Make URLs speakable.

    Replaces each URL punctuation symbol with its spoken word, padded
    with spaces, then collapses the resulting whitespace runs.

    Args:
        u: Regex match whose group(0) is the URL text to convert.

    Returns:
        The URL with symbols spelled out as words, single-spaced.
    """
    symbol_to_word = {
        ":": "colon",
        "/": "slash",
        ".": "dot",
        "_": "underscore",
        "-": "dash",
        "?": "question mark",
        "=": "equals",
        "&": "ampersand",
        "%": "percent",
    }

    url = u.group(0)

    for symbol, word in symbol_to_word.items():
        url = url.replace(symbol, f" {word} ")

    # Each replacement above pads with spaces, so adjacent symbols produce
    # runs of whitespace; a single pair-replace cannot collapse them all.
    # Normalize every run to one space and trim the ends.
    return re.sub(r"\s+", " ", url).strip()
|
||||
|
||||
# @lru_cache(maxsize=1000) # Cache normalized text results
|
||||
def normalize_text(text: str) -> str:
|
||||
"""Normalize text for TTS processing
|
||||
|
||||
Args:
|
||||
text: Input text to normalize
|
||||
def handle_url(u: re.Match) -> str:
    """Make URLs speakable by converting special characters to spoken words.

    Args:
        u: Regex match for a URL or email address; group(0) is converted.
           A falsy match yields an empty string.

    Returns:
        The spoken-word form of the matched text, single-spaced and trimmed.
    """
    if not u:
        return ""

    url = u.group(0).strip()

    # Handle common URL prefixes: "https://x" and "http://x" both read as
    # "http x"; "www.x" reads as "www x" (the dot is consumed here so it is
    # not also spoken as "dot" below).
    url = re.sub(r"^https?://", "http ", url, flags=re.IGNORECASE)
    url = re.sub(r"^www\.", "www ", url, flags=re.IGNORECASE)

    # Replace remaining symbols with words
    url = url.replace("/", " slash ")
    url = url.replace(".", " dot ")
    url = url.replace("@", " at ")
    url = url.replace("?", " question mark ")
    url = url.replace("=", " equals ")
    url = url.replace("&", " ampersand ")

    # Clean up extra spaces introduced by the padded replacements
    return re.sub(r"\s+", " ", url).strip()
|
||||
|
||||
|
||||
def normalize_urls(text: str) -> str:
    """Pre-process URLs before other text normalization.

    Runs `handle_url` over every http(s) URL, www URL, and email address
    found in *text*, in that order, leaving all other text untouched.
    """
    url_like_patterns = (
        r"https?://[^\s]+",  # URLs with http(s)
        r"www\.[^\s]+",  # URLs with www
        r"\b[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-z]{2,}\b",  # Email addresses
    )

    for pattern in url_like_patterns:
        text = re.sub(pattern, handle_url, text, flags=re.IGNORECASE)

    return text
|
||||
|
||||
def normalize_text(text: str) -> str:
|
||||
"""Normalize text for TTS processing"""
|
||||
# Pre-process URLs first
|
||||
text = normalize_urls(text)
|
||||
# Replace quotes and brackets
|
||||
text = text.replace(chr(8216), "'").replace(chr(8217), "'")
|
||||
text = text.replace("«", chr(8220)).replace("»", chr(8221))
|
||||
|
|
21
api/tests/test_normalizer.py
Normal file
21
api/tests/test_normalizer.py
Normal file
|
@ -0,0 +1,21 @@
|
|||
"""Tests for text normalization service"""
|
||||
|
||||
import pytest
|
||||
from api.src.services.text_processing.normalizer import normalize_text
|
||||
|
||||
def test_urls():
    """Test URL handling"""
    cases = [
        # URLs with http/https
        ("Check out https://example.com", "Check out http example dot com"),
        ("Visit http://site.com/docs", "Visit http site dot com slash docs"),
        # URLs with www
        ("Go to www.example.com", "Go to www example dot com"),
        # Email addresses
        ("Email me at user@example.com", "Email me at user at example dot com"),
        # Normal text should be unaffected, other than downstream normalization
        ("This is not.a.url text", "This is not-a-url text"),
        ("Hello, how are you today?", "Hello, how are you today?"),
        ("It costs $50.", "It costs 50 dollars."),
    ]
    for raw, expected in cases:
        assert normalize_text(raw) == expected
|
Loading…
Add table
Reference in a new issue