Kokoro-FastAPI/api/src/services/text_processing/normalizer.py

"""
Text normalization module for TTS processing.
Handles various text formats including URLs, emails, numbers, money, and special characters.
Converts them into a format suitable for text-to-speech processing.
"""

import re
from functools import lru_cache
import inflect

from ...structures.schemas import NormalizationOptions

# Constants
# Top-level domains accepted by URL_PATTERN. The order is kept stable because
# the list is joined into a regex alternation.
VALID_TLDS = [
    # Generic and sponsored TLDs
    "com", "org", "net", "edu", "gov", "mil", "int", "biz", "info", "name",
    "pro", "coop", "museum", "travel", "jobs", "mobi", "tel", "asia", "cat",
    "xxx", "aero", "arpa",
    # Country-code TLDs
    "bg", "br", "ca", "cn", "de", "es", "eu", "fr", "in", "it", "jp", "mx",
    "nl", "ru", "uk", "us", "io",
]

# Maps a unit abbreviation (matched case-insensitively by UNIT_PATTERN) to its
# spoken name. Names are stored in the singular: handle_units pluralises the
# first word of the name according to the number that precedes the unit.
VALID_UNITS = {
    # Length
    "m": "meter", "cm": "centimeter", "mm": "millimeter", "km": "kilometer",
    "in": "inch", "ft": "foot", "yd": "yard", "mi": "mile",
    # Mass
    "g": "gram", "kg": "kilogram", "mg": "milligram",
    # Time
    "s": "second", "ms": "millisecond", "min": "minute", "h": "hour",
    # Volume
    "l": "liter", "ml": "milliliter", "cl": "centiliter", "dl": "deciliter",
    # Speed
    "kph": "kilometer per hour", "mph": "mile per hour",
    "mi/h": "mile per hour", "m/s": "meter per second",
    "km/h": "kilometer per hour", "mm/s": "millimeter per second",
    "cm/s": "centimeter per second", "ft/s": "foot per second",
    "cm/h": "centimeter per hour",
    # Temperature
    # A bare "f" is deliberately absent here: it is defined under Capacitance
    # (farad), and a dict cannot hold both meanings for the same key.
    "°c": "degree celsius", "c": "degree celsius",
    "°f": "degree fahrenheit", "k": "kelvin",
    # Pressure
    "pa": "pascal", "kpa": "kilopascal", "mpa": "megapascal",
    "atm": "atmosphere",
    # Frequency
    "hz": "hertz", "khz": "kilohertz", "mhz": "megahertz", "ghz": "gigahertz",
    # Voltage
    "v": "volt", "kv": "kilovolt", "mv": "megavolt",
    # Current
    "a": "amp", "ma": "megaamp", "ka": "kiloamp",
    # Power
    "w": "watt", "kw": "kilowatt", "mw": "megawatt",
    # Energy
    "j": "joule", "kj": "kilojoule", "mj": "megajoule",
    # Resistance (Ohm)
    "Ω": "ohm", "kΩ": "kiloohm", "mΩ": "megaohm",
    # Capacitance
    "f": "farad", "µf": "microfarad", "nf": "nanofarad", "pf": "picofarad",
    # Data size ("bit" names are switched to "byte" by handle_units when the
    # text uses an upper-case B)
    "b": "bit", "kb": "kilobit", "mb": "megabit", "gb": "gigabit",
    "tb": "terabit", "pb": "petabit",
    # Data rate
    "kbps": "kilobit per second", "mbps": "megabit per second",
    "gbps": "gigabit per second", "tbps": "terabit per second",
    # CSS units
    "px": "pixel",
}


# Pre-compiled regex patterns for performance
# Matches an e-mail address: a local part, "@", then a dotted domain whose
# final label is at least two letters long. Case-insensitive.
EMAIL_PATTERN = re.compile(
    r"\b[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-z]{2,}\b", re.IGNORECASE
)
# Matches a URL-like token: an optional "http://", "https://" or "www." prefix,
# then either "localhost", a host name ending in one of VALID_TLDS, or four
# dot-separated groups of 1-3 digits; optionally followed by ":<port>" and a
# path/query that runs up to the next whitespace. Case-insensitive.
URL_PATTERN = re.compile(
    r"(https?://|www\.|)+(localhost|[a-zA-Z0-9.-]+(\.(?:"
    + "|".join(VALID_TLDS)
    + "))+|[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3})(:[0-9]+)?([/?][^\s]*)?",
    re.IGNORECASE,
)

# Matches a number followed (optional whitespace in between) by one of the
# VALID_UNITS keys. Case-insensitive.
#   group(1): the whole number, including sign, thousands separators, decimals
#   group(6): the unit exactly as written in the text
# The keys are sorted in reverse lexicographic order, so a key that is a prefix
# of another one (e.g. "m" / "mm" / "mm/s") comes after the longer key in the
# alternation and the longer unit is tried first.
# NOTE(review): the keys are not passed through re.escape; confirm this stays
# safe if units containing regex metacharacters are ever added.
# NOTE(review): because whitespace is optional and "s" is a unit, text such as
# "1990s" appears to match as a number of seconds - confirm this is intended.
UNIT_PATTERN = re.compile(r"((?<!\w)([+-]?)(\d{1,3}(,\d{3})*|\d+)(\.\d+)?)\s*(" + "|".join(sorted(list(VALID_UNITS.keys()),reverse=True)) + r"""){1}(?=[^\w\d]{1}|\b)""",re.IGNORECASE)

# Shared inflect engine; handle_units uses it to pluralise unit names.
INFLECT_ENGINE=inflect.engine()

def split_num(num: re.Match[str]) -> str:
    """Rewrite a matched decimal, clock time or four-digit year for speech.

    Args:
        num: Regex match whose text is a decimal ("3.14"), a clock time
            ("3:05") or a four-digit number with an optional trailing "s"
            ("1990s").

    Returns:
        The matched text unchanged for decimals and for numbers that are not
        read as years; otherwise a spoken-style split such as "3 oh 5",
        "12 o'clock", "19 90s" or "19 hundred".
    """
    token = num.group()

    # Decimals are left alone at this stage.
    if "." in token:
        return token

    # Clock times: "<hours>:<minutes>".
    if ":" in token:
        hours, minutes = map(int, token.split(":"))
        if minutes == 0:
            return f"{hours} o'clock"
        if minutes < 10:
            return f"{hours} oh {minutes}"
        return f"{hours} {minutes}"

    year = int(token[:4])
    within_millennium = year % 1000
    # Numbers below 1100, and those like 2000-2009, are not split.
    if year < 1100 or within_millennium < 10:
        return token

    century = token[:2]
    tail = int(token[2:4])
    suffix = "s" if token.endswith("s") else ""

    # The "hundred" / "oh" readings apply only when the hundreds digit is
    # non-zero (e.g. 1900, 1905); years like 2010 fall through to "20 10".
    if within_millennium >= 100:
        if tail == 0:
            return f"{century} hundred{suffix}"
        if tail < 10:
            return f"{century} oh {tail}{suffix}"
    return f"{century} {tail}{suffix}"

def handle_units(u: re.Match[str]) -> str:
    """Convert a matched "<number><unit>" token into its spoken form.

    Args:
        u: Match produced by UNIT_PATTERN; group(1) is the number and
            group(6) is the unit abbreviation as written in the text.

    Returns:
        The number followed by the unit's full name, with the first word of
        the name pluralised to agree with the number (e.g. "5 kilometers").
        If the unit cannot be resolved, the matched text is returned unchanged.
    """
    unit_string = u.group(6).strip()
    lowered = unit_string.lower()

    spoken = VALID_UNITS.get(lowered)
    if spoken is None:
        # Some keys are not lower-case themselves (the ohm entries:
        # "Ω".lower() is "ω"), so a direct lookup of the lowered text misses
        # them. Fall back to comparing against lower-cased keys.
        spoken = next(
            (name for key, name in VALID_UNITS.items() if key.lower() == lowered),
            None,
        )
    if spoken is None:
        # Unknown unit: keep the original text rather than dropping the number.
        return u.group(0)

    words = spoken.split(" ")

    # Handles the B vs b case: an upper-case "B" in the written unit means
    # bytes, so "bit" in the spoken name is replaced by "byte". The letter
    # inspected is the second character, or the only one for a bare "b"/"B".
    if words[0].endswith("bit"):
        b_case = unit_string[min(1, len(unit_string) - 1)]
        if b_case == "B":
            words[0] = words[0][:-3] + "byte"

    # inflect's no() returns "<count> <noun>" with the noun pluralised to
    # agree with the count.
    number = u.group(1).strip()
    words[0] = INFLECT_ENGINE.no(words[0], number)
    return " ".join(words)

def handle_money(m: re.Match[str]) -> str:
    """Turn a matched money expression into words.

    Args:
        m: Regex match whose text starts with a currency symbol ("$" is read
            as dollars, anything else as pounds) followed by an amount and,
            optionally, a magnitude word such as "million".

    Returns:
        The spoken form, e.g. "5 million dollars", "1 dollar" or
        "2 pounds and 1 penny".
    """
    text = m.group()
    amount = text[1:]
    is_dollar = text[0] == "$"
    bill = "dollar" if is_dollar else "pound"

    # Amount ending in a word ("$5 million"): always plural.
    if text[-1].isalpha():
        return f"{amount} {bill}s"

    # Whole amount without a fractional part.
    if "." not in text:
        plural = "" if amount == "1" else "s"
        return f"{amount} {bill}{plural}"

    whole, fraction = amount.split(".")
    plural = "" if whole == "1" else "s"
    # A single fractional digit means tens ("$1.5" is 50 cents).
    minor = int(fraction.ljust(2, "0"))
    if is_dollar:
        coins = "cent" if minor == 1 else "cents"
    else:
        coins = "penny" if minor == 1 else "pence"
    return f"{whole} {bill}{plural} and {minor} {coins}"


def handle_decimal(num: re.Match[str]) -> str:
    """Spell out a matched decimal number as "<whole> point <digit> <digit> ...".

    Args:
        num: Regex match whose text contains exactly one "."

    Returns:
        The integer part, the word "point", then each fractional digit
        separated by spaces (e.g. "3.14" becomes "3 point 1 4").
    """
    whole, fraction = num.group().split(".")
    spelled_digits = " ".join(fraction)
    return f"{whole} point {spelled_digits}"


def handle_email(m: re.Match[str]) -> str:
    """Make a matched e-mail address speakable.

    Args:
        m: Regex match whose text is the e-mail address.

    Returns:
        "<user> at <domain>" with every "." in the domain spoken as " dot ".
        Text that does not contain exactly one "@" is returned unchanged.
    """
    address = m.group(0)
    if address.count("@") != 1:
        return address

    local_part, _, host = address.partition("@")
    spoken_host = " dot ".join(host.split("."))
    return f"{local_part} at {spoken_host}"


def handle_url(u: re.Match[str]) -> str:
    """Make URLs speakable by converting special characters to spoken words.

    Args:
        u: Regex match whose text is the URL. A falsy value yields "".

    Returns:
        The URL with its scheme, "www." prefix, port, dots in the host part
        and common symbols replaced by words, and whitespace collapsed to
        single spaces (e.g. "http://localhost:3000" becomes
        "http localhost colon 3000").
    """
    if not u:
        return ""

    url = u.group(0).strip()

    # Handle protocol first. The scheme is matched case-insensitively, so it
    # must be lower-cased before checking for "https"; otherwise "HTTPS://"
    # would be spoken as "http".
    url = re.sub(
        r"^https?://",
        lambda a: "https " if "https" in a.group().lower() else "http ",
        url,
        flags=re.IGNORECASE,
    )
    url = re.sub(r"^www\.", "www ", url, flags=re.IGNORECASE)

    # Handle port numbers before other replacements
    url = re.sub(r":(\d+)(?=/|$)", lambda m: f" colon {m.group(1)}", url)

    # Split into domain and path at the first slash
    domain, _, path = url.partition("/")

    # Only dots before the first slash are spoken as "dot"
    domain = domain.replace(".", " dot ")

    # Reconstruct URL
    url = f"{domain} slash {path}" if path else domain

    # Replace remaining symbols with words. Order matters: "-" must be handled
    # before "?" because the "question-mark" replacement itself contains "-".
    symbol_words = (
        ("-", " dash "),
        ("_", " underscore "),
        ("?", " question-mark "),
        ("=", " equals "),
        ("&", " ampersand "),
        ("%", " percent "),
        (":", " colon "),  # Handle any remaining colons
        ("/", " slash "),  # Handle any remaining slashes
    )
    for symbol, word in symbol_words:
        url = url.replace(symbol, word)

    # Clean up extra spaces
    return re.sub(r"\s+", " ", url).strip()


def normalize_text(text: str,normalization_options: NormalizationOptions) -> str:
    """Normalize text for TTS processing.

    Applies a fixed sequence of rewrites; later steps operate on the output of
    earlier ones, so the order of the statements below is significant.

    Args:
        text: The raw input text.
        normalization_options: Flags read here are ``email_normalization``,
            ``url_normalization``, ``unit_normalization`` and
            ``optional_pluralization_normalization``; each enables the
            corresponding optional step.

    Returns:
        The normalized text with leading and trailing whitespace stripped.
    """
    # Handle email addresses first if enabled
    if normalization_options.email_normalization:
        text = EMAIL_PATTERN.sub(handle_email, text)

    # Handle URLs if enabled
    if normalization_options.url_normalization:
        text = URL_PATTERN.sub(handle_url, text)

    # Pre-process numbers with units if enabled
    if normalization_options.unit_normalization:
        text=UNIT_PATTERN.sub(handle_units,text)
    
    # Replace optional pluralization: "word(s)" becomes "words"
    if normalization_options.optional_pluralization_normalization:
        text = re.sub(r"\(s\)","s",text)
    
    # Replace quotes and brackets:
    #   - curly single quotes (U+2018, U+2019) become "'"
    #   - guillemets become curly double quotes, which then become '"'
    #   - parentheses become guillemets (done last, so they stay guillemets)
    text = text.replace(chr(8216), "'").replace(chr(8217), "'")
    text = text.replace("«", chr(8220)).replace("»", chr(8221))
    text = text.replace(chr(8220), '"').replace(chr(8221), '"')
    text = text.replace("(", "«").replace(")", "»")

    # Handle CJK punctuation and some non standard chars: each character of the
    # first string is replaced by its counterpart in the second plus a space
    for a, b in zip("、。！，：；？–", ",.!,:;?-"):
        text = text.replace(a, b + " ")

    # Clean up whitespace: turn any whitespace other than space/newline into a
    # space, collapse runs of spaces, and empty out lines that hold only spaces
    text = re.sub(r"[^\S \n]", " ", text)
    text = re.sub(r"  +", " ", text)
    text = re.sub(r"(?<=\n) +(?=\n)", "", text)

    # Handle titles and abbreviations. The upper-case forms (and "Dr.") are
    # only expanded when followed by a space and a capital letter; "etc." loses
    # its period unless it is followed by a space and a capital letter.
    text = re.sub(r"\bD[Rr]\.(?= [A-Z])", "Doctor", text)
    text = re.sub(r"\b(?:Mr\.|MR\.(?= [A-Z]))", "Mister", text)
    text = re.sub(r"\b(?:Ms\.|MS\.(?= [A-Z]))", "Miss", text)
    text = re.sub(r"\b(?:Mrs\.|MRS\.(?= [A-Z]))", "Mrs", text)
    text = re.sub(r"\betc\.(?! [A-Z])", "etc", text)

    # Handle common words: "yea"/"yeah" (any case) becomes "ye'a", keeping the
    # original case of the leading "y"
    text = re.sub(r"(?i)\b(y)eah?\b", r"\1e'a", text)

    # Handle numbers and money. split_num receives decimals (returned as-is),
    # four-digit numbers with an optional trailing "s", and h:mm times whose
    # hour is 1-12.
    text = re.sub(
        r"\d*\.\d+|\b\d{4}s?\b|(?<!:)\b(?:[1-9]|1[0-2]):[0-5]\d\b(?!:)", split_num, text
    )
    
    # Drop commas that sit between two digits (thousands separators)
    text = re.sub(r"(?<=\d),(?=\d)", "", text)
    # "$"/"£" amounts, optionally followed by hundred/thousand/million/
    # billion/trillion
    text = re.sub(
        r"(?i)[$£]\d+(?:\.\d+)?(?: hundred| thousand| (?:[bm]|tr)illion)*\b|[$£]\d+\.\d\d?\b",
        handle_money,
        text,
    )
    
    # Any decimals still present are spelled out digit by digit
    text = re.sub(r"\d*\.\d+", handle_decimal, text)

    # Handle various formatting
    # A hyphen between two digits is read as a range ("to")
    text = re.sub(r"(?<=\d)-(?=\d)", " to ", text)
    # Separate an upper-case "S" from a preceding digit
    text = re.sub(r"(?<=\d)S", " S", text)
    # After an upper-case consonant, a word-final "s" or "'s" becomes "'S" ...
    text = re.sub(r"(?<=[BCDFGHJ-NP-TV-Z])'?s\b", "'S", text)
    # ... except after "X", where the "S" is turned back into "s"
    text = re.sub(r"(?<=X')S\b", "s", text)
    # Dotted initialisms followed by a space and a lower-case letter: the dots
    # become hyphens
    text = re.sub(
        r"(?:[A-Za-z]\.){2,} [a-z]", lambda m: m.group().replace(".", "-"), text
    )
    # A period directly between two letters (any case) becomes a hyphen
    text = re.sub( r"(?i)(?<=[A-Z])\.(?=[A-Z])", "-", text)

    return text.strip()
-add email handling, minor additional URL processing, tests 2025-01-08 03:13:17 -07:00			`"""`
			`Text normalization module for TTS processing.`
			`Handles various text formats including URLs, emails, numbers, money, and special characters.`
			`Converts them into a format suitable for text-to-speech processing.`
			`"""`

- CPU ONNX + PyTorch CUDA, functional - Incorporated text processing module as service, towards modularization and optimizations - Added text processing router for phonemization - Enhanced benchmark statistics with real-time speed metrics 2025-01-03 17:54:17 -07:00			`import re`
First streaming attempt 2025-01-04 17:54:54 -07:00			`from functools import lru_cache`
Made the api use the normalizer, fixed the wrong version of espeak, added better normilzation, improved the sentence splitting, fixed some formatting 2025-02-10 21:45:05 -05:00			`import inflect`
- CPU ONNX + PyTorch CUDA, functional - Incorporated text processing module as service, towards modularization and optimizations - Added text processing router for phonemization - Enhanced benchmark statistics with real-time speed metrics 2025-01-03 17:54:17 -07:00
Added normilization options 2025-02-11 19:09:35 -05:00			`from ...structures.schemas import NormalizationOptions`

-add email handling, minor additional URL processing, tests 2025-01-08 03:13:17 -07:00			`# Constants`
			`VALID_TLDS = [`
Ruff format + fix 2025-01-09 18:41:44 -07:00			`"com",`
			`"org",`
			`"net",`
			`"edu",`
			`"gov",`
			`"mil",`
			`"int",`
			`"biz",`
			`"info",`
			`"name",`
			`"pro",`
			`"coop",`
			`"museum",`
			`"travel",`
			`"jobs",`
			`"mobi",`
			`"tel",`
			`"asia",`
			`"cat",`
			`"xxx",`
			`"aero",`
			`"arpa",`
			`"bg",`
			`"br",`
			`"ca",`
			`"cn",`
			`"de",`
			`"es",`
			`"eu",`
			`"fr",`
			`"in",`
			`"it",`
			`"jp",`
			`"mx",`
			`"nl",`
			`"ru",`
			`"uk",`
			`"us",`
			`"io",`
-add email handling, minor additional URL processing, tests 2025-01-08 03:13:17 -07:00			`]`

Made the api use the normalizer, fixed the wrong version of espeak, added better normilzation, improved the sentence splitting, fixed some formatting 2025-02-10 21:45:05 -05:00			`VALID_UNITS = {`
			`"m":"meter", "cm":"centimeter", "mm":"millimeter", "km":"kilometer", "in":"inch", "ft":"foot", "yd":"yard", "mi":"mile", # Length`
			`"g":"gram", "kg":"kilogram", "mg":"miligram", # Mass`
			`"s":"second", "ms":"milisecond", "min":"minutes", "h":"hour", # Time`
			`"l":"liter", "ml":"mililiter", "cl":"centiliter", "dl":"deciliter", # Volume`
made it so bytes vs bits are translated correctly 2025-02-11 15:18:10 +00:00			`"kph":"kilometer per hour", "mph":"mile per hour","mi/h":"mile per hour", "m/s":"meter per second", "km/h":"kilometer per hour", "mm/s":"milimeter per second","cm/s":"centimeter per second", "ft/s":"feet per second","cm/h":"centimeter per day", # Speed`
Made the api use the normalizer, fixed the wrong version of espeak, added better normilzation, improved the sentence splitting, fixed some formatting 2025-02-10 21:45:05 -05:00			`"°c":"degree celsius","c":"degree celsius", "°f":"degree fahrenheit","f":"degree fahrenheit", "k":"kelvin", # Temperature`
			`"pa":"pascal", "kpa":"kilopascal", "mpa":"megapascal", "atm":"atmosphere", # Pressure`
			`"hz":"hertz", "khz":"kilohertz", "mhz":"megahertz", "ghz":"gigahertz", # Frequency`
			`"v":"volt", "kv":"kilovolt", "mv":"mergavolt", # Voltage`
			`"a":"amp", "ma":"megaamp", "ka":"kiloamp", # Current`
			`"w":"watt", "kw":"kilowatt", "mw":"megawatt", # Power`
			`"j":"joule", "kj":"kilojoule", "mj":"megajoule", # Energy`
			`"Ω":"ohm", "kΩ":"kiloohm", "mΩ":"megaohm", # Resistance (Ohm)`
			`"f":"farad", "µf":"microfarad", "nf":"nanofarad", "pf":"picofarad", # Capacitance`
made it so bytes vs bits are translated correctly 2025-02-11 15:18:10 +00:00			`"b":"bit", "kb":"kilobit", "mb":"megabit", "gb":"gigabit", "tb":"terabit", "pb":"petabit", # Data size`
			`"kbps":"kilobit per second","mbps":"megabit per second","gbps":"gigabit per second","tbps":"terabit per second",`
Made the api use the normalizer, fixed the wrong version of espeak, added better normilzation, improved the sentence splitting, fixed some formatting 2025-02-10 21:45:05 -05:00			`"px":"pixel" # CSS units`
			`}`

made it so bytes vs bits are translated correctly 2025-02-11 15:18:10 +00:00
-add email handling, minor additional URL processing, tests 2025-01-08 03:13:17 -07:00			`# Pre-compiled regex patterns for performance`
Ruff format + fix 2025-01-09 18:41:44 -07:00			`EMAIL_PATTERN = re.compile(`
			`r"\b[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-z]{2,}\b", re.IGNORECASE`
			`)`
-add email handling, minor additional URL processing, tests 2025-01-08 03:13:17 -07:00			`URL_PATTERN = re.compile(`
Ruff format + fix 2025-01-09 18:41:44 -07:00			`r"(https?://\|www\.\|)+(localhost\|[a-zA-Z0-9.-]+(\.(?:"`
			`+ "\|".join(VALID_TLDS)`
			`+ "))+\|[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3})(:[0-9]+)?([/?][^\s]*)?",`
			`re.IGNORECASE,`
-add email handling, minor additional URL processing, tests 2025-01-08 03:13:17 -07:00			`)`
Fix url parsing for urls without https, http, or www. It also allows raw ips, ports, and dashs 2025-01-07 19:34:38 -05:00
made it so bytes vs bits are translated correctly 2025-02-11 15:18:10 +00:00			`UNIT_PATTERN = re.compile(r"((?<!\w)([+-]?)(\d{1,3}(,\d{3})\|\d+)(\.\d+)?)\s(" + "\|".join(sorted(list(VALID_UNITS.keys()),reverse=True)) + r"""){1}(?=[^\w\d]{1}\|\b)""",re.IGNORECASE)`
Made the api use the normalizer, fixed the wrong version of espeak, added better normilzation, improved the sentence splitting, fixed some formatting 2025-02-10 21:45:05 -05:00
			`INFLECT_ENGINE=inflect.engine()`
Ruff format + fix 2025-01-09 18:41:44 -07:00
-add email handling, minor additional URL processing, tests 2025-01-08 03:13:17 -07:00			`def split_num(num: re.Match[str]) -> str:`
- CPU ONNX + PyTorch CUDA, functional - Incorporated text processing module as service, towards modularization and optimizations - Added text processing router for phonemization - Enhanced benchmark statistics with real-time speed metrics 2025-01-03 17:54:17 -07:00			`"""Handle number splitting for various formats"""`
			`num = num.group()`
			`if "." in num:`
			`return num`
			`elif ":" in num:`
			`h, m = [int(n) for n in num.split(":")]`
			`if m == 0:`
			`return f"{h} o'clock"`
			`elif m < 10:`
			`return f"{h} oh {m}"`
			`return f"{h} {m}"`
			`year = int(num[:4])`
			`if year < 1100 or year % 1000 < 10:`
			`return num`
			`left, right = num[:2], int(num[2:4])`
			`s = "s" if num.endswith("s") else ""`
			`if 100 <= year % 1000 <= 999:`
			`if right == 0:`
			`return f"{left} hundred{s}"`
			`elif right < 10:`
			`return f"{left} oh {right}{s}"`
			`return f"{left} {right}{s}"`

Made the api use the normalizer, fixed the wrong version of espeak, added better normilzation, improved the sentence splitting, fixed some formatting 2025-02-10 21:45:05 -05:00			`def handle_units(u: re.Match[str]) -> str:`
Added normilization options 2025-02-11 19:09:35 -05:00			`"""Converts units to their full form"""`
made it so bytes vs bits are translated correctly 2025-02-11 15:18:10 +00:00			`unit_string=u.group(6).strip()`
			`unit=unit_string`

			`if unit_string.lower() in VALID_UNITS:`
			`unit=VALID_UNITS[unit_string.lower()].split(" ")`

			`# Handles the B vs b case`
			`if unit[0].endswith("bit"):`
			`b_case=unit_string[min(1,len(unit_string) - 1)]`
			`if b_case == "B":`
			`unit[0]=unit[0][:-3] + "byte"`

Made the api use the normalizer, fixed the wrong version of espeak, added better normilzation, improved the sentence splitting, fixed some formatting 2025-02-10 21:45:05 -05:00			`number=u.group(1).strip()`
			`unit[0]=INFLECT_ENGINE.no(unit[0],number)`
			`return " ".join(unit)`
Ruff format + fix 2025-01-09 18:41:44 -07:00
-add email handling, minor additional URL processing, tests 2025-01-08 03:13:17 -07:00			`def handle_money(m: re.Match[str]) -> str:`
- CPU ONNX + PyTorch CUDA, functional - Incorporated text processing module as service, towards modularization and optimizations - Added text processing router for phonemization - Enhanced benchmark statistics with real-time speed metrics 2025-01-03 17:54:17 -07:00			`"""Convert money expressions to spoken form"""`
			`m = m.group()`
			`bill = "dollar" if m[0] == "$" else "pound"`
			`if m[-1].isalpha():`
			`return f"{m[1:]} {bill}s"`
			`elif "." not in m:`
			`s = "" if m[1:] == "1" else "s"`
			`return f"{m[1:]} {bill}{s}"`
			`b, c = m[1:].split(".")`
			`s = "" if b == "1" else "s"`
			`c = int(c.ljust(2, "0"))`
			`coins = (`
			`f"cent{'' if c == 1 else 's'}"`
			`if m[0] == "$"`
			`else ("penny" if c == 1 else "pence")`
			`)`
			`return f"{b} {bill}{s} and {c} {coins}"`

Ruff format + fix 2025-01-09 18:41:44 -07:00
-add email handling, minor additional URL processing, tests 2025-01-08 03:13:17 -07:00			`def handle_decimal(num: re.Match[str]) -> str:`
- CPU ONNX + PyTorch CUDA, functional - Incorporated text processing module as service, towards modularization and optimizations - Added text processing router for phonemization - Enhanced benchmark statistics with real-time speed metrics 2025-01-03 17:54:17 -07:00			`"""Convert decimal numbers to spoken form"""`
			`a, b = num.group().split(".")`
			`return " point ".join([a, " ".join(b)])`

Ruff format + fix 2025-01-09 18:41:44 -07:00
-add email handling, minor additional URL processing, tests 2025-01-08 03:13:17 -07:00			`def handle_email(m: re.Match[str]) -> str:`
			`"""Convert email addresses into speakable format"""`
			`email = m.group(0)`
Ruff format + fix 2025-01-09 18:41:44 -07:00			`parts = email.split("@")`
-add email handling, minor additional URL processing, tests 2025-01-08 03:13:17 -07:00			`if len(parts) == 2:`
			`user, domain = parts`
Ruff format + fix 2025-01-09 18:41:44 -07:00			`domain = domain.replace(".", " dot ")`
-add email handling, minor additional URL processing, tests 2025-01-08 03:13:17 -07:00			`return f"{user} at {domain}"`
			`return email`

Ruff format + fix 2025-01-09 18:41:44 -07:00
-add email handling, minor additional URL processing, tests 2025-01-08 03:13:17 -07:00			`def handle_url(u: re.Match[str]) -> str:`
Added tested, slight changes to regex 2025-01-07 00:18:44 -07:00			`"""Make URLs speakable by converting special characters to spoken words"""`
			`if not u:`
			`return ""`
Ruff format + fix 2025-01-09 18:41:44 -07:00
Added tested, slight changes to regex 2025-01-07 00:18:44 -07:00			`url = u.group(0).strip()`
Ruff format + fix 2025-01-09 18:41:44 -07:00
-add email handling, minor additional URL processing, tests 2025-01-08 03:13:17 -07:00			`# Handle protocol first`
Ruff format + fix 2025-01-09 18:41:44 -07:00			`url = re.sub(`
			`r"^https?://",`
			`lambda a: "https " if "https" in a.group() else "http ",`
			`url,`
			`flags=re.IGNORECASE,`
			`)`
			`url = re.sub(r"^www\.", "www ", url, flags=re.IGNORECASE)`

-add email handling, minor additional URL processing, tests 2025-01-08 03:13:17 -07:00			`# Handle port numbers before other replacements`
Ruff format + fix 2025-01-09 18:41:44 -07:00			`url = re.sub(r":(\d+)(?=/\|$)", lambda m: f" colon {m.group(1)}", url)`

-add email handling, minor additional URL processing, tests 2025-01-08 03:13:17 -07:00			`# Split into domain and path`
Ruff format + fix 2025-01-09 18:41:44 -07:00			`parts = url.split("/", 1)`
-add email handling, minor additional URL processing, tests 2025-01-08 03:13:17 -07:00			`domain = parts[0]`
Ruff format + fix 2025-01-09 18:41:44 -07:00			`path = parts[1] if len(parts) > 1 else ""`

-add email handling, minor additional URL processing, tests 2025-01-08 03:13:17 -07:00			`# Handle dots in domain`
Ruff format + fix 2025-01-09 18:41:44 -07:00			`domain = domain.replace(".", " dot ")`

-add email handling, minor additional URL processing, tests 2025-01-08 03:13:17 -07:00			`# Reconstruct URL`
			`if path:`
			`url = f"{domain} slash {path}"`
			`else:`
			`url = domain`
Ruff format + fix 2025-01-09 18:41:44 -07:00
-add email handling, minor additional URL processing, tests 2025-01-08 03:13:17 -07:00			`# Replace remaining symbols with words`
Fix url parsing for urls without https, http, or www. It also allows raw ips, ports, and dashs 2025-01-07 19:34:38 -05:00			`url = url.replace("-", " dash ")`
			`url = url.replace("_", " underscore ")`
			`url = url.replace("?", " question-mark ")`
Added tested, slight changes to regex 2025-01-07 00:18:44 -07:00			`url = url.replace("=", " equals ")`
			`url = url.replace("&", " ampersand ")`
Fix remaining slashes not being converted into text and made % be converted 2025-01-08 08:50:22 -05:00			`url = url.replace("%", " percent ")`
-add email handling, minor additional URL processing, tests 2025-01-08 03:13:17 -07:00			`url = url.replace(":", " colon ") # Handle any remaining colons`
Fix remaining slashes not being converted into text and made % be converted 2025-01-08 08:50:22 -05:00			`url = url.replace("/", " slash ") # Handle any remaining slashes`
Ruff format + fix 2025-01-09 18:41:44 -07:00
Added tested, slight changes to regex 2025-01-07 00:18:44 -07:00			`# Clean up extra spaces`
Ruff format + fix 2025-01-09 18:41:44 -07:00			`return re.sub(r"\s+", " ", url).strip()`
Added tested, slight changes to regex 2025-01-07 00:18:44 -07:00

Added normilization options 2025-02-11 19:09:35 -05:00			`def normalize_text(text: str,normalization_options: NormalizationOptions) -> str:`
Added tested, slight changes to regex 2025-01-07 00:18:44 -07:00			`"""Normalize text for TTS processing"""`
Added normilization options 2025-02-11 19:09:35 -05:00			`# Handle email addresses first if enabled`
			`if normalization_options.email_normalization:`
			`text = EMAIL_PATTERN.sub(handle_email, text)`

			`# Handle URLs if enabled`
			`if normalization_options.url_normalization:`
			`text = URL_PATTERN.sub(handle_url, text)`
Ruff format + fix 2025-01-09 18:41:44 -07:00
Added normilization options 2025-02-11 19:09:35 -05:00			`# Pre-process numbers with units if enabled`
			`if normalization_options.unit_normalization:`
			`text=UNIT_PATTERN.sub(handle_units,text)`
Made the api use the normalizer, fixed the wrong version of espeak, added better normilzation, improved the sentence splitting, fixed some formatting 2025-02-10 21:45:05 -05:00
added optional pluralization normalization 2025-02-11 19:24:29 -05:00			`# Replace optional pluralization`
			`if normalization_options.optional_pluralization_normalization:`
			`text = re.sub(r"\(s\)","s",text)`

- CPU ONNX + PyTorch CUDA, functional - Incorporated text processing module as service, towards modularization and optimizations - Added text processing router for phonemization - Enhanced benchmark statistics with real-time speed metrics 2025-01-03 17:54:17 -07:00			`# Replace quotes and brackets`
			`text = text.replace(chr(8216), "'").replace(chr(8217), "'")`
			`text = text.replace("«", chr(8220)).replace("»", chr(8221))`
			`text = text.replace(chr(8220), '"').replace(chr(8221), '"')`
			`text = text.replace("(", "«").replace(")", "»")`
Ruff format + fix 2025-01-09 18:41:44 -07:00
Made the api use the normalizer, fixed the wrong version of espeak, added better normilzation, improved the sentence splitting, fixed some formatting 2025-02-10 21:45:05 -05:00			`# Handle CJK punctuation and some non standard chars`
			`for a, b in zip("、。！，：；？–", ",.!,:;?-"):`
- CPU ONNX + PyTorch CUDA, functional - Incorporated text processing module as service, towards modularization and optimizations - Added text processing router for phonemization - Enhanced benchmark statistics with real-time speed metrics 2025-01-03 17:54:17 -07:00			`text = text.replace(a, b + " ")`
Ruff format + fix 2025-01-09 18:41:44 -07:00
- CPU ONNX + PyTorch CUDA, functional - Incorporated text processing module as service, towards modularization and optimizations - Added text processing router for phonemization - Enhanced benchmark statistics with real-time speed metrics 2025-01-03 17:54:17 -07:00			`# Clean up whitespace`
			`text = re.sub(r"[^\S \n]", " ", text)`
			`text = re.sub(r" +", " ", text)`
			`text = re.sub(r"(?<=\n) +(?=\n)", "", text)`
Ruff format + fix 2025-01-09 18:41:44 -07:00
- CPU ONNX + PyTorch CUDA, functional - Incorporated text processing module as service, towards modularization and optimizations - Added text processing router for phonemization - Enhanced benchmark statistics with real-time speed metrics 2025-01-03 17:54:17 -07:00			`# Handle titles and abbreviations`
			`text = re.sub(r"\bD[Rr]\.(?= [A-Z])", "Doctor", text)`
			`text = re.sub(r"\b(?:Mr\.\|MR\.(?= [A-Z]))", "Mister", text)`
			`text = re.sub(r"\b(?:Ms\.\|MS\.(?= [A-Z]))", "Miss", text)`
			`text = re.sub(r"\b(?:Mrs\.\|MRS\.(?= [A-Z]))", "Mrs", text)`
			`text = re.sub(r"\betc\.(?! [A-Z])", "etc", text)`
Ruff format + fix 2025-01-09 18:41:44 -07:00
- CPU ONNX + PyTorch CUDA, functional - Incorporated text processing module as service, towards modularization and optimizations - Added text processing router for phonemization - Enhanced benchmark statistics with real-time speed metrics 2025-01-03 17:54:17 -07:00			`# Handle common words`
			`text = re.sub(r"(?i)\b(y)eah?\b", r"\1e'a", text)`
Ruff format + fix 2025-01-09 18:41:44 -07:00
- CPU ONNX + PyTorch CUDA, functional - Incorporated text processing module as service, towards modularization and optimizations - Added text processing router for phonemization - Enhanced benchmark statistics with real-time speed metrics 2025-01-03 17:54:17 -07:00			`# Handle numbers and money`
			`text = re.sub(`
Ruff format + fix 2025-01-09 18:41:44 -07:00			`r"\d*\.\d+\|\b\d{4}s?\b\|(?<!:)\b(?:[1-9]\|1[0-2]):[0-5]\d\b(?!:)", split_num, text`
- CPU ONNX + PyTorch CUDA, functional - Incorporated text processing module as service, towards modularization and optimizations - Added text processing router for phonemization - Enhanced benchmark statistics with real-time speed metrics 2025-01-03 17:54:17 -07:00			`)`
Added normilization options 2025-02-11 19:09:35 -05:00
- CPU ONNX + PyTorch CUDA, functional - Incorporated text processing module as service, towards modularization and optimizations - Added text processing router for phonemization - Enhanced benchmark statistics with real-time speed metrics 2025-01-03 17:54:17 -07:00			`text = re.sub(r"(?<=\d),(?=\d)", "", text)`
			`text = re.sub(`
			`r"(?i)[$£]\d+(?:\.\d+)?(?: hundred\| thousand\| (?:[bm]\|tr)illion)*\b\|[$£]\d+\.\d\d?\b",`
			`handle_money,`
			`text,`
			`)`
Added normilization options 2025-02-11 19:09:35 -05:00
- CPU ONNX + PyTorch CUDA, functional - Incorporated text processing module as service, towards modularization and optimizations - Added text processing router for phonemization - Enhanced benchmark statistics with real-time speed metrics 2025-01-03 17:54:17 -07:00			`text = re.sub(r"\d*\.\d+", handle_decimal, text)`
Ruff format + fix 2025-01-09 18:41:44 -07:00
- CPU ONNX + PyTorch CUDA, functional - Incorporated text processing module as service, towards modularization and optimizations - Added text processing router for phonemization - Enhanced benchmark statistics with real-time speed metrics 2025-01-03 17:54:17 -07:00			`# Handle various formatting`
			`text = re.sub(r"(?<=\d)-(?=\d)", " to ", text)`
			`text = re.sub(r"(?<=\d)S", " S", text)`
			`text = re.sub(r"(?<=[BCDFGHJ-NP-TV-Z])'?s\b", "'S", text)`
			`text = re.sub(r"(?<=X')S\b", "s", text)`
			`text = re.sub(`
Ruff format + fix 2025-01-09 18:41:44 -07:00			`r"(?:[A-Za-z]\.){2,} [a-z]", lambda m: m.group().replace(".", "-"), text`
- CPU ONNX + PyTorch CUDA, functional - Incorporated text processing module as service, towards modularization and optimizations - Added text processing router for phonemization - Enhanced benchmark statistics with real-time speed metrics 2025-01-03 17:54:17 -07:00			`)`
added optional pluralization normalization 2025-02-11 19:24:29 -05:00			`text = re.sub( r"(?i)(?<=[A-Z])\.(?=[A-Z])", "-", text)`
Ruff format + fix 2025-01-09 18:41:44 -07:00
- CPU ONNX + PyTorch CUDA, functional - Incorporated text processing module as service, towards modularization and optimizations - Added text processing router for phonemization - Enhanced benchmark statistics with real-time speed metrics 2025-01-03 17:54:17 -07:00			`return text.strip()`