diff --git a/api/src/services/text_processing/normalizer.py b/api/src/services/text_processing/normalizer.py index e9f73c0..280a26e 100644 --- a/api/src/services/text_processing/normalizer.py +++ b/api/src/services/text_processing/normalizer.py @@ -4,8 +4,10 @@ Handles various text formats including URLs, emails, numbers, money, and special Converts them into a format suitable for text-to-speech processing. """ +import math import re from functools import lru_cache +from typing import List, Optional, Union import inflect from numpy import number @@ -132,6 +134,7 @@ VALID_UNITS = { "px": "pixel", # CSS units } +MONEY_UNITS = {"$": ("dollar", "cent"), "£": ("pound", "pence"), "€": ("euro", "cent")} # Pre-compiled regex patterns for performance EMAIL_PATTERN = re.compile( @@ -152,37 +155,24 @@ UNIT_PATTERN = re.compile( ) TIME_PATTERN = re.compile( - r"([0-9]{2} ?: ?[0-9]{2}( ?: ?[0-9]{2})?)( ?(pm|am)\b)?", re.IGNORECASE + r"([0-9]{1,2} ?: ?[0-9]{2}( ?: ?[0-9]{2})?)( ?(pm|am)\b)?", re.IGNORECASE +) + +MONEY_PATTERN = re.compile( + r"(-?)([" + + "".join(MONEY_UNITS.keys()) + + r"])(\d+(?:\.\d+)?)((?: hundred| thousand| (?:[bm]|tr|quadr)illion|k|m|b|t)*)\b", + re.IGNORECASE, +) + +NUMBER_PATTERN = re.compile( + r"(-?)(\d+(?:\.\d+)?)((?: hundred| thousand| (?:[bm]|tr|quadr)illion|k|m|b)*)\b", + re.IGNORECASE, ) INFLECT_ENGINE = inflect.engine() -def split_num(num: re.Match[str]) -> str: - """Handle number splitting for various formats""" - num = num.group() - if "." 
in num: - return num - elif ":" in num: - h, m = [int(n) for n in num.split(":")] - if m == 0: - return f"{h} o'clock" - elif m < 10: - return f"{h} oh {m}" - return f"{h} {m}" - year = int(num[:4]) - if year < 1100 or year % 1000 < 10: - return num - left, right = num[:2], int(num[2:4]) - s = "s" if num.endswith("s") else "" - if 100 <= year % 1000 <= 999: - if right == 0: - return f"{left} hundred{s}" - elif right < 10: - return f"{left} oh {right}{s}" - return f"{left} {right}{s}" - - def handle_units(u: re.Match[str]) -> str: """Converts units to their full form""" unit_string = u.group(6).strip() @@ -208,14 +198,61 @@ def conditional_int(number: float, threshold: float = 0.00001): return number +def translate_multiplier(multiplier: str) -> str: + """Translate multiplier abbreviations to words""" + + multiplier_translation = { + "k": "thousand", + "m": "million", + "b": "billion", + "t": "trillion", + } + if multiplier.lower() in multiplier_translation: + return multiplier_translation[multiplier.lower()] + return multiplier.strip() + + +def split_four_digit(number: float): + part1 = str(conditional_int(number))[:2] + part2 = str(conditional_int(number))[2:] + return f"{INFLECT_ENGINE.number_to_words(part1)} {INFLECT_ENGINE.number_to_words(part2)}" + + +def handle_numbers(n: re.Match[str]) -> str: + number = n.group(2) + + try: + number = float(number) + except: + return n.group() + + if n.group(1) == "-": + number *= -1 + + multiplier = translate_multiplier(n.group(3)) + + number = conditional_int(number) + if multiplier != "": + multiplier = f" {multiplier}" + else: + if ( + number % 1 == 0 + and len(str(number)) == 4 + and number > 1500 + and number % 1000 > 9 + ): + return split_four_digit(number) + + return f"{INFLECT_ENGINE.number_to_words(number)}{multiplier}" + + def handle_money(m: re.Match[str]) -> str: """Convert money expressions to spoken form""" - bill = "dollar" if m.group(2) == "$" else "pound" - coin = "cent" if m.group(2) == "$" else "pence" + 
bill, coin = MONEY_UNITS[m.group(2)] + number = m.group(3) - multiplier = m.group(4) try: number = float(number) except: @@ -224,12 +261,17 @@ def handle_money(m: re.Match[str]) -> str: if m.group(1) == "-": number *= -1 + multiplier = translate_multiplier(m.group(4)) + + if multiplier != "": + multiplier = f" {multiplier}" + if number % 1 == 0 or multiplier != "": text_number = f"{INFLECT_ENGINE.number_to_words(conditional_int(number))}{multiplier} {INFLECT_ENGINE.plural(bill, count=number)}" else: sub_number = int(str(number).split(".")[-1].ljust(2, "0")) - text_number = f"{INFLECT_ENGINE.number_to_words(int(round(number)))} {INFLECT_ENGINE.plural(bill, count=number)} and {INFLECT_ENGINE.number_to_words(sub_number)} {INFLECT_ENGINE.plural(coin, count=sub_number)}" + text_number = f"{INFLECT_ENGINE.number_to_words(int(math.floor(number)))} {INFLECT_ENGINE.plural(bill, count=number)} and {INFLECT_ENGINE.number_to_words(sub_number)} {INFLECT_ENGINE.plural(coin, count=sub_number)}" return text_number @@ -320,15 +362,31 @@ def handle_phone_number(p: re.Match[str]) -> str: def handle_time(t: re.Match[str]) -> str: t = t.groups() - numbers = " ".join( - [INFLECT_ENGINE.number_to_words(X.strip()) for X in t[0].split(":")] - ) + time_parts = t[0].split(":") + + numbers = [] + numbers.append(INFLECT_ENGINE.number_to_words(time_parts[0].strip())) + + minute_number = INFLECT_ENGINE.number_to_words(time_parts[1].strip()) + if int(time_parts[1]) < 10: + if int(time_parts[1]) != 0: + numbers.append(f"oh {minute_number}") + else: + numbers.append(minute_number) half = "" - if t[2] is not None: - half = t[2].strip() + if len(time_parts) > 2: + seconds_number = INFLECT_ENGINE.number_to_words(time_parts[2].strip()) + second_word = INFLECT_ENGINE.plural("second", int(time_parts[2].strip())) + numbers.append(f"and {seconds_number} {second_word}") + else: + if t[2] is not None: + half = " " + t[2].strip() + else: + if int(time_parts[1]) == 0: + numbers.append("o'clock") - return 
numbers + half + return " ".join(numbers) + half def normalize_text(text: str, normalization_options: NormalizationOptions) -> str: @@ -366,7 +424,7 @@ def normalize_text(text: str, normalization_options: NormalizationOptions) -> st for a, b in zip("、。!,:;?–", ",.!,:;?-"): text = text.replace(a, b + " ") - # Handle simple time in the format of HH:MM:SS + # Handle simple time in the format of HH:MM:SS (am/pm) text = TIME_PATTERN.sub( handle_time, text, @@ -390,15 +448,12 @@ def normalize_text(text: str, normalization_options: NormalizationOptions) -> st # Handle numbers and money text = re.sub(r"(?<=\d),(?=\d)", "", text) - text = re.sub( - r"(?i)(-?)([$£])(\d+(?:\.\d+)?)((?: hundred| thousand| (?:[bm]|tr|quadr)illion)*)\b", + text = MONEY_PATTERN.sub( handle_money, text, ) - text = re.sub( - r"\d*\.\d+|\b\d{4}s?\b|(?