diff --git a/api/src/services/text_processing/normalizer.py b/api/src/services/text_processing/normalizer.py index 8876f0c..280a26e 100644 --- a/api/src/services/text_processing/normalizer.py +++ b/api/src/services/text_processing/normalizer.py @@ -4,6 +4,7 @@ Handles various text formats including URLs, emails, numbers, money, and special Converts them into a format suitable for text-to-speech processing. """ +import math import re from functools import lru_cache from typing import List, Optional, Union @@ -12,7 +13,6 @@ import inflect from numpy import number from text_to_num import text2num from torch import mul -import math from ...structures.schemas import NormalizationOptions @@ -134,11 +134,7 @@ VALID_UNITS = { "px": "pixel", # CSS units } -MONEY_UNITS = { - "$": ("dollar", "cent"), - "£": ("pound", "pence"), - "€": ("euro", "cent") -} +MONEY_UNITS = {"$": ("dollar", "cent"), "£": ("pound", "pence"), "€": ("euro", "cent")} # Pre-compiled regex patterns for performance EMAIL_PATTERN = re.compile( @@ -159,22 +155,24 @@ UNIT_PATTERN = re.compile( ) TIME_PATTERN = re.compile( - r"([0-9]{1,2} ?: ?[0-9]{2}( ?: ?[0-9]{2})?)( ?(pm|am)\b)?", - re.IGNORECASE + r"([0-9]{1,2} ?: ?[0-9]{2}( ?: ?[0-9]{2})?)( ?(pm|am)\b)?", re.IGNORECASE ) MONEY_PATTERN = re.compile( - r"(-?)([" + ''.join(MONEY_UNITS.keys()) + r"])(\d+(?:\.\d+)?)((?: hundred| thousand| (?:[bm]|tr|quadr)illion|k|m|b|t)*)\b", - re.IGNORECASE + r"(-?)([" + + "".join(MONEY_UNITS.keys()) + + r"])(\d+(?:\.\d+)?)((?: hundred| thousand| (?:[bm]|tr|quadr)illion|k|m|b|t)*)\b", + re.IGNORECASE, ) NUMBER_PATTERN = re.compile( r"(-?)(\d+(?:\.\d+)?)((?: hundred| thousand| (?:[bm]|tr|quadr)illion|k|m|b)*)\b", - re.IGNORECASE + re.IGNORECASE, ) INFLECT_ENGINE = inflect.engine() + def handle_units(u: re.Match[str]) -> str: """Converts units to their full form""" unit_string = u.group(6).strip() @@ -199,19 +197,27 @@ def conditional_int(number: float, threshold: float = 0.00001): return int(round(number)) return number + def translate_multiplier(multiplier: str) -> str: """Translate multiplier abrevations to words""" - multiplier_translation = {"k": "thousand", "m": "million", "b": "billion", "t": "trillion"} + multiplier_translation = { + "k": "thousand", + "m": "million", + "b": "billion", + "t": "trillion", + } if multiplier.lower() in multiplier_translation: return multiplier_translation[multiplier.lower()] return multiplier.strip() + def split_four_digit(number: float): part1 = str(conditional_int(number))[:2] part2 = str(conditional_int(number))[2:] return f"{INFLECT_ENGINE.number_to_words(part1)} {INFLECT_ENGINE.number_to_words(part2)}" + def handle_numbers(n: re.Match[str]) -> str: number = n.group(2) @@ -229,18 +235,24 @@ def handle_numbers(n: re.Match[str]) -> str: if multiplier != "": multiplier = f" {multiplier}" else: - if number % 1 == 0 and len(str(number)) == 4 and number > 1500 and number % 1000 > 9: + if ( + number % 1 == 0 + and len(str(number)) == 4 + and number > 1500 + and number % 1000 > 9 + ): return split_four_digit(number) return f"{INFLECT_ENGINE.number_to_words(number)}{multiplier}" + def handle_money(m: re.Match[str]) -> str: """Convert money expressions to spoken form""" bill, coin = MONEY_UNITS[m.group(2)] number = m.group(3) - + try: number = float(number) except: @@ -365,7 +377,7 @@ def handle_time(t: re.Match[str]) -> str: half = "" if len(time_parts) > 2: seconds_number = INFLECT_ENGINE.number_to_words(time_parts[2].strip()) - second_word = INFLECT_ENGINE.plural('second',int(time_parts[2].strip())) + second_word = INFLECT_ENGINE.plural("second", int(time_parts[2].strip())) numbers.append(f"and {seconds_number} {second_word}") else: if t[2] is not None: @@ -441,10 +453,7 @@ def normalize_text(text: str, normalization_options: NormalizationOptions) -> st text, ) - text = NUMBER_PATTERN.sub( - handle_numbers, - text - ) + text = NUMBER_PATTERN.sub(handle_numbers, text) text = re.sub(r"\d*\.\d+", handle_decimal, text) diff --git a/api/tests/test_normalizer.py b/api/tests/test_normalizer.py index 93b3f76..3db0801 100644 --- a/api/tests/test_normalizer.py +++ b/api/tests/test_normalizer.py @@ -149,8 +149,8 @@ def test_money(): assert ( normalize_text( - "He went gambling and lost about $25.05k.", - normalization_options=NormalizationOptions() + "He went gambling and lost about $25.05k.", + normalization_options=NormalizationOptions(), ) == "He went gambling and lost about twenty-five point zero five thousand dollars." ) @@ -169,91 +169,134 @@ def test_money(): ) assert ( - normalize_text("The plant cost $200,000.8.", normalization_options=NormalizationOptions()) + normalize_text( + "The plant cost $200,000.8.", normalization_options=NormalizationOptions() + ) == "The plant cost two hundred thousand dollars and eighty cents." ) assert ( - normalize_text("€30.2 is in euros", normalization_options=NormalizationOptions()) + normalize_text( + "€30.2 is in euros", normalization_options=NormalizationOptions() + ) == "thirty euros and twenty cents is in euros" ) + def test_time(): """Test time normalization""" assert ( - normalize_text("Your flight leaves at 10:35 pm", normalization_options=NormalizationOptions()) + normalize_text( + "Your flight leaves at 10:35 pm", + normalization_options=NormalizationOptions(), + ) == "Your flight leaves at ten thirty-five pm" ) assert ( - normalize_text("He departed for london around 5:03 am.", normalization_options=NormalizationOptions()) + normalize_text( + "He departed for london around 5:03 am.", + normalization_options=NormalizationOptions(), + ) == "He departed for london around five oh three am." ) assert ( - normalize_text("Only the 13:42 and 15:12 slots are available.", normalization_options=NormalizationOptions()) + normalize_text( + "Only the 13:42 and 15:12 slots are available.", + normalization_options=NormalizationOptions(), + ) == "Only the thirteen forty-two and fifteen twelve slots are available." ) assert ( - normalize_text("It is currently 1:00 pm", normalization_options=NormalizationOptions()) + normalize_text( + "It is currently 1:00 pm", normalization_options=NormalizationOptions() + ) == "It is currently one pm" ) assert ( - normalize_text("It is currently 3:00", normalization_options=NormalizationOptions()) + normalize_text( + "It is currently 3:00", normalization_options=NormalizationOptions() + ) == "It is currently three o'clock" ) assert ( - normalize_text("12:00 am is midnight", normalization_options=NormalizationOptions()) + normalize_text( + "12:00 am is midnight", normalization_options=NormalizationOptions() + ) == "twelve am is midnight" ) + def test_number(): """Test number normalization""" assert ( - normalize_text("I bought 1035 cans of soda", normalization_options=NormalizationOptions()) + normalize_text( + "I bought 1035 cans of soda", normalization_options=NormalizationOptions() + ) == "I bought one thousand and thirty-five cans of soda" ) assert ( - normalize_text("The bus has a maximum capacity of 62 people", normalization_options=NormalizationOptions()) + normalize_text( + "The bus has a maximum capacity of 62 people", + normalization_options=NormalizationOptions(), + ) == "The bus has a maximum capacity of sixty-two people" ) assert ( - normalize_text("There are 1300 products left in stock", normalization_options=NormalizationOptions()) + normalize_text( + "There are 1300 products left in stock", + normalization_options=NormalizationOptions(), + ) == "There are one thousand, three hundred products left in stock" ) assert ( - normalize_text("The population is 7,890,000 people.", normalization_options=NormalizationOptions()) + normalize_text( + "The population is 7,890,000 people.", + normalization_options=NormalizationOptions(), + ) == "The population is seven million, eight hundred and ninety thousand people." ) assert ( - normalize_text("He looked around but only found 1.6k of the 10k bricks", normalization_options=NormalizationOptions()) + normalize_text( + "He looked around but only found 1.6k of the 10k bricks", + normalization_options=NormalizationOptions(), + ) == "He looked around but only found one point six thousand of the ten thousand bricks" ) assert ( - normalize_text("The book has 342 pages.", normalization_options=NormalizationOptions()) + normalize_text( + "The book has 342 pages.", normalization_options=NormalizationOptions() + ) == "The book has three hundred and forty-two pages." ) assert ( - normalize_text("He made -50 sales today.", normalization_options=NormalizationOptions()) + normalize_text( + "He made -50 sales today.", normalization_options=NormalizationOptions() + ) == "He made minus fifty sales today." ) assert ( - normalize_text("56.789 to the power of 1.35 million", normalization_options=NormalizationOptions()) + normalize_text( + "56.789 to the power of 1.35 million", + normalization_options=NormalizationOptions(), + ) == "fifty-six point seven eight nine to the power of one point three five million" ) + def test_non_url_text(): """Test that non-URL text is unaffected""" assert ( diff --git a/dev/Test copy 2.py b/dev/Test copy 2.py index eecbea5..52634ec 100644 --- a/dev/Test copy 2.py +++ b/dev/Test copy 2.py @@ -35,4 +35,4 @@ for chunk in response.iter_lines(decode_unicode=True): f.write(chunk_audio) # Print word level timestamps - print(chunk_json["timestamps"]) \ No newline at end of file + print(chunk_json["timestamps"])