Fixed formatting

2025-08-05 16:48:53 +00:00 · 2025-05-01 18:16:36 +00:00 · 2025-05-01 18:16:36 +00:00 · c80d4feb69
commit c80d4feb69
parent f89b76d5d6
3 changed files with 90 additions and 38 deletions
--- a/api/src/services/text_processing/normalizer.py
+++ b/api/src/services/text_processing/normalizer.py
@@ -4,6 +4,7 @@ Handles various text formats including URLs, emails, numbers, money, and special
 Converts them into a format suitable for text-to-speech processing.
 """

+import math
 import re
 from functools import lru_cache
 from typing import List, Optional, Union
@@ -12,7 +13,6 @@ import inflect
 from numpy import number
 from text_to_num import text2num
 from torch import mul
-import math

 from ...structures.schemas import NormalizationOptions

@@ -134,11 +134,7 @@ VALID_UNITS = {
    "px": "pixel",  # CSS units
 }

-MONEY_UNITS = {
-    "$": ("dollar", "cent"),
-    "£": ("pound", "pence"),
-    "€": ("euro", "cent")
-}
+MONEY_UNITS = {"$": ("dollar", "cent"), "£": ("pound", "pence"), "€": ("euro", "cent")}

 # Pre-compiled regex patterns for performance
 EMAIL_PATTERN = re.compile(
@@ -159,22 +155,24 @@ UNIT_PATTERN = re.compile(
 )

 TIME_PATTERN = re.compile(
-    r"([0-9]{1,2} ?: ?[0-9]{2}( ?: ?[0-9]{2})?)( ?(pm|am)\b)?",
-    re.IGNORECASE
+    r"([0-9]{1,2} ?: ?[0-9]{2}( ?: ?[0-9]{2})?)( ?(pm|am)\b)?", re.IGNORECASE
 )

 MONEY_PATTERN = re.compile(
-    r"(-?)([" + ''.join(MONEY_UNITS.keys()) + r"])(\d+(?:\.\d+)?)((?: hundred| thousand| (?:[bm]|tr|quadr)illion|k|m|b|t)*)\b",
-    re.IGNORECASE
+    r"(-?)(["
+    + "".join(MONEY_UNITS.keys())
+    + r"])(\d+(?:\.\d+)?)((?: hundred| thousand| (?:[bm]|tr|quadr)illion|k|m|b|t)*)\b",
+    re.IGNORECASE,
 )

 NUMBER_PATTERN = re.compile(
    r"(-?)(\d+(?:\.\d+)?)((?: hundred| thousand| (?:[bm]|tr|quadr)illion|k|m|b)*)\b",
-    re.IGNORECASE
+    re.IGNORECASE,
 )

 INFLECT_ENGINE = inflect.engine()

+
 def handle_units(u: re.Match[str]) -> str:
    """Converts units to their full form"""
    unit_string = u.group(6).strip()
@@ -199,19 +197,27 @@ def conditional_int(number: float, threshold: float = 0.00001):
        return int(round(number))
    return number

+
 def translate_multiplier(multiplier: str) -> str:
    """Translate multiplier abrevations to words"""

-    multiplier_translation = {"k": "thousand", "m": "million", "b": "billion", "t": "trillion"}
+    multiplier_translation = {
+        "k": "thousand",
+        "m": "million",
+        "b": "billion",
+        "t": "trillion",
+    }
    if multiplier.lower() in multiplier_translation:
        return multiplier_translation[multiplier.lower()]
    return multiplier.strip()

+
 def split_four_digit(number: float):
    part1 = str(conditional_int(number))[:2]
    part2 = str(conditional_int(number))[2:]
    return f"{INFLECT_ENGINE.number_to_words(part1)} {INFLECT_ENGINE.number_to_words(part2)}"

+
 def handle_numbers(n: re.Match[str]) -> str:
    number = n.group(2)

@@ -229,18 +235,24 @@ def handle_numbers(n: re.Match[str]) -> str:
    if multiplier != "":
        multiplier = f" {multiplier}"
    else:
-        if number % 1 == 0 and len(str(number)) == 4 and number > 1500 and number % 1000 > 9:
+        if (
+            number % 1 == 0
+            and len(str(number)) == 4
+            and number > 1500
+            and number % 1000 > 9
+        ):
            return split_four_digit(number)

    return f"{INFLECT_ENGINE.number_to_words(number)}{multiplier}"

+
 def handle_money(m: re.Match[str]) -> str:
    """Convert money expressions to spoken form"""

    bill, coin = MONEY_UNITS[m.group(2)]

    number = m.group(3)
-    
+
    try:
        number = float(number)
    except:
@@ -365,7 +377,7 @@ def handle_time(t: re.Match[str]) -> str:
    half = ""
    if len(time_parts) > 2:
        seconds_number = INFLECT_ENGINE.number_to_words(time_parts[2].strip())
-        second_word = INFLECT_ENGINE.plural('second',int(time_parts[2].strip()))
+        second_word = INFLECT_ENGINE.plural("second", int(time_parts[2].strip()))
        numbers.append(f"and {seconds_number} {second_word}")
    else:
        if t[2] is not None:
@@ -441,10 +453,7 @@ def normalize_text(text: str, normalization_options: NormalizationOptions) -> st
        text,
    )

-    text = NUMBER_PATTERN.sub(
-        handle_numbers,
-        text
-    )
+    text = NUMBER_PATTERN.sub(handle_numbers, text)

    text = re.sub(r"\d*\.\d+", handle_decimal, text)

--- a/api/tests/test_normalizer.py
+++ b/api/tests/test_normalizer.py
@@ -149,8 +149,8 @@ def test_money():

    assert (
        normalize_text(
-            "He went gambling and lost about $25.05k.", 
-            normalization_options=NormalizationOptions()
+            "He went gambling and lost about $25.05k.",
+            normalization_options=NormalizationOptions(),
        )
        == "He went gambling and lost about twenty-five point zero five thousand dollars."
    )
@@ -169,91 +169,134 @@ def test_money():
    )

    assert (
-        normalize_text("The plant cost $200,000.8.", normalization_options=NormalizationOptions())
+        normalize_text(
+            "The plant cost $200,000.8.", normalization_options=NormalizationOptions()
+        )
        == "The plant cost two hundred thousand dollars and eighty cents."
    )

    assert (
-        normalize_text("€30.2 is in euros", normalization_options=NormalizationOptions())
+        normalize_text(
+            "€30.2 is in euros", normalization_options=NormalizationOptions()
+        )
        == "thirty euros and twenty cents is in euros"
    )

+
 def test_time():
    """Test time normalization"""

    assert (
-        normalize_text("Your flight leaves at 10:35 pm", normalization_options=NormalizationOptions())
+        normalize_text(
+            "Your flight leaves at 10:35 pm",
+            normalization_options=NormalizationOptions(),
+        )
        == "Your flight leaves at ten thirty-five pm"
    )

    assert (
-        normalize_text("He departed for london around 5:03 am.", normalization_options=NormalizationOptions())
+        normalize_text(
+            "He departed for london around 5:03 am.",
+            normalization_options=NormalizationOptions(),
+        )
        == "He departed for london around five oh three am."
    )

    assert (
-        normalize_text("Only the 13:42 and 15:12 slots are available.", normalization_options=NormalizationOptions())
+        normalize_text(
+            "Only the 13:42 and 15:12 slots are available.",
+            normalization_options=NormalizationOptions(),
+        )
        == "Only the thirteen forty-two and fifteen twelve slots are available."
    )

    assert (
-        normalize_text("It is currently 1:00 pm", normalization_options=NormalizationOptions())
+        normalize_text(
+            "It is currently 1:00 pm", normalization_options=NormalizationOptions()
+        )
        == "It is currently one pm"
    )

    assert (
-        normalize_text("It is currently 3:00", normalization_options=NormalizationOptions())
+        normalize_text(
+            "It is currently 3:00", normalization_options=NormalizationOptions()
+        )
        == "It is currently three o'clock"
    )

    assert (
-        normalize_text("12:00 am is midnight", normalization_options=NormalizationOptions())
+        normalize_text(
+            "12:00 am is midnight", normalization_options=NormalizationOptions()
+        )
        == "twelve am is midnight"
    )

+
 def test_number():
    """Test number normalization"""

    assert (
-        normalize_text("I bought 1035 cans of soda", normalization_options=NormalizationOptions())
+        normalize_text(
+            "I bought 1035 cans of soda", normalization_options=NormalizationOptions()
+        )
        == "I bought one thousand and thirty-five cans of soda"
    )

    assert (
-        normalize_text("The bus has a maximum capacity of 62 people", normalization_options=NormalizationOptions())
+        normalize_text(
+            "The bus has a maximum capacity of 62 people",
+            normalization_options=NormalizationOptions(),
+        )
        == "The bus has a maximum capacity of sixty-two people"
    )

    assert (
-        normalize_text("There are 1300 products left in stock", normalization_options=NormalizationOptions())
+        normalize_text(
+            "There are 1300 products left in stock",
+            normalization_options=NormalizationOptions(),
+        )
        == "There are one thousand, three hundred products left in stock"
    )

    assert (
-        normalize_text("The population is 7,890,000 people.", normalization_options=NormalizationOptions())
+        normalize_text(
+            "The population is 7,890,000 people.",
+            normalization_options=NormalizationOptions(),
+        )
        == "The population is seven million, eight hundred and ninety thousand people."
    )

    assert (
-        normalize_text("He looked around but only found 1.6k of the 10k bricks", normalization_options=NormalizationOptions())
+        normalize_text(
+            "He looked around but only found 1.6k of the 10k bricks",
+            normalization_options=NormalizationOptions(),
+        )
        == "He looked around but only found one point six thousand of the ten thousand bricks"
    )

    assert (
-        normalize_text("The book has 342 pages.", normalization_options=NormalizationOptions())
+        normalize_text(
+            "The book has 342 pages.", normalization_options=NormalizationOptions()
+        )
        == "The book has three hundred and forty-two pages."
    )

    assert (
-        normalize_text("He made -50 sales today.", normalization_options=NormalizationOptions())
+        normalize_text(
+            "He made -50 sales today.", normalization_options=NormalizationOptions()
+        )
        == "He made minus fifty sales today."
    )

    assert (
-        normalize_text("56.789 to the power of 1.35 million", normalization_options=NormalizationOptions())
+        normalize_text(
+            "56.789 to the power of 1.35 million",
+            normalization_options=NormalizationOptions(),
+        )
        == "fifty-six point seven eight nine to the power of one point three five million"
    )

+
 def test_non_url_text():
    """Test that non-URL text is unaffected"""
    assert (
--- a/dev/Test
+++ b/dev/Test
@@ -35,4 +35,4 @@ for chunk in response.iter_lines(decode_unicode=True):
        f.write(chunk_audio)

        # Print word level timestamps
-        print(chunk_json["timestamps"])
+        print(chunk_json["timestamps"])