Refactor text normalization: handle problematic symbols after number and money processing, rather than before, to improve normalization accuracy.

2025-08-05 16:48:53 +00:00 · 2025-06-01 10:18:24 +08:00 · 2025-06-01 10:18:24 +08:00 · 888e3121ff
commit 888e3121ff
parent 84d2a4d806
1 changed files with 23 additions and 23 deletions
--- a/api/src/services/text_processing/normalizer.py
+++ b/api/src/services/text_processing/normalizer.py
@@ -441,7 +441,29 @@ def normalize_text(text: str, normalization_options: NormalizationOptions) -> st
    text = text.replace('\n', ' ')
    text = text.replace('\r', ' ')
-    # Handle other problematic symbols
+    # Handle titles and abbreviations
    text = re.sub(r"\bD[Rr]\.(?= [A-Z])", "Doctor", text)
    text = re.sub(r"\b(?:Mr\.|MR\.(?= [A-Z]))", "Mister", text)
    text = re.sub(r"\b(?:Ms\.|MS\.(?= [A-Z]))", "Miss", text)
    text = re.sub(r"\b(?:Mrs\.|MRS\.(?= [A-Z]))", "Mrs", text)
    text = re.sub(r"\betc\.(?! [A-Z])", "etc", text)
    # Handle common words
    text = re.sub(r"(?i)\b(y)eah?\b", r"\1e'a", text)
    # Handle numbers and money BEFORE replacing special characters
    text = re.sub(r"(?<=\d),(?=\d)", "", text)
    text = MONEY_PATTERN.sub(
        handle_money,
        text,
    )
    text = NUMBER_PATTERN.sub(handle_numbers, text)
    text = re.sub(r"\d*\.\d+", handle_decimal, text)
    # Handle other problematic symbols AFTER money/number processing
    text = text.replace('~', '')    # Remove tilde
    text = text.replace('@', ' at ')  # At symbol
    text = text.replace('#', ' number ')  # Hash/pound
@@ -457,28 +479,6 @@ def normalize_text(text: str, normalization_options: NormalizationOptions) -> st
    text = text.replace('=', ' equals ')  # Equals sign
    text = text.replace('+', ' plus ')    # Plus sign
    # Handle titles and abbreviations
    text = re.sub(r"\bD[Rr]\.(?= [A-Z])", "Doctor", text)
    text = re.sub(r"\b(?:Mr\.|MR\.(?= [A-Z]))", "Mister", text)
    text = re.sub(r"\b(?:Ms\.|MS\.(?= [A-Z]))", "Miss", text)
    text = re.sub(r"\b(?:Mrs\.|MRS\.(?= [A-Z]))", "Mrs", text)
    text = re.sub(r"\betc\.(?! [A-Z])", "etc", text)
    # Handle common words
    text = re.sub(r"(?i)\b(y)eah?\b", r"\1e'a", text)
    # Handle numbers and money
    text = re.sub(r"(?<=\d),(?=\d)", "", text)
    text = MONEY_PATTERN.sub(
        handle_money,
        text,
    )
    text = NUMBER_PATTERN.sub(handle_numbers, text)
    text = re.sub(r"\d*\.\d+", handle_decimal, text)
    # Handle various formatting
    text = re.sub(r"(?<=\d)-(?=\d)", " to ", text)
    text = re.sub(r"(?<=\d)S", " S", text)