Refactor text normalization: handle problematic symbols after number and money processing, rather than before, to improve normalization accuracy.

This commit is contained in:
Lukin 2025-06-01 10:18:24 +08:00
parent 84d2a4d806
commit 888e3121ff

View file

@ -441,7 +441,29 @@ def normalize_text(text: str, normalization_options: NormalizationOptions) -> st
text = text.replace('\n', ' ') text = text.replace('\n', ' ')
text = text.replace('\r', ' ') text = text.replace('\r', ' ')
# Handle other problematic symbols # Handle titles and abbreviations
text = re.sub(r"\bD[Rr]\.(?= [A-Z])", "Doctor", text)
text = re.sub(r"\b(?:Mr\.|MR\.(?= [A-Z]))", "Mister", text)
text = re.sub(r"\b(?:Ms\.|MS\.(?= [A-Z]))", "Miss", text)
text = re.sub(r"\b(?:Mrs\.|MRS\.(?= [A-Z]))", "Mrs", text)
text = re.sub(r"\betc\.(?! [A-Z])", "etc", text)
# Handle common words
text = re.sub(r"(?i)\b(y)eah?\b", r"\1e'a", text)
# Handle numbers and money BEFORE replacing special characters
text = re.sub(r"(?<=\d),(?=\d)", "", text)
text = MONEY_PATTERN.sub(
handle_money,
text,
)
text = NUMBER_PATTERN.sub(handle_numbers, text)
text = re.sub(r"\d*\.\d+", handle_decimal, text)
# Handle other problematic symbols AFTER money/number processing
text = text.replace('~', '') # Remove tilde text = text.replace('~', '') # Remove tilde
text = text.replace('@', ' at ') # At symbol text = text.replace('@', ' at ') # At symbol
text = text.replace('#', ' number ') # Hash/pound text = text.replace('#', ' number ') # Hash/pound
@ -457,28 +479,6 @@ def normalize_text(text: str, normalization_options: NormalizationOptions) -> st
text = text.replace('=', ' equals ') # Equals sign text = text.replace('=', ' equals ') # Equals sign
text = text.replace('+', ' plus ') # Plus sign text = text.replace('+', ' plus ') # Plus sign
# Handle titles and abbreviations
text = re.sub(r"\bD[Rr]\.(?= [A-Z])", "Doctor", text)
text = re.sub(r"\b(?:Mr\.|MR\.(?= [A-Z]))", "Mister", text)
text = re.sub(r"\b(?:Ms\.|MS\.(?= [A-Z]))", "Miss", text)
text = re.sub(r"\b(?:Mrs\.|MRS\.(?= [A-Z]))", "Mrs", text)
text = re.sub(r"\betc\.(?! [A-Z])", "etc", text)
# Handle common words
text = re.sub(r"(?i)\b(y)eah?\b", r"\1e'a", text)
# Handle numbers and money
text = re.sub(r"(?<=\d),(?=\d)", "", text)
text = MONEY_PATTERN.sub(
handle_money,
text,
)
text = NUMBER_PATTERN.sub(handle_numbers, text)
text = re.sub(r"\d*\.\d+", handle_decimal, text)
# Handle various formatting # Handle various formatting
text = re.sub(r"(?<=\d)-(?=\d)", " to ", text) text = re.sub(r"(?<=\d)-(?=\d)", " to ", text)
text = re.sub(r"(?<=\d)S", " S", text) text = re.sub(r"(?<=\d)S", " S", text)