2025-01-07 00:18:44 -07:00
""" Tests for text normalization service """
import pytest
2025-01-09 18:41:44 -07:00
2025-01-07 00:18:44 -07:00
from api . src . services . text_processing . normalizer import normalize_text
2025-02-11 21:30:41 -05:00
from api . src . structures . schemas import NormalizationOptions
2025-01-09 18:41:44 -07:00
2025-04-04 16:50:46 -06:00
2025-01-08 03:13:17 -07:00
def test_url_protocols():
    """Check normalization of URLs that carry an explicit http/https scheme.

    Each case passes one sentence to ``normalize_text`` together with a
    default-constructed ``NormalizationOptions`` and compares the result
    with the expected rewritten sentence.
    """
    cases = [
        (" Check out https://example.com ", " Check out https example dot com "),
        (" Visit http://site.com ", " Visit http site dot com "),
        (" Go to https://test.org/path ", " Go to https test dot org slash path "),
    ]
    for raw, expected in cases:
        # Build a fresh options object for every call.
        result = normalize_text(raw, normalization_options=NormalizationOptions())
        assert result == expected
2025-01-08 03:13:17 -07:00
def test_url_www():
    """Check normalization of URLs that start with a ``www`` prefix.

    Covers a bare host, a host followed by a path, and a host followed by
    a query string. Every call uses a default-constructed
    ``NormalizationOptions``.
    """
    cases = [
        (" Go to www.example.com ", " Go to www example dot com "),
        (" Visit www.test.org/docs ", " Visit www test dot org slash docs "),
        (
            " Check www.site.com?q=test ",
            " Check www site dot com question-mark q equals test ",
        ),
    ]
    for raw, expected in cases:
        # Build a fresh options object for every call.
        result = normalize_text(raw, normalization_options=NormalizationOptions())
        assert result == expected
2025-01-08 03:13:17 -07:00
def test_url_localhost():
    """Check normalization of ``localhost`` URLs with a port number.

    The expected port text is not uniform across the cases: 7860 and 8080
    are expected as two digit pairs ("78 60", "80 80") while 3000 is
    expected unchanged. Every call uses a default-constructed
    ``NormalizationOptions``.
    """
    cases = [
        (" Running on localhost:7860 ", " Running on localhost colon 78 60 "),
        (
            " Server at localhost:8080/api ",
            " Server at localhost colon 80 80 slash api ",
        ),
        (
            " Test localhost:3000/test?v=1 ",
            " Test localhost colon 3000 slash test question-mark v equals 1 ",
        ),
    ]
    for raw, expected in cases:
        # Build a fresh options object for every call.
        result = normalize_text(raw, normalization_options=NormalizationOptions())
        assert result == expected
2025-01-08 03:13:17 -07:00
def test_url_ip_addresses():
    """Check normalization of dotted IP addresses, with and without a port.

    Dots are expected as "dot" and the port separator as "colon". Port 9090
    is expected as "90 90" while port 8000 is expected unchanged. Every
    call uses a default-constructed ``NormalizationOptions``.
    """
    cases = [
        (
            " Access 0.0.0.0:9090/test ",
            " Access 0 dot 0 dot 0 dot 0 colon 90 90 slash test ",
        ),
        (
            " API at 192.168.1.1:8000 ",
            " API at 192 dot 168 dot 1 dot 1 colon 8000 ",
        ),
        (" Server 127.0.0.1 ", " Server 127 dot 0 dot 0 dot 1 "),
    ]
    for raw, expected in cases:
        # Build a fresh options object for every call.
        result = normalize_text(raw, normalization_options=NormalizationOptions())
        assert result == expected
2025-01-08 03:13:17 -07:00
2025-01-09 18:41:44 -07:00
2025-01-08 03:13:17 -07:00
def test_url_raw_domains():
    """Check normalization of domains written without a scheme or ``www``.

    Covers a domain with a path, a domain with a path and query string,
    and a multi-label domain. Every call uses a default-constructed
    ``NormalizationOptions``.
    """
    cases = [
        (" Visit google.com/search ", " Visit google dot com slash search "),
        (
            " Go to example.com/path?q=test ",
            " Go to example dot com slash path question-mark q equals test ",
        ),
        (" Check docs.test.com ", " Check docs dot test dot com "),
    ]
    for raw, expected in cases:
        # Build a fresh options object for every call.
        result = normalize_text(raw, normalization_options=NormalizationOptions())
        assert result == expected
2025-01-08 03:13:17 -07:00
2025-01-09 18:41:44 -07:00
2025-01-08 03:13:17 -07:00
def test_url_email_addresses():
    """Check normalization of email addresses.

    The "@" sign is expected as "at" and every dot as "dot", including a
    dot inside the local part. Every call uses a default-constructed
    ``NormalizationOptions``.
    """
    cases = [
        (
            " Email me at user@example.com ",
            " Email me at user at example dot com ",
        ),
        (" Contact admin@test.org ", " Contact admin at test dot org "),
        (
            " Send to test.user@site.com ",
            " Send to test dot user at site dot com ",
        ),
    ]
    for raw, expected in cases:
        # Build a fresh options object for every call.
        result = normalize_text(raw, normalization_options=NormalizationOptions())
        assert result == expected
2025-03-21 18:03:09 +00:00
def test_money():
    """Check that dollar amounts are rewritten as words.

    Covers an amount followed by a scale word, a negative amount followed
    by a scale word, and a plain amount with a fractional part. Every call
    uses a default-constructed ``NormalizationOptions``.
    """
    expectations = (
        (" He lost $5.3 thousand. ", " He lost five point three thousand dollars. "),
        (
            " To put it weirdly -$6.9 million ",
            " To put it weirdly minus six point nine million dollars ",
        ),
        (" It costs $50.3. ", " It costs fifty dollars and thirty cents. "),
    )
    for sentence, wanted in expectations:
        # Build a fresh options object for every call.
        outcome = normalize_text(sentence, normalization_options=NormalizationOptions())
        assert outcome == wanted
2025-01-08 03:13:17 -07:00
def test_non_url_text():
    """Check normalization of text that contains no URL.

    Only the plain greeting is expected to come back unchanged. The dotted
    phrase "not.a.url" is expected as "not-a-url", and the whole-dollar
    amount "$50." is expected as "fifty dollars.". Every call uses a
    default-constructed ``NormalizationOptions``.
    """
    cases = [
        (" This is not.a.url text ", " This is not-a-url text "),
        (" Hello, how are you today? ", " Hello, how are you today? "),
        (" It costs $50. ", " It costs fifty dollars. "),
    ]
    for raw, expected in cases:
        # Build a fresh options object for every call.
        result = normalize_text(raw, normalization_options=NormalizationOptions())
        assert result == expected