Skip text normalization when using languages other than English, since normalization only supports English

This commit is contained in:
Fireblade 2025-02-16 14:16:18 -05:00
parent 4802128943
commit 9c0e328318
2 changed files with 8 additions and 4 deletions

View file

@ -106,6 +106,7 @@ def get_sentence_info(text: str) -> List[Tuple[str, List[int], int]]:
async def smart_split(
text: str,
max_tokens: int = settings.absolute_max_tokens,
lang_code: str = "a",
normalization_options: NormalizationOptions = NormalizationOptions()
) -> AsyncGenerator[Tuple[str, List[int]], None]:
"""Build optimal chunks targeting 300-400 tokens, never exceeding max_tokens."""
@ -113,9 +114,12 @@ async def smart_split(
chunk_count = 0
logger.info(f"Starting smart split for {len(text)} chars")
# Normilize text
# Normalize text
if settings.advanced_text_normalization and normalization_options.normalize:
text=normalize_text(text,normalization_options)
if lang_code in ["a","b","en-us","en-gb"]:
text=normalize_text(text,normalization_options)
else:
logger.info("Skipping text normalization as it is only supported for english")
# Process all sentences
sentences = get_sentence_info(text)
@ -241,4 +245,4 @@ async def smart_split(
total_time = time.time() - start_time
logger.info(
f"Split completed in {total_time * 1000:.2f}ms, produced {chunk_count} chunks"
)
)

View file

@ -264,7 +264,7 @@ class TTSService:
# Process text in chunks with smart splitting
async for chunk_text, tokens in smart_split(text,normalization_options=normalization_options):
async for chunk_text, tokens in smart_split(text,lang_code=lang_code,normalization_options=normalization_options):
try:
# Process audio for chunk
async for result, chunk_data in self._process_chunk(