From 8c08655a0855f6295bf28ba776ca12ab1f5cd301 Mon Sep 17 00:00:00 2001
From: fondoger
Date: Fri, 4 Apr 2025 03:49:27 +0800
Subject: [PATCH] Change silent tag syntax to [silent 1.5s]

---
 README.md                                        |  4 ++--
 .../services/text_processing/text_processor.py   | 15 ++++++++++-----
 api/src/services/tts_service.py                  |  4 ++--
 3 files changed, 14 insertions(+), 9 deletions(-)

diff --git a/README.md b/README.md
index c3e380a..496974c 100644
--- a/README.md
+++ b/README.md
@@ -340,12 +340,12 @@ Key Streaming Metrics:
 
 Custom Phonemes and Silence Tags
 
 - Custom Phoneme: `[](/phoneme/)`
-- Silence Tag: `[silent](/duration/)`
+- Silence Tag: `[silent <duration>s]`
 
 Example:
 
 ```text
-I [live](/lɪv/) in a city. [silent](/3s/) Many concerts are broadcast [live](/lˈIv/).
+I [live](/lɪv/) in a city. [silent 3s] Many concerts are broadcast [live](/lˈIv/).
 ```
diff --git a/api/src/services/text_processing/text_processor.py b/api/src/services/text_processing/text_processor.py
index 0fa711d..3eb7173 100644
--- a/api/src/services/text_processing/text_processor.py
+++ b/api/src/services/text_processing/text_processor.py
@@ -14,8 +14,8 @@ from ...structures.schemas import NormalizationOptions
 # Pre-compiled regex patterns for performance
 CUSTOM_PHONEMES = re.compile(r"(\[([^\]]|\n)*?\])(\(\/([^\/)]|\n)*?\/\))")
-# Matching: [silent](/1s/), [silent](/0.5s/), [silent](/.5s/)
-CUSTOM_PHONEME_SILENCE_TAG = re.compile(r"\[silent\]\(\/(\d*\.?\d+)s\/\)")
+# Matching: [silent 1s], [silent 0.5s], [silent .5s]
+SILENCE_TAG = re.compile(r"\[silent (\d*\.?\d+)s\]")
 
 
 def process_text_chunk(
     text: str, language: str = "a", skip_phonemize: bool = False
@@ -113,7 +113,7 @@ def get_sentence_info(text: str, custom_phenomes_list: Dict[str, str]) -> List[T
 
         # Handle silence tags
         # Eg: "This is a test sentence, [silent](/1s/) with silence for one second."
-        while match := CUSTOM_PHONEME_SILENCE_TAG.search(sentence):
+        while match := SILENCE_TAG.search(sentence):
             match_prefix = sentence[:match.start()]  # `This is a test sentence, `
             match_text = match.group(0)  # `[silent](/1s/)`
             match_suffix = sentence[match.end():]  # ` with silence for one second.`
@@ -137,6 +137,10 @@ def get_sentence_info(text: str, custom_phenomes_list: Dict[str, str]) -> List[T
     return results
 
 def handle_custom_phonemes(s: re.Match[str], phenomes_list: Dict[str,str]) -> str:
+    """
+    Replace [text](/phonemes/) with a <|custom_phonemes_X|/> tag to avoid being normalized.
+    Silence tags like [silent 1.5s] are replaced too.
+    """
     latest_id = f"</|custom_phonemes_{len(phenomes_list)}|/>"
     phenomes_list[latest_id] = s.group(0).strip()
     return latest_id
@@ -154,9 +158,10 @@ async def smart_split(
 
     custom_phoneme_list = {}
 
+    text = SILENCE_TAG.sub(lambda s: handle_custom_phonemes(s, custom_phoneme_list), text)
+
     # Normalize text
     if settings.advanced_text_normalization and normalization_options.normalize:
-        print(lang_code)
         if lang_code in ["a","b","en-us","en-gb"]:
             text = CUSTOM_PHONEMES.sub(lambda s: handle_custom_phonemes(s, custom_phoneme_list), text)
             text=normalize_text(text,normalization_options)
@@ -172,7 +177,7 @@ async def smart_split(
 
     for sentence, tokens, count in sentences:
         # Handle silence tags
-        if CUSTOM_PHONEME_SILENCE_TAG.match(sentence):
+        if SILENCE_TAG.match(sentence):
             # Yield any existing chunk if present.
             if current_chunk:
                 chunk_text = " ".join(current_chunk)
diff --git a/api/src/services/tts_service.py b/api/src/services/tts_service.py
index 4195952..c27399d 100644
--- a/api/src/services/tts_service.py
+++ b/api/src/services/tts_service.py
@@ -21,7 +21,7 @@ from ..inference.voice_manager import get_manager as get_voice_manager
 from ..structures.schemas import NormalizationOptions
 from .audio import AudioNormalizer, AudioService
 from .text_processing import tokenize
-from .text_processing.text_processor import CUSTOM_PHONEME_SILENCE_TAG, smart_split
+from .text_processing.text_processor import SILENCE_TAG, smart_split
 
 
 class TTSService:
@@ -63,7 +63,7 @@ class TTSService:
         async with self._chunk_semaphore:
             try:
                 # Handle silence tags, eg: `[silent](0.5s)`
-                if match := CUSTOM_PHONEME_SILENCE_TAG.match(chunk_text):
+                if match := SILENCE_TAG.match(chunk_text):
                     silence_duration = float(match.group(1))
                     silence_audio = np.zeros(int(silence_duration * 24000), dtype=np.float32)
                     if not output_format:
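
For reviewers, a minimal self-contained sketch of what the new syntax accepts and how a matched tag maps to silent audio. The regex and the 24000 sample rate are copied from the patch above; the sample text and the `silence_for()` helper are illustrative assumptions, not part of the change.

```python
import re

import numpy as np

# Copied verbatim from text_processor.py in this patch.
SILENCE_TAG = re.compile(r"\[silent (\d*\.?\d+)s\]")
SAMPLE_RATE = 24000  # same sample rate used in tts_service.py


def silence_for(tag: str) -> np.ndarray:
    """Return the zero-filled audio a matched silence tag expands to."""
    match = SILENCE_TAG.match(tag)
    assert match is not None, f"not a silence tag: {tag!r}"
    duration = float(match.group(1))
    return np.zeros(int(duration * SAMPLE_RATE), dtype=np.float32)


text = "I [live](/lɪv/) in a city. [silent 3s] Many concerts are broadcast live."
for m in SILENCE_TAG.finditer(text):
    print(m.group(0), "->", silence_for(m.group(0)).shape)  # [silent 3s] -> (72000,)

print(SILENCE_TAG.search("[silent](/3s/)"))        # old syntax no longer matches -> None
print(SILENCE_TAG.match("[silent .5s]").group(1))  # fractional durations still work -> .5
```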