Change silent tag syntax to [silent 1.5s]

2025-08-05 16:48:53 +00:00 · 2025-04-04 03:49:27 +08:00 · 2025-04-04 03:49:27 +08:00 · 8c08655a08
commit 8c08655a08
parent 1d163f84f9
3 changed files with 14 additions and 9 deletions
--- a/README.md
+++ b/README.md
@ -340,12 +340,12 @@ Key Streaming Metrics:
 <summary>Custom Phonemes and Silence Tags</summary>

 - Custom Phoneme: `[<text>](/phoneme/)`
- Silence Tag: `[silent](/duration/)`
+- Silence Tag: `[silent <duration>]`

 Example:

 ```text
-I [live](/lɪv/) in a city.  [silent](/3s/) Many concerts are broadcast [live](/lˈIv/).
+I [live](/lɪv/) in a city.  [silent 3s] Many concerts are broadcast [live](/lˈIv/).
 ```

 </details>
--- a/api/src/services/text_processing/text_processor.py
+++ b/api/src/services/text_processing/text_processor.py
@ -14,8 +14,8 @@ from ...structures.schemas import NormalizationOptions

 # Pre-compiled regex patterns for performance
 CUSTOM_PHONEMES = re.compile(r"(\[([^\]]|\n)*?\])(\(\/([^\/)]|\n)*?\/\))")
-# Matching: [silent](/1s/), [silent](/0.5s/), [silent](/.5s/)
-CUSTOM_PHONEME_SILENCE_TAG = re.compile(r"\[silent\]\(\/(\d*\.?\d+)s\/\)")
+# Matching: [silent 1s], [silent 0.5s], [silent .5s]
+SILENCE_TAG = re.compile(r"\[silent (\d*\.?\d+)s\]")

 def process_text_chunk(
    text: str, language: str = "a", skip_phonemize: bool = False
@ -113,7 +113,7 @@ def get_sentence_info(text: str, custom_phenomes_list: Dict[str, str]) -> List[T
                
        # Handle silence tags
        # Eg: "This is a test sentence, [silent](/1s/) with silence for one second."
-        while match := CUSTOM_PHONEME_SILENCE_TAG.search(sentence):
+        while match := SILENCE_TAG.search(sentence):
            match_prefix = sentence[:match.start()] # `This is a test sentence, `
            match_text = match.group(0)             # `[silent](/1s/)`
            match_suffix = sentence[match.end():]   # ` with silence for one second.`
@ -137,6 +137,10 @@ def get_sentence_info(text: str, custom_phenomes_list: Dict[str, str]) -> List[T
    return results

 def handle_custom_phonemes(s: re.Match[str], phenomes_list: Dict[str,str]) -> str:
+    """
+    Replace [text](/phonemes/) with a </|custom_phonemes_X|/> tag to avoid being normalized.
+    Silence tags like [silent 1.5s] are replaced too.
+    """
    latest_id = f"</|custom_phonemes_{len(phenomes_list)}|/>"
    phenomes_list[latest_id] = s.group(0).strip()
    return latest_id
@ -154,9 +158,10 @@ async def smart_split(

    custom_phoneme_list = {}

+    text = SILENCE_TAG.sub(lambda s: handle_custom_phonemes(s, custom_phoneme_list), text)
+
    # Normalize text
    if settings.advanced_text_normalization and normalization_options.normalize:
-        print(lang_code)
        if lang_code in ["a","b","en-us","en-gb"]:
            text = CUSTOM_PHONEMES.sub(lambda s: handle_custom_phonemes(s, custom_phoneme_list), text)
            text=normalize_text(text,normalization_options)
@ -172,7 +177,7 @@ async def smart_split(

    for sentence, tokens, count in sentences:
        # Handle silence tags
-        if CUSTOM_PHONEME_SILENCE_TAG.match(sentence):
+        if SILENCE_TAG.match(sentence):
            # Yield any existing chunk if present.
            if current_chunk:
                chunk_text = " ".join(current_chunk)
--- a/api/src/services/tts_service.py
+++ b/api/src/services/tts_service.py
@ -21,7 +21,7 @@ from ..inference.voice_manager import get_manager as get_voice_manager
 from ..structures.schemas import NormalizationOptions
 from .audio import AudioNormalizer, AudioService
 from .text_processing import tokenize
-from .text_processing.text_processor import CUSTOM_PHONEME_SILENCE_TAG, smart_split
+from .text_processing.text_processor import SILENCE_TAG, smart_split


 class TTSService:
@ -63,7 +63,7 @@ class TTSService:
        async with self._chunk_semaphore:
            try:
                # Handle silence tags, eg: `[silent](0.5s)`
-                if match := CUSTOM_PHONEME_SILENCE_TAG.match(chunk_text):
+                if match := SILENCE_TAG.match(chunk_text):
                    silence_duration = float(match.group(1))
                    silence_audio = np.zeros(int(silence_duration * 24000), dtype=np.float32)
                    if not output_format: