From 8c08655a0855f6295bf28ba776ca12ab1f5cd301 Mon Sep 17 00:00:00 2001
From: fondoger
Date: Fri, 4 Apr 2025 03:49:27 +0800
Subject: [PATCH] Change silent tag syntax to [silent 1.5s]

---
 README.md                                        |  4 ++--
 .../services/text_processing/text_processor.py   | 15 ++++++++++-----
 api/src/services/tts_service.py                  |  4 ++--
 3 files changed, 14 insertions(+), 9 deletions(-)

diff --git a/README.md b/README.md
index c3e380a..496974c 100644
--- a/README.md
+++ b/README.md
@@ -340,12 +340,12 @@ Key Streaming Metrics:
 
 Custom Phonemes and Silence Tags
 
 - Custom Phoneme: `[](/phoneme/)`
-- Silence Tag: `[silent](/duration/)`
+- Silence Tag: `[silent <duration>s]`
 
 Example:
 
 ```text
-I [live](/lɪv/) in a city. [silent](/3s/) Many concerts are broadcast [live](/lˈIv/).
+I [live](/lɪv/) in a city. [silent 3s] Many concerts are broadcast [live](/lˈIv/).
 ```
diff --git a/api/src/services/text_processing/text_processor.py b/api/src/services/text_processing/text_processor.py
index 0fa711d..3eb7173 100644
--- a/api/src/services/text_processing/text_processor.py
+++ b/api/src/services/text_processing/text_processor.py
@@ -14,8 +14,8 @@ from ...structures.schemas import NormalizationOptions
 # Pre-compiled regex patterns for performance
 CUSTOM_PHONEMES = re.compile(r"(\[([^\]]|\n)*?\])(\(\/([^\/)]|\n)*?\/\))")
-# Matching: [silent](/1s/), [silent](/0.5s/), [silent](/.5s/)
-CUSTOM_PHONEME_SILENCE_TAG = re.compile(r"\[silent\]\(\/(\d*\.?\d+)s\/\)")
+# Matching: [silent 1s], [silent 0.5s], [silent .5s]
+SILENCE_TAG = re.compile(r"\[silent (\d*\.?\d+)s\]")
 
 
 def process_text_chunk(
     text: str, language: str = "a", skip_phonemize: bool = False
@@ -113,7 +113,7 @@ def get_sentence_info(text: str, custom_phenomes_list: Dict[str, str]) -> List[T
 
         # Handle silence tags
         # Eg: "This is a test sentence, [silent](/1s/) with silence for one second."
-        while match := CUSTOM_PHONEME_SILENCE_TAG.search(sentence):
+        while match := SILENCE_TAG.search(sentence):
             match_prefix = sentence[:match.start()]  # `This is a test sentence, `
             match_text = match.group(0)  # `[silent](/1s/)`
             match_suffix = sentence[match.end():]  # ` with silence for one second.`
@@ -137,6 +137,10 @@ def get_sentence_info(text: str, custom_phenomes_list: Dict[str, str]) -> List[T
     return results
 
 def handle_custom_phonemes(s: re.Match[str], phenomes_list: Dict[str,str]) -> str:
+    """
+    Replace [text](/phonemes/) with a <|custom_phonemes_X|/> tag to avoid being normalized.
+    Silence tags like [silent 1.5s] are replaced too.
+    """
     latest_id = f"</|custom_phonemes_{len(phenomes_list)}|/>"
     phenomes_list[latest_id] = s.group(0).strip()
     return latest_id
@@ -154,9 +158,10 @@ async def smart_split(
 
     custom_phoneme_list = {}
 
+    text = SILENCE_TAG.sub(lambda s: handle_custom_phonemes(s, custom_phoneme_list), text)
+
     # Normalize text
     if settings.advanced_text_normalization and normalization_options.normalize:
-        print(lang_code)
         if lang_code in ["a","b","en-us","en-gb"]:
             text = CUSTOM_PHONEMES.sub(lambda s: handle_custom_phonemes(s, custom_phoneme_list), text)
             text=normalize_text(text,normalization_options)
@@ -172,7 +177,7 @@ async def smart_split(
 
     for sentence, tokens, count in sentences:
         # Handle silence tags
-        if CUSTOM_PHONEME_SILENCE_TAG.match(sentence):
+        if SILENCE_TAG.match(sentence):
             # Yield any existing chunk if present.
             if current_chunk:
                 chunk_text = " ".join(current_chunk)
diff --git a/api/src/services/tts_service.py b/api/src/services/tts_service.py
index 4195952..c27399d 100644
--- a/api/src/services/tts_service.py
+++ b/api/src/services/tts_service.py
@@ -21,7 +21,7 @@ from ..inference.voice_manager import get_manager as get_voice_manager
 from ..structures.schemas import NormalizationOptions
 from .audio import AudioNormalizer, AudioService
 from .text_processing import tokenize
-from .text_processing.text_processor import CUSTOM_PHONEME_SILENCE_TAG, smart_split
+from .text_processing.text_processor import SILENCE_TAG, smart_split
 
 
 class TTSService:
@@ -63,7 +63,7 @@ class TTSService:
         async with self._chunk_semaphore:
             try:
                 # Handle silence tags, eg: `[silent](0.5s)`
-                if match := CUSTOM_PHONEME_SILENCE_TAG.match(chunk_text):
+                if match := SILENCE_TAG.match(chunk_text):
                     silence_duration = float(match.group(1))
                     silence_audio = np.zeros(int(silence_duration * 24000), dtype=np.float32)
                     if not output_format:
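
For reviewers, a minimal self-contained sketch of what the new syntax accepts and how a matched tag maps to silent audio. The regex and the 24000 sample rate are copied from the patch above; the sample text and the `silence_for()` helper are illustrative assumptions, not part of the change.

```python
import re

import numpy as np

# Copied verbatim from text_processor.py in this patch.
SILENCE_TAG = re.compile(r"\[silent (\d*\.?\d+)s\]")
SAMPLE_RATE = 24000  # same sample rate used in tts_service.py


def silence_for(tag: str) -> np.ndarray:
    """Return the zero-filled audio a matched silence tag expands to."""
    match = SILENCE_TAG.match(tag)
    assert match is not None, f"not a silence tag: {tag!r}"
    duration = float(match.group(1))
    return np.zeros(int(duration * SAMPLE_RATE), dtype=np.float32)


text = "I [live](/lɪv/) in a city. [silent 3s] Many concerts are broadcast live."
for m in SILENCE_TAG.finditer(text):
    print(m.group(0), "->", silence_for(m.group(0)).shape)  # [silent 3s] -> (72000,)

print(SILENCE_TAG.search("[silent](/3s/)"))        # old syntax no longer matches -> None
print(SILENCE_TAG.match("[silent .5s]").group(1))  # fractional durations still work -> .5
```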