Change silent tag syntax to [silent 1.5s]

This commit is contained in:
fondoger 2025-04-04 03:49:27 +08:00
parent 1d163f84f9
commit 8c08655a08
3 changed files with 14 additions and 9 deletions

View file

@ -340,12 +340,12 @@ Key Streaming Metrics:
<summary>Custom Phonemes and Silence Tags</summary>
- Custom Phoneme: `[<text>](/phoneme/)`
- Silence Tag: `[silent](/duration/)`
- Silence Tag: `[silent <duration>]`
Example:
```text
I [live](/lɪv/) in a city. [silent](/3s/) Many concerts are broadcast [live](/lˈIv/).
I [live](/lɪv/) in a city. [silent 3s] Many concerts are broadcast [live](/lˈIv/).
```
</details>

View file

@ -14,8 +14,8 @@ from ...structures.schemas import NormalizationOptions
# Pre-compiled regex patterns for performance
CUSTOM_PHONEMES = re.compile(r"(\[([^\]]|\n)*?\])(\(\/([^\/)]|\n)*?\/\))")
# Matching: [silent](/1s/), [silent](/0.5s/), [silent](/.5s/)
CUSTOM_PHONEME_SILENCE_TAG = re.compile(r"\[silent\]\(\/(\d*\.?\d+)s\/\)")
# Matching: [silent 1s], [silent 0.5s], [silent .5s]
SILENCE_TAG = re.compile(r"\[silent (\d*\.?\d+)s\]")
def process_text_chunk(
text: str, language: str = "a", skip_phonemize: bool = False
@ -113,7 +113,7 @@ def get_sentence_info(text: str, custom_phenomes_list: Dict[str, str]) -> List[T
# Handle silence tags
# Eg: "This is a test sentence, [silent 1s] with silence for one second."
while match := CUSTOM_PHONEME_SILENCE_TAG.search(sentence):
while match := SILENCE_TAG.search(sentence):
match_prefix = sentence[:match.start()] # `This is a test sentence, `
match_text = match.group(0) # `[silent 1s]`
match_suffix = sentence[match.end():] # ` with silence for one second.`
@ -137,6 +137,10 @@ def get_sentence_info(text: str, custom_phenomes_list: Dict[str, str]) -> List[T
return results
def handle_custom_phonemes(s: re.Match[str], phenomes_list: Dict[str,str]) -> str:
    """
    Record a matched markup span and return the placeholder that stands in for it.

    Intended as the replacement callback for ``re.sub``: the full matched text
    (e.g. ``[text](/phonemes/)`` or a silence tag such as ``[silent 1.5s]``) is
    stored in ``phenomes_list`` under a ``</|custom_phonemes_X|/>`` key, and that
    key is returned so it can take the match's place in the text, keeping the
    original markup away from text normalization.

    Args:
        s: The regex match whose whole text (group 0) is to be preserved.
        phenomes_list: Mapping of placeholder -> original text; mutated in place.

    Returns:
        The placeholder key that was just added to ``phenomes_list``.
    """
    # X is the number of entries recorded so far, so each new placeholder is
    # distinct as long as entries are only ever added to the mapping.
    placeholder_index = len(phenomes_list)
    placeholder = f"</|custom_phonemes_{placeholder_index}|/>"

    original_markup = s.group(0).strip()
    phenomes_list[placeholder] = original_markup
    return placeholder
@ -154,9 +158,10 @@ async def smart_split(
custom_phoneme_list = {}
text = SILENCE_TAG.sub(lambda s: handle_custom_phonemes(s, custom_phoneme_list), text)
# Normalize text
if settings.advanced_text_normalization and normalization_options.normalize:
print(lang_code)
if lang_code in ["a","b","en-us","en-gb"]:
text = CUSTOM_PHONEMES.sub(lambda s: handle_custom_phonemes(s, custom_phoneme_list), text)
text=normalize_text(text,normalization_options)
@ -172,7 +177,7 @@ async def smart_split(
for sentence, tokens, count in sentences:
# Handle silence tags
if CUSTOM_PHONEME_SILENCE_TAG.match(sentence):
if SILENCE_TAG.match(sentence):
# Yield any existing chunk if present.
if current_chunk:
chunk_text = " ".join(current_chunk)

View file

@ -21,7 +21,7 @@ from ..inference.voice_manager import get_manager as get_voice_manager
from ..structures.schemas import NormalizationOptions
from .audio import AudioNormalizer, AudioService
from .text_processing import tokenize
from .text_processing.text_processor import CUSTOM_PHONEME_SILENCE_TAG, smart_split
from .text_processing.text_processor import SILENCE_TAG, smart_split
class TTSService:
@ -63,7 +63,7 @@ class TTSService:
async with self._chunk_semaphore:
try:
# Handle silence tags, eg: `[silent 0.5s]`
if match := CUSTOM_PHONEME_SILENCE_TAG.match(chunk_text):
if match := SILENCE_TAG.match(chunk_text):
silence_duration = float(match.group(1))
silence_audio = np.zeros(int(silence_duration * 24000), dtype=np.float32)
if not output_format: