Mirror of https://github.com/remsky/Kokoro-FastAPI.git · synced 2025-08-05 16:48:53 +00:00
Change silent tag syntax to [silent 1.5s]
This commit is contained in:
parent 1d163f84f9 · commit 8c08655a08
3 changed files with 14 additions and 9 deletions

@@ -340,12 +340,12 @@ Key Streaming Metrics:
 <summary>Custom Phonemes and Silence Tags</summary>
 
 - Custom Phoneme: `[<text>](/phoneme/)`
-- Silence Tag: `[silent](/duration/)`
+- Silence Tag: `[silent <duration>]`
 
 Example:
 
 ```text
-I [live](/lɪv/) in a city. [silent](/3s/) Many concerts are broadcast [live](/lˈIv/).
+I [live](/lɪv/) in a city. [silent 3s] Many concerts are broadcast [live](/lˈIv/).
 ```
 
 </details>
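To try the new syntax end to end, here is a minimal sketch of sending the documented example text to a running instance. It assumes the OpenAI-compatible `/v1/audio/speech` endpoint on the default port 8880 that Kokoro-FastAPI is typically served on; the model name, voice id, and response format are illustrative assumptions, not taken from this commit.

```python
import requests

# Assumed endpoint and port for a local Kokoro-FastAPI instance; adjust to your deployment.
response = requests.post(
    "http://localhost:8880/v1/audio/speech",
    json={
        "model": "kokoro",        # assumed model name
        "voice": "af_bella",      # illustrative voice id
        "input": "I [live](/lɪv/) in a city. [silent 3s] Many concerts are broadcast [live](/lˈIv/).",
        "response_format": "mp3",
    },
)
response.raise_for_status()
with open("output.mp3", "wb") as f:
    f.write(response.content)
```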
@@ -14,8 +14,8 @@ from ...structures.schemas import NormalizationOptions
 
 # Pre-compiled regex patterns for performance
 CUSTOM_PHONEMES = re.compile(r"(\[([^\]]|\n)*?\])(\(\/([^\/)]|\n)*?\/\))")
-# Matching: [silent](/1s/), [silent](/0.5s/), [silent](/.5s/)
-CUSTOM_PHONEME_SILENCE_TAG = re.compile(r"\[silent\]\(\/(\d*\.?\d+)s\/\)")
+# Matching: [silent 1s], [silent 0.5s], [silent .5s]
+SILENCE_TAG = re.compile(r"\[silent (\d*\.?\d+)s\]")
 
 def process_text_chunk(
     text: str, language: str = "a", skip_phonemize: bool = False
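As a quick sanity check on the renamed pattern, the following sketch (not part of the commit) runs `SILENCE_TAG` against the forms listed in the comment above and confirms the old link-style tag no longer matches.

```python
import re

# Pattern copied from the diff above.
SILENCE_TAG = re.compile(r"\[silent (\d*\.?\d+)s\]")

for sample in ["[silent 1s]", "[silent 0.5s]", "[silent .5s]"]:
    match = SILENCE_TAG.match(sample)
    print(sample, "->", float(match.group(1)))  # 1.0, 0.5, 0.5

# The previous [silent](/1s/) syntax is intentionally rejected now.
assert SILENCE_TAG.match("[silent](/1s/)") is None
```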
@@ -113,7 +113,7 @@ def get_sentence_info(text: str, custom_phenomes_list: Dict[str, str]) -> List[T
 
         # Handle silence tags
         # Eg: "This is a test sentence, [silent](/1s/) with silence for one second."
-        while match := CUSTOM_PHONEME_SILENCE_TAG.search(sentence):
+        while match := SILENCE_TAG.search(sentence):
            match_prefix = sentence[:match.start()]  # `This is a test sentence, `
            match_text = match.group(0)  # `[silent](/1s/)`
            match_suffix = sentence[match.end():]  # ` with silence for one second.`
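The prefix/match/suffix split shown above can be reproduced with a small standalone sketch; the sentence mirrors the example in the comment, updated to the new tag syntax so the pattern actually matches.

```python
import re

SILENCE_TAG = re.compile(r"\[silent (\d*\.?\d+)s\]")

sentence = "This is a test sentence, [silent 1s] with silence for one second."
match = SILENCE_TAG.search(sentence)

match_prefix = sentence[:match.start()]   # "This is a test sentence, "
match_text = match.group(0)               # "[silent 1s]"
match_suffix = sentence[match.end():]     # " with silence for one second."

print(repr(match_prefix), repr(match_text), repr(match_suffix))
```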
@@ -137,6 +137,10 @@ def get_sentence_info(text: str, custom_phenomes_list: Dict[str, str]) -> List[T
     return results
 
 def handle_custom_phonemes(s: re.Match[str], phenomes_list: Dict[str,str]) -> str:
+    """
+    Replace [text](/phonemes/) with a <|custom_phonemes_X|/> tag to avoid being normalized.
+    Silence tags like [silence 1.5s] are replaced too.
+    """
     latest_id = f"</|custom_phonemes_{len(phenomes_list)}|/>"
     phenomes_list[latest_id] = s.group(0).strip()
     return latest_id
@@ -154,9 +158,10 @@ async def smart_split(
 
     custom_phoneme_list = {}
 
+    text = SILENCE_TAG.sub(lambda s: handle_custom_phonemes(s, custom_phoneme_list), text)
+
     # Normalize text
     if settings.advanced_text_normalization and normalization_options.normalize:
-        print(lang_code)
         if lang_code in ["a","b","en-us","en-gb"]:
             text = CUSTOM_PHONEMES.sub(lambda s: handle_custom_phonemes(s, custom_phoneme_list), text)
             text=normalize_text(text,normalization_options)
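The new `SILENCE_TAG.sub(...)` call protects silence tags from normalization the same way custom phonemes are protected: each tag is swapped for a placeholder id and stored for later. A self-contained sketch of that round trip, reusing the `handle_custom_phonemes` body from the diff above (the sample text is illustrative):

```python
import re
from typing import Dict

SILENCE_TAG = re.compile(r"\[silent (\d*\.?\d+)s\]")

def handle_custom_phonemes(s: re.Match[str], phenomes_list: Dict[str, str]) -> str:
    # Same logic as in the diff: mint a placeholder id and stash the original tag text.
    latest_id = f"</|custom_phonemes_{len(phenomes_list)}|/>"
    phenomes_list[latest_id] = s.group(0).strip()
    return latest_id

custom_phoneme_list: Dict[str, str] = {}
text = "Hello there. [silent 1.5s] Back again."
protected = SILENCE_TAG.sub(lambda s: handle_custom_phonemes(s, custom_phoneme_list), text)

print(protected)            # Hello there. </|custom_phonemes_0|/> Back again.
print(custom_phoneme_list)  # {'</|custom_phonemes_0|/>': '[silent 1.5s]'}
```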
@@ -172,7 +177,7 @@ async def smart_split(
 
     for sentence, tokens, count in sentences:
         # Handle silence tags
-        if CUSTOM_PHONEME_SILENCE_TAG.match(sentence):
+        if SILENCE_TAG.match(sentence):
             # Yield any existing chunk if present.
             if current_chunk:
                 chunk_text = " ".join(current_chunk)
@@ -21,7 +21,7 @@ from ..inference.voice_manager import get_manager as get_voice_manager
 from ..structures.schemas import NormalizationOptions
 from .audio import AudioNormalizer, AudioService
 from .text_processing import tokenize
-from .text_processing.text_processor import CUSTOM_PHONEME_SILENCE_TAG, smart_split
+from .text_processing.text_processor import SILENCE_TAG, smart_split
 
 
 class TTSService:
@@ -63,7 +63,7 @@ class TTSService:
         async with self._chunk_semaphore:
             try:
                 # Handle silence tags, eg: `[silent](0.5s)`
-                if match := CUSTOM_PHONEME_SILENCE_TAG.match(chunk_text):
+                if match := SILENCE_TAG.match(chunk_text):
                     silence_duration = float(match.group(1))
                     silence_audio = np.zeros(int(silence_duration * 24000), dtype=np.float32)
                     if not output_format:
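For reference, the duration captured by `match.group(1)` is turned into raw silence by the `np.zeros(...)` line above. A minimal sketch of that arithmetic, with the 24 kHz sample rate pulled out as a named constant; the constant and helper are illustrative, not part of the service:

```python
import numpy as np

SAMPLE_RATE = 24000  # matches the hard-coded 24000 in the service code above

def make_silence(duration_s: float) -> np.ndarray:
    # Zero-valued float32 samples are silent audio at the model's sample rate.
    return np.zeros(int(duration_s * SAMPLE_RATE), dtype=np.float32)

silence = make_silence(1.5)
print(silence.shape, silence.dtype)  # (36000,) float32 -> 1.5 s of silence
```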