Preserve custom phonemes

This commit is contained in:
Fireblade 2025-02-28 21:37:46 -05:00
parent 7d73c3c7ee
commit 906cf77a65

View file

@ -2,7 +2,7 @@
import re
import time
from typing import AsyncGenerator, List, Tuple
from typing import AsyncGenerator, Dict, List, Tuple
from loguru import logger
@ -12,6 +12,9 @@ from .phonemizer import phonemize
from .vocabulary import tokenize
from ...structures.schemas import NormalizationOptions
# Pre-compiled regex patterns for performance
# Matches inline custom-phoneme markup of the form "[text](/phonemes/)":
# a bracketed span containing no "]", immediately followed by a "(/ ... /)"
# span whose content contains neither "/" nor ")". Both spans are matched
# lazily and either one may extend across line breaks.
CUSTOM_PHONEMES = re.compile(r"(\[([^\]]|\n)*?\])(\(\/([^\/)]|\n)*?\/\))")
def process_text_chunk(
text: str, language: str = "a", skip_phonemize: bool = False
) -> List[int]:
@ -85,12 +88,21 @@ def process_text(text: str, language: str = "a") -> List[int]:
return process_text_chunk(text, language)
def get_sentence_info(text: str) -> List[Tuple[str, List[int], int]]:
def get_sentence_info(text: str, custom_phenomes_list: Dict[str, str]) -> List[Tuple[str, List[int], int]]:
"""Process all sentences and return info."""
sentences = re.split(r"([.!?;:])(?=\s|$)", text)
phoneme_length, min_value = len(custom_phenomes_list), 0
1
results = []
for i in range(0, len(sentences), 2):
sentence = sentences[i].strip()
for replaced in range(min_value, phoneme_length):
current_id = f"</|custom_phonemes_{replaced}|/>"
if current_id in sentence:
sentence = sentence.replace(current_id, custom_phenomes_list.pop(current_id))
min_value += 1
punct = sentences[i + 1] if i + 1 < len(sentences) else ""
if not sentence:
@ -102,6 +114,10 @@ def get_sentence_info(text: str) -> List[Tuple[str, List[int], int]]:
return results
def handle_custom_phonemes(s: re.Match[str], phenomes_list: Dict[str,str]) -> str:
    """Swap one matched custom-phoneme span for a placeholder token.

    The full matched text, with surrounding whitespace stripped, is stored
    in ``phenomes_list`` under a freshly built placeholder key. The key is
    numbered with the size of the mapping before the insertion.

    Args:
        s: Regex match whose entire text (group 0) is to be preserved.
        phenomes_list: Mapping of placeholder -> original text; mutated
            in place by adding one entry.

    Returns:
        The placeholder key that now stands in for the matched text.
    """
    original_markup = s.group(0).strip()
    # Number the placeholder by the count of entries already recorded.
    placeholder = f"</|custom_phonemes_{len(phenomes_list)}|/>"
    phenomes_list[placeholder] = original_markup
    return placeholder
async def smart_split(
text: str,
@ -114,15 +130,18 @@ async def smart_split(
chunk_count = 0
logger.info(f"Starting smart split for {len(text)} chars")
custom_phoneme_list = {}
# Normalize text
if settings.advanced_text_normalization and normalization_options.normalize:
if lang_code in ["a","b","en-us","en-gb"]:
text = CUSTOM_PHONEMES.sub(lambda s: handle_custom_phonemes(s, custom_phoneme_list), text)
text=normalize_text(text,normalization_options)
else:
logger.info("Skipping text normalization as it is only supported for english")
# Process all sentences
sentences = get_sentence_info(text)
sentences = get_sentence_info(text, custom_phoneme_list)
current_chunk = []
current_tokens = []