From 906cf77a65773a108e8a752ceb259650a347efcc Mon Sep 17 00:00:00 2001 From: Fireblade Date: Fri, 28 Feb 2025 21:37:46 -0500 Subject: [PATCH 1/3] preserve custom phenomes --- .../text_processing/text_processor.py | 25 ++++++++++++++++--- 1 file changed, 22 insertions(+), 3 deletions(-) diff --git a/api/src/services/text_processing/text_processor.py b/api/src/services/text_processing/text_processor.py index 018cc51..1176c66 100644 --- a/api/src/services/text_processing/text_processor.py +++ b/api/src/services/text_processing/text_processor.py @@ -2,7 +2,7 @@ import re import time -from typing import AsyncGenerator, List, Tuple +from typing import AsyncGenerator, Dict, List, Tuple from loguru import logger @@ -12,6 +12,9 @@ from .phonemizer import phonemize from .vocabulary import tokenize from ...structures.schemas import NormalizationOptions +# Pre-compiled regex patterns for performance +CUSTOM_PHONEMES = re.compile(r"(\[([^\]]|\n)*?\])(\(\/([^\/)]|\n)*?\/\))") + def process_text_chunk( text: str, language: str = "a", skip_phonemize: bool = False ) -> List[int]: @@ -85,12 +88,21 @@ def process_text(text: str, language: str = "a") -> List[int]: return process_text_chunk(text, language) -def get_sentence_info(text: str) -> List[Tuple[str, List[int], int]]: +def get_sentence_info(text: str, custom_phenomes_list: Dict[str, str]) -> List[Tuple[str, List[int], int]]: """Process all sentences and return info.""" sentences = re.split(r"([.!?;:])(?=\s|$)", text) + phoneme_length, min_value = len(custom_phenomes_list), 0 + 1 results = [] for i in range(0, len(sentences), 2): sentence = sentences[i].strip() + for replaced in range(min_value, phoneme_length): + current_id = f"</|custom_phonemes_{replaced}|/>" + if current_id in sentence: + sentence = sentence.replace(current_id, custom_phenomes_list.pop(current_id)) + min_value += 1 + + punct = sentences[i + 1] if i + 1 < len(sentences) else "" if not sentence: @@ -102,6 +114,10 @@ def get_sentence_info(text: str) -> List[Tuple[str, List[int], int]]: return 
results +def handle_custom_phonemes(s: re.Match[str], phenomes_list: Dict[str,str]) -> str: + latest_id = f"</|custom_phonemes_{len(phenomes_list)}|/>" + phenomes_list[latest_id] = s.group(0).strip() + return latest_id async def smart_split( text: str, @@ -114,15 +130,18 @@ async def smart_split( chunk_count = 0 logger.info(f"Starting smart split for {len(text)} chars") + custom_phoneme_list = {} + # Normalize text if settings.advanced_text_normalization and normalization_options.normalize: if lang_code in ["a","b","en-us","en-gb"]: + text = CUSTOM_PHONEMES.sub(lambda s: handle_custom_phonemes(s, custom_phoneme_list), text) text=normalize_text(text,normalization_options) else: logger.info("Skipping text normalization as it is only supported for english") # Process all sentences - sentences = get_sentence_info(text) + sentences = get_sentence_info(text, custom_phoneme_list) current_chunk = [] current_tokens = [] From f415ce7109e7f8c5d2fdfe9d22e79c49635611b6 Mon Sep 17 00:00:00 2001 From: Fireblade Date: Fri, 28 Feb 2025 21:39:12 -0500 Subject: [PATCH 2/3] don't replace brackets as that is handled in misaki --- api/src/services/text_processing/normalizer.py | 1 - 1 file changed, 1 deletion(-) diff --git a/api/src/services/text_processing/normalizer.py b/api/src/services/text_processing/normalizer.py index 708b578..84c3694 100644 --- a/api/src/services/text_processing/normalizer.py +++ b/api/src/services/text_processing/normalizer.py @@ -270,7 +270,6 @@ def normalize_text(text: str,normalization_options: NormalizationOptions) -> str text = text.replace(chr(8216), "'").replace(chr(8217), "'") text = text.replace("«", chr(8220)).replace("»", chr(8221)) text = text.replace(chr(8220), '"').replace(chr(8221), '"') - text = text.replace("(", "«").replace(")", "»") # Handle CJK punctuation and some non standard chars for a, b in zip("、。!,:;?–", ",.!,:;?-"): From 43576c4a76a33ef908e39c4de068283d799564c4 Mon Sep 17 00:00:00 2001 From: Fireblade2534 <77405729+fireblade2534@users.noreply.github.com> Date: Sat, 1 Mar 
2025 12:45:41 -0500 Subject: [PATCH 3/3] Remove random 1 --- api/src/services/text_processing/text_processor.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/api/src/services/text_processing/text_processor.py b/api/src/services/text_processing/text_processor.py index 1176c66..0d8d36c 100644 --- a/api/src/services/text_processing/text_processor.py +++ b/api/src/services/text_processing/text_processor.py @@ -92,7 +92,7 @@ def get_sentence_info(text: str, custom_phenomes_list: Dict[str, str]) -> List[T """Process all sentences and return info.""" sentences = re.split(r"([.!?;:])(?=\s|$)", text) phoneme_length, min_value = len(custom_phenomes_list), 0 - 1 + results = [] for i in range(0, len(sentences), 2): sentence = sentences[i].strip() @@ -264,4 +264,4 @@ async def smart_split( total_time = time.time() - start_time logger.info( f"Split completed in {total_time * 1000:.2f}ms, produced {chunk_count} chunks" - ) \ No newline at end of file + )