From 9c279f2b5eac3d85e793576c959621f13eaacaa2 Mon Sep 17 00:00:00 2001
From: jiaohuix <1152937237@qq.com>
Date: Mon, 26 May 2025 15:30:03 +0800
Subject: [PATCH] feat(text): add Chinese punctuation-based sentence splitting
 for better TTS

---
 .../services/text_processing/normalizer.py |  2 +-
 .../text_processing/text_processor.py      | 65 ++++++++++++-------
 2 files changed, 42 insertions(+), 25 deletions(-)

diff --git a/api/src/services/text_processing/normalizer.py b/api/src/services/text_processing/normalizer.py
index 280a26e..5e5d6b6 100644
--- a/api/src/services/text_processing/normalizer.py
+++ b/api/src/services/text_processing/normalizer.py
@@ -11,7 +11,7 @@ from typing import List, Optional, Union
 
 import inflect
 from numpy import number
-from text_to_num import text2num
+# from text_to_num import text2num
 from torch import mul
 
 from ...structures.schemas import NormalizationOptions
diff --git a/api/src/services/text_processing/text_processor.py b/api/src/services/text_processing/text_processor.py
index 584affe..77fd525 100644
--- a/api/src/services/text_processing/text_processor.py
+++ b/api/src/services/text_processing/text_processor.py
@@ -88,32 +88,49 @@ def process_text(text: str, language: str = "a") -> List[int]:
 
 
 def get_sentence_info(
-    text: str, custom_phenomes_list: Dict[str, str]
+    text: str, custom_phenomes_list: Dict[str, str], lang_code: str = "a"
 ) -> List[Tuple[str, List[int], int]]:
-    """Process all sentences and return info."""
-    sentences = re.split(r"([.!?;:])(?=\s|$)", text)
+    """Process all sentences and return info, with support for Chinese sentence splitting."""
+    # Detect Chinese text via the language code or the presence of CJK characters
+    is_chinese = lang_code.startswith("zh") or re.search(r"[\u4e00-\u9fff]", text)
+    if is_chinese:
+        # Split on Chinese punctuation, keeping each mark as its own list item
+        sentences = re.split(r"([,。!?;])", text)
+        # Merge each punctuation mark back onto the preceding sentence
+        merged = []
+        for i in range(0, len(sentences) - 1, 2):
+            merged.append(sentences[i] + sentences[i + 1])
+        if len(sentences) % 2 == 1:
+            merged.append(sentences[-1])
+        sentences = merged
+    else:
+        sentences = re.split(r"([.!?;:])(?=\s|$)", text)
 
     phoneme_length, min_value = len(custom_phenomes_list), 0
-    results = []
-    for i in range(0, len(sentences), 2):
-        sentence = sentences[i].strip()
-        for replaced in range(min_value, phoneme_length):
-            current_id = f"</|custom_phonemes_{replaced}|/>"
-            if current_id in sentence:
-                sentence = sentence.replace(
-                    current_id, custom_phenomes_list.pop(current_id)
-                )
-                min_value += 1
-
-        punct = sentences[i + 1] if i + 1 < len(sentences) else ""
-
-        if not sentence:
-            continue
-
-        full = sentence + punct
-        tokens = process_text_chunk(full)
-        results.append((full, tokens, len(tokens)))
-
+    results = []
+    if is_chinese:
+        for sentence in sentences:
+            sentence = sentence.strip()
+            if not sentence:
+                continue
+            tokens = process_text_chunk(sentence)
+            results.append((sentence, tokens, len(tokens)))
+    else:
+        for i in range(0, len(sentences), 2):
+            sentence = sentences[i].strip()
+            for replaced in range(min_value, phoneme_length):
+                current_id = f"</|custom_phonemes_{replaced}|/>"
+                if current_id in sentence:
+                    sentence = sentence.replace(
+                        current_id, custom_phenomes_list.pop(current_id)
+                    )
+                    min_value += 1
+            punct = sentences[i + 1] if i + 1 < len(sentences) else ""
+            if not sentence:
+                continue
+            full = sentence + punct
+            tokens = process_text_chunk(full)
+            results.append((full, tokens, len(tokens)))
     return results
 
 
@@ -150,7 +167,7 @@ async def smart_split(
         )
 
     # Process all sentences
-    sentences = get_sentence_info(text, custom_phoneme_list)
+    sentences = get_sentence_info(text, custom_phoneme_list, lang_code=lang_code)
 
     current_chunk = []
     current_tokens = []
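
Note: for reference, a minimal standalone sketch of the split-and-merge logic
introduced above (it assumes only the regex from this patch; demo_split is a
hypothetical helper for illustration, not part of the change):

    import re

    def demo_split(text: str):
        # Split on Chinese punctuation, keeping the marks as separate items
        parts = re.split(r"([,。!?;])", text)
        # Re-attach each mark to the sentence that precedes it
        merged = [parts[i] + parts[i + 1] for i in range(0, len(parts) - 1, 2)]
        if len(parts) % 2 == 1 and parts[-1]:
            merged.append(parts[-1])
        return [s for s in merged if s.strip()]

    print(demo_split("今天天气很好,我们去公园散步。好吗?"))
    # -> ['今天天气很好,', '我们去公园散步。', '好吗?']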