From 9c279f2b5eac3d85e793576c959621f13eaacaa2 Mon Sep 17 00:00:00 2001
From: jiaohuix <1152937237@qq.com>
Date: Mon, 26 May 2025 15:30:03 +0800
Subject: [PATCH] feat(text): add Chinese punctuation-based sentence splitting
 for better TTS

---
 .../services/text_processing/normalizer.py |  2 +-
 .../text_processing/text_processor.py      | 65 ++++++++++++-------
 2 files changed, 42 insertions(+), 25 deletions(-)

diff --git a/api/src/services/text_processing/normalizer.py b/api/src/services/text_processing/normalizer.py
index 280a26e..5e5d6b6 100644
--- a/api/src/services/text_processing/normalizer.py
+++ b/api/src/services/text_processing/normalizer.py
@@ -11,7 +11,7 @@ from typing import List, Optional, Union
 
 import inflect
 from numpy import number
-from text_to_num import text2num
+# from text_to_num import text2num
 from torch import mul
 
 from ...structures.schemas import NormalizationOptions
diff --git a/api/src/services/text_processing/text_processor.py b/api/src/services/text_processing/text_processor.py
index 584affe..77fd525 100644
--- a/api/src/services/text_processing/text_processor.py
+++ b/api/src/services/text_processing/text_processor.py
@@ -88,32 +88,49 @@ def process_text(text: str, language: str = "a") -> List[int]:
 
 
 def get_sentence_info(
-    text: str, custom_phenomes_list: Dict[str, str]
+    text: str, custom_phenomes_list: Dict[str, str], lang_code: str = "a"
 ) -> List[Tuple[str, List[int], int]]:
-    """Process all sentences and return info."""
-    sentences = re.split(r"([.!?;:])(?=\s|$)", text)
+    """Process all sentences and return info, with support for Chinese sentence splitting."""
+    # Detect Chinese text via the language code or the presence of CJK characters
+    is_chinese = lang_code.startswith("zh") or re.search(r"[\u4e00-\u9fff]", text)
+    if is_chinese:
+        # Split on Chinese punctuation, keeping each mark as its own list item
+        sentences = re.split(r"([,。!?;])", text)
+        # Merge each punctuation mark back onto the preceding sentence
+        merged = []
+        for i in range(0, len(sentences) - 1, 2):
+            merged.append(sentences[i] + sentences[i + 1])
+        if len(sentences) % 2 == 1:
+            merged.append(sentences[-1])
+        sentences = merged
+    else:
+        sentences = re.split(r"([.!?;:])(?=\s|$)", text)
 
     phoneme_length, min_value = len(custom_phenomes_list), 0
-    results = []
-    for i in range(0, len(sentences), 2):
-        sentence = sentences[i].strip()
-        for replaced in range(min_value, phoneme_length):
-            current_id = f"</|custom_phonemes_{replaced}|/>"
-            if current_id in sentence:
-                sentence = sentence.replace(
-                    current_id, custom_phenomes_list.pop(current_id)
-                )
-                min_value += 1
-
-        punct = sentences[i + 1] if i + 1 < len(sentences) else ""
-
-        if not sentence:
-            continue
-
-        full = sentence + punct
-        tokens = process_text_chunk(full)
-        results.append((full, tokens, len(tokens)))
-
+    results = []
+    if is_chinese:
+        for sentence in sentences:
+            sentence = sentence.strip()
+            if not sentence:
+                continue
+            tokens = process_text_chunk(sentence)
+            results.append((sentence, tokens, len(tokens)))
+    else:
+        for i in range(0, len(sentences), 2):
+            sentence = sentences[i].strip()
+            for replaced in range(min_value, phoneme_length):
+                current_id = f"</|custom_phonemes_{replaced}|/>"
+                if current_id in sentence:
+                    sentence = sentence.replace(
+                        current_id, custom_phenomes_list.pop(current_id)
+                    )
+                    min_value += 1
+            punct = sentences[i + 1] if i + 1 < len(sentences) else ""
+            if not sentence:
+                continue
+            full = sentence + punct
+            tokens = process_text_chunk(full)
+            results.append((full, tokens, len(tokens)))
     return results
 
 
@@ -150,7 +167,7 @@ async def smart_split(
         )
 
     # Process all sentences
-    sentences = get_sentence_info(text, custom_phoneme_list)
+    sentences = get_sentence_info(text, custom_phoneme_list, lang_code=lang_code)
 
     current_chunk = []
     current_tokens = []
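
Note: for reference, a minimal standalone sketch of the split-and-merge logic
introduced above (it assumes only the regex from this patch; demo_split is a
hypothetical helper for illustration, not part of the change):

    import re

    def demo_split(text: str):
        # Split on Chinese punctuation, keeping the marks as separate items
        parts = re.split(r"([,。!?;])", text)
        # Re-attach each mark to the sentence that precedes it
        merged = [parts[i] + parts[i + 1] for i in range(0, len(parts) - 1, 2)]
        if len(parts) % 2 == 1 and parts[-1]:
            merged.append(parts[-1])
        return [s for s in merged if s.strip()]

    print(demo_split("今天天气很好,我们去公园散步。好吗?"))
    # -> ['今天天气很好,', '我们去公园散步。', '好吗?']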