From b89da1ff280d1d32bc70b8df01ff3c738e7889ba Mon Sep 17 00:00:00 2001
From: Fireblade2534
Date: Wed, 28 May 2025 14:53:00 +0000
Subject: [PATCH] Make the code cleaner and add tests

---
 .../services/text_processing/phonemizer.py    |  2 +-
 .../text_processing/text_processor.py         | 51 ++++++---------
 api/tests/test_text_processor.py              | 62 +++++++++++++++++++
 dev/Test money.py                             |  6 +-
 4 files changed, 83 insertions(+), 38 deletions(-)

diff --git a/api/src/services/text_processing/phonemizer.py b/api/src/services/text_processing/phonemizer.py
index 5a50d64..c010005 100644
--- a/api/src/services/text_processing/phonemizer.py
+++ b/api/src/services/text_processing/phonemizer.py
@@ -75,7 +75,7 @@ def create_phonemizer(language: str = "a") -> PhonemizerBackend:
         Phonemizer backend instance
     """
     # Map language codes to espeak language codes
-    lang_map = {"a": "en-us", "b": "en-gb"}
+    lang_map = {"a": "en-us", "b": "en-gb", "z": "z"}
 
     if language not in lang_map:
         raise ValueError(f"Unsupported language code: {language}")
diff --git a/api/src/services/text_processing/text_processor.py b/api/src/services/text_processing/text_processor.py
index 77fd525..0dbb348 100644
--- a/api/src/services/text_processing/text_processor.py
+++ b/api/src/services/text_processing/text_processor.py
@@ -92,44 +92,30 @@ def get_sentence_info(
 ) -> List[Tuple[str, List[int], int]]:
     """Process all sentences and return info; supports Chinese sentence splitting."""
     # Determine whether the text is Chinese
-    is_chinese = lang_code.startswith("zh") or re.search(r"[\u4e00-\u9fff]", text)
+    is_chinese = lang_code.startswith("z") or re.search(r"[\u4e00-\u9fff]", text)
     if is_chinese:
         # Split on Chinese punctuation
-        sentences = re.split(r"([,。!?;])", text)
-        # Merge the punctuation back onto each sentence
-        merged = []
-        for i in range(0, len(sentences)-1, 2):
-            merged.append(sentences[i] + sentences[i+1])
-        if len(sentences) % 2 == 1:
-            merged.append(sentences[-1])
-        sentences = merged
+        sentences = re.split(r"([,。!?;])+", text)
     else:
         sentences = re.split(r"([.!?;:])(?=\s|$)", text)
     phoneme_length, min_value = len(custom_phenomes_list), 0
+
     results = []
-    if is_chinese:
-        for sentence in sentences:
-            sentence = sentence.strip()
-            if not sentence:
-                continue
-            tokens = process_text_chunk(sentence)
-            results.append((sentence, tokens, len(tokens)))
-    else:
-        for i in range(0, len(sentences), 2):
-            sentence = sentences[i].strip()
-            for replaced in range(min_value, phoneme_length):
-                current_id = f"</|custom_phonemes_{replaced}|/>"
-                if current_id in sentence:
-                    sentence = sentence.replace(
-                        current_id, custom_phenomes_list.pop(current_id)
-                    )
-                    min_value += 1
-            punct = sentences[i + 1] if i + 1 < len(sentences) else ""
-            if not sentence:
-                continue
-            full = sentence + punct
-            tokens = process_text_chunk(full)
-            results.append((full, tokens, len(tokens)))
+    for i in range(0, len(sentences), 2):
+        sentence = sentences[i].strip()
+        for replaced in range(min_value, phoneme_length):
+            current_id = f"</|custom_phonemes_{replaced}|/>"
+            if current_id in sentence:
+                sentence = sentence.replace(
+                    current_id, custom_phenomes_list.pop(current_id)
+                )
+                min_value += 1
+        punct = sentences[i + 1] if i + 1 < len(sentences) else ""
+        if not sentence:
+            continue
+        full = sentence + punct
+        tokens = process_text_chunk(full)
+        results.append((full, tokens, len(tokens)))
 
     return results
 
@@ -154,7 +140,6 @@ async def smart_split(
 
     # Normalize text
     if settings.advanced_text_normalization and normalization_options.normalize:
-        print(lang_code)
         if lang_code in ["a", "b", "en-us", "en-gb"]:
             text = CUSTOM_PHONEMES.sub(
                 lambda s: handle_custom_phonemes(s, custom_phoneme_list), text
diff --git a/api/tests/test_text_processor.py b/api/tests/test_text_processor.py
index bfcbcfe..6ff8282 100644
--- a/api/tests/test_text_processor.py
+++ b/api/tests/test_text_processor.py
@@ -103,3 +103,65 @@ async def test_smart_split_with_punctuation():
 
     # Verify punctuation is preserved
     assert all(any(p in chunk for p in "!?;:.") for chunk in chunks)
+
+def test_process_text_chunk_chinese_phonemes():
+    """Test processing with Chinese pinyin phonemes."""
+    pinyin = "nǐ hǎo lì"  # Example pinyin sequence with tones
+    tokens = process_text_chunk(pinyin, skip_phonemize=True, language="z")
+    assert isinstance(tokens, list)
+    assert len(tokens) > 0
+
+
+def test_get_sentence_info_chinese():
+    """Test Chinese sentence splitting and info extraction."""
+    text = "这是一个句子。这是第二个句子!第三个问题?"
+    results = get_sentence_info(text, {}, lang_code="z")
+
+    assert len(results) == 3
+    for sentence, tokens, count in results:
+        assert isinstance(sentence, str)
+        assert isinstance(tokens, list)
+        assert isinstance(count, int)
+        assert count == len(tokens)
+        assert count > 0
+
+@pytest.mark.asyncio
+async def test_smart_split_chinese_short():
+    """Test Chinese smart splitting with short text."""
+    text = "这是一句话。"
+    chunks = []
+    async for chunk_text, chunk_tokens in smart_split(text, lang_code="z"):
+        chunks.append((chunk_text, chunk_tokens))
+
+    assert len(chunks) == 1
+    assert isinstance(chunks[0][0], str)
+    assert isinstance(chunks[0][1], list)
+
+
+@pytest.mark.asyncio
+async def test_smart_split_chinese_long():
+    """Test Chinese smart splitting with longer text."""
+    text = "。".join([f"测试句子 {i}" for i in range(20)])
+
+    chunks = []
+    async for chunk_text, chunk_tokens in smart_split(text, lang_code="z"):
+        chunks.append((chunk_text, chunk_tokens))
+
+    assert len(chunks) > 1
+    for chunk_text, chunk_tokens in chunks:
+        assert isinstance(chunk_text, str)
+        assert isinstance(chunk_tokens, list)
+        assert len(chunk_tokens) > 0
+
+
+@pytest.mark.asyncio
+async def test_smart_split_chinese_punctuation():
+    """Test Chinese smart splitting with punctuation preservation."""
+    text = "第一句!第二问?第三句;第四句:第五句。"
+
+    chunks = []
+    async for chunk_text, _ in smart_split(text, lang_code="z"):
+        chunks.append(chunk_text)
+
+    # Verify Chinese punctuation is preserved
+    assert all(any(p in chunk for p in "!?;:。") for chunk in chunks)
\ No newline at end of file
diff --git a/dev/Test money.py b/dev/Test money.py
index 47e2a9c..57d1fa6 100644
--- a/dev/Test money.py
+++ b/dev/Test money.py
@@ -3,9 +3,7 @@
 import json
 
 import requests
 
-text = """the administration has offered up a platter of repression for more than a year and is still slated to lose $400 million.
-
-Columbia is the largest private landowner in New York City and boasts an endowment of $14.8 billion;"""
+text = """奶酪芝士很浓郁!臭豆腐芝士有争议?陈年奶酪价格昂贵。"""
 
 Type = "wav"
@@ -15,7 +13,7 @@ response = requests.post(
     json={
         "model": "kokoro",
         "input": text,
-        "voice": "af_heart+af_sky",
+        "voice": "zf_xiaobei",
         "speed": 1.0,
         "response_format": Type,
         "stream": False,
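
Reviewer note (not part of the patch): the unified get_sentence_info loop leans on a property of re.split — when the pattern contains a capturing group, the captured delimiters are interleaved with the text pieces, so even indices hold sentence text and odd indices hold punctuation. The Chinese pattern ([,。!?;])+ additionally consumes a run of marks and captures only the last one. A minimal standalone sketch of that split-and-pair logic, reusing only the two regexes from the hunk above (function and variable names here are illustrative, not from the repo):

# Sketch of the split-and-pair logic; regexes copied from the patch.
import re

CHINESE_PATTERN = r"([,。!?;])+"       # a run of marks captures its last mark
ENGLISH_PATTERN = r"([.!?;:])(?=\s|$)"  # mark must precede whitespace or EOL

def split_sentences(text: str, is_chinese: bool) -> list[str]:
    pattern = CHINESE_PATTERN if is_chinese else ENGLISH_PATTERN
    sentences = re.split(pattern, text)
    results = []
    # Even indices: sentence text; odd indices: the captured punctuation.
    for i in range(0, len(sentences), 2):
        sentence = sentences[i].strip()
        if not sentence:
            continue
        punct = sentences[i + 1] if i + 1 < len(sentences) else ""
        results.append(sentence + punct)
    return results

print(split_sentences("第一句!第二问?第三句;第四句。", is_chinese=True))
# -> ['第一句!', '第二问?', '第三句;', '第四句。']
print(split_sentences("Hello world. How are you? Fine!", is_chinese=False))
# -> ['Hello world.', 'How are you?', 'Fine!']

One consequence of the + quantifier sitting outside the group: "你好!!" splits to ['你好', '!', ''], so a run of repeated marks collapses to a single mark in the output.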
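The inner loop that swaps custom-phoneme placeholders back into the sentence is easiest to see with concrete data. A hedged sketch follows; the placeholder format "</|custom_phonemes_{n}|/>" is reconstructed from context (the ids are produced by handle_custom_phonemes elsewhere in the module) and should be treated as an assumption, as should the sample phoneme string:

# Hypothetical sketch of the placeholder round-trip in get_sentence_info.
# The id format and sample data are assumptions, not confirmed by this patch.
custom_phenomes_list = {"</|custom_phonemes_0|/>": "[kˈoʊkəɹoʊ]"}

sentence = "I say </|custom_phonemes_0|/> aloud"
for replaced in range(0, len(custom_phenomes_list)):
    current_id = f"</|custom_phonemes_{replaced}|/>"
    if current_id in sentence:
        # pop() both restores the phoneme markup and removes the consumed id,
        # so later sentences never rescan ids that were already replaced
        sentence = sentence.replace(current_id, custom_phenomes_list.pop(current_id))

print(sentence)  # -> "I say [kˈoʊkəɹoʊ] aloud"

The added tests can be exercised in isolation with pytest api/tests/test_text_processor.py -k chinese, assuming pytest-asyncio is configured as it already is for the suite's existing async tests.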