mirror of
https://github.com/remsky/Kokoro-FastAPI.git
synced 2025-08-05 16:48:53 +00:00

-alignment with streaming standards -audio processing config settings -more comprehensive model warmup -minor model improvements -enhancing testing, benchmarking -cool ascii logo
52 lines
1.6 KiB
Python
52 lines
1.6 KiB
Python
"""Text chunking service"""
|
|
|
|
import re
|
|
from ...core.config import settings
|
|
|
|
|
|
def _pack_words(piece: str, limit: int):
    """Regroup the words of *piece* into chunks of at most *limit* characters.

    Words are joined with single spaces and packed greedily. A single word
    longer than *limit* has no natural break point, so it is hard-sliced
    into *limit*-sized pieces. *limit* must be >= 1.
    """
    current = ""
    for word in piece.split():
        while len(word) > limit:
            # Flush what we have so output order follows input order.
            if current:
                yield current
                current = ""
            yield word[:limit]
            word = word[limit:]

        if not current:
            current = word
        elif len(current) + 1 + len(word) <= limit:  # +1 for the joining space
            current = f"{current} {word}"
        else:
            yield current
            current = word

    if current:
        yield current


def split_text(text: str, max_chunk=None):
    """Split text into chunks on natural pause points.

    The text is first split into sentences (after ``.``, ``!`` or ``?``).
    A sentence longer than ``max_chunk`` is split after semicolons/colons;
    a part that is still too long is split after commas; a comma-free run
    that is still too long is regrouped on word boundaries so that the
    size limit is actually honoured.

    Args:
        text: Text to split into chunks. Non-string values are converted
            with ``str()``; ``None`` is treated as empty text.
        max_chunk: Maximum chunk size in characters (defaults to
            settings.max_chunk_size). A non-positive value cannot be
            enforced, so the word-level fallback is skipped for it.

    Yields:
        Non-empty, whitespace-stripped chunks in their original order.
    """
    if max_chunk is None:
        max_chunk = settings.max_chunk_size

    if not isinstance(text, str):
        text = str(text) if text is not None else ""

    text = text.strip()
    if not text:
        return

    # Sentence boundaries: whitespace that follows terminal punctuation.
    for sentence in re.split(r"(?<=[.!?])\s+", text):
        sentence = sentence.strip()
        if not sentence:
            continue
        if len(sentence) <= max_chunk:
            yield sentence
            continue

        # Over-long sentence: break after semicolons and colons first.
        for part in re.split(r"(?<=[;:])\s+", sentence):
            part = part.strip()
            if not part:
                continue
            if len(part) <= max_chunk:
                yield part
                continue

            # Still too long: break after commas.
            for subpart in re.split(r"(?<=,)\s+", part):
                subpart = subpart.strip()
                if not subpart:
                    continue
                if len(subpart) <= max_chunk or max_chunk < 1:
                    yield subpart
                else:
                    # No punctuation left to split on; fall back to word
                    # boundaries so the chunk never exceeds max_chunk.
                    yield from _pack_words(subpart, max_chunk)
|