Kokoro-FastAPI/api/src/services/text_processing/chunker.py

53 lines
1.6 KiB
Python
Raw Normal View History

"""Text chunking service"""
import re
from ...core.config import settings
def split_text(text: str, max_chunk=None):
"""Split text into chunks on natural pause points
Args:
text: Text to split into chunks
max_chunk: Maximum chunk size (defaults to settings.max_chunk_size)
"""
if max_chunk is None:
max_chunk = settings.max_chunk_size
if not isinstance(text, str):
text = str(text) if text is not None else ""
text = text.strip()
if not text:
return
# First split into sentences
sentences = re.split(r"(?<=[.!?])\s+", text)
for sentence in sentences:
sentence = sentence.strip()
if not sentence:
continue
# For medium-length sentences, split on punctuation
if len(sentence) > max_chunk: # Lower threshold for more consistent sizes
# First try splitting on semicolons and colons
parts = re.split(r"(?<=[;:])\s+", sentence)
for part in parts:
part = part.strip()
if not part:
continue
# If part is still long, split on commas
if len(part) > max_chunk:
subparts = re.split(r"(?<=,)\s+", part)
for subpart in subparts:
subpart = subpart.strip()
if subpart:
yield subpart
else:
yield part
else:
yield sentence