# Kokoro-FastAPI/api/src/services/text_processing/chunker.py

"""Text chunking service"""
import re
2025-01-09 18:41:44 -07:00
from ...core.config import settings
def split_text(text: str, max_chunk=None):
    """Split text into chunks on natural pause points.

    This is a generator. The text is first split into sentences after
    ``.``, ``!`` or ``?``. A sentence no longer than ``max_chunk`` is
    yielded whole. A longer sentence is split after ``;`` / ``:``, and any
    resulting part that is still longer than ``max_chunk`` is split again
    after commas.

    ``max_chunk`` only decides *when* to split further; it is not a hard
    cap. A fragment that contains no further punctuation is yielded as-is,
    even if it is longer than ``max_chunk``.

    Args:
        text: Text to split into chunks. ``None`` is treated as empty
            text; any other non-string value is converted with ``str()``.
        max_chunk: Maximum chunk size (defaults to settings.max_chunk_size)

    Yields:
        Non-empty chunks with surrounding whitespace stripped, in their
        original order.
    """
    if max_chunk is None:
        max_chunk = settings.max_chunk_size

    # Be lenient about input type: None becomes "", anything else is stringified.
    if not isinstance(text, str):
        text = str(text) if text is not None else ""

    text = text.strip()
    if not text:
        return

    # Level 1: sentence boundaries. The lookbehind keeps the terminating
    # punctuation attached to the sentence it ends.
    for sentence in re.split(r"(?<=[.!?])\s+", text):
        sentence = sentence.strip()
        if not sentence:
            continue

        if len(sentence) <= max_chunk:
            yield sentence
            continue

        # Level 2: the sentence is too long, so break it after semicolons
        # and colons.
        for part in re.split(r"(?<=[;:])\s+", sentence):
            part = part.strip()
            if not part:
                continue

            if len(part) <= max_chunk:
                yield part
                continue

            # Level 3: still too long, so break after commas. There is no
            # finer fallback; comma-free fragments are emitted at whatever
            # length they have.
            for subpart in re.split(r"(?<=,)\s+", part):
                subpart = subpart.strip()
                if subpart:
                    yield subpart