# Kokoro-FastAPI/api/src/services/text_processing/chunker.py

"""Text chunking service"""

import re
from ...core.config import settings


# Pause points, from strongest to weakest. Each pattern matches the whitespace
# that follows the punctuation, so the punctuation stays attached to the text
# that precedes it.
_SENTENCE_BREAK = re.compile(r"(?<=[.!?])\s+")
_CLAUSE_BREAK = re.compile(r"(?<=[;:])\s+")
_COMMA_BREAK = re.compile(r"(?<=,)\s+")


def _pack_words(text, max_chunk):
    """Yield pieces of ``text`` no longer than ``max_chunk`` characters.

    Whitespace-separated words are packed greedily, joined by single spaces.
    A single word longer than ``max_chunk`` is sliced into ``max_chunk``-sized
    pieces. ``max_chunk`` must be at least 1.
    """
    current = ""
    for word in text.split():
        # A word that cannot fit in any chunk has to be cut mid-word.
        while len(word) > max_chunk:
            if current:
                yield current
                current = ""
            yield word[:max_chunk]
            word = word[max_chunk:]

        if not current:
            current = word
        elif len(current) + 1 + len(word) <= max_chunk:
            current = f"{current} {word}"
        else:
            yield current
            current = word

    if current:
        yield current


def split_text(text: str, max_chunk=None):
    """Split text into chunks on natural pause points

    The text is first split into sentences (after ``.``, ``!`` or ``?``).
    A sentence longer than ``max_chunk`` is split after semicolons/colons,
    a still-too-long part is split after commas, and a still-too-long piece
    is finally broken on whitespace (cutting inside a word only when that
    word alone exceeds ``max_chunk``). Pieces are never merged back together,
    so chunks may be much shorter than ``max_chunk``.

    Args:
        text: Text to split into chunks. Non-string values are converted
            with ``str()``; ``None`` is treated as empty text.
        max_chunk: Maximum chunk size in characters (defaults to
            settings.max_chunk_size). If it is less than 1 no length limit
            can be satisfied, so over-long text is only split on punctuation.

    Yields:
        Non-empty chunks stripped of surrounding whitespace, in their
        original order. Nothing is yielded for empty or whitespace-only text.
    """
    if max_chunk is None:
        max_chunk = settings.max_chunk_size

    if not isinstance(text, str):
        text = str(text) if text is not None else ""

    text = text.strip()
    if not text:
        return

    for sentence in _SENTENCE_BREAK.split(text):
        sentence = sentence.strip()
        if not sentence:
            continue

        if len(sentence) <= max_chunk:
            yield sentence
            continue

        # Sentence is too long: try the next-strongest pause points.
        for part in _CLAUSE_BREAK.split(sentence):
            part = part.strip()
            if not part:
                continue

            if len(part) <= max_chunk:
                yield part
                continue

            # Part is still too long: split after commas.
            for subpart in _COMMA_BREAK.split(part):
                subpart = subpart.strip()
                if not subpart:
                    continue

                if len(subpart) <= max_chunk or max_chunk < 1:
                    yield subpart
                else:
                    # No punctuation left to split on; enforce the limit by
                    # breaking on whitespace so no chunk exceeds max_chunk.
                    yield from _pack_words(subpart, max_chunk)