Kokoro-FastAPI/api/src/services/text_processing/chunker.py

"""Text chunking service"""

import re

from ...core.config import settings


def _pack_words(fragment, max_chunk):
    """Greedily pack the words of ``fragment`` into chunks of at most ``max_chunk`` characters.

    Words are separated on any whitespace and re-joined with single spaces.
    A single word that is itself longer than ``max_chunk`` is yielded whole;
    words are never cut in the middle.
    """
    current = ""
    for word in fragment.split():
        candidate = f"{current} {word}" if current else word
        if current and len(candidate) > max_chunk:
            # Adding this word would overflow: flush and start a new chunk.
            yield current
            current = word
        else:
            current = candidate
    if current:
        yield current


def split_text(text: str, max_chunk=None):
    """Split text into chunks on natural pause points.

    The text is first split into sentences (after ``.``, ``!`` or ``?``
    followed by whitespace). Any sentence longer than ``max_chunk`` is split
    after semicolons/colons, any resulting part that is still too long is
    split after commas, and any piece that is *still* too long is broken on
    word boundaries so the size limit is honoured even for text without
    punctuation.

    Args:
        text: Text to split into chunks. Non-string values are converted with
            ``str()``; ``None`` is treated as empty text.
        max_chunk: Maximum chunk size in characters (defaults to
            settings.max_chunk_size).

    Yields:
        Non-empty, whitespace-stripped chunks in their original order. A chunk
        only exceeds ``max_chunk`` when it is a single word longer than the
        limit.
    """
    if max_chunk is None:
        max_chunk = settings.max_chunk_size

    if not isinstance(text, str):
        text = str(text) if text is not None else ""

    text = text.strip()
    if not text:
        return

    # Strongest pause first: sentence boundaries.
    for sentence in re.split(r"(?<=[.!?])\s+", text):
        sentence = sentence.strip()
        if not sentence:
            continue
        if len(sentence) <= max_chunk:
            yield sentence
            continue

        # Sentence is too long: split after semicolons and colons.
        for part in re.split(r"(?<=[;:])\s+", sentence):
            part = part.strip()
            if not part:
                continue
            if len(part) <= max_chunk:
                yield part
                continue

            # Part is still too long: split after commas.
            for subpart in re.split(r"(?<=,)\s+", part):
                subpart = subpart.strip()
                if not subpart:
                    continue
                if len(subpart) <= max_chunk:
                    yield subpart
                else:
                    # No punctuation left to split on; fall back to word
                    # boundaries so the chunk limit is still respected.
                    yield from _pack_words(subpart, max_chunk)
-update soundfile version -alignment with streaming standards -audio processing config settings -more comprehensive model warmup -minor model improvements -enhancing testing, benchmarking -cool ascii logo 2025-01-06 03:32:41 -07:00			`"""Text chunking service"""`

			`import re`
Ruff format + fix 2025-01-09 18:41:44 -07:00
-update soundfile version -alignment with streaming standards -audio processing config settings -more comprehensive model warmup -minor model improvements -enhancing testing, benchmarking -cool ascii logo 2025-01-06 03:32:41 -07:00			`from ...core.config import settings`


			`def split_text(text: str, max_chunk=None):`
			`"""Split text into chunks on natural pause points`
Ruff format + fix 2025-01-09 18:41:44 -07:00
-update soundfile version -alignment with streaming standards -audio processing config settings -more comprehensive model warmup -minor model improvements -enhancing testing, benchmarking -cool ascii logo 2025-01-06 03:32:41 -07:00			`Args:`
			`text: Text to split into chunks`
			`max_chunk: Maximum chunk size (defaults to settings.max_chunk_size)`
			`"""`
			`if max_chunk is None:`
			`max_chunk = settings.max_chunk_size`
Ruff format + fix 2025-01-09 18:41:44 -07:00
-update soundfile version -alignment with streaming standards -audio processing config settings -more comprehensive model warmup -minor model improvements -enhancing testing, benchmarking -cool ascii logo 2025-01-06 03:32:41 -07:00			`if not isinstance(text, str):`
			`text = str(text) if text is not None else ""`
Ruff format + fix 2025-01-09 18:41:44 -07:00
-update soundfile version -alignment with streaming standards -audio processing config settings -more comprehensive model warmup -minor model improvements -enhancing testing, benchmarking -cool ascii logo 2025-01-06 03:32:41 -07:00			`text = text.strip()`
			`if not text:`
			`return`
Ruff format + fix 2025-01-09 18:41:44 -07:00
-update soundfile version -alignment with streaming standards -audio processing config settings -more comprehensive model warmup -minor model improvements -enhancing testing, benchmarking -cool ascii logo 2025-01-06 03:32:41 -07:00			`# First split into sentences`
			`sentences = re.split(r"(?<=[.!?])\s+", text)`
Ruff format + fix 2025-01-09 18:41:44 -07:00
-update soundfile version -alignment with streaming standards -audio processing config settings -more comprehensive model warmup -minor model improvements -enhancing testing, benchmarking -cool ascii logo 2025-01-06 03:32:41 -07:00			`for sentence in sentences:`
			`sentence = sentence.strip()`
			`if not sentence:`
			`continue`
Ruff format + fix 2025-01-09 18:41:44 -07:00
-update soundfile version -alignment with streaming standards -audio processing config settings -more comprehensive model warmup -minor model improvements -enhancing testing, benchmarking -cool ascii logo 2025-01-06 03:32:41 -07:00			`# For medium-length sentences, split on punctuation`
			`if len(sentence) > max_chunk: # Lower threshold for more consistent sizes`
			`# First try splitting on semicolons and colons`
			`parts = re.split(r"(?<=[;:])\s+", sentence)`
Ruff format + fix 2025-01-09 18:41:44 -07:00
-update soundfile version -alignment with streaming standards -audio processing config settings -more comprehensive model warmup -minor model improvements -enhancing testing, benchmarking -cool ascii logo 2025-01-06 03:32:41 -07:00			`for part in parts:`
			`part = part.strip()`
			`if not part:`
			`continue`
Ruff format + fix 2025-01-09 18:41:44 -07:00
-update soundfile version -alignment with streaming standards -audio processing config settings -more comprehensive model warmup -minor model improvements -enhancing testing, benchmarking -cool ascii logo 2025-01-06 03:32:41 -07:00			`# If part is still long, split on commas`
			`if len(part) > max_chunk:`
			`subparts = re.split(r"(?<=,)\s+", part)`
			`for subpart in subparts:`
			`subpart = subpart.strip()`
			`if subpart:`
			`yield subpart`
			`else:`
			`yield part`
			`else:`
			`yield sentence`