# Kokoro-FastAPI/api/src/services/text_processing/vocabulary.py

def get_vocab():
    """Build the symbol -> token ID lookup table.

    The table is laid out as: the pad symbol first (ID 0), then the
    punctuation characters, then the ASCII letters, then the IPA symbols.
    Each character's ID is its position in that concatenated sequence.

    Some characters occur more than once in the sequence (the straight
    double quote in the punctuation run, the apostrophe in the IPA run).
    For those, the ID of the *last* occurrence is the one kept, so the
    resulting IDs are unique but not contiguous.

    Returns:
        Dict mapping each single-character symbol to its integer ID.
    """
    # NOTE(review): presumably these IDs must line up with the symbol table
    # the downstream model expects, so the order of the characters below
    # should not be changed -- confirm before editing.
    pad = "$"
    # NOTE(review): the straight double quote appears three times here; the
    # last two may have been typographic (curly) quotes in the reference
    # table -- verify against the model's symbol list.
    punctuation = ';:,.!?¡¿—…"«»"" '
    ascii_letters = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"
    ipa_letters = "ɑɐɒæɓʙβɔɕçɗɖðʤəɘɚɛɜɝɞɟʄɡɠɢʛɦɧħɥʜɨɪʝɭɬɫɮʟɱɯɰŋɳɲɴøɵɸθœɶʘɹɺɾɻʀʁɽʂʃʈʧʉʊʋⱱʌɣɤʍχʎʏʑʐʒʔʡʕʢǀǁǂǃˈˌːˑʼʴʰʱʲʷˠˤ˞↓↑→↗↘'̩'ᵻ"

    # Walk the full symbol sequence once; a repeated character simply
    # overwrites its earlier ID with the later position.
    vocab = {}
    for position, char in enumerate(pad + punctuation + ascii_letters + ipa_letters):
        vocab[char] = position
    return vocab


# Module-level symbol -> token ID table, built once at import time by
# get_vocab(); tokenize() and decode_tokens() both read from it.
VOCAB: dict[str, int] = get_vocab()


def tokenize(phonemes: str) -> list[int]:
    """Map a phoneme string to its sequence of token IDs.

    Leading and trailing whitespace is removed first, then each remaining
    character is looked up in ``VOCAB``. Characters that are not in the
    vocabulary are silently skipped, so the result may be shorter than the
    input.

    Args:
        phonemes: String of phonemes to tokenize

    Returns:
        List of token IDs, in input order
    """
    # Edge whitespace is trimmed because it could cause artifacts downstream
    trimmed = phonemes.strip()

    token_ids = []
    for symbol in trimmed:
        token_id = VOCAB.get(symbol)
        # Unknown symbols have no ID; drop them rather than fail
        if token_id is not None:
            token_ids.append(token_id)
    return token_ids


def decode_tokens(tokens: list[int]) -> str:
    """Turn a sequence of token IDs back into a phoneme string.

    Args:
        tokens: List of token IDs

    Returns:
        String of phonemes, one character per token

    Raises:
        KeyError: If a token ID has no entry in ``VOCAB``
    """
    # Invert VOCAB (symbol -> ID) into an ID -> symbol lookup
    symbol_by_id = dict(zip(VOCAB.values(), VOCAB.keys()))

    pieces = []
    for token_id in tokens:
        pieces.append(symbol_by_id[token_id])
    return "".join(pieces)