Kokoro-FastAPI/api/src/services/text_processing/phonemizer.py

import re
from abc import ABC, abstractmethod

import phonemizer

from .normalizer import normalize_text

phonemizers = {}


class PhonemizerBackend(ABC):
    """Interface that every phonemization backend has to implement."""

    @abstractmethod
    def phonemize(self, text: str) -> str:
        """Turn a piece of text into its phoneme representation.

        Concrete backends must override this method; the base class cannot
        be instantiated while it is still abstract.

        Args:
            text: The input text to phonemize.

        Returns:
            The phoneme string produced for ``text``.
        """
        ...


class EspeakBackend(PhonemizerBackend):
    """Phonemizer backend that delegates to espeak through ``phonemizer``."""

    # Single-character substitutions applied to the espeak output. None of the
    # replacement characters is itself a key, so one translate() pass gives the
    # same result as replacing them one after another.
    _CHAR_FIXES = str.maketrans({"ʲ": "j", "r": "ɹ", "x": "k", "ɬ": "l"})

    # Zero-width position between a letter (or "ɹ"/"ː") and a directly
    # following "hˈʌndɹɪd"; a space is inserted there.
    _BEFORE_HUNDRED = re.compile(r"(?<=[a-zɹː])(?=hˈʌndɹɪd)")

    # " z" followed by punctuation, a quote, a space or the end of the string;
    # the leading space is dropped so the "z" joins the preceding token.
    _DETACHED_Z = re.compile(r' z(?=[;:,.!?¡¿—…"«»"" ]|$)')

    # "ti" right after "nˈaɪn" and not followed by "ː" (en-us rule only).
    _NINE_TI = re.compile(r"(?<=nˈaɪn)ti(?!ː)")

    def __init__(self, language: str):
        """Create the underlying espeak backend for one language.

        Args:
            language: espeak language code, e.g. 'en-us' or 'en-gb'.
        """
        self.backend = phonemizer.backend.EspeakBackend(
            language=language,
            preserve_punctuation=True,
            with_stress=True,
        )

        # Remembered so phonemize() can apply language-specific rules.
        self.language = language

    def phonemize(self, text: str) -> str:
        """Phonemize ``text`` with espeak and post-process the result.

        Args:
            text: Text to convert to phonemes.

        Returns:
            The cleaned-up phoneme string with surrounding whitespace removed;
            an empty string when the backend returns no results.
        """
        # The wrapped backend works on a list of utterances; send one and
        # take the single result back out.
        batch = self.backend.phonemize([text])
        if batch:
            phonemes = batch[0]
        else:
            phonemes = ""

        # Fixed rewrites of two specific phoneme spellings, moving the stress
        # mark to the first syllable (presumably the word "Kokoro" in its
        # en-us and en-gb renderings -- confirm).
        phonemes = phonemes.replace("kəkˈoːɹoʊ", "kˈoʊkəɹoʊ")
        phonemes = phonemes.replace("kəkˈɔːɹəʊ", "kˈəʊkəɹəʊ")

        # Generic clean-up shared by all languages.
        phonemes = phonemes.translate(self._CHAR_FIXES)
        phonemes = self._BEFORE_HUNDRED.sub(" ", phonemes)
        phonemes = self._DETACHED_Z.sub("z", phonemes)

        # US English only: "nˈaɪnti" becomes "nˈaɪndi".
        if self.language == "en-us":
            phonemes = self._NINE_TI.sub("di", phonemes)

        return phonemes.strip()


def create_phonemizer(language: str = "a") -> PhonemizerBackend:
    """Build the espeak backend that belongs to a short language code.

    Args:
        language: One-letter language code. Accepted values are 'a'
            (mapped to 'en-us'), 'b' (mapped to 'en-gb') and 'z'
            (mapped to 'z').

    Returns:
        A newly constructed EspeakBackend for the mapped espeak language.

    Raises:
        ValueError: If ``language`` is not one of the accepted codes.
    """
    # Short code -> espeak language code.
    # NOTE(review): "z" maps to itself; confirm that "z" is a language id
    # the espeak backend actually accepts.
    espeak_codes = {"a": "en-us", "b": "en-gb", "z": "z"}

    espeak_code = espeak_codes.get(language)
    if espeak_code is None:
        raise ValueError(f"Unsupported language code: {language}")

    return EspeakBackend(espeak_code)


def phonemize(text: str, language: str = "a", normalize: bool = True) -> str:
    """Phonemize text, reusing one cached backend per language code.

    Args:
        text: Text to convert to phonemes.
        language: Language code passed to ``create_phonemizer``
            ('a' for US English, 'b' for British English).
        normalize: When true, run ``normalize_text`` on ``text`` before
            phonemizing it.

    Returns:
        Phonemized text.
    """
    prepared = normalize_text(text) if normalize else text

    # Backends are kept in the module-level ``phonemizers`` dict so each
    # language code gets its backend created once and reused afterwards.
    # The dict is only mutated, never rebound, so no ``global`` is needed.
    backend = phonemizers.get(language)
    if backend is None:
        backend = create_phonemizer(language)
        phonemizers[language] = backend

    return backend.phonemize(prepared)
-												- CPU ONNX + PyTorch CUDA, functional
- Incorporated text processing module as service, towards modularization and optimizations
- Added text processing router for phonemization
- Enhanced benchmark statistics with real-time speed metrics

											
										
										
											2025-01-03 17:54:17 -07:00
+								import re
 								from abc import ABC, abstractmethod
-												Ruff format + fix

											
										
										
											2025-01-09 18:41:44 -07:00
-												- CPU ONNX + PyTorch CUDA, functional
- Incorporated text processing module as service, towards modularization and optimizations
- Added text processing router for phonemization
- Enhanced benchmark statistics with real-time speed metrics

											
										
										
											2025-01-03 17:54:17 -07:00
+								import phonemizer
-												Ruff format + fix

											
										
										
											2025-01-09 18:41:44 -07:00
-												- CPU ONNX + PyTorch CUDA, functional
- Incorporated text processing module as service, towards modularization and optimizations
- Added text processing router for phonemization
- Enhanced benchmark statistics with real-time speed metrics

											
										
										
											2025-01-03 17:54:17 -07:00
+								from .normalizer import normalize_text
-												Ruff check + formatting

											
										
										
											2025-02-09 18:32:17 -07:00
-												Fixed thread leak because of creating excessive E-speak backends

											
										
										
											2025-01-21 14:45:43 -05:00
+								phonemizers = {}
-												Ruff format + fix

											
										
										
											2025-01-09 18:41:44 -07:00
-												Ruff check + formatting

											
										
										
											2025-02-09 18:32:17 -07:00
-												- CPU ONNX + PyTorch CUDA, functional
- Incorporated text processing module as service, towards modularization and optimizations
- Added text processing router for phonemization
- Enhanced benchmark statistics with real-time speed metrics

											
										
										
											2025-01-03 17:54:17 -07:00
+								class PhonemizerBackend(ABC):
 								    """Abstract base class for phonemization backends"""
-												Ruff format + fix

											
										
										
											2025-01-09 18:41:44 -07:00
-												- CPU ONNX + PyTorch CUDA, functional
- Incorporated text processing module as service, towards modularization and optimizations
- Added text processing router for phonemization
- Enhanced benchmark statistics with real-time speed metrics

											
										
										
											2025-01-03 17:54:17 -07:00
+								    @abstractmethod
 								    def phonemize(self, text: str) -> str:
 								        """Convert text to phonemes
-												Ruff format + fix

											
										
										
											2025-01-09 18:41:44 -07:00
-												- CPU ONNX + PyTorch CUDA, functional
- Incorporated text processing module as service, towards modularization and optimizations
- Added text processing router for phonemization
- Enhanced benchmark statistics with real-time speed metrics

											
										
										
											2025-01-03 17:54:17 -07:00
+								        Args:
 								            text: Text to convert to phonemes
-												Ruff format + fix

											
										
										
											2025-01-09 18:41:44 -07:00
-												- CPU ONNX + PyTorch CUDA, functional
- Incorporated text processing module as service, towards modularization and optimizations
- Added text processing router for phonemization
- Enhanced benchmark statistics with real-time speed metrics

											
										
										
											2025-01-03 17:54:17 -07:00
+								        Returns:
 								            Phonemized text
 								        """
 								        pass
-												Ruff format + fix

											
										
										
											2025-01-09 18:41:44 -07:00
-												- CPU ONNX + PyTorch CUDA, functional
- Incorporated text processing module as service, towards modularization and optimizations
- Added text processing router for phonemization
- Enhanced benchmark statistics with real-time speed metrics

											
										
										
											2025-01-03 17:54:17 -07:00
+								class EspeakBackend(PhonemizerBackend):
 								    """Espeak-based phonemizer implementation"""
-												Ruff format + fix

											
										
										
											2025-01-09 18:41:44 -07:00
-												- CPU ONNX + PyTorch CUDA, functional
- Incorporated text processing module as service, towards modularization and optimizations
- Added text processing router for phonemization
- Enhanced benchmark statistics with real-time speed metrics

											
										
										
											2025-01-03 17:54:17 -07:00
+								    def __init__(self, language: str):
 								        """Initialize espeak backend
-												Ruff format + fix

											
										
										
											2025-01-09 18:41:44 -07:00
-												- CPU ONNX + PyTorch CUDA, functional
- Incorporated text processing module as service, towards modularization and optimizations
- Added text processing router for phonemization
- Enhanced benchmark statistics with real-time speed metrics

											
										
										
											2025-01-03 17:54:17 -07:00
+								        Args:
 								            language: Language code ('en-us' or 'en-gb')
 								        """
 								        self.backend = phonemizer.backend.EspeakBackend(
-												Ruff format + fix

											
										
										
											2025-01-09 18:41:44 -07:00
+								            language=language, preserve_punctuation=True, with_stress=True
-												- CPU ONNX + PyTorch CUDA, functional
- Incorporated text processing module as service, towards modularization and optimizations
- Added text processing router for phonemization
- Enhanced benchmark statistics with real-time speed metrics

											
										
										
											2025-01-03 17:54:17 -07:00
+								        )
-												Fixed thread leak because of creating excessive E-speak backends

											
										
										
											2025-01-21 14:45:43 -05:00
-												- CPU ONNX + PyTorch CUDA, functional
- Incorporated text processing module as service, towards modularization and optimizations
- Added text processing router for phonemization
- Enhanced benchmark statistics with real-time speed metrics

											
										
										
											2025-01-03 17:54:17 -07:00
+								        self.language = language
-												Ruff format + fix

											
										
										
											2025-01-09 18:41:44 -07:00
-												- CPU ONNX + PyTorch CUDA, functional
- Incorporated text processing module as service, towards modularization and optimizations
- Added text processing router for phonemization
- Enhanced benchmark statistics with real-time speed metrics

											
										
										
											2025-01-03 17:54:17 -07:00
+								    def phonemize(self, text: str) -> str:
 								        """Convert text to phonemes using espeak
-												Ruff format + fix

											
										
										
											2025-01-09 18:41:44 -07:00
-												- CPU ONNX + PyTorch CUDA, functional
- Incorporated text processing module as service, towards modularization and optimizations
- Added text processing router for phonemization
- Enhanced benchmark statistics with real-time speed metrics

											
										
										
											2025-01-03 17:54:17 -07:00
+								        Args:
 								            text: Text to convert to phonemes
-												Ruff format + fix

											
										
										
											2025-01-09 18:41:44 -07:00
-												- CPU ONNX + PyTorch CUDA, functional
- Incorporated text processing module as service, towards modularization and optimizations
- Added text processing router for phonemization
- Enhanced benchmark statistics with real-time speed metrics

											
										
										
											2025-01-03 17:54:17 -07:00
+								        Returns:
 								            Phonemized text
 								        """
 								        # Phonemize text
 								        ps = self.backend.phonemize([text])
 								        ps = ps[0] if ps else ""
-												Ruff format + fix

											
										
										
											2025-01-09 18:41:44 -07:00
-												- CPU ONNX + PyTorch CUDA, functional
- Incorporated text processing module as service, towards modularization and optimizations
- Added text processing router for phonemization
- Enhanced benchmark statistics with real-time speed metrics

											
										
										
											2025-01-03 17:54:17 -07:00
+								        # Handle special cases
 								        ps = ps.replace("kəkˈoːɹoʊ", "kˈoʊkəɹoʊ").replace("kəkˈɔːɹəʊ", "kˈəʊkəɹəʊ")
 								        ps = ps.replace("ʲ", "j").replace("r", "ɹ").replace("x", "k").replace("ɬ", "l")
 								        ps = re.sub(r"(?<=[a-zɹː])(?=hˈʌndɹɪd)", " ", ps)
 								        ps = re.sub(r' z(?=[;:,.!?¡¿—…"«»"" ]|$)', "z", ps)
-												Ruff format + fix

											
										
										
											2025-01-09 18:41:44 -07:00
-												- CPU ONNX + PyTorch CUDA, functional
- Incorporated text processing module as service, towards modularization and optimizations
- Added text processing router for phonemization
- Enhanced benchmark statistics with real-time speed metrics

											
										
										
											2025-01-03 17:54:17 -07:00
+								        # Language-specific rules
 								        if self.language == "en-us":
 								            ps = re.sub(r"(?<=nˈaɪn)ti(?!ː)", "di", ps)
-												Ruff format + fix

											
										
										
											2025-01-09 18:41:44 -07:00
-												- CPU ONNX + PyTorch CUDA, functional
- Incorporated text processing module as service, towards modularization and optimizations
- Added text processing router for phonemization
- Enhanced benchmark statistics with real-time speed metrics

											
										
										
											2025-01-03 17:54:17 -07:00
+								        return ps.strip()
-												Ruff format + fix

											
										
										
											2025-01-09 18:41:44 -07:00
-												- CPU ONNX + PyTorch CUDA, functional
- Incorporated text processing module as service, towards modularization and optimizations
- Added text processing router for phonemization
- Enhanced benchmark statistics with real-time speed metrics

											
										
										
											2025-01-03 17:54:17 -07:00
+								def create_phonemizer(language: str = "a") -> PhonemizerBackend:
 								    """Factory function to create phonemizer backend
-												Ruff format + fix

											
										
										
											2025-01-09 18:41:44 -07:00
-												- CPU ONNX + PyTorch CUDA, functional
- Incorporated text processing module as service, towards modularization and optimizations
- Added text processing router for phonemization
- Enhanced benchmark statistics with real-time speed metrics

											
										
										
											2025-01-03 17:54:17 -07:00
+								    Args:
 								        language: Language code ('a' for US English, 'b' for British English)
-												Ruff format + fix

											
										
										
											2025-01-09 18:41:44 -07:00
-												- CPU ONNX + PyTorch CUDA, functional
- Incorporated text processing module as service, towards modularization and optimizations
- Added text processing router for phonemization
- Enhanced benchmark statistics with real-time speed metrics

											
										
										
											2025-01-03 17:54:17 -07:00
+								    Returns:
 								        Phonemizer backend instance
 								    """
 								    # Map language codes to espeak language codes
-												Make the code cleaner and add tests

											
										
										
											2025-05-28 14:53:00 +00:00
+								    lang_map = {"a": "en-us", "b": "en-gb", "z": "z"}
-												Ruff format + fix

											
										
										
											2025-01-09 18:41:44 -07:00
-												- CPU ONNX + PyTorch CUDA, functional
- Incorporated text processing module as service, towards modularization and optimizations
- Added text processing router for phonemization
- Enhanced benchmark statistics with real-time speed metrics

											
										
										
											2025-01-03 17:54:17 -07:00
+								    if language not in lang_map:
 								        raise ValueError(f"Unsupported language code: {language}")
-												Ruff format + fix

											
										
										
											2025-01-09 18:41:44 -07:00
-												- CPU ONNX + PyTorch CUDA, functional
- Incorporated text processing module as service, towards modularization and optimizations
- Added text processing router for phonemization
- Enhanced benchmark statistics with real-time speed metrics

											
										
										
											2025-01-03 17:54:17 -07:00
+								    return EspeakBackend(lang_map[language])
-												Ruff format + fix

											
										
										
											2025-01-09 18:41:44 -07:00
-												- CPU ONNX + PyTorch CUDA, functional
- Incorporated text processing module as service, towards modularization and optimizations
- Added text processing router for phonemization
- Enhanced benchmark statistics with real-time speed metrics

											
										
										
											2025-01-03 17:54:17 -07:00
+								def phonemize(text: str, language: str = "a", normalize: bool = True) -> str:
 								    """Convert text to phonemes
-												Ruff format + fix

											
										
										
											2025-01-09 18:41:44 -07:00
-												- CPU ONNX + PyTorch CUDA, functional
- Incorporated text processing module as service, towards modularization and optimizations
- Added text processing router for phonemization
- Enhanced benchmark statistics with real-time speed metrics

											
										
										
											2025-01-03 17:54:17 -07:00
+								    Args:
 								        text: Text to convert to phonemes
 								        language: Language code ('a' for US English, 'b' for British English)
 								        normalize: Whether to normalize text before phonemization
-												Ruff format + fix

											
										
										
											2025-01-09 18:41:44 -07:00
-												- CPU ONNX + PyTorch CUDA, functional
- Incorporated text processing module as service, towards modularization and optimizations
- Added text processing router for phonemization
- Enhanced benchmark statistics with real-time speed metrics

											
										
										
											2025-01-03 17:54:17 -07:00
+								    Returns:
 								        Phonemized text
 								    """
-												Fixed thread leak because of creating excessive E-speak backends

											
										
										
											2025-01-21 14:45:43 -05:00
+								    global phonemizers
-												- CPU ONNX + PyTorch CUDA, functional
- Incorporated text processing module as service, towards modularization and optimizations
- Added text processing router for phonemization
- Enhanced benchmark statistics with real-time speed metrics

											
										
										
											2025-01-03 17:54:17 -07:00
+								    if normalize:
 								        text = normalize_text(text)
-												Fixed thread leak because of creating excessive E-speak backends

											
										
										
											2025-01-21 14:45:43 -05:00
+								    if language not in phonemizers:
-												Ruff check + formatting

											
										
										
											2025-02-09 18:32:17 -07:00
+								        phonemizers[language] = create_phonemizer(language)
-												Fixed thread leak because of creating excessive E-speak backends

											
										
										
											2025-01-21 14:45:43 -05:00
+								    return phonemizers[language].phonemize(text)