# Kokoro-FastAPI/api/src/services/text_processing/phonemizer.py
# (Header captured from the repository web view: 103 lines, 2.7 KiB, Python.)

import re
from abc import ABC, abstractmethod
import phonemizer
from .normalizer import normalize_text
phonemizers = {}


class PhonemizerBackend(ABC):
    """Abstract base class for phonemization backends.

    Subclasses must override :meth:`phonemize`; the base class itself cannot
    be instantiated because that method is abstract.
    """

    @abstractmethod
    def phonemize(self, text: str) -> str:
        """Convert text to phonemes.

        Args:
            text: Text to convert to phonemes.

        Returns:
            Phonemized text.
        """
class EspeakBackend(PhonemizerBackend):
    """Espeak-based phonemizer implementation.

    Wraps ``phonemizer.backend.EspeakBackend`` and applies a fixed set of
    string fix-ups to the phoneme output it returns.
    """

    # Single-character substitutions applied to the espeak output in one pass.
    # None of the replacement characters is itself a key, so this is
    # equivalent to applying the four replacements one after another.
    _CHAR_FIXES = str.maketrans({"ʲ": "j", "r": "ɹ", "x": "k", "ɬ": "l"})

    def __init__(self, language: str):
        """Initialize espeak backend.

        Args:
            language: espeak language code, e.g. 'en-us' or 'en-gb'.
        """
        self.backend = phonemizer.backend.EspeakBackend(
            language=language, preserve_punctuation=True, with_stress=True
        )
        self.language = language

    def phonemize(self, text: str) -> str:
        """Convert text to phonemes using espeak.

        Args:
            text: Text to convert to phonemes.

        Returns:
            Phonemized text with the fix-ups below applied and surrounding
            whitespace stripped. An empty string is returned when the
            wrapped backend yields no result.
        """
        # The wrapped backend takes a list of texts; only one is passed here.
        results = self.backend.phonemize([text])
        ps = results[0] if results else ""

        # Handle special cases: rewrite two specific phoneme sequences
        # (presumably espeak's rendering of "Kokoro" -- confirm).
        ps = ps.replace("kəkˈoːɹoʊ", "kˈoʊkəɹoʊ").replace("kəkˈɔːɹəʊ", "kˈəʊkəɹəʊ")
        ps = ps.translate(self._CHAR_FIXES)
        # Insert a space where "hˈʌndɹɪd" directly follows a letter, "ɹ" or "ː".
        ps = re.sub(r"(?<=[a-zɹː])(?=hˈʌndɹɪd)", " ", ps)
        # Drop the space before a lone "z" that is followed by punctuation,
        # a space, or the end of the string.
        ps = re.sub(r' z(?=[;:,.!?¡¿—…"«»"" ]|$)', "z", ps)

        # Language-specific rules
        if self.language == "en-us":
            # "ti" right after "nˈaɪn" (and not followed by "ː") becomes "di".
            ps = re.sub(r"(?<=nˈaɪn)ti(?!ː)", "di", ps)

        return ps.strip()
def create_phonemizer(language: str = "a") -> PhonemizerBackend:
    """Factory function to create a phonemizer backend.

    Args:
        language: Short language code: 'a' for US English ('en-us') or
            'b' for British English ('en-gb'). 'z' is also accepted and is
            passed to the backend unchanged as 'z'.

    Returns:
        Phonemizer backend instance.

    Raises:
        ValueError: If ``language`` is not one of the supported codes.
    """
    # Map language codes to espeak language codes
    # NOTE(review): "z" maps to itself rather than to a named espeak voice --
    # confirm that espeak accepts "z" as a language code.
    lang_map = {"a": "en-us", "b": "en-gb", "z": "z"}

    if language not in lang_map:
        raise ValueError(f"Unsupported language code: {language}")

    return EspeakBackend(lang_map[language])
def phonemize(text: str, language: str = "a", normalize: bool = True) -> str:
    """Convert text to phonemes.

    A backend is created the first time a language code is seen and is then
    reused from the module-level ``phonemizers`` dict on later calls.

    Args:
        text: Text to convert to phonemes.
        language: Language code ('a' for US English, 'b' for British English).
        normalize: Whether to normalize text before phonemization.

    Returns:
        Phonemized text.
    """
    # No ``global`` statement is needed: the cache dict is only mutated,
    # never rebound.
    if normalize:
        text = normalize_text(text)
    if language not in phonemizers:
        phonemizers[language] = create_phonemizer(language)
    return phonemizers[language].phonemize(text)