diff --git a/api/src/inference/kokoro_v1.py b/api/src/inference/kokoro_v1.py index 985c513..d82310a 100644 --- a/api/src/inference/kokoro_v1.py +++ b/api/src/inference/kokoro_v1.py @@ -276,7 +276,9 @@ class KokoroV1(BaseModelBackend): ] ): continue - if not token.text or not token.text.strip(): + + # token.start_ts may be None + if not token.text or not token.text.strip() or token.start_ts is None or token.end_ts is None: continue start_time = float(token.start_ts) + current_offset diff --git a/api/src/services/text_processing/normalizer.py b/api/src/services/text_processing/normalizer.py index 0acfe4d..b19c68d 100644 --- a/api/src/services/text_processing/normalizer.py +++ b/api/src/services/text_processing/normalizer.py @@ -10,6 +10,7 @@ import inflect from numpy import number from torch import mul from ...structures.schemas import NormalizationOptions +from misaki import en from text_to_num import text2num @@ -90,10 +91,23 @@ URL_PATTERN = re.compile( UNIT_PATTERN = re.compile(r"((? str: + """ + Convert a string into a sound-alike format + + Kokoro supports embedding phonemes in the text, and the token timestamps is based on the original text. + - Original Input Text: '[Misaki](/misˈɑki/) is a G2P engine designed for [Kokoro](/kˈOkəɹO/) models.' + - Text For Timestamps: 'Misaki is a G2P engine designed for Kokoro models.' + """ + phonemes, _ = g2p(sound_like) + return f"[{text}](/{phonemes}/)" + def split_num(num: re.Match[str]) -> str: """Handle number splitting for various formats""" num = num.group() @@ -116,7 +130,7 @@ def split_num(num: re.Match[str]) -> str: return f"{left} hundred{s}" elif right < 10: return f"{left} oh {right}{s}" - return f"{left} {right}{s}" + return sound_like(num, f"{left} {right}{s}") def handle_units(u: re.Match[str]) -> str: """Converts units to their full form""" @@ -134,7 +148,7 @@ def handle_units(u: re.Match[str]) -> str: number=u.group(1).strip() unit[0]=INFLECT_ENGINE.no(unit[0],number) - return " ".join(unit) + return sound_like(u.group(), " ".join(unit)) def conditional_int(number: float, threshold: float = 0.00001): if abs(round(number) - number) < threshold: @@ -164,12 +178,12 @@ def handle_money(m: re.Match[str]) -> str: text_number = f"{INFLECT_ENGINE.number_to_words(int(round(number)))} {INFLECT_ENGINE.plural(bill, count=number)} and {INFLECT_ENGINE.number_to_words(sub_number)} {INFLECT_ENGINE.plural(coin, count=sub_number)}" - return text_number + return sound_like(m.group(), text_number) def handle_decimal(num: re.Match[str]) -> str: """Convert decimal numbers to spoken form""" a, b = num.group().split(".") - return " point ".join([a, " ".join(b)]) + return sound_like(num.group(), " point ".join([a, " ".join(b)])) def handle_email(m: re.Match[str]) -> str: @@ -179,7 +193,7 @@ def handle_email(m: re.Match[str]) -> str: if len(parts) == 2: user, domain = parts domain = domain.replace(".", " dot ") - return f"{user} at {domain}" + return sound_like(email, f"{user} at {domain}") return email @@ -227,34 +241,34 @@ def handle_url(u: re.Match[str]) -> str: url = url.replace("/", " slash ") # Handle any remaining slashes # Clean up extra spaces - return re.sub(r"\s+", " ", url).strip() + return sound_like(u.group(), re.sub(r"\s+", " ", url).strip()) def handle_phone_number(p: re.Match[str]) -> str: - p=list(p.groups()) + g=list(p.groups()) country_code="" - if p[0] is not None: - p[0]=p[0].replace("+","") - country_code += INFLECT_ENGINE.number_to_words(p[0]) + if g[0] is not None: + g[0]=g[0].replace("+","") + country_code += INFLECT_ENGINE.number_to_words(g[0]) - area_code=INFLECT_ENGINE.number_to_words(p[2].replace("(","").replace(")",""),group=1,comma="") + area_code=INFLECT_ENGINE.number_to_words(g[2].replace("(","").replace(")",""),group=1,comma="") - telephone_prefix=INFLECT_ENGINE.number_to_words(p[3],group=1,comma="") + telephone_prefix=INFLECT_ENGINE.number_to_words(g[3],group=1,comma="") - line_number=INFLECT_ENGINE.number_to_words(p[4],group=1,comma="") + line_number=INFLECT_ENGINE.number_to_words(g[4],group=1,comma="") - return ",".join([country_code,area_code,telephone_prefix,line_number]) + return sound_like(p.group(), ",".join([country_code,area_code,telephone_prefix,line_number])) def handle_time(t: re.Match[str]) -> str: - t=t.groups() + g = t.groups() - numbers = " ".join([INFLECT_ENGINE.number_to_words(X.strip()) for X in t[0].split(":")]) + numbers = " ".join([INFLECT_ENGINE.number_to_words(X.strip()) for X in g[0].split(":")]) half="" - if t[2] is not None: - half=t[2].strip() + if g[2] is not None: + half=g[2].strip() - return numbers + half + return sound_like(t.group(), numbers + half) def normalize_text(text: str,normalization_options: NormalizationOptions) -> str: """Normalize text for TTS processing"""