mirror of
https://github.com/remsky/Kokoro-FastAPI.git
synced 2025-08-05 16:48:53 +00:00
Reverted the kokoro version bump and changed the phonemizer to use the phonemizer that the rest of the text uses.
This commit is contained in:
parent
c7f09bf467
commit
fd86395e98
3 changed files with 38 additions and 40 deletions
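In short: in the normalizer, sound_like() now produces Kokoro's inline [text](/phonemes/) markup using the module's own phonemize() helper with the caller's lang_code, instead of a separately constructed misaki en.G2P instance. A minimal, self-contained sketch of that shape (stub_phonemize below is a stand-in, not the project's function):

    def stub_phonemize(text: str, language: str = "a", normalize: bool = False) -> str:
        # Stand-in for the project's phonemizer; the real backend returns phoneme strings.
        return text.lower()

    def sound_like(text: str, replacement: str, lang_code: str) -> str:
        # Phonemize the replacement string, then wrap the original text in
        # Kokoro's inline phoneme markup so it is spoken as the replacement.
        phonemes = stub_phonemize(replacement, language=lang_code, normalize=False)
        return f"[{text}](/{phonemes}/)"

    print(sound_like("10:30", "ten thirty", "a"))  # -> [10:30](/ten thirty/)

The markup format itself is the one illustrated in the docstring example further down ([Misaki](/misˈɑki/) ...).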
@@ -95,9 +95,8 @@ TIME_PATTERN = re.compile(r"([0-9]{1,2} ?: ?[0-9]{2}( ?: ?[0-9]{2})?)( ?(pm|am)\

 INFLECT_ENGINE=inflect.engine()

-g2p = en.G2P(trf=False, british=False, fallback=None)
+from .phonemizer import phonemize

-def sound_like(text: str, sound_like: str) -> str:
+def sound_like(text: str, sound_like: str, lang_code: str) -> str:
     """
     Convert a string into a sound-alike format

@@ -105,10 +104,10 @@ def sound_like(text: str, sound_like: str) -> str:
     - Original Input Text: '[Misaki](/misˈɑki/) is a G2P engine designed for [Kokoro](/kˈOkəɹO/) models.'
     - Text For Timestamps: 'Misaki is a G2P engine designed for Kokoro models.'
     """
-    phonemes, _ = g2p(sound_like)
+    phonemes = phonemize(sound_like, language = lang_code, normalize = False)
     return f"[{text}](/{phonemes}/)"

-def split_num(num: re.Match[str]) -> str:
+def split_num(num: re.Match[str], lang_code) -> str:
     """Handle number splitting for various formats"""
     num = num.group()
     if "." in num:

@@ -116,10 +115,10 @@ def split_num(num: re.Match[str]) -> str:
     elif ":" in num:
         h, m = [int(n) for n in num.split(":")]
         if m == 0:
-            return f"{h} o'clock"
+            return sound_like(num, f"{h} o'clock")
         elif m < 10:
-            return f"{h} oh {m}"
-        return f"{h} {m}"
+            return sound_like(num, f"{h} oh {m}")
+        return sound_like(num, f"{h} {m}", lang_code)
     year = int(num[:4])
     if year < 1100 or year % 1000 < 10:
         return num

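The HH:MM branch of split_num shown above spells the time out before wrapping it in sound_like(); isolated from that wrapping, the logic behaves like this sketch (not the project function):

    def spell_time(num: str) -> str:
        # Mirrors the HH:MM branch of split_num in the hunk above.
        h, m = [int(n) for n in num.split(":")]
        if m == 0:
            return f"{h} o'clock"
        elif m < 10:
            return f"{h} oh {m}"
        return f"{h} {m}"

    print(spell_time("10:00"))  # 10 o'clock
    print(spell_time("10:05"))  # 10 oh 5
    print(spell_time("10:30"))  # 10 30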
@@ -127,12 +126,12 @@ def split_num(num: re.Match[str]) -> str:
     s = "s" if num.endswith("s") else ""
     if 100 <= year % 1000 <= 999:
         if right == 0:
-            return f"{left} hundred{s}"
+            return sound_like(num, f"{left} hundred{s}", lang_code)
         elif right < 10:
-            return f"{left} oh {right}{s}"
-    return sound_like(num, f"{left} {right}{s}")
+            return sound_like(num, f"{left} oh {right}{s}", lang_code)
+    return sound_like(num, f"{left} {right}{s}", lang_code)

-def handle_units(u: re.Match[str]) -> str:
+def handle_units(u: re.Match[str], lang_code) -> str:
     """Converts units to their full form"""
     unit_string=u.group(6).strip()
     unit=unit_string

@@ -148,14 +147,14 @@ def handle_units(u: re.Match[str]) -> str:

     number=u.group(1).strip()
     unit[0]=INFLECT_ENGINE.no(unit[0],number)
-    return sound_like(u.group(), " ".join(unit))
+    return sound_like(u.group(), " ".join(unit), lang_code)

 def conditional_int(number: float, threshold: float = 0.00001):
     if abs(round(number) - number) < threshold:
         return int(round(number))
     return number

-def handle_money(m: re.Match[str]) -> str:
+def handle_money(m: re.Match[str], lang_code) -> str:
     """Convert money expressions to spoken form"""

     bill = "dollar" if m.group(2) == "$" else "pound"

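handle_units relies on inflect's no() helper (the unit[0]=INFLECT_ENGINE.no(unit[0],number) context line above) to print the count and pluralise the unit to match it. This is standard inflect behaviour, independent of the repo:

    import inflect

    INFLECT_ENGINE = inflect.engine()

    # no() renders the count and agrees the noun with it.
    print(INFLECT_ENGINE.no("meter", 1))   # 1 meter
    print(INFLECT_ENGINE.no("meter", 42))  # 42 meters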
@@ -178,7 +177,7 @@ def handle_money(m: re.Match[str]) -> str:

         text_number = f"{INFLECT_ENGINE.number_to_words(int(round(number)))} {INFLECT_ENGINE.plural(bill, count=number)} and {INFLECT_ENGINE.number_to_words(sub_number)} {INFLECT_ENGINE.plural(coin, count=sub_number)}"

-    return sound_like(m.group(), text_number)
+    return sound_like(m.group(), text_number, lang_code)

 def handle_decimal(num: re.Match[str]) -> str:
     """Convert decimal numbers to spoken form"""

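The text_number string in handle_money combines inflect's number_to_words() and plural(); assembled on its own, the dollars-and-cents phrase looks like the sketch below (hard-coded "dollar"/"cent" words, not the handler itself):

    import inflect

    INFLECT_ENGINE = inflect.engine()

    def spoken_money(number: int, sub_number: int, bill: str = "dollar", coin: str = "cent") -> str:
        # Same shape as the text_number assembly in the context line above.
        return (
            f"{INFLECT_ENGINE.number_to_words(number)} {INFLECT_ENGINE.plural(bill, count=number)}"
            f" and {INFLECT_ENGINE.number_to_words(sub_number)} {INFLECT_ENGINE.plural(coin, count=sub_number)}"
        )

    print(spoken_money(5, 1))   # five dollars and one cent
    print(spoken_money(1, 50))  # one dollar and fifty cents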
@@ -186,18 +185,18 @@ def handle_decimal(num: re.Match[str]) -> str:
     return sound_like(num.group(), " point ".join([a, " ".join(b)]))


-def handle_email(m: re.Match[str]) -> str:
+def handle_email(m: re.Match[str], lang_code) -> str:
     """Convert email addresses into speakable format"""
     email = m.group(0)
     parts = email.split("@")
     if len(parts) == 2:
         user, domain = parts
         domain = domain.replace(".", " dot ")
-        return sound_like(email, f"{user} at {domain}")
+        return sound_like(email, f"{user} at {domain}", lang_code)
     return email


-def handle_url(u: re.Match[str]) -> str:
+def handle_url(u: re.Match[str], lang_code: str) -> str:
     """Make URLs speakable by converting special characters to spoken words"""
     if not u:
         return ""

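handle_email only rewrites the matched address into something speakable before handing it to sound_like(); stripped of that wrapping, the email path is just:

    def speakable_email(email: str) -> str:
        # Mirrors the body of handle_email from the hunk above.
        parts = email.split("@")
        if len(parts) == 2:
            user, domain = parts
            domain = domain.replace(".", " dot ")
            return f"{user} at {domain}"
        return email

    print(speakable_email("support@example.com"))  # support at example dot com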
@@ -241,9 +240,9 @@ def handle_url(u: re.Match[str]) -> str:
     url = url.replace("/", " slash ")  # Handle any remaining slashes

     # Clean up extra spaces
-    return sound_like(u.group(), re.sub(r"\s+", " ", url).strip())
+    return sound_like(u.group(), re.sub(r"\s+", " ", url).strip(), lang_code)

-def handle_phone_number(p: re.Match[str]) -> str:
+def handle_phone_number(p: re.Match[str], lang_code: str) -> str:
     g=list(p.groups())

     country_code=""

@@ -257,9 +256,9 @@ def handle_phone_number(p: re.Match[str]) -> str:

     line_number=INFLECT_ENGINE.number_to_words(g[4],group=1,comma="")

-    return sound_like(p.group(), ",".join([country_code,area_code,telephone_prefix,line_number]))
+    return sound_like(p.group(), ",".join([country_code,area_code,telephone_prefix,line_number]), lang_code)

-def handle_time(t: re.Match[str]) -> str:
+def handle_time(t: re.Match[str], lang_code: str) -> str:
     g = t.groups()

     numbers = " ".join([INFLECT_ENGINE.number_to_words(X.strip()) for X in g[0].split(":")])

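handle_phone_number reads each digit group out one digit at a time with inflect's group=1 mode and comma="" (the same call as the line_number context line above), then joins the groups with commas so the TTS pauses between them. A small sketch; the exact spacing inflect emits in group mode is not asserted here:

    import inflect

    INFLECT_ENGINE = inflect.engine()

    # Spell out digit groups individually, as the handler does for each part.
    area_code = INFLECT_ENGINE.number_to_words("555", group=1, comma="")
    line_number = INFLECT_ENGINE.number_to_words("5309", group=1, comma="")
    print(",".join([area_code, line_number]))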
@@ -268,21 +267,21 @@ def handle_time(t: re.Match[str]) -> str:
     if g[2] is not None:
         half=g[2].strip()

-    return sound_like(t.group(), numbers + half)
+    return sound_like(t.group(), numbers + half, lang_code)

-def normalize_text(text: str,normalization_options: NormalizationOptions) -> str:
+def normalize_text(text: str,normalization_options: NormalizationOptions, lang_code = "a") -> str:
     """Normalize text for TTS processing"""
     # Handle email addresses first if enabled
     if normalization_options.email_normalization:
-        text = EMAIL_PATTERN.sub(handle_email, text)
+        text = EMAIL_PATTERN.sub(lambda g: handle_email(g, lang_code = lang_code), text)

     # Handle URLs if enabled
     if normalization_options.url_normalization:
-        text = URL_PATTERN.sub(handle_url, text)
+        text = URL_PATTERN.sub(lambda g: handle_url(g, lang_code = lang_code), text)

     # Pre-process numbers with units if enabled
     if normalization_options.unit_normalization:
-        text=UNIT_PATTERN.sub(handle_units,text)
+        text=UNIT_PATTERN.sub(lambda g: handle_units(g, lang_code = lang_code),text)

     # Replace optional pluralization
     if normalization_options.optional_pluralization_normalization:

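Because Pattern.sub and re.sub pass only the Match object to a replacement callable, the new lang_code is threaded through with small lambdas instead of changing the callable signature the regex layer expects. Reduced to a self-contained example with a hypothetical handler:

    import re

    def handle_word(m: re.Match, lang_code: str) -> str:
        # Hypothetical handler; the real handlers return sound_like() markup.
        return f"{m.group(0)}({lang_code})"

    lang_code = "a"
    text = "hello world"
    # The lambda closes over lang_code and forwards the Match object.
    print(re.sub(r"\w+", lambda g: handle_word(g, lang_code=lang_code), text))
    # -> hello(a) world(a)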
@@ -290,7 +289,7 @@ def normalize_text(text: str,normalization_options: NormalizationOptions) -> str

     # Replace phone numbers:
     if normalization_options.phone_normalization:
-        text = re.sub(r"(\+?\d{1,2})?([ .-]?)(\(?\d{3}\)?)[\s.-](\d{3})[\s.-](\d{4})",handle_phone_number,text)
+        text = re.sub(r"(\+?\d{1,2})?([ .-]?)(\(?\d{3}\)?)[\s.-](\d{3})[\s.-](\d{4})",lambda g: handle_phone_number(g, lang_code = lang_code),text)

     # Replace quotes and brackets
     text = text.replace(chr(8216), "'").replace(chr(8217), "'")

@@ -302,7 +301,7 @@ def normalize_text(text: str,normalization_options: NormalizationOptions) -> str
         text = text.replace(a, b + " ")

     # Handle simple time in the format of HH:MM:SS
-    text = TIME_PATTERN.sub(handle_time, text, )
+    text = TIME_PATTERN.sub(lambda g: handle_time(g, lang_code = lang_code), text, )

     # Clean up whitespace
     text = re.sub(r"[^\S \n]", " ", text)

@@ -324,15 +323,15 @@ def normalize_text(text: str,normalization_options: NormalizationOptions) -> str

     text = re.sub(
         r"(?i)(-?)([$£])(\d+(?:\.\d+)?)((?: hundred| thousand| (?:[bm]|tr|quadr)illion)*)\b",
-        handle_money,
+        lambda g: handle_money(g, lang_code = lang_code),
         text,
     )

     text = re.sub(
-        r"\d*\.\d+|\b\d{4}s?\b|(?<!:)\b(?:[1-9]|1[0-2]):[0-5]\d\b(?!:)", split_num, text
+        r"\d*\.\d+|\b\d{4}s?\b|(?<!:)\b(?:[1-9]|1[0-2]):[0-5]\d\b(?!:)", lambda g: split_num(g, lang_code = lang_code), text
     )

-    text = re.sub(r"\d*\.\d+", handle_decimal, text)
+    text = re.sub(r"\d*\.\d+", lambda g: handle_decimal(g, lang_code = lang_code), text)

     # Handle various formatting
     text = re.sub(r"(?<=\d)-(?=\d)", " to ", text)

@@ -88,7 +88,7 @@ def process_text(text: str, language: str = "a") -> List[int]:
     return process_text_chunk(text, language)


-def get_sentence_info(text: str, custom_phenomes_list: Dict[str, str]) -> List[Tuple[str, List[int], int]]:
+def get_sentence_info(text: str, custom_phenomes_list: Dict[str, str], lang_code: str = "a") -> List[Tuple[str, List[int], int]]:
     """Process all sentences and return info."""
     sentences = re.split(r"([.!?;:])(?=\s|$)", text)
     phoneme_length, min_value = len(custom_phenomes_list), 0

@@ -109,7 +109,7 @@ def get_sentence_info(text: str, custom_phenomes_list: Dict[str, str]) -> List[T
             continue

         full = sentence + punct
-        tokens = process_text_chunk(full)
+        tokens = process_text_chunk(full, language = lang_code)
         results.append((full, tokens, len(tokens)))

     return results

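get_sentence_info (previous hunks) keeps terminal punctuation by splitting on a capturing group and re-attaching each mark to its sentence, then tokenizes each sentence in the requested language. A sketch of just the splitting step, using the same regex:

    import re

    text = "Hello there! How are you? Fine."
    parts = re.split(r"([.!?;:])(?=\s|$)", text)
    # With a capturing group, re.split returns [sentence, punct, sentence, punct, ...]
    sentences = [
        parts[i].strip() + parts[i + 1]
        for i in range(0, len(parts) - 1, 2)
        if parts[i].strip()
    ]
    print(sentences)  # ['Hello there!', 'How are you?', 'Fine.']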
@@ -134,15 +134,14 @@ async def smart_split(

     # Normalize text
     if settings.advanced_text_normalization and normalization_options.normalize:
-        print(lang_code)
         if lang_code in ["a","b","en-us","en-gb"]:
             text = CUSTOM_PHONEMES.sub(lambda s: handle_custom_phonemes(s, custom_phoneme_list), text)
-            text=normalize_text(text,normalization_options)
+            text = normalize_text(text,normalization_options, lang_code= lang_code)
         else:
             logger.info("Skipping text normalization as it is only supported for english")

     # Process all sentences
-    sentences = get_sentence_info(text, custom_phoneme_list)
+    sentences = get_sentence_info(text, custom_phoneme_list, lang_code=lang_code)

     current_chunk = []
     current_tokens = []

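smart_split runs the advanced normalizer only for English codes (Kokoro's single-letter "a"/"b" or the explicit "en-us"/"en-gb") and now forwards the same lang_code into normalize_text. The gate reduces to something like this sketch, where stub_normalize_text stands in for the real normalize_text(text, options, lang_code=...):

    def stub_normalize_text(text: str, lang_code: str = "a") -> str:
        # Stand-in for the project's normalize_text; real normalization is far richer.
        return text.strip()

    def maybe_normalize(text: str, lang_code: str) -> str:
        # Same gate as in smart_split above.
        if lang_code in ["a", "b", "en-us", "en-gb"]:
            return stub_normalize_text(text, lang_code=lang_code)
        return text  # non-English codes skip normalization

    print(maybe_normalize(" Hello world ", "a"))  # 'Hello world'
    print(maybe_normalize(" Bonjour ", "f"))      # ' Bonjour ' (left as-is)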
@@ -178,7 +177,7 @@

             full_clause = clause + comma

-            tokens = process_text_chunk(full_clause)
+            tokens = process_text_chunk(full_clause, language = lang_code)
             count = len(tokens)

             # If adding clause keeps us under max and not optimal yet

@@ -31,8 +31,8 @@ dependencies = [
     "matplotlib>=3.10.0",
     "mutagen>=1.47.0",
     "psutil>=6.1.1",
-    "kokoro @ git+https://github.com/hexgrad/kokoro.git@26039de2dc8b2d464bb39506dacb44dffce9212a",
-    'misaki[en,ja,ko,zh] @ git+https://github.com/hexgrad/misaki.git@f9f9f75d5d0e3c1e6f26f4847a1232d88c408051',
+    "kokoro @ git+https://github.com/hexgrad/kokoro.git@31a2b6337b8c1b1418ef68c48142328f640da938",
+    'misaki[en,ja,ko,zh] @ git+https://github.com/hexgrad/misaki.git@ebc76c21b66c5fc4866ed0ec234047177b396170',
     "spacy==3.7.2",
     "en-core-web-sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl",
     "inflect>=7.5.0",