mirror of
https://github.com/remsky/Kokoro-FastAPI.git
synced 2025-08-05 16:48:53 +00:00
Fixed formatting
This commit is contained in:
parent
f89b76d5d6
commit
c80d4feb69
3 changed files with 90 additions and 38 deletions
|
@ -4,6 +4,7 @@ Handles various text formats including URLs, emails, numbers, money, and special
|
||||||
Converts them into a format suitable for text-to-speech processing.
|
Converts them into a format suitable for text-to-speech processing.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
import math
|
||||||
import re
|
import re
|
||||||
from functools import lru_cache
|
from functools import lru_cache
|
||||||
from typing import List, Optional, Union
|
from typing import List, Optional, Union
|
||||||
|
@ -12,7 +13,6 @@ import inflect
|
||||||
from numpy import number
|
from numpy import number
|
||||||
from text_to_num import text2num
|
from text_to_num import text2num
|
||||||
from torch import mul
|
from torch import mul
|
||||||
import math
|
|
||||||
|
|
||||||
from ...structures.schemas import NormalizationOptions
|
from ...structures.schemas import NormalizationOptions
|
||||||
|
|
||||||
|
@ -134,11 +134,7 @@ VALID_UNITS = {
|
||||||
"px": "pixel", # CSS units
|
"px": "pixel", # CSS units
|
||||||
}
|
}
|
||||||
|
|
||||||
MONEY_UNITS = {
|
MONEY_UNITS = {"$": ("dollar", "cent"), "£": ("pound", "pence"), "€": ("euro", "cent")}
|
||||||
"$": ("dollar", "cent"),
|
|
||||||
"£": ("pound", "pence"),
|
|
||||||
"€": ("euro", "cent")
|
|
||||||
}
|
|
||||||
|
|
||||||
# Pre-compiled regex patterns for performance
|
# Pre-compiled regex patterns for performance
|
||||||
EMAIL_PATTERN = re.compile(
|
EMAIL_PATTERN = re.compile(
|
||||||
|
@ -159,22 +155,24 @@ UNIT_PATTERN = re.compile(
|
||||||
)
|
)
|
||||||
|
|
||||||
TIME_PATTERN = re.compile(
|
TIME_PATTERN = re.compile(
|
||||||
r"([0-9]{1,2} ?: ?[0-9]{2}( ?: ?[0-9]{2})?)( ?(pm|am)\b)?",
|
r"([0-9]{1,2} ?: ?[0-9]{2}( ?: ?[0-9]{2})?)( ?(pm|am)\b)?", re.IGNORECASE
|
||||||
re.IGNORECASE
|
|
||||||
)
|
)
|
||||||
|
|
||||||
MONEY_PATTERN = re.compile(
|
MONEY_PATTERN = re.compile(
|
||||||
r"(-?)([" + ''.join(MONEY_UNITS.keys()) + r"])(\d+(?:\.\d+)?)((?: hundred| thousand| (?:[bm]|tr|quadr)illion|k|m|b|t)*)\b",
|
r"(-?)(["
|
||||||
re.IGNORECASE
|
+ "".join(MONEY_UNITS.keys())
|
||||||
|
+ r"])(\d+(?:\.\d+)?)((?: hundred| thousand| (?:[bm]|tr|quadr)illion|k|m|b|t)*)\b",
|
||||||
|
re.IGNORECASE,
|
||||||
)
|
)
|
||||||
|
|
||||||
NUMBER_PATTERN = re.compile(
|
NUMBER_PATTERN = re.compile(
|
||||||
r"(-?)(\d+(?:\.\d+)?)((?: hundred| thousand| (?:[bm]|tr|quadr)illion|k|m|b)*)\b",
|
r"(-?)(\d+(?:\.\d+)?)((?: hundred| thousand| (?:[bm]|tr|quadr)illion|k|m|b)*)\b",
|
||||||
re.IGNORECASE
|
re.IGNORECASE,
|
||||||
)
|
)
|
||||||
|
|
||||||
INFLECT_ENGINE = inflect.engine()
|
INFLECT_ENGINE = inflect.engine()
|
||||||
|
|
||||||
|
|
||||||
def handle_units(u: re.Match[str]) -> str:
|
def handle_units(u: re.Match[str]) -> str:
|
||||||
"""Converts units to their full form"""
|
"""Converts units to their full form"""
|
||||||
unit_string = u.group(6).strip()
|
unit_string = u.group(6).strip()
|
||||||
|
@ -199,19 +197,27 @@ def conditional_int(number: float, threshold: float = 0.00001):
|
||||||
return int(round(number))
|
return int(round(number))
|
||||||
return number
|
return number
|
||||||
|
|
||||||
|
|
||||||
def translate_multiplier(multiplier: str) -> str:
|
def translate_multiplier(multiplier: str) -> str:
|
||||||
"""Translate multiplier abrevations to words"""
|
"""Translate multiplier abrevations to words"""
|
||||||
|
|
||||||
multiplier_translation = {"k": "thousand", "m": "million", "b": "billion", "t": "trillion"}
|
multiplier_translation = {
|
||||||
|
"k": "thousand",
|
||||||
|
"m": "million",
|
||||||
|
"b": "billion",
|
||||||
|
"t": "trillion",
|
||||||
|
}
|
||||||
if multiplier.lower() in multiplier_translation:
|
if multiplier.lower() in multiplier_translation:
|
||||||
return multiplier_translation[multiplier.lower()]
|
return multiplier_translation[multiplier.lower()]
|
||||||
return multiplier.strip()
|
return multiplier.strip()
|
||||||
|
|
||||||
|
|
||||||
def split_four_digit(number: float):
|
def split_four_digit(number: float):
|
||||||
part1 = str(conditional_int(number))[:2]
|
part1 = str(conditional_int(number))[:2]
|
||||||
part2 = str(conditional_int(number))[2:]
|
part2 = str(conditional_int(number))[2:]
|
||||||
return f"{INFLECT_ENGINE.number_to_words(part1)} {INFLECT_ENGINE.number_to_words(part2)}"
|
return f"{INFLECT_ENGINE.number_to_words(part1)} {INFLECT_ENGINE.number_to_words(part2)}"
|
||||||
|
|
||||||
|
|
||||||
def handle_numbers(n: re.Match[str]) -> str:
|
def handle_numbers(n: re.Match[str]) -> str:
|
||||||
number = n.group(2)
|
number = n.group(2)
|
||||||
|
|
||||||
|
@ -229,11 +235,17 @@ def handle_numbers(n: re.Match[str]) -> str:
|
||||||
if multiplier != "":
|
if multiplier != "":
|
||||||
multiplier = f" {multiplier}"
|
multiplier = f" {multiplier}"
|
||||||
else:
|
else:
|
||||||
if number % 1 == 0 and len(str(number)) == 4 and number > 1500 and number % 1000 > 9:
|
if (
|
||||||
|
number % 1 == 0
|
||||||
|
and len(str(number)) == 4
|
||||||
|
and number > 1500
|
||||||
|
and number % 1000 > 9
|
||||||
|
):
|
||||||
return split_four_digit(number)
|
return split_four_digit(number)
|
||||||
|
|
||||||
return f"{INFLECT_ENGINE.number_to_words(number)}{multiplier}"
|
return f"{INFLECT_ENGINE.number_to_words(number)}{multiplier}"
|
||||||
|
|
||||||
|
|
||||||
def handle_money(m: re.Match[str]) -> str:
|
def handle_money(m: re.Match[str]) -> str:
|
||||||
"""Convert money expressions to spoken form"""
|
"""Convert money expressions to spoken form"""
|
||||||
|
|
||||||
|
@ -365,7 +377,7 @@ def handle_time(t: re.Match[str]) -> str:
|
||||||
half = ""
|
half = ""
|
||||||
if len(time_parts) > 2:
|
if len(time_parts) > 2:
|
||||||
seconds_number = INFLECT_ENGINE.number_to_words(time_parts[2].strip())
|
seconds_number = INFLECT_ENGINE.number_to_words(time_parts[2].strip())
|
||||||
second_word = INFLECT_ENGINE.plural('second',int(time_parts[2].strip()))
|
second_word = INFLECT_ENGINE.plural("second", int(time_parts[2].strip()))
|
||||||
numbers.append(f"and {seconds_number} {second_word}")
|
numbers.append(f"and {seconds_number} {second_word}")
|
||||||
else:
|
else:
|
||||||
if t[2] is not None:
|
if t[2] is not None:
|
||||||
|
@ -441,10 +453,7 @@ def normalize_text(text: str, normalization_options: NormalizationOptions) -> st
|
||||||
text,
|
text,
|
||||||
)
|
)
|
||||||
|
|
||||||
text = NUMBER_PATTERN.sub(
|
text = NUMBER_PATTERN.sub(handle_numbers, text)
|
||||||
handle_numbers,
|
|
||||||
text
|
|
||||||
)
|
|
||||||
|
|
||||||
text = re.sub(r"\d*\.\d+", handle_decimal, text)
|
text = re.sub(r"\d*\.\d+", handle_decimal, text)
|
||||||
|
|
||||||
|
|
|
@ -150,7 +150,7 @@ def test_money():
|
||||||
assert (
|
assert (
|
||||||
normalize_text(
|
normalize_text(
|
||||||
"He went gambling and lost about $25.05k.",
|
"He went gambling and lost about $25.05k.",
|
||||||
normalization_options=NormalizationOptions()
|
normalization_options=NormalizationOptions(),
|
||||||
)
|
)
|
||||||
== "He went gambling and lost about twenty-five point zero five thousand dollars."
|
== "He went gambling and lost about twenty-five point zero five thousand dollars."
|
||||||
)
|
)
|
||||||
|
@ -169,91 +169,134 @@ def test_money():
|
||||||
)
|
)
|
||||||
|
|
||||||
assert (
|
assert (
|
||||||
normalize_text("The plant cost $200,000.8.", normalization_options=NormalizationOptions())
|
normalize_text(
|
||||||
|
"The plant cost $200,000.8.", normalization_options=NormalizationOptions()
|
||||||
|
)
|
||||||
== "The plant cost two hundred thousand dollars and eighty cents."
|
== "The plant cost two hundred thousand dollars and eighty cents."
|
||||||
)
|
)
|
||||||
|
|
||||||
assert (
|
assert (
|
||||||
normalize_text("€30.2 is in euros", normalization_options=NormalizationOptions())
|
normalize_text(
|
||||||
|
"€30.2 is in euros", normalization_options=NormalizationOptions()
|
||||||
|
)
|
||||||
== "thirty euros and twenty cents is in euros"
|
== "thirty euros and twenty cents is in euros"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
def test_time():
|
def test_time():
|
||||||
"""Test time normalization"""
|
"""Test time normalization"""
|
||||||
|
|
||||||
assert (
|
assert (
|
||||||
normalize_text("Your flight leaves at 10:35 pm", normalization_options=NormalizationOptions())
|
normalize_text(
|
||||||
|
"Your flight leaves at 10:35 pm",
|
||||||
|
normalization_options=NormalizationOptions(),
|
||||||
|
)
|
||||||
== "Your flight leaves at ten thirty-five pm"
|
== "Your flight leaves at ten thirty-five pm"
|
||||||
)
|
)
|
||||||
|
|
||||||
assert (
|
assert (
|
||||||
normalize_text("He departed for london around 5:03 am.", normalization_options=NormalizationOptions())
|
normalize_text(
|
||||||
|
"He departed for london around 5:03 am.",
|
||||||
|
normalization_options=NormalizationOptions(),
|
||||||
|
)
|
||||||
== "He departed for london around five oh three am."
|
== "He departed for london around five oh three am."
|
||||||
)
|
)
|
||||||
|
|
||||||
assert (
|
assert (
|
||||||
normalize_text("Only the 13:42 and 15:12 slots are available.", normalization_options=NormalizationOptions())
|
normalize_text(
|
||||||
|
"Only the 13:42 and 15:12 slots are available.",
|
||||||
|
normalization_options=NormalizationOptions(),
|
||||||
|
)
|
||||||
== "Only the thirteen forty-two and fifteen twelve slots are available."
|
== "Only the thirteen forty-two and fifteen twelve slots are available."
|
||||||
)
|
)
|
||||||
|
|
||||||
assert (
|
assert (
|
||||||
normalize_text("It is currently 1:00 pm", normalization_options=NormalizationOptions())
|
normalize_text(
|
||||||
|
"It is currently 1:00 pm", normalization_options=NormalizationOptions()
|
||||||
|
)
|
||||||
== "It is currently one pm"
|
== "It is currently one pm"
|
||||||
)
|
)
|
||||||
|
|
||||||
assert (
|
assert (
|
||||||
normalize_text("It is currently 3:00", normalization_options=NormalizationOptions())
|
normalize_text(
|
||||||
|
"It is currently 3:00", normalization_options=NormalizationOptions()
|
||||||
|
)
|
||||||
== "It is currently three o'clock"
|
== "It is currently three o'clock"
|
||||||
)
|
)
|
||||||
|
|
||||||
assert (
|
assert (
|
||||||
normalize_text("12:00 am is midnight", normalization_options=NormalizationOptions())
|
normalize_text(
|
||||||
|
"12:00 am is midnight", normalization_options=NormalizationOptions()
|
||||||
|
)
|
||||||
== "twelve am is midnight"
|
== "twelve am is midnight"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
def test_number():
|
def test_number():
|
||||||
"""Test number normalization"""
|
"""Test number normalization"""
|
||||||
|
|
||||||
assert (
|
assert (
|
||||||
normalize_text("I bought 1035 cans of soda", normalization_options=NormalizationOptions())
|
normalize_text(
|
||||||
|
"I bought 1035 cans of soda", normalization_options=NormalizationOptions()
|
||||||
|
)
|
||||||
== "I bought one thousand and thirty-five cans of soda"
|
== "I bought one thousand and thirty-five cans of soda"
|
||||||
)
|
)
|
||||||
|
|
||||||
assert (
|
assert (
|
||||||
normalize_text("The bus has a maximum capacity of 62 people", normalization_options=NormalizationOptions())
|
normalize_text(
|
||||||
|
"The bus has a maximum capacity of 62 people",
|
||||||
|
normalization_options=NormalizationOptions(),
|
||||||
|
)
|
||||||
== "The bus has a maximum capacity of sixty-two people"
|
== "The bus has a maximum capacity of sixty-two people"
|
||||||
)
|
)
|
||||||
|
|
||||||
assert (
|
assert (
|
||||||
normalize_text("There are 1300 products left in stock", normalization_options=NormalizationOptions())
|
normalize_text(
|
||||||
|
"There are 1300 products left in stock",
|
||||||
|
normalization_options=NormalizationOptions(),
|
||||||
|
)
|
||||||
== "There are one thousand, three hundred products left in stock"
|
== "There are one thousand, three hundred products left in stock"
|
||||||
)
|
)
|
||||||
|
|
||||||
assert (
|
assert (
|
||||||
normalize_text("The population is 7,890,000 people.", normalization_options=NormalizationOptions())
|
normalize_text(
|
||||||
|
"The population is 7,890,000 people.",
|
||||||
|
normalization_options=NormalizationOptions(),
|
||||||
|
)
|
||||||
== "The population is seven million, eight hundred and ninety thousand people."
|
== "The population is seven million, eight hundred and ninety thousand people."
|
||||||
)
|
)
|
||||||
|
|
||||||
assert (
|
assert (
|
||||||
normalize_text("He looked around but only found 1.6k of the 10k bricks", normalization_options=NormalizationOptions())
|
normalize_text(
|
||||||
|
"He looked around but only found 1.6k of the 10k bricks",
|
||||||
|
normalization_options=NormalizationOptions(),
|
||||||
|
)
|
||||||
== "He looked around but only found one point six thousand of the ten thousand bricks"
|
== "He looked around but only found one point six thousand of the ten thousand bricks"
|
||||||
)
|
)
|
||||||
|
|
||||||
assert (
|
assert (
|
||||||
normalize_text("The book has 342 pages.", normalization_options=NormalizationOptions())
|
normalize_text(
|
||||||
|
"The book has 342 pages.", normalization_options=NormalizationOptions()
|
||||||
|
)
|
||||||
== "The book has three hundred and forty-two pages."
|
== "The book has three hundred and forty-two pages."
|
||||||
)
|
)
|
||||||
|
|
||||||
assert (
|
assert (
|
||||||
normalize_text("He made -50 sales today.", normalization_options=NormalizationOptions())
|
normalize_text(
|
||||||
|
"He made -50 sales today.", normalization_options=NormalizationOptions()
|
||||||
|
)
|
||||||
== "He made minus fifty sales today."
|
== "He made minus fifty sales today."
|
||||||
)
|
)
|
||||||
|
|
||||||
assert (
|
assert (
|
||||||
normalize_text("56.789 to the power of 1.35 million", normalization_options=NormalizationOptions())
|
normalize_text(
|
||||||
|
"56.789 to the power of 1.35 million",
|
||||||
|
normalization_options=NormalizationOptions(),
|
||||||
|
)
|
||||||
== "fifty-six point seven eight nine to the power of one point three five million"
|
== "fifty-six point seven eight nine to the power of one point three five million"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
def test_non_url_text():
|
def test_non_url_text():
|
||||||
"""Test that non-URL text is unaffected"""
|
"""Test that non-URL text is unaffected"""
|
||||||
assert (
|
assert (
|
||||||
|
|
Loading…
Add table
Reference in a new issue