diff --git a/README.md b/README.md
index 6859163..5318b03 100644
--- a/README.md
+++ b/README.md
@@ -516,7 +516,36 @@ Monitor system state and resource usage with these endpoints:
Useful for debugging resource exhaustion or performance issues.
-## Known Issues
+## Known Issues & Troubleshooting
+
+
+Missing words & Missing some timestamps
+
+The api will automaticly do text normalization on input text which may incorrectly remove or change some phrases. This can be disabled by adding `"normalization_options":{"normalize": false}` to your request json:
+```python
+import requests
+
+response = requests.post(
+ "http://localhost:8880/v1/audio/speech",
+ json={
+ "input": "Hello world!",
+ "voice": "af_heart",
+ "response_format": "pcm",
+ "normalization_options":
+ {
+ "normalize": False
+ }
+ },
+ stream=True
+)
+
+for chunk in response.iter_content(chunk_size=1024):
+ if chunk:
+ # Process streaming chunks
+ pass
+```
+
+
Versioning & Development
diff --git a/api/src/routers/openai_compatible.py b/api/src/routers/openai_compatible.py
index 742c216..1e89151 100644
--- a/api/src/routers/openai_compatible.py
+++ b/api/src/routers/openai_compatible.py
@@ -125,20 +125,18 @@ async def process_and_validate_voices(voice_input: Union[str, List[str]], tts_se
async def stream_audio_chunks(tts_service: TTSService, request: Union[OpenAISpeechRequest, CaptionedSpeechRequest], client_request: Request, writer: StreamingAudioWriter) -> AsyncGenerator[AudioChunk, None]:
"""Stream audio chunks as they're generated with client disconnect handling"""
voice_name = await process_and_validate_voices(request.voice, tts_service)
-
unique_properties = {"return_timestamps": False}
if hasattr(request, "return_timestamps"):
unique_properties["return_timestamps"] = request.return_timestamps
try:
- logger.info(f"Starting audio generation with lang_code: {request.lang_code}")
async for chunk_data in tts_service.generate_audio_stream(
text=request.input,
voice=voice_name,
writer=writer,
speed=request.speed,
output_format=request.response_format,
- lang_code=request.lang_code or settings.default_voice_code or voice_name[0].lower(),
+ lang_code=request.lang_code,
normalization_options=request.normalization_options,
return_timestamps=unique_properties["return_timestamps"],
):
diff --git a/api/src/services/streaming_audio_writer.py b/api/src/services/streaming_audio_writer.py
index 763c5eb..75d87b4 100644
--- a/api/src/services/streaming_audio_writer.py
+++ b/api/src/services/streaming_audio_writer.py
@@ -25,7 +25,7 @@ class StreamingAudioWriter:
if self.format in ["wav","flac","mp3","pcm","aac","opus"]:
if self.format != "pcm":
self.output_buffer = BytesIO()
- self.container = av.open(self.output_buffer, mode="w", format=self.format)
+ self.container = av.open(self.output_buffer, mode="w", format=self.format if self.format != "aac" else "adts")
self.stream = self.container.add_stream(codec_map[self.format],sample_rate=self.sample_rate,layout='mono' if self.channels == 1 else 'stereo')
self.stream.bit_rate = 128000
else:
diff --git a/api/src/services/text_processing/normalizer.py b/api/src/services/text_processing/normalizer.py
index 84c3694..0acfe4d 100644
--- a/api/src/services/text_processing/normalizer.py
+++ b/api/src/services/text_processing/normalizer.py
@@ -8,9 +8,11 @@ import re
from functools import lru_cache
import inflect
from numpy import number
-
+from torch import mul
from ...structures.schemas import NormalizationOptions
+from text_to_num import text2num
+
# Constants
VALID_TLDS = [
"com",
@@ -134,25 +136,35 @@ def handle_units(u: re.Match[str]) -> str:
unit[0]=INFLECT_ENGINE.no(unit[0],number)
return " ".join(unit)
+def conditional_int(number: float, threshold: float = 0.00001):
+ if abs(round(number) - number) < threshold:
+ return int(round(number))
+ return number
+
def handle_money(m: re.Match[str]) -> str:
"""Convert money expressions to spoken form"""
- m = m.group()
- bill = "dollar" if m[0] == "$" else "pound"
- if m[-1].isalpha():
- return f"{INFLECT_ENGINE.number_to_words(m[1:])} {bill}s"
- elif "." not in m:
- s = "" if m[1:] == "1" else "s"
- return f"{INFLECT_ENGINE.number_to_words(m[1:])} {bill}{s}"
- b, c = m[1:].split(".")
- s = "" if b == "1" else "s"
- c = int(c.ljust(2, "0"))
- coins = (
- f"cent{'' if c == 1 else 's'}"
- if m[0] == "$"
- else ("penny" if c == 1 else "pence")
- )
- return f"{INFLECT_ENGINE.number_to_words(b)} {bill}{s} and {INFLECT_ENGINE.number_to_words(c)} {coins}"
+ bill = "dollar" if m.group(2) == "$" else "pound"
+ coin = "cent" if m.group(2) == "$" else "pence"
+ number = m.group(3)
+
+ multiplier = m.group(4)
+ try:
+ number = float(number)
+ except:
+ return m.group()
+
+ if m.group(1) == "-":
+ number *= -1
+
+ if number % 1 == 0 or multiplier != "":
+ text_number = f"{INFLECT_ENGINE.number_to_words(conditional_int(number))}{multiplier} {INFLECT_ENGINE.plural(bill, count=number)}"
+ else:
+ sub_number = int(str(number).split(".")[-1].ljust(2, "0"))
+
+ text_number = f"{INFLECT_ENGINE.number_to_words(int(round(number)))} {INFLECT_ENGINE.plural(bill, count=number)} and {INFLECT_ENGINE.number_to_words(sub_number)} {INFLECT_ENGINE.plural(coin, count=sub_number)}"
+
+ return text_number
def handle_decimal(num: re.Match[str]) -> str:
"""Convert decimal numbers to spoken form"""
@@ -297,7 +309,7 @@ def normalize_text(text: str,normalization_options: NormalizationOptions) -> str
text = re.sub(r"(?<=\d),(?=\d)", "", text)
text = re.sub(
- r"(?i)[$£]\d+(?:\.\d+)?(?: hundred| thousand| (?:[bm]|tr)illion)*\b|[$£]\d+\.\d\d?\b",
+ r"(?i)(-?)([$£])(\d+(?:\.\d+)?)((?: hundred| thousand| (?:[bm]|tr|quadr)illion)*)\b",
handle_money,
text,
)
diff --git a/api/src/services/text_processing/text_processor.py b/api/src/services/text_processing/text_processor.py
index 0d8d36c..0bd4658 100644
--- a/api/src/services/text_processing/text_processor.py
+++ b/api/src/services/text_processing/text_processor.py
@@ -134,6 +134,7 @@ async def smart_split(
# Normalize text
if settings.advanced_text_normalization and normalization_options.normalize:
+ print(lang_code)
if lang_code in ["a","b","en-us","en-gb"]:
text = CUSTOM_PHONEMES.sub(lambda s: handle_custom_phonemes(s, custom_phoneme_list), text)
text=normalize_text(text,normalization_options)
diff --git a/api/src/services/tts_service.py b/api/src/services/tts_service.py
index f740a29..8a6bb42 100644
--- a/api/src/services/tts_service.py
+++ b/api/src/services/tts_service.py
@@ -258,7 +258,7 @@ class TTSService:
logger.info(f"Using lang_code '{pipeline_lang_code}' for voice '{voice_name}' in audio stream")
# Process text in chunks with smart splitting
- async for chunk_text, tokens in smart_split(text, lang_code=lang_code, normalization_options=normalization_options):
+ async for chunk_text, tokens in smart_split(text, lang_code=pipeline_lang_code, normalization_options=normalization_options):
try:
# Process audio for chunk
async for chunk_data in self._process_chunk(
diff --git a/api/tests/test_kokoro_v1.py b/api/tests/test_kokoro_v1.py
index 850ed05..29d83c5 100644
--- a/api/tests/test_kokoro_v1.py
+++ b/api/tests/test_kokoro_v1.py
@@ -23,19 +23,18 @@ def test_initial_state(kokoro_backend):
@patch("torch.cuda.is_available", return_value=True)
-@patch("torch.cuda.memory_allocated")
+@patch("torch.cuda.memory_allocated", return_value=5e9)
def test_memory_management(mock_memory, mock_cuda, kokoro_backend):
"""Test GPU memory management functions."""
- # Mock GPU memory usage
- mock_memory.return_value = 5e9 # 5GB
+ # Patch backend so it thinks we have cuda
+ with patch.object(kokoro_backend, "_device", "cuda"):
+ # Test memory check
+ with patch("api.src.inference.kokoro_v1.model_config") as mock_config:
+ mock_config.pytorch_gpu.memory_threshold = 4
+ assert kokoro_backend._check_memory() == True
- # Test memory check
- with patch("api.src.inference.kokoro_v1.model_config") as mock_config:
- mock_config.pytorch_gpu.memory_threshold = 4
- assert kokoro_backend._check_memory() == True
-
- mock_config.pytorch_gpu.memory_threshold = 6
- assert kokoro_backend._check_memory() == False
+ mock_config.pytorch_gpu.memory_threshold = 6
+ assert kokoro_backend._check_memory() == False
@patch("torch.cuda.empty_cache")
diff --git a/api/tests/test_normalizer.py b/api/tests/test_normalizer.py
index 0aa963e..6dd4342 100644
--- a/api/tests/test_normalizer.py
+++ b/api/tests/test_normalizer.py
@@ -83,7 +83,12 @@ def test_url_email_addresses():
== "Send to test dot user at site dot com"
)
-
+def test_money():
+ """Test that money text is normalized correctly"""
+ assert normalize_text("He lost $5.3 thousand.",normalization_options=NormalizationOptions()) == "He lost five point three thousand dollars."
+ assert normalize_text("To put it weirdly -$6.9 million",normalization_options=NormalizationOptions()) == "To put it weirdly minus six point nine million dollars"
+ assert normalize_text("It costs $50.3.",normalization_options=NormalizationOptions()) == "It costs fifty dollars and thirty cents."
+
def test_non_url_text():
"""Test that non-URL text is unaffected"""
assert normalize_text("This is not.a.url text",normalization_options=NormalizationOptions()) == "This is not-a-url text"
diff --git a/api/tests/test_text_processor.py b/api/tests/test_text_processor.py
index 3d844b1..7e5fb0f 100644
--- a/api/tests/test_text_processor.py
+++ b/api/tests/test_text_processor.py
@@ -34,7 +34,7 @@ def test_process_text_chunk_phonemes():
def test_get_sentence_info():
"""Test sentence splitting and info extraction."""
text = "This is sentence one. This is sentence two! What about three?"
- results = get_sentence_info(text)
+ results = get_sentence_info(text, {})
assert len(results) == 3
for sentence, tokens, count in results:
@@ -44,6 +44,19 @@ def test_get_sentence_info():
assert count == len(tokens)
assert count > 0
+def test_get_sentence_info_phenomoes():
+ """Test sentence splitting and info extraction."""
+ text = "This is sentence one. This is |custom_phonemes_0|/> two! What about three?"
+ results = get_sentence_info(text, {"|custom_phonemes_0|/>": r"sˈɛntᵊns"})
+
+ assert len(results) == 3
+ assert "sˈɛntᵊns" in results[1][0]
+ for sentence, tokens, count in results:
+ assert isinstance(sentence, str)
+ assert isinstance(tokens, list)
+ assert isinstance(count, int)
+ assert count == len(tokens)
+ assert count > 0
@pytest.mark.asyncio
async def test_smart_split_short_text():
diff --git a/Test Threads.py b/dev/Test Threads.py
similarity index 100%
rename from Test Threads.py
rename to dev/Test Threads.py
diff --git a/Test copy.py b/dev/Test copy.py
similarity index 100%
rename from Test copy.py
rename to dev/Test copy.py
diff --git a/dev/Test money.py b/dev/Test money.py
new file mode 100644
index 0000000..4956070
--- /dev/null
+++ b/dev/Test money.py
@@ -0,0 +1,26 @@
+import requests
+import base64
+import json
+
+text="""the administration has offered up a platter of repression for more than a year and is still slated to lose $400 million.
+
+Columbia is the largest private landowner in New York City and boasts an endowment of $14.8 billion;"""
+
+
+Type="wav"
+
+response = requests.post(
+ "http://localhost:8880/v1/audio/speech",
+ json={
+ "model": "kokoro",
+ "input": text,
+ "voice": "af_heart+af_sky",
+ "speed": 1.0,
+ "response_format": Type,
+ "stream": False,
+ },
+ stream=True
+)
+
+with open(f"outputnostreammoney.{Type}", "wb") as f:
+ f.write(response.content)
diff --git a/dev/Test num.py b/dev/Test num.py
new file mode 100644
index 0000000..15bd4d7
--- /dev/null
+++ b/dev/Test num.py
@@ -0,0 +1,45 @@
+from text_to_num import text2num
+import re
+import inflect
+from torch import mul
+
+INFLECT_ENGINE = inflect.engine()
+
+
+def conditional_int(number: float, threshold: float = 0.00001):
+ if abs(round(number) - number) < threshold:
+ return int(round(number))
+ return number
+
+def handle_money(m: re.Match[str]) -> str:
+ """Convert money expressions to spoken form"""
+
+ bill = "dollar" if m.group(2) == "$" else "pound"
+ coin = "cent" if m.group(2) == "$" else "pence"
+ number = m.group(3)
+
+ multiplier = m.group(4)
+ try:
+ number = float(number)
+ except:
+ return m.group()
+
+ if m.group(1) == "-":
+ number *= -1
+
+ if number % 1 == 0 or multiplier != "":
+ text_number = f"{INFLECT_ENGINE.number_to_words(conditional_int(number))}{multiplier} {INFLECT_ENGINE.plural(bill, count=number)}"
+ else:
+ sub_number = int(str(number).split(".")[-1].ljust(2, "0"))
+
+ text_number = f"{INFLECT_ENGINE.number_to_words(int(round(number)))} {INFLECT_ENGINE.plural(bill, count=number)} and {INFLECT_ENGINE.number_to_words(sub_number)} {INFLECT_ENGINE.plural(coin, count=sub_number)}"
+
+ return text_number
+
+
+text = re.sub(
+ r"(?i)(-?)([$£])(\d+(?:\.\d+)?)((?: hundred| thousand| (?:[bm]|tr|quadr)illion)*)\b",
+ handle_money,
+ "he administration has offered up a platter of repression for more than a year and is still slated to lose -$5.3 billion",
+)
+print(text)
diff --git a/Test.py b/dev/Test.py
similarity index 100%
rename from Test.py
rename to dev/Test.py
diff --git a/pyproject.toml b/pyproject.toml
index 3b9e486..5e6bd9c 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -38,6 +38,7 @@ dependencies = [
"inflect>=7.5.0",
"phonemizer-fork>=3.3.2",
"av>=14.2.0",
+ "text2num>=2.5.1",
]
[project.optional-dependencies]