diff --git a/README.md b/README.md
index 04a235c..9b6d3f7 100644
--- a/README.md
+++ b/README.md
@@ -243,7 +243,7 @@ response = requests.post(
- wav
- opus
- flac
-- aac
+- m4a
- pcm
@@ -348,7 +348,7 @@ Key Performance Metrics:
GPU Vs. CPU
```bash
-# GPU: Requires NVIDIA GPU with CUDA 12.1 support (~35x-100x realtime speed)
+# GPU: Requires NVIDIA GPU with CUDA 12.8 support (~35x-100x realtime speed)
cd docker/gpu
docker compose up --build
@@ -373,9 +373,10 @@ The model is capable of processing up to a 510 phonemized token chunk at a time,
Timestamped Captions & Phonemes
-Generate audio with word-level timestamps:
+Generate audio with word-level timestamps without streaming:
```python
import requests
+import base64
import json
response = requests.post(
@@ -385,19 +386,58 @@ response = requests.post(
"input": "Hello world!",
"voice": "af_bella",
"speed": 1.0,
- "response_format": "wav"
- }
+ "response_format": "mp3",
+ "stream": False,
+ },
+ stream=False
)
-# Get timestamps from header
-timestamps = json.loads(response.headers['X-Word-Timestamps'])
-print("Word-level timestamps:")
-for ts in timestamps:
- print(f"{ts['word']}: {ts['start_time']:.3f}s - {ts['end_time']:.3f}s")
+with open("output.mp3","wb") as f:
-# Save audio
-with open("output.wav", "wb") as f:
- f.write(response.content)
+    audio_json = json.loads(response.content)
+
+    # Decode the base64-encoded audio into raw bytes
+    audio_bytes = base64.b64decode(audio_json["audio"].encode("utf-8"))
+
+    # Write the decoded audio to disk
+    f.write(audio_bytes)
+
+    # Print the word-level timestamps
+    print(audio_json["timestamps"])
+```
+
+Generate audio with word-level timestamps with streaming:
+```python
+import requests
+import base64
+import json
+
+response = requests.post(
+ "http://localhost:8880/dev/captioned_speech",
+ json={
+ "model": "kokoro",
+ "input": "Hello world!",
+ "voice": "af_bella",
+ "speed": 1.0,
+ "response_format": "mp3",
+ "stream": True,
+ },
+ stream=True
+)
+
+with open("output.mp3", "wb") as f:
+    for chunk in response.iter_lines(decode_unicode=True):
+        if chunk:
+            chunk_json = json.loads(chunk)
+
+            # Decode the base64-encoded audio chunk into raw bytes
+            chunk_audio = base64.b64decode(chunk_json["audio"].encode("utf-8"))
+
+            # Append the decoded chunk to the output file
+            f.write(chunk_audio)
+
+            # Print this chunk's word-level timestamps
+            print(chunk_json["timestamps"])
```
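+
+Because the timestamps are reported in seconds, they can be used to post-process the saved audio. The snippet below is a rough sketch (it assumes `pydub` and `ffmpeg` are installed, and that the non-streaming example above has already written `output.mp3` and left the decoded response in `audio_json`):
+```python
+import pydub
+
+# Load the audio produced by the non-streaming example
+audio = pydub.AudioSegment.from_file("output.mp3", format="mp3")
+
+# Each timestamp entry has "word", "start_time" and "end_time" (in seconds)
+first_word = audio_json["timestamps"][0]
+
+# pydub slices in milliseconds
+clip = audio[first_word["start_time"] * 1000 : first_word["end_time"] * 1000]
+clip.export("output_cut.mp3", format="mp3")
+```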
diff --git a/Test copy.py b/Test copy.py
new file mode 100644
index 0000000..4ecbc5e
--- /dev/null
+++ b/Test copy.py
@@ -0,0 +1,88 @@
+import requests
+import base64
+import json
+import pydub
+text="""Delving into the Abyss: A Deeper Exploration of Meaning in 5 Seconds of Summer's "Jet Black Heart"
+
+5 Seconds of Summer, initially perceived as purveyors of upbeat, radio-friendly pop-punk, embarked on a significant artistic evolution with their album Sounds Good Feels Good. Among its tracks, "Jet Black Heart" stands out as a powerful testament to this shift, moving beyond catchy melodies and embracing a darker, more emotionally complex sound. Released in 2015, the song transcends the typical themes of youthful exuberance and romantic angst, instead plunging into the depths of personal turmoil and the corrosive effects of inner darkness on interpersonal relationships. "Jet Black Heart" is not merely a song about heartbreak; it is a raw and vulnerable exploration of internal struggle, self-destructive patterns, and the precarious flicker of hope that persists even in the face of profound emotional chaos. Through potent metaphors, starkly honest lyrics, and a sonic landscape that mirrors its thematic weight, the song offers a profound meditation on the human condition, grappling with the shadows that reside within us all and their far-reaching consequences.
+
+The very title, "Jet Black Heart," immediately establishes the song's central motif: an intrinsic darkness residing within the narrator's emotional core. The phrase "jet black" is not simply a descriptor of color; it evokes a sense of absolute darkness, a void devoid of light, and a profound absence of hope. This is not a heart merely bruised by external circumstances, but one fundamentally shaded by internal struggles, suggesting a chronic condition of emotional pain. The opening lines, "Everybody's got their demons, even wide awake or dreaming," acknowledge the universality of inner conflict, a shared human experience of battling internal anxieties and insecurities. However, the designation of a "jet black heart" elevates this struggle to a more profound and potentially entrenched level. It suggests a darkness that is not fleeting or situational, but rather a deeply ingrained aspect of the narrator's being, casting a long shadow over their life and relationships. This internal darkness is further amplified by the subsequent metaphor, "there's a hurricane underneath it." The imagery of a hurricane is intensely evocative, conjuring images of destructive force, uncontrollable chaos, and overwhelming power. This "hurricane" represents the tumultuous emotions and internal disorder raging beneath the surface of the narrator’s composed exterior. It is a maelstrom of pain, anxiety, and self-doubt that threatens to erupt and engulf everything in its path. Crucially, this internal hurricane is not merely passive suffering; it is actively "trying to keep us apart," revealing the insidious way in which these inner demons sabotage connections and erect formidable barriers to genuine intimacy and meaningful relationships.
+
+Expanding on this internal struggle, "Jet Black Heart" delves into the narrator's self-destructive patterns, particularly within the realm of romantic relationships. The lyrics "See a war, I wanna fight it, See a match, I wanna strike it" paint a stark picture of a deeply ingrained tendency towards conflict and destruction. This is not simply a reactive response to external aggression, but rather an active seeking out of discord, a subconscious drive to ignite conflict even in peaceful situations. This behavior can be interpreted as a manifestation of their inner turmoil, a projection of their internal chaos onto their external world. Perhaps the narrator, accustomed to internal strife, unconsciously recreates this turbulence in their relationships, finding a perverse sense of familiarity or even control within the chaos. This destructive impulse is further emphasized by the line "Every fire I've ignited faded to gray." The imagery of fire, initially representing passion, intensity, or perhaps even anger, ultimately devolving into "gray" underscores a recurring cycle of destructive behavior that culminates in emptiness and disappointment. The color gray, often associated with neutrality, lifelessness, and a lack of vibrancy, perfectly encapsulates the emotional aftermath of these self-inflicted relational fires. The initial spark of connection or excitement is inevitably extinguished, leaving behind a landscape of emotional flatness and a profound sense of failure in sustaining meaningful bonds. Further solidifying this theme of self-sabotage is the powerful phrase "I write with a poison pen." This metaphor extends beyond mere hurtful words, encompassing actions, behaviors, and the narrator's overall negative influence on their relationships. The "poison pen" suggests a deliberate, albeit perhaps unconscious, act of inflicting harm, highlighting the narrator's painful awareness of their own damaging tendencies and their capacity to erode the very connections they seemingly desire.
+
+However, amidst this pervasive darkness and self-destructive cycle, "Jet Black Heart" subtly introduces a fragile glimmer of hope, a faint light flickering in the abyss. The pivotal moment of vulnerability and potential transformation arrives with the plaintive plea, "But now that I'm broken, now that you're knowing, caught up in a moment, can you see inside?" This is a desperate and profoundly vulnerable call for understanding, a raw and unfiltered exposure of the "jet black heart" after reaching a critical breaking point. The narrator, stripped bare by the weight of their own struggles and the consequences of their self-destructive behavior, finally seeks empathy and genuine connection. The admission of being "broken" is not a declaration of defeat, but rather a necessary precursor to potential healing. It is in this state of vulnerability, in the raw aftermath of emotional collapse, that the narrator dares to ask, "Can you see inside?" This question is laden with yearning, a desperate hope that someone, perhaps a partner in the strained relationship, can perceive beyond the surface darkness and recognize the wounded humanity beneath the "jet black heart." It is a plea for acceptance, not despite the darkness, but perhaps even because of it, a hope that vulnerability will be met not with judgment or rejection, but with compassion and understanding. Despite the acknowledgement of their "poison pen" and destructive tendencies, the narrator also recognizes a paradoxical source of potential redemption within the very relationship that is strained by their inner darkness: "these chemicals moving between us are the reason to start again." The ambiguous term "chemicals" can be interpreted on multiple levels. It could symbolize the complex and often volatile dynamics of human connection, the unpredictable and sometimes turbulent interplay of emotions and personalities in a relationship. Alternatively, "chemicals" might allude to a more literal, perhaps even neurochemical, imbalance within the narrator, suggesting that the very forces driving their darkness might also hold the key to transformation. Crucially, the phrase "reason to start again" emphasizes the potential for renewal and redemption, not a guaranteed outcome. It is a tentative step towards hope, acknowledging that the path forward will be fraught with challenges, but that the possibility of healing and rebuilding remains, however fragile.
+
+The concluding verses of "Jet Black Heart" further solidify this nascent theme of potential transformation and tentative redemption. "The blood in my veins is made up of mistakes" is a powerful and profoundly honest admission of past errors and a crucial acceptance of human imperfection. This acknowledgement of fallibility is essential for personal growth and relational healing. By owning their mistakes, the narrator begins to dismantle the cycle of self-blame and self-destruction, paving the way for a more compassionate and forgiving self-perception. The subsequent lines, "let's forget who we are and dive into the dark, as we burst into color, returning to life," present a radical and transformative vision of shared vulnerability and mutual healing. The call to "forget who we are" is not an invitation to erase individual identity, but rather a suggestion to shed the constructed personas, ego-driven defenses, and pre-conceived notions that often hinder genuine connection. It is about stripping away the masks and embracing a state of raw, unfiltered vulnerability. The imperative to "dive into the dark" is perhaps the most challenging and transformative element of the song. It is a call to confront the pain, to face the demons, and to embrace the shared vulnerability that lies at the heart of genuine intimacy. This shared descent into darkness is not an act of succumbing to despair, but rather a courageous journey towards healing, suggesting that true connection and growth can only emerge from acknowledging and confronting the deepest, most painful aspects of ourselves and each other. The subsequent image of "bursting into color, returning to life" provides a powerful counterpoint to the prevailing darkness, symbolizing transformation, healing, and a vibrant renewal of life and connection. "Bursting into color" evokes a sense of vibrancy, joy, and emotional richness that stands in stark contrast to the "jet black" and "gray" imagery prevalent throughout the song. This suggests that by confronting and embracing the darkness, there is a possibility of emerging transformed, experiencing a rebirth and a renewed sense of purpose and joy in life. "Returning to life" further reinforces this idea of resurrection and revitalization, implying that the journey through darkness is not an end in itself, but rather a necessary passage towards a fuller, more authentic, and more vibrant existence.
+
+Beyond the lyrical content, the musical elements of "Jet Black Heart" contribute significantly to its overall meaning and emotional impact. Compared to 5 Seconds of Summer's earlier, more upbeat work, "Jet Black Heart" adopts a heavier, more brooding sonic landscape. The driving rhythm, the prominent bassline, and the raw, emotive vocal delivery all mirror the thematic weight of the lyrics, creating an atmosphere of intense emotionality and vulnerability. The song's structure, building from a quiet, introspective beginning to a powerful, anthemic chorus, reflects the narrator's journey from internal struggle to a desperate plea for connection and ultimately a tentative hope for transformation.
+
+In conclusion, "Jet Black Heart" by 5 Seconds of Summer is far more than a typical pop song; it is a poignant and deeply resonant exploration of inner darkness, self-destructive tendencies, and the fragile yet persistent hope for human connection and redemption. Through its powerful central metaphor of the "jet black heart," its unflinching portrayal of internal turmoil, and its subtle yet potent message of vulnerability and potential transformation, the song resonates with anyone who has grappled with their own inner demons and the complexities of human relationships. It is a reminder that even in the deepest darkness, a flicker of hope can endure, and that true healing and connection often emerge from the courageous act of confronting and sharing our most vulnerable selves. "Jet Black Heart" stands as a testament to 5 Seconds of Summer's artistic growth, showcasing their capacity to delve into profound emotional territories and create music that is not only catchy and engaging but also deeply meaningful and emotionally resonant, solidifying their position as a band capable of capturing the complexities of the human experience."""
+
+"""Delving into the Abyss: A Deeper Exploration of Meaning in 5 Seconds of Summer's "Jet Black Heart"
+
+5 Seconds of Summer, initially perceived as purveyors of upbeat, radio-friendly pop-punk, embarked on a significant artistic evolution with their album Sounds Good Feels Good. Among its tracks, "Jet Black Heart" stands out as a powerful testament to this shift, moving beyond catchy melodies and embracing a darker, more emotionally complex sound. Released in 2015, the song transcends the typical themes of youthful exuberance and romantic angst, instead plunging into the depths of personal turmoil and the corrosive effects of inner darkness on interpersonal relationships. "Jet Black Heart" is not merely a song about heartbreak; it is a raw and vulnerable exploration of internal struggle, self-destructive patterns, and the precarious flicker of hope that persists even in the face of profound emotional chaos."""
+
+
+Type="mp3"
+response = requests.post(
+ "http://localhost:8880/dev/captioned_speech",
+ json={
+ "model": "kokoro",
+ "input": text,
+ "voice": "af_heart+af_sky",
+ "speed": 1.0,
+ "response_format": Type,
+ "stream": True,
+ },
+ stream=True
+)
+
+f=open(f"outputstream.{Type}","wb")
+for chunk in response.iter_lines(decode_unicode=True):
+ if chunk:
+ temp_json=json.loads(chunk)
+ if temp_json["timestamps"] != []:
+ chunk_json=temp_json
+
+ # Decode base 64 stream to bytes
+ chunk_audio=base64.b64decode(temp_json["audio"].encode("utf-8"))
+
+ # Process streaming chunks
+ f.write(chunk_audio)
+
+f.close()
+
+# Use the timestamps from the last chunk that carried any to pick the third-to-last word
+last3=chunk_json["timestamps"][-3]
+
+print(f"CUTTING TO {last3['word']}")
+
+audioseg=pydub.AudioSegment.from_file(f"outputstream.{Type}",format=Type)
+audioseg=audioseg[last3["start_time"]*1000:last3["end_time"] * 1000]
+audioseg.export(f"outputstreamcut.{Type}",format=Type)
+
+
+"""
+response = requests.post(
+ "http://localhost:8880/dev/captioned_speech",
+ json={
+ "model": "kokoro",
+ "input": text,
+ "voice": "af_heart+af_sky",
+ "speed": 1.0,
+ "response_format": Type,
+ "stream": False,
+ },
+ stream=True
+)
+
+with open(f"outputnostream.{Type}", "wb") as f:
+ audio_json=json.loads(response.content)
+
+ # Decode base 64 stream to bytes
+ chunk_audio=base64.b64decode(audio_json["audio"].encode("utf-8"))
+
+ # Process streaming chunks
+ f.write(chunk_audio)
+
+ # Print word level timestamps
+ print(audio_json["timestamps"])
+"""
\ No newline at end of file
diff --git a/Test.py b/Test.py
new file mode 100644
index 0000000..b1db365
--- /dev/null
+++ b/Test.py
@@ -0,0 +1,63 @@
+import requests
+import base64
+import json
+
+text="""Delving into the Abyss: A Deeper Exploration of Meaning in 5 Seconds of Summer's "Jet Black Heart"
+
+5 Seconds of Summer, initially perceived as purveyors of upbeat, radio-friendly pop-punk, embarked on a significant artistic evolution with their album Sounds Good Feels Good. Among its tracks, "Jet Black Heart" stands out as a powerful testament to this shift, moving beyond catchy melodies and embracing a darker, more emotionally complex sound. Released in 2015, the song transcends the typical themes of youthful exuberance and romantic angst, instead plunging into the depths of personal turmoil and the corrosive effects of inner darkness on interpersonal relationships. "Jet Black Heart" is not merely a song about heartbreak; it is a raw and vulnerable exploration of internal struggle, self-destructive patterns, and the precarious flicker of hope that persists even in the face of profound emotional chaos. Through potent metaphors, starkly honest lyrics, and a sonic landscape that mirrors its thematic weight, the song offers a profound meditation on the human condition, grappling with the shadows that reside within us all and their far-reaching consequences.
+
+The very title, "Jet Black Heart," immediately establishes the song's central motif: an intrinsic darkness residing within the narrator's emotional core. The phrase "jet black" is not simply a descriptor of color; it evokes a sense of absolute darkness, a void devoid of light, and a profound absence of hope. This is not a heart merely bruised by external circumstances, but one fundamentally shaded by internal struggles, suggesting a chronic condition of emotional pain. The opening lines, "Everybody's got their demons, even wide awake or dreaming," acknowledge the universality of inner conflict, a shared human experience of battling internal anxieties and insecurities. However, the designation of a "jet black heart" elevates this struggle to a more profound and potentially entrenched level. It suggests a darkness that is not fleeting or situational, but rather a deeply ingrained aspect of the narrator's being, casting a long shadow over their life and relationships. This internal darkness is further amplified by the subsequent metaphor, "there's a hurricane underneath it." The imagery of a hurricane is intensely evocative, conjuring images of destructive force, uncontrollable chaos, and overwhelming power. This "hurricane" represents the tumultuous emotions and internal disorder raging beneath the surface of the narrator’s composed exterior. It is a maelstrom of pain, anxiety, and self-doubt that threatens to erupt and engulf everything in its path. Crucially, this internal hurricane is not merely passive suffering; it is actively "trying to keep us apart," revealing the insidious way in which these inner demons sabotage connections and erect formidable barriers to genuine intimacy and meaningful relationships.
+
+Expanding on this internal struggle, "Jet Black Heart" delves into the narrator's self-destructive patterns, particularly within the realm of romantic relationships. The lyrics "See a war, I wanna fight it, See a match, I wanna strike it" paint a stark picture of a deeply ingrained tendency towards conflict and destruction. This is not simply a reactive response to external aggression, but rather an active seeking out of discord, a subconscious drive to ignite conflict even in peaceful situations. This behavior can be interpreted as a manifestation of their inner turmoil, a projection of their internal chaos onto their external world. Perhaps the narrator, accustomed to internal strife, unconsciously recreates this turbulence in their relationships, finding a perverse sense of familiarity or even control within the chaos. This destructive impulse is further emphasized by the line "Every fire I've ignited faded to gray." The imagery of fire, initially representing passion, intensity, or perhaps even anger, ultimately devolving into "gray" underscores a recurring cycle of destructive behavior that culminates in emptiness and disappointment. The color gray, often associated with neutrality, lifelessness, and a lack of vibrancy, perfectly encapsulates the emotional aftermath of these self-inflicted relational fires. The initial spark of connection or excitement is inevitably extinguished, leaving behind a landscape of emotional flatness and a profound sense of failure in sustaining meaningful bonds. Further solidifying this theme of self-sabotage is the powerful phrase "I write with a poison pen." This metaphor extends beyond mere hurtful words, encompassing actions, behaviors, and the narrator's overall negative influence on their relationships. The "poison pen" suggests a deliberate, albeit perhaps unconscious, act of inflicting harm, highlighting the narrator's painful awareness of their own damaging tendencies and their capacity to erode the very connections they seemingly desire.
+
+However, amidst this pervasive darkness and self-destructive cycle, "Jet Black Heart" subtly introduces a fragile glimmer of hope, a faint light flickering in the abyss. The pivotal moment of vulnerability and potential transformation arrives with the plaintive plea, "But now that I'm broken, now that you're knowing, caught up in a moment, can you see inside?" This is a desperate and profoundly vulnerable call for understanding, a raw and unfiltered exposure of the "jet black heart" after reaching a critical breaking point. The narrator, stripped bare by the weight of their own struggles and the consequences of their self-destructive behavior, finally seeks empathy and genuine connection. The admission of being "broken" is not a declaration of defeat, but rather a necessary precursor to potential healing. It is in this state of vulnerability, in the raw aftermath of emotional collapse, that the narrator dares to ask, "Can you see inside?" This question is laden with yearning, a desperate hope that someone, perhaps a partner in the strained relationship, can perceive beyond the surface darkness and recognize the wounded humanity beneath the "jet black heart." It is a plea for acceptance, not despite the darkness, but perhaps even because of it, a hope that vulnerability will be met not with judgment or rejection, but with compassion and understanding. Despite the acknowledgement of their "poison pen" and destructive tendencies, the narrator also recognizes a paradoxical source of potential redemption within the very relationship that is strained by their inner darkness: "these chemicals moving between us are the reason to start again." The ambiguous term "chemicals" can be interpreted on multiple levels. It could symbolize the complex and often volatile dynamics of human connection, the unpredictable and sometimes turbulent interplay of emotions and personalities in a relationship. Alternatively, "chemicals" might allude to a more literal, perhaps even neurochemical, imbalance within the narrator, suggesting that the very forces driving their darkness might also hold the key to transformation. Crucially, the phrase "reason to start again" emphasizes the potential for renewal and redemption, not a guaranteed outcome. It is a tentative step towards hope, acknowledging that the path forward will be fraught with challenges, but that the possibility of healing and rebuilding remains, however fragile.
+
+The concluding verses of "Jet Black Heart" further solidify this nascent theme of potential transformation and tentative redemption. "The blood in my veins is made up of mistakes" is a powerful and profoundly honest admission of past errors and a crucial acceptance of human imperfection. This acknowledgement of fallibility is essential for personal growth and relational healing. By owning their mistakes, the narrator begins to dismantle the cycle of self-blame and self-destruction, paving the way for a more compassionate and forgiving self-perception. The subsequent lines, "let's forget who we are and dive into the dark, as we burst into color, returning to life," present a radical and transformative vision of shared vulnerability and mutual healing. The call to "forget who we are" is not an invitation to erase individual identity, but rather a suggestion to shed the constructed personas, ego-driven defenses, and pre-conceived notions that often hinder genuine connection. It is about stripping away the masks and embracing a state of raw, unfiltered vulnerability. The imperative to "dive into the dark" is perhaps the most challenging and transformative element of the song. It is a call to confront the pain, to face the demons, and to embrace the shared vulnerability that lies at the heart of genuine intimacy. This shared descent into darkness is not an act of succumbing to despair, but rather a courageous journey towards healing, suggesting that true connection and growth can only emerge from acknowledging and confronting the deepest, most painful aspects of ourselves and each other. The subsequent image of "bursting into color, returning to life" provides a powerful counterpoint to the prevailing darkness, symbolizing transformation, healing, and a vibrant renewal of life and connection. "Bursting into color" evokes a sense of vibrancy, joy, and emotional richness that stands in stark contrast to the "jet black" and "gray" imagery prevalent throughout the song. This suggests that by confronting and embracing the darkness, there is a possibility of emerging transformed, experiencing a rebirth and a renewed sense of purpose and joy in life. "Returning to life" further reinforces this idea of resurrection and revitalization, implying that the journey through darkness is not an end in itself, but rather a necessary passage towards a fuller, more authentic, and more vibrant existence.
+
+Beyond the lyrical content, the musical elements of "Jet Black Heart" contribute significantly to its overall meaning and emotional impact. Compared to 5 Seconds of Summer's earlier, more upbeat work, "Jet Black Heart" adopts a heavier, more brooding sonic landscape. The driving rhythm, the prominent bassline, and the raw, emotive vocal delivery all mirror the thematic weight of the lyrics, creating an atmosphere of intense emotionality and vulnerability. The song's structure, building from a quiet, introspective beginning to a powerful, anthemic chorus, reflects the narrator's journey from internal struggle to a desperate plea for connection and ultimately a tentative hope for transformation.
+
+In conclusion, "Jet Black Heart" by 5 Seconds of Summer is far more than a typical pop song; it is a poignant and deeply resonant exploration of inner darkness, self-destructive tendencies, and the fragile yet persistent hope for human connection and redemption. Through its powerful central metaphor of the "jet black heart," its unflinching portrayal of internal turmoil, and its subtle yet potent message of vulnerability and potential transformation, the song resonates with anyone who has grappled with their own inner demons and the complexities of human relationships. It is a reminder that even in the deepest darkness, a flicker of hope can endure, and that true healing and connection often emerge from the courageous act of confronting and sharing our most vulnerable selves. "Jet Black Heart" stands as a testament to 5 Seconds of Summer's artistic growth, showcasing their capacity to delve into profound emotional territories and create music that is not only catchy and engaging but also deeply meaningful and emotionally resonant, solidifying their position as a band capable of capturing the complexities of the human experience."""
+
+text="""Delving into the Abyss: A Deeper Exploration of Meaning in 5 Seconds of Summer's "Jet Black Heart"
+
+5 Seconds of Summer, initially perceived as purveyors of upbeat, radio-friendly pop-punk, embarked on a significant artistic evolution with their album Sounds Good Feels Good. Among its tracks, "Jet Black Heart" stands out as a powerful testament to this shift, moving beyond catchy melodies and embracing a darker, more emotionally complex sound. Released in 2015, the song transcends the typical themes of youthful exuberance and romantic angst, instead plunging into the depths of personal turmoil and the corrosive effects of inner darkness on interpersonal relationships. "Jet Black Heart" is not merely a song about heartbreak; it is a raw and vulnerable exploration of internal struggle, self-destructive patterns, and the precarious flicker of hope that persists even in the face of profound emotional chaos."""
+
+
+Type="wav"
+
+
+response = requests.post(
+ "http://localhost:8880/v1/audio/speech",
+ json={
+ "model": "kokoro",
+ "input": text,
+ "voice": "af_heart+af_sky",
+ "speed": 1.0,
+ "response_format": Type,
+ "stream": True,
+ },
+ stream=True
+)
+
+
+f=open(f"outputstream.{Type}","wb")
+for chunk in response.iter_content():
+ if chunk:
+ # Process streaming chunks
+ f.write(chunk)
+
+response = requests.post(
+ "http://localhost:8880/v1/audio/speech",
+ json={
+ "model": "kokoro",
+ "input": text,
+ "voice": "af_heart+af_sky",
+ "speed": 1.0,
+ "response_format": Type,
+ "stream": False,
+ },
+ stream=True
+)
+
+with open(f"outputnostream.{Type}", "wb") as f:
+ f.write(response.content)
diff --git a/api/src/inference/base.py b/api/src/inference/base.py
index 5629bd7..6b59fa9 100644
--- a/api/src/inference/base.py
+++ b/api/src/inference/base.py
@@ -1,12 +1,34 @@
"""Base interface for Kokoro inference."""
from abc import ABC, abstractmethod
-from typing import AsyncGenerator, Optional, Tuple, Union
+from typing import AsyncGenerator, Optional, Tuple, Union, List
import numpy as np
import torch
-
+class AudioChunk:
+ """Class for audio chunks returned by model backends"""
+
+    def __init__(
+        self,
+        audio: np.ndarray,
+        word_timestamps: Optional[List] = None,
+        output: Optional[Union[bytes, np.ndarray]] = b"",
+    ):
+        self.audio = audio
+        # Use a fresh list per instance rather than a shared mutable default argument
+        self.word_timestamps = word_timestamps if word_timestamps is not None else []
+        self.output = output
+
+ @staticmethod
+ def combine(audio_chunk_list: List):
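+        """Concatenate a list of AudioChunk objects into one chunk, merging audio arrays and word timestamps."""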
+ output=AudioChunk(audio_chunk_list[0].audio,audio_chunk_list[0].word_timestamps)
+
+ for audio_chunk in audio_chunk_list[1:]:
+ output.audio=np.concatenate((output.audio,audio_chunk.audio),dtype=np.int16)
+ if output.word_timestamps is not None:
+ output.word_timestamps+=audio_chunk.word_timestamps
+
+ return output
+
class ModelBackend(ABC):
"""Abstract base class for model inference backend."""
@@ -28,7 +50,7 @@ class ModelBackend(ABC):
text: str,
voice: Union[str, Tuple[str, Union[torch.Tensor, str]]],
speed: float = 1.0,
- ) -> AsyncGenerator[np.ndarray, None]:
+ ) -> AsyncGenerator[AudioChunk, None]:
"""Generate audio from text.
Args:
diff --git a/api/src/inference/kokoro_v1.py b/api/src/inference/kokoro_v1.py
index 99e76fa..419ade7 100644
--- a/api/src/inference/kokoro_v1.py
+++ b/api/src/inference/kokoro_v1.py
@@ -12,8 +12,8 @@ from ..core import paths
from ..core.config import settings
from ..core.model_config import model_config
from .base import BaseModelBackend
-
-
+from .base import AudioChunk
+from ..structures.schemas import WordTimestamp
class KokoroV1(BaseModelBackend):
"""Kokoro backend with controlled resource management."""
@@ -181,7 +181,8 @@ class KokoroV1(BaseModelBackend):
voice: Union[str, Tuple[str, Union[torch.Tensor, str]]],
speed: float = 1.0,
lang_code: Optional[str] = None,
- ) -> AsyncGenerator[np.ndarray, None]:
+ return_timestamps: Optional[bool] = False,
+ ) -> AsyncGenerator[AudioChunk, None]:
"""Generate audio using model.
Args:
@@ -249,7 +250,54 @@ class KokoroV1(BaseModelBackend):
):
if result.audio is not None:
logger.debug(f"Got audio chunk with shape: {result.audio.shape}")
- yield result.audio.numpy()
+ word_timestamps=None
+ if return_timestamps and hasattr(result, "tokens") and result.tokens:
+ word_timestamps=[]
+ current_offset=0.0
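+                        # current_offset stays at 0.0 here, so these timestamps are relative to the start of this chunk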
+ logger.debug(
+ f"Processing chunk timestamps with {len(result.tokens)} tokens"
+ )
+ if result.pred_dur is not None:
+ try:
+ # Join timestamps for this chunk's tokens
+ KPipeline.join_timestamps(
+ result.tokens, result.pred_dur
+ )
+
+ # Add timestamps with offset
+ for token in result.tokens:
+ if not all(
+ hasattr(token, attr)
+ for attr in [
+ "text",
+ "start_ts",
+ "end_ts",
+ ]
+ ):
+ continue
+ if not token.text or not token.text.strip():
+ continue
+
+ start_time = float(token.start_ts) + current_offset
+ end_time = float(token.end_ts) + current_offset
+ word_timestamps.append(
+ WordTimestamp(
+ word=str(token.text).strip(),
+ start_time=start_time,
+ end_time=end_time
+ )
+ )
+ logger.debug(
+ f"Added timestamp for word '{token.text}': {start_time:.3f}s - {end_time:.3f}s"
+ )
+
+ except Exception as e:
+ logger.error(
+ f"Failed to process timestamps for chunk: {e}"
+ )
+
+
+ yield AudioChunk(result.audio.numpy(),word_timestamps=word_timestamps)
else:
logger.warning("No audio in chunk")
diff --git a/api/src/routers/development.py b/api/src/routers/development.py
index 20c2206..7fbcc56 100644
--- a/api/src/routers/development.py
+++ b/api/src/routers/development.py
@@ -1,12 +1,14 @@
+import re
from typing import List, Union, AsyncGenerator, Tuple
import numpy as np
import torch
from fastapi import APIRouter, Depends, Header, HTTPException, Request, Response
-from fastapi.responses import StreamingResponse, FileResponse
+from fastapi.responses import JSONResponse, StreamingResponse, FileResponse
from kokoro import KPipeline
from loguru import logger
+from ..inference.base import AudioChunk
from ..core.config import settings
from ..services.audio import AudioNormalizer, AudioService
from ..services.streaming_audio_writer import StreamingAudioWriter
@@ -14,13 +16,16 @@ from ..services.text_processing import smart_split
from ..services.tts_service import TTSService
from ..services.temp_manager import TempFileWriter
from ..structures import CaptionedSpeechRequest, CaptionedSpeechResponse, WordTimestamp
+from ..structures.custom_responses import JSONStreamingResponse
from ..structures.text_schemas import (
GenerateFromPhonemesRequest,
PhonemeRequest,
PhonemeResponse,
)
+from .openai_compatible import process_voices, stream_audio_chunks
import json
import os
+import base64
from pathlib import Path
@@ -152,40 +157,6 @@ async def generate_from_phonemes(
},
)
-
-@router.get("/dev/timestamps/{filename}")
-async def get_timestamps(filename: str):
- """Download timestamps from temp storage"""
- try:
- from ..core.paths import _find_file
-
- # Search for file in temp directory
- file_path = await _find_file(
- filename=filename, search_paths=[settings.temp_file_dir]
- )
-
- return FileResponse(
- file_path,
- media_type="application/json",
- filename=filename,
- headers={
- "Cache-Control": "no-cache",
- "Content-Disposition": f"attachment; filename={filename}",
- },
- )
-
- except Exception as e:
- logger.error(f"Error serving timestamps file {filename}: {e}")
- raise HTTPException(
- status_code=500,
- detail={
- "error": "server_error",
- "message": "Failed to serve timestamps file",
- "type": "server_error",
- },
- )
-
-
@router.post("/dev/captioned_speech")
async def create_captioned_speech(
request: CaptionedSpeechRequest,
@@ -194,127 +165,162 @@ async def create_captioned_speech(
tts_service: TTSService = Depends(get_tts_service),
):
"""Generate audio with word-level timestamps using streaming approach"""
+
try:
+ # model_name = get_model_name(request.model)
+ tts_service = await get_tts_service()
+ voice_name = await process_voices(request.voice, tts_service)
+
# Set content type based on format
content_type = {
"mp3": "audio/mpeg",
"opus": "audio/opus",
- "aac": "audio/aac",
+ "m4a": "audio/mp4",
"flac": "audio/flac",
"wav": "audio/wav",
"pcm": "audio/pcm",
}.get(request.response_format, f"audio/{request.response_format}")
- # Create streaming audio writer and normalizer
- writer = StreamingAudioWriter(
- format=request.response_format, sample_rate=24000, channels=1
- )
- normalizer = AudioNormalizer()
+ # Check if streaming is requested (default for OpenAI client)
+ if request.stream:
+ # Create generator but don't start it yet
+ generator = stream_audio_chunks(tts_service, request, client_request)
- # Get voice path
- voice_name, voice_path = await tts_service._get_voice_path(request.voice)
+ # If download link requested, wrap generator with temp file writer
+ if request.return_download_link:
+ from ..services.temp_manager import TempFileWriter
- # Use provided lang_code or determine from voice name
- pipeline_lang_code = request.lang_code if request.lang_code else request.voice[0].lower()
- logger.info(
- f"Using lang_code '{pipeline_lang_code}' for voice '{voice_name}' in text chunking"
- )
+ temp_writer = TempFileWriter(request.response_format)
+ await temp_writer.__aenter__() # Initialize temp file
- # Get backend and pipeline
- backend = tts_service.model_manager.get_backend()
- pipeline = backend._get_pipeline(pipeline_lang_code)
+ # Get download path immediately after temp file creation
+ download_path = temp_writer.download_path
- # Create temp file writer for timestamps
- temp_writer = TempFileWriter("json")
- await temp_writer.__aenter__() # Initialize temp file
- # Get just the filename without the path
- timestamps_filename = Path(temp_writer.download_path).name
+ # Create response headers with download path
+ headers = {
+ "Content-Disposition": f"attachment; filename=speech.{request.response_format}",
+ "X-Accel-Buffering": "no",
+ "Cache-Control": "no-cache",
+ "Transfer-Encoding": "chunked",
+ "X-Download-Path": download_path,
+ }
- # Initialize variables for timestamps
- word_timestamps = []
- current_offset = 0.0
+ # Create async generator for streaming
+ async def dual_output():
+ try:
+                        # The timestamp accumulator collects word-level timestamps from chunks that carry no audio output
+                        timestamp_acumulator = []
+
+                        # Write chunks to temp file and stream
+                        async for chunk_data in generator:
+
+ if chunk_data.output: # Skip empty chunks
+ await temp_writer.write(chunk_data.output)
+ base64_chunk= base64.b64encode(chunk_data.output).decode("utf-8")
+
+                                # Merge any accumulated timestamps into this chunk's word_timestamps
+ chunk_data.word_timestamps=timestamp_acumulator + chunk_data.word_timestamps
+ timestamp_acumulator=[]
+
+ yield CaptionedSpeechResponse(audio=base64_chunk,audio_format=content_type,timestamps=chunk_data.word_timestamps)
+ else:
+ if chunk_data.word_timestamps is not None and len(chunk_data.word_timestamps) > 0:
+ timestamp_acumulator+=chunk_data.word_timestamps
+
+ # Finalize the temp file
+ await temp_writer.finalize()
+ except Exception as e:
+ logger.error(f"Error in dual output streaming: {e}")
+ await temp_writer.__aexit__(type(e), e, e.__traceback__)
+ raise
+ finally:
+ # Ensure temp writer is closed
+ if not temp_writer._finalized:
+ await temp_writer.__aexit__(None, None, None)
- async def generate_chunks():
- nonlocal current_offset, word_timestamps
- try:
- # Process text in chunks with smart splitting
- async for chunk_text, tokens in smart_split(request.input):
- # Process chunk with pipeline
- for result in pipeline(chunk_text, voice=voice_path, speed=request.speed):
- if result.audio is not None:
- # Process timestamps for this chunk
- if hasattr(result, "tokens") and result.tokens and result.pred_dur is not None:
- try:
- # Join timestamps for this chunk's tokens
- KPipeline.join_timestamps(result.tokens, result.pred_dur)
+ # Stream with temp file writing
+ return JSONStreamingResponse(
+ dual_output(), media_type="application/json", headers=headers
+ )
- # Add timestamps with offset
- for token in result.tokens:
- if not all(
- hasattr(token, attr)
- for attr in ["text", "start_ts", "end_ts"]
- ):
- continue
- if not token.text or not token.text.strip():
- continue
+ async def single_output():
+ try:
+                    # The timestamp accumulator collects word-level timestamps from chunks that carry no audio output
+ timestamp_acumulator=[]
+
+ # Stream chunks
+ async for chunk_data in generator:
+ if chunk_data.output: # Skip empty chunks
+ # Encode the chunk bytes into base 64
+ base64_chunk= base64.b64encode(chunk_data.output).decode("utf-8")
+
+                            # Merge any accumulated timestamps into this chunk's word_timestamps
+ chunk_data.word_timestamps=timestamp_acumulator + chunk_data.word_timestamps
+ timestamp_acumulator=[]
+
+ yield CaptionedSpeechResponse(audio=base64_chunk,audio_format=content_type,timestamps=chunk_data.word_timestamps)
+ else:
+ if chunk_data.word_timestamps is not None and len(chunk_data.word_timestamps) > 0:
+ timestamp_acumulator+=chunk_data.word_timestamps
+
+ except Exception as e:
+ logger.error(f"Error in single output streaming: {e}")
+ raise
- # Apply offset to timestamps
- start_time = float(token.start_ts) + current_offset
- end_time = float(token.end_ts) + current_offset
-
- word_timestamps.append(
- {
- "word": str(token.text).strip(),
- "start_time": start_time,
- "end_time": end_time,
- }
- )
-
- # Update offset for next chunk
- chunk_duration = float(result.pred_dur.sum()) / 80 # Convert frames to seconds
- current_offset = max(current_offset + chunk_duration, end_time)
-
- except Exception as e:
- logger.error(f"Failed to process timestamps for chunk: {e}")
-
- # Process audio
- audio_chunk = result.audio.numpy()
- normalized_audio = await normalizer.normalize(audio_chunk)
- chunk_bytes = writer.write_chunk(normalized_audio)
- if chunk_bytes:
- yield chunk_bytes
-
- # Write timestamps to temp file
- timestamps_json = json.dumps(word_timestamps)
- await temp_writer.write(timestamps_json.encode())
- await temp_writer.finalize()
-
- # Finalize audio
- final_bytes = writer.write_chunk(finalize=True)
- if final_bytes:
- yield final_bytes
-
- except Exception as e:
- logger.error(f"Error in audio generation: {str(e)}")
- # Clean up writer on error
- writer.write_chunk(finalize=True)
- await temp_writer.__aexit__(type(e), e, e.__traceback__)
- # Re-raise the original exception
- raise
-
- return StreamingResponse(
- generate_chunks(),
- media_type=content_type,
- headers={
- "Content-Disposition": f"attachment; filename=speech.{request.response_format}",
- "X-Accel-Buffering": "no",
- "Cache-Control": "no-cache",
- "Transfer-Encoding": "chunked",
- "X-Timestamps-Path": timestamps_filename,
- },
- )
+ # Standard streaming without download link
+ return JSONStreamingResponse(
+ single_output(),
+ media_type="application/json",
+ headers={
+ "Content-Disposition": f"attachment; filename=speech.{request.response_format}",
+ "X-Accel-Buffering": "no",
+ "Cache-Control": "no-cache",
+ "Transfer-Encoding": "chunked",
+ },
+ )
+ else:
+ # Generate complete audio using public interface
+ audio_data = await tts_service.generate_audio(
+ text=request.input,
+ voice=voice_name,
+ speed=request.speed,
+ return_timestamps=request.return_timestamps,
+ normalization_options=request.normalization_options,
+ lang_code=request.lang_code,
+ )
+
+ audio_data = await AudioService.convert_audio(
+ audio_data,
+ 24000,
+ request.response_format,
+ is_first_chunk=True,
+ is_last_chunk=False,
+ trim_audio=False,
+ )
+
+            # Finalize the encoder with an empty final chunk to flush any remaining bytes
+ final = await AudioService.convert_audio(
+ AudioChunk(np.array([], dtype=np.int16)),
+ 24000,
+ request.response_format,
+ is_first_chunk=False,
+ is_last_chunk=True,
+ )
+ output=audio_data.output + final.output
+
+ base64_output= base64.b64encode(output).decode("utf-8")
+
+ content=CaptionedSpeechResponse(audio=base64_output,audio_format=content_type,timestamps=audio_data.word_timestamps).model_dump()
+ return JSONResponse(
+ content=content,
+ media_type="application/json",
+ headers={
+ "Content-Disposition": f"attachment; filename=speech.{request.response_format}",
+ "Cache-Control": "no-cache", # Prevent caching
+ },
+ )
except ValueError as e:
+ # Handle validation errors
logger.warning(f"Invalid request: {str(e)}")
raise HTTPException(
status_code=400,
@@ -325,6 +331,7 @@ async def create_captioned_speech(
},
)
except RuntimeError as e:
+ # Handle runtime/processing errors
logger.error(f"Processing error: {str(e)}")
raise HTTPException(
status_code=500,
@@ -335,7 +342,8 @@ async def create_captioned_speech(
},
)
except Exception as e:
- logger.error(f"Unexpected error in speech generation: {str(e)}")
+ # Handle unexpected errors
+ logger.error(f"Unexpected error in captioned speech generation: {str(e)}")
raise HTTPException(
status_code=500,
detail={
@@ -343,4 +351,4 @@ async def create_captioned_speech(
"message": str(e),
"type": "server_error",
},
- )
+ )
\ No newline at end of file
diff --git a/api/src/routers/openai_compatible.py b/api/src/routers/openai_compatible.py
index 327ccdd..c4036d6 100644
--- a/api/src/routers/openai_compatible.py
+++ b/api/src/routers/openai_compatible.py
@@ -5,14 +5,19 @@ import json
import os
import re
import tempfile
-from typing import AsyncGenerator, Dict, List, Union
+from typing import AsyncGenerator, Dict, List, Union, Tuple
+import numpy as np
import aiofiles
+
+from ..structures.schemas import CaptionedSpeechRequest
import torch
from fastapi import APIRouter, Depends, Header, HTTPException, Request, Response
from fastapi.responses import FileResponse, StreamingResponse
from loguru import logger
+from ..inference.base import AudioChunk
from ..core.config import settings
from ..services.audio import AudioService
from ..services.tts_service import TTSService
@@ -126,21 +131,29 @@ async def process_voices(
async def stream_audio_chunks(
- tts_service: TTSService, request: OpenAISpeechRequest, client_request: Request
-) -> AsyncGenerator[bytes, None]:
+ tts_service: TTSService, request: Union[OpenAISpeechRequest,CaptionedSpeechRequest], client_request: Request
+) -> AsyncGenerator[AudioChunk, None]:
"""Stream audio chunks as they're generated with client disconnect handling"""
voice_name = await process_voices(request.voice, tts_service)
+    # Only CaptionedSpeechRequest carries return_timestamps; default to False for plain speech requests
+    return_timestamps = getattr(request, "return_timestamps", False)
+
try:
logger.info(f"Starting audio generation with lang_code: {request.lang_code}")
- async for chunk in tts_service.generate_audio_stream(
+ async for chunk_data in tts_service.generate_audio_stream(
text=request.input,
voice=voice_name,
speed=request.speed,
output_format=request.response_format,
lang_code=request.lang_code or settings.default_voice_code or voice_name[0].lower(),
- normalization_options=request.normalization_options
+ normalization_options=request.normalization_options,
+            return_timestamps=return_timestamps,
):
+
# Check if client is still connected
is_disconnected = client_request.is_disconnected
if callable(is_disconnected):
@@ -148,7 +161,8 @@ async def stream_audio_chunks(
if is_disconnected:
logger.info("Client disconnected, stopping audio generation")
break
- yield chunk
+
+ yield chunk_data
except Exception as e:
logger.error(f"Error in audio streaming: {str(e)}")
# Let the exception propagate to trigger cleanup
@@ -157,6 +171,7 @@ async def stream_audio_chunks(
@router.post("/audio/speech")
async def create_speech(
request: OpenAISpeechRequest,
client_request: Request,
x_raw_response: str = Header(None, alias="x-raw-response"),
@@ -218,10 +233,13 @@ async def create_speech(
async def dual_output():
try:
# Write chunks to temp file and stream
- async for chunk in generator:
- if chunk: # Skip empty chunks
- await temp_writer.write(chunk)
- yield chunk
+ async for chunk_data in generator:
+ if chunk_data.output: # Skip empty chunks
+ await temp_writer.write(chunk_data.output)
+                                yield chunk_data.output
# Finalize the temp file
await temp_writer.finalize()
@@ -239,9 +257,19 @@ async def create_speech(
dual_output(), media_type=content_type, headers=headers
)
+ async def single_output():
+ try:
+ # Stream chunks
+ async for chunk_data in generator:
+ if chunk_data.output: # Skip empty chunks
+ yield chunk_data.output
+ except Exception as e:
+ logger.error(f"Error in single output streaming: {e}")
+ raise
+
# Standard streaming without download link
return StreamingResponse(
- generator,
+ single_output(),
media_type=content_type,
headers={
"Content-Disposition": f"attachment; filename=speech.{request.response_format}",
@@ -252,24 +280,34 @@ async def create_speech(
)
else:
# Generate complete audio using public interface
- audio, _ = await tts_service.generate_audio(
+ audio_data = await tts_service.generate_audio(
text=request.input,
voice=voice_name,
speed=request.speed,
+ normalization_options=request.normalization_options,
lang_code=request.lang_code,
)
- # Convert to requested format with proper finalization
- content = await AudioService.convert_audio(
- audio,
+ audio_data = await AudioService.convert_audio(
+ audio_data,
24000,
request.response_format,
is_first_chunk=True,
+ is_last_chunk=False,
+ trim_audio=False
+ )
+
+            # Finalize the encoder with an empty final chunk to flush any remaining bytes
+ final = await AudioService.convert_audio(
+ AudioChunk(np.array([], dtype=np.int16)),
+ 24000,
+ request.response_format,
+ is_first_chunk=False,
is_last_chunk=True,
)
-
+ output=audio_data.output + final.output
return Response(
- content=content,
+ content=output,
media_type=content_type,
headers={
"Content-Disposition": f"attachment; filename=speech.{request.response_format}",
diff --git a/api/src/services/audio.py b/api/src/services/audio.py
index 64062b8..0c75224 100644
--- a/api/src/services/audio.py
+++ b/api/src/services/audio.py
@@ -1,6 +1,8 @@
"""Audio conversion service"""
import struct
+import time
+from typing import Tuple
from io import BytesIO
import numpy as np
@@ -13,7 +15,7 @@ from torch import norm
from ..core.config import settings
from .streaming_audio_writer import StreamingAudioWriter
-
+from ..inference.base import AudioChunk
class AudioNormalizer:
"""Handles audio normalization state for a single stream"""
@@ -55,7 +57,6 @@ class AudioNormalizer:
non_silent_index_start, non_silent_index_end = None,None
for X in range(0,len(audio_data)):
- #print(audio_data[X])
if audio_data[X] > amplitude_threshold:
non_silent_index_start=X
break
@@ -71,7 +72,7 @@ class AudioNormalizer:
return max(non_silent_index_start - self.samples_to_pad_start,0), min(non_silent_index_end + math.ceil(samples_to_pad_end / speed),len(audio_data))
- async def normalize(self, audio_data: np.ndarray) -> np.ndarray:
+ def normalize(self, audio_data: np.ndarray) -> np.ndarray:
"""Convert audio data to int16 range
Args:
@@ -79,18 +80,16 @@ class AudioNormalizer:
Returns:
Normalized audio data
"""
- if len(audio_data) == 0:
- raise ValueError("Empty audio data")
-
- # Scale directly to int16 range with clipping
- return np.clip(audio_data * 32767, -32768, 32767).astype(np.int16)
-
+ if audio_data.dtype != np.int16:
+ # Scale directly to int16 range with clipping
+ return np.clip(audio_data * 32767, -32768, 32767).astype(np.int16)
+ return audio_data
class AudioService:
"""Service for audio format conversions with streaming support"""
# Supported formats
- SUPPORTED_FORMATS = {"wav", "mp3", "opus", "flac", "aac", "pcm", "ogg"}
+ SUPPORTED_FORMATS = {"wav", "mp3", "opus", "flac", "aac", "pcm"}
# Default audio format settings balanced for speed and compression
DEFAULT_SETTINGS = {
@@ -113,15 +112,16 @@ class AudioService:
@staticmethod
async def convert_audio(
- audio_data: np.ndarray,
+ audio_chunk: AudioChunk,
sample_rate: int,
output_format: str,
speed: float = 1,
chunk_text: str = "",
is_first_chunk: bool = True,
is_last_chunk: bool = False,
+ trim_audio: bool = True,
normalizer: AudioNormalizer = None,
- ) -> bytes:
+    ) -> AudioChunk:
"""Convert audio data to specified format with streaming support
Args:
@@ -137,6 +137,7 @@ class AudioService:
Returns:
Bytes of the converted audio chunk
"""
+
try:
# Validate format
if output_format not in AudioService.SUPPORTED_FORMATS:
@@ -145,9 +146,11 @@ class AudioService:
# Always normalize audio to ensure proper amplitude scaling
if normalizer is None:
normalizer = AudioNormalizer()
-
- normalized_audio = await normalizer.normalize(audio_data)
- normalized_audio = AudioService.trim_audio(normalized_audio,chunk_text,speed,is_last_chunk,normalizer)
+
+ audio_chunk.audio = normalizer.normalize(audio_chunk.audio)
+
+            if trim_audio:
+ audio_chunk = AudioService.trim_audio(audio_chunk,chunk_text,speed,is_last_chunk,normalizer)
# Get or create format-specific writer
writer_key = f"{output_format}_{sample_rate}"
@@ -155,19 +158,24 @@ class AudioService:
AudioService._writers[writer_key] = StreamingAudioWriter(
output_format, sample_rate
)
+
writer = AudioService._writers[writer_key]
-
+
# Write audio data first
- if len(normalized_audio) > 0:
- chunk_data = writer.write_chunk(normalized_audio)
+ if len(audio_chunk.audio) > 0:
+ chunk_data = writer.write_chunk(audio_chunk.audio)
# Then finalize if this is the last chunk
if is_last_chunk:
final_data = writer.write_chunk(finalize=True)
del AudioService._writers[writer_key]
- return final_data if final_data else b""
-
- return chunk_data if chunk_data else b""
+ if final_data:
+ audio_chunk.output=final_data
+ return audio_chunk
+
+ if chunk_data:
+ audio_chunk.output=chunk_data
+ return audio_chunk
except Exception as e:
logger.error(f"Error converting audio stream to {output_format}: {str(e)}")
@@ -175,7 +183,7 @@ class AudioService:
f"Failed to convert audio stream to {output_format}: {str(e)}"
)
@staticmethod
- def trim_audio(audio_data: np.ndarray, chunk_text: str = "", speed: float = 1, is_last_chunk: bool = False, normalizer: AudioNormalizer = None) -> np.ndarray:
+ def trim_audio(audio_chunk: AudioChunk, chunk_text: str = "", speed: float = 1, is_last_chunk: bool = False, normalizer: AudioNormalizer = None) -> AudioChunk:
"""Trim silence from start and end
Args:
@@ -191,10 +199,23 @@ class AudioService:
if normalizer is None:
normalizer = AudioNormalizer()
+ audio_chunk.audio=normalizer.normalize(audio_chunk.audio)
+
+        trimmed_samples = 0
# Trim start and end if enough samples
- if len(audio_data) > (2 * normalizer.samples_to_trim):
- audio_data = audio_data[normalizer.samples_to_trim : -normalizer.samples_to_trim]
+ if len(audio_chunk.audio) > (2 * normalizer.samples_to_trim):
+ audio_chunk.audio = audio_chunk.audio[normalizer.samples_to_trim : -normalizer.samples_to_trim]
+            trimmed_samples += normalizer.samples_to_trim
# Find non silent portion and trim
- start_index,end_index=normalizer.find_first_last_non_silent(audio_data,chunk_text,speed,is_last_chunk=is_last_chunk)
- return audio_data[start_index:end_index]
\ No newline at end of file
+ start_index,end_index=normalizer.find_first_last_non_silent(audio_chunk.audio,chunk_text,speed,is_last_chunk=is_last_chunk)
+
+ audio_chunk.audio=audio_chunk.audio[start_index:end_index]
+        trimmed_samples += start_index
+
+        if audio_chunk.word_timestamps is not None:
+            for timestamp in audio_chunk.word_timestamps:
+                # Shift word timestamps back by the trimmed duration (24 kHz sample rate)
+                timestamp.start_time -= trimmed_samples / 24000
+                timestamp.end_time -= trimmed_samples / 24000
+ return audio_chunk
+
\ No newline at end of file
diff --git a/api/src/services/streaming_audio_writer.py b/api/src/services/streaming_audio_writer.py
index 1a45eec..b69e813 100644
--- a/api/src/services/streaming_audio_writer.py
+++ b/api/src/services/streaming_audio_writer.py
@@ -8,7 +8,7 @@ import numpy as np
import soundfile as sf
from loguru import logger
from pydub import AudioSegment
-
+import av
class StreamingAudioWriter:
"""Handles streaming audio format conversions"""
@@ -18,60 +18,19 @@ class StreamingAudioWriter:
self.sample_rate = sample_rate
self.channels = channels
self.bytes_written = 0
- self.buffer = BytesIO()
+ self.pts=0
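+        # Codec names (as used by PyAV/ffmpeg) for each encoded output format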
+ codec_map = {"wav":"pcm_s16le","mp3":"mp3","opus":"libopus","flac":"flac", "aac":"aac"}
# Format-specific setup
- if self.format == "wav":
- self._write_wav_header_initial()
- elif self.format in ["ogg", "opus"]:
- # For OGG/Opus, write to memory buffer
- self.writer = sf.SoundFile(
- file=self.buffer,
- mode="w",
- samplerate=sample_rate,
- channels=channels,
- format="OGG",
- subtype="VORBIS" if self.format == "ogg" else "OPUS",
- )
- elif self.format == "flac":
- # For FLAC, write to memory buffer
- self.writer = sf.SoundFile(
- file=self.buffer,
- mode="w",
- samplerate=sample_rate,
- channels=channels,
- format="FLAC",
- )
- elif self.format in ["mp3", "aac"]:
- # For MP3/AAC, we'll use pydub's incremental writer
- self.segments = [] # Store segments until we have enough data
- self.total_duration = 0 # Track total duration in milliseconds
- # Initialize an empty AudioSegment as our encoder
- self.encoder = AudioSegment.silent(duration=0, frame_rate=self.sample_rate)
- elif self.format == "pcm":
- # PCM doesn't need initialization, we'll write raw bytes
- pass
+ if self.format in ["wav", "opus","flac","mp3","aac","pcm"]:
+ if self.format != "pcm":
+ self.output_buffer = BytesIO()
+ self.container = av.open(self.output_buffer, mode="w", format=self.format)
+ self.stream = self.container.add_stream(codec_map[self.format], rate=self.sample_rate,sample_rate=self.sample_rate,layout='mono' if self.channels == 1 else 'stereo')
else:
raise ValueError(f"Unsupported format: {format}")
- def _write_wav_header_initial(self) -> None:
- """Write initial WAV header with placeholders"""
- self.buffer.write(b"RIFF")
-        self.buffer.write(struct.pack("<L", 0))  # size placeholders and remaining WAV header fields
 
     def write_chunk(self, audio_data: Optional[np.ndarray] = None, finalize: bool = False) -> bytes:
@@ -81,153 +40,37 @@ class StreamingAudioWriter:
audio_data: Audio data to write, or None if finalizing
finalize: Whether this is the final write to close the stream
"""
- output_buffer = BytesIO()
if finalize:
- if self.format == "wav":
- # Calculate actual file and data sizes
- file_size = self.bytes_written + 36 # RIFF header bytes
- data_size = self.bytes_written
-
- # Seek to the beginning to overwrite the placeholders
- self.buffer.seek(4)
-                self.buffer.write(struct.pack("<L", file_size))
-                return self.buffer.getvalue()
-
-            elif self.format in ["ogg", "opus", "flac"]:
-                self.writer.close()
-                return self.buffer.getvalue()
-
-            elif self.format in ["mp3", "aac"] and len(self.encoder) > 0:
- format_args = {
- "mp3": {"format": "mp3", "codec": "libmp3lame"},
- "aac": {"format": "adts", "codec": "aac"},
- }[self.format]
-
- parameters = []
- if self.format == "mp3":
- parameters.extend(
- [
- "-q:a",
- "0", # Highest quality
- "-write_xing",
- "1", # XING header for MP3
- "-id3v1",
- "1",
- "-id3v2",
- "1",
- "-write_vbr",
- "1",
- "-vbr_quality",
- "2",
- ]
- )
- elif self.format == "aac":
- parameters.extend(
- [
- "-q:a",
- "2",
- "-write_xing",
- "0",
- "-write_id3v1",
- "0",
- "-write_id3v2",
- "0",
- ]
- )
-
- self.encoder.export(
- output_buffer,
- **format_args,
- bitrate="192k", # Optimal for 24kHz/16-bit mono source
- parameters=parameters,
- )
- self.encoder = None
-
- return output_buffer.getvalue()
+ if self.format != "pcm":
+ packets = self.stream.encode(None)
+ for packet in packets:
+ self.container.mux(packet)
+
+ data=self.output_buffer.getvalue()
+ self.container.close()
+ return data
if audio_data is None or len(audio_data) == 0:
return b""
- if self.format == "wav":
- # Write raw PCM data
- self.buffer.write(audio_data.tobytes())
- self.bytes_written += len(audio_data.tobytes())
- return b""
-
- elif self.format in ["ogg", "opus", "flac"]:
- # Write to soundfile buffer
- self.writer.write(audio_data)
- self.writer.flush()
- return self.buffer.getvalue()
-
- elif self.format in ["mp3", "aac"]:
- # Convert chunk to AudioSegment and encode
- segment = AudioSegment(
- audio_data.tobytes(),
- frame_rate=self.sample_rate,
- sample_width=audio_data.dtype.itemsize,
- channels=self.channels,
- )
-
- # Track total duration
- self.total_duration += len(segment)
-
- # Add segment to encoder
- self.encoder += segment
-
- # Export current state to buffer without final metadata
- format_args = {
- "mp3": {"format": "mp3", "codec": "libmp3lame"},
- "aac": {"format": "adts", "codec": "aac"},
- }[self.format]
-
- # For chunks, export without duration metadata or XING headers
- self.encoder.export(
- output_buffer,
- **format_args,
- bitrate="192k", # Optimal for 24kHz/16-bit mono source
- parameters=[
- "-q:a",
- "0", # Highest quality for chunks too
- "-write_xing",
- "0", # No XING headers for chunks
- ],
- )
-
- # Get the encoded data
- encoded_data = output_buffer.getvalue()
-
- # Reset encoder to prevent memory growth
- self.encoder = AudioSegment.silent(duration=0, frame_rate=self.sample_rate)
-
- return encoded_data
-
- elif self.format == "pcm":
+ if self.format == "pcm":
# Write raw bytes
return audio_data.tobytes()
+ else:
+ frame = av.AudioFrame.from_ndarray(audio_data.reshape(1, -1), format='s16', layout='mono' if self.channels == 1 else 'stereo')
+ frame.sample_rate=self.sample_rate
- return b""
+
+ frame.pts = self.pts
+ self.pts += frame.samples
+
+ packets = self.stream.encode(frame)
+ for packet in packets:
+ self.container.mux(packet)
+
+ data = self.output_buffer.getvalue()
+ self.output_buffer.seek(0)
+ self.output_buffer.truncate(0)
+ return data
- def close(self) -> Optional[bytes]:
- """Finish the audio file and return any remaining data"""
- if self.format == "wav":
- # Re-finalize WAV file by updating headers
- self.buffer.seek(0)
- file_content = self.write_chunk(finalize=True)
- return file_content
-
- elif self.format in ["ogg", "opus", "flac"]:
- # Finalize other formats
- self.writer.close()
- return self.buffer.getvalue()
-
- elif self.format in ["mp3", "aac"]:
- # Finalize MP3/AAC
- final_data = self.write_chunk(finalize=True)
- return final_data
-
- return None
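
For context, the rewritten StreamingAudioWriter leans on PyAV's incremental encoder: hand the stream int16 frames with a monotonically increasing `pts`, mux whatever packets come back, and flush with `encode(None)` before closing the container. Below is a minimal standalone sketch of that pattern, mirroring the calls used in the diff; the 24 kHz mono settings and the output path are illustrative assumptions, and it only needs `av` and `numpy` installed.

```python
from io import BytesIO

import av
import numpy as np

SAMPLE_RATE = 24000  # illustrative; the service encodes 24 kHz mono

buffer = BytesIO()
container = av.open(buffer, mode="w", format="mp3")
stream = container.add_stream("mp3", rate=SAMPLE_RATE, layout="mono")

pts = 0
for _ in range(3):
    # One second of silence as int16 PCM, shaped (channels, samples)
    samples = np.zeros((1, SAMPLE_RATE), dtype=np.int16)
    frame = av.AudioFrame.from_ndarray(samples, format="s16", layout="mono")
    frame.sample_rate = SAMPLE_RATE
    frame.pts = pts
    pts += frame.samples
    for packet in stream.encode(frame):
        container.mux(packet)

# Flush buffered packets, then close the container to finish the file
for packet in stream.encode(None):
    container.mux(packet)
container.close()

with open("silence.mp3", "wb") as f:
    f.write(buffer.getvalue())
```

Flushing with `encode(None)` is what pushes the encoder's last packets into the buffer before `close()`, the same two-step sequence the diff performs in its `finalize` branch.
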
diff --git a/api/src/services/text_processing/text_processor.py b/api/src/services/text_processing/text_processor.py
index 4466cca..018cc51 100644
--- a/api/src/services/text_processing/text_processor.py
+++ b/api/src/services/text_processing/text_processor.py
@@ -106,6 +106,7 @@ def get_sentence_info(text: str) -> List[Tuple[str, List[int], int]]:
async def smart_split(
text: str,
max_tokens: int = settings.absolute_max_tokens,
+ lang_code: str = "a",
normalization_options: NormalizationOptions = NormalizationOptions()
) -> AsyncGenerator[Tuple[str, List[int]], None]:
"""Build optimal chunks targeting 300-400 tokens, never exceeding max_tokens."""
@@ -113,9 +114,12 @@ async def smart_split(
chunk_count = 0
logger.info(f"Starting smart split for {len(text)} chars")
- # Normilize text
+ # Normalize text
if settings.advanced_text_normalization and normalization_options.normalize:
- text=normalize_text(text,normalization_options)
+ if lang_code in ["a","b","en-us","en-gb"]:
+ text=normalize_text(text,normalization_options)
+ else:
+            logger.info("Skipping text normalization as it is only supported for English")
# Process all sentences
sentences = get_sentence_info(text)
@@ -241,4 +245,4 @@ async def smart_split(
total_time = time.time() - start_time
logger.info(
f"Split completed in {total_time * 1000:.2f}ms, produced {chunk_count} chunks"
- )
+ )
\ No newline at end of file
diff --git a/api/src/services/tts_service.py b/api/src/services/tts_service.py
index ba4dcc4..a115c18 100644
--- a/api/src/services/tts_service.py
+++ b/api/src/services/tts_service.py
@@ -6,6 +6,7 @@ import tempfile
import time
from typing import AsyncGenerator, List, Optional, Tuple, Union
+from ..inference.base import AudioChunk
import numpy as np
import torch
from kokoro import KPipeline
@@ -52,7 +53,8 @@ class TTSService:
is_last: bool = False,
normalizer: Optional[AudioNormalizer] = None,
lang_code: Optional[str] = None,
- ) -> AsyncGenerator[Union[np.ndarray, bytes], None]:
+ return_timestamps: Optional[bool] = False,
+ ) -> AsyncGenerator[AudioChunk, None]:
"""Process tokens into audio."""
async with self._chunk_semaphore:
try:
@@ -60,11 +62,10 @@ class TTSService:
if is_last:
# Skip format conversion for raw audio mode
if not output_format:
- yield np.array([], dtype=np.float32)
+ yield AudioChunk(np.array([], dtype=np.int16),output=b'')
return
-
- result = await AudioService.convert_audio(
- np.array([0], dtype=np.float32), # Dummy data for type checking
+ chunk_data = await AudioService.convert_audio(
+ AudioChunk(np.array([], dtype=np.float32)), # Dummy data for type checking
24000,
output_format,
speed,
@@ -73,7 +74,7 @@ class TTSService:
normalizer=normalizer,
is_last_chunk=True,
)
- yield result
+ yield chunk_data
return
# Skip empty chunks
@@ -85,58 +86,62 @@ class TTSService:
# Generate audio using pre-warmed model
if isinstance(backend, KokoroV1):
+ chunk_index=0
# For Kokoro V1, pass text and voice info with lang_code
- async for chunk_audio in self.model_manager.generate(
+ async for chunk_data in self.model_manager.generate(
chunk_text,
(voice_name, voice_path),
speed=speed,
lang_code=lang_code,
+ return_timestamps=return_timestamps,
):
# For streaming, convert to bytes
if output_format:
try:
- converted = await AudioService.convert_audio(
- chunk_audio,
+ chunk_data = await AudioService.convert_audio(
+ chunk_data,
24000,
output_format,
speed,
chunk_text,
- is_first_chunk=is_first,
+ is_first_chunk=is_first and chunk_index == 0,
is_last_chunk=is_last,
normalizer=normalizer,
)
- yield converted
+ yield chunk_data
except Exception as e:
logger.error(f"Failed to convert audio: {str(e)}")
else:
- trimmed = await AudioService.trim_audio(chunk_audio,
+ chunk_data = AudioService.trim_audio(chunk_data,
chunk_text,
speed,
is_last,
normalizer)
- yield trimmed
+ yield chunk_data
+ chunk_index+=1
else:
+
# For legacy backends, load voice tensor
voice_tensor = await self._voice_manager.load_voice(
voice_name, device=backend.device
)
- chunk_audio = await self.model_manager.generate(
- tokens, voice_tensor, speed=speed
+ chunk_data = await self.model_manager.generate(
+ tokens, voice_tensor, speed=speed, return_timestamps=return_timestamps
)
- if chunk_audio is None:
+ if chunk_data.audio is None:
logger.error("Model generated None for audio chunk")
return
- if len(chunk_audio) == 0:
+ if len(chunk_data.audio) == 0:
logger.error("Model generated empty audio chunk")
return
# For streaming, convert to bytes
if output_format:
try:
- converted = await AudioService.convert_audio(
- chunk_audio,
+ chunk_data = await AudioService.convert_audio(
+ chunk_data,
24000,
output_format,
speed,
@@ -145,11 +150,11 @@ class TTSService:
normalizer=normalizer,
is_last_chunk=is_last,
)
- yield converted
+ yield chunk_data
except Exception as e:
logger.error(f"Failed to convert audio: {str(e)}")
else:
- trimmed = await AudioService.trim_audio(chunk_audio,
+ trimmed = AudioService.trim_audio(chunk_data,
chunk_text,
speed,
is_last,
@@ -222,6 +227,8 @@ class TTSService:
return voice, combined_path
else:
# Single voice
+ if "(" in voice and ")" in voice:
+ voice = voice.split("(")[0].strip()
path = await self._voice_manager.get_voice_path(voice)
if not path:
raise RuntimeError(f"Voice not found: {voice}")
@@ -238,12 +245,13 @@ class TTSService:
speed: float = 1.0,
output_format: str = "wav",
lang_code: Optional[str] = None,
- normalization_options: Optional[NormalizationOptions] = NormalizationOptions()
- ) -> AsyncGenerator[bytes, None]:
+ normalization_options: Optional[NormalizationOptions] = NormalizationOptions(),
+ return_timestamps: Optional[bool] = False,
+ ) -> AsyncGenerator[AudioChunk, None]:
"""Generate and stream audio chunks."""
stream_normalizer = AudioNormalizer()
chunk_index = 0
-
+ current_offset=0.0
try:
# Get backend
backend = self.model_manager.get_backend()
@@ -257,12 +265,13 @@ class TTSService:
logger.info(
f"Using lang_code '{pipeline_lang_code}' for voice '{voice_name}' in audio stream"
)
-
+
+
# Process text in chunks with smart splitting
- async for chunk_text, tokens in smart_split(text,normalization_options=normalization_options):
+ async for chunk_text, tokens in smart_split(text,lang_code=lang_code,normalization_options=normalization_options):
try:
# Process audio for chunk
- async for result in self._process_chunk(
+ async for chunk_data in self._process_chunk(
chunk_text, # Pass text for Kokoro V1
tokens, # Pass tokens for legacy backends
voice_name, # Pass voice name
@@ -273,15 +282,23 @@ class TTSService:
is_last=False, # We'll update the last chunk later
normalizer=stream_normalizer,
lang_code=pipeline_lang_code, # Pass lang_code
+ return_timestamps=return_timestamps,
):
- if result is not None:
- yield result
- chunk_index += 1
+ if chunk_data.word_timestamps is not None:
+ for timestamp in chunk_data.word_timestamps:
+ timestamp.start_time+=current_offset
+ timestamp.end_time+=current_offset
+
+ current_offset+=len(chunk_data.audio) / 24000
+
+ if chunk_data.output is not None:
+ yield chunk_data
+
else:
logger.warning(
f"No audio generated for chunk: '{chunk_text[:100]}...'"
)
-
+ chunk_index += 1
except Exception as e:
logger.error(
f"Failed to process audio for chunk: '{chunk_text[:100]}...'. Error: {str(e)}"
@@ -292,7 +309,7 @@ class TTSService:
if chunk_index > 0:
try:
# Empty tokens list to finalize audio
- async for result in self._process_chunk(
+ async for chunk_data in self._process_chunk(
"", # Empty text
[], # Empty tokens
voice_name,
@@ -304,14 +321,15 @@ class TTSService:
normalizer=stream_normalizer,
lang_code=pipeline_lang_code, # Pass lang_code
):
- if result is not None:
- yield result
+ if chunk_data.output is not None:
+ yield chunk_data
except Exception as e:
logger.error(f"Failed to finalize audio stream: {str(e)}")
except Exception as e:
logger.error(f"Error in phoneme audio generation: {str(e)}")
- raise
+ raise e
+
async def generate_audio(
self,
@@ -319,260 +337,24 @@ class TTSService:
voice: str,
speed: float = 1.0,
return_timestamps: bool = False,
+ normalization_options: Optional[NormalizationOptions] = NormalizationOptions(),
lang_code: Optional[str] = None,
- ) -> Union[Tuple[np.ndarray, float], Tuple[np.ndarray, float, List[dict]]]:
+ ) -> AudioChunk:
"""Generate complete audio for text using streaming internally."""
- start_time = time.time()
- chunks = []
- word_timestamps = []
-
+ audio_data_chunks=[]
+
try:
- # Get backend and voice path
- backend = self.model_manager.get_backend()
- voice_name, voice_path = await self._get_voice_path(voice)
+ async for audio_stream_data in self.generate_audio_stream(text,voice,speed=speed,normalization_options=normalization_options,return_timestamps=return_timestamps,lang_code=lang_code,output_format=None):
+ if len(audio_stream_data.audio) > 0:
+ audio_data_chunks.append(audio_stream_data)
- if isinstance(backend, KokoroV1):
- # Use provided lang_code or determine from voice name
- pipeline_lang_code = lang_code if lang_code else voice[:1].lower()
- logger.info(
- f"Using lang_code '{pipeline_lang_code}' for voice '{voice_name}' in text chunking"
- )
-
- # Get pipelines from backend for proper device management
- try:
- # Initialize quiet pipeline for text chunking
- text_chunks = []
- current_offset = 0.0 # Track time offset for timestamps
-
- logger.debug("Splitting text into chunks...")
- # Use backend's pipeline management
- for result in backend._get_pipeline(pipeline_lang_code)(text):
- if result.graphemes and result.phonemes:
- text_chunks.append((result.graphemes, result.phonemes))
- logger.debug(f"Split text into {len(text_chunks)} chunks")
-
- # Process each chunk
- for chunk_idx, (chunk_text, chunk_phonemes) in enumerate(
- text_chunks
- ):
- logger.debug(
- f"Processing chunk {chunk_idx + 1}/{len(text_chunks)}: '{chunk_text[:50]}...'"
- )
-
- # Use backend's pipeline for generation
- for result in backend._get_pipeline(pipeline_lang_code)(
- chunk_text, voice=voice_path, speed=speed
- ):
- # Collect audio chunks
- if result.audio is not None:
- chunks.append(result.audio.numpy())
-
- # Process timestamps for this chunk
- if (
- return_timestamps
- and hasattr(result, "tokens")
- and result.tokens
- ):
- logger.debug(
- f"Processing chunk timestamps with {len(result.tokens)} tokens"
- )
- if result.pred_dur is not None:
- try:
- # Join timestamps for this chunk's tokens
- KPipeline.join_timestamps(
- result.tokens, result.pred_dur
- )
-
- # Add timestamps with offset
- for token in result.tokens:
- if not all(
- hasattr(token, attr)
- for attr in [
- "text",
- "start_ts",
- "end_ts",
- ]
- ):
- continue
- if not token.text or not token.text.strip():
- continue
-
- # Apply offset to timestamps
- start_time = (
- float(token.start_ts) + current_offset
- )
- end_time = (
- float(token.end_ts) + current_offset
- )
-
- word_timestamps.append(
- {
- "word": str(token.text).strip(),
- "start_time": start_time,
- "end_time": end_time,
- }
- )
- logger.debug(
- f"Added timestamp for word '{token.text}': {start_time:.3f}s - {end_time:.3f}s"
- )
-
- # Update offset for next chunk based on pred_dur
- chunk_duration = (
- float(result.pred_dur.sum()) / 80
- ) # Convert frames to seconds
- current_offset = max(
- current_offset + chunk_duration, end_time
- )
- logger.debug(
- f"Updated time offset to {current_offset:.3f}s"
- )
-
- except Exception as e:
- logger.error(
- f"Failed to process timestamps for chunk: {e}"
- )
- logger.debug(
- f"Processing timestamps with pred_dur shape: {result.pred_dur.shape}"
- )
- try:
- # Join timestamps for this chunk's tokens
- KPipeline.join_timestamps(
- result.tokens, result.pred_dur
- )
- logger.debug(
- "Successfully joined timestamps for chunk"
- )
- except Exception as e:
- logger.error(
- f"Failed to join timestamps for chunk: {e}"
- )
- continue
-
- # Convert tokens to timestamps
- for token in result.tokens:
- try:
- # Skip tokens without required attributes
- if not all(
- hasattr(token, attr)
- for attr in ["text", "start_ts", "end_ts"]
- ):
- logger.debug(
- f"Skipping token missing attributes: {dir(token)}"
- )
- continue
-
- # Get and validate text
- text = (
- str(token.text).strip()
- if token.text is not None
- else ""
- )
- if not text:
- logger.debug("Skipping empty token")
- continue
-
- # Get and validate timestamps
- start_ts = getattr(token, "start_ts", None)
- end_ts = getattr(token, "end_ts", None)
- if start_ts is None or end_ts is None:
- logger.debug(
- f"Skipping token with None timestamps: {text}"
- )
- continue
-
- # Convert timestamps to float
- try:
- start_time = float(start_ts)
- end_time = float(end_ts)
- except (TypeError, ValueError):
- logger.debug(
- f"Skipping token with invalid timestamps: {text}"
- )
- continue
-
- # Add timestamp
- word_timestamps.append(
- {
- "word": text,
- "start_time": start_time,
- "end_time": end_time,
- }
- )
- logger.debug(
- f"Added timestamp for word '{text}': {start_time:.3f}s - {end_time:.3f}s"
- )
- except Exception as e:
- logger.warning(f"Error processing token: {e}")
- continue
-
- except Exception as e:
- logger.error(f"Failed to process text with pipeline: {e}")
- raise RuntimeError(f"Pipeline processing failed: {e}")
-
- if not chunks:
- raise ValueError("No audio chunks were generated successfully")
-
- # Combine chunks
- audio = np.concatenate(chunks) if len(chunks) > 1 else chunks[0]
- processing_time = time.time() - start_time
-
- if return_timestamps:
- # Validate timestamps before returning
- if not word_timestamps:
- logger.warning("No valid timestamps were generated")
- else:
- # Sort timestamps by start time to ensure proper order
- word_timestamps.sort(key=lambda x: x["start_time"])
- # Validate timestamp sequence
- for i in range(1, len(word_timestamps)):
- prev = word_timestamps[i - 1]
- curr = word_timestamps[i]
- if curr["start_time"] < prev["end_time"]:
- logger.warning(
- f"Overlapping timestamps detected: '{prev['word']}' ({prev['start_time']:.3f}-{prev['end_time']:.3f}) and '{curr['word']}' ({curr['start_time']:.3f}-{curr['end_time']:.3f})"
- )
-
- logger.debug(
- f"Returning {len(word_timestamps)} word timestamps"
- )
- logger.debug(
- f"First timestamp: {word_timestamps[0]['word']} at {word_timestamps[0]['start_time']:.3f}s"
- )
- logger.debug(
- f"Last timestamp: {word_timestamps[-1]['word']} at {word_timestamps[-1]['end_time']:.3f}s"
- )
-
- return audio, processing_time, word_timestamps
- return audio, processing_time
-
- else:
- # For legacy backends
- async for chunk in self.generate_audio_stream(
- text,
- voice,
- speed, # Default to WAV for raw audio
- ):
- if chunk is not None:
- chunks.append(chunk)
-
- if not chunks:
- raise ValueError("No audio chunks were generated successfully")
-
- # Combine chunks
- audio = np.concatenate(chunks) if len(chunks) > 1 else chunks[0]
- processing_time = time.time() - start_time
-
- if return_timestamps:
- return (
- audio,
- processing_time,
- [],
- ) # Empty timestamps for legacy backends
- return audio, processing_time
+ combined_audio_data=AudioChunk.combine(audio_data_chunks)
+ return combined_audio_data
except Exception as e:
logger.error(f"Error in audio generation: {str(e)}")
raise
+
async def combine_voices(self, voices: List[str]) -> torch.Tensor:
"""Combine multiple voices.
diff --git a/api/src/structures/custom_responses.py b/api/src/structures/custom_responses.py
new file mode 100644
index 0000000..3d3a987
--- /dev/null
+++ b/api/src/structures/custom_responses.py
@@ -0,0 +1,51 @@
+from collections.abc import AsyncIterable, Iterable
+
+import json
+import typing
+from pydantic import BaseModel
+from starlette.background import BackgroundTask
+from starlette.concurrency import iterate_in_threadpool
+from starlette.responses import JSONResponse, StreamingResponse
+
+
+class JSONStreamingResponse(StreamingResponse, JSONResponse):
+    """StreamingResponse that also renders as JSON."""
+
+ def __init__(
+ self,
+ content: Iterable | AsyncIterable,
+ status_code: int = 200,
+ headers: dict[str, str] | None = None,
+ media_type: str | None = None,
+ background: BackgroundTask | None = None,
+ ) -> None:
+ if isinstance(content, AsyncIterable):
+ self._content_iterable: AsyncIterable = content
+ else:
+ self._content_iterable = iterate_in_threadpool(content)
+
+
+
+ async def body_iterator() -> AsyncIterable[bytes]:
+ async for content_ in self._content_iterable:
+ if isinstance(content_, BaseModel):
+ content_ = content_.model_dump()
+ yield self.render(content_)
+
+
+
+ self.body_iterator = body_iterator()
+ self.status_code = status_code
+ if media_type is not None:
+ self.media_type = media_type
+ self.background = background
+ self.init_headers(headers)
+
+ def render(self, content: typing.Any) -> bytes:
+ return (json.dumps(
+ content,
+ ensure_ascii=False,
+ allow_nan=False,
+ indent=None,
+ separators=(",", ":"),
+ ) + "\n").encode("utf-8")
\ No newline at end of file
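
A hedged usage sketch for `JSONStreamingResponse`: returning it from a FastAPI route with an async generator of Pydantic models. The `Caption` model and `/captions` path are illustrative assumptions; the import path matches the new module added above.

```python
from fastapi import FastAPI
from pydantic import BaseModel

from api.src.structures.custom_responses import JSONStreamingResponse

app = FastAPI()


class Caption(BaseModel):
    word: str
    start_time: float
    end_time: float


async def caption_stream():
    # Each yielded model becomes one newline-terminated JSON object in the body
    yield Caption(word="hello", start_time=0.0, end_time=0.4)
    yield Caption(word="world", start_time=0.4, end_time=0.8)


@app.get("/captions")
async def captions():
    return JSONStreamingResponse(caption_stream(), media_type="application/json")
```

Because `render()` terminates every object with a newline, a client can consume the stream line by line (for example with `iter_lines()` in requests) and parse each line as JSON on arrival.
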
diff --git a/api/src/structures/schemas.py b/api/src/structures/schemas.py
index 22c28bc..fa65b4d 100644
--- a/api/src/structures/schemas.py
+++ b/api/src/structures/schemas.py
@@ -33,8 +33,9 @@ class WordTimestamp(BaseModel):
class CaptionedSpeechResponse(BaseModel):
"""Response schema for captioned speech endpoint"""
- audio: bytes = Field(..., description="The generated audio data")
- words: List[WordTimestamp] = Field(..., description="Word-level timestamps")
+    audio: str = Field(..., description="The generated audio data encoded in base64")
+ audio_format: str = Field(..., description="The format of the output audio")
+ timestamps: Optional[List[WordTimestamp]] = Field(..., description="Word-level timestamps")
class NormalizationOptions(BaseModel):
"""Options for the normalization system"""
@@ -54,7 +55,7 @@ class OpenAISpeechRequest(BaseModel):
)
input: str = Field(..., description="The text to generate audio for")
voice: str = Field(
- default="af",
+ default="af_heart",
description="The voice to use for generation. Can be a base voice or a combined voice name.",
)
response_format: Literal["mp3", "opus", "aac", "flac", "wav", "pcm"] = Field(
@@ -98,7 +99,7 @@ class CaptionedSpeechRequest(BaseModel):
)
input: str = Field(..., description="The text to generate audio for")
voice: str = Field(
- default="af",
+ default="af_heart",
description="The voice to use for generation. Can be a base voice or a combined voice name.",
)
response_format: Literal["mp3", "opus", "aac", "flac", "wav", "pcm"] = Field(
@@ -111,11 +112,23 @@ class CaptionedSpeechRequest(BaseModel):
le=4.0,
description="The speed of the generated audio. Select a value from 0.25 to 4.0.",
)
+ stream: bool = Field(
+ default=True, # Default to streaming for OpenAI compatibility
+ description="If true (default), audio will be streamed as it's generated. Each chunk will be a complete sentence.",
+ )
return_timestamps: bool = Field(
default=True,
description="If true (default), returns word-level timestamps in the response",
)
+ return_download_link: bool = Field(
+ default=False,
+ description="If true, returns a download link in X-Download-Path header after streaming completes",
+ )
lang_code: Optional[str] = Field(
default=None,
description="Optional language code to use for text processing. If not provided, will use first letter of voice name.",
)
+ normalization_options: Optional[NormalizationOptions] = Field(
+        default=NormalizationOptions(),
+        description="Options for the normalization system"
+ )
diff --git a/api/tests/test_audio_service.py b/api/tests/test_audio_service.py
index eae248c..ca2d25d 100644
--- a/api/tests/test_audio_service.py
+++ b/api/tests/test_audio_service.py
@@ -6,7 +6,7 @@ import numpy as np
import pytest
from api.src.services.audio import AudioNormalizer, AudioService
-
+from api.src.inference.base import AudioChunk
@pytest.fixture(autouse=True)
def mock_settings():
@@ -31,67 +31,83 @@ async def test_convert_to_wav(sample_audio):
"""Test converting to WAV format"""
audio_data, sample_rate = sample_audio
# Write and finalize in one step for WAV
- result = await AudioService.convert_audio(
- audio_data, sample_rate, "wav", is_first_chunk=True, is_last_chunk=True
+ audio_chunk = await AudioService.convert_audio(
+ AudioChunk(audio_data), sample_rate, "wav", is_first_chunk=True, is_last_chunk=False
)
- assert isinstance(result, bytes)
- assert len(result) > 0
+ assert isinstance(audio_chunk.output, bytes)
+ assert isinstance(audio_chunk, AudioChunk)
+ assert len(audio_chunk.output) > 0
# Check WAV header
- assert result.startswith(b"RIFF")
- assert b"WAVE" in result[:12]
+ assert audio_chunk.output.startswith(b"RIFF")
+ assert b"WAVE" in audio_chunk.output[:12]
@pytest.mark.asyncio
async def test_convert_to_mp3(sample_audio):
"""Test converting to MP3 format"""
audio_data, sample_rate = sample_audio
- result = await AudioService.convert_audio(audio_data, sample_rate, "mp3")
- assert isinstance(result, bytes)
- assert len(result) > 0
+ audio_chunk = await AudioService.convert_audio(
+ AudioChunk(audio_data), sample_rate, "mp3"
+ )
+ assert isinstance(audio_chunk.output, bytes)
+ assert isinstance(audio_chunk, AudioChunk)
+ assert len(audio_chunk.output) > 0
# Check MP3 header (ID3 or MPEG frame sync)
- assert result.startswith(b"ID3") or result.startswith(b"\xff\xfb")
+ assert audio_chunk.output.startswith(b"ID3") or audio_chunk.output.startswith(b"\xff\xfb")
@pytest.mark.asyncio
async def test_convert_to_opus(sample_audio):
"""Test converting to Opus format"""
audio_data, sample_rate = sample_audio
- result = await AudioService.convert_audio(audio_data, sample_rate, "opus")
- assert isinstance(result, bytes)
- assert len(result) > 0
+ audio_chunk = await AudioService.convert_audio(
+ AudioChunk(audio_data), sample_rate, "opus"
+ )
+ assert isinstance(audio_chunk.output, bytes)
+ assert isinstance(audio_chunk, AudioChunk)
+ assert len(audio_chunk.output) > 0
# Check OGG header
- assert result.startswith(b"OggS")
+ assert audio_chunk.output.startswith(b"OggS")
@pytest.mark.asyncio
async def test_convert_to_flac(sample_audio):
"""Test converting to FLAC format"""
audio_data, sample_rate = sample_audio
- result = await AudioService.convert_audio(audio_data, sample_rate, "flac")
- assert isinstance(result, bytes)
- assert len(result) > 0
+ audio_chunk = await AudioService.convert_audio(
+ AudioChunk(audio_data), sample_rate, "flac"
+ )
+ assert isinstance(audio_chunk.output, bytes)
+ assert isinstance(audio_chunk, AudioChunk)
+ assert len(audio_chunk.output) > 0
# Check FLAC header
- assert result.startswith(b"fLaC")
+ assert audio_chunk.output.startswith(b"fLaC")
@pytest.mark.asyncio
async def test_convert_to_aac(sample_audio):
- """Test converting to AAC format"""
+ """Test converting to M4A format"""
audio_data, sample_rate = sample_audio
- result = await AudioService.convert_audio(audio_data, sample_rate, "aac")
- assert isinstance(result, bytes)
- assert len(result) > 0
+ audio_chunk = await AudioService.convert_audio(
+ AudioChunk(audio_data), sample_rate, "aac"
+ )
+ assert isinstance(audio_chunk.output, bytes)
+ assert isinstance(audio_chunk, AudioChunk)
+ assert len(audio_chunk.output) > 0
# Check ADTS header (AAC)
- assert result.startswith(b"\xff\xf0") or result.startswith(b"\xff\xf1")
+ assert audio_chunk.output.startswith(b"\xff\xf0") or audio_chunk.output.startswith(b"\xff\xf1")
@pytest.mark.asyncio
async def test_convert_to_pcm(sample_audio):
"""Test converting to PCM format"""
audio_data, sample_rate = sample_audio
- result = await AudioService.convert_audio(audio_data, sample_rate, "pcm")
- assert isinstance(result, bytes)
- assert len(result) > 0
+ audio_chunk = await AudioService.convert_audio(
+ AudioChunk(audio_data), sample_rate, "pcm"
+ )
+ assert isinstance(audio_chunk.output, bytes)
+ assert isinstance(audio_chunk, AudioChunk)
+ assert len(audio_chunk.output) > 0
# PCM is raw bytes, so no header to check
@@ -110,11 +126,12 @@ async def test_normalization_wav(sample_audio):
# Create audio data outside int16 range
large_audio = audio_data * 1e5
# Write and finalize in one step for WAV
- result = await AudioService.convert_audio(
- large_audio, sample_rate, "wav", is_first_chunk=True, is_last_chunk=True
+ audio_chunk = await AudioService.convert_audio(
+ AudioChunk(large_audio), sample_rate, "wav", is_first_chunk=True
)
- assert isinstance(result, bytes)
- assert len(result) > 0
+ assert isinstance(audio_chunk.output, bytes)
+ assert isinstance(audio_chunk, AudioChunk)
+ assert len(audio_chunk.output) > 0
@pytest.mark.asyncio
@@ -123,9 +140,12 @@ async def test_normalization_pcm(sample_audio):
audio_data, sample_rate = sample_audio
# Create audio data outside int16 range
large_audio = audio_data * 1e5
- result = await AudioService.convert_audio(large_audio, sample_rate, "pcm")
- assert isinstance(result, bytes)
- assert len(result) > 0
+ audio_chunk = await AudioService.convert_audio(
+ AudioChunk(large_audio), sample_rate, "pcm"
+ )
+ assert isinstance(audio_chunk.output, bytes)
+ assert isinstance(audio_chunk, AudioChunk)
+ assert len(audio_chunk.output) > 0
@pytest.mark.asyncio
@@ -144,11 +164,12 @@ async def test_different_sample_rates(sample_audio):
sample_rates = [8000, 16000, 44100, 48000]
for rate in sample_rates:
- result = await AudioService.convert_audio(
- audio_data, rate, "wav", is_first_chunk=True, is_last_chunk=True
+ audio_chunk = await AudioService.convert_audio(
+ AudioChunk(audio_data), rate, "wav", is_first_chunk=True
)
- assert isinstance(result, bytes)
- assert len(result) > 0
+ assert isinstance(audio_chunk.output, bytes)
+ assert isinstance(audio_chunk, AudioChunk)
+ assert len(audio_chunk.output) > 0
@pytest.mark.asyncio
@@ -156,11 +177,15 @@ async def test_buffer_position_after_conversion(sample_audio):
"""Test that buffer position is reset after writing"""
audio_data, sample_rate = sample_audio
# Write and finalize in one step for first conversion
- result = await AudioService.convert_audio(
- audio_data, sample_rate, "wav", is_first_chunk=True, is_last_chunk=True
+ audio_chunk1 = await AudioService.convert_audio(
+ AudioChunk(audio_data), sample_rate, "wav", is_first_chunk=True, is_last_chunk=True
)
+ assert isinstance(audio_chunk1.output, bytes)
+ assert isinstance(audio_chunk1, AudioChunk)
# Convert again to ensure buffer was properly reset
- result2 = await AudioService.convert_audio(
- audio_data, sample_rate, "wav", is_first_chunk=True, is_last_chunk=True
+ audio_chunk2 = await AudioService.convert_audio(
+ AudioChunk(audio_data), sample_rate, "wav", is_first_chunk=True, is_last_chunk=True
)
- assert len(result) == len(result2)
+ assert isinstance(audio_chunk2.output, bytes)
+ assert isinstance(audio_chunk2, AudioChunk)
+ assert len(audio_chunk1.output) == len(audio_chunk2.output)
diff --git a/api/tests/test_development.py b/api/tests/test_development.py
index a05ba23..4760347 100644
--- a/api/tests/test_development.py
+++ b/api/tests/test_development.py
@@ -1,24 +1,24 @@
import pytest
from unittest.mock import patch, MagicMock
import requests
+import base64
+import json
def test_generate_captioned_speech():
"""Test the generate_captioned_speech function with mocked responses"""
# Mock the API responses
mock_audio_response = MagicMock()
mock_audio_response.status_code = 200
- mock_audio_response.content = b"mock audio data"
- mock_audio_response.headers = {"X-Timestamps-Path": "test.json"}
mock_timestamps_response = MagicMock()
mock_timestamps_response.status_code = 200
- mock_timestamps_response.json.return_value = [
- {"word": "test", "start_time": 0.0, "end_time": 1.0}
- ]
+ mock_timestamps_response.content = json.dumps({
+ "audio":base64.b64encode(b"mock audio data").decode("utf-8"),
+ "timestamps":[{"word": "test", "start_time": 0.0, "end_time": 1.0}]
+ })
- # Patch both HTTP requests
- with patch('requests.post', return_value=mock_audio_response), \
- patch('requests.get', return_value=mock_timestamps_response):
+ # Patch the HTTP requests
+ with patch('requests.post', return_value=mock_timestamps_response):
# Import here to avoid module-level import issues
from examples.captioned_speech_example import generate_captioned_speech
diff --git a/api/tests/test_openai_endpoints.py b/api/tests/test_openai_endpoints.py
index 531480f..527cb1f 100644
--- a/api/tests/test_openai_endpoints.py
+++ b/api/tests/test_openai_endpoints.py
@@ -1,9 +1,10 @@
import asyncio
import json
import os
-from typing import AsyncGenerator
+from typing import AsyncGenerator, Tuple
from unittest.mock import AsyncMock, MagicMock, patch
+from api.src.inference.base import AudioChunk
import numpy as np
import pytest
from fastapi.testclient import TestClient
@@ -144,7 +145,7 @@ async def test_stream_audio_chunks_client_disconnect():
async def mock_stream(*args, **kwargs):
for i in range(5):
- yield b"chunk"
+            yield AudioChunk(np.array([], np.int16), output=b"chunk")
mock_service.generate_audio_stream = mock_stream
mock_service.list_voices.return_value = ["test_voice"]
@@ -236,10 +237,10 @@ def mock_tts_service(mock_audio_bytes):
"""Mock TTS service for testing."""
with patch("api.src.routers.openai_compatible.get_tts_service") as mock_get:
service = AsyncMock(spec=TTSService)
- service.generate_audio.return_value = (np.zeros(1000), 0.1)
+ service.generate_audio.return_value = AudioChunk(np.zeros(1000,np.int16))
- async def mock_stream(*args, **kwargs) -> AsyncGenerator[bytes, None]:
- yield mock_audio_bytes
+ async def mock_stream(*args, **kwargs) -> AsyncGenerator[AudioChunk, None]:
+        yield AudioChunk(np.array([], np.int16), output=mock_audio_bytes)
service.generate_audio_stream = mock_stream
service.list_voices.return_value = ["test_voice", "voice1", "voice2"]
@@ -256,8 +257,8 @@ def test_openai_speech_endpoint(
):
"""Test the OpenAI-compatible speech endpoint with basic MP3 generation"""
# Configure mocks
- mock_tts_service.generate_audio.return_value = (np.zeros(1000), 0.1)
- mock_convert.return_value = mock_audio_bytes
+ mock_tts_service.generate_audio.return_value = AudioChunk(np.zeros(1000,np.int16))
+ mock_convert.return_value = AudioChunk(np.zeros(1000,np.int16),output=mock_audio_bytes)
response = client.post(
"/v1/audio/speech",
@@ -272,10 +273,10 @@ def test_openai_speech_endpoint(
assert response.status_code == 200
assert response.headers["content-type"] == "audio/mpeg"
assert len(response.content) > 0
- assert response.content == mock_audio_bytes
+ assert response.content == mock_audio_bytes + mock_audio_bytes
mock_tts_service.generate_audio.assert_called_once()
- mock_convert.assert_called_once()
+ assert mock_convert.call_count == 2
def test_openai_speech_streaming(mock_tts_service, test_voice, mock_audio_bytes):
diff --git a/docker/scripts/entrypoint.sh b/docker/scripts/entrypoint.sh
index ca72910..a578495 100644
--- a/docker/scripts/entrypoint.sh
+++ b/docker/scripts/entrypoint.sh
@@ -5,4 +5,4 @@ if [ "$DOWNLOAD_MODEL" = "true" ]; then
python download_model.py --output api/src/models/v1_0
fi
-exec uv run --extra $DEVICE python -m uvicorn api.src.main:app --host 0.0.0.0 --port 8880 --log-level debug
\ No newline at end of file
+exec uv run --extra $DEVICE --no-sync python -m uvicorn api.src.main:app --host 0.0.0.0 --port 8880 --log-level debug
\ No newline at end of file
diff --git a/examples/assorted_checks/benchmarks/benchmark_first_token.py b/examples/assorted_checks/benchmarks/benchmark_first_token.py
index a9e47bb..16f94c0 100644
--- a/examples/assorted_checks/benchmarks/benchmark_first_token.py
+++ b/examples/assorted_checks/benchmarks/benchmark_first_token.py
@@ -34,7 +34,7 @@ def measure_first_token(
json={
"model": "kokoro",
"input": text,
- "voice": "af",
+ "voice": "af_heart",
"response_format": "wav",
"stream": False,
},
diff --git a/examples/assorted_checks/benchmarks/benchmark_first_token_stream_unified.py b/examples/assorted_checks/benchmarks/benchmark_first_token_stream_unified.py
index df1855c..c3b6d92 100644
--- a/examples/assorted_checks/benchmarks/benchmark_first_token_stream_unified.py
+++ b/examples/assorted_checks/benchmarks/benchmark_first_token_stream_unified.py
@@ -34,7 +34,7 @@ def measure_first_token_requests(
json={
"model": "kokoro",
"input": text,
- "voice": "af",
+ "voice": "af_heart",
"response_format": "pcm",
"stream": True,
},
@@ -123,7 +123,7 @@ def measure_first_token_openai(
# Make streaming request using OpenAI client
with OPENAI_CLIENT.audio.speech.with_streaming_response.create(
model="kokoro",
- voice="af",
+ voice="af_heart",
response_format="pcm",
input=text,
) as response:
diff --git a/examples/assorted_checks/benchmarks/lib/shared_benchmark_utils.py b/examples/assorted_checks/benchmarks/lib/shared_benchmark_utils.py
index f44f7eb..4e9657e 100644
--- a/examples/assorted_checks/benchmarks/lib/shared_benchmark_utils.py
+++ b/examples/assorted_checks/benchmarks/lib/shared_benchmark_utils.py
@@ -56,7 +56,7 @@ def make_tts_request(
json={
"model": "kokoro",
"input": text,
- "voice": "af",
+ "voice": "af_heart",
"response_format": "wav",
"stream": True,
},
@@ -77,7 +77,7 @@ def make_tts_request(
json={
"model": "kokoro",
"input": text,
- "voice": "af",
+ "voice": "af_heart",
"response_format": "wav",
"stream": False,
},
diff --git a/examples/assorted_checks/test_formats/test_audio_formats.py b/examples/assorted_checks/test_formats/test_audio_formats.py
index 68156b6..efd527a 100644
--- a/examples/assorted_checks/test_formats/test_audio_formats.py
+++ b/examples/assorted_checks/test_formats/test_audio_formats.py
@@ -246,7 +246,7 @@ def main():
output_dir.mkdir(exist_ok=True, parents=True)
# First generate audio in each format using the API
- voice = "af" # Using default voice
+ voice = "af_heart" # Using default voice
formats = ["wav", "mp3", "opus", "flac", "pcm"]
stats = []
diff --git a/examples/assorted_checks/test_openai/test_openai_tts.py b/examples/assorted_checks/test_openai/test_openai_tts.py
index 80e3602..01124af 100644
--- a/examples/assorted_checks/test_openai/test_openai_tts.py
+++ b/examples/assorted_checks/test_openai/test_openai_tts.py
@@ -23,7 +23,7 @@ def test_format(
try:
response = client.audio.speech.create(
- model="tts-1", voice="af", input=text, response_format=format
+ model="tts-1", voice="af_heart", input=text, response_format=format
)
print("Got response, saving to file...")
@@ -43,7 +43,7 @@ def test_speed(speed: float):
try:
response = client.audio.speech.create(
model="tts-1",
- voice="af",
+ voice="af_heart",
input="The quick brown fox jumped over the lazy dog.",
response_format="wav",
speed=speed,
diff --git a/examples/captioned_speech_example.py b/examples/captioned_speech_example.py
index 2433134..6a622b1 100644
--- a/examples/captioned_speech_example.py
+++ b/examples/captioned_speech_example.py
@@ -2,6 +2,7 @@ import json
from typing import Tuple, Optional, Dict, List
from pathlib import Path
+import base64
import requests
# Get the directory this script is in
@@ -9,9 +10,9 @@ SCRIPT_DIR = Path(__file__).absolute().parent
def generate_captioned_speech(
text: str,
- voice: str = "af_bella",
+ voice: str = "af_heart",
speed: float = 1.0,
- response_format: str = "wav"
+ response_format: str = "mp3"
) -> Tuple[Optional[bytes], Optional[List[Dict]]]:
"""Generate audio with word-level timestamps."""
response = requests.post(
@@ -21,40 +22,31 @@ def generate_captioned_speech(
"input": text,
"voice": voice,
"speed": speed,
- "response_format": response_format
+ "response_format": response_format,
+ "stream": False
}
)
print(f"Response status: {response.status_code}")
- print(f"Response headers: {dict(response.headers)}")
if response.status_code != 200:
print(f"Error response: {response.text}")
return None, None
try:
- # Get timestamps path from header
- timestamps_filename = response.headers.get('X-Timestamps-Path')
- if not timestamps_filename:
- print("Error: No timestamps path in response headers")
- return None, None
-
- # Get timestamps from the path
- timestamps_response = requests.get(f"http://localhost:8880/dev/timestamps/{timestamps_filename}")
- if timestamps_response.status_code != 200:
- print(f"Error getting timestamps: {timestamps_response.text}")
- return None, None
-
- word_timestamps = timestamps_response.json()
+ audio_json=json.loads(response.content)
+
+        # Decode base64 audio to bytes
+ chunk_audio=base64.b64decode(audio_json["audio"].encode("utf-8"))
- # Get audio bytes from content
- audio_bytes = response.content
-
- if not audio_bytes:
+ # Print word level timestamps
+ print(audio_json["timestamps"])
+
+ if not chunk_audio:
print("Error: Empty audio content")
return None, None
- return audio_bytes, word_timestamps
+ return chunk_audio, audio_json["timestamps"]
except json.JSONDecodeError as e:
print(f"Error parsing timestamps: {e}")
return None, None
diff --git a/pyproject.toml b/pyproject.toml
index acb5004..d0ff675 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -37,6 +37,7 @@ dependencies = [
"en-core-web-sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl",
"inflect>=7.5.0",
"phonemizer-fork>=3.3.2",
+ "av>=14.1.0",
]
[project.optional-dependencies]
diff --git a/start-cpu.sh b/start-cpu.sh
index 1af531d..651f645 100755
--- a/start-cpu.sh
+++ b/start-cpu.sh
@@ -14,4 +14,4 @@ export WEB_PLAYER_PATH=$PROJECT_ROOT/web
# Run FastAPI with CPU extras using uv run
# Note: espeak may still require manual installation,
uv pip install -e ".[cpu]"
-uv run uvicorn api.src.main:app --host 0.0.0.0 --port 8880
+uv run --no-sync uvicorn api.src.main:app --host 0.0.0.0 --port 8880
diff --git a/start-gpu.sh b/start-gpu.sh
index d932df4..b079978 100755
--- a/start-gpu.sh
+++ b/start-gpu.sh
@@ -13,4 +13,4 @@ export WEB_PLAYER_PATH=$PROJECT_ROOT/web
# Run FastAPI with GPU extras using uv run
uv pip install -e ".[gpu]"
-uv run uvicorn api.src.main:app --host 0.0.0.0 --port 8880
+uv run --no-sync uvicorn api.src.main:app --host 0.0.0.0 --port 8880
diff --git a/ui/depr_tests/conftest.py b/ui/depr_tests/conftest.py
index 9b01e6b..f0c2a2e 100644
--- a/ui/depr_tests/conftest.py
+++ b/ui/depr_tests/conftest.py
@@ -16,7 +16,7 @@ async def mock_model_manager():
async def mock_voice_manager():
"""Mock voice manager for UI tests"""
manager = AsyncMock()
- manager.list_voices = AsyncMock(return_value=["af", "bm_lewis", "af_sarah"])
+ manager.list_voices = AsyncMock(return_value=["af_heart", "bm_lewis", "af_sarah"])
return manager