Converted the stream writer to use pyav

This commit is contained in:
Fireblade 2025-02-19 23:10:51 -05:00
parent 4ee4d36822
commit c5a3e13670
10 changed files with 197 additions and 221 deletions

View file

@ -243,7 +243,7 @@ response = requests.post(
- wav
- opus
- flac
- aac
- m4a
- pcm
<p align="center">

88
Test copy.py Normal file
View file

@ -0,0 +1,88 @@
import requests
import base64
import json
import pydub
text="""Delving into the Abyss: A Deeper Exploration of Meaning in 5 Seconds of Summer's "Jet Black Heart"
5 Seconds of Summer, initially perceived as purveyors of upbeat, radio-friendly pop-punk, embarked on a significant artistic evolution with their album Sounds Good Feels Good. Among its tracks, "Jet Black Heart" stands out as a powerful testament to this shift, moving beyond catchy melodies and embracing a darker, more emotionally complex sound. Released in 2015, the song transcends the typical themes of youthful exuberance and romantic angst, instead plunging into the depths of personal turmoil and the corrosive effects of inner darkness on interpersonal relationships. "Jet Black Heart" is not merely a song about heartbreak; it is a raw and vulnerable exploration of internal struggle, self-destructive patterns, and the precarious flicker of hope that persists even in the face of profound emotional chaos. Through potent metaphors, starkly honest lyrics, and a sonic landscape that mirrors its thematic weight, the song offers a profound meditation on the human condition, grappling with the shadows that reside within us all and their far-reaching consequences.
The very title, "Jet Black Heart," immediately establishes the song's central motif: an intrinsic darkness residing within the narrator's emotional core. The phrase "jet black" is not simply a descriptor of color; it evokes a sense of absolute darkness, a void devoid of light, and a profound absence of hope. This is not a heart merely bruised by external circumstances, but one fundamentally shaded by internal struggles, suggesting a chronic condition of emotional pain. The opening lines, "Everybody's got their demons, even wide awake or dreaming," acknowledge the universality of inner conflict, a shared human experience of battling internal anxieties and insecurities. However, the designation of a "jet black heart" elevates this struggle to a more profound and potentially entrenched level. It suggests a darkness that is not fleeting or situational, but rather a deeply ingrained aspect of the narrator's being, casting a long shadow over their life and relationships. This internal darkness is further amplified by the subsequent metaphor, "there's a hurricane underneath it." The imagery of a hurricane is intensely evocative, conjuring images of destructive force, uncontrollable chaos, and overwhelming power. This "hurricane" represents the tumultuous emotions and internal disorder raging beneath the surface of the narrators composed exterior. It is a maelstrom of pain, anxiety, and self-doubt that threatens to erupt and engulf everything in its path. Crucially, this internal hurricane is not merely passive suffering; it is actively "trying to keep us apart," revealing the insidious way in which these inner demons sabotage connections and erect formidable barriers to genuine intimacy and meaningful relationships.
Expanding on this internal struggle, "Jet Black Heart" delves into the narrator's self-destructive patterns, particularly within the realm of romantic relationships. The lyrics "See a war, I wanna fight it, See a match, I wanna strike it" paint a stark picture of a deeply ingrained tendency towards conflict and destruction. This is not simply a reactive response to external aggression, but rather an active seeking out of discord, a subconscious drive to ignite conflict even in peaceful situations. This behavior can be interpreted as a manifestation of their inner turmoil, a projection of their internal chaos onto their external world. Perhaps the narrator, accustomed to internal strife, unconsciously recreates this turbulence in their relationships, finding a perverse sense of familiarity or even control within the chaos. This destructive impulse is further emphasized by the line "Every fire I've ignited faded to gray." The imagery of fire, initially representing passion, intensity, or perhaps even anger, ultimately devolving into "gray" underscores a recurring cycle of destructive behavior that culminates in emptiness and disappointment. The color gray, often associated with neutrality, lifelessness, and a lack of vibrancy, perfectly encapsulates the emotional aftermath of these self-inflicted relational fires. The initial spark of connection or excitement is inevitably extinguished, leaving behind a landscape of emotional flatness and a profound sense of failure in sustaining meaningful bonds. Further solidifying this theme of self-sabotage is the powerful phrase "I write with a poison pen." This metaphor extends beyond mere hurtful words, encompassing actions, behaviors, and the narrator's overall negative influence on their relationships. 
The "poison pen" suggests a deliberate, albeit perhaps unconscious, act of inflicting harm, highlighting the narrator's painful awareness of their own damaging tendencies and their capacity to erode the very connections they seemingly desire.
However, amidst this pervasive darkness and self-destructive cycle, "Jet Black Heart" subtly introduces a fragile glimmer of hope, a faint light flickering in the abyss. The pivotal moment of vulnerability and potential transformation arrives with the plaintive plea, "But now that I'm broken, now that you're knowing, caught up in a moment, can you see inside?" This is a desperate and profoundly vulnerable call for understanding, a raw and unfiltered exposure of the "jet black heart" after reaching a critical breaking point. The narrator, stripped bare by the weight of their own struggles and the consequences of their self-destructive behavior, finally seeks empathy and genuine connection. The admission of being "broken" is not a declaration of defeat, but rather a necessary precursor to potential healing. It is in this state of vulnerability, in the raw aftermath of emotional collapse, that the narrator dares to ask, "Can you see inside?" This question is laden with yearning, a desperate hope that someone, perhaps a partner in the strained relationship, can perceive beyond the surface darkness and recognize the wounded humanity beneath the "jet black heart." It is a plea for acceptance, not despite the darkness, but perhaps even because of it, a hope that vulnerability will be met not with judgment or rejection, but with compassion and understanding. Despite the acknowledgement of their "poison pen" and destructive tendencies, the narrator also recognizes a paradoxical source of potential redemption within the very relationship that is strained by their inner darkness: "these chemicals moving between us are the reason to start again." The ambiguous term "chemicals" can be interpreted on multiple levels. It could symbolize the complex and often volatile dynamics of human connection, the unpredictable and sometimes turbulent interplay of emotions and personalities in a relationship. 
Alternatively, "chemicals" might allude to a more literal, perhaps even neurochemical, imbalance within the narrator, suggesting that the very forces driving their darkness might also hold the key to transformation. Crucially, the phrase "reason to start again" emphasizes the potential for renewal and redemption, not a guaranteed outcome. It is a tentative step towards hope, acknowledging that the path forward will be fraught with challenges, but that the possibility of healing and rebuilding remains, however fragile.
The concluding verses of "Jet Black Heart" further solidify this nascent theme of potential transformation and tentative redemption. "The blood in my veins is made up of mistakes" is a powerful and profoundly honest admission of past errors and a crucial acceptance of human imperfection. This acknowledgement of fallibility is essential for personal growth and relational healing. By owning their mistakes, the narrator begins to dismantle the cycle of self-blame and self-destruction, paving the way for a more compassionate and forgiving self-perception. The subsequent lines, "let's forget who we are and dive into the dark, as we burst into color, returning to life," present a radical and transformative vision of shared vulnerability and mutual healing. The call to "forget who we are" is not an invitation to erase individual identity, but rather a suggestion to shed the constructed personas, ego-driven defenses, and pre-conceived notions that often hinder genuine connection. It is about stripping away the masks and embracing a state of raw, unfiltered vulnerability. The imperative to "dive into the dark" is perhaps the most challenging and transformative element of the song. It is a call to confront the pain, to face the demons, and to embrace the shared vulnerability that lies at the heart of genuine intimacy. This shared descent into darkness is not an act of succumbing to despair, but rather a courageous journey towards healing, suggesting that true connection and growth can only emerge from acknowledging and confronting the deepest, most painful aspects of ourselves and each other. The subsequent image of "bursting into color, returning to life" provides a powerful counterpoint to the prevailing darkness, symbolizing transformation, healing, and a vibrant renewal of life and connection. "Bursting into color" evokes a sense of vibrancy, joy, and emotional richness that stands in stark contrast to the "jet black" and "gray" imagery prevalent throughout the song. 
This suggests that by confronting and embracing the darkness, there is a possibility of emerging transformed, experiencing a rebirth and a renewed sense of purpose and joy in life. "Returning to life" further reinforces this idea of resurrection and revitalization, implying that the journey through darkness is not an end in itself, but rather a necessary passage towards a fuller, more authentic, and more vibrant existence.
Beyond the lyrical content, the musical elements of "Jet Black Heart" contribute significantly to its overall meaning and emotional impact. Compared to 5 Seconds of Summer's earlier, more upbeat work, "Jet Black Heart" adopts a heavier, more brooding sonic landscape. The driving rhythm, the prominent bassline, and the raw, emotive vocal delivery all mirror the thematic weight of the lyrics, creating an atmosphere of intense emotionality and vulnerability. The song's structure, building from a quiet, introspective beginning to a powerful, anthemic chorus, reflects the narrator's journey from internal struggle to a desperate plea for connection and ultimately a tentative hope for transformation.
In conclusion, "Jet Black Heart" by 5 Seconds of Summer is far more than a typical pop song; it is a poignant and deeply resonant exploration of inner darkness, self-destructive tendencies, and the fragile yet persistent hope for human connection and redemption. Through its powerful central metaphor of the "jet black heart," its unflinching portrayal of internal turmoil, and its subtle yet potent message of vulnerability and potential transformation, the song resonates with anyone who has grappled with their own inner demons and the complexities of human relationships. It is a reminder that even in the deepest darkness, a flicker of hope can endure, and that true healing and connection often emerge from the courageous act of confronting and sharing our most vulnerable selves. "Jet Black Heart" stands as a testament to 5 Seconds of Summer's artistic growth, showcasing their capacity to delve into profound emotional territories and create music that is not only catchy and engaging but also deeply meaningful and emotionally resonant, solidifying their position as a band capable of capturing the complexities of the human experience."""
"""Delving into the Abyss: A Deeper Exploration of Meaning in 5 Seconds of Summer's "Jet Black Heart"
5 Seconds of Summer, initially perceived as purveyors of upbeat, radio-friendly pop-punk, embarked on a significant artistic evolution with their album Sounds Good Feels Good. Among its tracks, "Jet Black Heart" stands out as a powerful testament to this shift, moving beyond catchy melodies and embracing a darker, more emotionally complex sound. Released in 2015, the song transcends the typical themes of youthful exuberance and romantic angst, instead plunging into the depths of personal turmoil and the corrosive effects of inner darkness on interpersonal relationships. "Jet Black Heart" is not merely a song about heartbreak; it is a raw and vulnerable exploration of internal struggle, self-destructive patterns, and the precarious flicker of hope that persists even in the face of profound emotional chaos."""
# Audio format requested from the server; also used as the extension of
# every file this script writes.
Type = "mp3"

# Ask the captioned-speech endpoint for a streamed response. Each non-empty
# line of the body is parsed below as one JSON message with "audio"
# (base64) and "timestamps" entries.
response = requests.post(
    "http://localhost:8880/dev/captioned_speech",
    json={
        "model": "kokoro",
        "input": text,
        "voice": "af_heart+af_sky",
        "speed": 1.0,
        "response_format": Type,
        "stream": True,
    },
    stream=True,
)
# Stop on an HTTP error instead of trying to parse an error body as audio.
response.raise_for_status()

# Most recent streamed message that carried a non-empty timestamp list.
chunk_json = None

# The context manager guarantees the file is flushed and closed before
# pydub re-reads it from disk further down.
with open(f"outputstream.{Type}", "wb") as f:
    for chunk in response.iter_lines(decode_unicode=True):
        if not chunk:
            continue
        temp_json = json.loads(chunk)
        if temp_json["timestamps"]:
            chunk_json = temp_json

        # Decode the base64 payload to bytes and append it to the file.
        chunk_audio = base64.b64decode(temp_json["audio"].encode("utf-8"))
        f.write(chunk_audio)

# The cut below needs the third-to-last word of the last timestamped
# message; report clearly when the stream did not provide one.
if chunk_json is None or len(chunk_json["timestamps"]) < 3:
    raise SystemExit(
        "Fewer than three word timestamps were received; nothing to cut."
    )

last3 = chunk_json["timestamps"][-3]
print(f"CUTTING TO {last3['word']}")

# pydub slices by milliseconds.
# NOTE(review): the * 1000 assumes start_time/end_time are seconds measured
# from the start of the whole stream - confirm against the server.
audioseg = pydub.AudioSegment.from_file(f"outputstream.{Type}", format=Type)
audioseg = audioseg[last3["start_time"] * 1000:last3["end_time"] * 1000]
audioseg.export(f"outputstreamcut.{Type}", format=Type)
"""
response = requests.post(
"http://localhost:8880/dev/captioned_speech",
json={
"model": "kokoro",
"input": text,
"voice": "af_heart+af_sky",
"speed": 1.0,
"response_format": Type,
"stream": False,
},
stream=True
)
with open(f"outputnostream.{Type}", "wb") as f:
audio_json=json.loads(response.content)
# Decode base 64 stream to bytes
chunk_audio=base64.b64decode(audio_json["audio"].encode("utf-8"))
# Process streaming chunks
f.write(chunk_audio)
# Print word level timestamps
print(audio_json["timestamps"])
"""

85
Test.py
View file

@ -1,22 +1,63 @@
import requests
# Fetch the list of voices the server reports.
response = requests.get("http://localhost:8880/v1/audio/voices")
response.raise_for_status()
voices = response.json()["voices"]

# Generate audio
response = requests.post(
    "http://localhost:8880/v1/audio/speech",
    json={
        "model": "kokoro",
        "input": "http://localhost:8880/web/",
        "voice": "af_heart",
        "response_format": "mp3",  # Supported: mp3, wav, opus, flac
        "speed": 1.0,
        "stream": False,
    },
)
# Fail on an HTTP error so an error body is never saved as output.mp3.
response.raise_for_status()

# Save audio
with open("output.mp3", "wb") as f:
    f.write(response.content)
import requests
import base64
import json
text="""Delving into the Abyss: A Deeper Exploration of Meaning in 5 Seconds of Summer's "Jet Black Heart"
5 Seconds of Summer, initially perceived as purveyors of upbeat, radio-friendly pop-punk, embarked on a significant artistic evolution with their album Sounds Good Feels Good. Among its tracks, "Jet Black Heart" stands out as a powerful testament to this shift, moving beyond catchy melodies and embracing a darker, more emotionally complex sound. Released in 2015, the song transcends the typical themes of youthful exuberance and romantic angst, instead plunging into the depths of personal turmoil and the corrosive effects of inner darkness on interpersonal relationships. "Jet Black Heart" is not merely a song about heartbreak; it is a raw and vulnerable exploration of internal struggle, self-destructive patterns, and the precarious flicker of hope that persists even in the face of profound emotional chaos. Through potent metaphors, starkly honest lyrics, and a sonic landscape that mirrors its thematic weight, the song offers a profound meditation on the human condition, grappling with the shadows that reside within us all and their far-reaching consequences.
The very title, "Jet Black Heart," immediately establishes the song's central motif: an intrinsic darkness residing within the narrator's emotional core. The phrase "jet black" is not simply a descriptor of color; it evokes a sense of absolute darkness, a void devoid of light, and a profound absence of hope. This is not a heart merely bruised by external circumstances, but one fundamentally shaded by internal struggles, suggesting a chronic condition of emotional pain. The opening lines, "Everybody's got their demons, even wide awake or dreaming," acknowledge the universality of inner conflict, a shared human experience of battling internal anxieties and insecurities. However, the designation of a "jet black heart" elevates this struggle to a more profound and potentially entrenched level. It suggests a darkness that is not fleeting or situational, but rather a deeply ingrained aspect of the narrator's being, casting a long shadow over their life and relationships. This internal darkness is further amplified by the subsequent metaphor, "there's a hurricane underneath it." The imagery of a hurricane is intensely evocative, conjuring images of destructive force, uncontrollable chaos, and overwhelming power. This "hurricane" represents the tumultuous emotions and internal disorder raging beneath the surface of the narrators composed exterior. It is a maelstrom of pain, anxiety, and self-doubt that threatens to erupt and engulf everything in its path. Crucially, this internal hurricane is not merely passive suffering; it is actively "trying to keep us apart," revealing the insidious way in which these inner demons sabotage connections and erect formidable barriers to genuine intimacy and meaningful relationships.
Expanding on this internal struggle, "Jet Black Heart" delves into the narrator's self-destructive patterns, particularly within the realm of romantic relationships. The lyrics "See a war, I wanna fight it, See a match, I wanna strike it" paint a stark picture of a deeply ingrained tendency towards conflict and destruction. This is not simply a reactive response to external aggression, but rather an active seeking out of discord, a subconscious drive to ignite conflict even in peaceful situations. This behavior can be interpreted as a manifestation of their inner turmoil, a projection of their internal chaos onto their external world. Perhaps the narrator, accustomed to internal strife, unconsciously recreates this turbulence in their relationships, finding a perverse sense of familiarity or even control within the chaos. This destructive impulse is further emphasized by the line "Every fire I've ignited faded to gray." The imagery of fire, initially representing passion, intensity, or perhaps even anger, ultimately devolving into "gray" underscores a recurring cycle of destructive behavior that culminates in emptiness and disappointment. The color gray, often associated with neutrality, lifelessness, and a lack of vibrancy, perfectly encapsulates the emotional aftermath of these self-inflicted relational fires. The initial spark of connection or excitement is inevitably extinguished, leaving behind a landscape of emotional flatness and a profound sense of failure in sustaining meaningful bonds. Further solidifying this theme of self-sabotage is the powerful phrase "I write with a poison pen." This metaphor extends beyond mere hurtful words, encompassing actions, behaviors, and the narrator's overall negative influence on their relationships. 
The "poison pen" suggests a deliberate, albeit perhaps unconscious, act of inflicting harm, highlighting the narrator's painful awareness of their own damaging tendencies and their capacity to erode the very connections they seemingly desire.
However, amidst this pervasive darkness and self-destructive cycle, "Jet Black Heart" subtly introduces a fragile glimmer of hope, a faint light flickering in the abyss. The pivotal moment of vulnerability and potential transformation arrives with the plaintive plea, "But now that I'm broken, now that you're knowing, caught up in a moment, can you see inside?" This is a desperate and profoundly vulnerable call for understanding, a raw and unfiltered exposure of the "jet black heart" after reaching a critical breaking point. The narrator, stripped bare by the weight of their own struggles and the consequences of their self-destructive behavior, finally seeks empathy and genuine connection. The admission of being "broken" is not a declaration of defeat, but rather a necessary precursor to potential healing. It is in this state of vulnerability, in the raw aftermath of emotional collapse, that the narrator dares to ask, "Can you see inside?" This question is laden with yearning, a desperate hope that someone, perhaps a partner in the strained relationship, can perceive beyond the surface darkness and recognize the wounded humanity beneath the "jet black heart." It is a plea for acceptance, not despite the darkness, but perhaps even because of it, a hope that vulnerability will be met not with judgment or rejection, but with compassion and understanding. Despite the acknowledgement of their "poison pen" and destructive tendencies, the narrator also recognizes a paradoxical source of potential redemption within the very relationship that is strained by their inner darkness: "these chemicals moving between us are the reason to start again." The ambiguous term "chemicals" can be interpreted on multiple levels. It could symbolize the complex and often volatile dynamics of human connection, the unpredictable and sometimes turbulent interplay of emotions and personalities in a relationship. 
Alternatively, "chemicals" might allude to a more literal, perhaps even neurochemical, imbalance within the narrator, suggesting that the very forces driving their darkness might also hold the key to transformation. Crucially, the phrase "reason to start again" emphasizes the potential for renewal and redemption, not a guaranteed outcome. It is a tentative step towards hope, acknowledging that the path forward will be fraught with challenges, but that the possibility of healing and rebuilding remains, however fragile.
The concluding verses of "Jet Black Heart" further solidify this nascent theme of potential transformation and tentative redemption. "The blood in my veins is made up of mistakes" is a powerful and profoundly honest admission of past errors and a crucial acceptance of human imperfection. This acknowledgement of fallibility is essential for personal growth and relational healing. By owning their mistakes, the narrator begins to dismantle the cycle of self-blame and self-destruction, paving the way for a more compassionate and forgiving self-perception. The subsequent lines, "let's forget who we are and dive into the dark, as we burst into color, returning to life," present a radical and transformative vision of shared vulnerability and mutual healing. The call to "forget who we are" is not an invitation to erase individual identity, but rather a suggestion to shed the constructed personas, ego-driven defenses, and pre-conceived notions that often hinder genuine connection. It is about stripping away the masks and embracing a state of raw, unfiltered vulnerability. The imperative to "dive into the dark" is perhaps the most challenging and transformative element of the song. It is a call to confront the pain, to face the demons, and to embrace the shared vulnerability that lies at the heart of genuine intimacy. This shared descent into darkness is not an act of succumbing to despair, but rather a courageous journey towards healing, suggesting that true connection and growth can only emerge from acknowledging and confronting the deepest, most painful aspects of ourselves and each other. The subsequent image of "bursting into color, returning to life" provides a powerful counterpoint to the prevailing darkness, symbolizing transformation, healing, and a vibrant renewal of life and connection. "Bursting into color" evokes a sense of vibrancy, joy, and emotional richness that stands in stark contrast to the "jet black" and "gray" imagery prevalent throughout the song. 
This suggests that by confronting and embracing the darkness, there is a possibility of emerging transformed, experiencing a rebirth and a renewed sense of purpose and joy in life. "Returning to life" further reinforces this idea of resurrection and revitalization, implying that the journey through darkness is not an end in itself, but rather a necessary passage towards a fuller, more authentic, and more vibrant existence.
Beyond the lyrical content, the musical elements of "Jet Black Heart" contribute significantly to its overall meaning and emotional impact. Compared to 5 Seconds of Summer's earlier, more upbeat work, "Jet Black Heart" adopts a heavier, more brooding sonic landscape. The driving rhythm, the prominent bassline, and the raw, emotive vocal delivery all mirror the thematic weight of the lyrics, creating an atmosphere of intense emotionality and vulnerability. The song's structure, building from a quiet, introspective beginning to a powerful, anthemic chorus, reflects the narrator's journey from internal struggle to a desperate plea for connection and ultimately a tentative hope for transformation.
In conclusion, "Jet Black Heart" by 5 Seconds of Summer is far more than a typical pop song; it is a poignant and deeply resonant exploration of inner darkness, self-destructive tendencies, and the fragile yet persistent hope for human connection and redemption. Through its powerful central metaphor of the "jet black heart," its unflinching portrayal of internal turmoil, and its subtle yet potent message of vulnerability and potential transformation, the song resonates with anyone who has grappled with their own inner demons and the complexities of human relationships. It is a reminder that even in the deepest darkness, a flicker of hope can endure, and that true healing and connection often emerge from the courageous act of confronting and sharing our most vulnerable selves. "Jet Black Heart" stands as a testament to 5 Seconds of Summer's artistic growth, showcasing their capacity to delve into profound emotional territories and create music that is not only catchy and engaging but also deeply meaningful and emotionally resonant, solidifying their position as a band capable of capturing the complexities of the human experience."""
text="""Delving into the Abyss: A Deeper Exploration of Meaning in 5 Seconds of Summer's "Jet Black Heart"
5 Seconds of Summer, initially perceived as purveyors of upbeat, radio-friendly pop-punk, embarked on a significant artistic evolution with their album Sounds Good Feels Good. Among its tracks, "Jet Black Heart" stands out as a powerful testament to this shift, moving beyond catchy melodies and embracing a darker, more emotionally complex sound. Released in 2015, the song transcends the typical themes of youthful exuberance and romantic angst, instead plunging into the depths of personal turmoil and the corrosive effects of inner darkness on interpersonal relationships. "Jet Black Heart" is not merely a song about heartbreak; it is a raw and vulnerable exploration of internal struggle, self-destructive patterns, and the precarious flicker of hope that persists even in the face of profound emotional chaos."""
# Audio format requested from the server; also used as the extension of
# both output files.
Type = "aac"

# --- Streaming request: write the body to disk as it arrives ---
response = requests.post(
    "http://localhost:8880/v1/audio/speech",
    json={
        "model": "kokoro",
        "input": text,
        "voice": "af_heart+af_sky",
        "speed": 1.0,
        "response_format": Type,
        "stream": True,
    },
    stream=True,
)
# Fail on an HTTP error instead of saving an error body as audio.
response.raise_for_status()

# The context manager closes the file even if the transfer fails part-way.
with open(f"outputstream.{Type}", "wb") as f:
    # chunk_size=None yields data in whatever sizes it is received, instead
    # of the default of one byte per iteration.
    for chunk in response.iter_content(chunk_size=None):
        if chunk:
            f.write(chunk)

# --- Non-streaming request: same input, whole body saved in one write ---
response = requests.post(
    "http://localhost:8880/v1/audio/speech",
    json={
        "model": "kokoro",
        "input": text,
        "voice": "af_heart+af_sky",
        "speed": 1.0,
        "response_format": Type,
        "stream": False,
    },
    stream=True,
)
response.raise_for_status()

with open(f"outputnostream.{Type}", "wb") as f:
    f.write(response.content)

View file

@ -175,7 +175,7 @@ async def create_captioned_speech(
content_type = {
"mp3": "audio/mpeg",
"opus": "audio/opus",
"aac": "audio/aac",
"m4a": "audio/mp4",
"flac": "audio/flac",
"wav": "audio/wav",
"pcm": "audio/pcm",

View file

@ -89,7 +89,7 @@ class AudioService:
"""Service for audio format conversions with streaming support"""
# Supported formats
SUPPORTED_FORMATS = {"wav", "mp3", "opus", "flac", "aac", "pcm", "ogg"}
SUPPORTED_FORMATS = {"wav", "mp3", "opus", "flac", "aac", "pcm"}
# Default audio format settings balanced for speed and compression
DEFAULT_SETTINGS = {
@ -158,6 +158,7 @@ class AudioService:
AudioService._writers[writer_key] = StreamingAudioWriter(
output_format, sample_rate
)
writer = AudioService._writers[writer_key]
# Write audio data first

View file

@ -8,7 +8,7 @@ import numpy as np
import soundfile as sf
from loguru import logger
from pydub import AudioSegment
import av
class StreamingAudioWriter:
"""Handles streaming audio format conversions"""
@ -18,60 +18,19 @@ class StreamingAudioWriter:
self.sample_rate = sample_rate
self.channels = channels
self.bytes_written = 0
self.buffer = BytesIO()
self.pts=0
codec_map = {"wav":"pcm_s16le","mp3":"mp3","opus":"libopus","flac":"flac", "aac":"aac"}
# Format-specific setup
if self.format == "wav":
self._write_wav_header_initial()
elif self.format in ["ogg", "opus"]:
# For OGG/Opus, write to memory buffer
self.writer = sf.SoundFile(
file=self.buffer,
mode="w",
samplerate=sample_rate,
channels=channels,
format="OGG",
subtype="VORBIS" if self.format == "ogg" else "OPUS",
)
elif self.format == "flac":
# For FLAC, write to memory buffer
self.writer = sf.SoundFile(
file=self.buffer,
mode="w",
samplerate=sample_rate,
channels=channels,
format="FLAC",
)
elif self.format in ["mp3", "aac"]:
# For MP3/AAC, we'll use pydub's incremental writer
self.segments = [] # Store segments until we have enough data
self.total_duration = 0 # Track total duration in milliseconds
# Initialize an empty AudioSegment as our encoder
self.encoder = AudioSegment.silent(duration=0, frame_rate=self.sample_rate)
elif self.format == "pcm":
# PCM doesn't need initialization, we'll write raw bytes
pass
if self.format in ["wav", "opus","flac","mp3","aac","pcm"]:
if self.format != "pcm":
self.output_buffer = BytesIO()
self.container = av.open(self.output_buffer, mode="w", format=self.format)
#print(av.codecs_available)
self.stream = self.container.add_stream(codec_map[self.format], rate=self.sample_rate,sample_rate=self.sample_rate,layout='mono' if self.channels == 1 else 'stereo')
else:
raise ValueError(f"Unsupported format: {format}")
def _write_wav_header_initial(self) -> None:
"""Write initial WAV header with placeholders"""
self.buffer.write(b"RIFF")
self.buffer.write(struct.pack("<L", 0)) # Placeholder for file size
self.buffer.write(b"WAVE")
self.buffer.write(b"fmt ")
self.buffer.write(struct.pack("<L", 16)) # fmt chunk size
self.buffer.write(struct.pack("<H", 1)) # PCM format
self.buffer.write(struct.pack("<H", self.channels))
self.buffer.write(struct.pack("<L", self.sample_rate))
self.buffer.write(
struct.pack("<L", self.sample_rate * self.channels * 2)
) # Byte rate
self.buffer.write(struct.pack("<H", self.channels * 2)) # Block align
self.buffer.write(struct.pack("<H", 16)) # Bits per sample
self.buffer.write(b"data")
self.buffer.write(struct.pack("<L", 0)) # Placeholder for data size
def write_chunk(
self, audio_data: Optional[np.ndarray] = None, finalize: bool = False
) -> bytes:
# Encode one chunk of audio for the configured output format and return the
# bytes produced by this call; with finalize=True, flush the encoder and
# return the closing bytes instead.
# NOTE(review): this listing contains a diff hunk header (next line) and
# repeats some branches (e.g. two consecutive "pcm" checks further down), so
# it appears to interleave pre-change and post-change lines. Confirm against
# the real file before relying on the statement order shown here.
@ -81,153 +40,37 @@ class StreamingAudioWriter:
audio_data: Audio data to write, or None if finalizing
finalize: Whether this is the final write to close the stream
"""
# Scratch buffer; in this listing it is only used by the pydub export calls
# in the mp3/aac branches.
output_buffer = BytesIO()
if finalize:
if self.format == "wav":
# Calculate actual file and data sizes
file_size = self.bytes_written + 36 # RIFF header bytes
data_size = self.bytes_written
# Seek to the beginning to overwrite the placeholders
# Offset 4 is the RIFF size field and offset 40 is the data-chunk size
# field of the 44-byte header written by _write_wav_header_initial.
self.buffer.seek(4)
self.buffer.write(struct.pack("<L", file_size))
self.buffer.seek(40)
self.buffer.write(struct.pack("<L", data_size))
self.buffer.seek(0)
return self.buffer.read()
elif self.format in ["ogg", "opus", "flac"]:
# Close the soundfile writer, then return everything held in self.buffer.
self.writer.close()
return self.buffer.getvalue()
elif self.format in ["mp3", "aac"]:
# Export only if an encoder segment exists and has non-zero length.
if hasattr(self, "encoder") and len(self.encoder) > 0:
format_args = {
"mp3": {"format": "mp3", "codec": "libmp3lame"},
"aac": {"format": "adts", "codec": "aac"},
}[self.format]
parameters = []
if self.format == "mp3":
parameters.extend(
[
"-q:a",
"0", # Highest quality
"-write_xing",
"1", # XING header for MP3
"-id3v1",
"1",
"-id3v2",
"1",
"-write_vbr",
"1",
"-vbr_quality",
"2",
]
)
elif self.format == "aac":
parameters.extend(
[
"-q:a",
"2",
"-write_xing",
"0",
"-write_id3v1",
"0",
"-write_id3v2",
"0",
]
)
self.encoder.export(
output_buffer,
**format_args,
bitrate="192k", # Optimal for 24kHz/16-bit mono source
parameters=parameters,
)
self.encoder = None
return output_buffer.getvalue()
# PyAV path: encode(None) asks the stream for its remaining packets, which
# are muxed into the container before the buffer is read.
# NOTE(review): the buffer is read before container.close(), so any bytes
# close() writes (if the container format writes any) are not included in the
# returned data - confirm this is intended.
if self.format != "pcm":
packets = self.stream.encode(None)
for packet in packets:
self.container.mux(packet)
data=self.output_buffer.getvalue()
self.container.close()
return data
# Nothing to encode for a missing or empty chunk.
if audio_data is None or len(audio_data) == 0:
return b""
if self.format == "wav":
# Write raw PCM data
# Samples are only accumulated here; the bytes come back from the finalize
# branch above.
self.buffer.write(audio_data.tobytes())
self.bytes_written += len(audio_data.tobytes())
return b""
elif self.format in ["ogg", "opus", "flac"]:
# Write to soundfile buffer
# NOTE(review): getvalue() returns the whole buffer contents, not only what
# this call added; no truncation is visible here - verify callers expect that.
self.writer.write(audio_data)
self.writer.flush()
return self.buffer.getvalue()
elif self.format in ["mp3", "aac"]:
# Convert chunk to AudioSegment and encode
segment = AudioSegment(
audio_data.tobytes(),
frame_rate=self.sample_rate,
sample_width=audio_data.dtype.itemsize,
channels=self.channels,
)
# Track total duration
self.total_duration += len(segment)
# Add segment to encoder
self.encoder += segment
# Export current state to buffer without final metadata
format_args = {
"mp3": {"format": "mp3", "codec": "libmp3lame"},
"aac": {"format": "adts", "codec": "aac"},
}[self.format]
# For chunks, export without duration metadata or XING headers
self.encoder.export(
output_buffer,
**format_args,
bitrate="192k", # Optimal for 24kHz/16-bit mono source
parameters=[
"-q:a",
"0", # Highest quality for chunks too
"-write_xing",
"0", # No XING headers for chunks
],
)
# Get the encoded data
encoded_data = output_buffer.getvalue()
# Reset encoder to prevent memory growth
self.encoder = AudioSegment.silent(duration=0, frame_rate=self.sample_rate)
return encoded_data
# NOTE(review): the next two lines test the same "pcm" condition twice; one of
# them is presumably the pre-change form of the other - confirm.
elif self.format == "pcm":
if self.format == "pcm":
# Write raw bytes
return audio_data.tobytes()
else:
# Wrap the samples as a single-row s16 frame for the PyAV encoder.
# assumes audio_data already holds int16 samples - TODO confirm
frame = av.AudioFrame.from_ndarray(audio_data.reshape(1, -1), format='s16', layout='mono' if self.channels == 1 else 'stereo')
frame.sample_rate=self.sample_rate
# NOTE(review): as listed, this return precedes the pts/encode lines below and
# would make them unreachable; it looks like a leftover pre-change line - confirm.
return b""
# The presentation timestamp advances by the sample count of each frame.
frame.pts = self.pts
self.pts += frame.samples
packets = self.stream.encode(frame)
for packet in packets:
self.container.mux(packet)
# Hand back what the muxer wrote, then empty the buffer so the next call
# returns only newly written bytes.
data = self.output_buffer.getvalue()
self.output_buffer.seek(0)
self.output_buffer.truncate(0)
return data
def close(self) -> Optional[bytes]:
    """Finalize the stream and hand back the remaining encoded bytes.

    Returns:
        The result of ``write_chunk(finalize=True)`` for ``wav``, ``mp3``
        and ``aac``; the full contents of ``self.buffer`` for ``ogg``,
        ``opus`` and ``flac`` (after closing the writer); ``None`` for any
        other format.
    """
    fmt = self.format

    if fmt in ("wav", "mp3", "aac"):
        if fmt == "wav":
            # Rewind first so the WAV header can be re-finalized.
            self.buffer.seek(0)
        return self.write_chunk(finalize=True)

    if fmt in ("ogg", "opus", "flac"):
        # Writer-backed formats: close the writer, then return the buffer.
        self.writer.close()
        return self.buffer.getvalue()

    return None

View file

@ -86,6 +86,7 @@ class TTSService:
# Generate audio using pre-warmed model
if isinstance(backend, KokoroV1):
chunk_index=0
# For Kokoro V1, pass text and voice info with lang_code
async for chunk_data in self.model_manager.generate(
chunk_text,
@ -103,7 +104,7 @@ class TTSService:
output_format,
speed,
chunk_text,
is_first_chunk=is_first,
is_first_chunk=is_first and chunk_index == 0,
is_last_chunk=is_last,
normalizer=normalizer,
)
@ -117,6 +118,7 @@ class TTSService:
is_last,
normalizer)
yield chunk_data
chunk_index+=1
else:
# For legacy backends, load voice tensor
@ -291,12 +293,12 @@ class TTSService:
if chunk_data.output is not None:
yield chunk_data
chunk_index += 1
else:
logger.warning(
f"No audio generated for chunk: '{chunk_text[:100]}...'"
)
chunk_index += 1
except Exception as e:
logger.error(
f"Failed to process audio for chunk: '{chunk_text[:100]}...'. Error: {str(e)}"

View file

@ -85,11 +85,11 @@ async def test_convert_to_flac(sample_audio):
@pytest.mark.asyncio
async def test_convert_to_aac(sample_audio):
"""Test converting to AAC format"""
async def test_convert_to_m4a(sample_audio):
"""Test converting to M4A format"""
audio_data, sample_rate = sample_audio
audio_chunk = await AudioService.convert_audio(
AudioChunk(audio_data), sample_rate, "aac"
AudioChunk(audio_data), sample_rate, "m4a"
)
assert isinstance(audio_chunk.output, bytes)
assert isinstance(audio_chunk, AudioChunk)

View file

View file

@ -37,6 +37,7 @@ dependencies = [
"en-core-web-sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl",
"inflect>=7.5.0",
"phonemizer-fork>=3.3.2",
"av>=14.1.0",
]
[project.optional-dependencies]