From c5a3e136708c28f8118cf8555d6fcd3c173f4407 Mon Sep 17 00:00:00 2001 From: Fireblade Date: Wed, 19 Feb 2025 23:10:51 -0500 Subject: [PATCH] Converted the stream writer to use pyav --- README.md | 2 +- Test copy.py | 88 ++++++++ Test.py | 85 ++++++-- api/src/routers/development.py | 2 +- api/src/services/audio.py | 3 +- api/src/services/streaming_audio_writer.py | 223 +++------------------ api/src/services/tts_service.py | 8 +- api/tests/test_audio_service.py | 6 +- output.mp3 | 0 pyproject.toml | 1 + 10 files changed, 197 insertions(+), 221 deletions(-) create mode 100644 Test copy.py delete mode 100644 output.mp3 diff --git a/README.md b/README.md index b307e7c..9b6d3f7 100644 --- a/README.md +++ b/README.md @@ -243,7 +243,7 @@ response = requests.post( - wav - opus - flac -- aac +- m4a - pcm

diff --git a/Test copy.py b/Test copy.py new file mode 100644 index 0000000..4ecbc5e --- /dev/null +++ b/Test copy.py @@ -0,0 +1,88 @@ +import requests +import base64 +import json +import pydub +text="""Delving into the Abyss: A Deeper Exploration of Meaning in 5 Seconds of Summer's "Jet Black Heart" + +5 Seconds of Summer, initially perceived as purveyors of upbeat, radio-friendly pop-punk, embarked on a significant artistic evolution with their album Sounds Good Feels Good. Among its tracks, "Jet Black Heart" stands out as a powerful testament to this shift, moving beyond catchy melodies and embracing a darker, more emotionally complex sound. Released in 2015, the song transcends the typical themes of youthful exuberance and romantic angst, instead plunging into the depths of personal turmoil and the corrosive effects of inner darkness on interpersonal relationships. "Jet Black Heart" is not merely a song about heartbreak; it is a raw and vulnerable exploration of internal struggle, self-destructive patterns, and the precarious flicker of hope that persists even in the face of profound emotional chaos. Through potent metaphors, starkly honest lyrics, and a sonic landscape that mirrors its thematic weight, the song offers a profound meditation on the human condition, grappling with the shadows that reside within us all and their far-reaching consequences. + +The very title, "Jet Black Heart," immediately establishes the song's central motif: an intrinsic darkness residing within the narrator's emotional core. The phrase "jet black" is not simply a descriptor of color; it evokes a sense of absolute darkness, a void devoid of light, and a profound absence of hope. This is not a heart merely bruised by external circumstances, but one fundamentally shaded by internal struggles, suggesting a chronic condition of emotional pain. The opening lines, "Everybody's got their demons, even wide awake or dreaming," acknowledge the universality of inner conflict, a shared human experience of battling internal anxieties and insecurities. However, the designation of a "jet black heart" elevates this struggle to a more profound and potentially entrenched level. It suggests a darkness that is not fleeting or situational, but rather a deeply ingrained aspect of the narrator's being, casting a long shadow over their life and relationships. This internal darkness is further amplified by the subsequent metaphor, "there's a hurricane underneath it." The imagery of a hurricane is intensely evocative, conjuring images of destructive force, uncontrollable chaos, and overwhelming power. This "hurricane" represents the tumultuous emotions and internal disorder raging beneath the surface of the narrator’s composed exterior. It is a maelstrom of pain, anxiety, and self-doubt that threatens to erupt and engulf everything in its path. Crucially, this internal hurricane is not merely passive suffering; it is actively "trying to keep us apart," revealing the insidious way in which these inner demons sabotage connections and erect formidable barriers to genuine intimacy and meaningful relationships. + +Expanding on this internal struggle, "Jet Black Heart" delves into the narrator's self-destructive patterns, particularly within the realm of romantic relationships. The lyrics "See a war, I wanna fight it, See a match, I wanna strike it" paint a stark picture of a deeply ingrained tendency towards conflict and destruction. This is not simply a reactive response to external aggression, but rather an active seeking out of discord, a subconscious drive to ignite conflict even in peaceful situations. This behavior can be interpreted as a manifestation of their inner turmoil, a projection of their internal chaos onto their external world. Perhaps the narrator, accustomed to internal strife, unconsciously recreates this turbulence in their relationships, finding a perverse sense of familiarity or even control within the chaos. This destructive impulse is further emphasized by the line "Every fire I've ignited faded to gray." The imagery of fire, initially representing passion, intensity, or perhaps even anger, ultimately devolving into "gray" underscores a recurring cycle of destructive behavior that culminates in emptiness and disappointment. The color gray, often associated with neutrality, lifelessness, and a lack of vibrancy, perfectly encapsulates the emotional aftermath of these self-inflicted relational fires. The initial spark of connection or excitement is inevitably extinguished, leaving behind a landscape of emotional flatness and a profound sense of failure in sustaining meaningful bonds. Further solidifying this theme of self-sabotage is the powerful phrase "I write with a poison pen." This metaphor extends beyond mere hurtful words, encompassing actions, behaviors, and the narrator's overall negative influence on their relationships. The "poison pen" suggests a deliberate, albeit perhaps unconscious, act of inflicting harm, highlighting the narrator's painful awareness of their own damaging tendencies and their capacity to erode the very connections they seemingly desire. + +However, amidst this pervasive darkness and self-destructive cycle, "Jet Black Heart" subtly introduces a fragile glimmer of hope, a faint light flickering in the abyss. The pivotal moment of vulnerability and potential transformation arrives with the plaintive plea, "But now that I'm broken, now that you're knowing, caught up in a moment, can you see inside?" This is a desperate and profoundly vulnerable call for understanding, a raw and unfiltered exposure of the "jet black heart" after reaching a critical breaking point. The narrator, stripped bare by the weight of their own struggles and the consequences of their self-destructive behavior, finally seeks empathy and genuine connection. The admission of being "broken" is not a declaration of defeat, but rather a necessary precursor to potential healing. It is in this state of vulnerability, in the raw aftermath of emotional collapse, that the narrator dares to ask, "Can you see inside?" This question is laden with yearning, a desperate hope that someone, perhaps a partner in the strained relationship, can perceive beyond the surface darkness and recognize the wounded humanity beneath the "jet black heart." It is a plea for acceptance, not despite the darkness, but perhaps even because of it, a hope that vulnerability will be met not with judgment or rejection, but with compassion and understanding. Despite the acknowledgement of their "poison pen" and destructive tendencies, the narrator also recognizes a paradoxical source of potential redemption within the very relationship that is strained by their inner darkness: "these chemicals moving between us are the reason to start again." The ambiguous term "chemicals" can be interpreted on multiple levels. It could symbolize the complex and often volatile dynamics of human connection, the unpredictable and sometimes turbulent interplay of emotions and personalities in a relationship. Alternatively, "chemicals" might allude to a more literal, perhaps even neurochemical, imbalance within the narrator, suggesting that the very forces driving their darkness might also hold the key to transformation. Crucially, the phrase "reason to start again" emphasizes the potential for renewal and redemption, not a guaranteed outcome. It is a tentative step towards hope, acknowledging that the path forward will be fraught with challenges, but that the possibility of healing and rebuilding remains, however fragile. + +The concluding verses of "Jet Black Heart" further solidify this nascent theme of potential transformation and tentative redemption. "The blood in my veins is made up of mistakes" is a powerful and profoundly honest admission of past errors and a crucial acceptance of human imperfection. This acknowledgement of fallibility is essential for personal growth and relational healing. By owning their mistakes, the narrator begins to dismantle the cycle of self-blame and self-destruction, paving the way for a more compassionate and forgiving self-perception. The subsequent lines, "let's forget who we are and dive into the dark, as we burst into color, returning to life," present a radical and transformative vision of shared vulnerability and mutual healing. The call to "forget who we are" is not an invitation to erase individual identity, but rather a suggestion to shed the constructed personas, ego-driven defenses, and pre-conceived notions that often hinder genuine connection. It is about stripping away the masks and embracing a state of raw, unfiltered vulnerability. The imperative to "dive into the dark" is perhaps the most challenging and transformative element of the song. It is a call to confront the pain, to face the demons, and to embrace the shared vulnerability that lies at the heart of genuine intimacy. This shared descent into darkness is not an act of succumbing to despair, but rather a courageous journey towards healing, suggesting that true connection and growth can only emerge from acknowledging and confronting the deepest, most painful aspects of ourselves and each other. The subsequent image of "bursting into color, returning to life" provides a powerful counterpoint to the prevailing darkness, symbolizing transformation, healing, and a vibrant renewal of life and connection. "Bursting into color" evokes a sense of vibrancy, joy, and emotional richness that stands in stark contrast to the "jet black" and "gray" imagery prevalent throughout the song. This suggests that by confronting and embracing the darkness, there is a possibility of emerging transformed, experiencing a rebirth and a renewed sense of purpose and joy in life. "Returning to life" further reinforces this idea of resurrection and revitalization, implying that the journey through darkness is not an end in itself, but rather a necessary passage towards a fuller, more authentic, and more vibrant existence. + +Beyond the lyrical content, the musical elements of "Jet Black Heart" contribute significantly to its overall meaning and emotional impact. Compared to 5 Seconds of Summer's earlier, more upbeat work, "Jet Black Heart" adopts a heavier, more brooding sonic landscape. The driving rhythm, the prominent bassline, and the raw, emotive vocal delivery all mirror the thematic weight of the lyrics, creating an atmosphere of intense emotionality and vulnerability. The song's structure, building from a quiet, introspective beginning to a powerful, anthemic chorus, reflects the narrator's journey from internal struggle to a desperate plea for connection and ultimately a tentative hope for transformation. + +In conclusion, "Jet Black Heart" by 5 Seconds of Summer is far more than a typical pop song; it is a poignant and deeply resonant exploration of inner darkness, self-destructive tendencies, and the fragile yet persistent hope for human connection and redemption. Through its powerful central metaphor of the "jet black heart," its unflinching portrayal of internal turmoil, and its subtle yet potent message of vulnerability and potential transformation, the song resonates with anyone who has grappled with their own inner demons and the complexities of human relationships. It is a reminder that even in the deepest darkness, a flicker of hope can endure, and that true healing and connection often emerge from the courageous act of confronting and sharing our most vulnerable selves. "Jet Black Heart" stands as a testament to 5 Seconds of Summer's artistic growth, showcasing their capacity to delve into profound emotional territories and create music that is not only catchy and engaging but also deeply meaningful and emotionally resonant, solidifying their position as a band capable of capturing the complexities of the human experience.""" + +"""Delving into the Abyss: A Deeper Exploration of Meaning in 5 Seconds of Summer's "Jet Black Heart" + +5 Seconds of Summer, initially perceived as purveyors of upbeat, radio-friendly pop-punk, embarked on a significant artistic evolution with their album Sounds Good Feels Good. Among its tracks, "Jet Black Heart" stands out as a powerful testament to this shift, moving beyond catchy melodies and embracing a darker, more emotionally complex sound. Released in 2015, the song transcends the typical themes of youthful exuberance and romantic angst, instead plunging into the depths of personal turmoil and the corrosive effects of inner darkness on interpersonal relationships. "Jet Black Heart" is not merely a song about heartbreak; it is a raw and vulnerable exploration of internal struggle, self-destructive patterns, and the precarious flicker of hope that persists even in the face of profound emotional chaos.""" + + +Type="mp3" +response = requests.post( + "http://localhost:8880/dev/captioned_speech", + json={ + "model": "kokoro", + "input": text, + "voice": "af_heart+af_sky", + "speed": 1.0, + "response_format": Type, + "stream": True, + }, + stream=True +) + +f=open(f"outputstream.{Type}","wb") +for chunk in response.iter_lines(decode_unicode=True): + if chunk: + temp_json=json.loads(chunk) + if temp_json["timestamps"] != []: + chunk_json=temp_json + + # Decode base 64 stream to bytes + chunk_audio=base64.b64decode(temp_json["audio"].encode("utf-8")) + + # Process streaming chunks + f.write(chunk_audio) + + # Print word level timestamps +last3=chunk_json["timestamps"][-3] + +print(f"CUTTING TO {last3['word']}") + +audioseg=pydub.AudioSegment.from_file(f"outputstream.{Type}",format=Type) +audioseg=audioseg[last3["start_time"]*1000:last3["end_time"] * 1000] +audioseg.export(f"outputstreamcut.{Type}",format=Type) + + +""" +response = requests.post( + "http://localhost:8880/dev/captioned_speech", + json={ + "model": "kokoro", + "input": text, + "voice": "af_heart+af_sky", + "speed": 1.0, + "response_format": Type, + "stream": False, + }, + stream=True +) + +with open(f"outputnostream.{Type}", "wb") as f: + audio_json=json.loads(response.content) + + # Decode base 64 stream to bytes + chunk_audio=base64.b64decode(audio_json["audio"].encode("utf-8")) + + # Process streaming chunks + f.write(chunk_audio) + + # Print word level timestamps + print(audio_json["timestamps"]) +""" \ No newline at end of file diff --git a/Test.py b/Test.py index f9b1a35..d5c1cf2 100644 --- a/Test.py +++ b/Test.py @@ -1,22 +1,63 @@ -import requests - - -response = requests.get("http://localhost:8880/v1/audio/voices") -voices = response.json()["voices"] - -# Generate audio -response = requests.post( - "http://localhost:8880/v1/audio/speech", - json={ - "model": "kokoro", - "input": "http://localhost:8880/web/", - "voice": "af_heart", - "response_format": "mp3", # Supported: mp3, wav, opus, flac - "speed": 1.0, - "stream":False, - } -) - -# Save audio -with open("output.mp3", "wb") as f: - f.write(response.content) \ No newline at end of file +import requests +import base64 +import json + +text="""Delving into the Abyss: A Deeper Exploration of Meaning in 5 Seconds of Summer's "Jet Black Heart" + +5 Seconds of Summer, initially perceived as purveyors of upbeat, radio-friendly pop-punk, embarked on a significant artistic evolution with their album Sounds Good Feels Good. Among its tracks, "Jet Black Heart" stands out as a powerful testament to this shift, moving beyond catchy melodies and embracing a darker, more emotionally complex sound. Released in 2015, the song transcends the typical themes of youthful exuberance and romantic angst, instead plunging into the depths of personal turmoil and the corrosive effects of inner darkness on interpersonal relationships. "Jet Black Heart" is not merely a song about heartbreak; it is a raw and vulnerable exploration of internal struggle, self-destructive patterns, and the precarious flicker of hope that persists even in the face of profound emotional chaos. Through potent metaphors, starkly honest lyrics, and a sonic landscape that mirrors its thematic weight, the song offers a profound meditation on the human condition, grappling with the shadows that reside within us all and their far-reaching consequences. + +The very title, "Jet Black Heart," immediately establishes the song's central motif: an intrinsic darkness residing within the narrator's emotional core. The phrase "jet black" is not simply a descriptor of color; it evokes a sense of absolute darkness, a void devoid of light, and a profound absence of hope. This is not a heart merely bruised by external circumstances, but one fundamentally shaded by internal struggles, suggesting a chronic condition of emotional pain. The opening lines, "Everybody's got their demons, even wide awake or dreaming," acknowledge the universality of inner conflict, a shared human experience of battling internal anxieties and insecurities. However, the designation of a "jet black heart" elevates this struggle to a more profound and potentially entrenched level. It suggests a darkness that is not fleeting or situational, but rather a deeply ingrained aspect of the narrator's being, casting a long shadow over their life and relationships. This internal darkness is further amplified by the subsequent metaphor, "there's a hurricane underneath it." The imagery of a hurricane is intensely evocative, conjuring images of destructive force, uncontrollable chaos, and overwhelming power. This "hurricane" represents the tumultuous emotions and internal disorder raging beneath the surface of the narrator’s composed exterior. It is a maelstrom of pain, anxiety, and self-doubt that threatens to erupt and engulf everything in its path. Crucially, this internal hurricane is not merely passive suffering; it is actively "trying to keep us apart," revealing the insidious way in which these inner demons sabotage connections and erect formidable barriers to genuine intimacy and meaningful relationships. + +Expanding on this internal struggle, "Jet Black Heart" delves into the narrator's self-destructive patterns, particularly within the realm of romantic relationships. The lyrics "See a war, I wanna fight it, See a match, I wanna strike it" paint a stark picture of a deeply ingrained tendency towards conflict and destruction. This is not simply a reactive response to external aggression, but rather an active seeking out of discord, a subconscious drive to ignite conflict even in peaceful situations. This behavior can be interpreted as a manifestation of their inner turmoil, a projection of their internal chaos onto their external world. Perhaps the narrator, accustomed to internal strife, unconsciously recreates this turbulence in their relationships, finding a perverse sense of familiarity or even control within the chaos. This destructive impulse is further emphasized by the line "Every fire I've ignited faded to gray." The imagery of fire, initially representing passion, intensity, or perhaps even anger, ultimately devolving into "gray" underscores a recurring cycle of destructive behavior that culminates in emptiness and disappointment. The color gray, often associated with neutrality, lifelessness, and a lack of vibrancy, perfectly encapsulates the emotional aftermath of these self-inflicted relational fires. The initial spark of connection or excitement is inevitably extinguished, leaving behind a landscape of emotional flatness and a profound sense of failure in sustaining meaningful bonds. Further solidifying this theme of self-sabotage is the powerful phrase "I write with a poison pen." This metaphor extends beyond mere hurtful words, encompassing actions, behaviors, and the narrator's overall negative influence on their relationships. The "poison pen" suggests a deliberate, albeit perhaps unconscious, act of inflicting harm, highlighting the narrator's painful awareness of their own damaging tendencies and their capacity to erode the very connections they seemingly desire. + +However, amidst this pervasive darkness and self-destructive cycle, "Jet Black Heart" subtly introduces a fragile glimmer of hope, a faint light flickering in the abyss. The pivotal moment of vulnerability and potential transformation arrives with the plaintive plea, "But now that I'm broken, now that you're knowing, caught up in a moment, can you see inside?" This is a desperate and profoundly vulnerable call for understanding, a raw and unfiltered exposure of the "jet black heart" after reaching a critical breaking point. The narrator, stripped bare by the weight of their own struggles and the consequences of their self-destructive behavior, finally seeks empathy and genuine connection. The admission of being "broken" is not a declaration of defeat, but rather a necessary precursor to potential healing. It is in this state of vulnerability, in the raw aftermath of emotional collapse, that the narrator dares to ask, "Can you see inside?" This question is laden with yearning, a desperate hope that someone, perhaps a partner in the strained relationship, can perceive beyond the surface darkness and recognize the wounded humanity beneath the "jet black heart." It is a plea for acceptance, not despite the darkness, but perhaps even because of it, a hope that vulnerability will be met not with judgment or rejection, but with compassion and understanding. Despite the acknowledgement of their "poison pen" and destructive tendencies, the narrator also recognizes a paradoxical source of potential redemption within the very relationship that is strained by their inner darkness: "these chemicals moving between us are the reason to start again." The ambiguous term "chemicals" can be interpreted on multiple levels. It could symbolize the complex and often volatile dynamics of human connection, the unpredictable and sometimes turbulent interplay of emotions and personalities in a relationship. Alternatively, "chemicals" might allude to a more literal, perhaps even neurochemical, imbalance within the narrator, suggesting that the very forces driving their darkness might also hold the key to transformation. Crucially, the phrase "reason to start again" emphasizes the potential for renewal and redemption, not a guaranteed outcome. It is a tentative step towards hope, acknowledging that the path forward will be fraught with challenges, but that the possibility of healing and rebuilding remains, however fragile. + +The concluding verses of "Jet Black Heart" further solidify this nascent theme of potential transformation and tentative redemption. "The blood in my veins is made up of mistakes" is a powerful and profoundly honest admission of past errors and a crucial acceptance of human imperfection. This acknowledgement of fallibility is essential for personal growth and relational healing. By owning their mistakes, the narrator begins to dismantle the cycle of self-blame and self-destruction, paving the way for a more compassionate and forgiving self-perception. The subsequent lines, "let's forget who we are and dive into the dark, as we burst into color, returning to life," present a radical and transformative vision of shared vulnerability and mutual healing. The call to "forget who we are" is not an invitation to erase individual identity, but rather a suggestion to shed the constructed personas, ego-driven defenses, and pre-conceived notions that often hinder genuine connection. It is about stripping away the masks and embracing a state of raw, unfiltered vulnerability. The imperative to "dive into the dark" is perhaps the most challenging and transformative element of the song. It is a call to confront the pain, to face the demons, and to embrace the shared vulnerability that lies at the heart of genuine intimacy. This shared descent into darkness is not an act of succumbing to despair, but rather a courageous journey towards healing, suggesting that true connection and growth can only emerge from acknowledging and confronting the deepest, most painful aspects of ourselves and each other. The subsequent image of "bursting into color, returning to life" provides a powerful counterpoint to the prevailing darkness, symbolizing transformation, healing, and a vibrant renewal of life and connection. "Bursting into color" evokes a sense of vibrancy, joy, and emotional richness that stands in stark contrast to the "jet black" and "gray" imagery prevalent throughout the song. This suggests that by confronting and embracing the darkness, there is a possibility of emerging transformed, experiencing a rebirth and a renewed sense of purpose and joy in life. "Returning to life" further reinforces this idea of resurrection and revitalization, implying that the journey through darkness is not an end in itself, but rather a necessary passage towards a fuller, more authentic, and more vibrant existence. + +Beyond the lyrical content, the musical elements of "Jet Black Heart" contribute significantly to its overall meaning and emotional impact. Compared to 5 Seconds of Summer's earlier, more upbeat work, "Jet Black Heart" adopts a heavier, more brooding sonic landscape. The driving rhythm, the prominent bassline, and the raw, emotive vocal delivery all mirror the thematic weight of the lyrics, creating an atmosphere of intense emotionality and vulnerability. The song's structure, building from a quiet, introspective beginning to a powerful, anthemic chorus, reflects the narrator's journey from internal struggle to a desperate plea for connection and ultimately a tentative hope for transformation. + +In conclusion, "Jet Black Heart" by 5 Seconds of Summer is far more than a typical pop song; it is a poignant and deeply resonant exploration of inner darkness, self-destructive tendencies, and the fragile yet persistent hope for human connection and redemption. Through its powerful central metaphor of the "jet black heart," its unflinching portrayal of internal turmoil, and its subtle yet potent message of vulnerability and potential transformation, the song resonates with anyone who has grappled with their own inner demons and the complexities of human relationships. It is a reminder that even in the deepest darkness, a flicker of hope can endure, and that true healing and connection often emerge from the courageous act of confronting and sharing our most vulnerable selves. "Jet Black Heart" stands as a testament to 5 Seconds of Summer's artistic growth, showcasing their capacity to delve into profound emotional territories and create music that is not only catchy and engaging but also deeply meaningful and emotionally resonant, solidifying their position as a band capable of capturing the complexities of the human experience.""" + +text="""Delving into the Abyss: A Deeper Exploration of Meaning in 5 Seconds of Summer's "Jet Black Heart" + +5 Seconds of Summer, initially perceived as purveyors of upbeat, radio-friendly pop-punk, embarked on a significant artistic evolution with their album Sounds Good Feels Good. Among its tracks, "Jet Black Heart" stands out as a powerful testament to this shift, moving beyond catchy melodies and embracing a darker, more emotionally complex sound. Released in 2015, the song transcends the typical themes of youthful exuberance and romantic angst, instead plunging into the depths of personal turmoil and the corrosive effects of inner darkness on interpersonal relationships. "Jet Black Heart" is not merely a song about heartbreak; it is a raw and vulnerable exploration of internal struggle, self-destructive patterns, and the precarious flicker of hope that persists even in the face of profound emotional chaos.""" + + +Type="aac" + + +response = requests.post( + "http://localhost:8880/v1/audio/speech", + json={ + "model": "kokoro", + "input": text, + "voice": "af_heart+af_sky", + "speed": 1.0, + "response_format": Type, + "stream": True, + }, + stream=True +) + + +f=open(f"outputstream.{Type}","wb") +for chunk in response.iter_content(): + if chunk: + # Process streaming chunks + f.write(chunk) + +response = requests.post( + "http://localhost:8880/v1/audio/speech", + json={ + "model": "kokoro", + "input": text, + "voice": "af_heart+af_sky", + "speed": 1.0, + "response_format": Type, + "stream": False, + }, + stream=True +) + +with open(f"outputnostream.{Type}", "wb") as f: + f.write(response.content) diff --git a/api/src/routers/development.py b/api/src/routers/development.py index e1cdbab..7fbcc56 100644 --- a/api/src/routers/development.py +++ b/api/src/routers/development.py @@ -175,7 +175,7 @@ async def create_captioned_speech( content_type = { "mp3": "audio/mpeg", "opus": "audio/opus", - "aac": "audio/aac", + "m4a": "audio/mp4", "flac": "audio/flac", "wav": "audio/wav", "pcm": "audio/pcm", diff --git a/api/src/services/audio.py b/api/src/services/audio.py index eb31c59..0c75224 100644 --- a/api/src/services/audio.py +++ b/api/src/services/audio.py @@ -89,7 +89,7 @@ class AudioService: """Service for audio format conversions with streaming support""" # Supported formats - SUPPORTED_FORMATS = {"wav", "mp3", "opus", "flac", "aac", "pcm", "ogg"} + SUPPORTED_FORMATS = {"wav", "mp3", "opus", "flac", "aac", "pcm"} # Default audio format settings balanced for speed and compression DEFAULT_SETTINGS = { @@ -158,6 +158,7 @@ class AudioService: AudioService._writers[writer_key] = StreamingAudioWriter( output_format, sample_rate ) + writer = AudioService._writers[writer_key] # Write audio data first diff --git a/api/src/services/streaming_audio_writer.py b/api/src/services/streaming_audio_writer.py index 1a45eec..b69e813 100644 --- a/api/src/services/streaming_audio_writer.py +++ b/api/src/services/streaming_audio_writer.py @@ -8,7 +8,7 @@ import numpy as np import soundfile as sf from loguru import logger from pydub import AudioSegment - +import av class StreamingAudioWriter: """Handles streaming audio format conversions""" @@ -18,60 +18,19 @@ class StreamingAudioWriter: self.sample_rate = sample_rate self.channels = channels self.bytes_written = 0 - self.buffer = BytesIO() + self.pts=0 + codec_map = {"wav":"pcm_s16le","mp3":"mp3","opus":"libopus","flac":"flac", "aac":"aac"} # Format-specific setup - if self.format == "wav": - self._write_wav_header_initial() - elif self.format in ["ogg", "opus"]: - # For OGG/Opus, write to memory buffer - self.writer = sf.SoundFile( - file=self.buffer, - mode="w", - samplerate=sample_rate, - channels=channels, - format="OGG", - subtype="VORBIS" if self.format == "ogg" else "OPUS", - ) - elif self.format == "flac": - # For FLAC, write to memory buffer - self.writer = sf.SoundFile( - file=self.buffer, - mode="w", - samplerate=sample_rate, - channels=channels, - format="FLAC", - ) - elif self.format in ["mp3", "aac"]: - # For MP3/AAC, we'll use pydub's incremental writer - self.segments = [] # Store segments until we have enough data - self.total_duration = 0 # Track total duration in milliseconds - # Initialize an empty AudioSegment as our encoder - self.encoder = AudioSegment.silent(duration=0, frame_rate=self.sample_rate) - elif self.format == "pcm": - # PCM doesn't need initialization, we'll write raw bytes - pass + if self.format in ["wav", "opus","flac","mp3","aac","pcm"]: + if self.format != "pcm": + self.output_buffer = BytesIO() + self.container = av.open(self.output_buffer, mode="w", format=self.format) + #print(av.codecs_available) + self.stream = self.container.add_stream(codec_map[self.format], rate=self.sample_rate,sample_rate=self.sample_rate,layout='mono' if self.channels == 1 else 'stereo') else: raise ValueError(f"Unsupported format: {format}") - def _write_wav_header_initial(self) -> None: - """Write initial WAV header with placeholders""" - self.buffer.write(b"RIFF") - self.buffer.write(struct.pack(" bytes: @@ -81,153 +40,37 @@ class StreamingAudioWriter: audio_data: Audio data to write, or None if finalizing finalize: Whether this is the final write to close the stream """ - output_buffer = BytesIO() if finalize: - if self.format == "wav": - # Calculate actual file and data sizes - file_size = self.bytes_written + 36 # RIFF header bytes - data_size = self.bytes_written - - # Seek to the beginning to overwrite the placeholders - self.buffer.seek(4) - self.buffer.write(struct.pack(" 0: - format_args = { - "mp3": {"format": "mp3", "codec": "libmp3lame"}, - "aac": {"format": "adts", "codec": "aac"}, - }[self.format] - - parameters = [] - if self.format == "mp3": - parameters.extend( - [ - "-q:a", - "0", # Highest quality - "-write_xing", - "1", # XING header for MP3 - "-id3v1", - "1", - "-id3v2", - "1", - "-write_vbr", - "1", - "-vbr_quality", - "2", - ] - ) - elif self.format == "aac": - parameters.extend( - [ - "-q:a", - "2", - "-write_xing", - "0", - "-write_id3v1", - "0", - "-write_id3v2", - "0", - ] - ) - - self.encoder.export( - output_buffer, - **format_args, - bitrate="192k", # Optimal for 24kHz/16-bit mono source - parameters=parameters, - ) - self.encoder = None - - return output_buffer.getvalue() + if self.format != "pcm": + packets = self.stream.encode(None) + for packet in packets: + self.container.mux(packet) + + data=self.output_buffer.getvalue() + self.container.close() + return data if audio_data is None or len(audio_data) == 0: return b"" - if self.format == "wav": - # Write raw PCM data - self.buffer.write(audio_data.tobytes()) - self.bytes_written += len(audio_data.tobytes()) - return b"" - - elif self.format in ["ogg", "opus", "flac"]: - # Write to soundfile buffer - self.writer.write(audio_data) - self.writer.flush() - return self.buffer.getvalue() - - elif self.format in ["mp3", "aac"]: - # Convert chunk to AudioSegment and encode - segment = AudioSegment( - audio_data.tobytes(), - frame_rate=self.sample_rate, - sample_width=audio_data.dtype.itemsize, - channels=self.channels, - ) - - # Track total duration - self.total_duration += len(segment) - - # Add segment to encoder - self.encoder += segment - - # Export current state to buffer without final metadata - format_args = { - "mp3": {"format": "mp3", "codec": "libmp3lame"}, - "aac": {"format": "adts", "codec": "aac"}, - }[self.format] - - # For chunks, export without duration metadata or XING headers - self.encoder.export( - output_buffer, - **format_args, - bitrate="192k", # Optimal for 24kHz/16-bit mono source - parameters=[ - "-q:a", - "0", # Highest quality for chunks too - "-write_xing", - "0", # No XING headers for chunks - ], - ) - - # Get the encoded data - encoded_data = output_buffer.getvalue() - - # Reset encoder to prevent memory growth - self.encoder = AudioSegment.silent(duration=0, frame_rate=self.sample_rate) - - return encoded_data - - elif self.format == "pcm": + if self.format == "pcm": # Write raw bytes return audio_data.tobytes() + else: + frame = av.AudioFrame.from_ndarray(audio_data.reshape(1, -1), format='s16', layout='mono' if self.channels == 1 else 'stereo') + frame.sample_rate=self.sample_rate - return b"" + + frame.pts = self.pts + self.pts += frame.samples + + packets = self.stream.encode(frame) + for packet in packets: + self.container.mux(packet) + + data = self.output_buffer.getvalue() + self.output_buffer.seek(0) + self.output_buffer.truncate(0) + return data - def close(self) -> Optional[bytes]: - """Finish the audio file and return any remaining data""" - if self.format == "wav": - # Re-finalize WAV file by updating headers - self.buffer.seek(0) - file_content = self.write_chunk(finalize=True) - return file_content - - elif self.format in ["ogg", "opus", "flac"]: - # Finalize other formats - self.writer.close() - return self.buffer.getvalue() - - elif self.format in ["mp3", "aac"]: - # Finalize MP3/AAC - final_data = self.write_chunk(finalize=True) - return final_data - - return None diff --git a/api/src/services/tts_service.py b/api/src/services/tts_service.py index 716c603..a115c18 100644 --- a/api/src/services/tts_service.py +++ b/api/src/services/tts_service.py @@ -86,6 +86,7 @@ class TTSService: # Generate audio using pre-warmed model if isinstance(backend, KokoroV1): + chunk_index=0 # For Kokoro V1, pass text and voice info with lang_code async for chunk_data in self.model_manager.generate( chunk_text, @@ -103,7 +104,7 @@ class TTSService: output_format, speed, chunk_text, - is_first_chunk=is_first, + is_first_chunk=is_first and chunk_index == 0, is_last_chunk=is_last, normalizer=normalizer, ) @@ -117,6 +118,7 @@ class TTSService: is_last, normalizer) yield chunk_data + chunk_index+=1 else: # For legacy backends, load voice tensor @@ -291,12 +293,12 @@ class TTSService: if chunk_data.output is not None: yield chunk_data - chunk_index += 1 + else: logger.warning( f"No audio generated for chunk: '{chunk_text[:100]}...'" ) - + chunk_index += 1 except Exception as e: logger.error( f"Failed to process audio for chunk: '{chunk_text[:100]}...'. Error: {str(e)}" diff --git a/api/tests/test_audio_service.py b/api/tests/test_audio_service.py index 6a15a62..4afc345 100644 --- a/api/tests/test_audio_service.py +++ b/api/tests/test_audio_service.py @@ -85,11 +85,11 @@ async def test_convert_to_flac(sample_audio): @pytest.mark.asyncio -async def test_convert_to_aac(sample_audio): - """Test converting to AAC format""" +async def test_convert_to_m4a(sample_audio): + """Test converting to M4A format""" audio_data, sample_rate = sample_audio audio_chunk = await AudioService.convert_audio( - AudioChunk(audio_data), sample_rate, "aac" + AudioChunk(audio_data), sample_rate, "m4a" ) assert isinstance(audio_chunk.output, bytes) assert isinstance(audio_chunk, AudioChunk) diff --git a/output.mp3 b/output.mp3 deleted file mode 100644 index e69de29..0000000 diff --git a/pyproject.toml b/pyproject.toml index acb5004..d0ff675 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -37,6 +37,7 @@ dependencies = [ "en-core-web-sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl", "inflect>=7.5.0", "phonemizer-fork>=3.3.2", + "av>=14.1.0", ] [project.optional-dependencies]