Mirror of https://github.com/remsky/Kokoro-FastAPI.git (synced 2025-04-13 09:39:17 +00:00)
fixes and corrections to code that didn't cause errors but didn't really make sense
This commit is contained in:
parent 9c6e72943c
commit b3d5f4de08
4 changed files with 9 additions and 10 deletions
@@ -23,7 +23,7 @@ In conclusion, "Jet Black Heart" by 5 Seconds of Summer is far more than a typic
 5 Seconds of Summer, initially perceived as purveyors of upbeat, radio-friendly pop-punk, embarked on a significant artistic evolution with their album Sounds Good Feels Good. Among its tracks, "Jet Black Heart" stands out as a powerful testament to this shift, moving beyond catchy melodies and embracing a darker, more emotionally complex sound. Released in 2015, the song transcends the typical themes of youthful exuberance and romantic angst, instead plunging into the depths of personal turmoil and the corrosive effects of inner darkness on interpersonal relationships. "Jet Black Heart" is not merely a song about heartbreak; it is a raw and vulnerable exploration of internal struggle, self-destructive patterns, and the precarious flicker of hope that persists even in the face of profound emotional chaos."""

-Type="mp3"
+Type="wav"

 response = requests.post(
     "http://localhost:8880/dev/captioned_speech",
     json={
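This hunk only covers the start of the client example. As a point of reference, below is a minimal sketch of how such a streaming request could be assembled; the JSON field names (input, voice, response_format, stream) and the voice identifier are assumptions typical of this kind of TTS API, not something this diff confirms.

import requests

Type = "wav"  # matches the format used later when the stream is re-opened with pydub

# Hypothetical request body; field names and voice are assumptions, not taken from this diff.
response = requests.post(
    "http://localhost:8880/dev/captioned_speech",
    json={
        "input": "Text to synthesize goes here.",
        "voice": "af_heart",        # assumed voice identifier
        "response_format": Type,
        "stream": True,
    },
    stream=True,
)
response.raise_for_status()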
@@ -51,12 +51,12 @@ for chunk in response.iter_lines(decode_unicode=True):
         f.write(chunk_audio)

         # Print word level timestamps
-        last3=chunk_json["timestamps"][-3]
+        last_chunks={"start_time":chunk_json["timestamps"][-10]["start_time"],"end_time":chunk_json["timestamps"][-3]["end_time"],"word":" ".join([X["word"] for X in chunk_json["timestamps"][-10:-3]])}

-        print(f"CUTTING TO {last3['word']}")
+        print(f"CUTTING TO {last_chunks['word']}")

         audioseg=pydub.AudioSegment.from_file(f"outputstream.{Type}",format=Type)
-        audioseg=audioseg[last3["start_time"]*1000:last3["end_time"] * 1000]
+        audioseg=audioseg[last_chunks["start_time"]*1000:last_chunks["end_time"] * 1000]
         audioseg.export(f"outputstreamcut.{Type}",format=Type)

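The substance of this change is that the old last3 dict took both its start and end time from a single timestamp entry, whereas last_chunks spans the 10th-from-last through the 3rd-from-last word. Below is a self-contained sketch of the same slicing pattern, using made-up timestamps instead of a live response; it expects an outputstream.wav on disk, as in the example.

import pydub

# Made-up word-level timestamps in seconds, standing in for chunk_json["timestamps"].
timestamps = [
    {"word": f"word{i}", "start_time": i * 0.4, "end_time": i * 0.4 + 0.35}
    for i in range(20)
]

# Window from the 10th-from-last to the 3rd-from-last word, mirroring the corrected example.
last_chunks = {
    "start_time": timestamps[-10]["start_time"],
    "end_time": timestamps[-3]["end_time"],
    "word": " ".join(t["word"] for t in timestamps[-10:-3]),
}
print(f"CUTTING TO {last_chunks['word']}")

# pydub slices AudioSegments in milliseconds, hence the * 1000.
audioseg = pydub.AudioSegment.from_file("outputstream.wav", format="wav")
audioseg = audioseg[last_chunks["start_time"] * 1000 : last_chunks["end_time"] * 1000]
audioseg.export("outputstreamcut.wav", format="wav")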
@@ -259,10 +259,6 @@ class KokoroV1(BaseModelBackend):
                 )
                 if result.pred_dur is not None:
                     try:
-                        # Join timestamps for this chunk's tokens
-                        KPipeline.join_timestamps(
-                            result.tokens, result.pred_dur
-                        )

                         # Add timestamps with offset
                         for token in result.tokens:
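The four deleted lines called KPipeline.join_timestamps and then discarded its return value, which is presumably the code that "didn't really make sense"; the surviving comment shows that the chunk's tokens are walked directly to add timestamps with an offset. A hypothetical sketch of that offsetting step follows, with stand-in token and result types; the real attribute names are not shown in this hunk.

from dataclasses import dataclass
from typing import List, Optional

# Stand-ins for the pipeline's result objects; names are assumptions, only the
# "add timestamps with offset" idea comes from the hunk above.
@dataclass
class Token:
    text: str
    start_ts: Optional[float]
    end_ts: Optional[float]

@dataclass
class Result:
    tokens: List[Token]

result = Result(tokens=[Token("hello", 0.0, 0.4), Token("world", 0.45, 0.9)])
current_offset = 3.2  # seconds of audio already emitted by earlier chunks
word_timestamps = []

# Shift each token's times by the running offset so timestamps stay
# monotonically increasing across streamed chunks.
for token in result.tokens:
    if token.start_ts is None or token.end_ts is None:
        continue  # skip tokens without timing information
    word_timestamps.append(
        {
            "word": token.text,
            "start_time": token.start_ts + current_offset,
            "end_time": token.end_ts + current_offset,
        }
    )

print(word_timestamps)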
@@ -254,7 +254,10 @@ async def create_captioned_speech(
             base64_chunk= base64.b64encode(chunk_data.output).decode("utf-8")

             # Add any chunks that may be in the acumulator into the return word_timestamps
-            chunk_data.word_timestamps=timestamp_acumulator + chunk_data.word_timestamps
+            if chunk_data.word_timestamps != None:
+                chunk_data.word_timestamps = timestamp_acumulator + chunk_data.word_timestamps
+            else:
+                chunk_data.word_timestamps = []
             timestamp_acumulator=[]

             yield CaptionedSpeechResponse(audio=base64_chunk,audio_format=content_type,timestamps=chunk_data.word_timestamps)
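The new branch exists because chunk_data.word_timestamps can be None (a chunk with no timing data), and concatenating a list with None raises a TypeError. Below is a stripped-down sketch of the same flush-and-reset pattern with a hypothetical chunk class standing in for chunk_data; the sketch uses the more idiomatic "is not None", while the committed code compares with "!= None".

from dataclasses import dataclass
from typing import List, Optional

# Hypothetical stand-in for the real chunk object; only the guard-and-flush
# pattern mirrors the diff, the surrounding names are assumptions.
@dataclass
class Chunk:
    word_timestamps: Optional[List[dict]]

timestamp_acumulator: List[dict] = []  # spelling kept to match the identifier in the diff

def flush_accumulator(chunk: Chunk) -> List[dict]:
    """Prepend any accumulated timestamps onto the chunk, tolerating None."""
    global timestamp_acumulator
    if chunk.word_timestamps is not None:
        chunk.word_timestamps = timestamp_acumulator + chunk.word_timestamps
    else:
        chunk.word_timestamps = []
    timestamp_acumulator = []
    return chunk.word_timestamps

print(flush_accumulator(Chunk(word_timestamps=[{"word": "hi", "start_time": 0.0, "end_time": 0.2}])))
print(flush_accumulator(Chunk(word_timestamps=None)))  # the pre-patch code would raise TypeError here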
@@ -121,7 +121,7 @@ class AudioService:
         is_last_chunk: bool = False,
         trim_audio: bool = True,
         normalizer: AudioNormalizer = None,
-    ) -> Tuple[AudioChunk]:
+    ) -> AudioChunk:
         """Convert audio data to specified format with streaming support

         Args:
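For reference, Tuple[AudioChunk] annotates a one-element tuple rather than a bare AudioChunk, so the old return annotation never described what the method actually returns. A minimal illustration with a placeholder class follows; the real AudioChunk and the method body are not part of this hunk.

from typing import Tuple

class AudioChunk:  # placeholder for the real class in the audio service
    pass

def old_style() -> Tuple[AudioChunk]:  # promises a 1-tuple: (AudioChunk,)
    return (AudioChunk(),)

def new_style() -> AudioChunk:  # promises the chunk itself, as the patched signature does
    return AudioChunk()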