diff --git a/Test copy.py b/Test copy.py index 4ecbc5e..c8e562e 100644 --- a/Test copy.py +++ b/Test copy.py @@ -23,7 +23,7 @@ In conclusion, "Jet Black Heart" by 5 Seconds of Summer is far more than a typic 5 Seconds of Summer, initially perceived as purveyors of upbeat, radio-friendly pop-punk, embarked on a significant artistic evolution with their album Sounds Good Feels Good. Among its tracks, "Jet Black Heart" stands out as a powerful testament to this shift, moving beyond catchy melodies and embracing a darker, more emotionally complex sound. Released in 2015, the song transcends the typical themes of youthful exuberance and romantic angst, instead plunging into the depths of personal turmoil and the corrosive effects of inner darkness on interpersonal relationships. "Jet Black Heart" is not merely a song about heartbreak; it is a raw and vulnerable exploration of internal struggle, self-destructive patterns, and the precarious flicker of hope that persists even in the face of profound emotional chaos.""" -Type="mp3" +Type="wav" response = requests.post( "http://localhost:8880/dev/captioned_speech", json={ @@ -51,12 +51,12 @@ for chunk in response.iter_lines(decode_unicode=True): f.write(chunk_audio) # Print word level timestamps -last3=chunk_json["timestamps"][-3] +last_chunks={"start_time":chunk_json["timestamps"][-10]["start_time"],"end_time":chunk_json["timestamps"][-3]["end_time"],"word":" ".join([X["word"] for X in chunk_json["timestamps"][-10:-3]])} -print(f"CUTTING TO {last3['word']}") +print(f"CUTTING TO {last_chunks['word']}") audioseg=pydub.AudioSegment.from_file(f"outputstream.{Type}",format=Type) -audioseg=audioseg[last3["start_time"]*1000:last3["end_time"] * 1000] +audioseg=audioseg[last_chunks["start_time"]*1000:last_chunks["end_time"] * 1000] audioseg.export(f"outputstreamcut.{Type}",format=Type) diff --git a/api/src/inference/kokoro_v1.py b/api/src/inference/kokoro_v1.py index 419ade7..cc90023 100644 --- a/api/src/inference/kokoro_v1.py +++ b/api/src/inference/kokoro_v1.py @@ -259,10 +259,6 @@ class KokoroV1(BaseModelBackend): ) if result.pred_dur is not None: try: - # Join timestamps for this chunk's tokens - KPipeline.join_timestamps( - result.tokens, result.pred_dur - ) # Add timestamps with offset for token in result.tokens: diff --git a/api/src/routers/development.py b/api/src/routers/development.py index 7fbcc56..569ae25 100644 --- a/api/src/routers/development.py +++ b/api/src/routers/development.py @@ -254,7 +254,10 @@ async def create_captioned_speech( base64_chunk= base64.b64encode(chunk_data.output).decode("utf-8") # Add any chunks that may be in the acumulator into the return word_timestamps - chunk_data.word_timestamps=timestamp_acumulator + chunk_data.word_timestamps + if chunk_data.word_timestamps != None: + chunk_data.word_timestamps = timestamp_acumulator + chunk_data.word_timestamps + else: + chunk_data.word_timestamps = [] timestamp_acumulator=[] yield CaptionedSpeechResponse(audio=base64_chunk,audio_format=content_type,timestamps=chunk_data.word_timestamps) diff --git a/api/src/services/audio.py b/api/src/services/audio.py index 0c75224..d1b412e 100644 --- a/api/src/services/audio.py +++ b/api/src/services/audio.py @@ -121,7 +121,7 @@ class AudioService: is_last_chunk: bool = False, trim_audio: bool = True, normalizer: AudioNormalizer = None, - ) -> Tuple[AudioChunk]: + ) -> AudioChunk: """Convert audio data to specified format with streaming support Args: