Fix streaming a wav file with captions not returning any captions (this is only a problem because wav streaming does not actually work)

2025-08-05 16:48:53 +00:00 · 2025-02-16 16:49:33 -05:00 · 2025-02-16 16:49:33 -05:00 · cb22aab239
commit cb22aab239
parent e3dc959775
1 changed files with 22 additions and 1 deletions
--- a/api/src/routers/development.py
+++ b/api/src/routers/development.py
@ -210,11 +210,21 @@ async def create_captioned_speech(
                    try:
                        # Write chunks to temp file and stream
                        async for chunk_data in generator:
                            # The timestamp acumulator is only used when word level time stamps are generated but no audio is returned.
                            timestamp_acumulator=[]
                            if chunk_data.output:  # Skip empty chunks
                                await temp_writer.write(chunk_data.output)
                                base64_chunk= base64.b64encode(chunk_data.output).decode("utf-8")
                                # Add any chunks that may be in the acumulator into the return word_timestamps
                                chunk_data.word_timestamps=timestamp_acumulator + chunk_data.word_timestamps
                                timestamp_acumulator=[]
                                yield CaptionedSpeechResponse(audio=base64_chunk,audio_format=content_type,timestamps=chunk_data.word_timestamps)
                            else:
                                if chunk_data.word_timestamps is not None and len(chunk_data.word_timestamps) > 0:
                                    timestamp_acumulator+=chunk_data.word_timestamps
                        # Finalize the temp file
                        await temp_writer.finalize()
@ -234,13 +244,24 @@ async def create_captioned_speech(
            async def single_output():
                """Stream captioned audio chunks taken from ``generator``.

                Each chunk that carries audio is base64-encoded and yielded as a
                ``CaptionedSpeechResponse`` together with its word timestamps.
                Chunks that carry timestamps but no audio are not yielded; their
                timestamps are buffered and prepended to the timestamps of the
                next chunk that does carry audio.

                Yields:
                    CaptionedSpeechResponse: one response per non-empty audio chunk.

                Raises:
                    Exception: any error raised while streaming is logged and
                        re-raised unchanged.
                """
                try:
                    # Only used when word-level timestamps are produced by a chunk
                    # that returns no audio.
                    timestamp_accumulator = []
                    # Stream chunks
                    async for chunk_data in generator:
                        if chunk_data.output:  # Skip empty chunks
                            # Encode the chunk bytes into base 64
                            base64_chunk = base64.b64encode(chunk_data.output).decode("utf-8")
                            # Prepend buffered timestamps. word_timestamps may be None
                            # (the branch below guards against it), so fall back to an
                            # empty list instead of raising TypeError on list + None.
                            chunk_data.word_timestamps = timestamp_accumulator + (chunk_data.word_timestamps or [])
                            timestamp_accumulator = []
                            yield CaptionedSpeechResponse(audio=base64_chunk,audio_format=content_type,timestamps=chunk_data.word_timestamps)
                        elif chunk_data.word_timestamps:
                            # No audio yet: keep the timestamps for the next audio chunk.
                            timestamp_accumulator += chunk_data.word_timestamps
                    # NOTE(review): timestamps still buffered when the generator ends
                    # (no audio chunk followed them) are dropped - confirm this is
                    # intended.
                except Exception as e:
                    logger.error(f"Error in single output streaming: {e}")
                    raise