import base64 import json import pydub import requests text = """Running on localhost:7860""" Type = "wav" response = requests.post( "http://localhost:8880/dev/captioned_speech", json={ "model": "kokoro", "input": text, "voice": "af_heart+af_sky", "speed": 1.0, "response_format": Type, "stream": True, }, stream=True, ) f = open(f"outputstream.{Type}", "wb") for chunk in response.iter_lines(decode_unicode=True): if chunk: temp_json = json.loads(chunk) if temp_json["timestamps"] != []: chunk_json = temp_json # Decode base 64 stream to bytes chunk_audio = base64.b64decode(temp_json["audio"].encode("utf-8")) # Process streaming chunks f.write(chunk_audio) # Print word level timestamps print(chunk_json["timestamps"])