# Kokoro-FastAPI/examples/stream_tts_playback.py

#!/usr/bin/env python3
import os
import time
import wave

import numpy as np
import requests
import sounddevice as sd


def play_streaming_tts(text: str, output_file: str = None, voice: str = "af_sky"):
    """Stream TTS audio from the speech API and play it back in real time.

    Posts ``text`` to ``http://localhost:8880/v1/audio/speech`` requesting
    streamed PCM, writes the received 16-bit mono samples to a sounddevice
    output stream as they arrive, and prints timing/throughput statistics.

    Args:
        text: Text to synthesize.
        output_file: Optional path. When given, the received audio is also
            saved there as a mono 16-bit WAV file at 24 kHz.
        voice: Voice identifier sent to the API.

    Request and streaming errors are reported on stdout instead of being
    raised. Errors from opening or starting the audio device propagate to
    the caller. Once started, the output stream is always stopped and
    closed, including when playback is interrupted.
    """
    print("\nStarting TTS stream request...")
    start_time = time.time()

    sample_rate = 24000  # Known sample rate for Kokoro
    sample_width = 2  # Bytes per 16-bit PCM sample
    chunk_count = 0
    total_bytes = 0
    all_audio_data = bytearray()  # Whole PCM samples received so far
    pending = b""  # Trailing partial sample carried over to the next chunk

    # Start sounddevice stream with buffer
    stream = sd.OutputStream(
        samplerate=sample_rate,
        channels=1,
        dtype=np.int16,
        blocksize=1024,  # Buffer size in samples
        latency="low",  # Request low latency
    )
    stream.start()

    try:
        # The context manager releases the connection even if streaming
        # is aborted part-way through.
        with requests.post(
            "http://localhost:8880/v1/audio/speech",
            json={
                "model": "kokoro",
                "input": text,
                "voice": voice,
                "response_format": "pcm",
                "stream": True,
            },
            stream=True,
            timeout=1800,
        ) as response:
            response.raise_for_status()
            print(
                f"Request started successfully after {time.time() - start_time:.2f}s"
            )

            # Small chunks keep latency low: 512 bytes = 256 samples at 16-bit
            for chunk in response.iter_content(chunk_size=512):
                if not chunk:
                    continue

                chunk_count += 1
                total_bytes += len(chunk)

                if chunk_count == 1:
                    print(
                        f"\nReceived first chunk after {time.time() - start_time:.2f}s"
                    )
                    print(f"First chunk size: {len(chunk)} bytes")

                # Network chunk boundaries need not fall on sample boundaries;
                # np.frombuffer rejects buffers that are not a multiple of the
                # sample width, so hold back any dangling byte for next time.
                data = pending + chunk
                usable = len(data) - len(data) % sample_width
                pending = data[usable:]
                if usable:
                    frames = data[:usable]
                    stream.write(np.frombuffer(frames, dtype=np.int16))
                    all_audio_data.extend(frames)

                # Log progress every 100 chunks
                if chunk_count % 100 == 0:
                    elapsed = time.time() - start_time
                    print(
                        f"Progress: {chunk_count} chunks, {total_bytes/1024:.1f}KB received, {elapsed:.1f}s elapsed"
                    )

        # Final stats
        total_time = time.time() - start_time
        total_kb = total_bytes / 1024
        # Guard against a zero elapsed time on coarse clocks.
        average_speed = total_kb / total_time if total_time > 0 else 0.0
        print("\nStream complete:")
        print(f"Total chunks: {chunk_count}")
        print(f"Total data: {total_kb:.1f}KB")
        print(f"Total time: {total_time:.2f}s")
        print(f"Average speed: {average_speed:.1f}KB/s")

        # Save as WAV file
        if output_file:
            print(f"\nWriting audio to {output_file}")
            with wave.open(output_file, "wb") as wav_file:
                wav_file.setnchannels(1)  # Mono
                wav_file.setsampwidth(sample_width)  # 16-bit samples
                wav_file.setframerate(sample_rate)
                wav_file.writeframes(all_audio_data)
            print(f"Saved {len(all_audio_data)} bytes of audio data")

    except requests.exceptions.ConnectionError as e:
        print(f"Connection error - Is the server running? Error: {str(e)}")
    except Exception as e:
        print(f"Error during streaming: {str(e)}")
    finally:
        # Single cleanup path; also runs on KeyboardInterrupt.
        stream.stop()
        stream.close()


def main():
    """Read the bundled H. G. Wells sample text and stream it through TTS.

    The text file and the ``output.wav`` destination are both resolved
    relative to the directory containing this script.
    """
    here = os.path.dirname(os.path.abspath(__file__))
    sample_text_path = os.path.join(
        here, "assorted_checks/benchmarks/the_time_machine_hg_wells.txt"
    )
    wav_path = os.path.join(here, "output.wav")

    with open(sample_text_path, "r", encoding="utf-8") as handle:
        paragraphs = handle.read().split("\n\n")

    # Use the second and third blank-line-separated paragraphs as the sample
    text = " ".join(paragraphs[1:3])

    print("\nStarting TTS stream playback...")
    print(f"Text length: {len(text)} characters")
    print("\nFirst 100 characters:")
    print(f"{text[:100]}...")

    play_streaming_tts(text, output_file=wav_path)


# Run the demo only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()