Skip to main content
def tts_sse_with_phoneme_timestamps(client: Cartesia) -> None:
    """Stream speech over SSE, print phoneme timings, and save the audio.

    Requests synthesis of "Hello, world!" with phoneme timestamps enabled,
    then consumes the resulting event stream:

    * ``phoneme_timestamps`` events are printed (phonemes, starts, ends);
    * ``chunk`` events have their audio bytes appended to the output file;
    * a ``done`` event stops consumption;
    * an ``error`` event aborts with an exception.

    The audio is requested as raw ``pcm_s16le`` at 44100 Hz and written to
    a timestamp-named ``.pcm`` file (relative path, so it lands in the
    current working directory). A playback hint is printed at the end.

    Args:
        client: Client object exposing ``tts.generate_sse``.

    Raises:
        Exception: If the stream yields an ``error`` event; the message is
            built from the event's ``title`` and ``message``.
    """
    from datetime import datetime

    events = client.tts.generate_sse(
        model_id="sonic-latest",
        transcript="Hello, world!",
        voice={"mode": "id", "id": "6ccbfb76-1fc6-48f7-b71d-91ac6298247b"},
        output_format={"container": "raw", "encoding": "pcm_s16le", "sample_rate": 44100},
        language="en",
        add_phoneme_timestamps=True,
    )

    # Name the output after the current local time so repeated runs do not
    # overwrite each other's audio.
    stamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    filename = f"tts_sse_with_phoneme_timestamps_{stamp}.pcm"

    with open(filename, "wb") as pcm_out:
        for event in events:
            kind = event.type

            if kind == "done":
                break

            if kind == "error":
                raise Exception(f"{event.title}: {event.message}")

            if kind == "chunk":
                # Skip chunks that carry no audio payload.
                if event.audio:
                    pcm_out.write(event.audio)
                continue

            if kind == "phoneme_timestamps":
                pt = event.phoneme_timestamps
                if pt:
                    print(f"Phonemes: {pt.phonemes}, Starts: {pt.start}, Ends: {pt.end}")

    print(f"Saved audio to {filename}")
    print(f"Play with: ffplay -f s16le -ar 44100 {filename}")
From cartesia-python/examples/examples.py:126

Run this example

git clone --branch v3.2.0 https://github.com/cartesia-ai/cartesia-python
cd cartesia-python
uv sync
CARTESIA_API_KEY=YOUR_KEY uv run examples/examples.py tts_sse_with_phoneme_timestamps