Skip to main content

Documentation Index

Fetch the complete documentation index at: https://docs.cartesia.ai/llms.txt

Use this file to discover all available pages before exploring further.

The model signals when a user turn starts and ends, so your agent can simply react to those turn events. As a result, your agent pipeline does not need to run its own Voice Activity Detection (VAD) to decide whether the user has finished speaking.
def stt_auto_finalize_websocket(client: Cartesia, *args: str) -> None:
    """Realtime STT with native turn detection (recommended for voice agents).

    The model signals when a user turn starts and ends, so your agent reacts
    to events rather than running its own VAD.

    Pass a mono uncompressed PCM WAV file (16-bit or 32-bit) as an argument,
    or call with no args to synthesize sample audio via TTS.

    Args:
        client: Cartesia client used to open the STT websocket and, when no
            WAV file is given, to generate sample audio with TTS.
        *args: Optional; ``args[0]`` is the path of the WAV file to
            transcribe. Any further arguments are ignored.

    Prints an error and exits the process with status 1 when the WAV file is
    not mono, is compressed, or has a sample width other than 2 or 4 bytes.
    """
    import sys
    import time
    import wave

    from cartesia.types import STTEncoding, RawOutputFormatParam

    encoding: STTEncoding
    chunks: list[bytes]
    if args:
        with wave.open(args[0], "rb") as wf:
            if wf.getnchannels() != 1:
                print(f"Error: WAV must be mono, got {wf.getnchannels()} channels.")
                sys.exit(1)
            if wf.getcomptype() != "NONE":
                print(f"Error: WAV must be uncompressed PCM, got {wf.getcomptype()!r}.")
                sys.exit(1)
            # Map the WAV sample width (bytes per sample) to the STT encoding.
            sample_width = wf.getsampwidth()
            if sample_width == 2:
                encoding = "pcm_s16le"
            elif sample_width == 4:
                encoding = "pcm_s32le"
            else:
                print(f"Error: unsupported sample width {sample_width} bytes (expected 2 or 4).")
                sys.exit(1)
            sample_rate = wf.getframerate()
            chunks = []
            while True:
                data = wf.readframes(sample_rate // 10)  # 100ms per chunk
                if not data:
                    break
                chunks.append(data)
    else:
        # Single source of truth for the audio format: the same dict is sent
        # to TTS and supplies the encoding / sample rate used for STT below.
        output_format: RawOutputFormatParam = {"container": "raw", "encoding": "pcm_s16le", "sample_rate": 16000}
        encoding = output_format["encoding"]
        sample_rate = output_format["sample_rate"]
        generation_transcript = "Hello, world! The quick brown fox jumps over the lazy dog."
        print(f"No WAV file provided — synthesizing audio with TTS: {generation_transcript!r}")
        audio = client.tts.generate(
            model_id="sonic-latest",
            transcript=generation_transcript,
            voice={"mode": "id", "id": "6ccbfb76-1fc6-48f7-b71d-91ac6298247b"},
            output_format=output_format,
            language="en",
        ).read()
        chunk_bytes = (sample_rate * 2) // 10  # 100ms of pcm_s16le (2 bytes/sample)
        chunks = [audio[i : i + chunk_bytes] for i in range(0, len(audio), chunk_bytes)]

    # Concatenate transcripts from all turn.end events to get the full transcript
    # Do not strip or add whitespace!
    full_transcript = ""

    with client.stt.auto_finalize.websocket(
        encoding=encoding,
        model="ink-2",
        sample_rate=sample_rate,
    ) as connection:
        for chunk in chunks:
            connection.send_raw(chunk)
            time.sleep(0.1)  # each chunk is 100ms of audio — pace sends to match real time

        # Flush remaining audio and close the session cleanly.
        connection.send({"type": "close"})

        # Event labels are padded to a common width so the "|" separators
        # line up in the printed log.
        for event in connection:
            if event.type == "connected":
                print(f"connected      | request_id={event.request_id}")
            elif event.type == "turn.start":
                print("turn.start     |")
            elif event.type == "turn.update":
                # event.transcript is cumulative within a turn.
                print(f"turn.update    | {event.transcript}")
            elif event.type == "turn.eager_end":
                print(f"turn.eager_end | {event.transcript}")
            elif event.type == "turn.resume":
                print("turn.resume    |")
            elif event.type == "turn.end":
                print(f"turn.end       | {event.transcript}")
                full_transcript += event.transcript
            elif event.type == "error":
                print(f"error          | {event.message}")

        print(f"\nFull transcript: {full_transcript!r}")
From cartesia-python/examples/examples.py:653

Run this example

git clone --branch v3.2.0 https://github.com/cartesia-ai/cartesia-python
cd cartesia-python
uv sync
CARTESIA_API_KEY=YOUR_KEY uv run examples/examples.py stt_auto_finalize_websocket