Skip to main content

Documentation Index

Fetch the complete documentation index at: https://docs.cartesia.ai/llms.txt

Use this file to discover all available pages before exploring further.

You tell the model when the user is done speaking by sending finalize. This can come from a user releasing a push-to-talk button, or from your own voice activity detection (VAD). Transcript events are deltas — concatenate text from is_final events (without stripping whitespace) to assemble the full transcript.
This example generates complete sentences and calls finalize after each one. In a real voice agent, you wouldn’t know where sentence boundaries are ahead of time and would only call finalize when the user is done speaking.
def stt_manual_finalize_websocket(client: Cartesia, *args: str) -> None:
    """Demonstrate realtime STT where the caller decides when to finalize.

    This mode is the recommended one for push-to-talk apps: you control when
    the model emits transcripts by sending the `finalize` command.

    The demo synthesizes speech with TTS, streams the resulting PCM into the
    STT WebSocket as 100ms chunks paced to match real time, and sends
    `finalize` after each utterance so the buffered audio gets transcribed.

    Transcript events are deltas. The full transcript is rebuilt by
    concatenating the `text` of every `is_final` event exactly as received;
    whitespace is not stripped.

    Args:
        client: Cartesia client used for both the TTS request and the STT
            WebSocket.
        *args: Pieces of the transcript to synthesize, joined with single
            spaces. When omitted, a default sample transcript is used.
    """
    import re
    import time
    from typing_extensions import Literal

    sample_rate: Literal[16000] = 16000
    encoding: Literal["pcm_s16le"] = "pcm_s16le"

    if args:
        spoken_text = " ".join(args)
    else:
        spoken_text = "The quick brown fox jumps over the lazy dog. Sandy sells seashells on the sea shore."
    print(f"Generating audio for: {spoken_text!r}")

    # pcm_s16le is 2 bytes per sample, so 100ms of audio is a tenth of
    # (sample_rate * 2) bytes.
    bytes_per_chunk = (sample_rate * 2) // 10
    chunk_seconds = 0.1

    # Event types whose console line is a fixed label with no payload.
    fixed_labels = {
        "flush_done": "flush_done |",
        "done": "done       |",
    }

    contains_word = re.compile(r"\w").search

    with client.stt.manual_finalize.websocket(
        encoding=encoding,
        model="ink-2",
        sample_rate=sample_rate,
    ) as ws:

        def synthesize_and_stream(sentence: str) -> None:
            """Synthesize one utterance, stream it in chunks, then finalize."""
            voice = {"mode": "id", "id": "6ccbfb76-1fc6-48f7-b71d-91ac6298247b"}
            raw_format = {
                "container": "raw",
                "encoding": encoding,
                "sample_rate": sample_rate,
            }
            response = client.tts.generate(
                model_id="sonic-latest",
                transcript=sentence,
                voice=voice,
                output_format=raw_format,
                language="en",
            )
            pcm = response.read()

            for start in range(0, len(pcm), bytes_per_chunk):
                ws.send_raw(pcm[start : start + bytes_per_chunk])
                # Each chunk holds 100ms of audio; sleeping that long after
                # every send paces the upload to match real time.
                time.sleep(chunk_seconds)

            # Ask the model to transcribe whatever audio it has buffered.
            ws.send("finalize")

        # Full stops stand in for the pauses between user utterances. A real
        # app would run voice activity detection (VAD) on the user's audio
        # stream to decide when to send `finalize`.
        for piece in spoken_text.split("."):
            if not contains_word(piece):
                # Skip fragments with no word characters (e.g. the empty
                # string left after a trailing full stop).
                continue
            synthesize_and_stream(piece)

        # Flush remaining audio, get a `done` ack, then close the socket.
        ws.send("close")

        assembled = ""
        for event in ws:
            kind = event.type
            if kind == "transcript":
                # Only final deltas are printed and added to the transcript.
                if not event.is_final:
                    continue
                print(f"transcript | {event.text}")
                assembled += event.text
            elif kind in fixed_labels:
                print(fixed_labels[kind])
            elif kind == "error":
                print(f"error    | {event.message}")

        print(f"\nFull transcript: {assembled!r}")
From cartesia-python/examples/examples.py:746

Run this example

git clone --branch v3.2.0 https://github.com/cartesia-ai/cartesia-python
cd cartesia-python
uv sync
CARTESIA_API_KEY=YOUR_KEY uv run examples/examples.py stt_manual_finalize_websocket