Realtime STT (Manual)

You tell the model when the user is done speaking by sending finalize. This can come from a user releasing a push-to-talk button, or from your own voice activity detection (VAD). Transcript events are deltas — concatenate text from is_final events (without stripping whitespace) to assemble the full transcript.

Python
Python (Async)
TypeScript
TypeScript (Browser)

Generates complete sentences and calls finalize after each one. In a real voice agent, you wouldn’t know where sentence boundaries are ahead of time and would only call finalize when the user is done speaking.

def stt_manual_finalize_websocket(client: Cartesia, *args: str) -> None:
    """Transcribe synthesized speech over the manual-finalize STT WebSocket.

    Push-to-talk style demo: the caller, not the model, decides when an
    utterance is over by sending `finalize`.

    The text to speak is the positional arguments joined with single spaces;
    with no arguments a built-in two-sentence sample is used. The text is split
    on "." and every piece that contains at least one word character becomes
    one utterance. For each utterance the function:

    1. synthesizes raw 16 kHz `pcm_s16le` audio with TTS,
    2. sends it to the STT socket in 100ms chunks, sleeping 100ms after each
       chunk so the upload is paced like live audio,
    3. sends `finalize` to trigger transcription of the buffered audio.

    After the last utterance it sends `close` and only then reads the
    connection's events. Transcript events are deltas: the `text` of every
    `is_final` event is printed and appended verbatim (no whitespace
    stripping) to the full transcript, which is printed at the end.

    Args:
        client: Cartesia client used for both the TTS request and the STT
            WebSocket.
        *args: Words or phrases to synthesize; joined with spaces.
    """
    import re
    import time
    from typing_extensions import Literal

    encoding: Literal["pcm_s16le"] = "pcm_s16le"
    sample_rate: Literal[16000] = 16000

    fallback_text = "The quick brown fox jumps over the lazy dog. Sandy sells seashells on the sea shore."
    input_text = " ".join(args) if args else fallback_text
    print(f"Generating audio for: {input_text!r}")

    # Events that carry no payload worth printing, mapped to their log line.
    ack_lines = {
        "flush_done": "flush_done |",
        "done": "done       |",
    }

    with client.stt.manual_finalize.websocket(
        encoding=encoding,
        model="ink-2",
        sample_rate=sample_rate,
    ) as connection:
        # Full stops stand in for "the user stopped talking". A real app would
        # use voice activity detection (VAD) or a push-to-talk button release
        # to decide when to send `finalize`.
        for sentence in input_text.split("."):
            if not re.search(r"\w", sentence):
                continue  # nothing speakable in this piece

            tts_response = client.tts.generate(
                model_id="sonic-latest",
                transcript=sentence,
                voice={"mode": "id", "id": "6ccbfb76-1fc6-48f7-b71d-91ac6298247b"},
                output_format={"container": "raw", "encoding": encoding, "sample_rate": sample_rate},
                language="en",
            )
            pcm = tts_response.read()

            # 100ms of pcm_s16le audio: 2 bytes per sample, a tenth of a second.
            step = (sample_rate * 2) // 10
            for start in range(0, len(pcm), step):
                connection.send_raw(pcm[start : start + step])
                time.sleep(0.1)  # pace sends so they match real time

            # Ask for a transcript of everything buffered so far.
            connection.send("finalize")

        # Flush remaining audio, get a `done` ack, then close the socket.
        connection.send("close")

        full_transcript = ""
        for event in connection:
            kind = event.type
            if kind == "transcript" and event.is_final:
                print(f"transcript | {event.text}")
                full_transcript += event.text
            elif kind in ack_lines:
                print(ack_lines[kind])
            elif kind == "error":
                print(f"error    | {event.message}")

        print(f"\nFull transcript: {full_transcript!r}")

From cartesia-python/examples/examples.py:746

async def stt_manual_finalize_websocket_async(client: AsyncCartesia, *args: str) -> None:
    """Async demo of manual-finalize realtime STT (push-to-talk style).

    The text to speak is the positional arguments joined with single spaces;
    with no arguments a built-in two-sentence sample is used. The text is split
    on "." and each piece containing a word character is treated as one user
    utterance.

    Two coroutines run concurrently via ``asyncio.gather``:

    * the uploader synthesizes each utterance with TTS (raw 16 kHz
      `pcm_s16le`), sends it to the STT socket in 100ms chunks with a 100ms
      pause after each chunk, sends `finalize` after the utterance, and sends
      `close` once all utterances are done;
    * the listener iterates the connection's events, printing each final
      transcript delta and appending its `text` verbatim (no whitespace
      stripping) to the full transcript.

    The assembled transcript is printed after the connection context exits.

    Args:
        client: Async Cartesia client used for both the TTS request and the
            STT WebSocket.
        *args: Words or phrases to synthesize; joined with spaces.
    """
    import re
    import asyncio
    from typing_extensions import Literal

    encoding: Literal["pcm_s16le"] = "pcm_s16le"
    sample_rate: Literal[16000] = 16000

    fallback_text = "The quick brown fox jumps over the lazy dog. Sandy sells seashells on the sea shore."
    input_text = " ".join(args) if args else fallback_text
    print(f"Generating audio for: {input_text!r}")

    full_transcript = ""

    async with client.stt.manual_finalize.websocket(
        encoding=encoding,
        model="ink-2",
        sample_rate=sample_rate,
    ) as connection:

        async def upload_utterances() -> None:
            # Full stops stand in for "the user stopped talking". A real app
            # would use voice activity detection (VAD) on the live audio to
            # decide when to send `finalize`.
            for sentence in input_text.split("."):
                if not re.search(r"\w", sentence):
                    continue  # nothing speakable in this piece

                tts_response = await client.tts.generate(
                    model_id="sonic-latest",
                    transcript=sentence,
                    voice={"mode": "id", "id": "6ccbfb76-1fc6-48f7-b71d-91ac6298247b"},
                    output_format={"container": "raw", "encoding": encoding, "sample_rate": sample_rate},
                    language="en",
                )
                pcm = await tts_response.read()

                # 100ms of pcm_s16le audio: 2 bytes per sample, a tenth of a second.
                step = (sample_rate * 2) // 10
                for start in range(0, len(pcm), step):
                    await connection.send_raw(pcm[start : start + step])
                    await asyncio.sleep(0.1)  # pace sends so they match real time

                # Ask for a transcript of everything buffered so far.
                await connection.send("finalize")

            # Flush remaining audio, get a `done` ack, then close the socket.
            await connection.send("close")

        async def collect_events() -> None:
            nonlocal full_transcript
            # Events that carry no payload worth printing, mapped to their log line.
            ack_lines = {
                "flush_done": "flush_done |",
                "done": "done       |",
            }
            async for event in connection:
                kind = event.type
                if kind == "transcript" and event.is_final:
                    print(f"transcript | {event.text}")
                    full_transcript += event.text
                elif kind in ack_lines:
                    print(ack_lines[kind])
                elif kind == "error":
                    print(f"error    | {event.message}")

        await asyncio.gather(upload_utterances(), collect_events())

    print(f"\nFull transcript: {full_transcript!r}")

From cartesia-python/examples/async_examples.py:605

/**
 * Realtime STT with manual finalize (Node): recommended for push-to-talk apps.
 *
 * Synthesizes each sentence of the input with TTS, pushes the audio into the
 * STT WebSocket via `sendRealtimeAudioChunks`, and sends `finalize` after each
 * sentence to trigger transcription of the buffered audio. Transcript events
 * are deltas: the `text` of every `is_final` message is concatenated verbatim
 * to build the full transcript, which is logged at the end.
 *
 * @param client Cartesia client used for both the TTS request and the STT socket.
 * @param args   Words to synthesize (joined with spaces); when empty, a default
 *               two-sentence sample is used.
 * @throws Rethrows any error raised while generating or sending audio, after
 *         the socket has been asked to close and the event stream has ended.
 */
async function sttManualFinalizeWebsocket(
  client: Cartesia,
  args: string[],
): Promise<void> {
  const input =
    args.length > 0
      ? args.join(" ")
      : "The quick brown fox jumps over the lazy dog. Sandy sells seashells on the sea shore.";
  const encoding = "pcm_s16le";
  const sampleRate = 16000;

  console.log(`Generating audio for: ${JSON.stringify(input)}`);

  const ws = client.stt.manualFinalize.websocket({
    model: "ink-2",
    encoding,
    sample_rate: sampleRate,
  });
  ws.on("error", (err) => console.error("WS error:", err.message));

  const generateAudioAndPushToSTT = async (
    utterance: string,
  ): Promise<void> => {
    const ttsResponse = await client.tts.generate({
      model_id: "sonic-latest",
      transcript: utterance,
      voice: { mode: "id", id: "6ccbfb76-1fc6-48f7-b71d-91ac6298247b" },
      output_format: { container: "raw", encoding, sample_rate: sampleRate },
      language: "en",
    });
    if (!ttsResponse.body) throw new Error("TTS response had no body");
    await sendRealtimeAudioChunks(
      ttsResponse.body.getReader(),
      (chunk) => ws.sendRaw(chunk),
      sampleRate,
      encoding,
    );
    // Triggers transcription of buffered audio.
    ws.send("finalize");
  };

  const sender = (async () => {
    try {
      // Split the transcript on full stops to simulate multiple user utterances.
      // In reality, you would run voice activity detection (VAD) on the user
      // audio stream to decide when to send the "finalize" command.
      // `test` returns a plain boolean, so no match array or `g` flag is needed.
      for (const utterance of input.split(".").filter((u) => /\w/.test(u))) {
        await generateAudioAndPushToSTT(utterance);
      }
    } finally {
      // Flushes remaining audio, sends a `done` ack, then closes the socket.
      // Sent even when an utterance fails: otherwise the event loop below
      // would wait on a socket that nobody ever asks to close.
      ws.send("close");
    }
  })();
  // Mark the promise as handled so a sender failure is not reported as an
  // unhandled rejection while we are still draining events; the error itself
  // is rethrown by `await sender` below.
  sender.catch(() => {});

  // Transcript chunks are deltas — concatenate is_final chunks to build the
  // full transcript. Do not add or strip whitespace between them.
  let fullTranscript = "";

  for await (const event of ws.stream()) {
    if (event.type === "message") {
      const m = event.message;
      switch (m.type) {
        case "transcript": {
          if (m.is_final) {
            console.log(`transcript | ${m.text}`);
            fullTranscript += m.text;
          }
          break;
        }
        case "flush_done": {
          console.log("flush_done |");
          break;
        }
        case "done": {
          console.log("done       |");
          break;
        }
      }
    } else if (event.type === "error") {
      console.error(`error    | ${event.error.message}`);
    }
  }

  // Surfaces any error from the sender once the stream has ended.
  await sender;
  console.log(`Full transcript: ${JSON.stringify(fullTranscript)}`);
}

From cartesia-js/examples/node_examples.ts:749

Captures microphone audio, then calls finalize to ask the model for a transcript of everything sent so far. In a real push-to-talk UI you would call finalize on button-up; this example fires it after 5 seconds.

/**
 * Realtime STT with manual finalize (browser): push-to-talk style demo.
 *
 * Captures microphone audio through an AudioWorklet, streams it to the STT
 * WebSocket, sends `finalize` after 5 seconds (standing in for a button
 * release) and `close` after 10 seconds. Transcript events are deltas: the
 * `text` of every `is_final` message is concatenated verbatim to build the
 * full transcript, which is logged at the end.
 *
 * All audio resources (timers, graph nodes, microphone tracks, AudioContext)
 * are released in a single `finally`, including when setup itself fails —
 * e.g. the user denies microphone access.
 *
 * @param client Cartesia client used to open the STT WebSocket.
 */
async function sttManualFinalizeWebsocket(client: Cartesia): Promise<void> {
  const audioCtx = new AudioContext();

  // Declared up front so the `finally` block can release whatever was
  // actually acquired before a failure.
  let mediaStream: MediaStream | undefined;
  let source: MediaStreamAudioSourceNode | undefined;
  let capture: AudioWorkletNode | undefined;
  let finalizeTimer: ReturnType<typeof setTimeout> | undefined;
  let closeTimer: ReturnType<typeof setTimeout> | undefined;
  let closed = false;

  // Transcript chunks are deltas — concatenate is_final chunks to build the
  // full transcript. Do not add or strip whitespace between them.
  let fullTranscript = "";

  try {
    // Worklet that forwards the first channel of each input block to the
    // main thread.
    const workletSource = `
    class PCMCapture extends AudioWorkletProcessor {
      process(inputs) {
        const ch = inputs[0]?.[0];
        if (ch) this.port.postMessage(ch);
        return true;
      }
    }
    registerProcessor('pcm-capture', PCMCapture);
  `;
    const workletURL = URL.createObjectURL(
      new Blob([workletSource], { type: "application/javascript" }),
    );
    try {
      await audioCtx.audioWorklet.addModule(workletURL);
    } finally {
      // Revoke even if the module fails to load so the blob URL is not leaked.
      URL.revokeObjectURL(workletURL);
    }

    mediaStream = await navigator.mediaDevices.getUserMedia({
      audio: true,
    });
    source = audioCtx.createMediaStreamSource(mediaStream);
    capture = new AudioWorkletNode(audioCtx, "pcm-capture");

    const ws = client.stt.manualFinalize.websocket({
      model: "ink-2",
      encoding: AUDIO_CONTEXT_ENCODING,
      sample_rate: audioCtx.sampleRate,
    });
    ws.on("error", (err) => console.error(err.message));

    const audioChunks = createFloat32AudioChunker(
      audioCtx.sampleRate,
      (chunk) => ws.sendRaw(chunk),
    );

    capture.port.onmessage = (e) => {
      if (closed) return;
      const floats: Float32Array = e.data;
      audioChunks.append(floats);
    };
    source.connect(capture);

    // Push-to-talk: simulate "release button" after 5s, then close after 10s.
    finalizeTimer = setTimeout(() => {
      if (closed) return;
      audioChunks.flush();
      ws.send("finalize");
    }, 5_000);
    closeTimer = setTimeout(() => {
      if (closed) return;
      closed = true;
      audioChunks.flush();
      ws.send("close");
    }, 10_000);

    for await (const event of ws.stream()) {
      if (event.type === "message") {
        const m = event.message;
        switch (m.type) {
          case "transcript": {
            if (m.is_final) {
              console.log(`transcript | ${m.text}`);
              fullTranscript += m.text;
            }
            break;
          }
          case "flush_done": {
            console.log("flush_done |");
            break;
          }
          case "done": {
            console.log("done       |");
            break;
          }
        }
      } else if (event.type === "error") {
        console.error(`error    | ${event.error.message}`);
      }
    }
  } finally {
    // Stop forwarding audio first, then tear down in acquisition-safe order.
    closed = true;
    clearTimeout(finalizeTimer);
    clearTimeout(closeTimer);
    source?.disconnect();
    capture?.disconnect();
    mediaStream?.getTracks().forEach((t) => t.stop());
    await audioCtx.close();
  }

  console.log(`Full transcript: ${JSON.stringify(fullTranscript)}`);
}

From cartesia-js/examples/browser_examples.ts:386

Run this example

Python
Python (Async)
TypeScript
TypeScript (Browser)

# Fetch the Python SDK at v3.2.0; the example lives in examples/examples.py.
git clone --branch v3.2.0 https://github.com/cartesia-ai/cartesia-python
cd cartesia-python
# Set up the project environment with uv.
uv sync
# Replace YOUR_KEY with your Cartesia API key; the last argument names the example to run.
CARTESIA_API_KEY=YOUR_KEY uv run examples/examples.py stt_manual_finalize_websocket

# Fetch the Python SDK at v3.2.0; the async example lives in examples/async_examples.py.
git clone --branch v3.2.0 https://github.com/cartesia-ai/cartesia-python
cd cartesia-python
# Set up the project environment with uv.
uv sync
# Replace YOUR_KEY with your Cartesia API key; the last argument names the example to run.
CARTESIA_API_KEY=YOUR_KEY uv run examples/async_examples.py stt_manual_finalize_websocket_async

# Fetch the JavaScript/TypeScript SDK at v3.2.0; the example lives in examples/node_examples.ts.
git clone --branch v3.2.0 https://github.com/cartesia-ai/cartesia-js
cd cartesia-js
# Install dependencies with pnpm.
pnpm i
# Replace YOUR_KEY with your Cartesia API key; the last argument names the example to run.
CARTESIA_API_KEY=YOUR_KEY pnpm tsn examples/node_examples.ts sttManualFinalizeWebsocket

STT

TTS Generate

TTS WebSocket

TTS SSE

Voices

Other

Browser

Next.js

Run this example

Run this example

Run this example