Documentation Index
Fetch the complete documentation index at: https://docs.cartesia.ai/llms.txt
Use this file to discover all available pages before exploring further.
You tell the model when the user is done speaking by sending `finalize`. This can come from a user releasing a push-to-talk button, or from your own voice activity detection (VAD).
Transcript events are deltas — concatenate text from is_final events (without stripping whitespace) to assemble the full transcript.
- Python
- Python (Async)
- TypeScript
- TypeScript (Browser)
Generates complete sentences and calls From cartesia-python/examples/examples.py:746
finalize after each one.In a real voice agent, you wouldn’t know where sentence boundaries are ahead of time and would only call finalize when the user is done speaking.def stt_manual_finalize_websocket(client: Cartesia, *args: str) -> None:
"""Realtime STT (manual finalize): recommended for push-to-talk apps.
Generates test audio via TTS, pushes it into the STT WebSocket in real-time
100ms chunks, then sends `finalize` to trigger transcription of the buffered
audio.
You control when the model emits transcripts by sending `finalize`.
Transcript events are deltas — concatenate `text` from `is_final` events
(without stripping whitespace) to assemble the full transcript.
Pass the transcript to synthesize as arguments, or call with no args to use
a default sample transcript.
"""
import re
import time
from typing_extensions import Literal
encoding: Literal["pcm_s16le"] = "pcm_s16le"
sample_rate: Literal[16000] = 16000
input_text = (
" ".join(args)
if args
else "The quick brown fox jumps over the lazy dog. Sandy sells seashells on the sea shore."
)
print(f"Generating audio for: {input_text!r}")
with client.stt.manual_finalize.websocket(
encoding=encoding,
model="ink-2",
sample_rate=sample_rate,
) as connection:
def generate_audio_and_push(utterance: str) -> None:
audio = client.tts.generate(
model_id="sonic-latest",
transcript=utterance,
voice={"mode": "id", "id": "6ccbfb76-1fc6-48f7-b71d-91ac6298247b"},
output_format={"container": "raw", "encoding": encoding, "sample_rate": sample_rate},
language="en",
).read()
chunk_bytes = (sample_rate * 2) // 10 # 100ms of pcm_s16le (2 bytes/sample)
for i in range(0, len(audio), chunk_bytes):
connection.send_raw(audio[i : i + chunk_bytes])
time.sleep(0.1) # each chunk is 100ms of audio — pace sends to match real time
# Triggers transcription of buffered audio.
connection.send("finalize")
# Split the transcript on full stops to simulate multiple user utterances.
# In a real app you would run voice activity detection (VAD) on the user's
# audio stream to decide when to send the `finalize` command.
for utterance in (u for u in input_text.split(".") if re.search(r"\w", u)):
generate_audio_and_push(utterance)
# Flush remaining audio, get a `done` ack, then close the socket.
connection.send("close")
full_transcript = ""
for event in connection:
if event.type == "transcript":
if event.is_final:
print(f"transcript | {event.text}")
full_transcript += event.text
elif event.type == "flush_done":
print("flush_done |")
elif event.type == "done":
print("done |")
elif event.type == "error":
print(f"error | {event.message}")
print(f"\nFull transcript: {full_transcript!r}")
Generates complete sentences and calls From cartesia-python/examples/async_examples.py:605
finalize after each one.In a real voice agent, you wouldn’t know where sentence boundaries are ahead of time and would only call finalize when the user is done speaking.async def stt_manual_finalize_websocket_async(client: AsyncCartesia, *args: str) -> None:
"""Async realtime STT (manual finalize): recommended for push-to-talk apps.
Generates test audio via TTS, pushes it into the STT WebSocket in real-time
100ms chunks, then sends `finalize` to trigger transcription of the buffered
audio. Sends audio and receives events concurrently via ``asyncio.gather``.
You control when the model emits transcripts by sending `finalize`.
Transcript events are deltas — concatenate `text` from `is_final` events
(without stripping whitespace) to assemble the full transcript.
Pass the transcript to synthesize as arguments, or call with no args to use
a default sample transcript.
"""
import re
import asyncio
from typing_extensions import Literal
encoding: Literal["pcm_s16le"] = "pcm_s16le"
sample_rate: Literal[16000] = 16000
input_text = (
" ".join(args)
if args
else "The quick brown fox jumps over the lazy dog. Sandy sells seashells on the sea shore."
)
print(f"Generating audio for: {input_text!r}")
full_transcript = ""
async with client.stt.manual_finalize.websocket(
encoding=encoding,
model="ink-2",
sample_rate=sample_rate,
) as connection:
async def generate_audio_and_push(utterance: str) -> None:
tts_response = await client.tts.generate(
model_id="sonic-latest",
transcript=utterance,
voice={"mode": "id", "id": "6ccbfb76-1fc6-48f7-b71d-91ac6298247b"},
output_format={"container": "raw", "encoding": encoding, "sample_rate": sample_rate},
language="en",
)
audio = await tts_response.read()
chunk_bytes = (sample_rate * 2) // 10 # 100ms of pcm_s16le (2 bytes/sample)
for i in range(0, len(audio), chunk_bytes):
await connection.send_raw(audio[i : i + chunk_bytes])
await asyncio.sleep(0.1) # each chunk is 100ms of audio — pace sends to match real time
# Triggers transcription of buffered audio.
await connection.send("finalize")
async def send_audio() -> None:
# Split the transcript on full stops to simulate multiple user utterances.
# In a real app you would run voice activity detection (VAD) on the user's
# audio stream to decide when to send the `finalize` command.
for utterance in (u for u in input_text.split(".") if re.search(r"\w", u)):
await generate_audio_and_push(utterance)
# Flush remaining audio, get a `done` ack, then close the socket.
await connection.send("close")
async def receive_events() -> None:
nonlocal full_transcript
async for event in connection:
if event.type == "transcript":
if event.is_final:
print(f"transcript | {event.text}")
full_transcript += event.text
elif event.type == "flush_done":
print("flush_done |")
elif event.type == "done":
print("done |")
elif event.type == "error":
print(f"error | {event.message}")
await asyncio.gather(send_audio(), receive_events())
print(f"\nFull transcript: {full_transcript!r}")
Generates complete sentences and calls From cartesia-js/examples/node_examples.ts:749
finalize after each one. In a real voice agent, you wouldn’t know where sentence boundaries are ahead of time and would only call finalize when the user is done speaking.
/**
 * Realtime STT with manual finalize (Node).
 *
 * Synthesizes each sentence of the input with TTS, streams the audio into the
 * STT WebSocket, and sends `finalize` after each sentence. Sending and
 * receiving run concurrently: a detached `sender` promise pushes audio while
 * the main flow consumes `ws.stream()`.
 *
 * @param client Cartesia client used for both the TTS and the STT calls.
 * @param args   Words of the transcript to synthesize, joined with spaces.
 *               When empty, a default sample transcript is used.
 * @throws Re-throws any error raised while generating or sending audio, after
 *         the receive loop has finished.
 */
async function sttManualFinalizeWebsocket(
  client: Cartesia,
  args: string[],
): Promise<void> {
  const input =
    args.length > 0
      ? args.join(" ")
      : "The quick brown fox jumps over the lazy dog. Sandy sells seashells on the sea shore.";
  const encoding = "pcm_s16le";
  const sampleRate = 16000;
  console.log(`Generating audio for: ${JSON.stringify(input)}`);

  const ws = client.stt.manualFinalize.websocket({
    model: "ink-2",
    encoding,
    sample_rate: sampleRate,
  });
  ws.on("error", (err) => console.error("WS error:", err.message));

  const generateAudioAndPushToSTT = async (
    utterance: string,
  ): Promise<void> => {
    const ttsResponse = await client.tts.generate({
      model_id: "sonic-latest",
      transcript: utterance,
      voice: { mode: "id", id: "6ccbfb76-1fc6-48f7-b71d-91ac6298247b" },
      output_format: { container: "raw", encoding, sample_rate: sampleRate },
      language: "en",
    });
    if (!ttsResponse.body) throw new Error("TTS response had no body");
    await sendRealtimeAudioChunks(
      ttsResponse.body.getReader(),
      (chunk) => ws.sendRaw(chunk),
      sampleRate,
      encoding,
    );
    // Triggers transcription of buffered audio.
    ws.send("finalize");
  };

  const sender = (async () => {
    try {
      // Split transcript on fullstops to simulate multiple user utterances
      // In reality, you would run voice activity detection (VAD) on the user audio stream
      // to decide when to send the "finalize" command
      for (const utterance of input.split(".").filter((u) => /\w/.test(u))) {
        await generateAudioAndPushToSTT(utterance);
      }
    } catch (err) {
      // Still ask the server to close so the receive loop below is not left
      // waiting for a `done` that would otherwise never be requested.
      try {
        ws.send("close");
      } catch {
        // Socket is already unusable; surface the original error instead.
      }
      throw err;
    }
    // Flushes remaining audio, sends a `done` ack, then closes the socket.
    ws.send("close");
  })();
  // The receive loop runs before `await sender`, so attach a no-op handler now
  // to keep a failure from being reported as an unhandled rejection. The error
  // is still re-thrown by `await sender` below.
  sender.catch(() => {});

  // Transcript chunks are deltas — concatenate is_final chunks to build the
  // full transcript. Do not add or strip whitespace between them.
  let fullTranscript = "";
  for await (const event of ws.stream()) {
    if (event.type === "message") {
      const m = event.message;
      switch (m.type) {
        case "transcript": {
          if (m.is_final) {
            console.log(`transcript | ${m.text}`);
            fullTranscript += m.text;
          }
          break;
        }
        case "flush_done": {
          console.log("flush_done |");
          break;
        }
        case "done": {
          console.log("done |");
          break;
        }
      }
    } else if (event.type === "error") {
      console.error(`error | ${event.error.message}`);
    }
  }

  await sender;
  console.log(`Full transcript: ${JSON.stringify(fullTranscript)}`);
}
Captures microphone audio, then calls From cartesia-js/examples/browser_examples.ts:386
finalize to ask the model for a transcript of everything sent so far. In a real push-to-talk UI you would call finalize on button-up; this example fires it after 5 seconds.
/**
 * Realtime STT with manual finalize (browser).
 *
 * Captures microphone audio through an AudioWorklet, streams it to the STT
 * WebSocket, sends `finalize` after 5 seconds and `close` after 10 seconds,
 * and logs the transcript assembled from `is_final` events.
 *
 * All audio resources (timers, graph nodes, microphone tracks, AudioContext)
 * are released in a single `finally`, including when worklet loading or the
 * microphone request fails.
 *
 * @param client Cartesia client used to open the STT WebSocket.
 */
async function sttManualFinalizeWebsocket(client: Cartesia): Promise<void> {
  const audioCtx = new AudioContext();

  // Declared up front so the `finally` below can release whatever was
  // acquired, even if setup fails part-way through.
  let mediaStream: MediaStream | undefined;
  let source: MediaStreamAudioSourceNode | undefined;
  let capture: AudioWorkletNode | undefined;
  let finalizeTimer: ReturnType<typeof setTimeout> | undefined;
  let closeTimer: ReturnType<typeof setTimeout> | undefined;
  let closed = false;

  // Transcript chunks are deltas — concatenate is_final chunks to build the
  // full transcript. Do not add or strip whitespace between them.
  let fullTranscript = "";

  try {
    const workletSource = `
      class PCMCapture extends AudioWorkletProcessor {
        process(inputs) {
          const ch = inputs[0]?.[0];
          if (ch) this.port.postMessage(ch);
          return true;
        }
      }
      registerProcessor('pcm-capture', PCMCapture);
    `;
    const workletURL = URL.createObjectURL(
      new Blob([workletSource], { type: "application/javascript" }),
    );
    try {
      await audioCtx.audioWorklet.addModule(workletURL);
    } finally {
      // Revoke the blob URL whether or not the module loaded.
      URL.revokeObjectURL(workletURL);
    }

    mediaStream = await navigator.mediaDevices.getUserMedia({
      audio: true,
    });
    source = audioCtx.createMediaStreamSource(mediaStream);
    capture = new AudioWorkletNode(audioCtx, "pcm-capture");

    const ws = client.stt.manualFinalize.websocket({
      model: "ink-2",
      encoding: AUDIO_CONTEXT_ENCODING,
      sample_rate: audioCtx.sampleRate,
    });
    ws.on("error", (err) => console.error(err.message));

    const audioChunks = createFloat32AudioChunker(audioCtx.sampleRate, (chunk) =>
      ws.sendRaw(chunk),
    );

    capture.port.onmessage = (e) => {
      // Drop samples that arrive after `close` was sent or the stream ended.
      if (closed) return;
      const floats: Float32Array = e.data;
      audioChunks.append(floats);
    };
    source.connect(capture);

    // Push-to-talk: simulate "release button" after 5s, then close after 10s.
    finalizeTimer = setTimeout(() => {
      audioChunks.flush();
      ws.send("finalize");
    }, 5_000);
    closeTimer = setTimeout(() => {
      if (closed) return;
      closed = true;
      audioChunks.flush();
      ws.send("close");
    }, 10_000);

    for await (const event of ws.stream()) {
      if (event.type === "message") {
        const m = event.message;
        switch (m.type) {
          case "transcript": {
            if (m.is_final) {
              console.log(`transcript | ${m.text}`);
              fullTranscript += m.text;
            }
            break;
          }
          case "flush_done": {
            console.log("flush_done |");
            break;
          }
          case "done": {
            console.log("done |");
            break;
          }
        }
      } else if (event.type === "error") {
        console.error(`error | ${event.error.message}`);
      }
    }
  } finally {
    closed = true;
    clearTimeout(finalizeTimer);
    clearTimeout(closeTimer);
    source?.disconnect();
    capture?.disconnect();
    mediaStream?.getTracks().forEach((t) => t.stop());
    await audioCtx.close();
  }

  console.log(`Full transcript: ${JSON.stringify(fullTranscript)}`);
}
Run this example
- Python
- Python (Async)
- TypeScript
- TypeScript (Browser)
git clone --branch v3.2.0 https://github.com/cartesia-ai/cartesia-python
cd cartesia-python
uv sync
CARTESIA_API_KEY=YOUR_KEY uv run examples/examples.py stt_manual_finalize_websocket
git clone --branch v3.2.0 https://github.com/cartesia-ai/cartesia-python
cd cartesia-python
uv sync
CARTESIA_API_KEY=YOUR_KEY uv run examples/async_examples.py stt_manual_finalize_websocket_async
git clone --branch v3.2.0 https://github.com/cartesia-ai/cartesia-js
cd cartesia-js
pnpm i
CARTESIA_API_KEY=YOUR_KEY pnpm tsn examples/node_examples.ts sttManualFinalizeWebsocket
This example runs in the browser. See the Next.js example for a working setup.