メインコンテンツへスキップ
モデルがユーザーのターンの開始と終了を通知するため、エージェントはイベントに反応するだけで済みます。これにより、エージェントのパイプライン側でユーザーが話し終えたかどうかを判定するための Voice Activity Detection (VAD) を独自に走らせる必要がなくなります。
def stt_auto_finalize_websocket(client: Cartesia, *args: str) -> None:
    """Stream audio to the auto-finalize STT WebSocket and print each turn event.

    Realtime STT with native turn detection (recommended for voice agents).
    The model signals when a user turn starts and ends, so your agent reacts
    to events rather than running its own VAD.

    Args:
        client: Cartesia client used for the STT WebSocket and, when no WAV
            file is given, for synthesizing sample audio via TTS.
        *args: Optional. ``args[0]`` is the path to a mono uncompressed PCM
            WAV file (16-bit or 32-bit). With no args, sample audio is
            synthesized via TTS instead. Additional arguments are ignored.

    Raises:
        SystemExit: With status 1 when the WAV file is not mono, is not
            uncompressed PCM, or has a sample width other than 2 or 4 bytes.
    """
    import sys
    import time
    import wave

    from cartesia.types import STTEncoding, RawOutputFormatParam

    encoding: STTEncoding
    chunks: list[bytes]
    if args:
        with wave.open(args[0], "rb") as wf:
            # Validate the file up front: only mono, uncompressed PCM with
            # 2- or 4-byte samples maps onto an encoding used below.
            if wf.getnchannels() != 1:
                print(f"Error: WAV must be mono, got {wf.getnchannels()} channels.")
                sys.exit(1)
            if wf.getcomptype() != "NONE":
                print(f"Error: WAV must be uncompressed PCM, got {wf.getcomptype()!r}.")
                sys.exit(1)
            sample_width = wf.getsampwidth()
            if sample_width == 2:
                encoding = "pcm_s16le"
            elif sample_width == 4:
                encoding = "pcm_s32le"
            else:
                print(f"Error: unsupported sample width {sample_width} bytes (expected 2 or 4).")
                sys.exit(1)
            sample_rate = wf.getframerate()
            frames_per_chunk = sample_rate // 10  # 100ms per chunk
            chunks = []
            while True:
                data = wf.readframes(frames_per_chunk)
                if not data:
                    break
                chunks.append(data)
    else:
        # No input file: synthesize a short sample utterance with TTS, using
        # the same raw PCM format that is later declared to the STT socket.
        output_format: RawOutputFormatParam = {"container": "raw", "encoding": "pcm_s16le", "sample_rate": 16000}
        encoding = output_format["encoding"]
        sample_rate = output_format["sample_rate"]
        generation_transcript = "Hello, world! The quick brown fox jumps over the lazy dog."
        print(f"No WAV file provided — synthesizing audio with TTS: {generation_transcript!r}")
        audio = client.tts.generate(
            model_id="sonic-latest",
            transcript=generation_transcript,
            voice={"mode": "id", "id": "6ccbfb76-1fc6-48f7-b71d-91ac6298247b"},
            output_format=output_format,
            language="en",
        ).read()
        chunk_bytes = (sample_rate * 2) // 10  # 100ms of pcm_s16le (2 bytes/sample)
        chunks = [audio[i : i + chunk_bytes] for i in range(0, len(audio), chunk_bytes)]

    # Concatenate transcripts from all turn.end events to get the full transcript
    # Do not strip or add whitespace!
    full_transcript = ""

    with client.stt.auto_finalize.websocket(
        encoding=encoding,
        model="ink-2",
        sample_rate=sample_rate,
    ) as connection:
        for chunk in chunks:
            connection.send_raw(chunk)
            time.sleep(0.1)  # each chunk is 100ms of audio — pace sends to match real time

        # Flush remaining audio and close the session cleanly.
        connection.send({"type": "close"})

        # Event labels are padded to a common 15-character column so the
        # "|" separators line up in the printed log.
        for event in connection:
            if event.type == "connected":
                print(f"connected      | request_id={event.request_id}")
            elif event.type == "turn.start":
                print("turn.start     |")
            elif event.type == "turn.update":
                # event.transcript is cumulative within a turn.
                print(f"turn.update    | {event.transcript}")
            elif event.type == "turn.eager_end":
                print(f"turn.eager_end | {event.transcript}")
            elif event.type == "turn.resume":
                print("turn.resume    |")
            elif event.type == "turn.end":
                print(f"turn.end       | {event.transcript}")
                full_transcript += event.transcript
            elif event.type == "error":
                print(f"error          | {event.message}")

        print(f"\nFull transcript: {full_transcript!r}")
出典: cartesia-python/examples/examples.py:653

このサンプルを実行する

git clone --branch v3.2.0 https://github.com/cartesia-ai/cartesia-python
cd cartesia-python
uv sync
CARTESIA_API_KEY=YOUR_KEY uv run examples/examples.py stt_auto_finalize_websocket