Skip to main content
def stt_transcribe(client: Cartesia, *args: str) -> None:
    """Transcribe an audio file and print the text with word timestamps.

    Pass exactly two positional arguments -- the path to an audio file and
    its language code (e.g. ``my_audio.wav en``) -- or pass none at all to
    generate a sample WAV via TTS and transcribe that instead.

    Any other number of arguments prints the usage text and exits with
    status 1.
    """
    import datetime

    def generate_sample_wav() -> tuple[str, str]:
        """Synthesize a short sample via TTS, save it as a WAV, and return (path, language)."""
        transcript = "The quick brown fox jumps over the lazy dog."
        language = "en"
        print(f"No audio file provided. Generating a sample for: {transcript!r}")
        response = client.tts.generate(
            model_id="sonic-latest",
            transcript=transcript,
            voice={"mode": "id", "id": "6ccbfb76-1fc6-48f7-b71d-91ac6298247b"},
            output_format={"container": "wav", "encoding": "pcm_s16le", "sample_rate": 16000},
            language=language,
        )
        # Timestamped file name so repeated runs do not overwrite earlier samples.
        path = f"stt_sample_{datetime.datetime.now().strftime('%Y%m%d_%H%M%S')}.wav"
        response.write_to_file(path)
        print(f"Saved sample audio to {path}")
        return path, language

    if args:
        # Require exactly (audio_file, language_code). Checking only for "< 2"
        # would let three or more arguments through and crash the unpacking
        # below with ValueError instead of showing the usage text.
        if len(args) != 2:
            print("Usage: stt_transcribe <audio_file> <language_code>")
            print("Example: stt_transcribe my_audio.wav en")
            sys.exit(1)
        file_path, language = args
    else:
        file_path, language = generate_sample_wav()

    # Keep the file open only for the duration of the upload.
    with open(file_path, "rb") as f:
        response = client.stt.transcribe(
            file=f,
            model="ink-whisper",
            language=language,
            timestamp_granularities=["word"],  # Optional: get word timestamps
        )
    print(response.text)
    # Skip the per-word listing when the response carries no word entries.
    if response.words:
        for word in response.words:
            print(f"{word.word}: {word.start}s - {word.end}s")
From cartesia-python/examples/examples.py:608

Run this example

git clone --branch v3.2.0 https://github.com/cartesia-ai/cartesia-python
cd cartesia-python
uv sync
CARTESIA_API_KEY=YOUR_KEY uv run examples/examples.py stt_transcribe