- Python
- Python (Async)
- TypeScript
- TypeScript (Browser)
def stt_transcribe(client: Cartesia, *args: str) -> None:
    """Transcribe an audio file with word timestamps.

    Pass a path to an audio file followed by its language code, or omit both
    to generate a sample WAV via TTS and transcribe that instead.

    Args:
        client: Cartesia client used for the TTS and STT requests.
        *args: Either empty, or exactly ``(audio_file, language_code)``.

    Any other number of arguments prints a usage message and exits with
    status 1.
    """
    import datetime
    import sys

    def generate_sample_wav() -> tuple[str, str]:
        """Synthesize a short sample, save it as a WAV, and return ``(path, language)``."""
        transcript = "The quick brown fox jumps over the lazy dog."
        language = "en"
        print(f"No audio file provided. Generating a sample for: {transcript!r}")
        response = client.tts.generate(
            model_id="sonic-latest",
            transcript=transcript,
            voice={"mode": "id", "id": "6ccbfb76-1fc6-48f7-b71d-91ac6298247b"},
            output_format={"container": "wav", "encoding": "pcm_s16le", "sample_rate": 16000},
            language=language,
        )
        # Timestamped file name so repeated runs do not overwrite earlier samples.
        path = f"stt_sample_{datetime.datetime.now().strftime('%Y%m%d_%H%M%S')}.wav"
        response.write_to_file(path)
        print(f"Saved sample audio to {path}")
        return path, language

    if args:
        # Require exactly two arguments: unpacking any other count below would
        # raise ValueError instead of showing the usage message.
        if len(args) != 2:
            print("Usage: stt_transcribe <audio_file> <language_code>")
            print("Example: stt_transcribe my_audio.wav en")
            sys.exit(1)
        file_path, language = args
    else:
        file_path, language = generate_sample_wav()

    with open(file_path, "rb") as f:
        response = client.stt.transcribe(
            file=f,
            model="ink-whisper",
            language=language,
            timestamp_granularities=["word"],  # Optional: get word timestamps
        )

    print(response.text)
    if response.words:
        for word in response.words:
            print(f"{word.word}: {word.start}s - {word.end}s")
async def stt_transcribe_async(client: AsyncCartesia, *args: str) -> None:
    """Transcribe an audio file with word timestamps (async client).

    Pass a path to an audio file followed by its language code, or omit both
    to generate a sample WAV via TTS and transcribe that instead.

    Args:
        client: Async Cartesia client used for the TTS and STT requests.
        *args: Either empty, or exactly ``(audio_file, language_code)``.

    Any other number of arguments prints a usage message and exits with
    status 1.
    """
    import datetime
    import sys

    async def generate_sample_wav() -> tuple[str, str]:
        """Synthesize a short sample, save it as a WAV, and return ``(path, language)``."""
        transcript = "The quick brown fox jumps over the lazy dog."
        language = "en"
        print(f"No audio file provided. Generating a sample for: {transcript!r}")
        response = await client.tts.generate(
            model_id="sonic-latest",
            transcript=transcript,
            voice={"mode": "id", "id": "6ccbfb76-1fc6-48f7-b71d-91ac6298247b"},
            output_format={"container": "wav", "encoding": "pcm_s16le", "sample_rate": 16000},
            language=language,
        )
        # Timestamped file name so repeated runs do not overwrite earlier samples.
        path = f"stt_sample_{datetime.datetime.now().strftime('%Y%m%d_%H%M%S')}.wav"
        await response.write_to_file(path)
        print(f"Saved sample audio to {path}")
        return path, language

    if args:
        # Require exactly two arguments: unpacking any other count below would
        # raise ValueError instead of showing the usage message.
        if len(args) != 2:
            print("Usage: stt_transcribe_async <audio_file> <language_code>")
            print("Example: stt_transcribe_async my_audio.wav en")
            sys.exit(1)
        file_path, language = args
    else:
        file_path, language = await generate_sample_wav()

    with open(file_path, "rb") as f:
        response = await client.stt.transcribe(
            file=f,
            model="ink-whisper",
            language=language,
            timestamp_granularities=["word"],  # Optional: get word timestamps
        )

    print(response.text)
    if response.words:
        for word in response.words:
            print(f"{word.word}: {word.start}s - {word.end}s")
/**
 * Transcribe an audio file with word timestamps (Node.js).
 *
 * `args` is read as `[audioFile, languageCode]`. When no audio file is given,
 * a sample WAV is generated via TTS and transcribed instead. An audio file
 * without a language code prints usage to stderr and exits with status 1.
 */
async function sttTranscribe(client: Cartesia, args: string[]): Promise<void> {
  // Synthesizes a short sample, writes it to a WAV file, and returns [path, language].
  const synthesizeSample = async (): Promise<[string, string]> => {
    const sampleText = "The quick brown fox jumps over the lazy dog.";
    const sampleLanguage = "en";
    const quotedText = JSON.stringify(sampleText);
    console.log(`No audio file provided. Generating a sample for: ${quotedText}`);

    const audio = await client.tts.generate({
      model_id: "sonic-latest",
      transcript: sampleText,
      voice: { mode: "id", id: "6ccbfb76-1fc6-48f7-b71d-91ac6298247b" },
      output_format: { container: "wav", encoding: "pcm_s16le", sample_rate: 16000 },
      language: sampleLanguage,
    });

    const outputPath = `stt_sample_${timestamp()}.wav`;
    const wavBytes = Buffer.from(await audio.arrayBuffer());
    fs.writeFileSync(outputPath, wavBytes);
    console.log(`Saved sample audio to ${outputPath}`);
    return [outputPath, sampleLanguage];
  };

  let audioPath = args[0];
  let languageCode = args[1];
  if (audioPath) {
    // A caller-supplied file must come with its language code.
    if (!languageCode) {
      console.error("Usage: sttTranscribe <audio_file> <language_code>");
      console.error("Example: sttTranscribe my_audio.wav en");
      return process.exit(1);
    }
  } else {
    [audioPath, languageCode] = await synthesizeSample();
  }

  const transcription = await client.stt.transcribe({
    file: fs.createReadStream(audioPath),
    model: "ink-whisper",
    language: languageCode,
    timestamp_granularities: ["word"],
  });

  console.log(transcription.text);
  // Word-level timestamps are only printed when the response includes them.
  for (const { word, start, end } of transcription.words ?? []) {
    console.log(`${word}: ${start}s - ${end}s`);
  }
}
/**
 * Transcribe an audio file with word timestamps.
 *
 * Pass a `File` (e.g. the user's selection from an <input type="file">), or omit
 * it to generate a sample WAV via TTS.
 *
 * @param client - Cartesia client used for the TTS and STT requests.
 * @param file - Audio to transcribe; when omitted, an English sample is synthesized.
 * @param language - Language code of `file`. Ignored when the sample is generated,
 *   because the sample is always English.
 */
async function sttTranscribeFile(
  client: Cartesia,
  file?: File,
  language = "en",
): Promise<void> {
  // The generated sample is English, so it must also be transcribed as English.
  const sampleLanguage = "en";

  // Synthesizes a short sample and wraps the returned audio in a File.
  async function generateSampleFile(): Promise<File> {
    const transcript = "The quick brown fox jumps over the lazy dog.";
    console.log(
      `No audio file provided. Generating a sample for: ${JSON.stringify(transcript)}`,
    );
    const response = await client.tts.generate({
      model_id: "sonic-latest",
      transcript,
      voice: { mode: "id", id: "6ccbfb76-1fc6-48f7-b71d-91ac6298247b" },
      output_format: {
        container: "wav",
        encoding: "pcm_s16le",
        sample_rate: 16000,
      },
      language: sampleLanguage,
    });
    const blob = await response.blob();
    return new File([blob], "stt_sample.wav", { type: "audio/wav" });
  }

  // Use the sample's own language when we generated the audio; otherwise a
  // caller-supplied non-English `language` would not match the English sample.
  const transcriptionLanguage = file == null ? sampleLanguage : language;
  const audioFile = file ?? (await generateSampleFile());

  const response = await client.stt.transcribe({
    file: audioFile,
    model: "ink-whisper",
    language: transcriptionLanguage,
    timestamp_granularities: ["word"],
  });
  console.log(response.text);
  if (response.words) {
    for (const word of response.words) {
      console.log(`${word.word}: ${word.start}s - ${word.end}s`);
    }
  }
}
Run this example
- Python
- Python (Async)
- TypeScript
- TypeScript (Browser)
# Fetch the Python SDK at the v3.2.0 ref; the examples live in this repository.
git clone --branch v3.2.0 https://github.com/cartesia-ai/cartesia-python
cd cartesia-python
# Set up the project environment with uv.
uv sync
# Replace YOUR_KEY with your Cartesia API key. With no extra arguments the example
# generates its own sample WAV; append <audio_file> <language_code> to use your own audio.
CARTESIA_API_KEY=YOUR_KEY uv run examples/examples.py stt_transcribe
# Fetch the Python SDK at the v3.2.0 ref; the examples live in this repository.
git clone --branch v3.2.0 https://github.com/cartesia-ai/cartesia-python
cd cartesia-python
# Set up the project environment with uv.
uv sync
# Replace YOUR_KEY with your Cartesia API key. With no extra arguments the example
# generates its own sample WAV; append <audio_file> <language_code> to use your own audio.
CARTESIA_API_KEY=YOUR_KEY uv run examples/async_examples.py stt_transcribe_async
# Fetch the JavaScript/TypeScript SDK at the v3.2.0 ref; the examples live in this repository.
git clone --branch v3.2.0 https://github.com/cartesia-ai/cartesia-js
cd cartesia-js
# Install the project's dependencies with pnpm.
pnpm i
# Replace YOUR_KEY with your Cartesia API key. With no extra arguments the example
# generates its own sample WAV; append <audio_file> <language_code> to use your own audio.
CARTESIA_API_KEY=YOUR_KEY pnpm tsn examples/node_examples.ts sttTranscribe
This example runs in the browser. See the Next.js example for a working setup.