Next.js Full Example

A full Next.js app demonstrating three approaches to Cartesia TTS in the browser: batch generation, HTTP streaming, and WebSocket streaming. Includes a server-side token endpoint so API keys are never exposed to the client.

Token Endpoint

app/api/token/route.ts

import Cartesia from "@cartesia/cartesia-js";

// Server-side SDK client. The API key is read from the environment here, on the
// server; the browser only ever receives the short-lived token minted below.
const client = new Cartesia({ apiKey: process.env.CARTESIA_API_KEY });

/**
 * POST /api/token — mints a Cartesia access token restricted to TTS.
 *
 * @returns A JSON response of the form `{ token }` on success, or
 *   `{ error }` with HTTP status 500 when the token could not be created.
 */
export async function POST(): Promise<Response> {
  try {
    const { token } = await client.accessToken.create({
      grants: { tts: true },
      // NOTE(review): presumably a lifetime in seconds — confirm against the API reference.
      expires_in: 300,
    });
    return Response.json({ token });
  } catch (err: unknown) {
    // Keep the details in the server log and hand the browser a JSON body it
    // can parse, instead of letting the framework emit a generic error page.
    console.error("Failed to create Cartesia access token:", err);
    return Response.json({ error: "Failed to create access token" }, { status: 500 });
  }
}

Batch and HTTP Streaming

app/page.tsx

"use client";

import { useRef, useState } from "react";
import Cartesia from "@cartesia/cartesia-js";

/** Sample rate sent as `sample_rate` in each TTS request and used for Web Audio playback. */
const SAMPLE_RATE = 44_100;
/** Size in bytes of one 32-bit float sample (the `pcm_f32le` encoding). */
const BYTES_PER_SAMPLE = Float32Array.BYTES_PER_ELEMENT;

/**
 * Requests a short-lived access token from this app's `/api/token` endpoint.
 *
 * @returns The token string from the endpoint's JSON response.
 * @throws Error when the endpoint answers with a non-2xx status, or when the
 *   response body does not contain a non-empty string `token`.
 */
async function getToken(): Promise<string> {
  const res = await fetch("/api/token", { method: "POST" });
  if (!res.ok) {
    throw new Error(`Token request failed with status ${res.status}`);
  }

  // The body is untrusted JSON: narrow it instead of assuming its shape.
  const body: unknown = await res.json();
  const token = (body as { token?: unknown } | null)?.token;
  if (typeof token !== "string" || token.length === 0) {
    throw new Error("Token endpoint response did not include a token");
  }
  return token;
}

// =============================================================================
// Batch: waits for the full response, then plays via <audio> element
// =============================================================================

/**
 * Batch demo: requests the complete WAV clip, wraps it in an object URL, and
 * plays it through the `<audio>` element rendered below.
 */
function BatchCartesiaTTSExample() {
  const audioRef = useRef<HTMLAudioElement>(null);
  // Object URL of the clip currently loaded into the <audio> element, if any.
  const objectUrlRef = useRef<string | null>(null);
  const [loading, setLoading] = useState(false);

  /** Generates the clip, loads it into the audio element, and starts playback. */
  async function speak() {
    const audio = audioRef.current;
    if (!audio) return; // Nothing to play through if the element is not mounted.

    setLoading(true);
    try {
      const client = new Cartesia({ token: await getToken() });
      const response = await client.tts.generate({
        model_id: "sonic-3.5",
        transcript: "Hello! This audio was generated in one batch and then played.",
        voice: { mode: "id", id: "6ccbfb76-1fc6-48f7-b71d-91ac6298247b" },
        output_format: { container: "wav", encoding: "pcm_s16le", sample_rate: SAMPLE_RATE },
      });

      const blob = await response.blob();
      const url = URL.createObjectURL(blob);

      // Keep the URL alive while the clip is loaded so the element's controls
      // can still replay or seek it, and release the previous clip's URL only
      // once it has been replaced. At most one URL is outstanding at a time.
      const previousUrl = objectUrlRef.current;
      objectUrlRef.current = url;
      audio.src = url;
      if (previousUrl !== null) URL.revokeObjectURL(previousUrl);

      await audio.play();
    } catch (err: unknown) {
      // speak() runs as a click handler, so nothing upstream would observe a rejection.
      console.error("Batch TTS playback failed:", err);
    } finally {
      setLoading(false);
    }
  }

  return (
    <section>
      <h2>Batch</h2>
      <p>Waits for the full audio, then plays via an audio element.</p>
      <button onClick={speak} disabled={loading}>
        {loading ? "Generating..." : "Speak"}
      </button>
      <audio ref={audioRef} controls style={{ display: "block", marginTop: "0.5rem" }} />
    </section>
  );
}

// =============================================================================
// Streaming: plays audio chunks as they arrive via Web Audio API
// =============================================================================

/**
 * HTTP streaming demo: reads the raw `pcm_f32le` response body chunk by chunk
 * and schedules each chunk on a Web Audio context as soon as it arrives.
 */
function StreamingCartesiaTTSExample() {
  const [loading, setLoading] = useState(false);

  /** Streams one utterance and plays it while it downloads. */
  async function speak() {
    // Create the context synchronously inside the click handler, before any
    // await, so it is tied to the user gesture that browsers' autoplay
    // policies look for.
    const audioCtx = new AudioContext({ sampleRate: SAMPLE_RATE });
    let nextStartTime = audioCtx.currentTime;

    setLoading(true);
    try {
      const client = new Cartesia({ token: await getToken() });
      const response = await client.tts.generate({
        model_id: "sonic-3.5",
        transcript:
          "Hello! This audio is being streamed and played as chunks arrive.",
        voice: { mode: "id", id: "6ccbfb76-1fc6-48f7-b71d-91ac6298247b" },
        output_format: { container: "raw", encoding: "pcm_f32le", sample_rate: SAMPLE_RATE },
      });

      if (!response.body) {
        throw new Error("TTS response has no readable body");
      }

      // Stream the response and play each chunk as it arrives.
      // We buffer incoming bytes so we only decode complete f32 samples —
      // getReader() can split chunks at arbitrary byte boundaries.
      const reader = response.body.getReader();
      let leftover = new Uint8Array(0);

      try {
        while (true) {
          const { done, value } = await reader.read();
          if (done) break;

          // Prepend any leftover bytes from the previous chunk
          let bytes: Uint8Array;
          if (leftover.length > 0) {
            bytes = new Uint8Array(leftover.length + value.length);
            bytes.set(leftover);
            bytes.set(value, leftover.length);
          } else {
            bytes = value;
          }

          // Only decode complete samples, save the remainder
          const usableBytes = bytes.length - (bytes.length % BYTES_PER_SAMPLE);
          leftover = bytes.slice(usableBytes);

          if (usableBytes === 0) continue;

          // Copy to an aligned buffer so Float32Array doesn't throw on unaligned offset
          const aligned = new ArrayBuffer(usableBytes);
          new Uint8Array(aligned).set(bytes.subarray(0, usableBytes));
          const floats = new Float32Array(aligned);

          const buf = audioCtx.createBuffer(1, floats.length, SAMPLE_RATE);
          buf.getChannelData(0).set(floats);

          const source = audioCtx.createBufferSource();
          source.buffer = buf;
          source.connect(audioCtx.destination);

          // Queue this chunk right after the previous one, or immediately if
          // playback has already caught up with what was scheduled.
          const startTime = Math.max(nextStartTime, audioCtx.currentTime);
          source.start(startTime);
          nextStartTime = startTime + buf.duration;
        }
      } finally {
        // Best-effort cleanup: stops the download if we left the loop early and
        // is a no-op once the stream has finished, so its outcome is ignored.
        await reader.cancel().catch(() => undefined);
      }
    } catch (err: unknown) {
      // speak() runs as a click handler, so nothing upstream would observe a rejection.
      console.error("Streaming TTS playback failed:", err);
    } finally {
      setLoading(false);

      // Release the context once everything already scheduled has played;
      // contexts left open accumulate, one per click.
      const remainingMs = Math.max(0, nextStartTime - audioCtx.currentTime) * 1000;
      setTimeout(() => {
        void audioCtx.close();
      }, remainingMs + 100);
    }
  }

  return (
    <section>
      <h2>Streaming</h2>
      <p>Plays audio chunks as they arrive via the Web Audio API.</p>
      <button onClick={speak} disabled={loading}>
        {loading ? "Streaming..." : "Speak"}
      </button>
    </section>
  );
}

// =============================================================================
// Page
// =============================================================================

/**
 * Landing page: renders the batch and HTTP-streaming demos stacked in a
 * column, followed by a link to the WebSocket demo page.
 */
export default function Home() {
  // Inline styles pulled out under names so the markup stays easy to scan.
  const pageStyle = { padding: "2rem", fontFamily: "system-ui" } as const;
  const demoStackStyle = {
    display: "flex",
    flexDirection: "column",
    gap: "2rem",
    marginTop: "1rem",
  } as const;
  const footerStyle = { marginTop: "2rem" } as const;

  return (
    <main style={pageStyle}>
      <h1>Cartesia TTS — Next.js Example</h1>
      <div style={demoStackStyle}>
        <BatchCartesiaTTSExample />
        <StreamingCartesiaTTSExample />
      </div>
      <p style={footerStyle}>
        <a href="/websocket">WebSocket streaming example →</a>
      </p>
    </main>
  );
}

WebSocket Streaming

app/websocket/page.tsx

"use client";

import { useState } from "react";
import Cartesia from "@cartesia/cartesia-js";

/** Sample rate sent as `sample_rate` in the TTS request and used for Web Audio playback. */
const SAMPLE_RATE = 44_100;

/**
 * WebSocket demo page: fetches a short-lived token, opens the SDK's TTS
 * WebSocket from the browser, and plays each audio chunk as it arrives.
 */
export default function WebSocketExample() {
  const [loading, setLoading] = useState(false);

  /** Streams one utterance over the WebSocket and plays it while it arrives. */
  async function speak() {
    // Create the context synchronously inside the click handler, before any
    // await, so it is tied to the user gesture that browsers' autoplay
    // policies look for.
    const audioCtx = new AudioContext({ sampleRate: SAMPLE_RATE });
    let nextStartTime = audioCtx.currentTime;

    setLoading(true);
    try {
      // 1. Get a short-lived token from our server
      const res = await fetch("/api/token", { method: "POST" });
      if (!res.ok) {
        throw new Error(`Token request failed with status ${res.status}`);
      }
      const { token } = await res.json();

      // 2. Connect via WebSocket from the browser
      const client = new Cartesia({ token });
      const ws = await client.tts.websocket();

      try {
        // 3. Stream audio and play each chunk as it arrives
        const resp = ws.generate({
          model_id: "sonic-3.5",
          transcript:
            "Hello from a WebSocket! Each audio chunk is played the moment it arrives, giving you the lowest possible latency.",
          voice: { mode: "id", id: "6ccbfb76-1fc6-48f7-b71d-91ac6298247b" },
          output_format: { container: "raw", encoding: "pcm_f32le", sample_rate: SAMPLE_RATE },
        });

        for await (const event of resp) {
          if (event.type === "chunk" && event.audio) {
            // event.audio is a Uint8Array of f32le samples
            const aligned = new ArrayBuffer(event.audio.byteLength);
            new Uint8Array(aligned).set(event.audio);
            const floats = new Float32Array(aligned);

            const buf = audioCtx.createBuffer(1, floats.length, SAMPLE_RATE);
            buf.getChannelData(0).set(floats);

            const source = audioCtx.createBufferSource();
            source.buffer = buf;
            source.connect(audioCtx.destination);

            // Queue this chunk right after the previous one, or immediately if
            // playback has already caught up with what was scheduled.
            const startTime = Math.max(nextStartTime, audioCtx.currentTime);
            source.start(startTime);
            nextStartTime = startTime + buf.duration;
          }
        }
      } finally {
        // Close the socket even when generation or decoding throws part-way.
        ws.close();
      }
    } catch (err: unknown) {
      // speak() runs as a click handler, so nothing upstream would observe a rejection.
      console.error("WebSocket TTS playback failed:", err);
    } finally {
      setLoading(false);

      // Release the context once everything already scheduled has played;
      // contexts left open accumulate, one per click.
      const remainingMs = Math.max(0, nextStartTime - audioCtx.currentTime) * 1000;
      setTimeout(() => {
        void audioCtx.close();
      }, remainingMs + 100);
    }
  }

  return (
    <main style={{ padding: "2rem", fontFamily: "system-ui" }}>
      <h1>Cartesia TTS — WebSocket Streaming</h1>
      <p>
        Uses the SDK&apos;s WebSocket API directly from the browser.
        Audio plays as each chunk arrives for lowest latency.
      </p>
      <button onClick={speak} disabled={loading}>
        {loading ? "Streaming..." : "Speak"}
      </button>
      <p style={{ marginTop: "1rem" }}>
        <a href="/">← Back to HTTP examples</a>
      </p>
    </main>
  );
}

Run this example

# Move into the Next.js example directory
cd cartesia-js/examples/nextjs
# Install the example's dependencies
npm install
# Run the dev script with CARTESIA_API_KEY set; the token endpoint reads it from the environment
CARTESIA_API_KEY=YOUR_KEY npm run dev

Then open http://localhost:3000.

Source

View on GitHub

Full Next.js example project

TTS Generate

TTS WebSocket

TTS SSE

Voices

Other

Browser

Next.js

Token Endpoint

Batch and HTTP Streaming

WebSocket Streaming

Run this example

Source

View on GitHub

TTS Generate

TTS WebSocket

TTS SSE

Voices

Other

Browser

Next.js

Documentation Index

Token Endpoint

Batch and HTTP Streaming

WebSocket Streaming

Run this example

Source

View on GitHub

Token Endpoint

Batch and HTTP Streaming

WebSocket Streaming

Run this example

Source