> ## Documentation Index
> Fetch the complete documentation index at: https://docs.cartesia.ai/llms.txt
> Use this file to discover all available pages before exploring further.

# Batch Speech-to-Text

> Transcribes an audio file of any length



## OpenAPI

````yaml latest.yml POST /stt
# OpenAPI description of the Cartesia API, written against OpenAPI 3.0.1.
# Version-like scalars are quoted so they always load as strings.
openapi: '3.0.1'
info:
  title: Cartesia API
  version: '0.0.1'
# Base URLs that the paths below are resolved against.
servers:
  - description: Production
    url: https://api.cartesia.ai
# No document-wide security requirement is declared; the operation carries
# its own `security` list.
security: []
paths:
  /stt:
    post:
      # Identity and grouping metadata for the batch transcription operation.
      operationId: stt_transcribe
      tags: [Stt]
      summary: 'Batch Speech-to-Text'
      description: 'Transcribes an audio file of any length'
      # Header and query parameters sent alongside the multipart upload.
      parameters:
        - $ref: '#/components/parameters/CartesiaVersionHeader'
        - name: encoding
          in: query
          description: >-
            Required when uploading raw PCM data without a container header.

            If not specified, the audio file will be decoded automatically from
            its container (e.g. WAV, MP3, FLAC).
          required: false
          schema:
            # OpenAPI 3.0 ignores keywords written next to `$ref`, so the
            # reference is wrapped in `allOf` and `nullable` sits on the
            # wrapping schema instead of being silently dropped.
            allOf:
              - $ref: '#/components/schemas/STTEncoding'
            nullable: true
        - name: sample_rate
          in: query
          # Trailing whitespace removed from the description text.
          description: The sample rate of the audio in Hz.
          required: false
          schema:
            type: integer
            nullable: true
      requestBody:
        required: true
        content:
          multipart/form-data:
            schema:
              type: object
              properties:
                # The uploaded audio, sent as a binary form part.
                file:
                  description: >-
                    There's no need to break up your audio file.
                    Long files are intelligently chunked by our server.


                    Supported audio formats:
                    `flac`, `m4a`, `mp3`, `mp4`, `mpeg`,
                    `mpga`, `oga`, `ogg`, `wav`, `webm`
                  type: string
                  format: binary
                # Transcription model selector; `ink-whisper` is the only
                # value the enum accepts.
                model:
                  description: >-
                    ID of the model to use for transcription.
                    Must be in the `ink-whisper` family of models.
                  type: string
                  enum: [ink-whisper]
                  example: ink-whisper
                # Language of the uploaded audio; `en` is the declared default.
                # The description no longer claims every value is ISO-639-1,
                # because the enum also contains codes such as `haw` and `yue`.
                language:
                  type: string
                  description: >-
                    The language of the input audio, given as one of the
                    language codes listed here. Most values are ISO-639-1
                    codes; a few (for example `haw` and `yue`) are not.
                  default: en
                  enum:
                    - en
                    - zh
                    - de
                    - es
                    - ru
                    - ko
                    - fr
                    - ja
                    - pt
                    - tr
                    - pl
                    - ca
                    - nl
                    - ar
                    - sv
                    - it
                    - id
                    - hi
                    - fi
                    - vi
                    - he
                    - uk
                    - el
                    - ms
                    - cs
                    - ro
                    - da
                    - hu
                    - ta
                    # Quoted so YAML 1.1 parsers do not load it as boolean false.
                    - 'no'
                    - th
                    - ur
                    - hr
                    - bg
                    - lt
                    - la
                    - mi
                    - ml
                    - cy
                    - sk
                    - te
                    - fa
                    - lv
                    - bn
                    - sr
                    - az
                    - sl
                    - kn
                    - et
                    - mk
                    - br
                    - eu
                    - is
                    - hy
                    - ne
                    - mn
                    - bs
                    - kk
                    - sq
                    - sw
                    - gl
                    - mr
                    - pa
                    - si
                    - km
                    - sn
                    - yo
                    - so
                    - af
                    - oc
                    - ka
                    - be
                    - tg
                    - sd
                    - gu
                    - am
                    - yi
                    - lo
                    - uz
                    - fo
                    - ht
                    - ps
                    - tk
                    - nn
                    - mt
                    - sa
                    - lb
                    - my
                    - bo
                    - tl
                    - mg
                    - as
                    - tt
                    - haw
                    - ln
                    - ha
                    - ba
                    - jw
                    - su
                    - yue
                # Optional form field selecting which timestamps the response
                # carries; it is not listed under `required`.
                timestamp_granularities[]:
                  type: array
                  description: >-
                    The timestamp granularities to include in the response.
                    Pass `word` to receive word-level timestamps in the
                    response's `words` field.
                  items:
                    $ref: '#/components/schemas/TimestampGranularity'
              required:
                - file
                - model
      # Responses documented for the operation; only the success case is
      # described here.
      responses:
        '200':
          # Non-empty so generated documentation and clients have text to show.
          description: The transcription result for the uploaded audio file.
          content:
            application/json:
              schema:
                $ref: '#/components/schemas/TranscriptionResponse'
      # Each list entry is an alternative requirement: a request may
      # authenticate with either scheme.
      security:
        - TokenAuth: []
        - APIKeyAuth: []
components:
  # Reusable parameter definitions, referenced from operations via `$ref`.
  parameters:
    # Mandatory `Cartesia-Version` request header; the enum pins it to a
    # single dated value.
    CartesiaVersionHeader:
      in: header
      name: Cartesia-Version
      required: true
      description: 'API version header.'
      schema:
        type: string
        format: date
        # Quoted so the value stays a string instead of loading as a date.
        enum: ['2026-03-01']
        example: '2026-03-01'
  schemas:
    # Encodings accepted by the `encoding` query parameter of `POST /stt`.
    STTEncoding:
      title: STTEncoding
      description: >-
        Must match the actual encoding of your audio.

        For detailed guidance on each format,
        see [Audio Input](/build-with-cartesia/stt/audio-input).
      type: string
      enum:
        - pcm_s16le
        - pcm_s32le
        - pcm_f16le
        - pcm_f32le
        - pcm_mulaw
        - pcm_alaw
    # Item type of the `timestamp_granularities[]` form field.
    TimestampGranularity:
      title: TimestampGranularity
      description: >-
        The granularity of timestamps to include in the response.

        Currently only `word` level timestamps are supported,
        providing start and end times for each word.
      type: string
      enum: [word]
    # JSON body returned by `POST /stt` on success. Only `type` and `text`
    # are declared as required.
    TranscriptionResponse:
      title: TranscriptionResponse
      type: object
      required: [type, text]
      properties:
        type:
          description: >-
            The message type.
            Always `transcript` for a batch transcription response.
          type: string
          enum: [transcript]
        request_id:
          description: 'Unique identifier for this transcription request.'
          type: string
        text:
          description: 'The transcribed text.'
          type: string
        # Marked deprecated; its description says batch transcription does
        # not use it.
        is_final:
          description: 'Not used for batch transcription.'
          type: boolean
          deprecated: true
        language:
          description: 'The specified language of the input audio.'
          type: string
        duration:
          description: 'The duration of the input audio in seconds.'
          type: number
          format: double
        words:
          description: >-
            Word-level timestamps showing the start and end time
            of each word. Only included when `[word]` is passed
            into `timestamp_granularities[]`.
          type: array
          items:
            $ref: '#/components/schemas/TranscriptionWord'
    # One entry of `TranscriptionResponse.words`: a word with its time span.
    # All three fields are required.
    TranscriptionWord:
      title: TranscriptionWord
      type: object
      required: [word, start, end]
      properties:
        word:
          description: 'The transcribed word.'
          type: string
        start:
          description: 'Start time of the word in seconds.'
          type: number
          format: double
        end:
          description: 'End time of the word in seconds.'
          type: number
          format: double
  # Authentication schemes referenced by the operation's `security` list.
  # Both use HTTP bearer authentication and differ only in `bearerFormat`.
  securitySchemes:
    TokenAuth:
      description: 'A short-lived access token to make API requests from a client.'
      type: http
      scheme: bearer
      bearerFormat: JWT
    APIKeyAuth:
      description: >-
        Cartesia API key (`sk_car_...`).
        Get one at [play.cartesia.ai/keys](https://play.cartesia.ai/keys).
      type: http
      scheme: bearer
      bearerFormat: API Key

````