> ## Documentation Index
> Fetch the complete documentation index at: https://docs.cartesia.ai/llms.txt
> Use this file to discover all available pages before exploring further.

# Realtime Speech-to-Text (Auto)

> Realtime speech transcription with built-in turn detection

<Note>
  This endpoint is English only right now.  
  We expect to add more languages in the coming months.
</Note>




## AsyncAPI

````yaml asyncapi.yml /stt/turns/websocket
id: /stt/turns/websocket
title: /stt/turns/websocket
description: |
  Realtime speech transcription with built-in turn detection

  <Note>
    This endpoint is English only right now.  
    We expect to add more languages in the coming months.
  </Note>
servers:
  - id: production
    protocol: wss
    host: api.cartesia.ai
    bindings: []
    variables: []
address: /stt/turns/websocket
parameters:
  - id: model
    jsonSchema:
      type: string
      description: |
        ID of the model to use for transcription, e.g. `ink-2`.

        See [Models](/build-with-cartesia/stt/latest) for available models.
    description: |
      ID of the model to use for transcription, e.g. `ink-2`.

      See [Models](/build-with-cartesia/stt/latest) for available models.
    type: string
    required: true
    deprecated: false
  - id: encoding
    jsonSchema:
      type: string
      description: >
        The encoding format of the audio data. This determines how the server
        interprets the raw binary audio data you send.


        Supported encodings: `pcm_s16le`, `pcm_s32le`, `pcm_f16le`, `pcm_f32le`,
        `pcm_mulaw`, `pcm_alaw`.


        For guidance on choosing an encoding, see [Audio
        Input](/build-with-cartesia/stt/audio-input).
    description: >
      The encoding format of the audio data. This determines how the server
      interprets the raw binary audio data you send.


      Supported encodings: `pcm_s16le`, `pcm_s32le`, `pcm_f16le`, `pcm_f32le`,
      `pcm_mulaw`, `pcm_alaw`.


      For guidance on choosing an encoding, see [Audio
      Input](/build-with-cartesia/stt/audio-input).
    type: string
    required: true
    deprecated: false
  - id: sample_rate
    jsonSchema:
      type: string
      description: |
        The sample rate of the audio in Hz.
    description: |
      The sample rate of the audio in Hz.
    type: string
    required: true
    deprecated: false
  - id: turn_start_threshold
    jsonSchema:
      type: string
      description: >
        Threshold above which to start the turn. Range: 0.5–0.9. Must stay above
        the eager end threshold.


        See [Configuring turn
        detection](/use-the-api/stt/turns#configuring-turn-detection) for
        details.
      default: '0.8'
    description: >
      Threshold above which to start the turn. Range: 0.5–0.9. Must stay above
      the eager end threshold.


      See [Configuring turn
      detection](/use-the-api/stt/turns#configuring-turn-detection) for details.
    type: string
    required: true
    deprecated: false
  - id: turn_eager_end_threshold
    jsonSchema:
      type: string
      description: >
        Threshold below which to eager end the turn. Range: 0.3–0.6. Must stay
        between the end and start thresholds.


        See [Configuring turn
        detection](/use-the-api/stt/turns#configuring-turn-detection) for
        details.
      default: '0.4'
    description: >
      Threshold below which to eager end the turn. Range: 0.3–0.6. Must stay
      between the end and start thresholds.


      See [Configuring turn
      detection](/use-the-api/stt/turns#configuring-turn-detection) for details.
    type: string
    required: true
    deprecated: false
  - id: turn_end_threshold
    jsonSchema:
      type: string
      description: >
        Threshold below which to end the turn. Range: 0.05–0.5. Must stay below
        the eager end threshold.


        See [Configuring turn
        detection](/use-the-api/stt/turns#configuring-turn-detection) for
        details.
      default: '0.2'
    description: >
      Threshold below which to end the turn. Range: 0.05–0.5. Must stay below
      the eager end threshold.


      See [Configuring turn
      detection](/use-the-api/stt/turns#configuring-turn-detection) for details.
    type: string
    required: true
    deprecated: false
  - id: turn_end_timeout_ms
    jsonSchema:
      type: string
      description: >
        Maximum amount of time in milliseconds that the model will wait after
        the user stops speaking before ending the turn. Range: 640–11200.


        See [Configuring turn
        detection](/use-the-api/stt/turns#configuring-turn-detection) for
        details.
      default: '5600'
    description: >
      Maximum amount of time in milliseconds that the model will wait after the
      user stops speaking before ending the turn. Range: 640–11200.


      See [Configuring turn
      detection](/use-the-api/stt/turns#configuring-turn-detection) for details.
    type: string
    required: true
    deprecated: false
  - id: cartesia_version
    jsonSchema:
      type: string
      description: API version, e.g. `2026-03-01`
    description: API version, e.g. `2026-03-01`
    type: string
    required: true
    deprecated: false
bindings: []
operations:
  - &ref_1
    id: sendSTTTurnsAudio
    title: Send STT turns audio
    type: receive
    messages:
      - &ref_3
        id: sttTurnsAudioData
        payload:
          - type: string
            format: binary
            description: >
              Send WebSocket binary messages containing raw audio data as
              specified by the `encoding` and `sample_rate` query parameters.


              Audio Requirements:

              - Send audio in small chunks, e.g. 100 ms

              - Audio format must match the `encoding` and `sample_rate`
              parameters
            x-parser-schema-id: <anonymous-schema-100>
            name: Send Audio Data
        headers: []
        jsonPayloadSchema:
          type: string
          format: binary
          description: >-
            Raw audio data as a binary message in the format specified by the
            `encoding` parameter. Send audio in small chunks, e.g. 100 ms.
          x-parser-schema-id: <anonymous-schema-100>
        title: Send Audio Data
        description: >
          Send WebSocket binary messages containing raw audio data as specified
          by the `encoding` and `sample_rate` query parameters.


          Audio Requirements:

          - Send audio in small chunks, e.g. 100 ms

          - Audio format must match the `encoding` and `sample_rate` parameters
        example: '{}'
        bindings: []
        extensions:
          - id: x-parser-unique-object-id
            value: sttTurnsAudioData
      - &ref_4
        id: sttTurnsCloseCommand
        payload:
          - name: Close Command
            description: >-
              Send a JSON encoded close command as a WebSocket text message to
              close the session cleanly. All buffered audio will be processed by
              the model into events.
            type: object
            properties:
              - name: type
                type: string
                description: >-
                  Command type. Send this as a JSON encoded WebSocket text
                  message to close the session.
                enumValues:
                  - close
                required: true
        headers: []
        jsonPayloadSchema:
          type: object
          required:
            - type
          properties:
            type:
              type: string
              enum:
                - close
              description: >-
                Command type. Send this as a JSON encoded WebSocket text message
                to close the session.
              x-parser-schema-id: <anonymous-schema-101>
          x-parser-schema-id: STTTurnsCloseCommand
        title: Close Command
        description: >-
          Send a JSON encoded close command as a WebSocket text message to
          close the session cleanly. All buffered audio will be processed by the
          model into events.
        example: |-
          {
            "type": "close"
          }
        bindings: []
        extensions:
          - id: x-parser-unique-object-id
            value: sttTurnsCloseCommand
      - &ref_5
        id: sttTurnsConfigCommand
        payload:
          - name: Config Command
            description: >-
              Send a JSON encoded config command as a WebSocket text message to
              update model settings.
            type: object
            properties:
              - name: type
                type: string
                description: >-
                  Command type. Send this as a JSON encoded WebSocket text
                  message to update model settings.
                enumValues:
                  - config
                required: true
              - name: turn
                type: object
                description: Turn detection settings.
                required: false
                properties:
                  - name: start_threshold
                    type: number
                    description: >
                      Threshold above which to start the turn. Range: 0.5–0.9.
                      Must stay above the eager end threshold.


                      See [Configuring turn
                      detection](/use-the-api/stt/turns#configuring-turn-detection)
                      for details.
                    required: false
                  - name: eager_end_threshold
                    type: number
                    description: >
                      Threshold below which to eager end the turn. Range:
                      0.3–0.6. Must stay between the end and start thresholds.


                      See [Configuring turn
                      detection](/use-the-api/stt/turns#configuring-turn-detection)
                      for details.
                    required: false
                  - name: end_threshold
                    type: number
                    description: >
                      Threshold below which to end the turn. Range: 0.05–0.5.
                      Must stay below the eager end threshold.


                      See [Configuring turn
                      detection](/use-the-api/stt/turns#configuring-turn-detection)
                      for details.
                    required: false
                  - name: end_timeout_ms
                    type: number
                    description: >
                      Maximum amount of time in milliseconds that the model will
                      wait after the user stops speaking before ending the turn.
                      Range: 640–11200.


                      See [Configuring turn
                      detection](/use-the-api/stt/turns#configuring-turn-detection)
                      for details.
                    required: false
        headers: []
        jsonPayloadSchema:
          type: object
          required:
            - type
          properties:
            type:
              type: string
              enum:
                - config
              description: >-
                Command type. Send this as a JSON encoded WebSocket text message
                to update model settings.
              x-parser-schema-id: <anonymous-schema-102>
            turn:
              type: object
              description: Turn detection settings.
              properties:
                start_threshold:
                  type: number
                  default: 0.8
                  description: >
                    Threshold above which to start the turn. Range: 0.5–0.9.
                    Must stay above the eager end threshold.


                    See [Configuring turn
                    detection](/use-the-api/stt/turns#configuring-turn-detection)
                    for details.
                  x-parser-schema-id: <anonymous-schema-104>
                eager_end_threshold:
                  type: number
                  default: 0.4
                  description: >
                    Threshold below which to eager end the turn. Range: 0.3–0.6.
                    Must stay between the end and start thresholds.


                    See [Configuring turn
                    detection](/use-the-api/stt/turns#configuring-turn-detection)
                    for details.
                  x-parser-schema-id: <anonymous-schema-105>
                end_threshold:
                  type: number
                  default: 0.2
                  description: >
                    Threshold below which to end the turn. Range: 0.05–0.5. Must
                    stay below the eager end threshold.


                    See [Configuring turn
                    detection](/use-the-api/stt/turns#configuring-turn-detection)
                    for details.
                  x-parser-schema-id: <anonymous-schema-106>
                end_timeout_ms:
                  type: number
                  default: 5600
                  description: >
                    Maximum amount of time in milliseconds that the model will
                    wait after the user stops speaking before ending the turn.
                    Range: 640–11200.


                    See [Configuring turn
                    detection](/use-the-api/stt/turns#configuring-turn-detection)
                    for details.
                  x-parser-schema-id: <anonymous-schema-107>
              x-parser-schema-id: <anonymous-schema-103>
          x-parser-schema-id: STTTurnsConfigCommand
        title: Config Command
        description: >-
          Send a JSON encoded config command as a WebSocket text message to
          update model settings.
        example: |-
          {
            "type": "config",
            "turn": {
              "start_threshold": 0.8,
              "eager_end_threshold": 0.4,
              "end_threshold": 0.2,
              "end_timeout_ms": 5600
            }
          }
        bindings: []
        extensions:
          - id: x-parser-unique-object-id
            value: sttTurnsConfigCommand
    bindings: []
    extensions: &ref_0
      - id: x-parser-unique-object-id
        value: /stt/turns/websocket
  - &ref_2
    id: receiveSTTTurnsEvents
    title: Receive STT turns events
    description: >-
      The server sends turn events as the model transcribes. Messages can be of
      type `connected`, `turn.start`, `turn.update`, `turn.eager_end`,
      `turn.resume`, `turn.end`, or `error`. All emitted text is final — the
      model does not revise previous output. The `transcript` field is
      cumulative within a turn.
    type: send
    messages:
      - &ref_6
        id: sttTurnsConnected
        payload:
          - name: Connected
            description: |
              Fires once when the WebSocket connection is established.

              You do not need to wait for this event before sending audio.
            type: object
            properties:
              - name: type
                type: string
                description: Event type identifier.
                enumValues:
                  - connected
                required: true
              - name: request_id
                type: string
                description: Unique identifier for this connection.
                required: true
        headers: []
        jsonPayloadSchema:
          type: object
          required:
            - type
            - request_id
          properties:
            type:
              type: string
              enum:
                - connected
              description: Event type identifier.
              x-parser-schema-id: <anonymous-schema-108>
            request_id:
              type: string
              description: Unique identifier for this connection.
              x-parser-schema-id: <anonymous-schema-109>
          x-parser-schema-id: STTTurnsConnected
        title: Connected
        description: |
          Fires once when the WebSocket connection is established.

          You do not need to wait for this event before sending audio.
        example: |-
          {
            "type": "connected",
            "request_id": "2ff8af53-4d38-479d-8287-58940f01c701"
          }
        bindings: []
        extensions:
          - id: x-parser-unique-object-id
            value: sttTurnsConnected
      - &ref_7
        id: sttTurnsTurnStart
        payload:
          - name: Turn Start
            description: >
              Marks the start of a user turn. Fires quickly after the user
              begins speaking.


              This event can be used to interrupt your agent to avoid talking
              over the user.
            type: object
            properties:
              - name: type
                type: string
                description: Event type identifier.
                enumValues:
                  - turn.start
                required: true
              - name: request_id
                type: string
                description: >-
                  Unique identifier for this connection. Does not change between
                  turns.
                required: true
        headers: []
        jsonPayloadSchema:
          type: object
          required:
            - type
            - request_id
          properties:
            type:
              type: string
              enum:
                - turn.start
              description: Event type identifier.
              x-parser-schema-id: <anonymous-schema-110>
            request_id:
              type: string
              description: >-
                Unique identifier for this connection. Does not change between
                turns.
              x-parser-schema-id: <anonymous-schema-111>
          x-parser-schema-id: STTTurnsTurnStart
        title: Turn Start
        description: >
          Marks the start of a user turn. Fires quickly after the user begins
          speaking.


          This event can be used to interrupt your agent to avoid talking over
          the user.
        example: |-
          {
            "type": "turn.start",
            "request_id": "2ff8af53-4d38-479d-8287-58940f01c701"
          }
        bindings: []
        extensions:
          - id: x-parser-unique-object-id
            value: sttTurnsTurnStart
      - &ref_8
        id: sttTurnsTurnUpdate
        payload:
          - name: Turn Update
            description: |
              Fires repeatedly as the model transcribes the current user turn.
            type: object
            properties:
              - name: type
                type: string
                description: Event type identifier.
                enumValues:
                  - turn.update
                required: true
              - name: transcript
                type: string
                description: >-
                  Cumulative text for the current turn, i.e. the full text
                  transcribed so far in this turn, not a delta.
                required: true
              - name: request_id
                type: string
                description: >-
                  Unique identifier for this connection. Does not change between
                  turns.
                required: true
        headers: []
        jsonPayloadSchema:
          type: object
          required:
            - type
            - transcript
            - request_id
          properties:
            type:
              type: string
              enum:
                - turn.update
              description: Event type identifier.
              x-parser-schema-id: <anonymous-schema-112>
            transcript:
              type: string
              description: >-
                Cumulative text for the current turn, i.e. the full text
                transcribed so far in this turn, not a delta.
              x-parser-schema-id: <anonymous-schema-113>
            request_id:
              type: string
              description: >-
                Unique identifier for this connection. Does not change between
                turns.
              x-parser-schema-id: <anonymous-schema-114>
          x-parser-schema-id: STTTurnsTurnUpdate
        title: Turn Update
        description: |
          Fires repeatedly as the model transcribes the current user turn.
        example: |-
          {
            "type": "turn.update",
            "transcript": "Hey can you help",
            "request_id": "2ff8af53-4d38-479d-8287-58940f01c701"
          }
        bindings: []
        extensions:
          - id: x-parser-unique-object-id
            value: sttTurnsTurnUpdate
      - &ref_9
        id: sttTurnsTurnEagerEnd
        payload:
          - name: Turn Eager End
            description: >
              Fires when the model predicts that the user might be done
              speaking.
            type: object
            properties:
              - name: type
                type: string
                description: Event type identifier.
                enumValues:
                  - turn.eager_end
                required: true
              - name: transcript
                type: string
                description: Cumulative text for the current turn.
                required: true
              - name: request_id
                type: string
                description: >-
                  Unique identifier for this connection. Does not change between
                  turns.
                required: true
        headers: []
        jsonPayloadSchema:
          type: object
          required:
            - type
            - transcript
            - request_id
          properties:
            type:
              type: string
              enum:
                - turn.eager_end
              description: Event type identifier.
              x-parser-schema-id: <anonymous-schema-115>
            transcript:
              type: string
              description: Cumulative text for the current turn.
              x-parser-schema-id: <anonymous-schema-116>
            request_id:
              type: string
              description: >-
                Unique identifier for this connection. Does not change between
                turns.
              x-parser-schema-id: <anonymous-schema-117>
          x-parser-schema-id: STTTurnsTurnEagerEnd
        title: Turn Eager End
        description: |
          Fires when the model predicts that the user might be done speaking.
        example: |-
          {
            "type": "turn.eager_end",
            "transcript": "Hey can you help me with something?",
            "request_id": "2ff8af53-4d38-479d-8287-58940f01c701"
          }
        bindings: []
        extensions:
          - id: x-parser-unique-object-id
            value: sttTurnsTurnEagerEnd
      - &ref_10
        id: sttTurnsTurnResume
        payload:
          - name: Turn Resume
            description: >
              Fires after `turn.eager_end` if the user turn has not actually
              ended.
            type: object
            properties:
              - name: type
                type: string
                description: Event type identifier.
                enumValues:
                  - turn.resume
                required: true
              - name: request_id
                type: string
                description: >-
                  Unique identifier for this connection. Does not change between
                  turns.
                required: true
        headers: []
        jsonPayloadSchema:
          type: object
          required:
            - type
            - request_id
          properties:
            type:
              type: string
              enum:
                - turn.resume
              description: Event type identifier.
              x-parser-schema-id: <anonymous-schema-118>
            request_id:
              type: string
              description: >-
                Unique identifier for this connection. Does not change between
                turns.
              x-parser-schema-id: <anonymous-schema-119>
          x-parser-schema-id: STTTurnsTurnResume
        title: Turn Resume
        description: |
          Fires after `turn.eager_end` if the user turn has not actually ended.
        example: |-
          {
            "type": "turn.resume",
            "request_id": "2ff8af53-4d38-479d-8287-58940f01c701"
          }
        bindings: []
        extensions:
          - id: x-parser-unique-object-id
            value: sttTurnsTurnResume
      - &ref_11
        id: sttTurnsTurnEnd
        payload:
          - name: Turn End
            description: |
              Marks the end of a user turn.
            type: object
            properties:
              - name: type
                type: string
                description: Event type identifier.
                enumValues:
                  - turn.end
                required: true
              - name: transcript
                type: string
                description: Definitive transcript for the completed turn.
                required: true
              - name: request_id
                type: string
                description: >-
                  Unique identifier for this connection. Does not change between
                  turns.
                required: true
        headers: []
        jsonPayloadSchema:
          type: object
          required:
            - type
            - transcript
            - request_id
          properties:
            type:
              type: string
              enum:
                - turn.end
              description: Event type identifier.
              x-parser-schema-id: <anonymous-schema-120>
            transcript:
              type: string
              description: Definitive transcript for the completed turn.
              x-parser-schema-id: <anonymous-schema-121>
            request_id:
              type: string
              description: >-
                Unique identifier for this connection. Does not change between
                turns.
              x-parser-schema-id: <anonymous-schema-122>
          x-parser-schema-id: STTTurnsTurnEnd
        title: Turn End
        description: |
          Marks the end of a user turn.
        example: |-
          {
            "type": "turn.end",
            "transcript": "Hey can you help me with something?",
            "request_id": "2ff8af53-4d38-479d-8287-58940f01c701"
          }
        bindings: []
        extensions:
          - id: x-parser-unique-object-id
            value: sttTurnsTurnEnd
      - &ref_12
        id: sttErrorResponse
        payload:
          - name: Error Response
            description: Error information for STT WebSocket connections.
            type: object
            properties:
              - name: type
                type: string
                description: Event type identifier.
                enumValues:
                  - error
                required: true
              - name: error_code
                type: string
                description: Machine-readable error code.
                required: false
              - name: status_code
                type: number
                description: An HTTP response status code.
                required: true
              - name: title
                type: string
                description: Human-readable error title.
                required: true
              - name: message
                type: string
                description: Human-readable error message.
                required: true
              - name: doc_url
                type: string
                description: URL to relevant documentation
                required: false
              - name: request_id
                type: string
                description: Unique identifier for this WebSocket connection
                required: false
        headers: []
        jsonPayloadSchema:
          type: object
          required:
            - type
            - status_code
            - title
            - message
          properties:
            type:
              type: string
              enum:
                - error
              description: Event type identifier.
              x-parser-schema-id: <anonymous-schema-85>
            error_code:
              type: string
              description: Machine-readable error code.
              x-parser-schema-id: <anonymous-schema-86>
            status_code:
              type: number
              format: integer
              description: An HTTP response status code.
              x-parser-schema-id: <anonymous-schema-87>
            title:
              type: string
              description: Human-readable error title.
              x-parser-schema-id: <anonymous-schema-88>
            message:
              type: string
              description: Human-readable error message.
              x-parser-schema-id: <anonymous-schema-89>
            doc_url:
              type: string
              description: URL to relevant documentation
              x-parser-schema-id: <anonymous-schema-90>
            request_id:
              type: string
              description: Unique identifier for this WebSocket connection
              x-parser-schema-id: <anonymous-schema-91>
          x-parser-schema-id: STTErrorResponse
        title: Error Response
        description: Error information for STT WebSocket connections.
        example: |-
          {
            "type": "error",
            "title": "Invalid model",
            "message": "The model is not valid, make sure it is a valid model ID.",
            "error_code": "model_not_found",
            "doc_url": "https://docs.cartesia.ai/build-with-cartesia/stt/latest",
            "status_code": 400,
            "request_id": "2ff8af53-4d38-479d-8287-58940f01c701"
          }
        bindings: []
        extensions:
          - id: x-parser-unique-object-id
            value: sttErrorResponse
    bindings: []
    extensions: *ref_0
sendOperations:
  - *ref_1
receiveOperations:
  - *ref_2
sendMessages:
  - *ref_3
  - *ref_4
  - *ref_5
receiveMessages:
  - *ref_6
  - *ref_7
  - *ref_8
  - *ref_9
  - *ref_10
  - *ref_11
  - *ref_12
extensions:
  - id: x-parser-unique-object-id
    value: /stt/turns/websocket
securitySchemes:
  - id: apiKey
    name: X-API-Key
    type: httpApiKey
    description: API key passed in a header.
    in: header
    extensions: []
  - id: accessTokenQuery
    name: access_token
    type: httpApiKey
    description: >
      A short-lived access token passed in a query param to make API requests
      from a client.

      This is particularly useful in the browser, where WebSockets do not
      support headers.

      See [Authenticate client
      apps](/get-started/authenticate-your-client-applications) to generate an
      access token.
    in: query
    extensions: []

````