> ## Documentation Index
> Fetch the complete documentation index at: https://docs.cartesia.ai/llms.txt
> Use this file to discover all available pages before exploring further.

# Realtime Speech Recognition (Manual)

> Realtime speech transcription without turn detection

<Note>
  This endpoint relies on the `finalize` command to trigger transcription. See [Compare STT Endpoints](/use-the-api/stt/compare-endpoints) for details.
</Note>


## AsyncAPI

````yaml asyncapi.yml /stt/websocket
# Channel identity: `id`, `title`, and `address` all carry the same
# WebSocket path.
id: /stt/websocket
title: /stt/websocket
description: |
  Realtime speech transcription without turn detection

  <Note>
    This endpoint relies on the `finalize` command to trigger transcription. See [Compare STT Endpoints](/use-the-api/stt/compare-endpoints) for details.
  </Note>
# A single server entry; the `wss` protocol means connections are made over
# secure WebSocket to the listed host.
servers:
  - id: production
    protocol: wss
    host: api.cartesia.ai
    bindings: []
    variables: []
address: /stt/websocket
# No channel-level parameters are declared here; connection options are
# described under the WebSocket binding's `query` schema instead.
parameters: []
# WebSocket binding for the channel. `value.query` is the JSON Schema for the
# URL query parameters; `schemaProperties` repeats the same parameters as a
# flat list (name / type / required) and must be kept in sync with it.
bindings:
  - protocol: ws
    version: latest
    value:
      query:
        type: object
        # Parameters that must be present on every connection URL.
        required:
          - model
          - encoding
          - sample_rate
          - cartesia_version
        properties:
          model:
            type: string
            enum:
              - ink-2
              - ink-whisper
            example: ink-2
            description: |
              ID of the model to use for transcription.

              See [Models](/build-with-cartesia/stt/latest) for all options.
          encoding:
            type: string
            enum:
              - pcm_s16le
              - pcm_s32le
              - pcm_f16le
              - pcm_f32le
              - pcm_mulaw
              - pcm_alaw
            description: >
              The encoding format of the audio data. This determines how the
              server interprets the raw binary audio data you send.


              For guidance on choosing an encoding, see [Audio
              Input](/build-with-cartesia/stt/audio-input).
          sample_rate:
            # JSON Schema has a dedicated `integer` type; `format: integer` on
            # a `number` is not a standard format, so the whole-number
            # constraint is expressed through the type instead.
            type: integer
            example: 16000
            description: |
              The sample rate of the audio in Hz.
          cartesia_version:
            type: string
            # Quoted so the value stays a string rather than being parsed as
            # a date.
            enum:
              - '2026-03-01'
            example: '2026-03-01'
            description: API version.
          language:
            type: string
            enum:
              - en
            default: en
            description: >
              The language of the input audio in ISO-639-1 format. Defaults to
              `en`.
          min_volume:
            type: number
            minimum: 0
            maximum: 1
            description: >
              Used by `ink-whisper` models only.


              Controls what is considered silence for automatic transcript
              finalization. Range: 0.0-1.0.


              Lower values will pick up quiet audio.

              Higher values lead to more filtering for noisy audio.
          max_silence_duration_secs:
            type: number
            description: >
              Used by `ink-whisper` models only.


              Maximum duration of silence (in seconds) before the API
              automatically finalizes the transcript.


              Lower values will finalize transcripts more aggressively.

              Higher values allow for longer pauses within utterances.
    # Flat, list-shaped copy of the query parameters above. Per-parameter
    # `required` flags here correspond to the `required` list of the schema.
    schemaProperties:
      - name: query
        type: object
        required: false
        properties:
          - name: model
            type: string
            description: |
              ID of the model to use for transcription.

              See [Models](/build-with-cartesia/stt/latest) for all options.
            enumValues:
              - ink-2
              - ink-whisper
            required: true
          - name: encoding
            type: string
            description: >
              The encoding format of the audio data. This determines how the
              server interprets the raw binary audio data you send.


              For guidance on choosing an encoding, see [Audio
              Input](/build-with-cartesia/stt/audio-input).
            enumValues:
              - pcm_s16le
              - pcm_s32le
              - pcm_f16le
              - pcm_f32le
              - pcm_mulaw
              - pcm_alaw
            required: true
          - name: sample_rate
            # Matches the `integer` type declared in the query schema.
            type: integer
            description: |
              The sample rate of the audio in Hz.
            required: true
          - name: cartesia_version
            type: string
            description: API version.
            enumValues:
              - '2026-03-01'
            required: true
          - name: language
            type: string
            description: >
              The language of the input audio in ISO-639-1 format. Defaults to
              `en`.
            enumValues:
              - en
            required: false
          - name: min_volume
            type: number
            description: >
              Used by `ink-whisper` models only.


              Controls what is considered silence for automatic transcript
              finalization. Range: 0.0-1.0.


              Lower values will pick up quiet audio.

              Higher values lead to more filtering for noisy audio.
            required: false
          - name: max_silence_duration_secs
            type: number
            description: >
              Used by `ink-whisper` models only.


              Maximum duration of silence (in seconds) before the API
              automatically finalizes the transcript.


              Lower values will finalize transcripts more aggressively.

              Higher values allow for longer pauses within utterances.
            required: false
# Operations on this channel. Each entry is anchored (&ref_N) so the
# send/receive index lists at the end of the document can alias it.
operations:
  # Client-to-server traffic: audio frames and text commands. `type: receive`
  # is the application's (server's) point of view; the same entry is aliased
  # from `sendOperations`.
  - &ref_5
    id: sendSTTAudio
    # Human-readable title; "STT" is kept as a single acronym.
    title: Send STT audio
    type: receive
    messages:
      # Binary audio message sent by the client. `payload` is the list-shaped
      # description and `jsonPayloadSchema` the JSON Schema form; both share
      # the same x-parser-schema-id.
      - &ref_7
        id: sttAudioData
        payload:
          - type: string
            format: binary
            description: >
              Send WebSocket binary messages containing raw audio data as
              specified by the `encoding` and `sample_rate` query parameters.


              Audio Requirements:

              - Send audio in small chunks, e.g. 100 ms

              - Audio format must match the `encoding` and `sample_rate`
              parameters
            x-parser-schema-id: <anonymous-schema-66>
            name: Send Audio Data
        headers: []
        jsonPayloadSchema:
          type: string
          format: binary
          description: >-
            Raw audio data as a binary message in the format specified by the
            `encoding` parameter. Send audio in small chunks, e.g. 100 ms.
          x-parser-schema-id: <anonymous-schema-66>
        title: Send Audio Data
        description: >
          Send WebSocket binary messages containing raw audio data as specified
          by the `encoding` and `sample_rate` query parameters.


          Audio Requirements:

          - Send audio in small chunks, e.g. 100 ms

          - Audio format must match the `encoding` and `sample_rate` parameters
        # NOTE(review): the payload is declared as a binary string, so this
        # JSON-object example looks like a placeholder rather than a real
        # sample message - confirm before relying on it.
        example: '{}'
        bindings: []
        extensions:
          - id: x-parser-unique-object-id
            value: sttAudioData
      # Text command sent by the client: the literal string `finalize`.
      # The enum and examples lists are anchored in `payload` (&ref_0, &ref_1)
      # and aliased from `jsonPayloadSchema`, so both views share one value.
      - &ref_8
        id: sttFinalizeCommand
        payload:
          - type: string
            enum: &ref_0
              - finalize
            description: >-
              Send `finalize` as a text message when the user is done speaking
              to receive the transcript for any buffered audio.
            examples: &ref_1
              - finalize
            x-parser-schema-id: <anonymous-schema-67>
            name: Finalize Command
        headers: []
        jsonPayloadSchema:
          type: string
          enum: *ref_0
          description: The value `finalize` as a text message
          examples: *ref_1
          x-parser-schema-id: <anonymous-schema-67>
        title: Finalize Command
        description: >-
          Send `finalize` as a text message when the user is done speaking to
          receive the transcript for any buffered audio.
        # The example is the JSON-encoded form of the string, hence the inner
        # double quotes.
        example: '"finalize"'
        bindings: []
        extensions:
          - id: x-parser-unique-object-id
            value: sttFinalizeCommand
      # Text command sent by the client: the literal string `close`.
      # The enum and examples lists are anchored in `payload` (&ref_2, &ref_3)
      # and aliased from `jsonPayloadSchema`, so both views share one value.
      - &ref_9
        id: sttCloseCommand
        payload:
          - type: string
            enum: &ref_2
              - close
            description: >-
              Send `close` as a text message to flush remaining audio, close
              the session, and receive a done acknowledgment.
            examples: &ref_3
              - close
            x-parser-schema-id: <anonymous-schema-68>
            name: Close Command
        headers: []
        jsonPayloadSchema:
          type: string
          enum: *ref_2
          description: The value `close` as a text message
          examples: *ref_3
          x-parser-schema-id: <anonymous-schema-68>
        title: Close Command
        description: >-
          Send `close` as a text message to flush remaining audio, close the
          session, and receive a done acknowledgment.
        # The example is the JSON-encoded form of the string, hence the inner
        # double quotes.
        example: '"close"'
        bindings: []
        extensions:
          - id: x-parser-unique-object-id
            value: sttCloseCommand
    bindings: []
    # Anchored so the second operation can alias the same extensions list.
    extensions: &ref_4
      - id: x-parser-unique-object-id
        value: /stt/websocket
  # Server-to-client traffic: transcripts, acknowledgments, and errors.
  # `type: send` is the application's (server's) point of view; the same entry
  # is aliased from `receiveOperations`.
  - &ref_6
    id: receiveSTTTranscription
    # Human-readable title; "STT" is kept as a single acronym.
    title: Receive STT transcription
    type: send
    messages:
      # Transcript message sent by the server. `payload` is the list-shaped
      # description of the fields and `jsonPayloadSchema` the JSON Schema form;
      # the two describe the same object and must be kept in sync.
      - &ref_10
        id: sttTranscriptResponse
        payload:
          - name: Transcript Response
            description: >
              Transcript chunks.


              You should send the `finalize` command after the user is done
              speaking to make the API emit these transcript chunks;

              although, the API may send transcript chunks even before you send
              the `finalize` command.
            type: object
            properties:
              - name: type
                type: string
                description: Response type identifier
                enumValues:
                  - transcript
                required: true
              - name: is_final
                type: boolean
                description: Whether `text` is finalized.
                required: true
              - name: request_id
                type: string
                description: Unique identifier for this WebSocket connection.
                required: true
              - name: text
                type: string
                description: >
                  Transcribed text. This is a delta from the last transcript
                  chunk with `"is_final": true`.


                  To assemble the full transcript, concatenate all transcript
                  chunks where `"is_final": true`.


                  Do not strip whitespace from `text` or add whitespace between
                  chunks as this will produce an incorrect transcript.
                required: true
              - name: duration
                type: number
                description: Duration of the audio in seconds
                required: false
              - name: words
                type: array
                description: Word-level timestamps
                required: false
                # Fields of each element of the `words` array.
                properties:
                  - name: word
                    type: string
                    description: The transcribed word
                    required: true
                  - name: start
                    type: number
                    description: Start time in seconds
                    required: true
                  - name: end
                    type: number
                    description: End time in seconds
                    required: true
        headers: []
        jsonPayloadSchema:
          type: object
          # `duration` and `words` are optional; everything else is required.
          required:
            - type
            - is_final
            - request_id
            - text
          properties:
            type:
              type: string
              enum:
                - transcript
              description: Response type identifier
              x-parser-schema-id: <anonymous-schema-69>
            is_final:
              type: boolean
              description: Whether `text` is finalized.
              x-parser-schema-id: <anonymous-schema-70>
            request_id:
              type: string
              description: Unique identifier for this WebSocket connection.
              x-parser-schema-id: <anonymous-schema-71>
            text:
              type: string
              description: >
                Transcribed text. This is a delta from the last transcript chunk
                with `"is_final": true`.


                To assemble the full transcript, concatenate all transcript
                chunks where `"is_final": true`.


                Do not strip whitespace from `text` or add whitespace between
                chunks as this will produce an incorrect transcript.
              x-parser-schema-id: <anonymous-schema-72>
            duration:
              type: number
              description: Duration of the audio in seconds
              x-parser-schema-id: <anonymous-schema-73>
            words:
              type: array
              description: Word-level timestamps
              items:
                type: object
                required:
                  - word
                  - start
                  - end
                properties:
                  word:
                    type: string
                    description: The transcribed word
                    x-parser-schema-id: <anonymous-schema-76>
                  start:
                    type: number
                    description: Start time in seconds
                    x-parser-schema-id: <anonymous-schema-77>
                  end:
                    type: number
                    description: End time in seconds
                    x-parser-schema-id: <anonymous-schema-78>
                x-parser-schema-id: <anonymous-schema-75>
              x-parser-schema-id: <anonymous-schema-74>
          x-parser-schema-id: STTTranscriptResponse
        title: Transcript Response
        description: >
          Transcript chunks.


          You should send the `finalize` command after the user is done speaking
          to make the API emit these transcript chunks;

          although, the API may send transcript chunks even before you send the
          `finalize` command.
        # The example lists only fields declared in the schema above.
        # NOTE(review): an undeclared `"language": "en"` entry was dropped from
        # this example; if the API does emit that field, restore it together
        # with a matching schema property.
        example: |-
          {
            "type": "transcript",
            "is_final": true,
            "request_id": "b67e1c5d-2f4c-4c3d-9f82-96eb4d2f12a8",
            "text": "How are you doing today?",
            "duration": 2.5,
            "words": [
              {
                "word": "How",
                "start": 0,
                "end": 0.12
              },
              {
                "word": "are",
                "start": 0.15,
                "end": 0.25
              },
              {
                "word": "you",
                "start": 0.28,
                "end": 0.35
              },
              {
                "word": "doing",
                "start": 0.38,
                "end": 0.55
              },
              {
                "word": "today?",
                "start": 0.58,
                "end": 0.78
              }
            ]
          }
        bindings: []
        extensions:
          - id: x-parser-unique-object-id
            value: sttTranscriptResponse
      # Server acknowledgment for the client's `finalize` command, identified
      # by `type: flush_done`. `payload` and `jsonPayloadSchema` describe the
      # same object in list form and JSON Schema form respectively.
      - &ref_11
        id: sttFlushDoneResponse
        payload:
          - name: Flush Done Response
            description: Acknowledgment for the `finalize` command
            type: object
            properties:
              - name: type
                type: string
                description: Response type identifier
                enumValues:
                  - flush_done
                required: true
              - name: request_id
                type: string
                description: Unique identifier for this websocket connection
                required: true
              # Deprecated and optional; documented as carrying no meaning on
              # this message.
              - name: is_final
                type: boolean
                description: Has no meaning for this message
                deprecated: true
                required: false
        headers: []
        jsonPayloadSchema:
          type: object
          required:
            - type
            - request_id
          properties:
            type:
              type: string
              enum:
                - flush_done
              description: Response type identifier
              x-parser-schema-id: <anonymous-schema-79>
            request_id:
              type: string
              description: Unique identifier for this websocket connection
              x-parser-schema-id: <anonymous-schema-80>
            is_final:
              type: boolean
              deprecated: true
              description: Has no meaning for this message
              x-parser-schema-id: <anonymous-schema-81>
          x-parser-schema-id: STTFlushDoneResponse
        title: Flush Done Response
        description: Acknowledgment for the `finalize` command
        # The example shows only the two required fields.
        example: |-
          {
            "type": "flush_done",
            "request_id": "b67e1c5d-2f4c-4c3d-9f82-96eb4d2f12a8"
          }
        bindings: []
        extensions:
          - id: x-parser-unique-object-id
            value: sttFlushDoneResponse
      # Server acknowledgment for the client's `close` command, identified by
      # `type: done`. `payload` and `jsonPayloadSchema` describe the same
      # object in list form and JSON Schema form respectively.
      - &ref_12
        id: sttDoneResponse
        payload:
          - name: Done Response
            description: Acknowledgment for the `close` command
            type: object
            properties:
              - name: type
                type: string
                description: Response type identifier
                enumValues:
                  - done
                required: true
              - name: request_id
                type: string
                description: Unique identifier for this websocket connection
                required: true
              # Deprecated and optional; documented as carrying no meaning on
              # this message.
              - name: is_final
                type: boolean
                description: Has no meaning for this message
                deprecated: true
                required: false
        headers: []
        jsonPayloadSchema:
          type: object
          required:
            - type
            - request_id
          properties:
            type:
              type: string
              enum:
                - done
              description: Response type identifier
              x-parser-schema-id: <anonymous-schema-82>
            request_id:
              type: string
              description: Unique identifier for this websocket connection
              x-parser-schema-id: <anonymous-schema-83>
            is_final:
              type: boolean
              deprecated: true
              description: Has no meaning for this message
              x-parser-schema-id: <anonymous-schema-84>
          x-parser-schema-id: STTDoneResponse
        title: Done Response
        description: Acknowledgment for the `close` command
        # The example shows only the two required fields.
        example: |-
          {
            "type": "done",
            "request_id": "b67e1c5d-2f4c-4c3d-9f82-96eb4d2f12a8"
          }
        bindings: []
        extensions:
          - id: x-parser-unique-object-id
            value: sttDoneResponse
      # Error message sent by the server, identified by `type: error`.
      # `payload` and `jsonPayloadSchema` describe the same object in list
      # form and JSON Schema form respectively and must be kept in sync.
      - &ref_13
        id: sttErrorResponse
        payload:
          - name: Error Response
            description: Error information for STT WebSocket connections.
            type: object
            properties:
              - name: type
                type: string
                description: Event type identifier.
                enumValues:
                  - error
                required: true
              - name: error_code
                type: string
                description: Machine-readable error code.
                required: false
              - name: status_code
                # Matches the `integer` type declared in the JSON Schema form.
                type: integer
                description: An HTTP response status code.
                required: true
              - name: title
                type: string
                description: Human-readable error title.
                required: true
              - name: message
                type: string
                description: Human-readable error message.
                required: true
              - name: doc_url
                type: string
                description: URL to relevant documentation
                required: false
              - name: request_id
                type: string
                description: Unique identifier for this websocket connection
                required: false
        headers: []
        jsonPayloadSchema:
          type: object
          # `error_code`, `doc_url`, and `request_id` are optional.
          required:
            - type
            - status_code
            - title
            - message
          properties:
            type:
              type: string
              enum:
                - error
              description: Event type identifier.
              x-parser-schema-id: <anonymous-schema-85>
            error_code:
              type: string
              description: Machine-readable error code.
              x-parser-schema-id: <anonymous-schema-86>
            status_code:
              # JSON Schema has a dedicated `integer` type; `format: integer`
              # on a `number` is not a standard format, so the whole-number
              # constraint is expressed through the type instead.
              type: integer
              description: An HTTP response status code.
              x-parser-schema-id: <anonymous-schema-87>
            title:
              type: string
              description: Human-readable error title.
              x-parser-schema-id: <anonymous-schema-88>
            message:
              type: string
              description: Human-readable error message.
              x-parser-schema-id: <anonymous-schema-89>
            doc_url:
              type: string
              description: URL to relevant documentation
              x-parser-schema-id: <anonymous-schema-90>
            request_id:
              type: string
              description: Unique identifier for this websocket connection
              x-parser-schema-id: <anonymous-schema-91>
          x-parser-schema-id: STTErrorResponse
        title: Error Response
        description: Error information for STT WebSocket connections.
        example: |-
          {
            "type": "error",
            "title": "Invalid model",
            "message": "The model is not valid, make sure it is a valid model ID.",
            "error_code": "model_not_found",
            "doc_url": "https://docs.cartesia.ai/build-with-cartesia/stt/latest",
            "status_code": 400,
            "request_id": "2ff8af53-4d38-479d-8287-58940f01c701"
          }
        bindings: []
        extensions:
          - id: x-parser-unique-object-id
            value: sttErrorResponse
    # Tail of the receiveSTTTranscription operation; its extensions alias the
    # list anchored on the first operation.
    bindings: []
    extensions: *ref_4
# Index lists grouped by direction from the client's point of view. Every
# entry is an alias to an operation or message anchored under `operations`,
# so nothing is redefined here.
sendOperations:
  - *ref_5
receiveOperations:
  - *ref_6
# Client-sent messages: audio data, `finalize`, `close`.
sendMessages:
  - *ref_7
  - *ref_8
  - *ref_9
# Server-sent messages: transcript, flush_done, done, error.
receiveMessages:
  - *ref_10
  - *ref_11
  - *ref_12
  - *ref_13
# Channel-level extension carrying the parser's unique object id.
extensions:
  - id: x-parser-unique-object-id
    value: /stt/websocket
# Two ways to authenticate, both declared as `httpApiKey`: an API key sent in
# a header, or a short-lived access token sent as a query parameter.
securitySchemes:
  # Header-based API key (`in: header`, header name `X-API-Key`).
  - id: apiKey
    name: X-API-Key
    type: httpApiKey
    description: API key passed in a header.
    in: header
    extensions: []
  # Query-parameter token (`in: query`, parameter name `access_token`).
  - id: accessTokenQuery
    name: access_token
    type: httpApiKey
    description: >
      A short-lived access token passed in a query param to make API requests
      from a client.

      This is particularly useful in the browser, where WebSockets do not
      support headers.

      See [Authenticate client
      apps](/get-started/authenticate-your-client-applications) to generate an
      access token.
    in: query
    extensions: []

````