> ## Documentation Index
> Fetch the complete documentation index at: https://docs.cartesia.ai/llms.txt
> Use this file to discover all available pages before exploring further.

# Text-to-Speech (WebSocket)

> Generate audio in realtime with contexts


## AsyncAPI

````yaml asyncapi.yml /tts/websocket
id: /tts/websocket
title: /tts/websocket
description: Generate audio in realtime with contexts
servers:
  - id: production
    protocol: wss
    host: api.cartesia.ai
    bindings: []
    variables: []
address: /tts/websocket
parameters:
  - id: cartesia_version
    jsonSchema:
      type: string
      description: API version, e.g. `2026-03-01`
    description: API version, e.g. `2026-03-01`
    type: string
    required: true
    deprecated: false
bindings: []
operations:
  - &ref_2
    id: sendTTSGeneration
    title: Send TTS generation
    type: receive
    messages:
      - &ref_4
        id: generationRequest
        payload:
          - name: Generation Request
            description: Use this to generate speech for a transcript.
            type: object
            properties:
              - name: model_id
                type: string
                description: >-
                  The ID of the model to use for the generation. See
                  [Models](/build-with-cartesia/tts-models/latest) for all
                  options.
                enumValues:
                  - sonic-3.5
                  - sonic-3
                  - sonic-latest
                required: true
              - name: transcript
                type: string
                description: >-
                  Transcript chunk to add to the audio being generated by this
                  context.
                required: true
              - name: voice
                type: object
                description: >-
                  Voice configuration. Use the same value for all generation
                  requests made to the same context.
                required: true
                properties:
                  - name: mode
                    type: string
                    description: Voice selection mode
                    enumValues:
                      - id
                    required: true
                  - name: id
                    type: string
                    description: The ID of the voice.
                    required: true
              - name: output_format
                type: object
                description: >-
                  Audio output format configuration. Use the same value for all
                  generation requests made to the same context.
                required: true
                properties:
                  - name: container
                    type: string
                    description: Audio container format
                    enumValues:
                      - raw
                    required: true
                  - name: encoding
                    type: string
                    description: >-
                      Audio encoding format. See [TTS Output Audio
                      Format](/build-with-cartesia/capability-guides/tts-output-audio-format)
                      if you're unsure what to use.
                    enumValues:
                      - pcm_f32le
                      - pcm_s16le
                      - pcm_mulaw
                      - pcm_alaw
                    required: true
                  - name: sample_rate
                    type: integer
                    description: Audio sample rate in Hz.
                    enumValues:
                      - 8000
                      - 16000
                      - 22050
                      - 24000
                      - 44100
                      - 48000
                    required: true
              - name: language
                type: string
                description: >-
                  The transcript's language. Use the same value for all
                  generation requests made to the same context.
                enumValues:
                  - en
                  - fr
                  - de
                  - es
                  - pt
                  - zh
                  - ja
                  - hi
                  - it
                  - ko
                  - nl
                  - pl
                  - ru
                  - sv
                  - tr
                  - tl
                  - bg
                  - ro
                  - ar
                  - cs
                  - el
                  - fi
                  - hr
                  - ms
                  - sk
                  - da
                  - ta
                  - uk
                  - hu
                  - 'no'
                  - vi
                  - bn
                  - th
                  - he
                  - ka
                  - id
                  - te
                  - gu
                  - kn
                  - ml
                  - mr
                  - pa
                required: false
              - name: context_id
                type: string
                description: >
                  A unique identifier for the context. You can use any unique
                  identifier, like a UUID or human ID.


                  See [Contexts](/use-the-api/tts-websocket/contexts) for
                  details.
                required: true
              - name: continue
                type: boolean
                description: >
                  Whether `transcript` may be followed by more text:
                    - Set to `true` if you will add more `transcript` chunks to this context
                    - Set to `false` on the last `transcript` chunk for this context to minimize latency

                  Defaults to `false`


                  See [Contexts](/use-the-api/tts-websocket/contexts) for
                  details
                required: false
              - name: max_buffer_delay_ms
                type: integer
                description: >
                  The maximum time in milliseconds to buffer text before
                  starting generation. Values between [0, 5000]ms are supported.
                  Defaults to 3000ms.


                  See [Buffering](/use-the-api/tts-websocket/buffering) for
                  details
                required: false
              - name: flush
                type: boolean
                description: >
                  Whether to flush the context


                  See [Context
                  Flushing](/use-the-api/tts-websocket/context-flushing-and-flush-i-ds)
                  for details
                required: false
              - name: add_timestamps
                type: boolean
                description: >
                  Whether to include word-level timestamps for the generated
                  audio
                required: false
              - name: add_phoneme_timestamps
                type: boolean
                description: >
                  Whether to include phoneme-level timestamps for the generated
                  audio
                required: false
              - name: use_normalized_timestamps
                type: boolean
                description: >-
                  Whether to use normalized timestamps (`true`) or original
                  timestamps (`false`)
                required: false
              - name: pronunciation_dict_id
                type: string
                description: >-
                  The ID of a pronunciation dictionary to use for the generation

                  See [Custom
                  Pronunciations](/build-with-cartesia/capability-guides/custom-pronunciations)
                  for details
                required: false
              - name: generation_config
                type: object
                description: >-
                  Configure the various attributes of the generated speech

                  See [Volume, Speed, and
                  Emotion](/build-with-cartesia/capability-guides/volume-speed-emotion)
                  for details
                required: false
                properties:
                  - name: volume
                    type: number
                    description: >-
                      Adjust the volume of the generated speech between 0.5x and
                      2.0x the default volume. Valid values are between [0.5,
                      2.0] inclusive.
                    required: false
                  - name: speed
                    type: number
                    description: >-
                      Adjust the speed of the generated speech between 0.6x and
                      1.5x the default speed. Valid values are between [0.6,
                      1.5] inclusive.
                    required: false
                  - name: emotion
                    type: string
                    title: Emotion
                    description: >-
                      Must match a valid option exactly

                      A complete list can be found on the [Volume, Speed, and
                      Emotion](/build-with-cartesia/capability-guides/volume-speed-emotion)
                      page.
                    enumValues:
                      - neutral
                      - calm
                      - angry
                      - content
                      - sad
                    required: false
              - name: speed
                type: string
                description: >-
                  This property is deprecated and may not work for all voices.
                  Use `generation_config.speed` instead.

                  Influences the speed of the generated speech.
                enumValues:
                  - slow
                  - normal
                  - fast
                deprecated: true
                required: false
        headers: []
        jsonPayloadSchema:
          type: object
          required:
            - model_id
            - transcript
            - voice
            - output_format
            - context_id
          properties:
            model_id:
              type: string
              enum:
                - sonic-3.5
                - sonic-3
                - sonic-latest
              example: sonic-3.5
              description: >-
                The ID of the model to use for the generation. See
                [Models](/build-with-cartesia/tts-models/latest) for all
                options.
              x-parser-schema-id: <anonymous-schema-2>
            transcript:
              type: string
              description: >-
                Transcript chunk to add to the audio being generated by this
                context.
              x-parser-schema-id: <anonymous-schema-3>
            voice:
              type: object
              description: >-
                Voice configuration. Use the same value for all generation
                requests made to the same context.
              required:
                - mode
                - id
              properties:
                mode:
                  type: string
                  enum:
                    - id
                  description: Voice selection mode
                  default: id
                  x-parser-schema-id: <anonymous-schema-5>
                id:
                  type: string
                  description: The ID of the voice.
                  x-parser-schema-id: <anonymous-schema-6>
              x-parser-schema-id: <anonymous-schema-4>
            output_format:
              type: object
              description: >-
                Audio output format configuration. Use the same value for all
                generation requests made to the same context.
              required:
                - container
                - encoding
                - sample_rate
              properties:
                container:
                  type: string
                  enum:
                    - raw
                  description: Audio container format
                  default: raw
                  x-parser-schema-id: <anonymous-schema-8>
                encoding:
                  type: string
                  enum:
                    - pcm_f32le
                    - pcm_s16le
                    - pcm_mulaw
                    - pcm_alaw
                  description: >-
                    Audio encoding format. See [TTS Output Audio
                    Format](/build-with-cartesia/capability-guides/tts-output-audio-format)
                    if you're unsure what to use.
                  x-parser-schema-id: <anonymous-schema-9>
                sample_rate:
                  type: integer
                  enum:
                    - 8000
                    - 16000
                    - 22050
                    - 24000
                    - 44100
                    - 48000
                  description: Audio sample rate in Hz.
                  x-parser-schema-id: <anonymous-schema-10>
              x-parser-schema-id: <anonymous-schema-7>
            language:
              type: string
              description: >-
                The transcript's language. Use the same value for all generation
                requests made to the same context.
              enum:
                - en
                - fr
                - de
                - es
                - pt
                - zh
                - ja
                - hi
                - it
                - ko
                - nl
                - pl
                - ru
                - sv
                - tr
                - tl
                - bg
                - ro
                - ar
                - cs
                - el
                - fi
                - hr
                - ms
                - sk
                - da
                - ta
                - uk
                - hu
                - 'no'
                - vi
                - bn
                - th
                - he
                - ka
                - id
                - te
                - gu
                - kn
                - ml
                - mr
                - pa
              x-parser-schema-id: <anonymous-schema-11>
            context_id: &ref_0
              type: string
              description: >
                A unique identifier for the context. You can use any unique
                identifier, like a UUID or human ID.


                See [Contexts](/use-the-api/tts-websocket/contexts) for details.
              x-parser-schema-id: TTSContextID
            continue:
              type: boolean
              description: |
                Whether `transcript` may be followed by more text:
                  - Set to `true` if you will add more `transcript` chunks to this context
                  - Set to `false` on the last `transcript` chunk for this context to minimize latency

                Defaults to `false`

                See [Contexts](/use-the-api/tts-websocket/contexts) for details
              default: false
              x-parser-schema-id: <anonymous-schema-12>
            max_buffer_delay_ms:
              type: integer
              description: >
                The maximum time in milliseconds to buffer text before starting
                generation. Values between [0, 5000]ms are supported. Defaults
                to 3000ms.


                See [Buffering](/use-the-api/tts-websocket/buffering) for
                details
              default: 3000
              x-parser-schema-id: <anonymous-schema-13>
            flush:
              type: boolean
              description: >
                Whether to flush the context


                See [Context
                Flushing](/use-the-api/tts-websocket/context-flushing-and-flush-i-ds)
                for details
              x-parser-schema-id: <anonymous-schema-14>
            add_timestamps:
              type: boolean
              description: |
                Whether to include word-level timestamps for the generated audio
              default: false
              x-parser-schema-id: <anonymous-schema-15>
            add_phoneme_timestamps:
              type: boolean
              description: >
                Whether to include phoneme-level timestamps for the generated
                audio
              default: false
              x-parser-schema-id: <anonymous-schema-16>
            use_normalized_timestamps:
              type: boolean
              description: >-
                Whether to use normalized timestamps (`true`) or original
                timestamps (`false`)
              x-parser-schema-id: <anonymous-schema-17>
            pronunciation_dict_id:
              type: string
              description: >-
                The ID of a pronunciation dictionary to use for the generation

                See [Custom
                Pronunciations](/build-with-cartesia/capability-guides/custom-pronunciations)
                for details
              x-parser-schema-id: <anonymous-schema-18>
            generation_config:
              type: object
              description: >-
                Configure the various attributes of the generated speech

                See [Volume, Speed, and
                Emotion](/build-with-cartesia/capability-guides/volume-speed-emotion)
                for details
              properties:
                volume:
                  type: number
                  description: >-
                    Adjust the volume of the generated speech between 0.5x and
                    2.0x the default volume. Valid values are between [0.5, 2.0]
                    inclusive.
                  minimum: 0.5
                  maximum: 2
                  default: 1
                  x-parser-schema-id: <anonymous-schema-20>
                speed:
                  type: number
                  description: >-
                    Adjust the speed of the generated speech between 0.6x and
                    1.5x the default speed. Valid values are between [0.6, 1.5]
                    inclusive.
                  minimum: 0.6
                  maximum: 1.5
                  default: 1
                  x-parser-schema-id: <anonymous-schema-21>
                emotion:
                  title: Emotion
                  type: string
                  description: >-
                    Must match a valid option exactly

                    A complete list can be found on the [Volume, Speed, and
                    Emotion](/build-with-cartesia/capability-guides/volume-speed-emotion)
                    page.
                  enum:
                    - neutral
                    - calm
                    - angry
                    - content
                    - sad
                  x-parser-schema-id: <anonymous-schema-22>
              x-parser-schema-id: <anonymous-schema-19>
            speed:
              deprecated: true
              type: string
              enum:
                - slow
                - normal
                - fast
              default: normal
              description: >-
                This property is deprecated and may not work for all voices. Use
                `generation_config.speed` instead.

                Influences the speed of the generated speech.
              x-parser-schema-id: <anonymous-schema-23>
          x-parser-schema-id: GenerationRequest
        title: Generation Request
        description: Use this to generate speech for a transcript.
        example: |-
          {
            "model_id": "sonic-latest",
            "transcript": "Hello, world! I'm generating audio on Cartesia!",
            "voice": {
              "mode": "id",
              "id": "a0e99841-438c-4a64-b679-ae501e7d6091"
            },
            "language": "en",
            "context_id": "ab977222-f9e0-4563-a1c0-5a934ae8fdd6",
            "output_format": {
              "container": "raw",
              "encoding": "pcm_s16le",
              "sample_rate": 8000
            },
            "add_timestamps": true,
            "continue": false
          }
        bindings: []
        extensions:
          - id: x-parser-unique-object-id
            value: generationRequest
      - &ref_5
        id: cancelRequest
        payload:
          - name: Cancel Context Request
            description: >-
              Use this to cancel a context, so that no more messages are
              generated for that context.
            type: object
            properties:
              - name: context_id
                type: string
                description: >
                  A unique identifier for the context. You can use any unique
                  identifier, like a UUID or human ID.


                  See [Contexts](/use-the-api/tts-websocket/contexts) for
                  details.
                required: true
              - name: cancel
                type: boolean
                description: >-
                  Whether to cancel the context, so that no more messages are
                  generated for that context.
                enumValues:
                  - true
                required: true
        headers: []
        jsonPayloadSchema:
          type: object
          required:
            - context_id
            - cancel
          properties:
            context_id: *ref_0
            cancel:
              type: boolean
              enum:
                - true
              description: >-
                Whether to cancel the context, so that no more messages are
                generated for that context.
              x-parser-schema-id: <anonymous-schema-24>
          x-parser-schema-id: CancelRequest
        title: Cancel Context Request
        description: >-
          Use this to cancel a context, so that no more messages are generated
          for that context.
        example: |-
          {
            "context_id": "50dc3b5e-5841-4aa1-9f94-60cfb9aead79",
            "cancel": true
          }
        bindings: []
        extensions:
          - id: x-parser-unique-object-id
            value: cancelRequest
    bindings: []
    extensions: &ref_1
      - id: x-parser-unique-object-id
        value: /tts/websocket
  - &ref_3
    id: receiveTTSAudio
    title: Receive TTS audio
    description: >-
      The server will send you back a stream of messages with the same
      `context_id` as your request. The messages can be of type `chunk`,
      `flush_done`, `timestamps`, `phoneme_timestamps`, `error`, or `done`.
    type: send
    messages:
      - &ref_6
        id: chunkResponse
        payload:
          - name: Audio Chunk Response
            description: Audio data chunk
            type: object
            properties:
              - name: type
                type: string
                description: Response type identifier
                enumValues:
                  - chunk
                required: true
              - name: data
                type: string
                description: Base64-encoded audio data
                required: true
              - name: done
                type: boolean
                description: Whether this is the final chunk for this context
                required: true
              - name: status_code
                type: integer
                description: HTTP-style status code
                required: true
              - name: step_time
                type: number
                description: Server-side processing time for this chunk in milliseconds
                required: true
              - name: context_id
                type: string
                description: >
                  A unique identifier for the context. You can use any unique
                  identifier, like a UUID or human ID.


                  See [Contexts](/use-the-api/tts-websocket/contexts) for
                  details.
                required: true
        headers: []
        jsonPayloadSchema:
          type: object
          required:
            - type
            - data
            - done
            - status_code
            - step_time
            - context_id
          properties:
            type:
              type: string
              enum:
                - chunk
              description: Response type identifier
              x-parser-schema-id: <anonymous-schema-25>
            data:
              type: string
              description: Base64-encoded audio data
              x-parser-schema-id: <anonymous-schema-26>
            done:
              type: boolean
              description: Whether this is the final chunk for this context
              x-parser-schema-id: <anonymous-schema-27>
            status_code:
              type: integer
              description: HTTP-style status code
              x-parser-schema-id: <anonymous-schema-28>
            step_time:
              type: number
              description: Server-side processing time for this chunk in milliseconds
              x-parser-schema-id: <anonymous-schema-29>
            context_id: *ref_0
          x-parser-schema-id: ChunkResponse
        title: Audio Chunk Response
        description: Audio data chunk
        example: |-
          {
            "type": "chunk",
            "data": "aSDinaTvuI8gbWludGxpZnk=",
            "done": false,
            "status_code": 206,
            "step_time": 123,
            "context_id": "50dc3b5e-5841-4aa1-9f94-60cfb9aead79"
          }
        bindings: []
        extensions:
          - id: x-parser-unique-object-id
            value: chunkResponse
      - &ref_7
        id: flushDoneResponse
        payload:
          - name: Flush Done Response
            description: >-
              Acknowledgment that a flush command was received. See [Context
              Flushing](/use-the-api/tts-websocket/context-flushing-and-flush-i-ds)
              for details.
            type: object
            properties:
              - name: type
                type: string
                description: Response type identifier
                enumValues:
                  - flush_done
                required: true
              - name: done
                type: boolean
                description: Whether generation is complete
                required: true
              - name: flush_done
                type: boolean
                description: Whether the flush is complete
                required: true
              - name: flush_id
                type: integer
                description: >-
                  An identifier corresponding to the number of flush commands
                  that have been sent for this context. Starts at 1. This can be
                  used to map chunks of audio to certain transcript submissions.
                required: true
              - name: status_code
                type: integer
                description: HTTP-style status code
                required: true
              - name: context_id
                type: string
                description: >
                  A unique identifier for the context. You can use any unique
                  identifier, like a UUID or human ID.


                  See [Contexts](/use-the-api/tts-websocket/contexts) for
                  details.
                required: true
        headers: []
        jsonPayloadSchema:
          type: object
          required:
            - type
            - done
            - flush_done
            - flush_id
            - status_code
            - context_id
          properties:
            type:
              type: string
              enum:
                - flush_done
              description: Response type identifier
              x-parser-schema-id: <anonymous-schema-30>
            done:
              type: boolean
              description: Whether generation is complete
              x-parser-schema-id: <anonymous-schema-31>
            flush_done:
              type: boolean
              description: Whether the flush is complete
              x-parser-schema-id: <anonymous-schema-32>
            flush_id:
              type: integer
              description: >-
                An identifier corresponding to the number of flush commands that
                have been sent for this context. Starts at 1. This can be used
                to map chunks of audio to certain transcript submissions.
              x-parser-schema-id: <anonymous-schema-33>
            status_code:
              type: integer
              description: HTTP-style status code
              x-parser-schema-id: <anonymous-schema-34>
            context_id: *ref_0
          x-parser-schema-id: FlushDoneResponse
        title: Flush Done Response
        description: >-
          Acknowledgment that a flush command was received. See [Context
          Flushing](/use-the-api/tts-websocket/context-flushing-and-flush-i-ds)
          for details.
        example: |-
          {
            "type": "flush_done",
            "done": false,
            "flush_done": true,
            "flush_id": 1,
            "status_code": 206,
            "context_id": "50dc3b5e-5841-4aa1-9f94-60cfb9aead79"
          }
        bindings: []
        extensions:
          - id: x-parser-unique-object-id
            value: flushDoneResponse
      - &ref_8
        id: doneResponse
        payload:
          - name: Done Response
            description: Generation completion signal
            type: object
            properties:
              - name: type
                type: string
                description: Response type identifier
                enumValues:
                  - done
                required: true
              - name: done
                type: boolean
                description: Whether generation is complete
                required: true
              - name: status_code
                type: integer
                description: HTTP-style status code
                required: true
              - name: context_id
                type: string
                description: >
                  A unique identifier for the context. You can use any unique
                  identifier, like a UUID or human ID.


                  See [Contexts](/use-the-api/tts-websocket/contexts) for
                  details.
                required: true
        headers: []
        jsonPayloadSchema:
          type: object
          required:
            - type
            - done
            - status_code
            - context_id
          properties:
            type:
              type: string
              enum:
                - done
              description: Response type identifier
              x-parser-schema-id: <anonymous-schema-35>
            done:
              type: boolean
              description: Whether generation is complete
              x-parser-schema-id: <anonymous-schema-36>
            status_code:
              type: integer
              description: HTTP-style status code
              x-parser-schema-id: <anonymous-schema-37>
            context_id: *ref_0
          x-parser-schema-id: DoneResponse
        title: Done Response
        description: Generation completion signal
        example: |-
          {
            "type": "done",
            "done": true,
            "status_code": 206,
            "context_id": "50dc3b5e-5841-4aa1-9f94-60cfb9aead79"
          }
        bindings: []
        extensions:
          - id: x-parser-unique-object-id
            value: doneResponse
      # Message definition: word-level timing information for generated audio.
      # Anchored as ref_9 so it can be aliased from the receiveMessages index.
      - &ref_9
        id: timestampsResponse
        # Flattened, render-oriented view of the payload: one entry per field
        # with its name, type, description and required flag.
        payload:
          - name: Word Timestamps Response
            description: Word-level timing information
            type: object
            properties:
              # Discriminator: the only allowed value is `timestamps`.
              - name: type
                type: string
                description: Response type identifier
                enumValues:
                  - timestamps
                required: true
              - name: done
                type: boolean
                description: Whether generation is complete
                required: true
              - name: status_code
                type: integer
                description: HTTP-style status code
                required: true
              - name: context_id
                type: string
                description: >
                  A unique identifier for the context. You can use any unique
                  identifier, like a UUID or human ID.


                  See [Contexts](/use-the-api/tts-websocket/contexts) for
                  details.
                required: true
              # Optional container holding three arrays (words, start, end).
              # NOTE(review): presumably parallel arrays indexed by word, as in
              # the example below; the schema itself does not enforce equal
              # lengths — confirm.
              - name: word_timestamps
                type: object
                description: Word-level timing information
                required: false
                properties:
                  - name: words
                    type: array
                    description: List of words in order
                    required: false
                    properties:
                      - name: item
                        type: string
                        required: false
                  - name: start
                    type: array
                    description: Start times in seconds for each word
                    required: false
                    properties:
                      - name: item
                        type: number
                        required: false
                  - name: end
                    type: array
                    description: End times in seconds for each word
                    required: false
                    properties:
                      - name: item
                        type: number
                        required: false
        headers: []
        # JSON Schema form of the same payload. `word_timestamps` is not in
        # `required`, matching `required: false` in the field list above.
        jsonPayloadSchema:
          type: object
          required:
            - type
            - done
            - status_code
            - context_id
          properties:
            type:
              type: string
              enum:
                - timestamps
              description: Response type identifier
              x-parser-schema-id: <anonymous-schema-38>
            done:
              type: boolean
              description: Whether generation is complete
              x-parser-schema-id: <anonymous-schema-39>
            status_code:
              type: integer
              description: HTTP-style status code
              x-parser-schema-id: <anonymous-schema-40>
            # Alias to the shared context_id schema; its anchor (ref_0) is
            # defined earlier in the document.
            context_id: *ref_0
            word_timestamps:
              type: object
              description: Word-level timing information
              properties:
                words:
                  type: array
                  items:
                    type: string
                    x-parser-schema-id: <anonymous-schema-43>
                  description: List of words in order
                  x-parser-schema-id: <anonymous-schema-42>
                start:
                  type: array
                  items:
                    type: number
                    x-parser-schema-id: <anonymous-schema-45>
                  description: Start times in seconds for each word
                  x-parser-schema-id: <anonymous-schema-44>
                end:
                  type: array
                  items:
                    type: number
                    x-parser-schema-id: <anonymous-schema-47>
                  description: End times in seconds for each word
                  x-parser-schema-id: <anonymous-schema-46>
              x-parser-schema-id: <anonymous-schema-41>
          x-parser-schema-id: TimestampsResponse
        title: Word Timestamps Response
        description: Word-level timing information
        # Sample message stored as a JSON string (literal block scalar with the
        # trailing newline stripped). It shows two words with one start and one
        # end time each.
        example: |-
          {
            "type": "timestamps",
            "done": false,
            "status_code": 206,
            "context_id": "872ec12d-bc63-4e1e-a241-4f58c879d105",
            "word_timestamps": {
              "words": [
                "Hello",
                "world"
              ],
              "start": [
                0,
                0.5
              ],
              "end": [
                0.4,
                0.9
              ]
            }
          }
        bindings: []
        extensions:
          - id: x-parser-unique-object-id
            value: timestampsResponse
      # Message definition: phoneme-level timing information for generated
      # audio. Anchored as ref_10 so it can be aliased from the receiveMessages
      # index.
      - &ref_10
        id: phonemeTimestampsResponse
        # Flattened, render-oriented view of the payload: one entry per field
        # with its name, type, description and required flag.
        payload:
          - name: Phoneme Timestamps Response
            description: Phoneme-level timing information
            type: object
            properties:
              # Discriminator: the only allowed value is `phoneme_timestamps`.
              - name: type
                type: string
                description: Response type identifier
                enumValues:
                  - phoneme_timestamps
                required: true
              - name: done
                type: boolean
                description: Whether generation is complete
                required: true
              - name: status_code
                type: integer
                description: HTTP-style status code
                required: true
              - name: context_id
                type: string
                description: >
                  A unique identifier for the context. You can use any unique
                  identifier, like a UUID or human ID.


                  See [Contexts](/use-the-api/tts-websocket/contexts) for
                  details.
                required: true
              # Optional container holding three arrays (phonemes, start, end).
              # NOTE(review): presumably parallel arrays indexed by phoneme, as
              # in the example below; the schema itself does not enforce equal
              # lengths — confirm.
              - name: phoneme_timestamps
                type: object
                description: Phoneme-level timing information
                required: false
                properties:
                  - name: phonemes
                    type: array
                    description: List of phonemes in order
                    required: false
                    properties:
                      - name: item
                        type: string
                        required: false
                  - name: start
                    type: array
                    description: Start times in seconds for each phoneme
                    required: false
                    properties:
                      - name: item
                        type: number
                        required: false
                  - name: end
                    type: array
                    description: End times in seconds for each phoneme
                    required: false
                    properties:
                      - name: item
                        type: number
                        required: false
        headers: []
        # JSON Schema form of the same payload. `phoneme_timestamps` is not in
        # `required`, matching `required: false` in the field list above.
        jsonPayloadSchema:
          type: object
          required:
            - type
            - done
            - status_code
            - context_id
          properties:
            type:
              type: string
              enum:
                - phoneme_timestamps
              description: Response type identifier
              x-parser-schema-id: <anonymous-schema-48>
            done:
              type: boolean
              description: Whether generation is complete
              x-parser-schema-id: <anonymous-schema-49>
            status_code:
              type: integer
              description: HTTP-style status code
              x-parser-schema-id: <anonymous-schema-50>
            # Alias to the shared context_id schema; its anchor (ref_0) is
            # defined earlier in the document.
            context_id: *ref_0
            phoneme_timestamps:
              type: object
              description: Phoneme-level timing information
              properties:
                phonemes:
                  type: array
                  items:
                    type: string
                    x-parser-schema-id: <anonymous-schema-53>
                  description: List of phonemes in order
                  x-parser-schema-id: <anonymous-schema-52>
                start:
                  type: array
                  items:
                    type: number
                    x-parser-schema-id: <anonymous-schema-55>
                  description: Start times in seconds for each phoneme
                  x-parser-schema-id: <anonymous-schema-54>
                end:
                  type: array
                  items:
                    type: number
                    x-parser-schema-id: <anonymous-schema-57>
                  description: End times in seconds for each phoneme
                  x-parser-schema-id: <anonymous-schema-56>
              x-parser-schema-id: <anonymous-schema-51>
          x-parser-schema-id: PhonemeTimestampsResponse
        title: Phoneme Timestamps Response
        description: Phoneme-level timing information
        # Sample message stored as a JSON string (literal block scalar with the
        # trailing newline stripped). It shows four phonemes where each end time
        # equals the next phoneme's start time.
        example: |-
          {
            "type": "phoneme_timestamps",
            "done": false,
            "status_code": 206,
            "context_id": "872ec12d-bc63-4e1e-a241-4f58c879d105",
            "phoneme_timestamps": {
              "phonemes": [
                "h",
                "ə",
                "l",
                "oʊ"
              ],
              "start": [
                0.093,
                0.174,
                0.255,
                0.337
              ],
              "end": [
                0.174,
                0.255,
                0.337,
                0.418
              ]
            }
          }
        bindings: []
        extensions:
          - id: x-parser-unique-object-id
            value: phonemeTimestampsResponse
      # Message definition: error information for TTS WebSocket connections.
      # Anchored as ref_11 so it can be aliased from the receiveMessages index.
      - &ref_11
        id: ttsErrorResponse
        # Flattened, render-oriented view of the payload: one entry per field
        # with its name, type, description and required flag. Only type,
        # status_code, title and message are required.
        payload:
          - name: Error Response
            description: Error information for TTS WebSocket connections.
            type: object
            properties:
              # Discriminator: the only allowed value is `error`.
              - name: type
                type: string
                description: Response type identifier
                enumValues:
                  - error
                required: true
              - name: done
                type: boolean
                description: Whether generation is complete
                required: false
              - name: error_code
                type: string
                description: Machine-readable error code.
                required: false
              # Declared as integer, consistent with the status_code field of
              # the other response messages in this document.
              - name: status_code
                type: integer
                description: An HTTP response status code.
                required: true
              - name: title
                type: string
                description: Human-readable error title.
                required: true
              - name: message
                type: string
                description: Human-readable error message.
                required: true
              - name: doc_url
                type: string
                description: URL to relevant documentation
                required: false
              - name: request_id
                type: string
                description: Unique identifier for this websocket connection
                required: false
              # Optional here, unlike the other response messages in this
              # section where context_id is required.
              - name: context_id
                type: string
                description: >
                  A unique identifier for the context. You can use any unique
                  identifier, like a UUID or human ID.


                  See [Contexts](/use-the-api/tts-websocket/contexts) for
                  details.
                required: false
        headers: []
        # JSON Schema form of the same payload; the `required` list matches the
        # `required: true` flags in the field list above.
        jsonPayloadSchema:
          type: object
          required:
            - type
            - title
            - message
            - status_code
          properties:
            type:
              type: string
              enum:
                - error
              description: Response type identifier
              x-parser-schema-id: <anonymous-schema-58>
            done:
              type: boolean
              description: Whether generation is complete
              x-parser-schema-id: <anonymous-schema-59>
            error_code:
              type: string
              description: Machine-readable error code.
              x-parser-schema-id: <anonymous-schema-60>
            # `type: integer` replaces the former `type: number` plus
            # `format: integer` pair, so this field is typed the same way as
            # status_code in the other response schemas.
            status_code:
              type: integer
              description: An HTTP response status code.
              x-parser-schema-id: <anonymous-schema-61>
            title:
              type: string
              description: Human-readable error title.
              x-parser-schema-id: <anonymous-schema-62>
            message:
              type: string
              description: Human-readable error message.
              x-parser-schema-id: <anonymous-schema-63>
            doc_url:
              type: string
              description: URL to relevant documentation
              x-parser-schema-id: <anonymous-schema-64>
            request_id:
              type: string
              description: Unique identifier for this websocket connection
              x-parser-schema-id: <anonymous-schema-65>
            # Alias to the shared context_id schema; its anchor (ref_0) is
            # defined earlier in the document.
            context_id: *ref_0
          x-parser-schema-id: TTSErrorResponse
        title: Error Response
        description: Error information for TTS WebSocket connections.
        # Sample message stored as a JSON string (literal block scalar with the
        # trailing newline stripped). It populates every field, including the
        # optional ones.
        example: |-
          {
            "type": "error",
            "done": true,
            "title": "Invalid model",
            "message": "The model is not valid, make sure it is a valid model ID.",
            "error_code": "model_not_found",
            "status_code": 400,
            "doc_url": "https://docs.cartesia.ai/build-with-cartesia/tts-models/latest",
            "request_id": "2ff8af53-4d38-479d-8287-58940f01c701",
            "context_id": "50dc3b5e-5841-4aa1-9f94-60cfb9aead79"
          }
        bindings: []
        extensions:
          - id: x-parser-unique-object-id
            value: ttsErrorResponse
    bindings: []
    extensions: *ref_1
# Direction indexes. Every entry is a YAML alias to an operation or message
# anchored earlier in this document; nothing is redefined here.
# NOTE(review): ref_2 (sendTTSGeneration) is declared with `type: receive` yet
# is listed under sendOperations — presumably the index is written from the
# client's point of view; confirm.
sendOperations:
  - *ref_2
receiveOperations:
  - *ref_3
# ref_4 is generationRequest; the ref_5 anchor is defined outside this section.
sendMessages:
  - *ref_4
  - *ref_5
# ref_9, ref_10 and ref_11 are timestampsResponse, phonemeTimestampsResponse
# and ttsErrorResponse; the ref_6 to ref_8 anchors are defined earlier in the
# document.
receiveMessages:
  - *ref_6
  - *ref_7
  - *ref_8
  - *ref_9
  - *ref_10
  - *ref_11
# Parser-assigned identifier for this channel; the value equals the channel id.
extensions:
  - id: x-parser-unique-object-id
    value: /tts/websocket
# Authentication options for the channel. Both schemes are of type httpApiKey
# and differ only in where the credential is carried (header or query string).
securitySchemes:
  # API key sent in the `X-API-Key` request header.
  - id: apiKey
    name: X-API-Key
    type: httpApiKey
    description: API key passed in a header.
    in: header
    extensions: []
  # Short-lived access token sent as the `access_token` query parameter.
  # The description is a folded block scalar, so its blank lines become
  # newlines in the rendered text.
  - id: accessTokenQuery
    name: access_token
    type: httpApiKey
    description: >
      A short-lived access token passed in a query param to make API requests
      from a client.

      This is particularly useful in the browser, where WebSockets do not
      support headers.

      See [Authenticate client
      apps](/get-started/authenticate-your-client-applications) to generate an
      access token.
    in: query
    extensions: []

````