> ## Documentation Index
> Fetch the complete documentation index at: https://docs.cartesia.ai/llms.txt
> Use this file to discover all available pages before exploring further.

# Text-to-Speech (SSE)

> Stream audio with extra metadata from a complete transcript



## OpenAPI

````yaml 2024-06-10/api.yml POST /tts/sse
# OpenAPI 3.0.1 description of the Cartesia API; the document version is
# the dated string '2024-06-10' (quoted so it stays a string, not a date).
openapi: 3.0.1
info:
  title: Cartesia API
  version: '2024-06-10'
servers:
  - url: https://api.cartesia.ai
    description: Production
# No document-wide security requirement; the operation under `paths`
# declares its own.
security: []
paths:
  # Single operation: POST a complete transcript as JSON and receive a
  # server-sent event stream in response.
  /tts/sse:
    post:
      tags:
        - Tts
      summary: Text-to-Speech (SSE)
      description: Stream audio with extra metadata from a complete transcript
      operationId: tts_sse
      parameters:
        # Required `Cartesia-Version` request header, defined under
        # components/parameters.
        - $ref: '#/components/parameters/CartesiaVersionHeader'
      requestBody:
        required: true
        content:
          application/json:
            schema:
              $ref: '#/components/schemas/TTSSSERequest'
      responses:
        '200':
          description: >-
            Server-sent events stream. Each frame is `data: <json>\n\n` where
            the JSON payload matches `TTSSSEEvent`.
          content:
            text/event-stream:
              # Describes the JSON payload carried by each frame, per the
              # response description above.
              schema:
                $ref: '#/components/schemas/TTSSSEEvent'
      # Operation-level requirement; the top-level `security` list is empty.
      security:
        - ApiKeyAuth: []
components:
  parameters:
    # Mandatory request header carrying the API version. The enum admits
    # exactly one value, '2024-06-10'.
    CartesiaVersionHeader:
      name: Cartesia-Version
      in: header
      required: true
      description: API version header.
      schema:
        type: string
        format: date
        enum: ['2024-06-10']
        example: '2024-06-10'
  schemas:
    # Request body for POST /tts/sse: a complete transcript plus the model,
    # voice and output format used to synthesize it.
    # NOTE(review): several properties below put `nullable`/`description` next
    # to a `$ref`. Strict OpenAPI 3.0 tooling ignores siblings of `$ref`, so
    # confirm the consuming renderer/codegen honors them.
    TTSSSERequest:
      title: TTSSSERequest
      type: object
      properties:
        model_id:
          $ref: '#/components/schemas/TTSModelID'
        transcript:
          type: string
        voice:
          $ref: '#/components/schemas/TTSRequestVoiceSpecifier'
        language:
          $ref: '#/components/schemas/SupportedLanguage'
          nullable: true
        output_format:
          $ref: '#/components/schemas/SSEOutputFormat'
        context_id:
          $ref: '#/components/schemas/SSEContextID'
          description: >-
            This can be any string value you find useful. The server will echo
            back the same `context_id` in events that it sends.

            > Contexts on the [TTS
            (WebSocket)](/2024-06-10/api-reference/tts/websocket) endpoint are
            used for
            [continuations](/build-with-cartesia/capability-guides/stream-inputs-using-continuations).  
            > The TTS (SSE) endpoint does not support continuations, so most
            users just ignore this property.
        duration:
          type: number
          format: double
          nullable: true
          description: >-
            The maximum duration of the audio in seconds. You do not usually
            need to specify this.

            If the duration is not appropriate for the length of the transcript,
            the output audio may be truncated.
        add_timestamps:
          type: boolean
          nullable: true
          # Machine-readable form of the default stated in the description.
          default: false
          description: >-
            Whether to return word-level timestamps. If `false` (default), no
            word timestamps will be produced at all. If `true`, the server will
            return timestamp events containing word-level timing information.
        add_phoneme_timestamps:
          type: boolean
          nullable: true
          # Machine-readable form of the default stated in the description.
          default: false
          description: >-
            Whether to return phoneme-level timestamps. If `false` (default), no
            phoneme timestamps will be produced - if `add_timestamps` is `true`,
            the produced timestamps will be word timestamps instead. If `true`,
            the server will return timestamp events containing phoneme-level
            timing information.
        use_normalized_timestamps:
          type: boolean
          nullable: true
          description: >-
            Whether to use normalized timestamps (`true`) or original timestamps
            (`false`).
        speed:
          $ref: '#/components/schemas/ModelSpeed'
          nullable: true
      # Everything not listed here is optional.
      required:
        - model_id
        - transcript
        - voice
        - output_format
    # Union of every JSON payload the SSE stream can carry. Consumers select
    # the concrete schema from the value of the `type` property.
    TTSSSEEvent:
      title: TTSSSEEvent
      description: An event emitted by the TTS SSE stream.
      oneOf:
        - $ref: '#/components/schemas/TTSSSEChunkEvent'
        - $ref: '#/components/schemas/TTSSSETimestampsEvent'
        - $ref: '#/components/schemas/TTSSSEPhonemeTimestampsEvent'
        - $ref: '#/components/schemas/TTSSSEDoneEvent'
        - $ref: '#/components/schemas/TTSSSEErrorEvent'
      discriminator:
        propertyName: type
        # Mapping values must be plain reference strings: the OpenAPI 3.0
        # Discriminator Object defines `mapping` as Map[string, string], so
        # `$ref` objects are not valid here.
        mapping:
          chunk: '#/components/schemas/TTSSSEChunkEvent'
          timestamps: '#/components/schemas/TTSSSETimestampsEvent'
          phoneme_timestamps: '#/components/schemas/TTSSSEPhonemeTimestampsEvent'
          done: '#/components/schemas/TTSSSEDoneEvent'
          error: '#/components/schemas/TTSSSEErrorEvent'
    # Identifier of the TTS model used for generation; restricted to the
    # values listed in `enum`.
    TTSModelID:
      title: TTSModelID
      type: string
      enum:
        - sonic-3.5
        - sonic-3
        - sonic-latest
      example: sonic-3.5
      description: |-
        The ID of the model to use for the generation.
        See [Models](/build-with-cartesia/tts-models/latest) for all options.
    # A voice is specified in exactly one of two ways: by voice ID, or by
    # embedding (the embedding variant is marked deprecated in its schema).
    TTSRequestVoiceSpecifier:
      title: TTSRequestVoiceSpecifier
      oneOf:
        - $ref: '#/components/schemas/TTSRequestIdSpecifier'
        - $ref: '#/components/schemas/TTSRequestEmbeddingSpecifier'
    # Language codes accepted in the request's `language` property.
    SupportedLanguage:
      title: SupportedLanguage
      type: string
      description: The language that the given voice should speak the transcript in.
      enum: [en, fr, de, es, pt, zh, ja, hi, it, ko, nl, pl, ru, sv, tr]
    # Audio output settings for the SSE endpoint. All three properties are
    # mandatory, and `raw` is the only container value in the enum.
    SSEOutputFormat:
      title: SSEOutputFormat
      type: object
      required: [container, encoding, sample_rate]
      properties:
        container:
          type: string
          enum: [raw]
        encoding:
          $ref: '#/components/schemas/RawEncoding'
        sample_rate:
          type: integer
          description: >-
            The sample rate of the audio in Hz.
            Supported sample rates are 8000, 16000, 22050, 24000, 44100, 48000.
    # Free-form string identifying a request context; null is permitted.
    SSEContextID:
      title: SSEContextID
      nullable: true
      type: string
    # Coarse speaking-rate preset. The schema is flagged deprecated, and
    # `normal` is the declared default.
    ModelSpeed:
      title: ModelSpeed
      deprecated: true
      type: string
      enum:
        - slow
        - normal
        - fast
      default: normal
      description: >-
        Influences the speed of the generated speech. Faster speeds may reduce
        hallucination rate.

        > This feature is experimental and may not work for all voices.
    # Stream event carrying one piece of audio. Selected by `type: chunk` in
    # the TTSSSEEvent discriminator.
    TTSSSEChunkEvent:
      title: TTSSSEChunkEvent
      description: Audio data chunk.
      type: object
      example:
        type: chunk
        done: false
        status_code: 206
        step_time: 123
        context_id: 50dc3b5e-5841-4aa1-9f94-60cfb9aead79
        data: aSDinaTvuI8gbWludGxpZnk=
      properties:
        type:
          type: string
          enum:
            - chunk
          description: Event type identifier.
        done:
          type: boolean
          # Single-value enum pins this field to the boolean `false`.
          enum:
            - false
          description: >-
            Whether this is the final event for the request. Always `false` for
            chunk events.
        context_id:
          $ref: '#/components/schemas/SSEContextID'
          description: The context ID echoed back from the request, if one was provided.
        data:
          type: string
          description: Base64-encoded audio data.
        step_time:
          type: number
          description: Server-side processing time for this chunk in milliseconds.
        status_code:
          type: integer
          description: HTTP-style status code.
      # `context_id` is the only property that may be omitted.
      required:
        - type
        - done
        - data
        - step_time
        - status_code
    # Stream event carrying word timings. Selected by `type: timestamps` in
    # the TTSSSEEvent discriminator.
    TTSSSETimestampsEvent:
      title: TTSSSETimestampsEvent
      description: Word-level timing information.
      type: object
      example:
        type: timestamps
        done: false
        status_code: 206
        context_id: 872ec12d-bc63-4e1e-a241-4f58c879d105
        word_timestamps:
          words:
            - Hello
            - world
          start:
            - 0
            - 0.5
          end:
            - 0.4
            - 0.9
      properties:
        type:
          type: string
          enum:
            - timestamps
          description: Event type identifier.
        done:
          type: boolean
          # Single-value enum pins this field to the boolean `false`.
          enum:
            - false
          description: >-
            Whether this is the final event for the request. Always `false` for
            timestamps events.
        context_id:
          $ref: '#/components/schemas/SSEContextID'
          description: The context ID echoed back from the request, if one was provided.
        # Three arrays: words, start times, end times. Presumably
        # index-aligned (entry i of each describes the same word); the schema
        # itself does not enforce equal lengths.
        word_timestamps:
          type: object
          description: Word-level timing information.
          properties:
            words:
              type: array
              items:
                type: string
              description: List of words in order.
            start:
              type: array
              items:
                type: number
              description: Start times in seconds for each word.
            end:
              type: array
              items:
                type: number
              description: End times in seconds for each word.
          required:
            - words
            - start
            - end
        status_code:
          type: integer
          description: HTTP-style status code.
      # `context_id` is the only property that may be omitted.
      required:
        - type
        - done
        - word_timestamps
        - status_code
    # Stream event carrying phoneme timings. Selected by
    # `type: phoneme_timestamps` in the TTSSSEEvent discriminator.
    TTSSSEPhonemeTimestampsEvent:
      title: TTSSSEPhonemeTimestampsEvent
      description: Phoneme-level timing information.
      type: object
      example:
        type: phoneme_timestamps
        done: false
        status_code: 206
        context_id: 872ec12d-bc63-4e1e-a241-4f58c879d105
        phoneme_timestamps:
          phonemes:
            - h
            - ə
            - l
            - oʊ
          start:
            - 0.093
            - 0.174
            - 0.255
            - 0.337
          end:
            - 0.174
            - 0.255
            - 0.337
            - 0.418
      properties:
        type:
          type: string
          enum:
            - phoneme_timestamps
          description: Event type identifier.
        done:
          type: boolean
          # Single-value enum pins this field to the boolean `false`.
          enum:
            - false
          description: >-
            Whether this is the final event for the request. Always `false` for
            phoneme_timestamps events.
        context_id:
          $ref: '#/components/schemas/SSEContextID'
          description: The context ID echoed back from the request, if one was provided.
        # Three arrays: phonemes, start times, end times. Presumably
        # index-aligned (entry i of each describes the same phoneme); the
        # schema itself does not enforce equal lengths.
        phoneme_timestamps:
          type: object
          description: Phoneme-level timing information.
          properties:
            phonemes:
              type: array
              items:
                type: string
              description: List of phonemes in order.
            start:
              type: array
              items:
                type: number
              description: Start times in seconds for each phoneme.
            end:
              type: array
              items:
                type: number
              description: End times in seconds for each phoneme.
          required:
            - phonemes
            - start
            - end
        status_code:
          type: integer
          description: HTTP-style status code.
      # `context_id` is the only property that may be omitted.
      required:
        - type
        - done
        - phoneme_timestamps
        - status_code
    # Terminal stream event. Selected by `type: done` in the TTSSSEEvent
    # discriminator.
    TTSSSEDoneEvent:
      title: TTSSSEDoneEvent
      description: Generation completion signal. Final event in the stream.
      type: object
      # NOTE(review): this example uses status_code 206, the same value as
      # the chunk/timestamps examples — confirm that is what the server
      # actually sends on the final event.
      example:
        type: done
        done: true
        status_code: 206
        context_id: 50dc3b5e-5841-4aa1-9f94-60cfb9aead79
      properties:
        type:
          type: string
          enum:
            - done
          description: Event type identifier.
        done:
          type: boolean
          # Single-value enum pins this field to the boolean `true`.
          enum:
            - true
          description: Whether generation is complete. Always `true` for done events.
        context_id:
          $ref: '#/components/schemas/SSEContextID'
          description: The context ID echoed back from the request, if one was provided.
        status_code:
          type: integer
          description: HTTP-style status code.
      # `context_id` is the only property that may be omitted.
      required:
        - type
        - done
        - status_code
    # Stream event reporting a failure. Selected by `type: error` in the
    # TTSSSEEvent discriminator.
    TTSSSEErrorEvent:
      title: TTSSSEErrorEvent
      description: Error information for the TTS SSE request.
      type: object
      example:
        type: error
        done: true
        status_code: 400
        error: >-
          Invalid model: The model is not valid, make sure it is a valid model
          ID.
        context_id: 50dc3b5e-5841-4aa1-9f94-60cfb9aead79
      properties:
        type:
          type: string
          enum:
            - error
          description: Event type identifier.
        # NOTE(review): unlike the other event schemas, `done` carries no
        # single-value enum here, so both `true` and `false` validate; the
        # example shows `true`. Confirm whether it should be pinned.
        done:
          type: boolean
          description: Whether generation is complete.
        status_code:
          type: integer
          description: HTTP-style status code.
        error:
          type: string
          description: Error message describing what went wrong.
        context_id:
          $ref: '#/components/schemas/SSEContextID'
          description: The context ID echoed back from the request, if one was provided.
      # `context_id` is the only property that may be omitted.
      required:
        - type
        - done
        - status_code
        - error
    # Selects a voice by its ID; `mode` is fixed to `id`. The experimental
    # controls property is flagged deprecated.
    TTSRequestIdSpecifier:
      title: TTSRequestIdSpecifier
      type: object
      required: [mode, id]
      properties:
        mode:
          type: string
          enum: [id]
        id:
          $ref: '#/components/schemas/VoiceId'
        __experimental_controls:
          $ref: '#/components/schemas/Controls'
          deprecated: true
          nullable: true
          description: This field will no longer be supported after June 1, 2026.
    # Selects a voice by embedding vector; `mode` is fixed to `embedding`.
    # The schema and every one of its properties are flagged deprecated.
    TTSRequestEmbeddingSpecifier:
      title: TTSRequestEmbeddingSpecifier
      type: object
      deprecated: true
      description: >-
        Voice embeddings will no longer be supported after June 1, 2026. Use
        voice IDs instead. See [API
        Changes](/build-with-cartesia/tts-models/api-changes) for details.
      properties:
        mode:
          type: string
          enum:
            - embedding
          deprecated: true
          description: >-
            Voice embeddings will no longer be supported after June 1, 2026. Use
            voice IDs instead. See [API
            Changes](/build-with-cartesia/tts-models/api-changes) for details.
        embedding:
          $ref: '#/components/schemas/Embedding'
          deprecated: true
          description: >-
            Voice embeddings will no longer be supported after June 1, 2026. Use
            voice IDs instead. See [API
            Changes](/build-with-cartesia/tts-models/api-changes) for details.
        __experimental_controls:
          $ref: '#/components/schemas/Controls'
          nullable: true
          deprecated: true
          description: This field will no longer be supported after June 1, 2026.
      # `__experimental_controls` is the only property that may be omitted.
      required:
        - mode
        - embedding
    # Sample encodings available for raw audio output.
    RawEncoding:
      title: RawEncoding
      type: string
      enum: [pcm_f32le, pcm_s16le, pcm_mulaw, pcm_alaw]
      description: >-
        The encoding format for output audio.
        See [TTS Output Audio Format](/build-with-cartesia/capability-guides/tts-output-audio-format)
        if you're unsure what to use.
    # String identifier of a voice.
    VoiceId:
      title: VoiceId
      description: The ID of the voice.
      type: string
    # Voice controls object: a speed setting plus a list of emotion tags.
    # Both properties are mandatory whenever the object is supplied.
    Controls:
      title: Controls
      type: object
      required: [speed, emotion]
      properties:
        speed:
          $ref: '#/components/schemas/Speed'
        emotion:
          type: array
          items:
            $ref: '#/components/schemas/Emotion'
    # Voice embedding expressed as an array of doubles.
    # NOTE(review): the description states 192 numbers, but no
    # minItems/maxItems is declared, so this schema does not enforce the
    # length — confirm whether that is intentional.
    Embedding:
      title: Embedding
      type: array
      items:
        type: number
        format: double
      description: >-
        A 192-dimensional vector (i.e. a list of 192 numbers) that represents
        the voice.
    # Speed control accepting either a numeric value or one of the named
    # presets.
    Speed:
      title: Speed
      oneOf:
        - $ref: '#/components/schemas/NumericalSpecifier'
        - $ref: '#/components/schemas/NaturalSpecifier'
      description: >-
        Either a number between -1.0 and 1.0 or a natural language description
        of speed.


        If you specify a number, 0.0 is the default speed, -1.0 is the slowest
        speed, and 1.0 is the fastest speed.
    # One emotion tag, used as the item type of `Controls.emotion`. Each
    # emotion appears bare and with each of the four level suffixes.
    Emotion:
      title: Emotion
      type: string
      enum:
        - anger:lowest
        - anger:low
        - anger
        - anger:high
        - anger:highest
        - positivity:lowest
        - positivity:low
        - positivity
        - positivity:high
        - positivity:highest
        - surprise:lowest
        - surprise:low
        - surprise
        - surprise:high
        - surprise:highest
        - sadness:lowest
        - sadness:low
        - sadness
        - sadness:high
        - sadness:highest
        - curiosity:lowest
        - curiosity:low
        - curiosity
        - curiosity:high
        - curiosity:highest
      description: >-
        A single emotion tag, optionally suffixed with a level
        (`emotion:level`).


        Supported emotions are: anger, positivity, surprise, sadness, and
        curiosity.


        Supported levels are: lowest, low, (omit), high, highest.
    # Numeric branch of `Speed`.
    # NOTE(review): `Speed` describes the number as lying between -1.0 and
    # 1.0, but no minimum/maximum is declared here, so the range is not
    # enforced by this schema.
    NumericalSpecifier:
      title: NumericalSpecifier
      type: number
      format: double
    # Named-preset branch of `Speed`.
    NaturalSpecifier:
      title: NaturalSpecifier
      type: string
      enum: [slowest, slow, normal, fast, fastest]
  securitySchemes:
    # API key supplied in the `X-API-Key` request header.
    ApiKeyAuth:
      type: apiKey
      name: X-API-Key
      in: header

````