> ## Documentation Index
> Fetch the complete documentation index at: https://docs.cartesia.ai/llms.txt
> Use this file to discover all available pages before exploring further.

# テキスト読み上げ (Bytes)

> Stream audio from a complete transcript



## OpenAPI

````yaml 2024-06-10/api.yml POST /tts/bytes
# OpenAPI 3.0 document metadata for the Cartesia API.
openapi: 3.0.1
info:
  version: '2024-06-10'
  title: Cartesia API
servers:
  - description: Production
    url: https://api.cartesia.ai
# Empty document-level security list; the operation declares its own
# requirement.
security: []
paths:
  /tts/bytes:
    # Synthesizes a complete transcript and returns the audio as binary data.
    post:
      operationId: tts_bytes
      summary: Text-to-Speech (Bytes)
      description: Stream audio from a complete transcript
      tags:
        - Tts
      # Operation-level requirement; replaces the empty document-level list.
      security:
        - ApiKeyAuth: []
      parameters:
        - $ref: '#/components/parameters/CartesiaVersionHeader'
      requestBody:
        required: true
        content:
          application/json:
            schema:
              $ref: '#/components/schemas/TTSRequest'
      responses:
        '200':
          description: Audio bytes
          content:
            # Media-type range: any audio subtype, delivered as binary.
            audio/*:
              schema:
                format: binary
                type: string
components:
  parameters:
    # Required request header that selects the API version.
    CartesiaVersionHeader:
      in: header
      name: Cartesia-Version
      required: true
      description: API version header.
      schema:
        type: string
        format: date
        # This document accepts a single version value.
        enum:
          - '2024-06-10'
        example: '2024-06-10'
  schemas:
    # Request body for POST /tts/bytes.
    TTSRequest:
      title: TTSRequest
      type: object
      properties:
        model_id:
          $ref: '#/components/schemas/TTSModelID'
        transcript:
          type: string
        voice:
          $ref: '#/components/schemas/TTSRequestVoiceSpecifier'
        language:
          # OpenAPI 3.0 ignores keywords placed next to `$ref`, so the
          # reference is wrapped in `allOf` to keep `nullable` from being
          # discarded.
          nullable: true
          allOf:
            - $ref: '#/components/schemas/SupportedLanguage'
        output_format:
          $ref: '#/components/schemas/OutputFormat'
        duration:
          type: number
          format: double
          nullable: true
          description: >-
            The maximum duration of the audio in seconds. You do not usually
            need to specify this.

            If the duration is not appropriate for the length of the transcript,
            the output audio may be truncated.
        speed:
          # Same `$ref` sibling workaround as `language`. The referenced
          # ModelSpeed schema is itself marked deprecated.
          nullable: true
          allOf:
            - $ref: '#/components/schemas/ModelSpeed'
      required:
        - model_id
        - transcript
        - voice
        - output_format
    # Identifier of the TTS model used for the generation.
    TTSModelID:
      title: TTSModelID
      type: string
      enum:
        - sonic-3.5
        - sonic-3
        - sonic-latest
      example: sonic-3.5
      description: |-
        The ID of the model to use for the generation.
        See [Models](/build-with-cartesia/tts-models/latest) for all options.
    # A voice is selected either by ID or by (deprecated) embedding.
    TTSRequestVoiceSpecifier:
      oneOf:
        - $ref: '#/components/schemas/TTSRequestIdSpecifier'
        - $ref: '#/components/schemas/TTSRequestEmbeddingSpecifier'
      title: TTSRequestVoiceSpecifier
    # Language codes accepted for the spoken transcript.
    SupportedLanguage:
      type: string
      title: SupportedLanguage
      description: The language that the given voice should speak the transcript in.
      # Values are quoted so two-letter codes can never be re-typed by a
      # YAML 1.1 parser (e.g. a future `no` would otherwise become false).
      enum:
        - 'en'
        - 'fr'
        - 'de'
        - 'es'
        - 'pt'
        - 'zh'
        - 'ja'
        - 'hi'
        - 'it'
        - 'ko'
        - 'nl'
        - 'pl'
        - 'ru'
        - 'sv'
        - 'tr'
    # Output audio format: exactly one of the raw, wav, or mp3 variants.
    # Each variant pins `container` to a single value and merges in the
    # fields of the matching *OutputFormat schema via `allOf`.
    OutputFormat:
      title: OutputFormat
      oneOf:
        # Variant 1: `container: raw` plus the RawOutputFormat fields.
        - type: object
          allOf:
            - type: object
              properties:
                container:
                  type: string
                  enum:
                    - raw
            - $ref: '#/components/schemas/RawOutputFormat'
          required:
            - container
        # Variant 2: `container: wav` plus the WAVOutputFormat fields.
        - type: object
          allOf:
            - type: object
              properties:
                container:
                  type: string
                  enum:
                    - wav
            - $ref: '#/components/schemas/WAVOutputFormat'
          required:
            - container
        # Variant 3: `container: mp3` plus the MP3OutputFormat fields.
        - type: object
          allOf:
            - type: object
              properties:
                container:
                  type: string
                  enum:
                    - mp3
            - $ref: '#/components/schemas/MP3OutputFormat'
          required:
            - container
    # Deprecated coarse speed selector; `normal` is the declared default.
    ModelSpeed:
      title: ModelSpeed
      type: string
      deprecated: true
      description: >-
        Influences the speed of the generated speech. Faster speeds may reduce
        hallucination rate.

        > This feature is experimental and may not work for all voices.
      default: normal
      enum:
        - slow
        - normal
        - fast
    # Selects a voice by its ID (`mode: id`).
    TTSRequestIdSpecifier:
      title: TTSRequestIdSpecifier
      type: object
      properties:
        mode:
          type: string
          enum:
            - id
        id:
          $ref: '#/components/schemas/VoiceId'
        __experimental_controls:
          # Wrapped in `allOf`: OpenAPI 3.0 ignores keywords that sit next to
          # a `$ref`, which would drop the nullable/deprecated/description
          # metadata declared here.
          nullable: true
          deprecated: true
          description: This field will no longer be supported after June 1, 2026.
          allOf:
            - $ref: '#/components/schemas/Controls'
      required:
        - mode
        - id
    # Deprecated: selects a voice by embedding vector (`mode: embedding`).
    TTSRequestEmbeddingSpecifier:
      title: TTSRequestEmbeddingSpecifier
      type: object
      deprecated: true
      description: >-
        Voice embeddings will no longer be supported after June 1, 2026. Use
        voice IDs instead. See [API
        Changes](/build-with-cartesia/tts-models/api-changes) for details.
      properties:
        mode:
          type: string
          enum:
            - embedding
          deprecated: true
          description: >-
            Voice embeddings will no longer be supported after June 1, 2026. Use
            voice IDs instead. See [API
            Changes](/build-with-cartesia/tts-models/api-changes) for details.
        embedding:
          # Wrapped in `allOf`: OpenAPI 3.0 ignores keywords that sit next to
          # a `$ref`, which would drop the deprecation notice on this field.
          deprecated: true
          description: >-
            Voice embeddings will no longer be supported after June 1, 2026. Use
            voice IDs instead. See [API
            Changes](/build-with-cartesia/tts-models/api-changes) for details.
          allOf:
            - $ref: '#/components/schemas/Embedding'
        __experimental_controls:
          # Same `$ref` sibling workaround as `embedding`.
          nullable: true
          deprecated: true
          description: This field will no longer be supported after June 1, 2026.
          allOf:
            - $ref: '#/components/schemas/Controls'
      required:
        - mode
        - embedding
    # Fields shared by uncompressed output; `encoding` and `sample_rate`
    # are mandatory, `bit_rate` is optional and nullable.
    RawOutputFormat:
      type: object
      title: RawOutputFormat
      required:
        - encoding
        - sample_rate
      properties:
        encoding:
          $ref: '#/components/schemas/RawEncoding'
        sample_rate:
          description: >-
            The sample rate of the audio in Hz. Supported sample rates are 8000,
            16000, 22050, 24000, 44100, 48000.
          type: integer
        bit_rate:
          nullable: true
          type: integer
    # WAV output adds no fields of its own; it inherits RawOutputFormat.
    WAVOutputFormat:
      type: object
      title: WAVOutputFormat
      allOf:
        - $ref: '#/components/schemas/RawOutputFormat'
      properties: {}
    # MP3 output settings; both the sample rate and the bit rate are required.
    MP3OutputFormat:
      type: object
      title: MP3OutputFormat
      required:
        - sample_rate
        - bit_rate
      properties:
        sample_rate:
          description: >-
            The sample rate of the audio in Hz. Supported sample rates are 8000,
            16000, 22050, 24000, 44100, 48000.
          type: integer
        bit_rate:
          description: >-
            The bit rate of the audio in bits per second. Supported bit rates
            are 32000, 64000, 96000, 128000, 192000.
          type: integer
    # String identifier of a voice.
    VoiceId:
      type: string
      title: VoiceId
      description: The ID of the voice.
    # Speed and emotion controls; both fields are required together.
    Controls:
      type: object
      title: Controls
      required:
        - speed
        - emotion
      properties:
        speed:
          $ref: '#/components/schemas/Speed'
        # List of Emotion tags.
        emotion:
          items:
            $ref: '#/components/schemas/Emotion'
          type: array
    # Voice embedding expressed as a list of doubles.
    # NOTE(review): the 192-element length is stated only in the description;
    # no minItems/maxItems constraint is declared here.
    Embedding:
      type: array
      title: Embedding
      description: >-
        A 192-dimensional vector (i.e. a list of 192 numbers) that represents
        the voice.
      items:
        format: double
        type: number
    # Sample encodings available for raw audio output.
    RawEncoding:
      type: string
      title: RawEncoding
      enum:
        - pcm_f32le
        - pcm_s16le
        - pcm_mulaw
        - pcm_alaw
      description: >-
        The encoding format for output audio. See [TTS Output Audio
        Format](/build-with-cartesia/capability-guides/tts-output-audio-format)
        if you're unsure what to use.
    # Speed control: either a numeric value or a named preset.
    # NOTE(review): the -1.0..1.0 range is documented in prose only; the
    # referenced NumericalSpecifier declares no minimum/maximum.
    Speed:
      title: Speed
      description: >-
        Either a number between -1.0 and 1.0 or a natural language description
        of speed.


        If you specify a number, 0.0 is the default speed, -1.0 is the slowest
        speed, and 1.0 is the fastest speed.
      oneOf:
        - $ref: '#/components/schemas/NumericalSpecifier'
        - $ref: '#/components/schemas/NaturalSpecifier'
    # One emotion tag. This schema is a single string; the list of tags is
    # declared by the schema that references it (an array of Emotion items).
    Emotion:
      title: Emotion
      type: string
      enum:
        - anger:lowest
        - anger:low
        - anger
        - anger:high
        - anger:highest
        - positivity:lowest
        - positivity:low
        - positivity
        - positivity:high
        - positivity:highest
        - surprise:lowest
        - surprise:low
        - surprise
        - surprise:high
        - surprise:highest
        - sadness:lowest
        - sadness:low
        - sadness
        - sadness:high
        - sadness:highest
        - curiosity:lowest
        - curiosity:low
        - curiosity
        - curiosity:high
        - curiosity:highest
      description: >-
        A single emotion:level tag.


        Supported emotions are: anger, positivity, surprise, sadness, and
        curiosity.


        Supported levels are: lowest, low, (omit), high, highest.
    # Numeric (double) form of a specifier; no bounds are declared here.
    NumericalSpecifier:
      type: number
      format: double
      title: NumericalSpecifier
    # Named presets, listed from slowest to fastest.
    NaturalSpecifier:
      type: string
      title: NaturalSpecifier
      enum:
        - slowest
        - slow
        - normal
        - fast
        - fastest
  securitySchemes:
    # API key supplied in the `X-API-Key` request header.
    ApiKeyAuth:
      in: header
      name: X-API-Key
      type: apiKey

````