> ## Documentation Index
> Fetch the complete documentation index at: https://docs.cartesia.ai/llms.txt
> Use this file to discover all available pages before exploring further.

# Infill (Bytes)

> Generate audio that smoothly connects two existing audio segments


## OpenAPI

````yaml 2024-06-10/api.yml POST /infill/bytes
# OpenAPI 3.0.1 description of the Cartesia API; this excerpt defines only
# the POST /infill/bytes operation and the components it references.
openapi: 3.0.1
info:
  title: Cartesia API
  # Quoted so the date-like version stays a string rather than being
  # resolved to a date by the YAML parser.
  version: '2024-06-10'
servers:
  - url: https://api.cartesia.ai
    description: Production
# Empty document-wide security requirement; the operation under `paths`
# declares its own `security` entry.
security: []
# Operations described by this document: POST /infill/bytes only.
paths:
  /infill/bytes:
    post:
      tags:
        - Infill
      summary: Infill (Bytes)
      description: Generate audio that smoothly connects two existing audio segments
      operationId: infill_bytes
      parameters:
        - $ref: '#/components/parameters/CartesiaVersionHeader'
      requestBody:
        required: true
        content:
          # The body is a multipart form: the two audio clips are binary
          # parts, and the remaining settings are form fields whose names use
          # bracket notation (for example `output_format[container]`).
          multipart/form-data:
            schema:
              type: object
              properties:
                left_audio:
                  type: string
                  format: binary
                  description: >-
                    Audio clip that comes before the infill transcript:

                    `left_audio` -> `transcript` -> `right_audio`


                    For best results, target natural pauses in the audio and
                    clip tightly.

                    At least one of `left_audio` or `right_audio` must be
                    provided.


                    Supported audio formats: `flac`, `mp3`, `mpeg`, `mpga`,
                    `oga`, `ogg`, `wav`, `webm`
                right_audio:
                  type: string
                  format: binary
                  description: >-
                    Audio clip that comes after the infill transcript:

                    `left_audio` -> `transcript` -> `right_audio`


                    For best results, target natural pauses in the audio and
                    clip tightly.

                    At least one of `left_audio` or `right_audio` must be
                    provided.


                    Supported audio formats: `flac`, `mp3`, `mpeg`, `mpga`,
                    `oga`, `ogg`, `wav`, `webm`
                model_id:
                  description: The ID of the model to use for generating audio
                  type: string
                  enum:
                    - sonic-3
                    - sonic-3-2026-01-12
                    - sonic-3-2025-10-27
                # OpenAPI 3.0 ignores any keys written next to a `$ref`, so
                # the reference is wrapped in `allOf` to keep the
                # field-specific description attached to this property. The
                # same pattern is used for every referenced field below.
                language:
                  allOf:
                    - $ref: '#/components/schemas/SupportedLanguage'
                  description: The language of the transcript
                transcript:
                  description: >-
                    The infill text to generate.

                    For best results, use longer transcripts to give the model
                    more flexibility to adapt to the rest of the audio.
                  type: string
                voice_id:
                  description: The ID of the voice to use for generating audio
                  type: string
                output_format[container]:
                  allOf:
                    - $ref: '#/components/schemas/OutputFormatContainer'
                  description: The format of the output audio
                output_format[sample_rate]:
                  description: >-
                    The sample rate of the output audio in Hz. Supported sample
                    rates are 8000, 16000, 22050, 24000, 44100, 48000.
                  type: integer
                output_format[encoding]:
                  allOf:
                    - $ref: '#/components/schemas/RawEncoding'
                  description: Required for `raw` and `wav` containers.
                  nullable: true
                output_format[bit_rate]:
                  description: Required for `mp3` containers.
                  type: integer
                  nullable: true
                voice[__experimental_controls][speed]:
                  allOf:
                    - $ref: '#/components/schemas/Speed'
                  description: >-
                    Either a number between -1.0 and 1.0 or a natural language
                    description of speed.


                    If you specify a number, 0.0 is the default speed, -1.0 is
                    the slowest speed, and 1.0 is the fastest speed.
                  nullable: true
                # The trailing `[]` in the field name marks a repeated form
                # field; each entry is one `Emotion` tag.
                voice[__experimental_controls][emotion][]:
                  description: >-
                    An array of emotion:level tags.


                    Supported emotions are: anger, positivity, surprise,
                    sadness, and curiosity.


                    Supported levels are: lowest, low, (omit), high, highest.
                  type: array
                  items:
                    $ref: '#/components/schemas/Emotion'
                  nullable: true
      responses:
        '200':
          description: Audio bytes
          content:
            audio/*:
              schema:
                type: string
                format: binary
      # Overrides the empty document-level `security` list: this operation
      # requires the `ApiKeyAuth` scheme.
      security:
        - ApiKeyAuth: []
components:
  parameters:
    # Required `Cartesia-Version` request header; the schema accepts a single
    # date-formatted string value.
    CartesiaVersionHeader:
      in: header
      name: Cartesia-Version
      required: true
      description: API version header.
      schema:
        type: string
        format: date
        # Quoted so the value stays a string instead of a YAML date.
        enum: ['2024-06-10']
        example: '2024-06-10'
  schemas:
    # Language codes accepted by the `language` form field.
    SupportedLanguage:
      title: SupportedLanguage
      description: The language that the given voice should speak the transcript in.
      type: string
      enum:
        - en
        - fr
        - de
        - es
        - pt
        - zh
        - ja
        - hi
        - it
        - ko
        - nl
        - pl
        - ru
        - sv
        - tr
    # Container choices for the `output_format[container]` form field.
    OutputFormatContainer:
      type: string
      title: OutputFormatContainer
      enum: [raw, wav, mp3]
    # Sample encodings selectable through `output_format[encoding]`.
    RawEncoding:
      type: string
      title: RawEncoding
      enum:
        - pcm_f32le
        - pcm_s16le
        - pcm_mulaw
        - pcm_alaw
      description: >-
        The encoding format for output audio. See [TTS Output Audio
        Format](/build-with-cartesia/capability-guides/tts-output-audio-format)
        if you're unsure what to use.
    # Speed control: exactly one of the numeric or the named-preset form.
    Speed:
      title: Speed
      description: >-
        Either a number between -1.0 and 1.0 or a natural language description
        of speed.


        If you specify a number, 0.0 is the default speed, -1.0 is the slowest
        speed, and 1.0 is the fastest speed.
      oneOf:
        - $ref: '#/components/schemas/NumericalSpecifier'
        - $ref: '#/components/schemas/NaturalSpecifier'
    # One entry of the `voice[__experimental_controls][emotion][]` array.
    # This schema is a single string tag; the array itself is declared on the
    # request-body property that references it.
    Emotion:
      title: Emotion
      type: string
      # A bare emotion name (for example `anger`) is the form with the level
      # omitted.
      enum:
        - anger:lowest
        - anger:low
        - anger
        - anger:high
        - anger:highest
        - positivity:lowest
        - positivity:low
        - positivity
        - positivity:high
        - positivity:highest
        - surprise:lowest
        - surprise:low
        - surprise
        - surprise:high
        - surprise:highest
        - sadness:lowest
        - sadness:low
        - sadness
        - sadness:high
        - sadness:highest
        - curiosity:lowest
        - curiosity:low
        - curiosity
        - curiosity:high
        - curiosity:highest
      description: >-
        A single emotion:level tag.


        Supported emotions are: anger, positivity, surprise, sadness, and
        curiosity.


        Supported levels are: lowest, low, (omit), high, highest.
    # Numeric branch of `Speed`.
    NumericalSpecifier:
      type: number
      format: double
      title: NumericalSpecifier
    # Named-preset branch of `Speed`.
    NaturalSpecifier:
      type: string
      title: NaturalSpecifier
      enum: [slowest, slow, normal, fast, fastest]
  securitySchemes:
    # API key supplied in the `X-API-Key` request header.
    ApiKeyAuth:
      type: apiKey
      name: X-API-Key
      in: header

````