End-to-end Pro Voice Cloning (Python)

Prerequisites

You have a Cartesia API key (export it as the CARTESIA_API_KEY environment variable, which is the name the script reads).

You have at least 1M credits on your account.

You have a folder called samples/ with one or more .wav files.
1 """
2 End-to-end Pro Voice Cloning example.
3 
4 Steps
5 -----
6 1. Create a dataset.
7 2. Upload audio files from samples/ to the dataset.
8 3. Kick off a fine-tune from that dataset.
9 4. Poll until fine-tune is completed.
10 5. Get the voices produced by the fine-tune.
11 """
12 
13 import os
14 import time
15 from pathlib import Path
16 
17 import requests
18 
# Root URL that every endpoint path in this script is appended to.
API_BASE = "https://api.cartesia.ai"

# Headers attached to every request. The API key is read from the
# CARTESIA_API_KEY environment variable when the module is loaded, so a
# missing variable raises KeyError before any request is made.
API_HEADERS = {
    "Cartesia-Version": "2025-04-16",
    "Authorization": "Bearer " + os.environ["CARTESIA_API_KEY"],
}
24 
25 
def create_dataset(name: str, description: str) -> str:
    """Create a dataset via ``POST /datasets`` and return its id.

    Args:
        name: Dataset name sent in the JSON request body.
        description: Dataset description sent in the JSON request body.

    Returns:
        The ``id`` field of the JSON response.

    Raises:
        requests.HTTPError: If the API answers with a 4xx/5xx status.
        requests.Timeout: If the server does not respond within 60 seconds.
    """
    res = requests.post(
        f"{API_BASE}/datasets",
        headers=API_HEADERS,
        json={"name": name, "description": description},
        # requests has no default timeout; without one a stalled connection
        # would block the script forever. Matches create_fine_tune.
        timeout=60,
    )
    res.raise_for_status()
    return res.json()["id"]
35 
36 
def upload_file_to_dataset(dataset_id: str, path: Path) -> None:
    """Upload one file to a dataset via ``POST /datasets/{dataset_id}/files``.

    The request is sent as multipart/form-data with the file contents under
    ``file`` and a ``purpose`` form field set to ``fine_tune``.

    Args:
        dataset_id: Id of the dataset that receives the file.
        path: Local path of the file to upload; opened in binary mode.

    Raises:
        OSError: If ``path`` cannot be opened.
        requests.HTTPError: If the API answers with a 4xx/5xx status.
        requests.Timeout: If connecting or waiting for the response times out.
    """
    with path.open("rb") as fp:
        res = requests.post(
            f"{API_BASE}/datasets/{dataset_id}/files",
            headers=API_HEADERS,
            # A (None, value) tuple makes requests emit a plain form field
            # (no filename) alongside the file part.
            files={"file": fp, "purpose": (None, "fine_tune")},
            # (connect, read) timeouts: requests has no default, so a stalled
            # connection would otherwise hang forever. The read timeout is
            # generous because the server may need time to accept the audio.
            timeout=(10, 300),
        )
    res.raise_for_status()
46 
47 
def create_fine_tune(
    dataset_id: str,
    *,
    name: str,
    language: str,
    model_id: str,
    description: str = "Pro Voice Clone demo",
) -> str:
    """Start a fine-tune via ``POST /fine-tunes`` and return its id.

    Args:
        dataset_id: Id of the dataset to train from; sent as ``dataset``.
        name: Fine-tune name sent in the request body.
        language: Language code sent in the request body.
        model_id: Base model id sent in the request body.
        description: Free-text description sent in the request body.
            Defaults to the value this script previously hard-coded.

    Returns:
        The ``id`` field of the JSON response.

    Raises:
        requests.HTTPError: If the API answers with a 4xx/5xx status.
        requests.Timeout: If the server does not respond within 60 seconds.
    """
    body = {
        "name": name,
        "description": description,
        "language": language,
        "model_id": model_id,
        "dataset": dataset_id,
    }
    res = requests.post(
        f"{API_BASE}/fine-tunes",
        headers=API_HEADERS,
        json=body,
        timeout=60,
    )
    res.raise_for_status()
    return res.json()["id"]
60 
61 
def wait_for_fine_tune(ft_id: str, every: float = 10.0) -> None:
    """Poll ``GET /fine-tunes/{id}`` until the fine-tune completes.

    Prints the current status and the elapsed time after every poll. There is
    no overall deadline: the loop keeps polling until the status is
    ``completed`` or ``failed``.

    Args:
        ft_id: Id of the fine-tune to watch.
        every: Seconds to sleep between polls.

    Raises:
        RuntimeError: If the reported status is ``failed``.
        requests.HTTPError: If a poll answers with a 4xx/5xx status.
        requests.Timeout: If a single poll gets no response within 60 seconds.
    """
    # Monotonic clock so the elapsed time is immune to wall-clock adjustments.
    start = time.monotonic()
    while True:
        # Per-request timeout: requests has no default, and one hung poll
        # would otherwise stall the whole wait silently.
        res = requests.get(
            f"{API_BASE}/fine-tunes/{ft_id}",
            headers=API_HEADERS,
            timeout=60,
        )
        res.raise_for_status()
        status = res.json()["status"]
        print(f"fine-tune {ft_id} -> {status}. Elapsed: {time.monotonic() - start:.0f}s")
        if status == "completed":
            return
        if status == "failed":
            raise RuntimeError(f"fine-tune ended with status={status}")
        time.sleep(every)
75 
76 
def list_voices(ft_id: str) -> list[dict]:
    """Fetch the voices of a fine-tune via ``GET /fine-tunes/{id}/voices``.

    Args:
        ft_id: Id of the fine-tune whose voices are requested.

    Returns:
        The ``data`` field of the JSON response.

    Raises:
        requests.HTTPError: If the API answers with a 4xx/5xx status.
        requests.Timeout: If the server does not respond within 60 seconds.
    """
    res = requests.get(
        f"{API_BASE}/fine-tunes/{ft_id}/voices",
        headers=API_HEADERS,
        # requests has no default timeout; avoid hanging on a stalled connection.
        timeout=60,
    )
    res.raise_for_status()
    return res.json()["data"]
82 
83 
if __name__ == "__main__":
    # Collect the samples first so an empty or missing samples/ folder is
    # caught before anything is created on the account. Sorted so the upload
    # order is deterministic (glob order is filesystem-dependent).
    wav_paths = sorted(Path("samples").glob("*.wav"))
    if not wav_paths:
        raise SystemExit("No .wav files found in samples/; nothing to upload.")

    # Create the dataset
    DATASET_ID = create_dataset("PVC demo", "Samples for a Pro Voice Clone")
    print("Created dataset:", DATASET_ID)

    # Upload .wav files to the dataset
    for wav_path in wav_paths:
        upload_file_to_dataset(DATASET_ID, wav_path)
        print(f"Uploaded {wav_path.name} to dataset {DATASET_ID}")

    # Ask for confirmation before kicking off the fine-tune
    confirmation = input(
        "Are you sure you want to start the fine-tune? It will cost 1M credits upon successful completion (yes/no): "
    )
    # Tolerate stray whitespace around the answer; anything but "yes" cancels.
    if confirmation.strip().lower() != "yes":
        print("Fine-tuning cancelled by user.")
        # SystemExit instead of the interactive-only exit() helper from site.
        raise SystemExit(0)

    # Kick off the fine-tune
    FINE_TUNE_ID = create_fine_tune(
        DATASET_ID,
        name="PVC demo",
        language="en",
        model_id="sonic-2",
    )
    print(f"Started fine-tune: {FINE_TUNE_ID}")

    # Wait for training to finish
    wait_for_fine_tune(FINE_TUNE_ID)
    print("Fine-tune completed!")

    # Fetch the voices created by the fine-tune that was just started (the id
    # returned above, not a hard-coded one).
    voices = list_voices(FINE_TUNE_ID)
    print("Voices IDs:")
    for voice in voices:
        print(voice["id"])