End-to-end Pro Voice Cloning (Python)
Use Cartesia’s REST API to create a Pro Voice Clone.
Prerequisites
- You have a Cartesia API token (export it as `CARTESIA_API_KEY`, the environment variable the script below reads).
- You have at least 1M credits on your account.
- You have a folder called `samples/` with one or more `.wav` files.
"""
End-to-end Pro Voice Cloning example.

Steps
-----
1. Create a dataset.
2. Upload audio files from samples/ to the dataset.
3. Kick off a fine-tune from that dataset.
4. Poll until fine-tune is completed.
5. Get the voices produced by the fine-tune.

The API token is read from the CARTESIA_API_KEY environment variable;
CARTESIA_API_TOKEN is accepted as a fallback.
"""

import os
import time
from pathlib import Path
from typing import Optional

import requests

API_BASE = "https://api.cartesia.ai"

# Seconds to wait on any single HTTP request before giving up, so that a
# stalled connection cannot hang the script forever.
REQUEST_TIMEOUT = 60


def _read_api_key() -> str:
    """Return the API token from the environment.

    CARTESIA_API_KEY is checked first, then CARTESIA_API_TOKEN.

    Raises:
        KeyError: if neither variable is set to a non-empty value.
    """
    for var_name in ("CARTESIA_API_KEY", "CARTESIA_API_TOKEN"):
        value = os.environ.get(var_name)
        if value:
            return value
    raise KeyError("CARTESIA_API_KEY (or CARTESIA_API_TOKEN) is not set")


API_HEADERS = {
    "Cartesia-Version": "2025-04-16",
    "Authorization": f"Bearer {_read_api_key()}",
}


def create_dataset(name: str, description: str) -> str:
    """POST /datasets → dataset id.

    Raises:
        requests.HTTPError: if the API responds with an error status.
    """
    res = requests.post(
        f"{API_BASE}/datasets",
        headers=API_HEADERS,
        json={"name": name, "description": description},
        timeout=REQUEST_TIMEOUT,
    )
    res.raise_for_status()
    return res.json()["id"]


def upload_file_to_dataset(dataset_id: str, path: Path) -> None:
    """POST /datasets/{dataset_id}/files (multipart/form-data).

    The file at ``path`` is sent as the ``file`` part, together with a plain
    ``purpose`` form field set to ``fine_tune``.

    Raises:
        OSError: if ``path`` cannot be opened for reading.
        requests.HTTPError: if the API responds with an error status.
    """
    with path.open("rb") as fp:
        res = requests.post(
            f"{API_BASE}/datasets/{dataset_id}/files",
            headers=API_HEADERS,
            # A (None, value) tuple makes requests send a regular form field
            # instead of a file part.
            files={"file": fp, "purpose": (None, "fine_tune")},
            timeout=REQUEST_TIMEOUT,
        )
    res.raise_for_status()


def create_fine_tune(dataset_id: str, *, name: str, language: str, model_id: str) -> str:
    """POST /fine-tunes → fine-tune id.

    Raises:
        requests.HTTPError: if the API responds with an error status.
    """
    body = {
        "name": name,
        "description": "Pro Voice Clone demo",
        "language": language,
        "model_id": model_id,
        "dataset": dataset_id,
    }
    res = requests.post(
        f"{API_BASE}/fine-tunes",
        headers=API_HEADERS,
        json=body,
        timeout=REQUEST_TIMEOUT,
    )
    res.raise_for_status()
    return res.json()["id"]


def wait_for_fine_tune(
    ft_id: str,
    every: float = 10.0,
    *,
    timeout: Optional[float] = None,
) -> None:
    """Poll GET /fine-tunes/{id} until status == completed.

    Args:
        ft_id: id of the fine-tune to watch.
        every: seconds to sleep between polls.
        timeout: optional overall limit in seconds; ``None`` (the default)
            polls without limit.

    Raises:
        RuntimeError: if the fine-tune reports status ``failed``.
        TimeoutError: if ``timeout`` elapses before the fine-tune completes.
        requests.HTTPError: if a poll responds with an error status.
    """
    start = time.monotonic()
    while True:
        res = requests.get(
            f"{API_BASE}/fine-tunes/{ft_id}",
            headers=API_HEADERS,
            timeout=REQUEST_TIMEOUT,
        )
        res.raise_for_status()
        status = res.json()["status"]
        elapsed = time.monotonic() - start
        print(f"fine-tune {ft_id} -> {status}. Elapsed: {elapsed:.0f}s")
        if status == "completed":
            return
        if status == "failed":
            raise RuntimeError(f"fine-tune ended with status={status}")
        if timeout is not None and elapsed >= timeout:
            raise TimeoutError(
                f"fine-tune {ft_id} still {status} after {elapsed:.0f}s"
            )
        time.sleep(every)


def list_voices(ft_id: str) -> list[dict]:
    """GET /fine-tunes/{id}/voices → list of voices.

    Raises:
        requests.HTTPError: if the API responds with an error status.
    """
    res = requests.get(
        f"{API_BASE}/fine-tunes/{ft_id}/voices",
        headers=API_HEADERS,
        timeout=REQUEST_TIMEOUT,
    )
    res.raise_for_status()
    return res.json()["data"]


def main() -> None:
    """Run the whole flow: dataset → uploads → fine-tune → voices."""
    # Collect the samples first so that nothing is created remotely when
    # there is nothing to upload.
    wav_paths = sorted(Path("samples").glob("*.wav"))
    if not wav_paths:
        raise SystemExit("No .wav files found in samples/; nothing to upload.")

    # Create the dataset
    dataset_id = create_dataset("PVC demo", "Samples for a Pro Voice Clone")
    print("Created dataset:", dataset_id)

    # Upload .wav files to the dataset
    for wav_path in wav_paths:
        upload_file_to_dataset(dataset_id, wav_path)
        print(f"Uploaded {wav_path.name} to dataset {dataset_id}")

    # Ask for confirmation before kicking off the fine-tune
    confirmation = input(
        "Are you sure you want to start the fine-tune? It will cost 1M credits upon successful completion (yes/no): "
    )
    if confirmation.strip().lower() != "yes":
        print("Fine-tuning cancelled by user.")
        return

    # Kick off the fine-tune
    fine_tune_id = create_fine_tune(
        dataset_id,
        name="PVC demo",
        language="en",
        model_id="sonic-2",
    )
    print(f"Started fine-tune: {fine_tune_id}")

    # Wait for training to finish
    wait_for_fine_tune(fine_tune_id)
    print("Fine-tune completed!")

    # Fetch the voices created by the fine-tune that was just started (not a
    # hard-coded id).
    voices = list_voices(fine_tune_id)
    print("Voices IDs:")
    for voice in voices:
        print(voice["id"])


if __name__ == "__main__":
    main()