Twilio <> Cartesia

How to integrate Twilio with Cartesia to generate audio from text and send it as a voice call.

Integrating Twilio with Cartesia

This guide will walk you through the process of integrating Cartesia’s Text-to-Speech (TTS) API with Twilio’s calling capabilities. By following these steps, you’ll be able to create an application that initiates phone calls and uses Cartesia’s TTS to speak to the call recipient.

Prerequisites

Before you begin, make sure you have the following:

Node.js installed on your system
A Twilio account with Account SID and Auth Token
A Cartesia API key
An outbound phone number to call
A Twilio phone number to call from

Step 1: Set Up Your Project

Create a new directory for your project and navigate to it in your terminal.

Initialize a new Node.js project:

$ npm init -y

Install the required dependencies (http is built into Node.js, so it does not need to be installed from npm):

$ npm install twilio ws ngrok dotenv

Step 2: Configure Environment Variables

Create a .env file in your project root and add the following:

1 TWILIO_ACCOUNT_SID="your_twilio_account_sid"
2 TWILIO_AUTH_TOKEN="your_twilio_auth_token"
3 CARTESIA_API_KEY="your_cartesia_api_key"

Replace the placeholder values with your actual credentials.

Step 3: Create the Main Script

Create a file named app.js (or any name you prefer) and add the following code:

1 const twilio = require('twilio');
2 const WebSocket = require('ws');
3 const http = require('http');
4 const ngrok = require('ngrok');
5 const dotenv = require('dotenv');
6 
7 // Load environment variables
8 dotenv.config();
9 
10 // Function to get a value from environment variable or command line argument
/**
 * Looks up a configuration value by name.
 *
 * Resolution order: the environment variable `key`, then a `key=value`
 * command-line argument, then `defaultValue`. Empty strings are treated as
 * "not set" and fall through to the next source.
 *
 * @param {string} key - Name of the setting (e.g. 'TWILIO_ACCOUNT_SID').
 * @param {string} [defaultValue] - Value returned when neither source has it.
 * @returns {string|undefined} The resolved value, or `defaultValue`.
 */
function getConfig(key, defaultValue = undefined) {
  const prefix = `${key}=`;
  const cliArg = process.argv.find((arg) => arg.startsWith(prefix));
  // Slice off the prefix rather than split('=') so that values which
  // themselves contain '=' (e.g. base64-padded tokens) are kept intact.
  const cliValue = cliArg?.slice(prefix.length);
  return process.env[key] || cliValue || defaultValue;
}
14 
// Names of the settings that must be present before the script can run.
const requiredConfig = ['TWILIO_ACCOUNT_SID', 'TWILIO_AUTH_TOKEN', 'CARTESIA_API_KEY'];

// Resolve each required setting from the environment or the command line.
const config = Object.fromEntries(
  requiredConfig.map((name) => [name, getConfig(name)]),
);

// Abort on the first setting that resolved to nothing.
const firstMissing = requiredConfig.find((name) => !config[name]);
if (firstMissing !== undefined) {
  console.error(`Missing required configuration: ${firstMissing}`);
  process.exit(1);
}

// Twilio REST client used to place and hang up the call.
const client = twilio(config.TWILIO_ACCOUNT_SID, config.TWILIO_AUTH_TOKEN);

Step 4: Configure Cartesia TTS

Add the following Cartesia TTS configuration to app.js, below the code from Step 3, and set the values according to your needs:

// Cartesia TTS WebSocket endpoint. The query string is built with
// URLSearchParams so the API key is percent-encoded instead of being
// interpolated verbatim into the URL.
const ttsUrl = new URL('wss://api.cartesia.ai/tts/websocket');
ttsUrl.searchParams.set('api_key', config.CARTESIA_API_KEY);
ttsUrl.searchParams.set('cartesia_version', '2024-06-10');
const TTS_WEBSOCKET_URL = ttsUrl.href;

// Model used for speech generation.
const modelId = 'sonic-english';

// Voice selection; replace VOICE_ID with a real voice id.
const voice = {
  mode: 'id',
  id: 'VOICE_ID', // You can check available voices using the Cartesia API or at https://play.cartesia.ai
};

// Text that is spoken to the call recipient once the media stream starts.
const partialResponse = "Hi there, my name is Cartesia. I hope you're having a great day!";

Step 5: Set Up Twilio Calling

Configure the phone numbers for the call: outbound is the number you want to call, and inbound is your Twilio number that the call is placed from:

// Destination number: replace with the number you want to call.
const outbound = '+1234567890';
// Caller number: replace with your Twilio number.
const inbound = '+1234567890';

Step 6: Implement Main Logic

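The functions in this step share a log(message) helper and the variables ttsWebSocket, callSid, messageComplete, and audioChunksReceived; make sure these are declared in app.js before the functions run. The main() function orchestrates the entire process: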

Connects to the Cartesia TTS WebSocket
Tests the TTS WebSocket
Sets up a Twilio WebSocket server
Creates an ngrok tunnel for the Twilio WebSocket
Initiates the call using Twilio

// Shared runtime state used by the functions in this script. These must be
// declared explicitly: reading an undeclared variable throws a ReferenceError.
let ttsWebSocket; // Open connection to the Cartesia TTS WebSocket
let callSid; // SID of the Twilio call once it has been created
let messageComplete = false; // Set once enough audio has been forwarded
let audioChunksReceived = 0; // Number of audio chunks forwarded to Twilio

/**
 * Writes a timestamped line to the console.
 *
 * @param {*} message - Value to log; it is converted to a string.
 */
function log(message) {
  console.log(`[${new Date().toISOString()}] ${message}`);
}

/**
 * Opens the Cartesia TTS WebSocket and stores it in `ttsWebSocket`.
 *
 * @returns {Promise<WebSocket>} Resolves with the socket once it is open;
 *   rejects if the socket errors or closes before it opens.
 */
function connectToTTSWebSocket() {
  return new Promise((resolve, reject) => {
    log('Attempting to connect to TTS WebSocket');
    ttsWebSocket = new WebSocket(TTS_WEBSOCKET_URL);

    ttsWebSocket.on('open', () => {
      log('Connected to TTS WebSocket');
      resolve(ttsWebSocket);
    });

    // After the promise has settled these reject() calls are no-ops, so
    // later errors and closes are only logged.
    ttsWebSocket.on('error', (error) => {
      log(`TTS WebSocket error: ${error.message}`);
      reject(error);
    });

    ttsWebSocket.on('close', (code, reason) => {
      log(`TTS WebSocket closed. Code: ${code}, Reason: ${reason}`);
      reject(new Error('TTS WebSocket closed unexpectedly'));
    });
  });
}
22 
/**
 * Sends a text-to-speech request for `message` over the TTS WebSocket.
 *
 * The request asks for raw 8 kHz mu-law audio.
 *
 * @param {string} message - Text to be spoken.
 */
function sendTTSMessage(message) {
  const outputFormat = {
    container: 'raw',
    encoding: 'pcm_mulaw',
    sample_rate: 8000,
  };
  const request = {
    model_id: modelId,
    transcript: message,
    voice,
    output_format: outputFormat,
  };

  log(`Sending message to TTS WebSocket: ${message}`);
  const serialized = JSON.stringify(request);
  ttsWebSocket.send(serialized);
}
38 
/**
 * Smoke-tests the TTS WebSocket by sending a short message and waiting for
 * the first response.
 *
 * The temporary 'message' listener and the timer are always released once
 * the promise settles, so nothing from the test stays attached to the socket.
 *
 * @returns {Promise<void>} Resolves on the first message received; rejects
 *   after 10 seconds without one, or if sending the test message fails.
 */
function testTTSWebSocket() {
  return new Promise((resolve, reject) => {
    const testMessage = 'This is a test message';

    const onMessage = () => {
      cleanup();
      log('Received audio chunk from TTS for test message');
      resolve();
    };

    const timeout = setTimeout(() => {
      cleanup();
      reject(new Error('Timeout: No audio received from TTS WebSocket'));
    }, 10000); // 10 second timeout

    // Releases the timer and the temporary listener.
    function cleanup() {
      clearTimeout(timeout);
      ttsWebSocket.off('message', onMessage);
    }

    ttsWebSocket.on('message', onMessage);

    try {
      sendTTSMessage(testMessage);
    } catch (error) {
      // A synchronous send failure must not leave the timer running.
      cleanup();
      reject(error);
    }
  });
}
63 
/**
 * Places the outbound call and connects it to the media-stream WebSocket.
 *
 * On success the call SID is stored in `callSid`.
 *
 * @param {string} twilioWebsocketUrl - Public wss:// URL Twilio should stream to.
 * @returns {Promise<void>}
 * @throws Re-throws any error raised while creating the call.
 */
async function startCall(twilioWebsocketUrl) {
  try {
    log(`Initiating call with WebSocket URL: ${twilioWebsocketUrl}`);

    // TwiML that connects the call to the media stream.
    const streamTwiml = `<Response><Connect><Stream url="${twilioWebsocketUrl}"/></Connect></Response>`;
    const callParams = {
      twiml: streamTwiml,
      to: outbound, // Number being called
      from: inbound, // Twilio number placing the call
    };
    const createdCall = await client.calls.create(callParams);

    callSid = createdCall.sid;
    log(`Call initiated. SID: ${callSid}`);
  } catch (callError) {
    log(`Error initiating call: ${callError.message}`);
    throw callError;
  }
}
80 
/**
 * Ends the active call by setting its status to 'completed'.
 *
 * Does nothing (apart from logging) when no call SID has been recorded yet.
 * Errors from the Twilio API are logged and not re-thrown, so a failed
 * hang-up never crashes the script.
 *
 * @returns {Promise<void>}
 */
async function hangupCall() {
  // `typeof` is used so the guard is safe even if callSid was never assigned.
  const activeCallSid = typeof callSid === 'undefined' ? null : callSid;
  if (!activeCallSid) {
    log('No active call to hang up');
    return;
  }

  try {
    log(`Attempting to hang up call: ${activeCallSid}`);
    await client.calls(activeCallSid).update({ status: 'completed' });
    log('Call hung up successfully');
  } catch (error) {
    log(`Error hanging up call: ${error.message}`);
  }
}
90 
/**
 * Starts a local HTTP + WebSocket server for Twilio media streams and relays
 * TTS audio into each connected stream.
 *
 * For every Twilio connection:
 *  - on 'start' the stream SID is recorded and the greeting is sent to TTS;
 *  - every TTS message that carries audio `data` is forwarded to Twilio as a
 *    'media' frame;
 *  - after 50 forwarded chunks a single hang-up is scheduled 2 s later;
 *  - on 'stop' the call is hung up;
 *  - when the Twilio socket closes, the TTS listener is detached again.
 *
 * @returns {Promise<number>} Resolves with the port the server listens on;
 *   rejects if the HTTP server reports an error.
 */
function setupTwilioWebSocket() {
  // Number of forwarded audio chunks after which the call is ended.
  const AUDIO_CHUNK_HANGUP_THRESHOLD = 50;

  return new Promise((resolve, reject) => {
    const server = http.createServer((req, res) => {
      log(`Received HTTP request: ${req.method} ${req.url}`);
      res.writeHead(200);
      res.end('WebSocket server is running');
    });

    const wss = new WebSocket.Server({ server });

    log('WebSocket server created');

    wss.on('connection', (twilioWs, request) => {
      log(`Twilio WebSocket connection attempt from ${request.socket.remoteAddress}`);

      // Per-connection state. Kept local so it is always declared and is
      // reset for every new media stream.
      let streamSid = null;
      let audioChunksReceived = 0;
      let messageComplete = false;

      // Relays one TTS message to Twilio as a media frame.
      const forwardTTSAudio = (audioChunk) => {
        log('Received audio chunk from TTS');
        try {
          if (!streamSid) {
            log('Warning: Received audio chunk but streamSid is not set');
            return;
          }

          // Messages without audio data would produce a media frame with no
          // payload, so they are skipped and not counted.
          const payload = JSON.parse(audioChunk).data;
          if (!payload) {
            log('Ignoring TTS message without audio data');
            return;
          }

          twilioWs.send(JSON.stringify({
            event: 'media',
            streamSid,
            media: { payload },
          }));

          audioChunksReceived++;
          log(`Audio chunks received: ${audioChunksReceived}`);

          // Schedule the hang-up exactly once, not for every later chunk.
          if (audioChunksReceived >= AUDIO_CHUNK_HANGUP_THRESHOLD && !messageComplete) {
            messageComplete = true;
            log('Message complete, preparing to hang up');
            setTimeout(hangupCall, 2000);
          }
        } catch (error) {
          log(`Error sending audio chunk to Twilio: ${error.message}`);
        }
      };

      twilioWs.on('message', (message) => {
        try {
          const msg = JSON.parse(message);
          log(`Received message from Twilio: ${JSON.stringify(msg)}`);

          if (msg.event === 'start') {
            log('Media stream started');
            streamSid = msg.start.streamSid;
            log(`Stream SID: ${streamSid}`);
            sendTTSMessage(partialResponse);
          } else if (msg.event === 'media' && !messageComplete) {
            log('Received media event');
          } else if (msg.event === 'stop') {
            log('Media stream stopped');
            hangupCall();
          }
        } catch (error) {
          log(`Error processing Twilio message: ${error.message}`);
        }
      });

      twilioWs.on('close', (code, reason) => {
        log(`Twilio WebSocket disconnected. Code: ${code}, Reason: ${reason}`);
        // Stop relaying audio to a socket that no longer exists.
        ttsWebSocket.off('message', forwardTTSAudio);
      });

      twilioWs.on('error', (error) => {
        log(`Twilio WebSocket error: ${error.message}`);
      });

      // Handle incoming audio chunks from TTS WebSocket
      ttsWebSocket.on('message', forwardTTSAudio);

      log('Twilio WebSocket connected and handlers set up');
    });

    wss.on('error', (error) => {
      log(`WebSocket server error: ${error.message}`);
    });

    // Port 0 lets the OS pick a free port.
    server.listen(0, () => {
      const port = server.address().port;
      log(`Twilio WebSocket server is running on port ${port}`);
      resolve(port);
    });

    server.on('error', (error) => {
      log(`HTTP server error: ${error.message}`);
      reject(error);
    });
  });
}
185 
/**
 * Exposes the local server through an ngrok tunnel.
 *
 * @param {number} port - Local port the WebSocket server listens on.
 * @returns {Promise<string>} The tunnel URL with its https:// scheme
 *   rewritten to wss://.
 * @throws Re-throws any error raised while establishing the tunnel.
 */
async function setupNgrokTunnel(port) {
  try {
    const publicUrl = await ngrok.connect(port);
    // Twilio streams over WebSocket, so swap the scheme.
    const socketUrl = publicUrl.replace('https://', 'wss://');
    log(`ngrok tunnel established: ${socketUrl}`);
    return socketUrl;
  } catch (tunnelError) {
    log(`Error setting up ngrok tunnel: ${tunnelError.message}`);
    throw tunnelError;
  }
}
198 
/**
 * Runs the whole flow in order: connect to TTS, verify it responds, start the
 * local Twilio WebSocket server, expose it through ngrok, and place the call.
 *
 * Any failure along the way is logged and stops the remaining steps.
 *
 * @returns {Promise<void>}
 */
async function main() {
  try {
    log('Starting application');

    // 1. Cartesia TTS connection
    await connectToTTSWebSocket();
    log('TTS WebSocket connected successfully');

    // 2. Verify that TTS answers
    await testTTSWebSocket();
    log('TTS WebSocket test passed successfully');

    // 3. Local server for the Twilio media stream
    const localPort = await setupTwilioWebSocket();
    log(`Twilio WebSocket server set up on port ${localPort}`);

    // 4. Public tunnel, then 5. the call itself
    const publicStreamUrl = await setupNgrokTunnel(localPort);
    await startCall(publicStreamUrl);
  } catch (failure) {
    log(`Error in main function: ${failure.message}`);
  }
}

// Run the script
main();

Step 7: Run the Application

To run the application, use the following command:

$ node app.js

How It Works

The script establishes a connection to Cartesia’s TTS WebSocket.
It sets up a local WebSocket server to communicate with Twilio.
An ngrok tunnel is created to expose the local WebSocket server to the internet.
A call is initiated using Twilio, connecting to the ngrok tunnel.
When the call connects, the script sends the predefined message to Cartesia’s TTS.
Cartesia converts the text to speech and sends audio chunks back.
The script forwards these audio chunks to Twilio, which plays them on the call.

Customization

To change the spoken message, modify the partialResponse variable.
Adjust the voice parameters in the voice object to change the TTS voice characteristics.
Modify the audioChunksReceived threshold (50 chunks in the example) to control when the call should end.

Troubleshooting

If you encounter any issues, check the console logs for detailed error messages.
Ensure all required environment variables are correctly set.
Verify that your Twilio and Cartesia credentials are valid and have the necessary permissions.

1	TWILIO_ACCOUNT_SID="your_twilio_account_sid"
2	TWILIO_AUTH_TOKEN="your_twilio_auth_token"
3	CARTESIA_API_KEY="your_cartesia_api_key"

1	const twilio = require('twilio');
2	const WebSocket = require('ws');
3	const http = require('http');
4	const ngrok = require('ngrok');
5	const dotenv = require('dotenv');
6
7	// Load environment variables
8	dotenv.config();
9
10	// Function to get a value from environment variable or command line argument
11	function getConfig(key, defaultValue = undefined) {
12	return process.env[key] || process.argv.find(arg => arg.startsWith(`${key}=`))?.split('=')[1] || defaultValue;
13	}
14
15	// Configuration
16	const config = {
17	TWILIO_ACCOUNT_SID: getConfig('TWILIO_ACCOUNT_SID'),
18	TWILIO_AUTH_TOKEN: getConfig('TWILIO_AUTH_TOKEN'),
19	CARTESIA_API_KEY: getConfig('CARTESIA_API_KEY'),
20	};
21
22	// Validate required configuration
23	const requiredConfig = ['TWILIO_ACCOUNT_SID', 'TWILIO_AUTH_TOKEN', 'CARTESIA_API_KEY'];
24	for (const key of requiredConfig) {
25	if (!config[key]) {
26	console.error(`Missing required configuration: ${key}`);
27	process.exit(1);
28	}
29	}
30
31	const client = twilio(config.TWILIO_ACCOUNT_SID, config.TWILIO_AUTH_TOKEN);

1	const TTS_WEBSOCKET_URL = `wss://api.cartesia.ai/tts/websocket?api_key=${config.CARTESIA_API_KEY}&cartesia_version=2024-06-10`;
2	const modelId = 'sonic-english';
3	const voice = {
4	'mode': 'id',
5	'id': "VOICE_ID" // You can check available voices using the Cartesia API or at https://play.cartesia.ai
6	};
7	const partialResponse = 'Hi there, my name is Cartesia. I hope youre having a great day!';

1	const outbound = "+1234567890"; // Replace with the number you want to call
2	const inbound = "+1234567890"; // Replace with your Twilio number

1	function connectToTTSWebSocket() {
2	return new Promise((resolve, reject) => {
3	log('Attempting to connect to TTS WebSocket');
4	ttsWebSocket = new WebSocket(TTS_WEBSOCKET_URL);
5
6	ttsWebSocket.on('open', () => {
7	log('Connected to TTS WebSocket');
8	resolve(ttsWebSocket);
9	});
10
11	ttsWebSocket.on('error', (error) => {
12	log(`TTS WebSocket error: ${error.message}`);
13	reject(error);
14	});
15
16	ttsWebSocket.on('close', (code, reason) => {
17	log(`TTS WebSocket closed. Code: ${code}, Reason: ${reason}`);
18	reject(new Error('TTS WebSocket closed unexpectedly'));
19	});
20	});
21	}
22
23	function sendTTSMessage(message) {
24	const textMessage = {
25	'model_id': modelId,
26	'transcript': message,
27	'voice': voice,
28	'output_format': {
29	'container': 'raw',
30	'encoding': 'pcm_mulaw',
31	'sample_rate': 8000
32	}
33	};
34
35	log(`Sending message to TTS WebSocket: ${message}`);
36	ttsWebSocket.send(JSON.stringify(textMessage));
37	}
38
39	function testTTSWebSocket() {
40	return new Promise((resolve, reject) => {
41	const testMessage = 'This is a test message';
42	let receivedAudio = false;
43
44	sendTTSMessage(testMessage);
45
46	const timeout = setTimeout(() => {
47	if (!receivedAudio) {
48	reject(new Error('Timeout: No audio received from TTS WebSocket'));
49	}
50	}, 10000); // 10 second timeout
51
52	ttsWebSocket.on('message', (audioChunk) => {
53	if (!receivedAudio) {
54	log(audioChunk);
55	log('Received audio chunk from TTS for test message');
56	receivedAudio = true;
57	clearTimeout(timeout);
58	resolve();
59	}
60	});
61	});
62	}
63
64	async function startCall(twilioWebsocketUrl) {
65	try {
66	log(`Initiating call with WebSocket URL: ${twilioWebsocketUrl}`);
67	const call = await client.calls.create({
68	twiml: `<Response><Connect><Stream url="${twilioWebsocketUrl}"/></Connect></Response>`,
69	to: outbound, // Replace with the phone number you want to call
70	from: inbound // Replace with your Twilio phone number
71	});
72
73	callSid = call.sid;
74	log(`Call initiated. SID: ${callSid}`);
75	} catch (error) {
76	log(`Error initiating call: ${error.message}`);
77	throw error;
78	}
79	}
80
81	async function hangupCall() {
82	try {
83	log(`Attempting to hang up call: ${callSid}`);
84	await client.calls(callSid).update({status: 'completed'});
85	log('Call hung up successfully');
86	} catch (error) {
87	log(`Error hanging up call: ${error.message}`);
88	}
89	}
90
91	function setupTwilioWebSocket() {
92	return new Promise((resolve, reject) => {
93	const server = http.createServer((req, res) => {
94	log(`Received HTTP request: ${req.method} ${req.url}`);
95	res.writeHead(200);
96	res.end('WebSocket server is running');
97	});
98
99	const wss = new WebSocket.Server({ server });
100
101	log('WebSocket server created');
102
103	wss.on('connection', (twilioWs, request) => {
104	log(`Twilio WebSocket connection attempt from ${request.socket.remoteAddress}`);
105
106	let streamSid = null;
107
108	twilioWs.on('message', (message) => {
109	try {
110	const msg = JSON.parse(message);
111	log(`Received message from Twilio: ${JSON.stringify(msg)}`);
112
113	if (msg.event === 'start') {
114	log('Media stream started');
115	streamSid = msg.start.streamSid;
116	log(`Stream SID: ${streamSid}`);
117	sendTTSMessage(partialResponse);
118	} else if (msg.event === 'media' && !messageComplete) {
119	log('Received media event');
120	} else if (msg.event === 'stop') {
121	log('Media stream stopped');
122	hangupCall();
123	}
124	} catch (error) {
125	log(`Error processing Twilio message: ${error.message}`);
126	}
127	});
128
129	twilioWs.on('close', (code, reason) => {
130	log(`Twilio WebSocket disconnected. Code: ${code}, Reason: ${reason}`);
131	});
132
133	twilioWs.on('error', (error) => {
134	log(`Twilio WebSocket error: ${error.message}`);
135	});
136
137	// Handle incoming audio chunks from TTS WebSocket
138	ttsWebSocket.on('message', (audioChunk) => {
139	log('Received audio chunk from TTS');
140	try {
141	if (streamSid) {
142	twilioWs.send(JSON.stringify({
143	event: 'media',
144	streamSid: streamSid,
145	media: {
146	payload: JSON.parse(audioChunk)['data']
147	}
148	}));
149
150	audioChunksReceived++;
151	log(`Audio chunks received: ${audioChunksReceived}`);
152
153	if (audioChunksReceived >= 50) {
154	messageComplete = true;
155	log('Message complete, preparing to hang up');
156	setTimeout(hangupCall, 2000);
157	}
158	} else {
159	log('Warning: Received audio chunk but streamSid is not set');
160	}
161	} catch (error) {
162	log(`Error sending audio chunk to Twilio: ${error.message}`);
163	}
164	});
165
166	log('Twilio WebSocket connected and handlers set up');
167	});
168
169	wss.on('error', (error) => {
170	log(`WebSocket server error: ${error.message}`);
171	});
172
173	server.listen(0, () => {
174	const port = server.address().port;
175	log(`Twilio WebSocket server is running on port ${port}`);
176	resolve(port);
177	});
178
179	server.on('error', (error) => {
180	log(`HTTP server error: ${error.message}`);
181	reject(error);
182	});
183	});
184	}
185
186	async function setupNgrokTunnel(port) {
187	try {
188	const httpsUrl = await ngrok.connect(port);
189	// Convert https:// to wss://
190	const wssUrl = httpsUrl.replace('https://', 'wss://');
191	log(`ngrok tunnel established: ${wssUrl}`);
192	return wssUrl;
193	} catch (error) {
194	log(`Error setting up ngrok tunnel: ${error.message}`);
195	throw error;
196	}
197	}
198
199	async function main() {
200	try {
201	log('Starting application');
202
203	await connectToTTSWebSocket();
204	log('TTS WebSocket connected successfully');
205
206	await testTTSWebSocket();
207	log('TTS WebSocket test passed successfully');
208
209	const twilioWebsocketPort = await setupTwilioWebSocket();
210	log(`Twilio WebSocket server set up on port ${twilioWebsocketPort}`);
211
212	const twilioWebsocketUrl = await setupNgrokTunnel(twilioWebsocketPort);
213
214	await startCall(twilioWebsocketUrl);
215	} catch (error) {
216	log(`Error in main function: ${error.message}`);
217	}
218	}
219
220	// Run the script
221	main();