我需要一双新的眼睛,也许还需要一些新思路。我已经在这个问题上卡了两天了。
这个想法很简单,我有静态文本,通过他们的 websocket 发送到 tts 的 elevenlabs AI,它批量返回给我 ulaw_8000 音频流,然后我想将其流式传输到 twilio 以播放给呼叫者,如下所示对来电者的话的回应。
我参考了:https://www.twilio.com/docs/voice/twiml/stream#attributes-status-callback 、https://www.ai-shift.co.jp/techblog/2844 和 https://www.twilio.com/docs/voice/twiml/stream#bi-direction-media-streams ,以及许多其他类似来源。
我觉得我已经很接近它了,但我又不太能定位它。这是我的代码(请原谅我的游乐场代码)
import { createServer } from 'http';
import express from 'express';
import { WebSocketServer, WebSocket } from 'ws';
import 'dotenv/config';
import voiced from 'twilio'
const VoiceResponse = voiced.twiml.VoiceResponse;
// ElevenLabs TTS configuration: voice, model, and the public ngrok tunnel
// Twilio uses to reach this server (both the TwiML webhooks and the
// wss:// media-stream endpoint).
const voiceId = "somevoiceid";
const model = 'eleven_monolingual_v1';
const ngrokURL = 'https://myngrok-url-here.app'
// StreamSid of the active Twilio media stream; set by /statusCallBack and
// required on every outbound "media" frame sent back to Twilio.
// NOTE(review): a single module-level slot means only ONE concurrent call
// can work correctly — verify this is acceptable for the playground.
let streamsID = '';
const app = express();
app.use(express.urlencoded({ extended: true })); // Ensure Express can parse URL-encoded bodies sent by Twilio
const server = createServer(app);
// Simple health-check endpoint.
app.get('/', (_, res) => res.type('text').send('Twilio media stream transcriber'));

// Twilio <Stream> status callback: record the StreamSid so outbound
// "media" frames can be addressed to the right media stream.
app.post('/statusCallBack', (req, res) => {
  console.log(req.body)
  streamsID = req.body.StreamSid;
  console.log(streamsID);
  // BUG FIX: the original handler never answered, so Twilio's callback
  // request hung until it timed out. Acknowledge explicitly.
  res.sendStatus(200);
});
// Endpoint to handle incoming calls and gather speech input
// Endpoint to handle incoming calls and gather speech input.
// Returns TwiML that (1) connects a bidirectional media stream and
// (2) gathers the caller's speech.
app.post('/', (req, res) => {
const response = new VoiceResponse();
const connect = response.connect();
// NOTE(review): per Twilio's docs, <Connect><Stream> is a BLOCKING verb —
// any TwiML after it (the <Gather>/<Say> below) is not executed until the
// stream ends. If the prompt is expected to play while the stream is open,
// a non-blocking <Start><Stream> may be what's intended — but only
// <Connect> streams accept inbound "media" frames. Confirm which behavior
// is wanted before changing this.
connect.stream({
url: `${ngrokURL}`,
statusCallback: `${ngrokURL}/statusCallBack`,
statusCallbackMethod: "POST"
});
console.log(response.toString());
// Use the <Gather> verb to collect speech input and define the action URL to process the input
const gather = response.gather({
input: 'speech',
timeout: 3, // Adjust the timeout as needed
action: '/process_speech', // Endpoint to process the speech input
method: 'POST',
});
gather.say('Thank you for calling. How may you be assisted?');
// If the caller doesn't say anything, <Gather> will finish, and you can provide additional instructions
response.say("I didn't catch that. Please try again.");
res.type('xml').send(response.toString());
});
// Endpoint to process the speech input and respond during the call
// Endpoint to process the speech input and respond during the call.
// Flow: open an ElevenLabs TTS websocket, forward each generated
// ulaw_8000 audio chunk to the Twilio media-stream websocket
// (socketTwilio) as a "media" frame, then return follow-up TwiML once
// ElevenLabs closes the connection.
app.post('/process_speech', (req, res) => {
  const speechResult = req.body.SpeechResult;
  const response = new VoiceResponse();
  if (speechResult) {
    // Open the TTS socket only when there is speech to answer; the
    // original opened it unconditionally and leaked it on the else path.
    const wsUrl = `wss://api.elevenlabs.io/v1/text-to-speech/${voiceId}/stream-input?model_id=${model}&output_format=ulaw_8000`;
    const socket = new WebSocket(wsUrl);
    response.say("Here is a poem to soothe you.")
    socket.onopen = function (event) {
      // BOS message: a single space primes the stream and carries the
      // voice settings and API key.
      const bosMessage = {
        "text": " ",
        "voice_settings": {
          "stability": 0.5,
          "similarity_boost": 0.8
        },
        "xi_api_key": process.env.ELEVENLABS_API_KEY, // replace with your API key
      };
      socket.send(JSON.stringify(bosMessage));
      console.log(`SpeechResult: ${req.body.SpeechResult}`);
      const textMessage = {
        "text": "roses are red,\n" +
          "violets are blue\n" +
          "You're an idiot but I still love you\n"
      };
      socket.send(JSON.stringify(textMessage));
      // EOS: an empty string tells ElevenLabs the input is complete.
      const eosMessage = {
        "text": ""
      };
      socket.send(JSON.stringify(eosMessage));
    };
    socket.onmessage = function (event) {
      const responded = JSON.parse(event.data);
      if (responded.audio) {
        console.log("Received audio chunk");
        // BUG FIX (the silence on the call): responded.audio is ALREADY a
        // base64 string of raw ulaw_8000 bytes — exactly what Twilio's
        // "media" payload expects. The old code ran it through
        // Buffer.from(responded.audio), which interprets the base64 TEXT
        // as UTF-8 bytes, and then base64-encoded those bytes AGAIN, so
        // Twilio received double-encoded garbage. Forward it verbatim
        // (equivalently: Buffer.from(responded.audio, 'base64')
        // .toString('base64')).
        const payload = responded.audio;
        console.log("Trying to send...")
        const load = {
          "event": "media",
          "streamSid": streamsID,
          "media": {
            "payload": payload
          }
        }
        console.log(load)
        socketTwilio.send(JSON.stringify(load));
      } else {
        console.log("No audio data in the response");
      }
      if (responded.isFinal) {
        // the generation is complete
      }
      if (responded.normalizedAlignment) {
        // use the alignment info if needed
      }
    };
    // Handle errors
    socket.onerror = function (error) {
      console.error(`WebSocket Error: ${error}`);
    };
    // Handle socket closing
    socket.onclose = function (event) {
      if (event.wasClean) {
        console.info(`Connection closed cleanly, code=${event.code}, reason=${event.reason}`);
        const gather = response.gather({
          input: 'speech',
          timeout: 3, // Adjust the timeout as needed
          action: '/follow_up', // Endpoint to process the speech input
          method: 'POST',
        });
        gather.say('Will there be anything else?');
        res.type('xml').send(response.toString());
      } else {
        console.warn('Connection died');
        // Still answer Twilio so its webhook request doesn't time out.
        if (!res.headersSent) {
          res.type('xml').send(response.toString());
        }
      }
    };
  } else {
    // Handle the case where no speech was detected
    response.say("Sorry, I didn't get that. Please try speaking again.");
    // BUG FIX: this branch built TwiML but never sent it (the send was
    // commented out at the bottom), leaving Twilio to time out.
    res.type('xml').send(response.toString());
  }
});
// Follow-up gather handler: replay the menu on "yes", otherwise hang up.
app.post('/follow_up', (req, res) => {
  // BUG FIX: SpeechResult is absent when <Gather> times out with no
  // speech, so .toLowerCase() on undefined crashed the handler (Twilio
  // would then play its generic application-error message).
  const speechResult = (req.body.SpeechResult ?? '').toLowerCase();
  const response = new VoiceResponse();
  if (speechResult.includes('yes')) {
    // If caller says yes, loop back to initial gather
    response.redirect({ method: 'POST' }, '/');
  } else {
    // End the call if they say no, stay silent, or are unclear
    response.say('Thank you for calling. Goodbye!');
  }
  res.type('xml').send(response.toString());
});
// NOTE(review): `socketTwilio` is never declared anywhere in this file —
// as written, this top-level statement would throw a ReferenceError at
// startup unless it is defined in code not shown here. The Twilio media
// socket is the `ws` handed to the wss 'connection' handler below;
// presumably socketTwilio should be assigned there — verify.
socketTwilio.addEventListener('open', function (event) {
// Send the audio blob as binary data
console.log("Twilio Websocket opened");
});
// WebSocket server setup remains unchanged
const wss = new WebSocketServer({ server });
wss.on('connection', (ws) => {
console.log('Twilio media stream WebSocket connected');
ws.on('message', (message) => {
// Process WebSocket messages as before
console.log("Twilio: ", message)
});
ws.on('close', () => {
console.log('Twilio media stream WebSocket disconnected');
});
});
console.log('Listening on port 8080');
server.listen(8080);
我的输出没有报错,呼叫过程也很顺利,response.say 按预期播报了指定内容,但 twilio 并没有播放音频流。我只听到大约一两秒的空白,然后 twilio 就直接继续执行下一条
gather.say('Will there be anything else?');
。我知道 Elevenlabs 返回音频流效果很好。我也得到“twilio 媒体流已连接”,即使在通过 websocket 进行流传输后,我也看到 twilio 套接字的 onMessage 回调的响应,如下所示:Twilio: <Buffer 7b ... ... 24531 more bytes>
我隐约感觉到问题可能出在哪里,但又无法确定,因为我已经尝试过很多办法了。
对可能出现的问题有什么见解吗?(如果这个问题提得不好或属于重复提问,敬请见谅)
目前也在解决这个问题,你解决了吗?
我在 Elevenlabs 文档上找到了这个:
if (response.audio) {
// decode and handle the audio data (e.g., play it)
const audioChunk = atob(response.audio); // decode base64
console.log("Received audio chunk");
}
还有一个工作项目中的一些Python代码片段遵循相同的想法:
if data.get("audio"):
audio_chunk = base64.b64decode(data["audio"])
await websocket_server.send_json({
"event": "media",
"streamSid": self.stream_sid,
"media": {"payload": base64.b64encode(audio_chunk).decode('utf-8')}
})