我想使用 Twilio 动词将音频发送到 Azure 的连续语音转文本 API,以便在通话时进行实时转录。我已成功使用下面的代码将音频文件发送到 Azure Speech to Text,但当我推送 Twilio Stream 数据时,我没有得到转录结果。我做错了什么?
#!/usr/bin/env node
// Real-time transcription bridge: Twilio Media Streams -> Azure Speech-to-Text.
const WebSocket = require("ws");
const fs = require("fs");
const mulaw = require("mulaw-js");
const base64 = require("js-base64");
const express = require("express");
const sdk = require("microsoft-cognitiveservices-speech-sdk");
const stream = require("stream");

const app = express();
const server = require("http").createServer(app);
const wss = new WebSocket.Server({ server });

// SECURITY FIX: never hard-code (or publish!) a subscription key.
// Read credentials from the environment instead.
const subscriptionKey = process.env.AZURE_SPEECH_KEY;
const serviceRegion = process.env.AZURE_SPEECH_REGION || "eastus"; // e.g., "westus"
const language = "en-US";

// Twilio media streams carry 8 kHz mono mu-law audio; after decoding we push
// 8 kHz / 16-bit / 1-channel linear PCM into this stream for Azure.
const azurePusher = sdk.AudioInputStream.createPushStream(
  sdk.AudioStreamFormat.getWaveFormatPCM(8000, 16, 1)
);
const audioConfig = sdk.AudioConfig.fromStreamInput(azurePusher);
const speechConfig = sdk.SpeechConfig.fromSubscription(subscriptionKey, serviceRegion);
speechConfig.speechRecognitionLanguage = language;
const recognizer = new sdk.SpeechRecognizer(speechConfig, audioConfig);
// Fires repeatedly with partial (hypothesis) text while audio is being processed.
recognizer.recognizing = (s, e) => {
  console.log(`RECOGNIZING: Text=${e.result.text}`);
};

// Fires once per utterance with the final recognition result.
recognizer.recognized = (s, e) => {
  if (e.result.reason === sdk.ResultReason.RecognizedSpeech) {
    console.log(`RECOGNIZED: Text=${e.result.text}`);
  } else if (e.result.reason === sdk.ResultReason.NoMatch) {
    console.log("NOMATCH: Speech could not be recognized.");
  }
};

// Fires when recognition is canceled (e.g. bad key/region or network failure).
recognizer.canceled = (s, e) => {
  console.log(`CANCELED: Reason=${e.reason}`);
  if (e.reason === sdk.CancellationReason.Error) {
    // BUG FIX: the original messages began with a stray '"' inside the
    // template literal, producing `"CANCELED: ...` in the log output.
    console.log(`CANCELED: ErrorCode=${e.errorCode}`);
    console.log(`CANCELED: ErrorDetails=${e.errorDetails}`);
    console.log("CANCELED: Did you update the key and location/region info?");
  }
  recognizer.stopContinuousRecognitionAsync();
};

recognizer.sessionStopped = (s, e) => {
  console.log("\n Session stopped event.");
  recognizer.stopContinuousRecognitionAsync();
};
// Start continuous recognition; results arrive via the handlers assigned above.
recognizer.startContinuousRecognitionAsync(
  () => {
    console.log("Continuous Reco Started");
  },
  (err) => {
    console.trace("err - " + err);
    // BUG FIX: `recognizer` is declared with `const`, so the original
    // `recognizer = undefined` would throw a TypeError here. Closing the
    // recognizer on startup failure is sufficient.
    recognizer.close();
  }
);
// Handle Web Socket Connection
// Handle Twilio Media Stream WebSocket connections.
// Twilio sends JSON messages: "connected", "start", "media" (base64 mu-law
// audio chunks), and "stop".
wss.on("connection", function connection(ws) {
  console.log("New Connection Initiated");
  ws.on("message", function incoming(message) {
    const msg = JSON.parse(message);
    switch (msg.event) {
      case "connected":
        console.log(`A new call has connected.`);
        break;
      case "start":
        console.log(`Starting Media Stream ${msg.streamSid}`);
        break;
      case "media": {
        // Lightweight progress indicator (payload size, \u001b[0G = return to col 0).
        process.stdout.write(msg.media.payload.length + " bytes\u001b[0G");
        // BUG FIX: js-base64's decode() returns a UTF-8 *string*, which
        // corrupts binary audio data. Decode the payload straight to bytes.
        // (Also fixes `streampayload` being an implicit global.)
        const muLawBytes = Buffer.from(msg.media.payload, "base64");
        // mulaw.decode() yields 16-bit PCM samples (an Int16Array). Azure's
        // push stream expects raw little-endian bytes, so hand it the
        // underlying byte view rather than the Int16Array itself.
        const pcmSamples = mulaw.decode(muLawBytes);
        azurePusher.write(
          new Uint8Array(pcmSamples.buffer, pcmSamples.byteOffset, pcmSamples.byteLength)
        );
        break;
      }
      case "stop":
        console.log(`Call Has Ended`);
        azurePusher.close();
        recognizer.stopContinuousRecognitionAsync();
        break;
    }
  });
});
Here are the results after running with attached audio:
"C:\Program Files\nodejs\node.exe"
C:\Users\kenar\WebstormProjects\twiliostreams1\twiliostream.js
Listening at Port 8080
Continuous Reco Started
New Connection Initiated
A new call has connected.
Starting Media Stream MZ8dc3ec47f7b9bd3b37e1b4896beb354e
RECOGNIZED: Text=
Call Has Ended
RECOGNIZED: Text=
NOMATCH: Speech could not be recognized.
Session stopped event.
首先要注意的是:您永远不应该将您的订阅密钥发布到任何公开的地方!任何人都可以获取您的密钥并开始使用 Azure 语音服务,而您将需要为其使用付费。我建议您立即在 Azure 门户中重新生成(轮换)该密钥,并改为从环境变量读取它。
我能够通过额外的步骤将 Mulaw 转换为 PCM。这是使用示例 mulaw 文件和 recognizeOnceAsync 的代码的简化版本:
const sdk = require("microsoft-cognitiveservices-speech-sdk");
const fs = require("fs");
const alawmulaw = require("alawmulaw");

// FIX: the snippet referenced `subscriptionKey` / `serviceRegion` without
// declaring them; read them from the environment (never hard-code keys).
const subscriptionKey = process.env.AZURE_SPEECH_KEY;
const serviceRegion = process.env.AZURE_SPEECH_REGION; // e.g., "eastus"
const language = "en-US";

// NOTE: Since I am reading from a mulaw file, it will include a wave file header.
// Assuming your basic header with no additions, this will be 44 bytes long.
// Twilio will probably *not* include this wave file header, in which case you
// should set this value to 0.
const waveHeaderSize = 44;

const pushStream = sdk.AudioInputStream.createPushStream(
  sdk.AudioStreamFormat.getWaveFormatPCM(16000, 16, 1)
);

fs.createReadStream("c:\\temp\\short.mulaw", { start: waveHeaderSize })
  .on("data", function (arrayBuffer) {
    // alawmulaw.mulaw.decode() returns an Int16Array of linear PCM samples.
    const rawPcm = alawmulaw.mulaw.decode(arrayBuffer);
    // Let's change our view of this data to instead be an UInt8Array.
    // CAUTION: this works on Little Endian architectures (the common case).
    // On a Big Endian system you would need to convert to Little Endian
    // encoded Int16 values, since that is the format the Cognitive Speech
    // service expects.
    const uintView = new Uint8Array(rawPcm.buffer);
    pushStream.write(uintView);
  })
  .on("end", function () {
    pushStream.close();
  });

const audioConfig = sdk.AudioConfig.fromStreamInput(pushStream);
const speechConfig = sdk.SpeechConfig.fromSubscription(subscriptionKey, serviceRegion);
speechConfig.speechRecognitionLanguage = language;
const recognizer = new sdk.SpeechRecognizer(speechConfig, audioConfig);

recognizer.recognizeOnceAsync(
  (result) => {
    console.log(result);
    recognizer.close();
  },
  (error) => {
    // BUG FIX: the original logged `err`, which is undefined in this scope
    // and would throw a ReferenceError instead of reporting the failure.
    console.log(error);
    recognizer.close();
  }
);
有几点需要注意:如果您是从 Twilio 逐个样本地接收数据,可以改用 alawmulaw.mulaw.decodeSample 函数,并从解码结果创建所需的 Uint8Array。例如(未经测试):
// Convert a single Twilio mu-law sample into one little-endian 16-bit PCM
// sample and push its bytes to the Azure stream.
const sample = alawmulaw.mulaw.decodeSample(sampleFromTwilio);
const buff = Buffer.alloc(2);
buff.writeInt16LE(sample);
// ROBUSTNESS: slice out exactly these two bytes. A Buffer's backing
// ArrayBuffer can be pooled/shared and larger than the Buffer itself, so
// passing `buff.buffer` directly could push unrelated bytes into the stream.
pushStream.write(buff.buffer.slice(buff.byteOffset, buff.byteOffset + buff.byteLength));
You can simply write a method that converts the payload into WAV and then sends that data to the push stream:
// Convert one base64-encoded Twilio media payload (8 kHz mono mu-law) into
// 16 kHz 16-bit PCM and feed it to the Azure push stream.
// NOTE(review): `isStreamHasbeenClosed`, `pushStream`, and `WaveFile`
// (from the `wavefile` package) must be defined elsewhere in the file.
function writedata(data) {
if (isStreamHasbeenClosed) {
// stream already closed don't need to write data
return;
}
const wav = new WaveFile();
// Wrap the raw decoded bytes as a 1-channel, 8 kHz, 8-bit mu-law ("8m") wave.
wav.fromScratch(1, 8000, "8m", Buffer.from(data, "base64"));
// Decode mu-law -> 16-bit linear PCM in place.
wav.fromMuLaw();
// Upsample to 16 kHz — presumably to match the push stream's declared
// format; confirm against the createPushStream() call elsewhere.
wav.toSampleRate(16000);
pushStream.write(wav.data.samples);
}
Then, in your webhook handler, just call this method:
writedata(msg.media.payload); — it will work (tested).