How to train a Tf.js audio recognition model to recognize more than 3 commands?

I have been following the Tensorflow.js audio recognition tutorial here: https://codelabs.developers.google.com/codelabs/tensorflowjs-audio-codelab/index.html?index=...%2F...index#5. I changed the commands, removed the slider and the moveSlider() function, and just have the label appear in the "console" div. You can find my code here: https://codepen.io/willrd123/pen/abvQbyG?editors=0010.

<html>
  <head>
    <script src="https://cdn.jsdelivr.net/npm/@tensorflow/tfjs"></script>
    <script src="https://cdn.jsdelivr.net/npm/@tensorflow-models/speech-commands"></script>
  </head>
  <body>
    <button id="start" onmousedown="collect(0)">Start</button>
    <button id="forward" onmousedown="collect(1)">Forward</button>
    <button id="back" onmousedown="collect(2)">Back</button>
    <button id="left" onmousedown="collect(3)">Left</button>
    <button id="right" onmousedown="collect(4)">Right</button>
    <button id="up" onmousedown="collect(5)">Up</button>
    <button id="down" onmousedown="collect(6)">Down</button>
    <button id="stop" onmousedown="collect(7)">Stop</button>
    <button id="takeOff" onmousedown="collect(8)">Take Off</button>
    <button id="land" onmousedown="collect(9)">Land</button>
    <button id="flip" onmousedown="collect(10)">Flip</button>
    <button id="switchView" onmousedown="collect(11)">Switch View</button>
    <button id="noise" onmousedown="collect(12)">Noise</button>
    <br/><br/>
    <button id="train" onclick="train()">Train</button>
    <button id="listen" onclick="listen()">Listen</button>
    <button id="save" onclick="save()">Save</button>
    <br/><br/>
    <div id="console"></div>
    <script src="index.js"></script>
  </body>
</html>
index.js:

let recognizer;

async function app() {
 recognizer = speechCommands.create('BROWSER_FFT');
 await recognizer.ensureModelLoaded();
 // Add this line.
 buildModel();
}

app();

// One frame is ~23ms of audio.
const NUM_FRAMES = 6;
let examples = [];

function collect(label) {
 if (recognizer.isListening()) {
   return recognizer.stopListening();
 }
 if (label == null) {
   return;
 }
 recognizer.listen(async ({spectrogram: {frameSize, data}}) => {
   let vals = normalize(data.subarray(-frameSize * NUM_FRAMES));
   examples.push({vals, label});
   document.querySelector('#console').textContent =
       `${examples.length} examples collected`;
 }, {
   overlapFactor: 0.999,
   includeSpectrogram: true,
   invokeCallbackOnNoiseAndUnknown: true
 });
}

function normalize(x) {
 const mean = -100;
 const std = 10;
 return x.map(x => (x - mean) / std);
}

const INPUT_SHAPE = [NUM_FRAMES, 232, 1]; // 232 frequency bins per spectrogram frame
let model;

async function train() {
 toggleButtons(false);
 const ys = tf.oneHot(examples.map(e => e.label), 3);
 const xsShape = [examples.length, ...INPUT_SHAPE];
 const xs = tf.tensor(flatten(examples.map(e => e.vals)), xsShape);

 await model.fit(xs, ys, {
   batchSize: 16,
   epochs: 10,
   callbacks: {
     onEpochEnd: (epoch, logs) => {
       document.querySelector('#console').textContent =
           `Accuracy: ${(logs.acc * 100).toFixed(1)}% Epoch: ${epoch + 1}`;
     }
   }
 });
 tf.dispose([xs, ys]);
 toggleButtons(true);
}

function buildModel() {
 model = tf.sequential();
 model.add(tf.layers.depthwiseConv2d({
   depthMultiplier: 8,
   kernelSize: [NUM_FRAMES,  3],
   activation: 'relu',
   inputShape: INPUT_SHAPE
 }));
 model.add(tf.layers.maxPooling2d({poolSize: [1, 2], strides: [2, 2]}));
 model.add(tf.layers.flatten());
 model.add(tf.layers.dense({units: 3, activation: 'softmax'}));
 const optimizer = tf.train.adam(0.01);
 model.compile({
   optimizer,
   loss: 'categoricalCrossentropy',
   metrics: ['accuracy']
 });
}

function toggleButtons(enable) {
 document.querySelectorAll('button').forEach(b => b.disabled = !enable);
}

function flatten(tensors) {
 const size = tensors[0].length;
 const result = new Float32Array(tensors.length * size);
 tensors.forEach((arr, i) => result.set(arr, i * size));
 return result;
}

// Maps a predicted label index back to its command name; the order must match
// the collect() indices wired to the buttons above.
var labels = ["Start", "Forward", "Back", "Left", "Right", "Up", "Down", "Stop", "Take Off", "Land", "Flip", "Switch View", "Noise"];
async function finish(labelTensor) {
 const label = (await labelTensor.data())[0];
 document.getElementById('console').textContent = labels[label];
} 

function listen() {
 if (recognizer.isListening()) {
   recognizer.stopListening();
   toggleButtons(true);
   document.getElementById('listen').textContent = 'Listen';
   return;
 }
 toggleButtons(false);
 document.getElementById('listen').textContent = 'Stop';
 document.getElementById('listen').disabled = false;

 recognizer.listen(async ({spectrogram: {frameSize, data}}) => {
   const vals = normalize(data.subarray(-frameSize * NUM_FRAMES));
   const input = tf.tensor(vals, [1, ...INPUT_SHAPE]);
   const probs = model.predict(input);
   const predLabel = probs.argMax(1);
   await finish(predLabel);
   tf.dispose([input, probs, predLabel]);
 }, {
   overlapFactor: 0.999,
   includeSpectrogram: true,
   invokeCallbackOnNoiseAndUnknown: true
 });
}

async function save() {
  const model = await tf.loadLayersModel('HTTP-Server/dronemodel.json');
}
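
Note that tf.loadLayersModel() loads a model from a URL rather than saving one. A minimal sketch of actually saving the trained model from the browser (the downloads:// destination and the dronemodel name are assumptions, not part of the original code):

async function save() {
  // Triggers a browser download of dronemodel.json plus a binary weights file.
  await model.save('downloads://dronemodel');
}

Using 'indexeddb://dronemodel' instead would persist the model in the browser's IndexedDB rather than downloading it.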

However, when I tried to adapt this code for 13 different commands, the model only ever returned the first 3 of them (Start, Forward, and Back), even when I provided audio for a command outside of those 3. Is there any way to fix this?

javascript tensorflow machine-learning speech-recognition tensorflow.js
1 Answer

The model is classifying over three classes because the final dense layer is built with units: 3. The number of units has to be changed to the expected number of commands (13), and the model then has to be retrained accordingly.
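
A minimal sketch of the change (NUM_COMMANDS is an illustrative constant name that is not in the original code). Note that the depth passed to tf.oneHot() in train() has to match as well: with a depth of 3, every label above 2 is encoded as an all-zero vector and can never be learned:

const NUM_COMMANDS = 13; // one class per command button, including "Noise"

// In train(): one-hot encode the labels over all 13 classes.
const ys = tf.oneHot(examples.map(e => e.label), NUM_COMMANDS);

// In buildModel(): the output layer needs one unit per command.
model.add(tf.layers.dense({units: NUM_COMMANDS, activation: 'softmax'}));

After these changes, collect examples for all 13 buttons and retrain before calling listen().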
