I have been following the TensorFlow.js audio recognition tutorial here: https://codelabs.developers.google.com/codelabs/tensorflowjs-audio-codelab/index.html?index=..%2F..index#5. I changed the commands, removed the slider and the moveSlider() function, and simply display the label in the "console" div. You can find my code here: https://codepen.io/willrd123/pen/abvQbyG?editors=0010.
<html>
<head>
  <script src="https://cdn.jsdelivr.net/npm/@tensorflow/tfjs"></script>
  <script src="https://cdn.jsdelivr.net/npm/@tensorflow-models/speech-commands"></script>
</head>
<body>
  <button id="start" onmousedown="collect(0)">Start</button>
  <button id="forward" onmousedown="collect(1)">Forward</button>
  <button id="back" onmousedown="collect(2)">Back</button>
  <button id="left" onmousedown="collect(3)">Left</button>
  <button id="right" onmousedown="collect(4)">Right</button>
  <button id="up" onmousedown="collect(5)">Up</button>
  <button id="down" onmousedown="collect(6)">Down</button>
  <button id="stop" onmousedown="collect(7)">Stop</button>
  <button id="takeOff" onmousedown="collect(8)">Take Off</button>
  <button id="land" onmousedown="collect(9)">Land</button>
  <button id="flip" onmousedown="collect(10)">Flip</button>
  <button id="switchView" onmousedown="collect(11)">Switch View</button>
  <button id="noise" onmousedown="collect(12)">Noise</button>
  <br/><br/>
  <button id="train" onclick="train()">Train</button>
  <button id="listen" onclick="listen()">Listen</button>
  <button id="save" onclick="save()">Save</button>
  <br/><br/>
  <div id="console"></div>
  <script src="index.js"></script>
</body>
</html>
let recognizer;

async function app() {
  recognizer = speechCommands.create('BROWSER_FFT');
  await recognizer.ensureModelLoaded();
  buildModel();
}
app();

// One frame is ~23ms of audio.
const NUM_FRAMES = 6;
let examples = [];

function collect(label) {
  if (recognizer.isListening()) {
    return recognizer.stopListening();
  }
  if (label == null) {
    return;
  }
  recognizer.listen(async ({spectrogram: {frameSize, data}}) => {
    let vals = normalize(data.subarray(-frameSize * NUM_FRAMES));
    examples.push({vals, label});
    document.querySelector('#console').textContent =
        `${examples.length} examples collected`;
  }, {
    overlapFactor: 0.999,
    includeSpectrogram: true,
    invokeCallbackOnNoiseAndUnknown: true
  });
}

function normalize(x) {
  const mean = -100;
  const std = 10;
  return x.map(x => (x - mean) / std);
}

const INPUT_SHAPE = [NUM_FRAMES, 232, 1];
let model;

async function train() {
  toggleButtons(false);
  const ys = tf.oneHot(examples.map(e => e.label), 3);
  const xsShape = [examples.length, ...INPUT_SHAPE];
  const xs = tf.tensor(flatten(examples.map(e => e.vals)), xsShape);

  await model.fit(xs, ys, {
    batchSize: 16,
    epochs: 10,
    callbacks: {
      onEpochEnd: (epoch, logs) => {
        document.querySelector('#console').textContent =
            `Accuracy: ${(logs.acc * 100).toFixed(1)}% Epoch: ${epoch + 1}`;
      }
    }
  });
  tf.dispose([xs, ys]);
  toggleButtons(true);
}

function buildModel() {
  model = tf.sequential();
  model.add(tf.layers.depthwiseConv2d({
    depthMultiplier: 8,
    kernelSize: [NUM_FRAMES, 3],
    activation: 'relu',
    inputShape: INPUT_SHAPE
  }));
  model.add(tf.layers.maxPooling2d({poolSize: [1, 2], strides: [2, 2]}));
  model.add(tf.layers.flatten());
  model.add(tf.layers.dense({units: 3, activation: 'softmax'}));
  const optimizer = tf.train.adam(0.01);
  model.compile({
    optimizer,
    loss: 'categoricalCrossentropy',
    metrics: ['accuracy']
  });
}

function toggleButtons(enable) {
  document.querySelectorAll('button').forEach(b => b.disabled = !enable);
}

function flatten(tensors) {
  const size = tensors[0].length;
  const result = new Float32Array(tensors.length * size);
  tensors.forEach((arr, i) => result.set(arr, i * size));
  return result;
}

var labels = ["Forward", "Back", "Left", "Right", "Up", "Down",
              "Take Off", "Land", "Switch View", "Noise"];

async function finish(labelTensor) {
  const label = (await labelTensor.data())[0];
  document.getElementById('console').textContent = labels[label];
}

function listen() {
  if (recognizer.isListening()) {
    recognizer.stopListening();
    toggleButtons(true);
    document.getElementById('listen').textContent = 'Listen';
    return;
  }
  toggleButtons(false);
  document.getElementById('listen').textContent = 'Stop';
  document.getElementById('listen').disabled = false;

  recognizer.listen(async ({spectrogram: {frameSize, data}}) => {
    const vals = normalize(data.subarray(-frameSize * NUM_FRAMES));
    const input = tf.tensor(vals, [1, ...INPUT_SHAPE]);
    const probs = model.predict(input);
    const predLabel = probs.argMax(1);
    await finish(predLabel);
    tf.dispose([input, probs, predLabel]);
  }, {
    overlapFactor: 0.999,
    includeSpectrogram: true,
    invokeCallbackOnNoiseAndUnknown: true
  });
}

async function save() {
  const model = await tf.loadLayersModel('HTTP-Server/dronemodel.json');
}
However, when I tried to adapt this code to recognize 13 different commands, the model only ever returned one of the first three commands (Start, Forward, and Back), even when I provided audio for a command outside those three. Is there any way to fix this?
Answer:
The model classifies only three categories because the last dense layer has 3 units. You need to change the number of units to the number of commands you want (13), make the one-hot labels match, and train the model accordingly.
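
As an illustration, here is a minimal sketch of the changes against the code above, assuming the 13 commands keep the indices 0–12 used by the collect() buttons (NUM_CLASSES is just a name introduced here for clarity):

// Number of distinct voice commands collected by the buttons (indices 0-12).
const NUM_CLASSES = 13;

// In buildModel(): the final dense layer must output one probability per command.
model.add(tf.layers.dense({units: NUM_CLASSES, activation: 'softmax'}));

// In train(): the one-hot depth must also be NUM_CLASSES so that
// labels 3-12 get a valid encoding.
const ys = tf.oneHot(examples.map(e => e.label), NUM_CLASSES);

// finish() indexes into labels with the predicted class, so the array should
// list all 13 commands in the same order as the collect() indices.
var labels = ["Start", "Forward", "Back", "Left", "Right", "Up", "Down",
              "Stop", "Take Off", "Land", "Flip", "Switch View", "Noise"];

After these changes the model has to be retrained, since any previously trained weights were fit to a 3-unit output layer.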