I have been following the TensorFlow.js audio recognition tutorial here: https://codelabs.developers.google.com/codelabs/tensorflowjs-audio-codelab/index.html?index=..%2F..index#5. I changed the commands, removed the slider and the moveSlider() function, and simply display the label in the "console" div. You can find my code here: https://codepen.io/willrd123/pen/abvQbyG?editors=0010.
<html>
<head>
  <script src="https://cdn.jsdelivr.net/npm/@tensorflow/tfjs"></script>
  <script src="https://cdn.jsdelivr.net/npm/@tensorflow-models/speech-commands"></script>
</head>
<body>
  <button id="start" onmousedown="collect(0)">Start</button>
  <button id="forward" onmousedown="collect(1)">Forward</button>
  <button id="back" onmousedown="collect(2)">Back</button>
  <button id="left" onmousedown="collect(3)">Left</button>
  <button id="right" onmousedown="collect(4)">Right</button>
  <button id="up" onmousedown="collect(5)">Up</button>
  <button id="down" onmousedown="collect(6)">Down</button>
  <button id="stop" onmousedown="collect(7)">Stop</button>
  <button id="takeOff" onmousedown="collect(8)">Take Off</button>
  <button id="land" onmousedown="collect(9)">Land</button>
  <button id="flip" onmousedown="collect(10)">Flip</button>
  <button id="switchView" onmousedown="collect(11)">Switch View</button>
  <button id="noise" onmousedown="collect(12)">Noise</button>
  <br/><br/>
  <button id="train" onclick="train()">Train</button>
  <button id="listen" onclick="listen()">Listen</button>
  <button id="save" onclick="save()">Save</button>
  <br/><br/>
  <div id="console"></div>
  <script src="index.js"></script>
</body>
</html>
let recognizer;

async function app() {
  recognizer = speechCommands.create('BROWSER_FFT');
  await recognizer.ensureModelLoaded();
  buildModel();
}
app();

// One frame is ~23ms of audio.
const NUM_FRAMES = 6;
let examples = [];

function collect(label) {
  if (recognizer.isListening()) {
    return recognizer.stopListening();
  }
  if (label == null) {
    return;
  }
  recognizer.listen(async ({spectrogram: {frameSize, data}}) => {
    let vals = normalize(data.subarray(-frameSize * NUM_FRAMES));
    examples.push({vals, label});
    document.querySelector('#console').textContent =
        `${examples.length} examples collected`;
  }, {
    overlapFactor: 0.999,
    includeSpectrogram: true,
    invokeCallbackOnNoiseAndUnknown: true
  });
}

function normalize(x) {
  const mean = -100;
  const std = 10;
  return x.map(x => (x - mean) / std);
}

const INPUT_SHAPE = [NUM_FRAMES, 232, 1];
let model;

async function train() {
  toggleButtons(false);
  const ys = tf.oneHot(examples.map(e => e.label), 3);
  const xsShape = [examples.length, ...INPUT_SHAPE];
  const xs = tf.tensor(flatten(examples.map(e => e.vals)), xsShape);

  await model.fit(xs, ys, {
    batchSize: 16,
    epochs: 10,
    callbacks: {
      onEpochEnd: (epoch, logs) => {
        document.querySelector('#console').textContent =
            `Accuracy: ${(logs.acc * 100).toFixed(1)}% Epoch: ${epoch + 1}`;
      }
    }
  });
  tf.dispose([xs, ys]);
  toggleButtons(true);
}

function buildModel() {
  model = tf.sequential();
  model.add(tf.layers.depthwiseConv2d({
    depthMultiplier: 8,
    kernelSize: [NUM_FRAMES, 3],
    activation: 'relu',
    inputShape: INPUT_SHAPE
  }));
  model.add(tf.layers.maxPooling2d({poolSize: [1, 2], strides: [2, 2]}));
  model.add(tf.layers.flatten());
  model.add(tf.layers.dense({units: 3, activation: 'softmax'}));
  const optimizer = tf.train.adam(0.01);
  model.compile({
    optimizer,
    loss: 'categoricalCrossentropy',
    metrics: ['accuracy']
  });
}

function toggleButtons(enable) {
  document.querySelectorAll('button').forEach(b => b.disabled = !enable);
}

function flatten(tensors) {
  const size = tensors[0].length;
  const result = new Float32Array(tensors.length * size);
  tensors.forEach((arr, i) => result.set(arr, i * size));
  return result;
}

var labels = ["Forward", "Back", "Left", "Right", "Up", "Down",
              "Take Off", "Land", "Switch View", "Noise"];

async function finish(labelTensor) {
  const label = (await labelTensor.data())[0];
  document.getElementById('console').textContent = labels[label];
}

function listen() {
  if (recognizer.isListening()) {
    recognizer.stopListening();
    toggleButtons(true);
    document.getElementById('listen').textContent = 'Listen';
    return;
  }
  toggleButtons(false);
  document.getElementById('listen').textContent = 'Stop';
  document.getElementById('listen').disabled = false;

  recognizer.listen(async ({spectrogram: {frameSize, data}}) => {
    const vals = normalize(data.subarray(-frameSize * NUM_FRAMES));
    const input = tf.tensor(vals, [1, ...INPUT_SHAPE]);
    const probs = model.predict(input);
    const predLabel = probs.argMax(1);
    await finish(predLabel);
    tf.dispose([input, probs, predLabel]);
  }, {
    overlapFactor: 0.999,
    includeSpectrogram: true,
    invokeCallbackOnNoiseAndUnknown: true
  });
}

async function save() {
  const model = await tf.loadLayersModel('HTTP-Server/dronemodel.json');
}
However, when I tried to adapt this code to recognize 13 different commands, the model only ever returned one of the first three commands (Start, Forward, and Back), even when I provided audio for a command outside those three. Is there any way to fix this?
Answer:
The model classifies only three categories because the last dense layer has 3 units. You need to change the number of units to the number of commands you want (13), make the one-hot labels match, and train the model accordingly.
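
As an illustration, here is a minimal sketch of the changes against the code above, assuming the 13 commands keep the indices 0–12 used by the collect() buttons (NUM_CLASSES is just a name introduced here for clarity):

// Number of distinct voice commands collected by the buttons (indices 0-12).
const NUM_CLASSES = 13;

// In buildModel(): the final dense layer must output one probability per command.
model.add(tf.layers.dense({units: NUM_CLASSES, activation: 'softmax'}));

// In train(): the one-hot depth must also be NUM_CLASSES so that
// labels 3-12 get a valid encoding.
const ys = tf.oneHot(examples.map(e => e.label), NUM_CLASSES);

// finish() indexes into labels with the predicted class, so the array should
// list all 13 commands in the same order as the collect() indices.
var labels = ["Start", "Forward", "Back", "Left", "Right", "Up", "Down",
              "Stop", "Take Off", "Land", "Flip", "Switch View", "Noise"];

After these changes the model has to be retrained, since any previously trained weights were fit to a 3-unit output layer.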