我是这个网站的新手,所以如果我的帖子有什么不妥之处,请提前原谅。
我目前正在尝试机器学习,并且正在学习神经网络。我现在使用的是http://neuralnetworksanddeeplearning.com/。然而,我并没有完全理解所有内容,而且所有的代码都是用Python编写的(我更习惯使用JavaScript)。
我已经创建了一个适用于简单数据的程序。然而,对于更复杂的数据(使用MNIST数据进行手写数字识别),即使使用784个输入神经元、10到400个隐藏层神经元(只有一个隐藏层,我尝试了几种不同的神经元数量)和10个输出神经元,并经过数百次迭代,准确率仍远低于上述网站声称的水平。我认为我的反向传播步骤(即训练步骤,其他函数也一并贴出以供参考)存在错误,导致学习速度不够快(顺便提一下,我使用交叉熵作为成本函数)。如果有人能帮我找出错误,我将不胜感激。提前谢谢了。
以下是代码。权重被排列在一个三维数组中(weights[i][j][k]是第i层中的第j个神经元与第(i+1)层中的第k个神经元之间的权重)。类似地,biases[i][j]是第(i+1)层中第j个神经元的偏置。训练数据被格式化为一个对象数组,其中每个对象包含input(输入)和output(输出)两个字段(见下面的示例)。
/**
 * Fully connected feed-forward neural network with sigmoid activations,
 * trained by mini-batch gradient descent on a cross-entropy cost.
 *
 * Data layout:
 *  - `weights[i][j][k]` is the weight between neuron `j` of layer `i` and
 *    neuron `k` of layer `i + 1`.
 *  - `biases[i][j]` is the bias of neuron `j` of layer `i + 1`.
 */
class NeuralNetwork {
  /**
   * Builds a network with weights and biases drawn uniformly from [-1, 1).
   *
   * @param {number[]} layers - Neuron count of every layer, input layer first
   *   and output layer last. Must hold at least two positive integers.
   * @throws {Error} If `layers` is not an array of length >= 2, or if any
   *   entry is not a positive integer.
   */
  constructor(layers) {
    if (!Array.isArray(layers) || layers.length < 2) {
      throw new Error("Layers must be specified as an array of length at least 2");
    }
    this.weights = [];
    this.biases = [];
    const layerCount = layers.length;
    for (let i = 0; i < layerCount; ++i) {
      const size = layers[i];
      if (!(Number.isInteger(size) && size > 0)) {
        throw new Error("Array used to specify NeuralNetwork layers must consist solely of positive integers");
      }
      const hasOutgoingWeights = i < layerCount - 1; // the output layer feeds nothing
      const hasBiases = i > 0; // the input layer has no biases
      const fanOut = layers[i + 1];
      if (hasOutgoingWeights) {
        this.weights.push([]);
      }
      if (hasBiases) {
        this.biases.push([]);
      }
      for (let j = 0; j < size; ++j) {
        if (hasOutgoingWeights) {
          const outgoing = [];
          for (let k = 0; k < fanOut; ++k) {
            outgoing.push(Math.random() * 2 - 1);
          }
          this.weights[i].push(outgoing);
        }
        if (hasBiases) {
          this.biases[i - 1].push(Math.random() * 2 - 1);
        }
      }
    }
    // Logistic sigmoid and its derivative; the derivative reuses one sigmoid evaluation.
    this.activation = (x) => 1 / (1 + Math.exp(-x));
    this.activationDerivative = (x) => {
      const s = this.activation(x);
      return s * (1 - s);
    };
    // The freeze is shallow: the nested weight/bias arrays stay mutable so train() can update them.
    Object.freeze(this);
    console.log("Successfully initialized NeuralNetwork");
  }

  /**
   * Forward propagation.
   *
   * @param {number[]} input - Activations of the input layer.
   * @param {boolean} [training] - When truthy, return the full trace needed by
   *   backpropagation instead of only the output activations.
   * @returns {number[]|Array<Array<{before: number, after: number}>>}
   *   Without `training`: the output-layer activations. With `training`: one
   *   array per layer (input layer included) of `{before, after}` pairs, where
   *   `before` is the weighted input and `after` the activation (both equal
   *   the raw value for the input layer).
   */
  run(input, training) {
    const trace = training
      ? [input.map((value) => ({ before: value, after: value }))]
      : null;
    let activations = [...input];
    for (let i = 0; i < this.weights.length; ++i) {
      const layerWeights = this.weights[i];
      const layerBiases = this.biases[i];
      const targetCount = layerWeights[0].length;
      const sourceCount = activations.length;
      const nextActivations = [];
      const nextTrace = [];
      for (let j = 0; j < targetCount; ++j) {
        let sum = layerBiases[j];
        for (let k = 0; k < sourceCount; ++k) {
          sum += activations[k] * layerWeights[k][j];
        }
        const activated = this.activation(sum);
        nextActivations.push(activated);
        if (trace) {
          nextTrace.push({ before: sum, after: activated });
        }
      }
      activations = nextActivations;
      if (trace) {
        trace.push(nextTrace);
      }
    }
    return training ? trace : activations;
  }

  /**
   * Trains the network in place with backpropagation and mini-batch gradient
   * descent, using the cross-entropy cost.
   *
   * Mini-batches are counted across epochs (the sample counter is not reset
   * per epoch). Each update applies the average gradient of the samples
   * accumulated since the previous update, so a final partial batch is
   * averaged over its actual size.
   *
   * @param {Array<{input: number[], output: number[]}>} data - Training samples.
   * @param {number|function(number, number): number} [learningRate=0.1] -
   *   Fixed rate, or a function `(epochIndex, previousEpochAverageCost)`
   *   returning the rate for that epoch (the cost is 0 for the first epoch).
   * @param {number} [batch=50] - Number of samples per weight update.
   * @param {number} [iterations=10000] - Number of passes over `data`.
   * @returns {NeuralNetwork} This network, for chaining.
   */
  train(data, learningRate = 0.1, batch = 50, iterations = 10000) {
    console.log("Initialized training");
    const startTime = Date.now();
    const sampleCount = data.length;
    const rateIsFunction = typeof learningRate === "function";

    // Gradient accumulators shaped exactly like weights / biases.
    const weightChanges = this.weights.map((layer) => layer.map((row) => row.map(() => 0)));
    const biasChanges = this.biases.map((layer) => layer.map(() => 0));

    let totalCost = 0;
    let processed = 0; // samples seen since training started
    let pending = 0; // samples accumulated since the last weight update

    for (let epoch = 0; epoch < iterations; ++epoch) {
      const rate = rateIsFunction ? learningRate(epoch, totalCost) : learningRate;
      totalCost = 0;

      for (let s = 0; s < sampleCount; ++s) {
        const sample = data[s];
        const result = this.run(sample.input, true);
        const outputLayer = result[result.length - 1];

        // With sigmoid outputs and cross-entropy cost the output error is (a - y).
        const outputError = [];
        for (let k = 0; k < outputLayer.length; ++k) {
          const actual = outputLayer[k].after;
          const expected = sample.output[k];
          outputError.push(actual - expected);
          totalCost -= Math.log(actual) * expected + Math.log(1 - actual) * (1 - expected);
        }

        // Propagate errors backwards; afterwards errors[k] belongs to layer k + 1.
        const errors = [outputError];
        for (let layer = result.length - 1; layer > 1; --layer) {
          const downstreamErrors = errors[0];
          const neurons = result[layer - 1];
          const layerErrors = this.weights[layer - 1].map((outgoing, n) => {
            let sum = 0;
            for (let t = 0; t < outgoing.length; ++t) {
              sum += outgoing[t] * downstreamErrors[t];
            }
            return sum * this.activationDerivative(neurons[n].before);
          });
          errors.unshift(layerErrors);
        }

        // Accumulate this sample's (negative, rate-scaled) gradient.
        for (let layer = 0; layer < this.biases.length; ++layer) {
          const sources = result[layer];
          const layerErrors = errors[layer];
          const layerWeightChanges = weightChanges[layer];
          const layerBiasChanges = biasChanges[layer];
          for (let target = 0; target < layerBiasChanges.length; ++target) {
            const change = rate * layerErrors[target];
            for (let source = 0; source < layerWeightChanges.length; ++source) {
              layerWeightChanges[source][target] -= change * sources[source].after;
            }
            layerBiasChanges[target] -= change;
          }
        }

        ++processed;
        ++pending;
        const isLastSample = epoch === iterations - 1 && s === sampleCount - 1;
        if (processed % batch === 0 || isLastSample) {
          for (let layer = 0; layer < this.weights.length; ++layer) {
            const layerWeights = this.weights[layer];
            const layerWeightChanges = weightChanges[layer];
            for (let source = 0; source < layerWeights.length; ++source) {
              const row = layerWeights[source];
              const rowChanges = layerWeightChanges[source];
              for (let target = 0; target < row.length; ++target) {
                row[target] += rowChanges[target] / pending;
                rowChanges[target] = 0;
              }
            }
            // Biases belong to the target layer, so iterate over their own
            // length rather than over the source-neuron count.
            const layerBiases = this.biases[layer];
            const layerBiasChanges = biasChanges[layer];
            for (let target = 0; target < layerBiases.length; ++target) {
              layerBiases[target] += layerBiasChanges[target] / pending;
              layerBiasChanges[target] = 0;
            }
          }
          pending = 0;
        }
      }

      // Average cost of this epoch; handed to a learning-rate function next epoch.
      if (sampleCount > 0) {
        totalCost /= sampleCount;
      }
    }
    console.log(`Training ended due to iterations reached\nIterations: ${iterations} times\nTime spent: ${Date.now() - startTime} ms`);
    return this;
  }
}
示例
测试一个点是否在圆内。对于这个示例,神经网络表现良好。然而,对于更复杂的示例,如手写识别,神经网络表现非常差(我能得到的单个神经网络的最佳准确率是70%,而网站上声称的准确率是96%,即使使用相似的参数)。
// Example: learn whether a random point (x, y) of the unit square lies inside the unit circle.
let trainingData = Array.from({ length: 1000 }, () => {
  const x = Math.random();
  const y = Math.random();
  const isInside = Math.hypot(x, y) < 1;
  return { input: [x, y], output: [Number(isInside)] };
});
let brain = new NeuralNetwork([2, 5, 5, 1]);
// Train on the first 700 samples: learning rate 0.1, batch size 10, 500 iterations.
brain.train(trainingData.slice(0, 700), 0.1, 10, 500); // Reported accuracy on the remaining 300 entries of trainingData: 95.33%
回答:
好吧,我想我会自己回答这个问题。所以,我认为我的代码没有错误,任何人如果想用的话都可以使用(尽管效率非常低)。
我在MNIST数据上的运行准确率不高的原因是我最初没有对数据进行预处理。原始数据给出的是28*28个像素的灰度值(深浅程度),范围在[0, 255]之间,而我直接将其用作每个训练样本的输入。正确的做法是先将其缩放到[0, 1]或[-1, 1]的范围内(例如,将每个像素值除以255即可得到[0, 1]范围内的输入)。
[0, 255]范围行不通的原因在于,第二隐藏层的神经元会接收到绝对值非常大的正输入或负输入。
当反向传播算法计算梯度时,每个权重的变化会非常小,因为它与神经元输入处的激活函数的斜率成比例(逻辑函数 $\sigma(x) = \frac{1}{1+e^{-x}}$ 的导数是 $\sigma'(x) = \frac{e^{-x}}{(1+e^{-x})^{2}} = \sigma(x)\,(1-\sigma(x))$,当x是绝对值很大的正值或负值时都接近0)。因此,神经网络需要很长时间来训练,在我的情况下,无法很好地学习数据。
使用正确的方法,我能够在一个相当短的时间内实现784*200*10神经网络的约90%的准确率,尽管这仍然远不如问题中提到的链接中作者声称的使用更简单的算法所能达到的准确率高。