I'm interested in artificial intelligence and have started learning about it. I tried to implement an MLP class based on vectors, but it does not work correctly.
The feed-forward pass seems fine, but I clearly don't understand the backpropagation algorithm well enough. The training function is a dummy routine that tests the XOR case, and the network always returns roughly the same value (about 0.625915 in my runs): with inputs 1 and 0 (expected output 1) the error is 0.264518, and with inputs 1 and 1 (expected output 0) the error becomes 0.442609.
I would like to know what I am doing wrong in the backpropagation and gradient-descent steps. The complete class and the main function are below. Thanks for your help and guidance!
#include <iostream>
#include <vector>
#include <cassert>
#include <functional>
#include <cmath>      // tanh, exp, sqrt
#include <stdlib.h>

using namespace std;

typedef function<double(double, bool)> func;
typedef vector< vector< vector<double> > > Matrix3d;

class Net {
public:
    Net(const vector<unsigned> &topology, vector<func> &fns) {
        learning_rate = 0.1;
        alpha = 0.5;
        global_error = 1.0;
        activationFns = fns;
        nbLayers = topology.size();
        lastLayerId = nbLayers - 1;
        gradients.resize(nbLayers);
        neuron_errors.resize(nbLayers);
        layers.resize(nbLayers);
        weights.resize(nbLayers);
        wdeltas.resize(nbLayers);
        for (unsigned layerNum = 0; layerNum < nbLayers; layerNum++) {
            bool isLastLayer = layerNum == lastLayerId;
            unsigned nbNeuronsInLayer = isLastLayer ? topology[layerNum] : topology[layerNum] + 1;
            unsigned nbWeights = isLastLayer ? 0 : topology[layerNum + 1] + 1;
            gradients[layerNum].resize(nbNeuronsInLayer, 0.0);
            layers[layerNum].resize(nbNeuronsInLayer);
            weights[layerNum].resize(nbNeuronsInLayer);
            wdeltas[layerNum].resize(nbNeuronsInLayer);
            neuron_errors[layerNum].resize(nbNeuronsInLayer, 0.0);
            if (! isLastLayer) {
                layers[layerNum][nbNeuronsInLayer-1] = 1.0;   // initialise the bias neuron
            }
            for (unsigned n = 0; n < weights[layerNum].size(); n++) {
                weights[layerNum][n].resize(nbWeights);       // one weight per neuron of the next layer
                wdeltas[layerNum][n].resize(nbWeights, 0.0);
                InitialiseWeights(weights[layerNum][n]);      // randomise this neuron's weights
            }
        }
    };

    ~Net() {
        gradients.clear();
        layers.clear();
        weights.clear();
        wdeltas.clear();
        neuron_errors.clear();
    };

    // Propagate through the network.
    // During the feed-forward pass, a neuron's output is the activation function
    // applied to the weighted sum of the previous layer's outputs:
    // for every neuron of the previous layer, take its output prevLayer[n]
    // and multiply it by the weight leading to neuron i of the current layer.
    void FeedForward(const vector<double> &inputs) {
        assert(inputs.size() == layers[0].size() - 1);
        // Assign the inputs to the outputs of the input-layer neurons
        for (unsigned i = 0; i < inputs.size(); i++) {
            layers[0][i] = inputs[i];
        }
        for (unsigned layerNum = 1; layerNum < nbLayers; layerNum++) {
            vector<double> &prevLayer = layers[layerNum - 1];
            const bool isLastLayer = layerNum == lastLayerId;
            const unsigned forcap = isLastLayer ? layers[layerNum].size() : layers[layerNum].size() - 1;
            for (unsigned i = 0; i < forcap; i++) {
                const double bias = prevLayer[prevLayer.size()-1] * weights[layerNum-1][weights[layerNum-1].size()-1][i];
                double output = 0.0;
                for (unsigned n = 0; n < prevLayer.size() - 1; n++) {
                    output += prevLayer[n] * weights[layerNum - 1][n][i];
                }
                output += bias;
                layers[layerNum][i] = activationFns[layerNum - 1](output, false);
            }
        }
        //Print();
    };

    void BackPropagate(const vector<double> &targets) {
        vector<double> &guessed = layers[lastLayerId];
        func &outputActivationFn = activationFns[lastLayerId];
        assert(targets.size() == guessed.size());
        global_error = 0.0;
        // Compute the error of the output layer
        for (unsigned t = 0; t < targets.size(); t++) {
            double diff_ = targets[t] - guessed[t];
            global_error += (diff_ * diff_);
            neuron_errors[lastLayerId][t] = targets[t] - guessed[t];   // error of the output neuron
            gradients[lastLayerId][t] = diff_ * outputActivationFn(guessed[t], true);
        }
        if (guessed.size() > 1)
            global_error /= guessed.size()-1;
        else
            global_error *= 0.5;
        global_error = sqrt(global_error);
        // Compute the errors of the neurons of the other layers
        for (unsigned l = nbLayers - 2; l < nbLayers; --l) {
            // take the weights connecting this layer to the next one
            for (unsigned n = 0; n < layers[l].size(); n++) {            // for every neuron of this layer
                neuron_errors[l][n] = 0.0;
                for (unsigned m = 0; m < layers[l+1].size(); m++) {      // neuron m of the next layer is the target
                    double &weight = weights[l][n][m];
                    // the error of neuron n can be accumulated here
                    neuron_errors[l][n] += weight * gradients[l+1][m];
                }
                gradients[l][n] = neuron_errors[l][n] * activationFns[l](layers[l][n], true);   // ?
            }
        }
        // Update the weights (?)
        for (unsigned l = nbLayers - 2; l < nbLayers; --l) {
            for (unsigned n = 0; n < layers[l].size(); n++) {
                for (unsigned m = 0; m < layers[l + 1].size(); m++) {
                    weights[l][n][m] -= (learning_rate * gradients[l][n] * layers[l][n]) + (wdeltas[l][n][m] * alpha);
                    wdeltas[l][n][m] = (learning_rate * gradients[l][n] * layers[l][n]) + (wdeltas[l][n][m] * alpha);
                }
            }
        }
    };

    void GetResults(vector<double> &results) {
        results.clear();
        for (unsigned i = 0; i < layers[lastLayerId].size(); i++) {
            results[i] = layers[lastLayerId][i];
        }
    };

    void Train() {
        vector< vector<double> > ins  = { { 1.0, 0.0 }, { 0.0, 1.0 }, { 0.0, 0.0 }, { 1.0, 1.0 } };
        vector< vector<double> > outs = { { 1.0 }, { 1.0 }, { 0.0 }, { 0.0 } };
        for (unsigned i = 0; i < 1000; i++) {
            unsigned r = rand() % ins.size();
            vector<double> k = ins[r];
            vector<double> o = outs[r];
            FeedForward(k);
            BackPropagate(o);
            cout << "[" << i << "] " << k[0] << " & " << k[1] << " -> " << o[0]
                 << "\tresult : " << layers[lastLayerId][0]
                 << "\terror = " << global_error << endl;
        }
        cout << endl << "Test: [ 1 , 0 ]" << endl;
        FeedForward({ 1.0, 0.0 });
        BackPropagate({ 1.0 });
        cout << "Result : " << layers[lastLayerId][0] << "\t(error = " << global_error << endl;
        cout << "Test: [ 1 , 1 ]" << endl;
        FeedForward({ 0.85, 0.99 });
        BackPropagate({ 0.0 });
        cout << "Result : " << layers[lastLayerId][0] << "\t(error = " << global_error << endl;
    };

    double Getglobal_error(void) const { return global_error; };

    void Print(void) {
        for (unsigned l = 0; l < nbLayers; l++) {
            cout << "Layer " << l << " : " << endl;
            for (unsigned n = 0; n < layers[l].size(); n++) {
                cout << "\t" << "Neuron " << l << "-" << n << " : ";
                cout << "(" << layers[l][n] << ")" << endl;
                for (unsigned w = 0; w < weights[l][n].size(); w++) {
                    cout << "\t\t" << l << "-" << n << " -> " << (l+1) << "-" << w
                         << " | weight=" << weights[l][n][w] << endl;
                }
            }
        }
    }

private:
    void InitialiseWeights(vector<double> &weights_) {
        for (unsigned w = 0; w < weights_.size(); w++) {
            weights_[w] = ((double) rand() / (RAND_MAX));
        }
    }

    double global_error;
    double learning_rate;
    double alpha;
    unsigned nbLayers;
    unsigned lastLayerId;
    vector<func> activationFns;
    vector< vector<double> > gradients;       // [layerNum][neuronNum] neuron error gradients
    vector< vector<double> > layers;          // [layerNum][neuronNum]
    vector< vector<double> > neuron_errors;   // [layerNum][neuronNum] neuron errors
    Matrix3d weights;                         // [layer][neuron][outputWeight]
    Matrix3d wdeltas;                         // [layer][neuron][outputWeight]
};

double transfer_tanh(double x, bool isDerivative) {
    if (isDerivative) {
        return 1.0 - (tanh(x) * tanh(x));
    }
    return tanh(x);
}

double transfer_sigmoid(double x, bool isDerivative) {
    if (isDerivative) {
        return x * (1.0 - x);
    }
    return 1.0 / (1.0 + exp(-x));
}

int main () {
    vector<unsigned> topo = { 2, 2, 1 };
    vector<func> funcs = { transfer_sigmoid, transfer_sigmoid, transfer_sigmoid };
    Net mynet(topo, funcs);
    /*
    mynet.FeedForward({ 1.0, 0.0 });
    mynet.BackPropagate({ 1.0 });
    mynet.Print();
    mynet.FeedForward({ 1.0, 0.0 });
    mynet.BackPropagate({ 1.0 });
    mynet.Print();
    */
    mynet.Train();
}
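For reference, the textbook stochastic-gradient-descent-with-momentum formulation of the quantities the code above tries to compute looks like this (f' is the derivative of the activation function, eta the learning rate, alpha the momentum term; the indices i, j, k are just naming conventions here):

    output neuron k:  delta_k = (target_k - out_k) * f'(out_k)
    hidden neuron j:  delta_j = f'(out_j) * sum_k( delta_k * w_jk )
    weight update:    dw_ij   = eta * delta_j * out_i + alpha * dw_ij(previous step)
                      w_ij    = w_ij + dw_ij

Here delta_j is the delta of the neuron the weight feeds into and out_i is the output of the neuron the weight comes from.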
Answer:
I did not understand the math behind backpropagation well enough. Thanks to this resource: https://pabloinsente.github.io/the-multilayer-perceptron, I worked out the following BackPropagate method:
void BackPropagate(const vector<double> &targets) {
    assert(targets.size() == layers[lastLayerId].size());
    global_error = 0.0;
    for (unsigned l = lastLayerId; l < nbLayers; --l) {
        for (unsigned n = 0; n < layers[l].size(); n++) {
            neuron_errors[l][n] = 0.0;
            if (l == lastLayerId) {
                // output layer
                global_error += (targets[n] - layers[lastLayerId][n]) * (targets[n] - layers[lastLayerId][n]);
                neuron_errors[lastLayerId][n] = (targets[n] - layers[lastLayerId][n]) * activationFns[lastLayerId](layers[lastLayerId][n], true);
                continue;
            }
            for (unsigned m = 0; m < layers[l + 1].size(); m++) {
                double neuron_output = (l == 0) ? inputs[n] : layers[l][n];
                double delta = learning_rate * (neuron_errors[l + 1][m] * neuron_output);
                neuron_errors[l][n] += (neuron_errors[l + 1][m] * weights[l][n][m]) * activationFns[l](layers[l][n], true);
                weights[l][n][m] += delta + (wdeltas[l][n][m] * alpha);
                wdeltas[l][n][m] = delta;
            }
        }
    }
}
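One detail worth noting: the corrected method reads inputs[n] for the first layer, but the class as originally posted has no inputs member, so it is presumably a copy of the last input vector kept by FeedForward. A minimal sketch of that assumption (the member name inputs is hypothetical here):

    // Hypothetical member, filled on every forward pass:
    // vector<double> inputs;   // raw inputs from the most recent FeedForward call

    void FeedForward(const vector<double> &in) {
        assert(in.size() == layers[0].size() - 1);
        inputs = in;                    // keep a copy for BackPropagate
        for (unsigned i = 0; i < in.size(); i++) {
            layers[0][i] = in[i];       // the input layer just forwards the raw values
        }
        // ... rest of the forward pass unchanged ...
    }

Since layers[0] already holds the raw inputs (plus the constant bias neuron) after FeedForward, using layers[l][n] for l == 0 as well would also work and avoids the extra member.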