I implemented a fully connected neural network with a single hidden layer in C++, using the Eigen library for the matrix multiplication. It is trained with mini-batch gradient descent.

However, the model's accuracy on the MNIST dataset never exceeds 50%. I have tried learning rates ranging from 0.0001 to 10. When the training set has fewer than 100 examples, the model overfits (reaching about 90% accuracy, which is still poor), but even then learning is extremely slow.

What is causing this low accuracy and the extremely slow learning? My main suspicion is that the backpropagation is wrong. I would also prefer not to add further optimization techniques (learning-rate scheduling, regularization, and so on).
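To make the intended math explicit: with quadratic cost $C = \tfrac{1}{2}\lVert a_2 - y \rVert^2$, sigmoid activation $\sigma$, learning rate $\epsilon = 0.7$, and mini-batch size $m = 10$, the update rule I believe my code implements is (columns of $X$, $Z_\ell$, $A_\ell$ index the examples in the mini-batch, with $A_0 = X$):

    \delta_2 = (A_2 - Y) \odot \sigma'(Z_2), \qquad
    \delta_1 = (W_2^\top \delta_2) \odot \sigma'(Z_1)

    W_\ell \leftarrow W_\ell - \frac{\epsilon}{m}\,\delta_\ell A_{\ell-1}^\top, \qquad
    b_\ell \leftarrow b_\ell - \frac{\epsilon}{m}\,\delta_\ell \mathbf{1}, \qquad \ell \in \{1, 2\}

where $\mathbf{1}$ is a vector of ones that sums the error over the mini-batch; in the code below, the factor $\epsilon/m$ is folded into err.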
The feedforward and backpropagation code is as follows:
    z1 = (w1 * mbX).colwise() + b1;   // hidden pre-activation
    a1 = sigmoid(z1);                 // hidden activation
    z2 = (w2 * a1).colwise() + b2;    // output pre-activation
    a2 = sigmoid(z2);                 // output activation
    // output-layer error, with the learning rate and 1/m folded in
    MatrixXd err = ((double) epsilon)/((double) minibatch_size) * ((a2 - mbY).array() * sigmoid_derivative(z2).array()).matrix();
    b2 = b2 - err * ones;             // err * ones sums err over the minibatch columns
    w2 = w2 - (err * a1.transpose());
    // propagate the error to the hidden layer
    err = ((w2.transpose() * err).array() * sigmoid_derivative(z1).array()).matrix();
    b1 = b1 - err * ones;
    w1 = w1 - (err * mbX.transpose());
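Since my main worry is the backprop algebra, here is a small standalone harness I put together to test it (my own sketch, separate from the program below: a hypothetical 2-4-3 network with random data). It compares the same delta formulas against a central finite difference of the quadratic cost, so a large maximum deviation would mean the formulas themselves are wrong:

    // Standalone gradient check: analytic backprop vs. central finite differences
    // on a tiny 2-4-3 sigmoid network with random weights and a one-hot target.
    #include <iostream>
    #include <Eigen/Dense>
    using namespace Eigen;

    static MatrixXd sigmoid(const MatrixXd& m) {
        return (1.0 / (1.0 + (-m.array()).exp())).matrix();
    }

    // Quadratic cost C = 0.5 * ||a2 - y||^2 for a single input column x.
    static double cost(const MatrixXd& w1, const VectorXd& b1,
                       const MatrixXd& w2, const VectorXd& b2,
                       const VectorXd& x, const VectorXd& y) {
        VectorXd a1 = sigmoid(w1 * x + b1);
        VectorXd a2 = sigmoid(w2 * a1 + b2);
        return 0.5 * (a2 - y).squaredNorm();
    }

    int main() {
        MatrixXd w1 = MatrixXd::Random(4, 2), w2 = MatrixXd::Random(3, 4);
        VectorXd b1 = VectorXd::Zero(4), b2 = VectorXd::Zero(3);
        VectorXd x = VectorXd::Random(2), y = VectorXd::Zero(3);
        y(1) = 1.0;

        // Analytic gradient of the cost w.r.t. w1, using the same delta formulas
        // as in the question (sigma'(z) written as a * (1 - a)).
        VectorXd a1 = sigmoid(w1 * x + b1);
        VectorXd a2 = sigmoid(w2 * a1 + b2);
        VectorXd d2 = ((a2 - y).array() * a2.array() * (1 - a2.array())).matrix();
        VectorXd d1 = ((w2.transpose() * d2).array() * a1.array() * (1 - a1.array())).matrix();
        MatrixXd grad_w1 = d1 * x.transpose();

        // Numerical gradient: perturb each entry of w1 by +-h.
        const double h = 1e-5;
        MatrixXd num = MatrixXd::Zero(4, 2);
        for (int r = 0; r < w1.rows(); r++) {
            for (int c = 0; c < w1.cols(); c++) {
                MatrixXd wp = w1, wm = w1;
                wp(r, c) += h;
                wm(r, c) -= h;
                num(r, c) = (cost(wp, b1, w2, b2, x, y) - cost(wm, b1, w2, b2, x, y)) / (2 * h);
            }
        }
        std::cout << "max abs difference: " << (grad_w1 - num).cwiseAbs().maxCoeff() << std::endl;
        return 0;
    }

With h = 1e-5 the printed difference should be tiny (roughly 1e-9 or smaller) when the analytic gradient is correct.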
The full program is as follows:
    #include <iostream>
    #include <fstream>
    #include <math.h>
    #include <cstdlib>
    #include <algorithm> // for std::random_shuffle
    #include <Eigen/Dense>
    #include <vector>
    #include <string>

    using namespace Eigen;

    #define N 30
    #define epsilon 0.7
    #define epoch 1000

    //sizes
    const int minibatch_size = 10;
    const int training_size = 10000;
    const int val_size = 10;

    unsigned int num, magic, rows, cols;

    //images
    unsigned int image[training_size][28][28];
    unsigned int val_image[val_size][28][28];

    //labels
    unsigned int label[training_size];
    unsigned int val_label[val_size];

    //inputs
    MatrixXd X(784, training_size);
    MatrixXd Y = MatrixXd::Zero(10, training_size);

    //minibatch
    MatrixXd mbX(784, minibatch_size);
    MatrixXd mbY = MatrixXd::Zero(10, minibatch_size);

    //validation
    MatrixXd Xv(784, val_size);
    MatrixXd Yv = MatrixXd::Zero(10, val_size);

    //Image processing courtesy of https://stackoverflow.com/users/11146076/%e5%bc%a0%e4%ba%91%e9%93%ad
    unsigned int in(std::ifstream& icin, unsigned int size) {
        unsigned int ans = 0;
        for (int i = 0; i < size; i++) {
            unsigned char x;
            icin.read((char*)&x, 1);
            unsigned int temp = x;
            ans <<= 8;
            ans += temp;
        }
        return ans;
    }

    void input(std::string ipath, std::string lpath, std::string ipath2, std::string lpath2) {
        std::ifstream icin;
        //training data
        icin.open(ipath, std::ios::binary);
        magic = in(icin, 4), num = in(icin, 4), rows = in(icin, 4), cols = in(icin, 4);
        for (int i = 0; i < training_size; i++) {
            int val = 0;
            for (int x = 0; x < rows; x++) {
                for (int y = 0; y < cols; y++) {
                    image[i][x][y] = in(icin, 1);
                    X(val, i) = image[i][x][y]/255;
                    val++;
                }
            }
        }
        icin.close();
        //training labels
        icin.open(lpath, std::ios::binary);
        magic = in(icin, 4), num = in(icin, 4);
        for (int i = 0; i < training_size; i++) {
            label[i] = in(icin, 1);
            Y(label[i], i) = 1;
        }
        icin.close();
        //validation data
        icin.open(ipath2, std::ios::binary);
        magic = in(icin, 4), num = in(icin, 4), rows = in(icin, 4), cols = in(icin, 4);
        for (int i = 0; i < val_size; i++) {
            int val = 0;
            for (int x = 0; x < rows; x++) {
                for (int y = 0; y < cols; y++) {
                    val_image[i][x][y] = in(icin, 1);
                    Xv(val, i) = val_image[i][x][y]/255;
                    val++;
                }
            }
        }
        icin.close();
        //validation labels
        icin.open(lpath2, std::ios::binary);
        magic = in(icin, 4), num = in(icin, 4);
        for (int i = 0; i < val_size; i++) {
            val_label[i] = in(icin, 1);
            Yv(val_label[i], i) = 1;
        }
        icin.close();
    }

    //Neural Network calculations
    MatrixXd sigmoid(MatrixXd m) {
        m *= -1;
        return (1/(1 + m.array().exp())).matrix();
    }

    MatrixXd sigmoid_derivative(MatrixXd m) {
        return (sigmoid(m).array() * (1 - sigmoid(m).array())).matrix();
    }

    //Initialize weights and biases
    //hidden layer
    VectorXd b1 = MatrixXd::Zero(N, 1);
    MatrixXd w1 = MatrixXd::Random(N, 784);
    //output
    VectorXd b2 = MatrixXd::Zero(10, 1);
    MatrixXd w2 = MatrixXd::Random(10, N);

    //Initialize intermediate values
    MatrixXd z1, z2, a1, a2, z1v, z2v, a1v, a2v;
    MatrixXd ones = MatrixXd::Constant(minibatch_size, 1, 1);

    int main() {
        input("C:\\Users\\Aaron\\Documents\\Test\\train-images-idx3-ubyte\\train-images.idx3-ubyte",
              "C:\\Users\\Aaron\\Documents\\Test\\train-labels-idx1-ubyte\\train-labels.idx1-ubyte",
              "C:\\Users\\Aaron\\Documents\\Test\\t10k-images-idx3-ubyte\\t10k-images.idx3-ubyte",
              "C:\\Users\\Aaron\\Documents\\Test\\t10k-labels-idx1-ubyte\\t10k-labels.idx1-ubyte");
        std::cout << "Finished Image Processing" << std::endl;
        //std::cout << w1 << std::endl;
        std::vector<double> val_ac;
        std::vector<double> c;
        std::vector<int> order;
        for (int i = 0; i < training_size; i++) {
            order.push_back(i);
        }
        for (int i = 0; i < epoch; i++) {
            //feed forward
            std::random_shuffle(order.begin(), order.end());
            for (int j = 0; j < training_size/minibatch_size; j++) {
                for (int k = 0; k < minibatch_size; k++) {
                    int index = order[j * minibatch_size + k];
                    mbX.col(k) = X.col(index);
                    mbY.col(k) = Y.col(index);
                }
                z1 = (w1 * mbX).colwise() + b1;
                a1 = sigmoid(z1);
                z2 = (w2 * a1).colwise() + b2;
                a2 = sigmoid(z2);
                MatrixXd err = ((double) epsilon)/((double) minibatch_size) * ((a2 - mbY).array() * sigmoid_derivative(z2).array()).matrix();
                b2 = b2 - err * ones;
                w2 = w2 - (err * a1.transpose());
                err = ((w2.transpose() * err).array() * sigmoid_derivative(z1).array()).matrix();
                b1 = b1 - err * ones;
                w1 = w1 - (err * mbX.transpose());
            }
            //validation
            z1v = (w1 * Xv).colwise() + b1;
            a1v = sigmoid(z1v);
            z2v = (w2 * a1v).colwise() + b2;
            a2v = sigmoid(z2v);
            double cost = 0;
            for (int j = 0; j < val_size; j++) {
                cost += -log(a2v(val_label[j], j));
            }
            cost /= val_size;
            c.push_back(cost);
            int correct = 0;
            for (int j = 0; j < val_size; j++) {
                double maxP = -1;
                int na;
                for (int k = 0; k < 10; k++) {
                    if (a2v(k, j) > maxP) {
                        maxP = a2v(k, j);
                        na = k;
                    }
                }
                if (na == val_label[j]) correct++;
            }
            val_ac.push_back((double)correct / val_size);
            std::cout << "Epoch " << i + 1 << " completed. Cost: " << cost
                      << ", Accuracy: " << (double)correct / val_size << std::endl;
        }
        return 0;
    }
Answer: