我一直在研究一个具有一个隐藏层的NN,每层的神经元数量可以灵活调整。以下是代码:
import time
import numpy as np
import matplotlib.pyplot as plt
from tensorflow.keras.datasets import mnist
class_names = ['0', '1', '2', '3', '4', '5', '6', '7', '8', '9']
class NeuralNetwork():
correct = 0
num_predictions = 10
epochs = 100
sizeOfEpoch = 5000
Lambda = 10
learningRate = 0.00001
def __init__(self, sizes):
self.dimensions = sizes
self.x = np.arange(1,self.epochs+1)
self.y = np.empty(self.epochs)
self.secondLayerNeurons = np.empty(sizes[1])
self.outputNeurons = np.empty(sizes[2])
self.firstLayerWeights = np.random.rand(sizes[1], sizes[0])
self.secondLayerWeights = np.random.rand(sizes[2], sizes[1])
self.firstLayerBiases = np.random.rand(sizes[1])
self.secondLayerBiases = np.random.rand(sizes[2])
self.firstLayerWeightsSummations = np.zeros([sizes[1], sizes[0]])
self.secondLayerWeightsSummations = np.zeros([sizes[2], sizes[1]])
self.firstLayerBiasesSummations = np.zeros([sizes[1]])
self.secondLayerBiasesSummations = np.zeros([sizes[2]])
self.hiddenLayerErrors = np.empty(sizes[1])
self.outputLayerErrors = np.empty(sizes[2])
def sigmoid(self, x):
return 1/(1+np.exp(-x))
def sigmoidDerivative(self, x):
return np.multiply(x,(1-x))
def forwardProp(self, inputs):
for i in range (self.dimensions[1]):
self.secondLayerNeurons[i] = self.sigmoid(np.dot(self.firstLayerWeights[i], inputs)+self.firstLayerBiases[i])
for i in range (self.dimensions[2]):
self.outputNeurons[i] = self.sigmoid(np.dot(self.secondLayerWeights[i], self.secondLayerNeurons)+self.secondLayerBiases[i])
def backProp(self, inputs, correct_output):
self.outputLayerErrors = np.subtract(self.outputNeurons, correct_output)
self.hiddenLayerErrors = np.multiply(np.dot(self.secondLayerWeights.T, self.outputLayerErrors), self.sigmoidDerivative(self.secondLayerNeurons))
for i in range (self.dimensions[2]):
for j in range (self.dimensions[1]):
if j==0:
self.secondLayerBiasesSummations[i] += self.outputLayerErrors[i]
self.secondLayerWeightsSummations[i][j] += self.outputLayerErrors[i]*self.secondLayerNeurons[j]
for i in range (self.dimensions[1]):
for j in range (self.dimensions[0]):
if j==0:
self.firstLayerBiasesSummations[i] += self.hiddenLayerErrors[i]
self.firstLayerWeightsSummations[i][j] += self.hiddenLayerErrors[i]*inputs[j]
def train(self, trainImages, trainLabels):
size = str(self.sizeOfEpoch)
greatestError = 0.0
start_time2 = time.time()
for m in range (self.sizeOfEpoch):
correct_output = np.zeros([self.dimensions[2]])
correct_output[int(class_names[trainLabels[m]])] = 1.0
self.forwardProp(trainImages[m].flatten())
self.backProp(trainImages[m].flatten(), correct_output)
if np.argmax(self.outputNeurons) == int(trainLabels[m]):
self.correct+=1
if m%200 == 0:
error = np.amax(np.absolute(self.outputLayerErrors))
if error > greatestError:
greatestError = error
accuracy = str(int((self.correct/(m+1))*100)) + '%'
percent = str(int((m/self.sizeOfEpoch)*100)) + '%'
print ("Progress: " + percent + " -- Accuracy: " + accuracy + " -- Error: " + str(greatestError), end="\r")
self.change()
time2 = str(round((time.time() - start_time2), 2))
print (size + '/' + size + " -- " + time2 + "s" + " -- Accuracy: " + accuracy + " -- Error: " + str(greatestError), end="\r")
return greatestError
def change(self):
for i in range (self.dimensions[2]):
for j in range (self.dimensions[1]):
if j == 0:
self.secondLayerBiases[i] -= self.learningRate*self.secondLayerBiasesSummations[i]
self.secondLayerWeights[i][j] -= self.learningRate*(self.secondLayerWeightsSummations[i][j]+self.Lambda*self.secondLayerWeights[i][j])
for i in range (self.dimensions[1]):
for j in range (self.dimensions[0]):
if j == 0:
self.firstLayerBiases[i] -= self.learningRate*self.firstLayerBiasesSummations[i]
self.firstLayerWeights[i][j] -= self.learningRate*(self.firstLayerWeightsSummations[i][j]+self.Lambda*self.firstLayerWeights[i][j])
self.firstLayerSummations = np.zeros([self.dimensions[1], self.dimensions[0]])
self.secondLayerSummations = np.zeros([self.dimensions[2], self.dimensions[1]])
self.firstLayerBiasesSummations = np.zeros(self.dimensions[1])
self.secondLayerBiasesSummations = np.zeros(self.dimensions[2])
self.correct = 0
def predict(self, testImage):
secondLayerAnsNodes = np.empty([self.dimensions[1]])
outputAns = np.empty([self.dimensions[2]])
for i in range (self.dimensions[1]):
secondLayerAnsNodes[i] = self.sigmoid(np.dot(self.firstLayerWeights[i], testImage)+self.firstLayerBiases[i])
for i in range (self.dimensions[2]):
outputAns[i] = self.sigmoid(np.dot(self.secondLayerWeights[i], secondLayerAnsNodes)+self.secondLayerBiases[i])
return np.argmax(outputAns)
if __name__ == "__main__":
(train_images, train_labels), (test_images, test_labels) = mnist.load_data()
train_images = train_images/255.0
test_images = test_images/255.0
neural_network = NeuralNetwork([784, 16, 10])
start_time = time.time()
for i in range (neural_network.epochs):
print ("\nEpoch", str(i+1) + "/" + str(neural_network.epochs))
neural_network.y[i]=neural_network.train(train_images, train_labels)
time = time.time() - start_time
plt.plot(neural_network.x, neural_network.y, 'b')
plt.ylabel('Error Change')
plt.xlabel('Epochs')
plt.show()
print("\n\n\nTotal Time Used")
if time/60 < 60:
print("Minutes: %s" % round((time/60),2))
else:
print("Seconds: %s" % round(time,2))
for i in range (neural_network.num_predictions):
prediction = neural_network.predict(test_images[i].flatten())
plt.grid(False)
plt.imshow(test_images[i], cmap=plt.cm.binary)
plt.title("Prediction: " + str(prediction) + " -- Actual: " + class_names[test_labels[i]] + "\n" + str(i+1) + "/" + str(neural_network.num_predictions))
plt.show()
不知为何,这段代码在处理更复杂的问题时不起作用。错误并未减少,准确率也保持不变。这段代码在处理异或问题和类似问题时是有效的。当我尝试使用MNIST数字数据集时,它就不工作了。唯一的区别是每层的神经元数量更多,算法是相同的。
这里可能的问题是什么?
这是运行20个周期后,学习率为0.000001,Lambda为10的图表。它显示了每周期的错误。Y轴标签应该显示为“错误”,而不是“错误变化”。https://i.sstatic.net/fLXzz.png
回答:
从技术上讲,你的实现并没有什么问题。然而,有几点需要注意,这些都对你看到的性能有显著影响。这是一个较长的回答,但我对你的代码所做的每一部分改动都反映了重要的变化,使其按预期工作,所以请仔细阅读所有内容。
首先,你不应该在(0, 1)范围内初始化权重,这是np.random.randn
默认的做法。具体来说,如果你要选择均匀随机权重,均匀分布应该以零为中心。例如,选择-1到1范围内的随机数,或-0.1到0.1范围内的随机数。否则,你的多层感知器(MLP)会立即出现偏见;许多隐藏层神经元会立即通过Sigmoid激活函数映射到接近1的值。毕竟,Sigmoid激活函数在x轴上以零为中心,因此你的默认输入也应该如此。这个问题很容易阻止你的MLP完全收敛(事实上,在你的情况下就是这样)。除了均匀随机分布外,还有更好的权重初始化方法,但这并不是说如果正确执行,这种方法不会有效。
其次,你可能应该对图像数据进行归一化。神经网络在处理0到255之间的输入时表现不佳,这是Keras默认导出的图像数据格式。你可以通过简单地将每个输入特征除以255来解决这个问题。这样做的原因是Sigmoid曲线在高幅度子域的导数非常小。换句话说,当x非常大或非常小(非常负)时,Sigmoid(x)相对于x的导数几乎为零。当你将一些权重乘以非常大的值(例如255)时,你很可能会立即进入Sigmoid曲线的高幅度域。这并不一定会阻止你的网络收敛,但它肯定会在一开始就减慢速度,因为小的导数会导致小的梯度,从而导致小的权重更新。你可以增加学习率,但这可能会导致神经网络在离开Sigmoid曲线低导数区域后超过(甚至发散)。我已经在你的特定程序中测试(并修复)了这个问题,并且它确实产生了显著的差异(最终准确率约为0.8,而不是0.6)。