The output layer stays stuck at the vector [0.5, 0.5]. Can anyone help me understand whether there is a problem with the code?
The neural network I am trying to train is an XOR gate, so the output vector should approach a one-hot vector representing the correct class (0 or 1), but after all training epochs the output vector still sits at [0.5, 0.5].
```python
import numpy as np
import numpy.random as rnd
from scipy.special import expit


class Backpropogation:
    def setupWeightsBiases(self):
        for i in range(1, self.num_layers):
            self.weights_dict[i] = rnd.rand(self.layer_spec[i], self.layer_spec[i - 1])
            self.bias_dict[i] = rnd.rand(self.layer_spec[i], 1)

    def __init__(self, hidden_layer_neurons_tuple, train_data, num_output_classes, output_layer_func='sigmoid'):
        self.train_input = train_data[0]
        self.input_layer_size = self.train_input[0].size
        self.train_input = self.train_input.reshape(self.train_input.shape[0], self.input_layer_size).T
        self.output_layer_size = num_output_classes
        self.train_output = train_data[1]
        print(self.train_output.shape)
        num_hidden_layer = len(hidden_layer_neurons_tuple)
        self.hidden_layer_neurons_tuple = hidden_layer_neurons_tuple
        self.layer_spec = [self.input_layer_size] + \
                          list(hidden_layer_neurons_tuple) + \
                          [num_output_classes]
        self.layer_spec = tuple(self.layer_spec)
        self.num_layers = num_hidden_layer + 2
        self.train_data = train_data
        self.activation_layer_gradient_dict = {}
        self.preactivation_layer_gradient_dict = {}
        self.weights_gradient_dict = {}
        self.bias_gradient_dict = {}
        self.curr_input = None
        self.curr_output = None
        self.weights_dict = {}
        self.preactivation_layer_dict = {}
        self.activation_layer_dict = {}
        self.bias_dict = {}
        self.setupWeightsBiases()
        self.output = None
        self.output_diff = None
        self.num_output_classes = num_output_classes

    def predictClass(self):
        return np.argmax(self.activation_layer_dict[self.num_layers - 1])

    def forwardPropogation(self, input):
        # Load h[0] as the input data
        self.activation_layer_dict[0] = input
        '''
        load input data into h[0]
        for i in (1,L):
            a[k] = W[k] * h[k-1] + b[k]
        and finally calculate the Lth layer output with the
        special activation function
        '''
        for i in range(1, self.num_layers):
            self.preactivation_layer_dict[i] = \
                np.matmul(self.weights_dict[i], self.activation_layer_dict[i - 1]) + \
                self.bias_dict[i]
            # print(self.preactivation_layer_dict[i])
            vec = self.preactivation_layer_dict[i]
            self.activation_layer_dict[i] = self.activationFunction(vec)
        # This will change h[L] to y'
        self.activation_layer_dict[self.num_layers - 1] = self.outputFunction()

    def findGradients(self, index):
        class_label = self.train_output[index]
        output_one_hot_vector = np.zeros((self.num_output_classes, 1))
        output_one_hot_vector[class_label] = 1
        output = self.activation_layer_dict[self.num_layers - 1]
        self.preactivation_layer_gradient_dict[self.num_layers - 1] = -1 * (output_one_hot_vector - output)
        for layer in reversed(range(1, self.num_layers)):
            self.weights_gradient_dict[layer] = np.matmul(self.preactivation_layer_gradient_dict[layer],
                                                          self.activation_layer_dict[layer - 1].T)
            self.bias_gradient_dict[layer] = self.preactivation_layer_gradient_dict[layer]
            self.activation_layer_gradient_dict[layer - 1] = np.matmul(self.weights_dict[layer].T,
                                                                       self.preactivation_layer_gradient_dict[layer])
            if layer != 1:
                self.preactivation_layer_gradient_dict[layer - 1] = np.multiply(
                    self.activation_layer_gradient_dict[layer - 1],
                    self.outputFunctionDiff(layer - 1))

    def activationFunction(self, vec, type='sigmoid'):
        if type == 'sigmoid':
            return 1 / (1 + expit(-vec))
        else:
            print('Please select correct output function')
            exit()

    def outputFunction(self, type='sigmoid'):
        if type == 'sigmoid':
            return 1 / (1 + expit(-self.preactivation_layer_dict[self.num_layers - 1]))
        else:
            print('Please select correct output function')
            exit()

    def outputFunctionDiff(self, layer, type='sigmoid'):
        op_layer = self.num_layers - 1
        if type == 'sigmoid':
            vec = self.preactivation_layer_dict[layer]
            return np.multiply(self.activationFunction(vec), 1 - self.activationFunction(vec))
        else:
            print('Please select correct output function')
            exit()

    def updateWeightsAndBiases(self, learning_rate):
        for layer in range(1, self.num_layers):
            self.weights_dict[layer] = self.weights_dict[layer] - learning_rate * self.weights_gradient_dict[layer]
            self.preactivation_layer_dict[layer] = self.preactivation_layer_dict[layer] - \
                learning_rate * self.preactivation_layer_gradient_dict[layer]
            if not (layer == self.num_layers - 1):
                self.activation_layer_dict[layer] = self.activation_layer_dict[layer] - \
                    learning_rate * self.activation_layer_gradient_dict[layer]
            self.bias_dict[layer] = self.bias_dict[layer] - learning_rate * self.bias_gradient_dict[layer]

    def getLoss(self, index):
        return np.log2(self.activation_layer_dict[self.num_layers - 1][self.train_output[index], 0])

    def train(self, learning_rate, num_epochs):
        for curr_epoch in range(num_epochs):
            print('Evaluating at ' + str(curr_epoch))
            index_array = list(np.arange(0, self.train_input.shape[1]))
            np.random.shuffle(index_array)
            for train_data_index in index_array:
                test_input = self.train_input[:, [train_data_index]]
                self.forwardPropogation(test_input)
                # print(self.activation_layer_dict[self.num_layers - 1])
                self.findGradients(train_data_index)
                self.updateWeightsAndBiases(learning_rate)
            print('Loss ' + str(self.getLoss(train_data_index)))

    # Assumes a 2D array of 784xN array as test input
    # This will return output classes of the data
    def test(self, test_data):
        index_range = test_data.shape[1]
        test_class_list = []
        for index in range(index_range):
            self.forwardPropogation(test_data[:, [index]])
            test_class_list.append(self.predictClass())
        return test_class_list


# train the NN with BP
train_data = (np.array([[0, 0], [0, 1], [1, 0], [1, 1]]), np.array([0, 1, 1, 0]))
b = Backpropogation((2, 2), train_data, 2)
```
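The class is then exercised roughly as follows (the learning rate and epoch count here are placeholders, not the exact values used):

```python
# Placeholder hyperparameters; the actual values used are not stated above
b.train(0.1, 1000)
# test() expects one column per sample, matching the reshaped training input
print(b.test(np.array([[0, 0], [0, 1], [1, 0], [1, 1]]).T))
```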
Answer:
The following code (see here for the implementation and here for the theory) implements a neural network with backpropagation from scratch, using a single output unit and a sigmoid activation function (apart from that, it looks similar to your implementation).
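The linked code is not reproduced here; as a stand-in, below is a minimal sketch of the same idea under my own assumptions (a 2-2-1 architecture, a cross-entropy-style output gradient, and illustrative hyperparameters), not the exact linked implementation:

```python
# Minimal single-output XOR network: 2 inputs -> 2 hidden (sigmoid) -> 1 output (sigmoid)
import numpy as np

rng = np.random.default_rng(0)

def sigmoid(x):
    return 1.0 / (1.0 + np.exp(-x))

W1 = rng.standard_normal((2, 2)); b1 = np.zeros((2, 1))
W2 = rng.standard_normal((1, 2)); b2 = np.zeros((1, 1))

X = np.array([[0, 0], [0, 1], [1, 0], [1, 1]]).T  # shape (2, 4): one column per sample
y = np.array([[0, 1, 1, 0]])                      # shape (1, 4)

lr, epochs = 0.5, 10000  # illustrative hyperparameters
for _ in range(epochs):
    # forward pass over all four samples at once
    h = sigmoid(W1 @ X + b1)
    y_hat = sigmoid(W2 @ h + b2)
    # backward pass; with a sigmoid output and cross-entropy loss, dL/dz2 = y_hat - y
    dz2 = y_hat - y
    dW2, db2 = dz2 @ h.T / 4, dz2.mean(axis=1, keepdims=True)
    dz1 = (W2.T @ dz2) * h * (1 - h)  # sigmoid derivative at the hidden layer
    dW1, db1 = dz1 @ X.T / 4, dz1.mean(axis=1, keepdims=True)
    W2 -= lr * dW2; b2 -= lr * db2
    W1 -= lr * dW1; b1 -= lr * db1

print(np.round(y_hat, 3))  # should approach [[0, 1, 1, 0]]
```

Note that this sketch updates the parameters on the whole batch of four samples per step, which relates to the first checklist point below.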
With this, the XOR function can be learned given a suitable learning rate and number of epochs (although it can occasionally get stuck in a local minimum, in which case you might consider regularization techniques such as drop-out). You could also convert it into your 2-output (softmax?) version. Can you spot any problems in your own implementation? For example, you could check the following points (a ReLU sketch follows the list):
- Update the parameters in batches during backpropagation instead of stochastically (one sample at a time)
- Run enough training epochs
- Vary the learning rate
- Use a ReLU activation for the hidden layers instead of sigmoid (to counter the vanishing-gradient problem), etc.
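For the last point, hypothetical drop-in helpers (relu and relu_diff are illustrative names, not functions from your code) that could replace the sigmoid branches of your activationFunction and outputFunctionDiff for the hidden layers:

```python
import numpy as np

def relu(vec):
    # element-wise max(0, x)
    return np.maximum(0.0, vec)

def relu_diff(vec):
    # derivative of ReLU w.r.t. its pre-activation input: 1 where x > 0, else 0
    return (vec > 0).astype(float)
```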