I want to implement a multilayer perceptron.
I found some code on GitHub that classifies MNIST well (96% accuracy). However, for some reason it cannot handle the XOR task.
I would like to understand why.
Here is the code:
perceptron.py
import random

import numpy as np


class Perceptron:
    def __init__(self, *, layer_sizes, activation_functions, cost_function_deriv):
        self.layer_sizes = layer_sizes
        if len(self.layer_sizes) - 1 != len(activation_functions):
            raise ValueError("...")
        self.activation_functions = activation_functions
        self.cost_function_deriv = cost_function_deriv
        self.biases = [np.random.randn(y, 1) for y in layer_sizes[1:]]
        self.weights = [np.random.randn(y, x)
                        for x, y in zip(layer_sizes[:-1], layer_sizes[1:])]

    def train(self, training_data, test_data, epochs, mini_batch_size, lr):
        test_data_len = len(test_data)
        for epoch in range(epochs):
            random.shuffle(training_data)
            mini_batches = [training_data[x: x + mini_batch_size]
                            for x in range(0, len(training_data), mini_batch_size)]
            for mini_batch in mini_batches:
                mb_len = len(mini_batch)
                gradient_weights = [np.zeros(w.shape) for w in self.weights]
                gradient_biases = [np.zeros(b.shape) for b in self.biases]
                for x, y in mini_batch:
                    delta_gradient_biases, delta_gradient_weights = self.backpropagation(np.array(x), y)
                    gradient_weights = [grad + delta for grad, delta
                                        in zip(gradient_weights, delta_gradient_weights)]
                    gradient_biases = [grad + delta for grad, delta
                                       in zip(gradient_biases, delta_gradient_biases)]
                self.weights = [w - (lr / mb_len) * grad
                                for w, grad in zip(self.weights, gradient_weights)]
                self.biases = [b - (lr / mb_len) * grad
                               for b, grad in zip(self.biases, gradient_biases)]
            correct_answers = self.how_many_correct_answers(test_data)
            print(f"Epoch number {epoch}: {correct_answers}/{test_data_len} correct answers")

    def backpropagation(self, x, y):
        gradient_b = [np.zeros(b.shape) for b in self.biases]
        gradient_w = [np.zeros(w.shape) for w in self.weights]
        activations = [x]
        prev_activation = x
        for i, (b, w) in enumerate(zip(self.biases, self.weights)):
            current_activation = self.activation_functions[i](np.dot(w, prev_activation) + b)
            activations.append(current_activation)
            prev_activation = current_activation

        delta = self.cost_function_deriv(activations[-1], y) \
            * self.activation_functions[-1].deriv(activations[-1])
        gradient_b[-1] = delta
        gradient_w[-1] = np.dot(delta, activations[-2].T)
        for i in range(2, len(self.layer_sizes)):
            z = activations[-i]
            act_der = self.activation_functions[-i + 1].deriv(z)
            delta = np.dot(self.weights[-i + 1].T, delta) * act_der
            gradient_b[-i] = delta
            gradient_w[-i] = np.dot(delta, activations[-i - 1].T)

        # Normal indexing variant:
        # for i in range(len(self.layers) - 1, 0, -1):
        #     z = activations[i]
        #     act_der = self.activation_functions[i].deriv(z)
        #     delta = np.dot(self.weights[i].T, delta) * act_der
        #     gradient_b[i - 1] = delta
        #     gradient_w[i - 1] = np.dot(delta, activations[i - 1].T)

        return gradient_b, gradient_w

    def feedforward(self, a):
        for i, (b, w) in enumerate(zip(self.biases, self.weights)):
            a = self.activation_functions[i](np.dot(w, a) + b)
        return a

    def how_many_correct_answers(self, test_data):
        k = 0
        for x, y in test_data:
            y_predict = np.argmax(self.feedforward(x))
            print(y_predict, y)
            k += int(y_predict == y)
        return k
main.py
from copy import deepcopy

import numpy as np

from perceptron import Perceptron


class Sigmoid:
    out_min_max = [0, 1]

    def __call__(self, x):
        return 1. / (1. + np.exp(-x))

    def deriv(self, y):
        # t = self(x)
        # return t * (1. - t)
        return y * (1. - y)


def cost_function_derivative(y_predict, y_true_label):
    label_vector = np.zeros(y_predict.shape)
    label_vector[y_true_label] = 1.0
    return y_predict - label_vector


def main():
    training_data = np.asarray([[[[0], [0]], 0],
                                [[[0], [1]], 1],
                                [[[1], [0]], 1],
                                [[[1], [1]], 0]])
    layer_sizes = [2, 8, 2]
    model = Perceptron(layer_sizes=layer_sizes,
                       activation_functions=[Sigmoid(), Sigmoid()],
                       cost_function_deriv=cost_function_derivative)
    model.train(deepcopy(training_data), deepcopy(training_data),
                epochs=10000, mini_batch_size=4, lr=0.01)


if __name__ == '__main__':
    main()
The output after each epoch is printed in the format 'y_predict y_true':
0 0
0 1
0 1
0 0
If random.shuffle(training_data) is removed, the output becomes:
1 0
0 1
1 1
0 0
but never 0 1 1 0.
Answer:
I figured it out. The following settings are needed:
mini_batch_size=1
# random.shuffle(training_data)  -- commented out
epochs=10000
and it is better to use:
lr=0.1
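A minimal sketch of main() with these adjustments (assuming the random.shuffle(training_data) call inside Perceptron.train is commented out, and everything else is kept exactly as in the original code):

def main():
    # Same XOR training data and network shape as before
    training_data = np.asarray([[[[0], [0]], 0],
                                [[[0], [1]], 1],
                                [[[1], [0]], 1],
                                [[[1], [1]], 0]])
    layer_sizes = [2, 8, 2]
    model = Perceptron(layer_sizes=layer_sizes,
                       activation_functions=[Sigmoid(), Sigmoid()],
                       cost_function_deriv=cost_function_derivative)
    # mini_batch_size=1 updates the weights after every single sample,
    # and lr=0.1 is ten times larger than the original 0.01
    model.train(deepcopy(training_data), deepcopy(training_data),
                epochs=10000, mini_batch_size=1, lr=0.1)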
In most cases, the correct result appears after roughly 1000 epochs:
0 0
1 1
1 1
0 0