我正在使用Python 2.7,并试图更好地了解TensorFlow。
我使用以下代码尝试在MNIST数据上训练自编码器,当我使用sigmoid激活函数时,泛化效果还可以(90%),但当我尝试使用ReLU时,结果几乎是随机的。
我找到了一些相关内容,但没有找到解决我问题的方案。
我做错了什么?我应该添加dropout吗?也许是成本函数或优化器与ReLU不兼容?
# Stacked sigmoid autoencoder on MNIST (TensorFlow 1.x graph API).
# After training, the encoder output of every test image is classified by
# the label of its nearest neighbour (Euclidean distance) in code space.

# tf / np / scipy / pdist are used throughout the script, so they must be
# imported here for it to run.
import numpy as np
import scipy.spatial.distance
import tensorflow as tf
from scipy.spatial.distance import pdist

# Import MNIST data
from tensorflow.examples.tutorials.mnist import input_data

mnist = input_data.read_data_sets("MNIST_data", one_hot=True)

# Parameters
learning_rate = 0.01
training_epochs = 60
batch_size = 256
display_step = 1
examples_to_show = 10

# Network Parameters (alternative sizes tried: 400 / 250 / 30)
n_hidden_1 = 256  # 1st layer num features
n_hidden_2 = 128  # 2nd layer num features
n_hidden_3 = 60   # 3rd layer num features (size of the code)
n_input = 784     # MNIST data input (img shape: 28*28)

# tf Graph input (only pictures)
X = tf.placeholder("float", [None, n_input])

# Dropout keep probability. It defaults to 1.0 (dropout disabled) so the
# sess.run calls below, which do not feed it, keep working unchanged.
# Feed a value below 1.0 in the training feed_dict to enable dropout.
keep_prob = tf.placeholder_with_default(1.0, shape=())

weights = {
    'encoder_h1': tf.Variable(tf.random_normal([n_input, n_hidden_1])),
    'encoder_h2': tf.Variable(tf.random_normal([n_hidden_1, n_hidden_2])),
    'encoder_h3': tf.Variable(tf.random_normal([n_hidden_2, n_hidden_3])),
    'decoder_h1': tf.Variable(tf.random_normal([n_hidden_3, n_hidden_2])),
    'decoder_h2': tf.Variable(tf.random_normal([n_hidden_2, n_hidden_1])),
    'decoder_h3': tf.Variable(tf.random_normal([n_hidden_1, n_input])),
}
biases = {
    'encoder_b1': tf.Variable(tf.random_normal([n_hidden_1])),
    'encoder_b2': tf.Variable(tf.random_normal([n_hidden_2])),
    'encoder_b3': tf.Variable(tf.random_normal([n_hidden_3])),
    'decoder_b1': tf.Variable(tf.random_normal([n_hidden_2])),
    'decoder_b2': tf.Variable(tf.random_normal([n_hidden_1])),
    'decoder_b3': tf.Variable(tf.random_normal([n_input])),
}


# Building the encoder
def encoder(x):
    """Encode a batch of inputs with three fully connected sigmoid layers.

    Args:
        x: tensor multiplied against weights['encoder_h1'], i.e. its last
            dimension must be n_input.

    Returns:
        The output tensor of the third encoder layer (n_hidden_3 features).
    """
    # Encoder hidden layer with sigmoid activation #1
    layer_1 = tf.nn.sigmoid(tf.add(tf.matmul(x, weights['encoder_h1']),
                                   biases['encoder_b1']))
    # The second layer must consume the dropout output; reading layer_1
    # directly would leave the dropout op disconnected from the graph.
    dropout1 = tf.nn.dropout(layer_1, keep_prob)
    # Encoder hidden layer with sigmoid activation #2
    layer_2 = tf.nn.sigmoid(tf.add(tf.matmul(dropout1, weights['encoder_h2']),
                                   biases['encoder_b2']))
    # Encoder hidden layer with sigmoid activation #3
    layer_3 = tf.nn.sigmoid(tf.add(tf.matmul(layer_2, weights['encoder_h3']),
                                   biases['encoder_b3']))
    return layer_3


# Building the decoder
def decoder(x):
    """Decode a batch of codes with three fully connected sigmoid layers.

    Args:
        x: tensor multiplied against weights['decoder_h1'], i.e. its last
            dimension must be n_hidden_3.

    Returns:
        The output tensor of the third decoder layer (n_input features).
    """
    # Decoder hidden layer with sigmoid activation #1
    layer_1 = tf.nn.sigmoid(tf.add(tf.matmul(x, weights['decoder_h1']),
                                   biases['decoder_b1']))
    # Decoder hidden layer with sigmoid activation #2
    layer_2 = tf.nn.sigmoid(tf.add(tf.matmul(layer_1, weights['decoder_h2']),
                                   biases['decoder_b2']))
    # Decoder hidden layer with sigmoid activation #3
    layer_3 = tf.nn.sigmoid(tf.add(tf.matmul(layer_2, weights['decoder_h3']),
                                   biases['decoder_b3']))
    return layer_3


# Construct model
encoder_op = encoder(X)
decoder_op = decoder(encoder_op)

# Prediction
y_pred = decoder_op
x_encode = encoder_op
# Targets (Labels) are the input data.
y_true = X

# Define loss and optimizer, minimize the squared error
cost = tf.reduce_mean(tf.pow(y_true - y_pred, 2))
optimizer = tf.train.RMSPropOptimizer(learning_rate).minimize(cost)

# Initializing the variables
init = tf.global_variables_initializer()

# Launch the graph
with tf.Session() as sess:
    sess.run(init)
    print("num examples are ", mnist.train.num_examples, mnist.validation.num_examples, mnist.test.num_examples)
    total_batch = mnist.train.num_examples // batch_size
    # Training cycle
    for epoch in range(training_epochs):
        # Loop over all batches
        for _step in range(total_batch):
            # Labels are not needed to train the autoencoder.
            batch_xs, _batch_ys = mnist.train.next_batch(batch_size)
            # Run optimization op (backprop) and cost op (to get loss value)
            _, c = sess.run([optimizer, cost], feed_dict={X: batch_xs})
        # Display logs per epoch step (cost of the last batch of the epoch)
        if epoch % display_step == 0:
            print("Epoch:", '%04d' % (epoch+1), "cost=", "{:.9f}".format(c))

    print("Optimization Finished!")

    # Applying encode and decode over test set
    encode_decode = sess.run(
        y_pred, feed_dict={X: mnist.test.images[:examples_to_show]})
    encoded_data = sess.run(x_encode, feed_dict={X: mnist.test.images})

# Pairwise distances between all encoded test samples.
distance_matrix = scipy.spatial.distance.squareform(pdist(encoded_data))
# NOTE: this slice is a view, so distance_matrix is modified as well.
d_m_2 = distance_matrix[:, :]
# A sample must not be selected as its own nearest neighbour.
np.fill_diagonal(d_m_2, np.inf)
labels = np.argmax(mnist.test.labels, 1)  # true class of each test sample
predicate = labels[np.argmin(d_m_2, 1)]  # class of the closest other sample
# Count how many samples share the class of their nearest neighbour.
print ("this is the ammount of coorect clasificcations in the test set", np.sum(labels==predicate))
谢谢!
回答:
也许ReLU在处理负输入值时遇到了困难,因为它被定义为R(x) := max(0,x)。因此,如果输入为负数,R(x) = 0,梯度也会为零。这样,您的优化器就不知道如何更新参数。您可以尝试使用主要为正值的权重初始化方法,如`tf.random_normal(shape=..., mean=0.5, stddev=0.2)`。这可能会减少这个问题的影响。