在下面的代码中,我使用了10个卷积层,然后是一个LSTM来计算输出。
如果只使用1个卷积层再接一个LSTM,一切正常。但当我添加更多卷积层时(如下面代码中的10个卷积层),损失会变得非常大,准确率也开始下降。我在每个卷积层之后都应用了批量归一化,以确保梯度不会消失。为了验证网络能否过拟合,我只用了5到10个样本进行训练,结果损失却非常大;如果减少卷积层的数量,一切正常;如果增加样本数量,损失会下降到一定程度后便停止下降。这里有什么bug吗?
编辑:如果您想尝试,这里是可复现的代码 – 链接
# Graph definition: ten 1-wide conv -> ELU -> batch-norm stages feeding a
# single LSTM, followed by a softmax classifier trained with Adam.
#
# Relies on names defined outside this section: tf, time_steps, embedding,
# _units, num_classes.


def _conv_elu_bn(inputs, weights_name, filter_shape, is_training):
    """Apply one conv2d -> bias add -> ELU -> batch-norm stage.

    Args:
        inputs: 4-D input tensor for tf.nn.conv2d.
        weights_name: Name passed to tf.get_variable for the filter.
        filter_shape: [height, width, in_channels, out_channels] of the filter.
        is_training: Boolean tensor forwarded to batch_norm.

    Returns:
        A (weights, normalized) tuple: the filter variable and the
        batch-normalised output of the stage.
    """
    # NOTE(review): stddev=1.0 is kept from the original; it is large for
    # filters with 64-256 input channels -- consider a fan-in scaled
    # initializer. Not changed here to avoid altering training behaviour.
    weights = tf.get_variable(
        weights_name,
        filter_shape,
        tf.float32,
        tf.truncated_normal_initializer(mean=0.0, stddev=1.0),
    )
    biases = tf.Variable(tf.constant(0.1, shape=[filter_shape[-1]]))
    conv = tf.nn.conv2d(inputs, weights, strides=[1, 1, 1, 1], padding="VALID")
    activated = tf.nn.elu(conv + biases)
    normalized = tf.contrib.layers.batch_norm(
        inputs=activated, is_training=is_training
    )
    return weights, normalized


X = tf.placeholder(tf.float32, [None, time_steps, embedding])
Y = tf.placeholder(tf.int32, [None])
A = tf.placeholder(tf.bool)  # fed to batch_norm as is_training
B = tf.placeholder(tf.float32)  # fed to dropout as keep_prob

# [batch, time_steps, embedding] -> [batch, time_steps, embedding, 1]
x = tf.expand_dims(X, 3)

# The first filter spans the whole embedding axis, so with VALID padding the
# width collapses to 1; every later stage is a 1x1 convolution over channels.
conv_weights, outputs_fed_lstm = _conv_elu_bn(
    x, "conv_weights1", [1, embedding, 1, 64], A
)
conv_weights2, outputs_fed_lstm2 = _conv_elu_bn(
    outputs_fed_lstm, "conv_weights2", [1, 1, 64, 64], A
)
conv_weights3, outputs_fed_lstm3 = _conv_elu_bn(
    outputs_fed_lstm2, "conv_weights3", [1, 1, 64, 64], A
)
conv_weights4, outputs_fed_lstm4 = _conv_elu_bn(
    outputs_fed_lstm3, "conv_weights4", [1, 1, 64, 128], A
)
conv_weights5, outputs_fed_lstm5 = _conv_elu_bn(
    outputs_fed_lstm4, "conv_weights5", [1, 1, 128, 128], A
)
conv_weights6, outputs_fed_lstm6 = _conv_elu_bn(
    outputs_fed_lstm5, "conv_weights6", [1, 1, 128, 128], A
)
conv_weights7, outputs_fed_lstm7 = _conv_elu_bn(
    outputs_fed_lstm6, "conv_weights7", [1, 1, 128, 256], A
)
conv_weights8, outputs_fed_lstm8 = _conv_elu_bn(
    outputs_fed_lstm7, "conv_weights8", [1, 1, 256, 256], A
)
conv_weights9, outputs_fed_lstm9 = _conv_elu_bn(
    outputs_fed_lstm8, "conv_weights9", [1, 1, 256, 256], A
)
conv_weights0, outputs_fed_lstm0 = _conv_elu_bn(
    outputs_fed_lstm9, "conv_weights0", [1, 1, 256, 512], A
)

outputs_fed_lstm10 = tf.nn.dropout(x=outputs_fed_lstm0, keep_prob=B)

# Drop the singleton width axis, go time-major, then cut into a list of
# time_steps tensors of shape [batch, 512] as expected by tf.nn.rnn.
x = tf.squeeze(outputs_fed_lstm10, [2])
x = tf.transpose(x, [1, 0, 2])
x = tf.reshape(x, [-1, 512])
x = tf.split(0, time_steps, x)

lstm = tf.nn.rnn_cell.LSTMCell(num_units=_units, state_is_tuple=True)
# multi_lstm = tf.nn.rnn_cell.MultiRNNCell([lstm] * lstm_layers, state_is_tuple = True)
outputs, state = tf.nn.rnn(lstm, x, dtype=tf.float32)

# Classify from the LSTM output at the last time step only.
weights = tf.Variable(tf.random_normal([_units, num_classes]))
biases = tf.Variable(tf.random_normal([num_classes]))
logits = tf.matmul(outputs[-1], weights) + biases

c_loss = tf.nn.sparse_softmax_cross_entropy_with_logits(logits, Y)
loss = tf.reduce_mean(c_loss)

global_step = tf.Variable(0, name="global_step", trainable=False)
decayed_learning_rate = tf.train.exponential_decay(
    learning_rate=0.01,
    global_step=global_step,
    decay_steps=300,
    decay_rate=0.96,
    staircase=True,
)
optimizer = tf.train.AdamOptimizer(learning_rate=decayed_learning_rate)
#grads_and_vars = optimizer.compute_gradients(loss,[conv_weights0])

# batch_norm only registers its moving mean/variance updates in the
# UPDATE_OPS collection; nothing runs them unless the train op depends on
# them. Without this, runs with is_training=False normalise with the initial
# moving statistics, and the error compounds across the ten stacked layers.
update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
with tf.control_dependencies(update_ops):
    minimize_loss = optimizer.minimize(loss, global_step=global_step)

correct_predict = tf.nn.in_top_k(logits, Y, 1)
accuracy = tf.reduce_mean(tf.cast(correct_predict, tf.float32))
回答:
我已经弄清楚了为什么会发生这种情况:当各层的神经元(通道)数量是随意选取的,比如56、86,然后是496时,就会出现这种问题——无论添加多少层,结果都是损失巨大、准确率非常低。解决这个问题的方法是让各层的数量遵循特定的模式,比如64、128、256、512。