I am building my GAN with TensorFlow Keras and I want to train it on a TPU, but I hit an error when using tf.GradientTape(): the gradients for my discriminator come out fine, while all of the generator's gradients are None. Please see my Colab to follow along.
Thank you!
The error message is as follows:
```
discriminator_pretrain_loss real_output Tensor("sequential_8/dense_8/BiasAdd:0", shape=(1, 1), dtype=float32)
discriminator_pretrain_loss fake_output Tensor("sequential_8/dense_8/BiasAdd_1:0", shape=(1, 1), dtype=float32)
discriminator_pretrain_loss like Tensor("likes:0", shape=(1, 1), dtype=float32)
discriminator_pretrain_loss real_loss Tensor("binary_crossentropy/weighted_loss/value:0", shape=(), dtype=float32)
discriminator_pretrain_loss fake_loss Tensor("binary_crossentropy_1/weighted_loss/value:0", shape=(), dtype=float32)
discriminator_pretrain_loss fake_loss Tensor("add:0", shape=(), dtype=float32)
disc_loss Tensor("add:0", shape=(), dtype=float32)
vars gen_tape: ['dense_7/kernel/packed:0', 'conv2d_transpose_16/kernel/packed:0', 'conv2d_transpose_17/kernel/packed:0', 'conv2d_transpose_18/kernel/packed:0', 'conv2d_transpose_19/kernel/packed:0', 'conv2d_12/kernel:0', 'conv2d_12/bias:0', 'conv2d_13/kernel:0', 'conv2d_13/bias:0', 'conv2d_14/kernel:0', 'conv2d_14/bias:0', 'conv2d_15/kernel:0', 'conv2d_15/bias:0', 'dense_8/kernel:0', 'dense_8/bias:0']
vars disc_tape: ['dense_7/kernel/packed:0', 'conv2d_transpose_16/kernel/packed:0', 'conv2d_transpose_17/kernel/packed:0', 'conv2d_transpose_18/kernel/packed:0', 'conv2d_transpose_19/kernel/packed:0', 'conv2d_12/kernel:0', 'conv2d_12/bias:0', 'conv2d_13/kernel:0', 'conv2d_13/bias:0', 'conv2d_14/kernel:0', 'conv2d_14/bias:0', 'conv2d_15/kernel:0', 'conv2d_15/bias:0', 'dense_8/kernel:0', 'dense_8/bias:0']
gradients_of_generator [None, None, None, None, None]
gradients_of_discriminator [<tf.Tensor 'AddN_3:0' shape=(5, 5, 3, 64) dtype=float32>, <tf.Tensor 'AddN_4:0' shape=(64,) dtype=float32>, <tf.Tensor 'AddN_5:0' shape=(5, 5, 64, 128) dtype=float32>, <tf.Tensor 'AddN_6:0' shape=(128,) dtype=float32>, <tf.Tensor 'AddN_7:0' shape=(5, 5, 128, 256) dtype=float32>, <tf.Tensor 'AddN_8:0' shape=(256,) dtype=float32>, <tf.Tensor 'AddN_9:0' shape=(5, 5, 256, 512) dtype=float32>, <tf.Tensor 'AddN_10:0' shape=(512,) dtype=float32>, <tf.Tensor 'AddN_11:0' shape=(73728, 1) dtype=float32>, <tf.Tensor 'AddN_12:0' shape=(1,) dtype=float32>]
---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-297-668c74d6b82e> in <module>()
----> 1 train(raw_dataset, EPOCHS)

9 frames
/usr/local/lib/python3.7/dist-packages/tensorflow/python/framework/func_graph.py in wrapper(*args, **kwargs)
    984         except Exception as e:  # pylint:disable=broad-except
    985           if hasattr(e, "ag_error_metadata"):
--> 986             raise e.ag_error_metadata.to_exception(e)
    987           else:
    988             raise

ValueError: in user code:

    <ipython-input-290-f71b18632068>:28 pre_train  *
        generator_optimizer.apply_gradients(zip(gradients_of_generator, generator.trainable_variables))
    /usr/local/lib/python3.7/dist-packages/tensorflow/python/keras/optimizer_v2/optimizer_v2.py:630 apply_gradients  **
        grads_and_vars = optimizer_utils.filter_empty_gradients(grads_and_vars)
    /usr/local/lib/python3.7/dist-packages/tensorflow/python/keras/optimizer_v2/utils.py:76 filter_empty_gradients
        ([v.name for _, v in grads_and_vars],))

    ValueError: No gradients provided for any variable: ['dense_7/kernel:0', 'conv2d_transpose_16/kernel:0', 'conv2d_transpose_17/kernel:0', 'conv2d_transpose_18/kernel:0', 'conv2d_transpose_19/kernel:0'].
```
The function below performs a single training step. To my surprise, the discriminator's gradients get computed while the generator's do not.
```python
def train_step(images, likes):
    noise = tf.random.normal([BATCH_SIZE, noise_dim])

    with tf.GradientTape() as gen_tape, tf.GradientTape(persistent=True) as disc_tape:
        gen_tape.watch(noise)
        generated_images = generator(noise, training=True)

        real_output = discriminator(images, training=True)
        fake_output = discriminator(generated_images, training=True)

        # gen_loss = generator_loss(fake_output)
        gen_tape.watch(fake_output)
        # gen_tape.watch(gen_loss)
        # print("gen_loss", gen_loss)
        disc_loss = discriminator_pretrain_loss(real_output, fake_output, likes)
        gen_loss = cross_entropy(tf.ones_like(fake_output), fake_output)
        print("disc_loss", disc_loss)
        print("vars gen_tape: ", [var.name for var in gen_tape.watched_variables()])
        print("vars disc_tape: ", [var.name for var in disc_tape.watched_variables()])

    gradients_of_generator = gen_tape.gradient(gen_loss, generator.trainable_variables)
    print("gradients_of_generator", gradients_of_generator)
    gradients_of_discriminator = disc_tape.gradient(disc_loss, discriminator.trainable_variables)
    print("gradients_of_discriminator", gradients_of_discriminator)

    generator_optimizer.apply_gradients(zip(gradients_of_generator, generator.trainable_variables))
    discriminator_optimizer.apply_gradients(zip(gradients_of_discriminator, discriminator.trainable_variables))
```
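As a side note on debugging: pairing each trainable variable with its gradient makes it easy to see exactly which weights come back as None. This is just a throwaway diagnostic sketch that could go right after the gen_tape.gradient call, not something from the original notebook:

```python
# Diagnostic sketch: show which generator variables received no gradient.
# (In the log above, exactly the five "packed" kernels come back as None.)
for var, grad in zip(generator.trainable_variables, gradients_of_generator):
    print(var.name, "->", None if grad is None else grad.shape)
```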
The function below is the training loop; this function works correctly:
```python
with tpu_strategy.scope():
    def train(dataset, epochs):
        for epoch in range(epochs):
            start = time.time()

            for row in dataset:
                parsed_row = _parse_function(row)
                image_batch = parsed_row['img_like']
                like_batch = parsed_row['is_like']
                # try:
                train_step(image_batch, like_batch)
                # except Exception as e:
                #     print("There was an error...\r\n", e)
                #     train_step(image_batch)

            # Produce images for the GIF as you go
            if (epoch + 1) % 10 == 0:
                display.clear_output(wait=True)
                generate_and_save_images(generator, epoch + 1, seed)

            # Save the model every 100 epochs
            if (epoch + 1) % 100 == 0:
                checkpoint.save(file_prefix=checkpoint_prefix)

            print('Time for epoch {} is {} sec'.format(epoch + 1, time.time() - start))

        # Generate after the final epoch
        display.clear_output(wait=True)
        generate_and_save_images(generator, epochs, seed)
```
The discriminator model:
```python
def make_discriminator_model():
    model = tf.keras.Sequential()
    model.add(layers.MaxPooling2D(pool_size=(5, 5), strides=(5, 5), padding='same'))

    model.add(layers.Conv2D(64, (5, 5), strides=(1, 1), padding='same',
                            input_shape=[288, 128, 3]))
    model.add(layers.LeakyReLU())
    model.add(layers.Dropout(0.3))
    model.add(layers.MaxPooling2D(pool_size=(2, 2), strides=(2, 2), padding='same'))

    model.add(layers.Conv2D(128, (5, 5), strides=(1, 1), padding='same'))
    model.add(layers.LeakyReLU())
    model.add(layers.Dropout(0.3))
    model.add(layers.MaxPooling2D(pool_size=(2, 2), strides=(2, 2), padding='same'))

    model.add(layers.Conv2D(256, (5, 5), strides=(1, 1), padding='same'))
    model.add(layers.LeakyReLU())
    model.add(layers.Dropout(0.3))
    model.add(layers.MaxPooling2D(pool_size=(2, 2), strides=(2, 2), padding='same'))

    model.add(layers.Conv2D(512, (5, 5), strides=(2, 2), padding='same'))
    model.add(layers.LeakyReLU())
    model.add(layers.Dropout(0.3))

    model.add(layers.Flatten())
    model.add(layers.Dense(1))

    return model
```
The generator model:
```python
def make_generator_model():
    model = tf.keras.Sequential()
    model.add(layers.Dense(90 * 40 * 256, use_bias=False, input_shape=(100,)))
    # model.add(layers.BatchNormalization())
    model.add(layers.Activation('relu'))

    model.add(layers.Reshape((90, 40, 256)))
    assert model.output_shape == (None, 90, 40, 256)  # Note: None is the batch size

    model.add(layers.Conv2DTranspose(128, (5, 5), strides=(1, 1), padding='same', use_bias=False))
    assert model.output_shape == (None, 90, 40, 128)
    # model.add(layers.BatchNormalization())
    model.add(layers.Activation('relu'))

    # model.add(layers.Conv2DTranspose(64, (5, 5), strides=(2, 2), padding='same', use_bias=False))
    # assert model.output_shape == (None, 180, 80, 64)
    # model.add(layers.BatchNormalization())
    # model.add(layers.LeakyReLU())

    model.add(layers.Conv2DTranspose(16, (5, 5), strides=(4, 4), padding='same', use_bias=False))
    assert model.output_shape == (None, 360, 160, 16)
    # model.add(layers.BatchNormalization())
    model.add(layers.Activation('relu'))

    model.add(layers.Conv2DTranspose(8, (5, 5), strides=(2, 2), padding='same', use_bias=False))
    assert model.output_shape == (None, 720, 320, 8)
    # model.add(layers.BatchNormalization())
    model.add(layers.Activation('relu'))

    model.add(layers.Conv2DTranspose(3, (5, 5), strides=(2, 2), padding='same', use_bias=False,
                                     activation='tanh'))
    assert model.output_shape == (None, 1440, 640, 3)

    return model
```
```python
def discriminator_pretrain_loss(real_output, fake_output, like):
    print("discriminator_pretrain_loss real_output", real_output)
    print("discriminator_pretrain_loss fake_output", fake_output)
    print("discriminator_pretrain_loss like", like)

    real_loss = cross_entropy(like, real_output)
    print("discriminator_pretrain_loss real_loss", real_loss)
    fake_loss = cross_entropy(tf.zeros_like(fake_output), fake_output)
    print("discriminator_pretrain_loss fake_loss", fake_loss)

    total_loss = real_loss + fake_loss
    print("discriminator_pretrain_loss fake_loss", total_loss)
    return total_loss
```
```python
def generator_loss(fake_output):
    print("generator_loss fake_output", fake_output)
    print("generator_loss ones_like", tf.ones_like(fake_output))
    gen_loss = cross_entropy(tf.ones_like(fake_output), fake_output)
    print("generator_loss gen_loss", gen_loss)
    return gen_loss
```
I use a Google TPU in my Colab:
```python
import tensorflow as tf

print("Tensorflow version " + tf.__version__)
tf.keras.backend.set_floatx('float32')

try:
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver(tpu='')  # TPU detection
    print('Running on TPU ', tpu.cluster_spec().as_dict()['worker'])
except ValueError:
    raise BaseException('ERROR: Not connected to a TPU runtime; please see the previous cell in this notebook for instructions!')

tf.config.experimental_connect_to_cluster(tpu)
tf.tpu.experimental.initialize_tpu_system(tpu)
tpu_strategy = tf.distribute.TPUStrategy(tpu)

print("All devices: ", tf.config.list_logical_devices('TPU'))
```
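One thing this cell does not show is where the models, optimizers, and the cross_entropy loss object are created. On a TPU, variables have to be created inside the strategy scope so that they become replicated variables. A minimal sketch of that setup, assuming the builder functions above; the Adam learning rates are assumed values, not taken from the notebook:

```python
with tpu_strategy.scope():
    generator = make_generator_model()
    discriminator = make_discriminator_model()
    # Learning rates below are assumptions for illustration only.
    generator_optimizer = tf.keras.optimizers.Adam(1e-4)
    discriminator_optimizer = tf.keras.optimizers.Adam(1e-4)
```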
Answer:
The problem was solved by calling the following inside the train function:
```python
tpu_strategy.run(train_step, args=(image_batch, like_batch))
```
and by constructing the loss within the strategy scope as:
```python
cross_entropy = tf.keras.losses.BinaryCrossentropy(from_logits=True,
                                                   reduction=tf.keras.losses.Reduction.NONE)
```
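With Reduction.NONE the loss object returns a vector of per-example losses instead of a scalar, so each replica has to reduce it explicitly (Keras refuses implicit reduction inside strategy.run). A minimal sketch of the usual reduction, where GLOBAL_BATCH_SIZE is an assumed name for BATCH_SIZE * tpu_strategy.num_replicas_in_sync, not a variable from the notebook:

```python
# Inside train_step: average the per-example losses over the *global* batch.
# GLOBAL_BATCH_SIZE is assumed to be BATCH_SIZE * tpu_strategy.num_replicas_in_sync.
per_example_loss = cross_entropy(tf.ones_like(fake_output), fake_output)
gen_loss = tf.nn.compute_average_loss(per_example_loss,
                                      global_batch_size=GLOBAL_BATCH_SIZE)
```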
So I changed the train function to:
```python
with tpu_strategy.scope():
    def train(dataset, epochs):
        for epoch in range(epochs):
            start = time.time()

            for row in dataset:
                parsed_row = _parse_function(row)
                image_batch = parsed_row['img_like']
                like_batch = parsed_row['is_like']
                tpu_strategy.run(train_step, args=(image_batch, like_batch))

            # Produce images for the GIF as you go
            if (epoch + 1) % 10 == 0:
                display.clear_output(wait=True)
                generate_and_save_images(generator, epoch + 1, seed)

            # Save the model every 100 epochs
            if (epoch + 1) % 100 == 0:
                checkpoint.save(file_prefix=checkpoint_prefix)

            print('Time for epoch {} is {} sec'.format(epoch + 1, time.time() - start))

        # Generate after the final epoch
        display.clear_output(wait=True)
        generate_and_save_images(generator, epochs, seed)
```
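One caveat worth adding (based on the TensorFlow distributed-training guide, not on the answer itself): strategy.run is normally called from inside a tf.function so that the step is compiled for the TPU. A sketch of how the call could be wrapped; the name distributed_train_step is mine:

```python
@tf.function
def distributed_train_step(image_batch, like_batch):
    # Runs one training step on every TPU replica.
    tpu_strategy.run(train_step, args=(image_batch, like_batch))
```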
Happy coding! Thank you!