I am using a deep Q-learning implementation in TensorFlow to solve CartPole-v0, and in some runs (roughly 40% of them) the reported score gets stuck at 9. I tried to fix this by setting seeds with tf.set_random_seed, but that still does not guarantee the output won't get stuck. Here is my code:
```python
from collections import deque
import tensorflow as tf
import numpy as np
import random
import gym
import matplotlib.pyplot as plt
import pickle
from time import time

t = int(time())

class DQNAgent:
    def __init__(self, state_size, action_size):
        self.state_size = state_size
        self.action_size = action_size
        self.memory = deque(maxlen=2000)
        self.gamma = 0.95
        #self.epsilon = 1.0
        #self.epsilon_min = 0.01
        #self.epsilon_decay = 0.995
        self.learning_rate = 0.001
        self.model = self._build_model()

    def _build_model(self):
        graph = tf.Graph()
        with graph.as_default():
            inp = tf.placeholder(tf.float32, [None, self.state_size])
            out = tf.placeholder(tf.float32, [None, self.action_size])
            w1 = tf.Variable(tf.truncated_normal([self.state_size, 24]))
            b1 = tf.Variable(tf.zeros([24]))
            hidden = tf.nn.tanh(tf.matmul(inp, w1) + b1)
            w2 = tf.Variable(tf.truncated_normal([24, 24]))
            b2 = tf.Variable(tf.zeros([24]))
            hidden1 = tf.nn.tanh(tf.matmul(hidden, w2) + b2)
            w3 = tf.Variable(tf.truncated_normal([24, 24]))
            b3 = tf.Variable(tf.zeros([24]))
            hidden2 = tf.nn.tanh(tf.matmul(hidden1, w3) + b3)
            wo = tf.Variable(tf.truncated_normal([24, self.action_size]))
            bo = tf.Variable(tf.zeros([self.action_size]))
            prediction = tf.matmul(hidden2, wo) + bo
            loss = tf.losses.mean_squared_error(out, prediction)
            train = tf.train.AdamOptimizer().minimize(loss)
            init = tf.global_variables_initializer()
        return graph, inp, out, prediction, train, init

    def remember(self, state, action, reward, next_state, done):
        self.memory.append((state, action, reward, next_state, done))

    def act(self, state, sess):
        act_values = sess.run(self.model[3], feed_dict={self.model[1]: state})
        return np.argmax(act_values[0])

    def replay(self, batch_size, sess):
        try:
            minibatch = random.sample(self.memory, batch_size)
        except ValueError:
            minibatch = self.memory
        for state, action, reward, next_state, done in minibatch:
            target = reward
            if not done:
                target = reward + self.gamma * np.amax(
                    sess.run(self.model[3], feed_dict={self.model[1]: next_state}))
            target_f = sess.run(self.model[3], feed_dict={self.model[1]: state})
            target_f[0][action] = target
            #print(target_f)
            sess.run(self.model[4], feed_dict={self.model[1]: state, self.model[2]: target_f})

if __name__ == "__main__":
    environment = 'CartPole-v0'
    env = gym.make(environment)
    avgs = deque(maxlen=50)
    rewardLA = []
    agent = DQNAgent(env.observation_space.shape[0], env.action_space.n)
    sess = tf.Session(graph=agent.model[0])
    sess.run(agent.model[5])
    episodes = 10000
    rewardL = []
    for e in range(episodes):
        state = env.reset()
        state = np.reshape(state, [1, 4])
        for time_t in range(500):
            #env.render()
            action = agent.act(state, sess)
            next_state, reward, done, _ = env.step(action)
            next_state = np.reshape(next_state, [1, 4])
            agent.remember(state, action, reward, next_state, done)
            state = next_state
            if done:
                break
        avgs.append(time_t)
        rewardLA.append(sum(avgs) / len(avgs))
        print("episode: ", e, "score: ", time_t)
        rewardL.append(time_t)
        agent.replay(32, sess)
    #pickle.dump(rewardL, open(environment + "_" + str(t) + "_rewardL.pickle", "wb"))
    plt.plot(rewardLA)
    plt.show()
```
I tried switching the optimizer to GD and RMSProp, but neither helped. However, if I simply restart the script, it often works much better (reaching 199 within 200 episodes). Why does this happen, and how can I fix it?
Answer:
Looking at your code, I don't see how the environment is ever explored. Don't you need a mechanism such as epsilon-greedy to guarantee exploration? For example, I tried modifying the agent.act() method as follows, and it seems to solve the problem:
```python
def act(self, state, sess, episode):
    if random.random() < math.pow(2, -episode / 30):
        return env.action_space.sample()
    act_values = sess.run(self.model[3], feed_dict={self.model[1]: state})
    return np.argmax(act_values[0])
```
You can experiment with the value 30, which I call the "exploration constant": the larger it is, the more slowly the exploration probability decays, so the agent keeps exploring for longer.
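One detail implied by the snippet above: act() now takes an extra episode argument and uses math, so the script also needs an import and an updated call in the main loop. A minimal sketch of those two changes (using the names from the question's code):

```python
import math  # needed for math.pow in the modified act()

# in the inner loop of __main__, pass the episode index e
# so that the exploration probability 2**(-e / 30) decays over time:
action = agent.act(state, sess, e)
```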
In any case, it seems to me that without an epsilon-greedy mechanism (or one that decays over time, like the one above), you are relying on the network's output having enough entropy to produce sufficient exploration. Sometimes that works; sometimes it doesn't.
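If you would rather use plain epsilon-greedy, the commented-out epsilon, epsilon_min and epsilon_decay fields in the question's __init__ already suggest the standard recipe. Here is a minimal sketch of that variant (my own illustration, assuming those fields are uncommented; the decay_epsilon helper is not part of the original code):

```python
def act(self, state, sess):
    # with probability epsilon take a random action, otherwise act greedily
    if random.random() < self.epsilon:
        return random.randrange(self.action_size)
    act_values = sess.run(self.model[3], feed_dict={self.model[1]: state})
    return np.argmax(act_values[0])

def decay_epsilon(self):
    # call once per episode to shrink epsilon towards epsilon_min
    if self.epsilon > self.epsilon_min:
        self.epsilon *= self.epsilon_decay
```

Calling agent.decay_epsilon() once per episode (for example right after agent.replay(32, sess)) gives the same kind of time-decaying exploration as the 2^(-episode/30) schedule above.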