I have recently been learning deep reinforcement learning and wanted to apply it to a gym problem using Keras.
During training I noticed it was running very slowly, and after checking where the time goes I found that the "fit" function takes up most of it.
Each episode takes 3-4 minutes to run.
Is there something wrong with what I am doing, or can you suggest some improvements?
import pandas as pd
import numpy as np
import tensorflow as tf
import tensorflow.keras as keras
from tensorflow.keras.optimizers import Adam
from collections import deque
import random
import gym
import datetime


class DQN():
    def __init__(self, env):
        self.env = env
        self.memory = deque(maxlen=2000)
        self.gamma = 0.98
        self.epsilon = 1
        self.epsilon_min = 0.01
        self.epsilon_decay = 0.998
        self.learning_rate = 0.001
        self.model = self.create_model()
        self.target_model = self.create_model()

    def create_model(self):
        model = keras.Sequential()
        state_shape = self.env.observation_space.shape
        model.add(keras.layers.Dense(48, activation="relu", input_dim=state_shape[0]))
        model.add(keras.layers.Dense(24, activation="relu"))
        model.add(keras.layers.Dense(self.env.action_space.n, activation="relu"))
        model.compile(loss="mse", optimizer=Adam(lr=self.learning_rate))
        return model

    def remember(self, state, action, reward, new_state, done):
        self.memory.append([state, action, reward, new_state, done])

    def replay(self):
        batch_size = 32
        if len(self.memory) < batch_size:
            return
        samples = random.sample(self.memory, batch_size)
        # states, actions, rewards, states_, dones = samples
        # targets = self.target_model.predict(states)
        # _states = [i for i in range(len(samples))]
        # targets = [[0 for j in range(self.env.action_space.n)] for i in range(len(samples))]
        _states = np.zeros((len(samples), 8))
        targets = np.zeros((len(samples), self.env.action_space.n))
        for i, sample in enumerate(samples):
            state, action, reward, new_state, done = sample
            _states[i] = state
            # target = self.target_model.predict(state)
            if done:
                targets[i][action] = reward
            else:
                Q_future = max(self.target_model.predict(new_state)[0])
                targets[i][action] = reward + Q_future*self.gamma
        self.model.fit(_states, targets, epochs=1, verbose=0)
        # for sample in samples:
        #     state, action, reward, new_state, done = sample
        #     target = self.target_model.predict(state)
        #     if done:
        #         target[0][action] = reward
        #     else:
        #         Q_future = max(self.target_model.predict(new_state)[0])
        #         target[0][action] = reward + Q_future*self.gamma
        #     start_time = datetime.datetime.now()
        #     self.model.fit(state, target, epochs=1, verbose=0)
        #     end_time = datetime.datetime.now()
        #     print("--fit--")
        #     print(end_time-start_time)

    def target_train(self):
        weights = self.model.get_weights()
        target_weights = self.target_model.get_weights()
        for i in range(len(target_weights)):
            target_weights[i] = weights[i]
        self.target_model.set_weights(target_weights)

    def act(self, state):
        self.epsilon *= self.epsilon_decay
        self.epsilon = max(self.epsilon_min, self.epsilon)
        if np.random.random() < self.epsilon:
            return self.env.action_space.sample()
        return np.argmax(self.model.predict(state)[0])

    def save_model(self, fn):
        self.model.save(fn)

    def act_eval(self, state):
        return np.argmax(self.model.predict(state)[0])

    def evaluation(self, n_eval=10):
        total_reward = 0
        for _ in range(n_eval):
            self.env.reset()
            cur_state = self.env.reset().reshape(1,8)
            done = False
            while not done:
                action = self.act_eval(cur_state)
                new_state, reward, done, _ = self.env.step(action)
                total_reward += reward
                cur_state = new_state.reshape(1,8)
        return total_reward / n_eval


def main():
    save_path = "policies/"
    env = gym.make("LunarLander-v2")
    trials = 2000
    trial_len = 500
    update_target_network = 500
    agent = DQN(env=env)
    for trial in range(trials):
        cur_state = env.reset().reshape(1,8)
        time_step_cntr = 0

        # check execution durations
        dur_replay = 0
        dur_step = 0
        dur_act = 0

        for step in range(trial_len):
            print("Trial {0}, step {1}".format(trial, step))
            action = agent.act(cur_state)
            new_state, reward, done, _ = env.step(action)
            new_state = new_state.reshape(1,8)
            agent.remember(cur_state, action, reward, new_state, done)

            # learn from experience
            agent.replay()

            # after "update_target_network" steps, update target network
            if time_step_cntr % update_target_network == 0:
                agent.target_train()
            time_step_cntr += 1

            cur_state = new_state
            if done:
                break

        # print("Duration replay {0}, duration act {1}, duration step {2}".format(dur_replay, dur_act, dur_step))

        # at each N steps, evaluate
        print("Evaluation over 10 episodes", agent.evaluation())
        print("Trial #{0} completed.".format(trial))

        # # print the progress
        # if trial % 100 == 0:
        #     print("Trial #{0} completed.".format(trial))

        # save the model
        # if trial % 20 == 0:
        agent.save_model(save_path + str(trial) + "__.model")

    agent.save_model(save_path + "_final" + "__.model")


if __name__ == "__main__":
    main()
Answer:
Your problem is not the fit call itself; it is the loop inside your replay() method. In situations like this, try to replace Python loops with numpy operations, which are both more flexible and far more efficient.
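To see why the loop dominates, note that it calls target_model.predict once per sample, so every training step pays the per-call Keras overhead 32 times on single-row inputs. The snippet below is a minimal, self-contained timing sketch (the toy network only mirrors your layer sizes and the 8-dimensional LunarLander state; it is an illustration, not your actual code) comparing per-sample calls with one batched call:

import time
import numpy as np
import tensorflow.keras as keras

# toy network with the same shapes as in the question: 8 state features, 4 actions
model = keras.Sequential([
    keras.layers.Dense(48, activation="relu", input_dim=8),
    keras.layers.Dense(24, activation="relu"),
    keras.layers.Dense(4),
])
model.compile(loss="mse", optimizer="adam")

states = np.random.rand(32, 8)

# 32 separate predict() calls, one per sample (what the loop in replay() does)
t0 = time.time()
for i in range(32):
    model.predict(states[i:i+1], verbose=0)
print("per-sample predict:", time.time() - t0)

# one predict() call over the whole minibatch
t0 = time.time()
model.predict(states, verbose=0)
print("batched predict:", time.time() - t0)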
Replace your replay method with the one below and see whether it runs faster:
def replay(self):
    batch_size = 32
    if len(self.memory) >= batch_size:
        # Draw a sample
        samples = random.sample(self.memory, batch_size)
        # Prepare the batch
        state, action, reward, new_state, done = zip(*samples)
        next_state = np.concatenate(new_state)
        done = np.array(done)[:,None]
        state = np.concatenate(state)
        reward = np.array(reward)[:,None]
        q_future = self.target_model.predict(next_state)
        targets = reward + self.gamma*np.max(q_future, axis=1, keepdims=True)*(1-done)
        # Fit the model
        self.model.fit(state, targets, epochs=1, verbose=0)
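A note on the design: because each stored state already has shape (1, 8), np.concatenate stacks the sampled states into a single (32, 8) batch, so target_model.predict runs once per training step instead of once per sample, and the target construction itself is pure numpy. The (1 - done) factor plays the role of the original if/else: for terminal transitions the bootstrap term is zeroed out and the target reduces to the reward alone.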