I am trying to add an LSTM layer to an AI model of mine that was previously working fine. After adding it, I get the following error while training on a batch.
Before the LSTM was added there was no such error and everything worked.
Input batch size 100 doesn't match hidden[0] batch size 1
I am using nn.LSTMCell.
Could someone please check whether I am missing some parameter when initializing my LSTMCell so that it can also accept batched input?
Here is my code…
import os
import time
import random
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.preprocessing import StandardScaler
import torch
import torch.nn as nn
import torch.nn.functional as F
from random import random as rndm
from torch.autograd import Variable
from collections import deque

os.chdir("C:\\Users\\granthjain\\Desktop\\startup_code")

torch.set_default_tensor_type('torch.DoubleTensor')


class ReplayBuffer(object):

    def __init__(self, max_size=1e6):
        self.storage = []
        self.max_size = max_size
        self.ptr = 0

    def add(self, transition):
        if len(self.storage) == self.max_size:
            self.storage[int(self.ptr)] = transition
        else:
            self.storage.append(transition)
        self.ptr = (self.ptr + 1) % self.max_size

    def sample(self, batch_size):
        ind = np.random.randint(0, self.ptr, size=batch_size)
        batch_states, batch_next_states, batch_actions, batch_rewards, batch_dones = [], [], [], [], []
        for i in ind:
            state, next_state, action, reward, done = self.storage[i]
            if state is None:
                continue
            elif next_state is None:
                continue
            elif action is None:
                continue
            elif reward is None:
                continue
            elif done is None:
                continue
            batch_states.append(np.array(state, copy=False))
            batch_next_states.append(np.array(next_state, copy=False))
            batch_actions.append(np.array(action, copy=False))
            batch_rewards.append(np.array(reward, copy=False))
            batch_dones.append(np.array(done, copy=False))
        return np.array(batch_states, dtype=object).astype(float), np.array(batch_next_states, dtype=object).astype(float), np.array(batch_actions, dtype=object).astype(float), np.array(batch_rewards, dtype=object).astype(float), np.array(batch_dones, dtype=object).astype(float)


class Actor(nn.Module):

    def __init__(self, state_dim, action_dim, max_action):
        super(Actor, self).__init__()
        self.lstm = nn.LSTMCell(state_dim, 256)
        self.layer_1 = nn.Linear(256, 400)
        self.layer_2 = nn.Linear(400, 300)
        self.layer_3 = nn.Linear(300, action_dim)
        self.hx = torch.zeros(1, 256)
        self.cx = torch.zeros(1, 256)
        self.max_action = max_action

    def forward(self, x):
        self.hx, self.cx = self.lstm(x, (self.hx, self.cx))
        x = F.relu(self.layer_1(self.hx))
        x = F.relu(self.layer_2(x))
        x = self.max_action * torch.tanh(self.layer_3(x))
        return x


class Critic(nn.Module):

    def __init__(self, state_dim, action_dim):
        super(Critic, self).__init__()
        # Defining the first Critic neural network
        self.lstm1 = nn.LSTMCell(state_dim + action_dim, 256)
        self.layer_1 = nn.Linear(256, 400)
        self.layer_2 = nn.Linear(400, 300)
        self.layer_3 = nn.Linear(300, 1)
        # Defining the second Critic neural network
        self.lstm2 = nn.LSTMCell(state_dim + action_dim, 256)
        self.layer_4 = nn.Linear(256, 400)
        self.layer_5 = nn.Linear(400, 300)
        self.layer_6 = nn.Linear(300, 1)
        self.hx1 = torch.zeros(1, 256)
        self.cx1 = torch.zeros(1, 256)
        self.hx2 = torch.zeros(1, 256)
        self.cx2 = torch.zeros(1, 256)

    def forward(self, x, u):
        xu = torch.cat([x, u], 1)
        # Forward-Propagation on the first Critic Neural Network
        self.hx1, self.cx1 = self.lstm(xu, (self.hx1, self.cx1))
        x1 = F.relu(self.layer_1(self.hx1))
        x1 = F.relu(self.layer_2(x1))
        x1 = self.layer_3(x1)
        # Forward-Propagation on the second Critic Neural Network
        self.hx2, self.cx2 = self.lstm(xu, (self.hx2, self.cx2))
        x2 = F.relu(self.layer_4(self.hx2))
        x2 = F.relu(self.layer_5(x2))
        x2 = self.layer_6(x2)
        return x1, x2

    def Q1(self, x, u):
        xu = torch.cat([x, u], 1)
        self.hx1, self.cx1 = self.lstm(xu, (self.hx1, self.cx1))
        x1 = F.relu(self.layer_1(self.hx1))
        x1 = F.relu(self.layer_2(x1))
        x1 = self.layer_3(x1)
        return x1


# Selecting the device (CPU or GPU)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


# Building the whole Training Process into a class
class TD3(object):

    def __init__(self, state_dim, action_dim, max_action):
        self.actor = Actor(state_dim, action_dim, max_action).to(device)
        self.actor_target = Actor(state_dim, action_dim, max_action).to(device)
        self.actor_target.load_state_dict(self.actor.state_dict())
        self.actor_optimizer = torch.optim.Adam(self.actor.parameters())
        self.critic = Critic(state_dim, action_dim).to(device)
        self.critic_target = Critic(state_dim, action_dim).to(device)
        self.critic_target.load_state_dict(self.critic.state_dict())
        self.critic_optimizer = torch.optim.Adam(self.critic.parameters())
        self.max_action = max_action

    def reset_hxcx(self):
        self.actor.cx = torch.zeros(1, 256)
        self.actor.hx = torch.zeros(1, 256)
        self.actor_target.cx = torch.zeros(1, 256)
        self.actor_target.hx = torch.zeros(1, 256)
        self.critic.cx1 = torch.zeros(1, 256)
        self.critic.cx2 = torch.zeros(1, 256)
        self.critic.hx1 = torch.zeros(1, 256)
        self.critic.hx2 = torch.zeros(1, 256)
        self.critic_target.cx1 = torch.zeros(1, 256)
        self.critic_target.cx2 = torch.zeros(1, 256)
        self.critic_target.hx1 = torch.zeros(1, 256)
        self.critic_target.hx2 = torch.zeros(1, 256)

    def select_action(self, state):
        print("state =", type(state))
        return self.actor(state).cpu().data.numpy().flatten()

    def train(self, replay_buffer, iterations, batch_size=50, discount=0.99, tau=0.005, policy_noise=0.2, noise_clip=0.5, policy_freq=2):
        for it in range(iterations):

            # Step 4: We sample a batch of transitions (s, s', a, r) from the memory
            batch_states, batch_next_states, batch_actions, batch_rewards, batch_dones = replay_buffer.sample(batch_size)
            batch_states = batch_states.astype(float)
            batch_next_states = batch_next_states.astype(float)
            batch_actions = batch_actions.astype(float)
            batch_rewards = batch_rewards.astype(float)
            batch_dones = batch_dones.astype(float)
            state = torch.from_numpy(batch_states)
            next_state = torch.from_numpy(batch_next_states)
            action = torch.from_numpy(batch_actions)
            reward = torch.from_numpy(batch_rewards)
            done = torch.from_numpy(batch_dones)

            # print("actor cx:", self.actor.cx)
            # print("actor hx:", self.actor.hx)
            # print("actor_target cx:", self.actor_target.cx)
            # print("actor_target cx:", self.actor_target.cx)
            # print("self.critic.cx1:", self.critic.cx1)
            # print("self.critic.cx2", self.critic.cx2)
            # print("self.critic.hx1:", self.critic.hx1)
            # print("self.critic.hx2:", self.critic.hx2)
            # print("self.critic_target.cx1:", self.critic_target.cx1)
            # print("self.critic_target.hx1", self.critic_target.hx1)
            # print("self.critic_target.cx2:", self.critic_target.cx2)
            # print("self.critic_target.hx2:", self.critic_target.hx2)

            # Step 5: From the next state s', the Actor target plays the next action a'
            next_action = self.actor_target(next_state)

            # Step 6: We add Gaussian noise to this next action a' and we clamp it in a range of values supported by the environment
            noise = torch.Tensor(batch_actions).data.normal_(0, policy_noise).to(device)
            noise = noise.clamp(-noise_clip, noise_clip)
            next_action = (next_action + noise).clamp(-self.max_action, self.max_action)

            # Step 7: The two Critic targets take each the couple (s', a') as input and return two Q-values Qt1(s',a') and Qt2(s',a') as outputs
            target_Q1, target_Q2 = self.critic_target(next_state, next_action)

            # Step 8: We keep the minimum of these two Q-values: min(Qt1, Qt2)
            target_Q = torch.min(target_Q1, target_Q2).double()

            # Step 9: We get the final target of the two Critic models, which is: Qt = r + γ * min(Qt1, Qt2), where γ is the discount factor
            done = done.resize_((done.shape[0], 1))
            reward = reward.resize_((reward.shape[0], 1))
            target_Q = reward + ((1 - done) * discount * target_Q).detach()

            # Step 10: The two Critic models take each the couple (s, a) as input and return two Q-values Q1(s,a) and Q2(s,a) as outputs
            current_Q1, current_Q2 = self.critic(state, action)

            # Step 11: We compute the loss coming from the two Critic models: Critic Loss = MSE_Loss(Q1(s,a), Qt) + MSE_Loss(Q2(s,a), Qt)
            critic_loss = F.mse_loss(current_Q1, target_Q) + F.mse_loss(current_Q2, target_Q)

            # Step 12: We backpropagate this Critic loss and update the parameters of the two Critic models with a SGD optimizer
            self.critic_optimizer.zero_grad()
            critic_loss.backward()
            self.critic_optimizer.step()

            # Step 13: Once every two iterations, we update our Actor model by performing gradient ascent on the output of the first Critic model
            if it % policy_freq == 0:
                actor_loss = -self.critic.Q1(state, self.actor(state)).mean()
                self.actor_optimizer.zero_grad()
                actor_loss.backward()
                self.actor_optimizer.step()

                # Step 14: Still once every two iterations, we update the weights of the Actor target by polyak averaging
                for param, target_param in zip(self.actor.parameters(), self.actor_target.parameters()):
                    target_param.data.copy_(tau * param.data + (1 - tau) * target_param.data)

                # Step 15: Still once every two iterations, we update the weights of the Critic target by polyak averaging
                for param, target_param in zip(self.critic.parameters(), self.critic_target.parameters()):
                    target_param.data.copy_(tau * param.data + (1 - tau) * target_param.data)

    # Making a save method to save a trained model
    def save(self, filename, directory):
        torch.save(self.actor.state_dict(), '%s/%s_actor.pth' % (directory, filename))
        torch.save(self.critic.state_dict(), '%s/%s_critic.pth' % (directory, filename))

    # Making a load method to load a pre-trained model
    def load(self, filename, directory):
        self.actor.load_state_dict(torch.load('%s/%s_actor.pth' % (directory, filename)))
        self.critic.load_state_dict(torch.load('%s/%s_critic.pth' % (directory, filename)))


# set the parameters
start_timesteps = 1e3  # Number of iterations/timesteps before which the model randomly chooses an action, and after which it starts to use the policy network
eval_freq = 5e1  # How often the evaluation step is performed (after how many timesteps)
max_timesteps = 5e3  # Total number of iterations/timesteps
save_models = True  # Boolean checker whether or not to save the pre-trained model
expl_noise = 0.1  # Exploration noise - STD value of exploration Gaussian noise
batch_size = 100  # Size of the batch
discount = 0.99  # Discount factor gamma, used in the calculation of the total discounted reward
tau = 0.005  # Target network update rate
policy_noise = 0.2  # STD of Gaussian noise added to the actions for the exploration purposes
noise_clip = 0.5  # Maximum value of the Gaussian noise added to the actions (policy)
policy_freq = 2  # Number of iterations to wait before the policy network (Actor model) is updated

state_dim = 3
action_dim = 3
max_action = 1
idx = 0


class env1:

    def __init__(self, state_dim, action_dim, data):
        self.state_dim = state_dim
        self.state = torch.zeros(self.state_dim)
        self.state[state_dim - 1] = 1000.0
        self.next_state = torch.zeros(self.state_dim)
        self.next_state[state_dim - 1] = 1000.0
        self.action_dim = action_dim
        self.data = data
        self.idx = 0
        self.count = 0
        self._max_episode_steps = 200
        self.state[1] = self.data[self.idx]
        self.next_state[1] = self.data[self.idx]

    def reset(self):
        self.next_state = torch.zeros(self.state_dim)
        self.next_state[state_dim - 1] = 1000.0
        self.state = torch.zeros(self.state_dim)
        self.state[state_dim - 1] = 1000.0
        self.state[1] = self.data[self.idx]
        self.next_state[1] = self.data[self.idx]
        self.count = 0
        ch = self.state[0]
        cp = self.state[1]
        cc = self.state[2]
        st = torch.tensor([ch, cp, cc])
        return st

    def step(self, action):
        done = False
        act_t = torch.argmax(action)
        self.idx += 1
        if act_t == 0:
            num_s = int(self.state[2] / self.state[1])
            self.next_state[0] += num_s
            self.next_state[2] = self.state[2] % self.state[1]
            self.next_state[1] = self.data[self.idx]
        elif act_t == 1:
            self.next_state[1] = self.data[self.idx]
        elif act_t == 2:
            self.next_state[2] = self.state[2] + self.state[1] * self.state[0]
            self.next_state[0] = 0
            self.next_state[1] = self.data[self.idx]
        reward = self.next_state[2] - self.state[2] + self.next_state[1] * self.next_state[0] - self.state[1] * self.state[0] - 1
        self.state[0] = self.next_state[0]
        self.state[1] = self.next_state[1]
        self.state[2] = self.next_state[2]
        ch = self.state[0]
        cp = self.state[1]
        cc = self.state[2]
        st = torch.tensor([ch, cp, cc])
        self.count = (self.count + 1) % 100
        if self.count == 0:
            done = True
        return st, reward, done


policy = TD3(state_dim, action_dim, max_action)

# Create the environment
data = pd.read_csv('PAGEIND.csv')
data = data['Close']
data = np.array(data).reshape(-1, 1)
max_timesteps = data.shape[0]
sc = StandardScaler()
data = sc.fit_transform(data)
data = torch.DoubleTensor(data)
env = env1(state_dim, action_dim, data)
replay_buffer = ReplayBuffer()

# init training variables
total_timesteps = 0
timesteps_since_eval = 0
episode_num = 0
done = True
t0 = time.time()

# We start the main loop over 500,000 timesteps
while total_timesteps < max_timesteps:

    # If the episode is done
    if done:

        # If we are not at the very beginning, we start the training process of the model
        if total_timesteps != 0:
            print("Total Timesteps: {} Episode Num: {} Reward: {}".format(total_timesteps, episode_num, episode_reward))
            policy.train(replay_buffer, episode_timesteps, batch_size, discount, tau, policy_noise, noise_clip, policy_freq)

        # When the training step is done, we reset the state of the environment
        obs = env.reset()
        policy.reset_hxcx()

        # Set the Done to False
        done = False

        # Set rewards and episode timesteps to zero
        episode_reward = 0
        episode_timesteps = 0
        episode_num += 1

    # Before 1000 timesteps, we play random actions
    if total_timesteps < 0.8 * max_timesteps:  # random action
        actn = torch.randn(action_dim)
        action = torch.zeros(action_dim)
        action[torch.argmax(actn)] = 1
    else:  # After 1000 timesteps, we switch to the model
        action = policy.select_action(torch.tensor(obs))
        # If the explore_noise parameter is not 0, we add noise to the action and we clip it
        if expl_noise != 0:
            print("policy action:", action)
            actn = (action + torch.randn(action_dim))
            action = torch.zeros(action_dim)
            action[torch.argmax(actn)] = 1

    # The agent performs the action in the environment, then reaches the next state and receives the reward
    new_obs, reward, done = env.step(action)

    # We check if the episode is done
    done_bool = 0 if episode_timesteps + 1 == env._max_episode_steps else float(done)

    # We increase the total reward
    episode_reward += reward

    # We store the new transition into the Experience Replay memory (ReplayBuffer)
    replay_buffer.add((obs, new_obs, action, reward, done_bool))

    # We update the state, the episode timestep, the total timesteps, and the timesteps since the evaluation of the policy
    obs = new_obs
    episode_timesteps += 1
    total_timesteps += 1
    timesteps_since_eval += 1
Answer:
If you initialize your cell state and hidden state with zeros, there is no need to provide them at all; it is done for you by default (see the documentation). However, if you decide to do it yourself, you should always take the batch size into account, which may be different on every iteration. A small illustration follows below.
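For instance, here is a minimal standalone sketch (not the asker's model; the 100 x 3 input is just an assumed example) showing that nn.LSTMCell falls back to zero states sized to the batch when none are passed:

import torch
import torch.nn as nn

cell = nn.LSTMCell(input_size=3, hidden_size=256)
x = torch.randn(100, 3)    # an assumed batch of 100 states, 3 features each

# No (hx, cx) passed: both default to zeros with the matching batch size.
hx, cx = cell(x)
print(hx.shape, cx.shape)  # torch.Size([100, 256]) torch.Size([100, 256])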
Finally, the cell state and hidden state of nn.LSTMCell have shape (batch_size, hidden_size), while you initialize them only once in the constructor with shape (1, hidden_size). You need to move that initialization into forward() and take the batch size from x on every call, which is simply x.shape[0] (see the sketch below).
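As a rough sketch of that idea (one possible adaptation of the Actor above, not the only way to do it), forward() could build fresh zero states sized to the incoming batch. Note that this recreates the states on every call instead of carrying them across timesteps, so you would still need to decide how to keep state within an episode:

import torch
import torch.nn as nn
import torch.nn.functional as F

class Actor(nn.Module):
    def __init__(self, state_dim, action_dim, max_action):
        super(Actor, self).__init__()
        self.lstm = nn.LSTMCell(state_dim, 256)
        self.layer_1 = nn.Linear(256, 400)
        self.layer_2 = nn.Linear(400, 300)
        self.layer_3 = nn.Linear(300, action_dim)
        self.max_action = max_action

    def forward(self, x):
        # Create hidden/cell states that match the current batch size.
        hx = torch.zeros(x.shape[0], 256, dtype=x.dtype, device=x.device)
        cx = torch.zeros(x.shape[0], 256, dtype=x.dtype, device=x.device)
        # Equivalent to self.lstm(x), since zero states are the default.
        hx, cx = self.lstm(x, (hx, cx))
        x = F.relu(self.layer_1(hx))
        x = F.relu(self.layer_2(x))
        return self.max_action * torch.tanh(self.layer_3(x))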
By the way, you are using nn.LSTMCell, which performs just a single cell step. Using it only once doesn't really make much sense, so make sure this is what you are after. Maybe nn.LSTM instead? A sketch follows below.
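If you do switch, a minimal sketch of nn.LSTM on batched sequences would look something like this (the sequence length of 20 is just an assumed example); hidden and cell states again default to zeros of the right batch size when you don't pass them:

import torch
import torch.nn as nn

lstm = nn.LSTM(input_size=3, hidden_size=256, batch_first=True)

x = torch.randn(100, 20, 3)  # assumed: 100 sequences, 20 timesteps, 3 features

output, (hn, cn) = lstm(x)   # no (h0, c0) passed: zeros are used by default
print(output.shape)          # torch.Size([100, 20, 256]) - hidden state at every timestep
print(hn.shape)              # torch.Size([1, 100, 256])  - final hidden state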