I'm having some trouble finding examples of how to integrate a recurrent neural network with LSTM layers into my existing PyTorch deep Q-network so that it becomes a DRQN. Please bear with me, I'm only just getting started. Also, I'm not doing any image processing and therefore don't need a CNN, so no need to worry about that. My states are simply temperature values.
Here is the code I'm currently using to train the DQN:
# Importing the libraries
import numpy as np
import random  # random samples from different batches (experience replay)
import os  # for loading and saving the brain
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim  # for using stochastic gradient descent
import torch.autograd as autograd  # conversion from tensors (advanced arrays) to variables that contain a gradient
# We want to put the tensor into a variable that will also contain a
# gradient, and for this we need:
from torch.autograd import Variable
# to convert this tensor into a variable containing the tensor and the gradient

# Creating the architecture of the Neural Network
class Network(nn.Module):  # inheriting from nn.Module
    # self refers to the object that will be created from this class
    def __init__(self, input_size, nb_action):  # [self, input neurons, output neurons]
        super(Network, self).__init__()  # in order to use modules in torch.nn
        # Input and output neurons
        self.input_size = input_size
        self.nb_action = nb_action
        # Full connections between the different layers of the NN
        # In this example it's one input layer, one hidden layer and one output layer
        # Using self here to specify that fc1 is a variable of my object
        self.fc1 = nn.Linear(input_size, 40)
        self.fc2 = nn.Linear(40, 30)
        # Example of adding a hidden layer
        # self.fcX = nn.Linear(30, 30)
        self.fc3 = nn.Linear(30, nb_action)  # 30 neurons in the hidden layer

    # Function that activates the neurons and performs forward propagation
    def forward(self, state):
        # rectifier function
        x = F.relu(self.fc1(state))
        x = F.relu(self.fc2(x))
        q_values = self.fc3(x)
        return q_values

# Implementing Experience Replay
# We know that RL is based on an MDP,
# so going from one state (s_t) to the next state (s_t+1).
# We are going to put 100 transitions between states into what we call the memory,
# so we can use the distribution of experience to make a decision.
class ReplayMemory(object):
    def __init__(self, capacity):
        self.capacity = capacity  # 100 transitions
        self.memory = []  # memory to save transitions

    # pushing transitions into memory with append
    # event = transition
    def push(self, event):
        self.memory.append(event)
        if len(self.memory) > self.capacity:  # memory only contains `capacity` events
            del self.memory[0]  # delete the first transition from memory if there are more than that

    # taking a random sample
    def sample(self, batch_size):
        # Creating a variable that will contain the samples of memory
        # zip = reshape function: if list = ((1,2,3),(4,5,6)) then zip(*list) = (1,4),(2,5),(3,6)
        # (state, action, reward), (state, action, reward)
        samples = zip(*random.sample(self.memory, batch_size))
        # This is to be able to differentiate with respect to a tensor,
        # and this will then contain the tensor and the gradient,
        # so state, action and reward are stored separately into batches
        # which each get a gradient, so that eventually we'll be able to
        # differentiate each one of them
        return map(lambda x: Variable(torch.cat(x, 0)), samples)

# Implementing Deep Q-Learning
class Dqn():
    def __init__(self, input_size, nb_action, gamma, lrate, T):
        self.gamma = gamma  # self.gamma gets assigned the input argument
        self.T = T
        # Sliding window of the evolving mean of the last 100 events/transitions
        self.reward_window = []
        # Creating the network with the Network class
        self.model = Network(input_size, nb_action)
        # Creating the memory with the ReplayMemory class.
        # We are going to store 100000 transitions in memory and then sample from
        # this memory to get a small number of random transitions
        self.memory = ReplayMemory(100000)
        # creating the optimizer (stochastic gradient descent)
        self.optimizer = optim.Adam(self.model.parameters(), lr = lrate)  # learning rate
        # Input vector which is a batch of input observations.
        # With unsqueeze we create a fake dimension, which is what the network
        # expects for its inputs: the batch has to be the first dimension of last_state
        self.last_state = torch.Tensor(input_size).unsqueeze(0)
        # Initializing
        self.last_action = 0
        self.last_reward = 0

    def select_action(self, state):
        # The Q-value depends on the state.
        # Temperature parameter T is a positive number; the closer it is to zero,
        # the less sure the NN will be when taking an action.
        # For example:
        # softmax((1,2,3)) = {0.04, 0.11, 0.85}  ==>  softmax((1,2,3)*3) = {0, 0.02, 0.98}
        # To deactivate the brain, set T = 0; then it is fully random
        probs = F.softmax((self.model(Variable(state, volatile = True)) * self.T), dim = 1)  # T = 100
        # create a random draw from the probability distribution created by the softmax
        action = probs.multinomial()
        print(probs.multinomial())
        return action.data[0, 0]

    # See section 5.3 in the AI handbook
    def learn(self, batch_state, batch_next_state, batch_reward, batch_action):
        outputs = self.model(batch_state).gather(1, batch_action.unsqueeze(1)).squeeze(1)
        # next input for the target, see page 7 in the attached AI handbook
        next_outputs = self.model(batch_next_state).detach().max(1)[0]
        target = self.gamma * next_outputs + batch_reward
        # Using the Huber loss to obtain the loss
        td_loss = F.smooth_l1_loss(outputs, target)
        # Using the last loss/error to perform stochastic gradient descent and update the weights
        self.optimizer.zero_grad()  # reinitialize the optimizer at each iteration of the loop
        # This line backpropagates the error into the NN
        # td_loss.backward(retain_variables = True)  # user warning
        td_loss.backward(retain_graph = True)
        # And this line uses the optimizer to update the weights
        self.optimizer.step()

    def update(self, reward, new_signal):
        # Update one transition; the last element of the transition is the new state
        new_state = torch.Tensor(new_signal).float().unsqueeze(0)
        self.memory.push((self.last_state, new_state, torch.LongTensor([int(self.last_action)]), torch.Tensor([self.last_reward])))
        # After ending up in a state, it's time to play an action
        action = self.select_action(new_state)
        if len(self.memory.memory) > 100:
            batch_state, batch_next_state, batch_action, batch_reward = self.memory.sample(100)
            self.learn(batch_state, batch_next_state, batch_reward, batch_action)
        self.last_action = action
        self.last_state = new_state
        self.last_reward = reward
        self.reward_window.append(reward)
        if len(self.reward_window) > 1000:
            del self.reward_window[0]
        return action

    def score(self):
        return sum(self.reward_window) / (len(self.reward_window) + 1.)

    def save(self):
        torch.save({'state_dict': self.model.state_dict(),
                    'optimizer': self.optimizer.state_dict(),
                    }, 'last_brain.pth')

    def load(self):
        if os.path.isfile('last_brain.pth'):
            print("=> loading checkpoint... ")
            checkpoint = torch.load('last_brain.pth')
            self.model.load_state_dict(checkpoint['state_dict'])
            self.optimizer.load_state_dict(checkpoint['optimizer'])
            print("done !")
        else:
            print("no checkpoint found...")
I hope someone can help me integrate RNN and LSTM layers into my code! I believe in you, Stack Overflow!
Best regards, [name hidden]
Answer:
In my opinion, you can add the RNN and LSTM layers in Network#__init__ and Network#forward; the data would have to be reshaped into sequences...
For more details, read the following two articles; after that, implementing the RNN and LSTM is not as hard as it seems. A rough sketch of the idea is given after the link below.
http://pytorch.org/tutorials/intermediate/char_rnn_classification_tutorial.html
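To make that concrete, here is a minimal sketch of what the Network class could look like with an LSTM layer in the middle. It is only an illustration, not the answer's or the tutorial's code: it targets a recent PyTorch API (plain tensors, no Variable), and the class name DRQNetwork, the hidden size of 30, the layer sizes and batch_first=True are my own assumptions.

import torch
import torch.nn as nn
import torch.nn.functional as F

class DRQNetwork(nn.Module):
    """Sketch of the original Network with an LSTM between the linear layers."""
    def __init__(self, input_size, nb_action, hidden_size=30):
        super(DRQNetwork, self).__init__()
        self.input_size = input_size
        self.nb_action = nb_action
        self.hidden_size = hidden_size
        # Same role as fc1 in the original Network
        self.fc1 = nn.Linear(input_size, 40)
        # LSTM over the time dimension; with batch_first=True the expected
        # input shape is (batch, sequence_length, features)
        self.lstm = nn.LSTM(input_size=40, hidden_size=hidden_size, batch_first=True)
        # Maps the LSTM output of the last time step to one Q-value per action
        self.fc2 = nn.Linear(hidden_size, nb_action)

    def forward(self, state, hidden=None):
        # state: (batch, sequence_length, input_size)
        # hidden: tuple (h_0, c_0) from the previous call, or None to start fresh
        x = F.relu(self.fc1(state))
        x, hidden = self.lstm(x, hidden)   # x: (batch, sequence_length, hidden_size)
        q_values = self.fc2(x[:, -1, :])   # Q-values from the last time step only
        return q_values, hidden

# Usage example: one sequence of 8 consecutive temperature readings (input_size = 1)
net = DRQNetwork(input_size=1, nb_action=3)
sequence = torch.randn(1, 8, 1)           # (batch, sequence_length, features)
q_values, hidden = net(sequence)          # pass `hidden` back in on the next call
action = q_values.argmax(dim=1)

Note that the rest of the Dqn class would have to change accordingly: select_action would need to carry the hidden state (or a short window of the most recent temperature readings) between calls, and ReplayMemory would need to store short sequences of transitions instead of single ones, which is the "reshape into sequences" part of the answer above.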