I am trying to apply Q-learning to my custom reinforcement learning environment, which simulates energy storage arbitrage (trading electricity with a battery: charging when prices are low and discharging when they rise). The environment itself runs, but I cannot get Q-learning working on top of it. The script below runs the environment, but I do not know how the state variable should be defined. Any suggestions on how to apply Q-learning to optimize the charge/discharge cycles? The reset function starts the simulation of the next day from a dataset containing hourly electricity prices. A picture of the dataframe is shown below.
import gym
import numpy as np
from gym import spaces

class BatteryEnv(gym.Env):
    def __init__(self, df):
        self.dict_actions = {0: 'discharge', 1: 'charge', 2: 'wait'}
        self.df = df  # dataframe with hourly prices and day numbers
        self.action_space = spaces.Discrete(3)
        self.observation_space = spaces.Box(low=0, high=100, shape=(1, 1))
        self.reward_list = []
        self.actual_load_list = []  # observations
        self.SOE_list = []          # state of energy
        self.state_idx = 0          # iteration (hour of the day)
        self.SOE = 0                # state of energy
        self.MAX_charge = 20        # C-rate of sorts
        self.Capacity = 100

    def step(self, action):
        # map the integer action to a string for the actual load calculation
        str_action = self.dict_actions[action]
        # advance the state index within the episode (1 step = 1 hour)
        self.state_idx += 1
        # calculate our actual load
        if str_action == 'charge' and self.SOE < self.Capacity:
            SOE_charge = np.clip(self.Capacity - self.SOE, 0, self.MAX_charge)
            self.SOE += SOE_charge
            obs = SOE_charge * self.df['prices'][self.state_idx]
        elif str_action == 'discharge' and self.SOE > 0:
            SOE_discharge = np.clip(self.SOE, 0, self.MAX_charge)
            self.SOE -= SOE_discharge
            obs = -SOE_discharge * self.df['prices'][self.state_idx]
        else:
            self.SOE += 0
            obs = 0 * self.df['prices'][self.state_idx]
        # append the actual load to a list for monitoring and comparison purposes
        self.actual_load_list.append(obs)
        self.SOE_list.append(self.SOE)
        # reward system: a positive load means we are spending money, a negative one means we are earning
        if obs < 0:
            reward = 1
        else:
            reward = -1
        # append the current reward to a list for monitoring and comparison purposes
        self.reward_list.append(reward)
        # check whether our episode (one-day interval) ends
        if self.df.iloc[self.state_idx, :].Daynum != self.df.iloc[self.state_idx - 1].Daynum:
            done = True
        else:
            done = False
        return obs, reward, done

    def reset(self):
        # continue from the current index, i.e. start the next day's simulation
        return self.df.iloc[self.state_idx, :]

    def render(self):
        pass
The code below demonstrates that the environment works.
env = BatteryEnv(df)
for episode in range(7):
    observation = env.reset()
    for t in range(24):  # can't be smaller than 24, as 24 time points equal 1 episode (1 day)
        # print(observation)
        action = env.action_space.sample()  # random actions
        observation, reward, done = env.step(action)
        if done:
            print("Episode finished after {} timesteps".format(t + 1))
            print(observation)
            print(reward)
            break
Answer:
I think I have managed to get the code working with Q-learning to some extent (a sketch of the kind of training loop I mean follows the class below). However, the reward and reset functions need further refinement to get better performance.
class BatteryEnv(gym.Env):
    def __init__(self, prices=np.array(df.prices), daynum=np.array(df.Daynum)):
        # self.df = df
        self.prices = prices
        self.daynum = daynum
        self.dict_actions = {0: 'discharge', 1: 'charge', 2: 'wait'}
        self.action_space = spaces.Discrete(3)
        # our observation space is just one float value - our load
        self.observation_space = spaces.Box(low=0, high=100, shape=(1, 1))
        # lists for monitoring
        self.reward_list = []
        self.actual_load_list = []
        self.SOE_list = []  # state of energy
        self.chargio = []   # charge & discharge
        self.SOEe = []      # state of energy
        # index of the current state within the current episode
        self.state_idx = 0   # iteration
        self.SOE = 0         # state of energy
        self.MAX_charge = 20 # C-rate of sorts
        self.Capacity = 100
        self.state = 0       # the state exposed to Q-learning: the current SOE

    def step(self, action):
        # map the integer action to a string for the actual load calculation
        str_action = self.dict_actions[action]
        # advance the state index within the episode (day)
        self.state_idx += 1
        # calculate our actual load; keep self.state in sync with self.SOE
        if str_action == 'charge' and self.SOE < self.Capacity:
            SOE_charge = np.clip(self.Capacity - self.SOE, 0, self.MAX_charge)
            self.SOE += SOE_charge
            self.state = self.SOE
            self.SOEe.append(self.SOE)
            self.chargio.append(SOE_charge)
            obs = SOE_charge * self.prices[self.state_idx]
        elif str_action == 'discharge' and self.SOE > 0:
            SOE_discharge = np.clip(self.SOE, 0, self.MAX_charge)
            self.SOE -= SOE_discharge
            self.state = self.SOE
            self.SOEe.append(self.SOE)
            self.chargio.append(-SOE_discharge)
            obs = -SOE_discharge * self.prices[self.state_idx]
        else:
            self.chargio.append(0)
            self.SOEe.append(self.SOE)
            obs = 0
        # append the actual load to a list for monitoring and comparison purposes
        self.actual_load_list.append(obs)
        self.SOE_list.append(self.SOE)
        # reward system: a positive load means we are spending money, a negative one means we are earning
        if obs < 0:
            reward = 1
        else:
            reward = -1
        # append the current reward to a list for monitoring and comparison purposes
        self.reward_list.append(reward)
        # check whether our episode (one-day interval) ends
        if self.daynum[self.state_idx] != self.daynum[self.state_idx - 1]:
            done = True
        else:
            done = False
        info = {
            # 'step': self.state_idx,
            'SOE': self.SOE,
            # 'reward': reward,
            'chargio': self.chargio
        }
        return obs, reward, done, info

    def reset(self):
        self.state = 0
        self.SOE = 0  # reset the battery so the state and the SOE stay in sync
        return self.state

    def render(self):
        pass
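For reference, here is a minimal sketch of the kind of tabular Q-learning loop I mean. It is an illustration under assumptions rather than polished training code: since the SOE only moves in multiples of MAX_charge (20), SOE // 20 indexes six discrete states; the next state is read from info['SOE'] because step returns the load as obs; and alpha, gamma, epsilon, and the episode count are placeholder hyperparameters.

import numpy as np

n_states = 6   # SOE in {0, 20, 40, 60, 80, 100} -> SOE // 20 gives indices 0..5
n_actions = 3  # discharge, charge, wait
Q = np.zeros((n_states, n_actions))
alpha, gamma, epsilon = 0.1, 0.99, 0.1  # illustrative hyperparameters

env = BatteryEnv()
for episode in range(7):  # bounded by the number of days, since reset does not rewind state_idx
    state = int(env.reset() // 20)
    done = False
    while not done:
        # epsilon-greedy action selection
        if np.random.rand() < epsilon:
            action = env.action_space.sample()
        else:
            action = int(np.argmax(Q[state]))
        obs, reward, done, info = env.step(action)
        # step returns the load as obs, so take the next state from info
        next_state = int(info['SOE'] // 20)
        # standard one-step Q-learning update
        Q[state, action] += alpha * (reward + gamma * np.max(Q[next_state]) - Q[state, action])
        state = next_state

A richer state such as the pair (hour of day, SOE) would let the agent pick up time-of-day price patterns, and replacing the fixed +/-1 reward with something proportional to the money earned (for example reward = -obs) should give the learner a stronger signal; both are candidates for the improvements mentioned above.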