I have the following code, and every time I run it I get the error '>=' not supported between instances of 'int' and 'str', raised from env.step() in gym. It seems the terminated value is what triggers the error, but I cannot find the actual cause:
%matplotlib notebook
import gym
import time
import matplotlib.pyplot as plt
import numpy as np
from IPython.display import clear_output

env = gym.make("MountainCar-v0", 'rgb_array')
env.reset()

def create_bins(num_bins_per_observation):
    # CODE HERE
    car_velocity = np.linspace(-0.07, 0.07, num_bins_per_observation)  # based off highest and lowest possible values
    car_position = np.linspace(-1.2, 0.6, num_bins_per_observation)  # run the above loop and see a reasonable range for velocity as it can be -inf - inf
    bins = np.array([car_position, car_velocity])
    return bins

NUM_BINS = 10
BINS = create_bins(NUM_BINS)

def discretize_observation(observations, bins):
    binned_observations = []
    for i, observation in enumerate(observations):
        discretized_observation = np.digitize(observation, bins[i])
        binned_observations.append(discretized_observation)
    return tuple(binned_observations)  # Important for later indexing

# CREATE THE Q TABLE
q_table_shape = (NUM_BINS, NUM_BINS, env.action_space.n)
q_table = np.zeros(q_table_shape)

def epsilon_greedy_action_selection(epsilon, q_table, discrete_state):
    if np.random.random() > epsilon:
        action = np.argmax(q_table[discrete_state])
    else:
        action = np.random.randint(0, env.action_space.n)
    return action

def compute_next_q_value(old_q_value, reward, next_optimal_q_value):
    return old_q_value + ALPHA * (reward + GAMMA * next_optimal_q_value - old_q_value)

def reduce_epsilon(epsilon, epoch):
    if BURN_IN <= epoch <= EPSILON_END:
        epsilon -= EPSILON_REDUCE
    return epsilon

EPOCHS = 30000
BURN_IN = 100
epsilon = 1
EPSILON_END = 10000
EPSILON_REDUCE = 0.0001
ALPHA = 0.8
GAMMA = 0.9

log_interval = 100  # How often do we update the plot? (Just for performance reasons)

### Here we set up the routine for the live plotting of the achieved points ######
fig = plt.figure()
ax = fig.add_subplot(111)
plt.ion()
fig.canvas.draw()
##################################################################################

max_position_log = []  # to store all achieved points
mean_positions_log = []  # to store a running mean of the last 30 results
epochs = []  # store the epoch for plotting

for epoch in range(EPOCHS):
    # TODO: Get initial observation and discretize them. Set done to False
    initial_state = env.reset()[0]  # get the initial observation
    discretized_state = discretize_observation(initial_state, BINS)  # map the observation to the bins
    done = False  # to stop current run when the car reaches the top or the time limit is reached
    max_position = -np.inf  # for plotting
    epochs.append(epoch)

    # TODO: As long as current run is alive (i.e not done) perform the following steps:
    while not done:  # Perform current run as long as done is False (as long as there is still time to reach the top)
        # TODO: Select action according to epsilon-greedy strategy
        action = epsilon_greedy_action_selection(epsilon, q_table, discretized_state)  # Epsilon-Greedy Action Selection

        # TODO: Perform selected action and get next state. Do not forget to discretize it
        next_state, reward, done, test, info = env.step(action)  # perform action and get next state
        position, velocity = next_state
        next_state_discretized = discretize_observation(next_state, BINS)  # map the next observation to the bins

        # TODO: Get old Q-value from Q-Table and get next optimal Q-Value
        old_q_value = q_table[discretized_state + (action,)]  # get the old Q-Value from the Q-Table
        next_optimal_q_value = np.max(q_table[next_state_discretized])  # Get the next optimal Q-Value

        # TODO: Compute next Q-Value and insert it into the table
        next_q = compute_next_q_value(old_q_value, reward, next_optimal_q_value)  # Compute next Q-Value
        q_table[discretized_state + (action,)] = next_q  # Insert next Q-Value into the table

        # TODO: Update the old state with the new one
        discretized_state = next_state_discretized  # Update the old state with the new one

        if position > max_position:  # Only for plotting the results - store the highest point the car is able to reach
            max_position = position

    # TODO: Reduce epsilon
    epsilon = reduce_epsilon(epsilon, epoch)  # Reduce epsilon
    ##############################################################################

    max_position_log.append(max_position)  # log the highest position the car was able to reach
    running_mean = round(np.mean(max_position_log[-30:]), 2)  # Compute running mean of position over the last 30 epochs
    mean_positions_log.append(running_mean)  # and log it

    ################ Plot the points and running mean ##################
    if epoch % log_interval == 0:
        ax.clear()
        ax.scatter(epochs, max_position_log)
        ax.plot(epochs, max_position_log)
        ax.plot(epochs, mean_positions_log, label=f"Running Mean: {running_mean}")
        plt.legend()
        fig.canvas.draw()
    ######################################################################

env.close()
This is the full error message I get in my Jupyter notebook:
---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
/var/folders/jn/59brf9ps68b366pxgyt4hpfw0000gn/T/ipykernel_55458/601254501.py in <module>
     29         action = epsilon_greedy_action_selection(epsilon, q_table, discretized_state) # Epsilon-Greedy Action Selection
     30         # TODO: Perform selected action and get next state. Do not forget to discretize it
---> 31         next_state, reward, done, test, info = env.step(action) # perform action and get next state
     32         position, velocity = next_state
     33         next_state_discretized = discretize_observation(next_state, BINS) # map the next observation to the bins

~/anaconda3/envs/ai_env/lib/python3.7/site-packages/gym/wrappers/time_limit.py in step(self, action)
     51         self._elapsed_steps += 1
     52
---> 53         if self._elapsed_steps >= self._max_episode_steps:
     54             truncated = True
     55

TypeError: '>=' not supported between instances of 'int' and 'str'
Answer:
The error message points at the cause. In

env = gym.make("MountainCar-v0", 'rgb_array')

the string 'rgb_array' is passed as the second positional argument of gym.make, which is max_episode_steps, not the render mode. The TimeLimit wrapper therefore stores that string as its episode-step limit, and the comparison self._elapsed_steps >= self._max_episode_steps shown in the traceback ends up comparing an int with a str, which raises the TypeError.
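A minimal sketch of the fix, assuming the gym 0.26+ API (the five-value return of env.step() in the question suggests that version): pass the render mode as the keyword argument render_mode so it is not swallowed by max_episode_steps.

import gym

# Pass the render mode by keyword so it cannot be mistaken for
# max_episode_steps, gym.make's second positional parameter.
env = gym.make("MountainCar-v0", render_mode="rgb_array")

observation, info = env.reset()  # gym 0.26+ reset returns (obs, info)
next_state, reward, terminated, truncated, info = env.step(env.action_space.sample())
done = terminated or truncated   # an episode ends on either flag
env.close()

Note also that in the question's loop the variable done only receives the terminated flag (the truncated flag lands in test), so combining the two flags as above is what lets the while loop stop when the time limit is reached.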