I wrote a very simple game that works as follows:
Given a 4×4 field of squares, a player can move around it (up, right, down, or left).
- Stepping onto a square the agent has never visited before gives a reward of 1.
- Stepping into a "death zone" gives a reward of -5, and the game is reset.
- Moving onto a square that has already been visited gives a reward of -1.
- Stepping onto the "win zone" (there is exactly one of them) gives a reward of 5, and the game is reset as well.
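In code, the reward scheme boils down to a simple lookup; the dictionary below is only an illustration (my actual reward logic lives in the step() function further down):

# purely illustrative mapping of outcome -> reward; not part of my script
rewards = {
    'unvisited': 1,   # new square
    'visited':  -1,   # square visited before (or a blocked move against the border)
    'death':    -5,   # "death zone", game resets
    'win':       5,   # "win zone", game resets
}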
Now I want an AI to learn to play this game via Q-learning.
How I organized the input / feature engineering:
The input to the network is an array of shape 1×4, where arr[0] represents the field above the player (the one reached by moving up), arr[1] the field to the right, arr[2] the field below, and arr[3] the field to the left.
The values the array can hold: 0, 1, 2, 3
0 = "death zone", i.e. the worst case
1 = a field that would be outside the 4×4 grid (so you cannot go there) or that has already been visited
2 = an unvisited field (so that is a good thing)
3 = the "win zone", i.e. the best case
As you can see, I ordered them by their reward.
Since the game takes its actions in the same form (0 = move up, 1 = move right, 2 = move down, 3 = move left), the only thing the AI basically has to learn is: pick the index of the array that holds the largest value.
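As a concrete illustration (made-up numbers, not taken from my script): if the field above the player is unvisited, the one to the right is a death zone, the one below was already visited, and the win zone lies to the left, the input is [2, 0, 1, 3] and the ideal policy simply picks index 3, i.e. moves left:

import numpy as np

state = np.array([2, 0, 1, 3])       # up = unvisited, right = death zone, down = visited, left = win zone
best_action = int(np.argmax(state))  # 3 -> move left, straight into the win zone
print(best_action)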
But unfortunately it does not work: the target Q values that are fed into the neural network keep getting larger and larger, until they end up as NaN.
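For reference, the target my training loop builds for the chosen action is the usual Q-learning target r + y * max Q(s'); here is a minimal, self-contained sketch with made-up numbers (in my real script below, allQ, Q1, and r come from the network and the game):

import numpy as np

y = 0.99                                 # discount factor, called y in my script
r = 1.0                                  # example reward for an unvisited field
a = 2                                    # example action that was taken
allQ = np.array([[0.2, 0.1, 0.4, 0.3]])  # made-up Q values of the current state
Q1 = np.array([[0.5, 0.2, 0.1, 0.3]])    # made-up Q values of the next state

targetQ = allQ.copy()
targetQ[0, a] = r + y * np.max(Q1)       # only the chosen action's entry is changed
print(targetQ)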
Here is my code (including the game at the beginning):
import numpy as np
import random
import tensorflow as tf
import matplotlib.pyplot as plt
from time import sleep

episoden = 0
felder = []
schon_besucht = []
playerx = 0
playery = 0
grafik = False

def gib_zustand():
    # special feature engineering:
    # the input consists of just one direction, one-hot encoded; i.e. 4 input neurons
    # (ember, wall/visited, unvisited, win)
    #
    # it is the direction that is to be evaluated (so 1 output neuron for a direction)
    # return value here: array, shape: 4x4 (see above)
    global playerx
    global playery

    # up
    if playery == 0:
        oben = 1
    else:
        oben = felder[playery-1][playerx]

    # right
    if playerx == 4:
        rechts = 1
    else:
        rechts = felder[playery][playerx+1]

    # down
    if playery == 4:
        unten = 1
    else:
        unten = felder[playery+1][playerx]

    # left
    if playerx == 0:
        links = 1
    else:
        links = felder[playery][playerx-1]

    return np.array([oben, rechts, unten, links])

def grafisch():
    if grafik:
        # encoding:
        # ember = G, visited = b, unvisited = ' ', win = S, player = X
        global felder
        global playerx
        global playery
        print('')
        for y in range(0,5):
            print('|', end='')
            for x in range(0,5):
                if felder[y][x] == 0:
                    temp = 'G'
                if felder[y][x] == 1:
                    temp = 'b'
                if felder[y][x] == 2:
                    temp = ' '
                if felder[y][x] == 3:
                    temp = 'S'
                if y == playery and x == playerx:
                    temp = 'X'
                print(temp, end='')
            print('|', end='')
            print('')

def reset():
    print('--- RESET ---')
    global playery
    global playerx
    global felder
    global schon_besucht
    playerx = 1
    playery = 3

    # layout
    # ember = 0, wall/visited = 1, unvisited = 2, win = 3
    felder = [[2 for x in range(0,5)] for y in range(0,5)]

    # place ember twice
    gl1 = random.randint(1,3)
    gl1_1 = random.randint(2,3) if gl1==3 else (random.randint(1,2) if gl1==1 else random.randint(1,3))
    felder[gl1][gl1_1] = 0 # ember

    # second time
    gl1 = random.randint(1,3)
    gl1_1 = random.randint(2,3) if gl1==3 else (random.randint(1,2) if gl1==1 else random.randint(1,3))
    felder[gl1][gl1_1] = 0 # ember

    # pudding (the win zone)
    felder[1][3] = 3

    # clear the visited list
    schon_besucht = []

    grafisch()

    return gib_zustand()

def step(zug):
    # 0 = up, 1 = right, 2 = down, 3 = left
    global playerx
    global playery
    global felder
    global schon_besucht
    if zug == 0:
        if playery != 0:
            playery -= 1
    if zug == 1:
        if playerx != 4:
            playerx += 1
    if zug == 2:
        if playery != 4:
            playery += 1
    if zug == 3:
        if playerx != 0:
            playerx -= 1

    # fetch the reward
    wert = felder[playery][playerx]
    if wert==0:
        belohnung = -5
    if wert==1:
        belohnung = -1
    if wert==2:
        belohnung = 1
    if wert==3:
        belohnung = 5

    # record the square if the game was not lost
    if belohnung != -5:
        schon_besucht.append((playery,playerx))
        felder[playery][playerx] = 1

    grafisch()

    return gib_zustand(), belohnung, belohnung==5, 0 # 0 just so the return signature fits

episoden = 0

tf.reset_default_graph()

# These lines establish the feed-forward part of the network used to choose actions
inputs1 = tf.placeholder(shape=[1,4],dtype=tf.float32)
#W1 = tf.Variable(tf.random_uniform([16,8],0,0.01))
W2 = tf.Variable(tf.random_uniform([4,4],0,0.01))
#schicht2 = tf.matmul(inputs1,W1)
Qout = tf.matmul(inputs1,W2)
predict = tf.argmax(Qout,1)

# Below we obtain the loss by taking the sum of squares difference between the target and prediction Q values.
nextQ = tf.placeholder(shape=[1,4],dtype=tf.float32)
loss = tf.reduce_sum(tf.square(nextQ - Qout))
trainer = tf.train.GradientDescentOptimizer(learning_rate=0.1)
updateModel = trainer.minimize(loss)

init = tf.initialize_all_variables()

# Set learning parameters
y = .99
e = 0.1
num_episodes = 10_000
# create lists to contain total rewards and steps per episode
jList = []
rList = []
with tf.Session() as sess:
    sess.run(init)
    for i in range(num_episodes):
        # Reset environment and get first new observation
        s = reset()
        rAll = 0
        d = False
        j = 0
        # The Q-Network
        while j < 99:
            j += 1
            # Choose an action greedily (with chance e of a random action) from the Q-network
            a,allQ = sess.run([predict,Qout],feed_dict={inputs1:s.reshape(1,4)}) # computes the prediction for the input (the input appears to be one-hot encoded here)
            if np.random.rand(1) < e:
                a[0] = random.randint(0,3)
            # Get new state and reward from environment
            s1,r,d,_ = step(a[0])
            # Obtain the Q' values by feeding the new state through our network
            Q1 = sess.run(Qout,feed_dict={inputs1:s1.reshape(1,4)})
            # Obtain maxQ' and set our target value for the chosen action.
            maxQ1 = np.max(Q1)
            targetQ = allQ
            targetQ[0,a[0]] = r + y*maxQ1
            # Train our network using target and predicted Q values
            _,W1 = sess.run([updateModel,W2],feed_dict={inputs1:s.reshape(1,4),nextQ:targetQ})
            rAll += r
            s = s1
            if r == -5 or r == 5:
                if r == 5:
                    episoden += 1
                reset()
                # Reduce chance of random action as we train the model.
                e = 1./((i/50) + 10)
                break
        jList.append(j)
        #print(rAll)
        rList.append(rAll)

print("Percent of successful episodes: " + str((episoden/num_episodes)*100) + "%")
plt.plot(rList)
plt.plot(jList)
Answer: