I am trying to implement Q-learning in a simple game I have written. The core of the game is that the player has to "jump" to avoid oncoming blocks.
The system I designed has two actions, jump and do_nothing, and the state is the distance to the next block (scaled down and floored so that the number of possible states does not get too large).
My problem seems to be that my implementation of the algorithm does not take "future reward" into account, and so it ends up jumping at the wrong times.
Here is my implementation of the Q-learning algorithm:
JumpGameAIClass.prototype.getQ = function getQ(state) {
    if (!this.Q.hasOwnProperty(state)) {
        this.Q[state] = {};

        for (var actionIndex = 0; actionIndex < this.actions.length; actionIndex++) {
            var action = this.actions[actionIndex];
            this.Q[state][action] = 0;
        }
    }

    return this.Q[state];
};

JumpGameAIClass.prototype.getBlockDistance = function getBlockDistance() {
    var closest = -1;

    for (var blockIndex = 0; blockIndex < this.blocks.length; blockIndex++) {
        var block = this.blocks[blockIndex];
        var distance = block.x - this.playerX;

        if (distance >= 0 && (closest === -1 || distance < closest)) {
            closest = distance;
        }
    }

    return Math.max(0, Math.floor(closest * this.resolution));
};

JumpGameAIClass.prototype.getActionWithHighestQ = function getActionWithHighestQ(distance) {
    var jumpReward = this.getQ(distance)[this.actions[0]];
    var doNothingReward = this.getQ(distance)[this.actions[1]];

    if (jumpReward > doNothingReward) {
        return this.actions[0];
    } else if (doNothingReward > jumpReward) {
        return this.actions[1];
    } else {
        if (!this.canJump()) {
            return this.actions[1];
        }

        return this.actions[Math.floor(Math.random() * this.actions.length)];
    }
};

JumpGameAIClass.prototype.getActionEpsilonGreedy = function getActionEpsilonGreedy() {
    // We can't jump while in mid-air
    if (!this.canJump()) {
        return this.actions[1];
    }

    if (Math.random() < this.epsilon) {
        return this.actions[Math.floor(Math.random() * this.actions.length)];
    } else {
        return this.getActionWithHighestQ(this.getBlockDistance());
    }
};

JumpGameAIClass.prototype.think = function think() {
    // Work out the reward earned by the action taken on the previous frame
    var reward = this.liveReward;

    if (this.score !== this.lastScore) {
        this.lastScore = this.score;
        reward = this.scoreReward;
    } else if (!this.playerAlive) {
        reward = this.deathReward;
    }

    this.drawDistance();

    var distance = this.getBlockDistance(),
        maxQ = this.getQ(distance)[this.getActionWithHighestQ(distance)],
        previousQ = this.getQ(this.lastDistance)[this.lastAction];

    // Q-learning update for the previous (state, action) pair
    this.getQ(this.lastDistance)[this.lastAction] = previousQ + this.alpha * (reward + (this.gamma * maxQ) - previousQ);

    this.lastAction = this.getActionEpsilonGreedy();
    this.lastDistance = distance;

    switch (this.lastAction) {
        case this.actions[0]:
            this.jump();
            break;
    }
};
Here are some of the properties it uses:
epsilon: 0.05,
alpha: 1,
gamma: 1,
resolution: 0.1,
actions: [ 'jump', 'do_nothing' ],
Q: {},
liveReward: 0,
scoreReward: 100,
deathReward: -1000,
lastAction: 'do_nothing',
lastDistance: 0,
lastScore: 0
I have to use lastAction / lastDistance to calculate Q, as I cannot use the current data (that would mean acting on the action performed in the previous frame).

The think method is called once per frame, after all rendering and game processing (physics, controls, death, etc.) has finished.
var JumpGameAIClass = function JumpGame(canvas) {
    Game.JumpGame.call(this, canvas);

    Object.defineProperties(this, {
        epsilon: { value: 0.05 },
        alpha: { value: 1 },
        gamma: { value: 1 },
        resolution: { value: 0.1 },
        actions: { value: [ 'jump', 'do_nothing' ] },
        Q: { value: { }, writable: true },
        liveReward: { value: 0 },
        scoreReward: { value: 100 },
        deathReward: { value: -1000 },
        lastAction: { value: 'do_nothing', writable: true },
        lastDistance: { value: 0, writable: true },
        lastScore: { value: 0, writable: true }
    });
};

JumpGameAIClass.prototype = Object.create(Game.JumpGame.prototype);

JumpGameAIClass.prototype.getQ = function getQ(state) {
    if (!this.Q.hasOwnProperty(state)) {
        this.Q[state] = {};

        for (var actionIndex = 0; actionIndex < this.actions.length; actionIndex++) {
            var action = this.actions[actionIndex];
            this.Q[state][action] = 0;
        }
    }

    return this.Q[state];
};

JumpGameAIClass.prototype.getBlockDistance = function getBlockDistance() {
    var closest = -1;

    for (var blockIndex = 0; blockIndex < this.blocks.length; blockIndex++) {
        var block = this.blocks[blockIndex];
        var distance = block.x - this.playerX;

        if (distance >= 0 && (closest === -1 || distance < closest)) {
            closest = distance;
        }
    }

    return Math.max(0, Math.floor(closest * this.resolution));
};

JumpGameAIClass.prototype.getActionWithHighestQ = function getActionWithHighestQ(distance) {
    var jumpReward = this.getQ(distance)[this.actions[0]];
    var doNothingReward = this.getQ(distance)[this.actions[1]];

    if (jumpReward > doNothingReward) {
        return this.actions[0];
    } else if (doNothingReward > jumpReward) {
        return this.actions[1];
    } else {
        if (!this.canJump()) {
            return this.actions[1];
        }

        return this.actions[Math.floor(Math.random() * this.actions.length)];
    }
};

JumpGameAIClass.prototype.getActionEpsilonGreedy = function getActionEpsilonGreedy() {
    if (!this.canJump()) {
        return this.actions[1];
    }

    if (Math.random() < this.epsilon) {
        return this.actions[Math.floor(Math.random() * this.actions.length)];
    } else {
        return this.getActionWithHighestQ(this.getBlockDistance());
    }
};

JumpGameAIClass.prototype.onDeath = function onDeath() {
    this.restart();
};

JumpGameAIClass.prototype.think = function think() {
    var reward = this.liveReward;

    if (this.score !== this.lastScore) {
        this.lastScore = this.score;
        reward = this.scoreReward;
    } else if (!this.playerAlive) {
        reward = this.deathReward;
    }

    this.drawDistance();

    var distance = this.getBlockDistance(),
        maxQ = this.getQ(distance)[this.getActionWithHighestQ(distance)],
        previousQ = this.getQ(this.lastDistance)[this.lastAction];

    this.getQ(this.lastDistance)[this.lastAction] = previousQ + this.alpha * (reward + (this.gamma * maxQ) - previousQ);

    this.lastAction = this.getActionEpsilonGreedy();
    this.lastDistance = distance;

    switch (this.lastAction) {
        case this.actions[0]:
            this.jump();
            break;
    }
};

JumpGameAIClass.prototype.drawDistance = function drawDistance() {
    this.context.save();

    this.context.textAlign = 'center';
    this.context.textBaseline = 'bottom';
    this.context.fillText('Distance: ' + this.getBlockDistance(), this.canvasWidth / 2, this.canvasHeight / 4);

    this.context.textBaseline = 'top';
    this.context.fillText('Last Distance: ' + this.lastDistance, this.canvasWidth / 2, this.canvasHeight / 4);

    this.context.restore();
};

JumpGameAIClass.prototype.onFrame = function onFrame() {
    Game.JumpGame.prototype.onFrame.apply(this, arguments);
    this.think();
};

Game.JumpGameAI = JumpGameAIClass;
body {
    background-color: #EEEEEE;
    text-align: center;
}

canvas#game {
    background-color: #FFFFFF;
    border: 1px solid #DDDDDD;
}
<!DOCTYPE HTML>
<html lang="en">
<head>
    <title>jump</title>
</head>
<body>
    <canvas id="game" width="512" height="512">
        <h1>Your browser doesn't support canvas!</h1>
    </canvas>
    <script src="https://raw.githubusercontent.com/cagosta/requestAnimationFrame/master/app/requestAnimationFrame.js"></script>
    <!-- https://gist.github.com/jackwilsdon/d06bffa6b32c53321478 -->
    <script src="https://cdn.rawgit.com/jackwilsdon/d06bffa6b32c53321478/raw/4e467f82590e76543bf55ff788504e26afc3d694/game.js"></script>
    <script src="https://cdn.rawgit.com/jackwilsdon/d06bffa6b32c53321478/raw/2b7ce2c3dd268c4aef9ad27316edb0b235ad0d06/canvasgame.js"></script>
    <script src="https://cdn.rawgit.com/jackwilsdon/d06bffa6b32c53321478/raw/2696c72e001e48359a6ce880f1c475613fe359f5/jump.js"></script>
    <script src="https://cdn.rawgit.com/jackwilsdon/d06bffa6b32c53321478/raw/249c92f3385757b6edf2ceb49e26f14b89ffdcfe/bootstrap.js"></script>
</body>
</html>
Answer:
What you have is basically a simplified version of the algorithm described below.
I used the following values:
epsilon: { value: 0.01 },
alpha: { value: 0.7 },
gamma: { value: 0.9 },
resolution: { value: 0.1 },
liveReward: { value: 10 },
scoreReward: { value: -100 },
deathReward: { value: 1000 },
Within the first 20 attempts it easily passed a score of 100.
Q-learning can be described by the following update equation:
Q(s, a) = r(s, a) + gamma * max_a'(Q(s', a'))
where

r(s, a) = r = the immediate reward
gamma = the relative value of delayed vs. immediate rewards (0 to 1)
s' = the new state after action a
a = the action in state s
a' = the action in state s'
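As a minimal JavaScript sketch of that update (assuming a table of the shape Q[state][action], as the question's getQ() builds, with entries already initialised; updateQ and maxOverActions are hypothetical helper names, not functions from the game code); with the learning rate alpha set to 1 the second function is exactly the equation above:

// max_a'(Q(s', a')): the best value obtainable from state s'
function maxOverActions(Q, state, actions) {
    var best = -Infinity;
    for (var i = 0; i < actions.length; i++) {
        best = Math.max(best, Q[state][actions[i]]);
    }
    return best;
}

// Q(s, a) <- Q(s, a) + alpha * (r + gamma * max_a'(Q(s', a')) - Q(s, a))
function updateQ(Q, s, a, r, sPrime, actions, alpha, gamma) {
    var target = r + gamma * maxOverActions(Q, sPrime, actions);
    Q[s][a] = Q[s][a] + alpha * (target - Q[s][a]);
}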
You should execute it as follows:
- For each state-action pair (s, a), initialise the table entry Q(s, a) to zero
- Observe the current state s
- Do forever:
  - Select an action a and execute it
  - Receive the immediate reward r
  - Observe the new state s'
  - Update the table entry Q(s, a) = r(s, a) + gamma * max_a'(Q(s', a'))
  - s = s'
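Put together, the steps above amount to a loop of roughly this shape. This is only a sketch: observeState, chooseActionEpsilonGreedy, executeAction and receiveReward are hypothetical stand-ins for whatever the game provides (in the actual game, the body of the loop runs once per frame inside think() rather than in a blocking loop), and updateQ is the helper sketched earlier:

var actions = [ 'jump', 'do_nothing' ];
var alpha = 0.7, gamma = 0.9;                  // learning rate and discount (the values used above)
var Q = {};

// Lazily create zero-initialised entries for a state, like the question's getQ()
function ensureState(Q, state) {
    if (!Q.hasOwnProperty(state)) {
        Q[state] = {};
        for (var i = 0; i < actions.length; i++) {
            Q[state][actions[i]] = 0;
        }
    }
}

var s = observeState();                        // observe the current state s

while (true) {
    var a = chooseActionEpsilonGreedy(Q, s);   // select an action a...
    executeAction(a);                          // ...and execute it
    var r = receiveReward();                   // receive the immediate reward r
    var sPrime = observeState();               // observe the new state s'

    ensureState(Q, s);
    ensureState(Q, sPrime);
    updateQ(Q, s, a, r, sPrime, actions, alpha, gamma);   // update the table entry Q(s, a)

    s = sPrime;                                // s = s'
}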