Q-learning not working as expected in a game

I have tried to implement Q-learning in a simple game I wrote. The core of the game is that the player has to "jump" to avoid oncoming blocks.

I designed the system with two actions, jump and do_nothing, and the state is the distance to the next block (divided and floored so the number of states doesn't get too large). For example, with resolution = 0.1, raw distances of 130 to 139 all map to state 13.

My problem seems to be that my implementation does not take "future reward" into account, so it ends up jumping at the wrong times.

Here is my implementation of the Q-learning algorithm:

JumpGameAIClass.prototype.getQ = function getQ(state) {
    if (!this.Q.hasOwnProperty(state)) {
        this.Q[state] = {};

        for (var actionIndex = 0; actionIndex < this.actions.length; actionIndex++) {
            var action = this.actions[actionIndex];

            this.Q[state][action] = 0;
        }
    }

    return this.Q[state];
};

JumpGameAIClass.prototype.getBlockDistance = function getBlockDistance() {
    var closest = -1;

    for (var blockIndex = 0; blockIndex < this.blocks.length; blockIndex++) {
        var block = this.blocks[blockIndex];
        var distance = block.x - this.playerX;

        if (distance >= 0 && (closest === -1 || distance < closest)) {
            closest = distance;
        }
    }

    return Math.max(0, Math.floor(closest * this.resolution));
};

JumpGameAIClass.prototype.getActionWithHighestQ = function getActionWithHighestQ(distance) {
    var jumpReward = this.getQ(distance)[this.actions[0]];
    var doNothingReward = this.getQ(distance)[this.actions[1]];

    if (jumpReward > doNothingReward) {
        return this.actions[0];
    } else if (doNothingReward > jumpReward) {
        return this.actions[1];
    } else {
        if (!this.canJump()) {
            return this.actions[1];
        }

        return this.actions[Math.floor(Math.random() * this.actions.length)];
    }
};

JumpGameAIClass.prototype.getActionEpsilonGreedy = function getActionEpsilonGreedy() {
    // We can't jump while in mid-air
    if (!this.canJump()) {
        return this.actions[1];
    }

    if (Math.random() < this.epsilon) {
        return this.actions[Math.floor(Math.random() * this.actions.length)];
    } else {
        return this.getActionWithHighestQ(this.getBlockDistance());
    }
};

JumpGameAIClass.prototype.think = function think() {
    var reward = this.liveReward;

    if (this.score !== this.lastScore) {
        this.lastScore = this.score;
        reward = this.scoreReward;
    } else if (!this.playerAlive) {
        reward = this.deathReward;
    }

    this.drawDistance();

    var distance = this.getBlockDistance(),
        maxQ = this.getQ(distance)[this.getActionWithHighestQ(distance)],
        previousQ = this.getQ(this.lastDistance)[this.lastAction];

    this.getQ(this.lastDistance)[this.lastAction] = previousQ + this.alpha * (reward + (this.gamma * maxQ) - previousQ);

    this.lastAction = this.getActionEpsilonGreedy();
    this.lastDistance = distance;

    switch (this.lastAction) {
        case this.actions[0]:
            this.jump();
            break;
    }
};

Here are some of the properties it uses:

epsilon: 0.05,
alpha: 1,
gamma: 1,
resolution: 0.1,
actions: [ 'jump', 'do_nothing' ],
Q: {},
liveReward: 0,
scoreReward: 100,
deathReward: -1000,
lastAction: 'do_nothing',
lastDistance: 0,
lastScore: 0

I have to use lastAction / lastDistance to compute the Q value, because I cannot use the current frame's data (that would mean acting on the action that was performed in the previous frame).

The think method is called once per frame, after all rendering and game processing (physics, controls, death, etc.) has finished.

var JumpGameAIClass = function JumpGame(canvas) {
    Game.JumpGame.call(this, canvas);

    Object.defineProperties(this, {
        epsilon: {
            value: 0.05
        },
        alpha: {
            value: 1
        },
        gamma: {
            value: 1
        },
        resolution: {
            value: 0.1
        },
        actions: {
            value: [ 'jump', 'do_nothing' ]
        },
        Q: {
            value: { },
            writable: true
        },
        liveReward: {
            value: 0
        },
        scoreReward: {
            value: 100
        },
        deathReward: {
            value: -1000
        },
        lastAction: {
            value: 'do_nothing',
            writable: true
        },
        lastDistance: {
            value: 0,
            writable: true
        },
        lastScore: {
            value: 0,
            writable: true
        }
    });
};

JumpGameAIClass.prototype = Object.create(Game.JumpGame.prototype);

// getQ, getBlockDistance, getActionWithHighestQ, getActionEpsilonGreedy and think
// are identical to the versions shown above.

JumpGameAIClass.prototype.onDeath = function onDeath() {
    this.restart();
};

JumpGameAIClass.prototype.drawDistance = function drawDistance() {
    this.context.save();

    this.context.textAlign = 'center';
    this.context.textBaseline = 'bottom';

    this.context.fillText('Distance: ' + this.getBlockDistance(), this.canvasWidth / 2, this.canvasHeight / 4);

    this.context.textBaseline = 'top';

    this.context.fillText('Last Distance: ' + this.lastDistance, this.canvasWidth / 2, this.canvasHeight / 4);

    this.context.restore();
};

JumpGameAIClass.prototype.onFrame = function onFrame() {
    Game.JumpGame.prototype.onFrame.apply(this, arguments);

    this.think();
};

Game.JumpGameAI = JumpGameAIClass;
body {
    background-color: #EEEEEE;
    text-align: center;
}

canvas#game {
    background-color: #FFFFFF;
    border: 1px solid #DDDDDD;
}
<!DOCTYPE HTML>
<html lang="en">
<head>
    <title>jump</title>
</head>
<body>
    <canvas id="game" width="512" height="512">
        <h1>Your browser doesn't support canvas!</h1>
    </canvas>
    <script src="https://raw.githubusercontent.com/cagosta/requestAnimationFrame/master/app/requestAnimationFrame.js"></script>
    <!-- https://gist.github.com/jackwilsdon/d06bffa6b32c53321478 -->
    <script src="https://cdn.rawgit.com/jackwilsdon/d06bffa6b32c53321478/raw/4e467f82590e76543bf55ff788504e26afc3d694/game.js"></script>
    <script src="https://cdn.rawgit.com/jackwilsdon/d06bffa6b32c53321478/raw/2b7ce2c3dd268c4aef9ad27316edb0b235ad0d06/canvasgame.js"></script>
    <script src="https://cdn.rawgit.com/jackwilsdon/d06bffa6b32c53321478/raw/2696c72e001e48359a6ce880f1c475613fe359f5/jump.js"></script>
    <script src="https://cdn.rawgit.com/jackwilsdon/d06bffa6b32c53321478/raw/249c92f3385757b6edf2ceb49e26f14b89ffdcfe/bootstrap.js"></script>
</body>
</html>


Answer:

You have basically implemented a simplified version of this:

[image omitted]

Source: Flappy Bird RL

I used the following values:

epsilon: {
    value: 0.01
},
alpha: {
    value: 0.7
},
gamma: {
    value: 0.9
},
resolution: {
    value: 0.1
},
liveReward: {
    value: 10
},
scoreReward: {
    value: -100
},
deathReward: {
    value: 1000
},

Within its first 20 attempts it easily beat a score of 100.


Q-learning can be described by the following temporal rule:

Q(s, a) = r(s, a) + gamma * max_a'(Q(s', a'))

where

  • r(s, a) = r = the immediate reward
  • gamma = the relative value of delayed rewards versus immediate rewards (0 to 1)
  • s' = the new state after taking action a
  • a = the action taken in state s
  • a' = an action in state s'
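
In code, that rule is a single assignment into the Q table. A minimal sketch, where Q, s, a, r, gamma and alpha are just the symbols above and sNext stands for s' (the second line is the incremental, learning-rate form your think() already uses, which reduces to the first when alpha = 1):

// Plain form of the rule above, for a two-action table:
Q[s][a] = r + gamma * Math.max(Q[sNext]['jump'], Q[sNext]['do_nothing']);

// Incremental form with a learning rate, as used in think():
Q[s][a] = Q[s][a] + alpha * (r + gamma * Math.max(Q[sNext]['jump'], Q[sNext]['do_nothing']) - Q[s][a]);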

You should perform it in the following way (a minimal sketch of this loop follows the steps below):


  1. For each state-action pair (s, a), initialise the table entry Q(s, a) to zero
  2. Observe the current state s
  3. Do forever:
    • Select an action a and execute it
    • Receive the immediate reward r
    • Observe the new state s'
    • Update the table entry Q(s, a) = r(s, a) + gamma * max_a'(Q(s', a'))
    • s = s'
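
As a rough JavaScript sketch of that loop (the env object with getState, act and reward is a hypothetical stand-in for the game, not part of the code above):

// Tabular Q-learning loop following the steps above (sketch only; the env API is hypothetical).
var Q = {};
var gamma = 0.9;

function getQ(state) {
    if (!Q.hasOwnProperty(state)) {
        Q[state] = { jump: 0, do_nothing: 0 };          // step 1: initialise every entry to zero
    }
    return Q[state];
}

function maxQ(state) {
    var q = getQ(state);
    return Math.max(q.jump, q.do_nothing);
}

function step(env) {
    var s = env.getState();                             // step 2: observe the current state s
    var a = getQ(s).jump >= getQ(s).do_nothing
        ? 'jump' : 'do_nothing';                        // select an action a (greedy here; add exploration as needed)
    env.act(a);                                         // ...and execute it
    var r = env.reward();                               // receive the immediate reward r
    var sNext = env.getState();                         // observe the new state s'
    getQ(s)[a] = r + gamma * maxQ(sNext);               // update Q(s, a) = r(s, a) + gamma * max_a'(Q(s', a'))
}                                                       // step 3: call step() forever; s becomes s' on the next iteration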
