我在尝试用Java实现多元梯度下降算法(来自AI Coursera课程),但我无法找出我的代码中错误所在。
这是下面程序的输出:
Before train: parameters := [0.0, 0.0, 0.0] -> cost function := 2.5021875E9After first iteration: parameters := [378.5833333333333, 2.214166666666667, 50043.75000000001] -> cost function := 5.404438291015627E9
如你所见,第一次迭代后的值偏差很大。我做错了什么?
这是我试图实现的算法:
这是代码:
import java.util.*; public class GradientDescent { private double[][] trainingData; private double[] means; private double[] scale; private double[] parameters; private double learningRate; GradientDescent() { this.learningRate = 0D; } public double predict(double[] inp){ double[] features = new double[inp.length + 1]; features[0] = 1; for(int i = 0; i < inp.length; i++) { features[i+1] = inp[i]; } double prediction = 0; for(int i = 0; i < parameters.length; i++) { prediction = parameters[i] * features[i]; } return prediction; } public void train(){ double[] tempParameters = new double[parameters.length]; for(int i = 0; i < parameters.length; i++) { tempParameters[i] = parameters[i] - learningRate * partialDerivative(i); //System.out.println(tempParameters[i] + " = " + parameters[i] + " - " + learningRate + " * " + partialDerivative(i)); } System.out.println("Before train: parameters := " + Arrays.toString(parameters) + " -> cost function := " + costFunction()); parameters = tempParameters; System.out.println("After first iteration: parameters := " + Arrays.toString(parameters) + " -> cost function := " + costFunction()); } private double partialDerivative(int index) { double sum = 0; for(int i = 0; i < trainingData.length; i++) { double[] input = new double[trainingData[i].length - 1]; int j = 0; for(; j < trainingData[i].length - 1; j++) { input[j] = trainingData[i][j]; } sum += ((predict(input) - trainingData[i][j]) * trainingData[i][index]); } return (1D/trainingData.length) * sum; } public double[][] getTrainingData() { return trainingData; } public void setTrainingData(double[][] data) { this.trainingData = data; this.means = new double[this.trainingData[0].length-1]; this.scale = new double[this.trainingData[0].length-1]; for(int j = 0; j < data[0].length-1; j++) { double min = data[0][j], max = data[0][j]; double sum = 0; for(int i = 0; i < data.length; i++) { if(data[i][j] < min) min = data[i][j]; if(data[i][j] > max) max = data[i][j]; sum += data[i][j]; } scale[j] = max - min; means[j] = sum / data.length; } } public double[] getParameters() { return parameters; } public void setParameters(double[] parameters) { this.parameters = parameters; } public double getLearningRate() { return learningRate; } public void setLearningRate(double learningRate) { this.learningRate = learningRate; } /** 1 m i i 2 * J(theta) = ----- * SUM( h (x ) - y ) * 2*m i=1 theta */ public double costFunction() { double sum = 0; for(int i = 0; i < trainingData.length; i++) { double[] input = new double[trainingData[i].length - 1]; int j = 0; for(; j < trainingData[i].length - 1; j++) { input[j] = trainingData[i][j]; } sum += Math.pow(predict(input) - trainingData[i][j], 2); } double factor = 1D/(2*trainingData.length); return factor * sum; } @Override public String toString() { StringBuilder sb = new StringBuilder("hypothesis: "); int i = 0; sb.append(parameters[i++] + " + "); for(; i < parameters.length-1; i++) { sb.append(parameters[i] + "*x" + i + " + "); } sb.append(parameters[i] + "*x" + i); sb.append("\n Feature scale: "); for(i = 0; i < scale.length-1; i++) { sb.append(scale[i] + " "); } sb.append(scale[i]); sb.append("\n Feature means: "); for(i = 0; i < means.length-1; i++) { sb.append(means[i] + " "); } sb.append(means[i]); sb.append("\n Cost fuction: " + costFunction()); return sb.toString(); } public static void main(String[] args) { final double[][] TDATA = { {200, 2, 20000}, {300, 2, 41000}, {400, 3, 51000}, {500, 3, 61500}, {800, 4, 41000}, {900, 5, 141000} }; GradientDescent gd = new GradientDescent(); gd.setTrainingData(TDATA); gd.setParameters(new double[]{0D,0D,0D}); gd.setLearningRate(0.00001); gd.train(); //System.out.println(gd); //System.out.println("PREDICTION: " + gd.predict(new double[]{300, 2})); } }
编辑:
我已经更新了代码以使其更易读,并尝试将其映射到Douglas使用的符号。我认为现在它运行得更好,但仍有一些我没有完全理解的模糊区域。
似乎如果我有多个参数(如下面示例中的房间数量和面积),预测结果与第二个参数(在这种情况下是面积)强烈相关,而改变第一个参数(房间数量)几乎没有影响。
这是对{2, 200}
的预测:
PREDICTION: 200000.00686158828
这是对{5, 200}
的预测:
PREDICTION: 200003.0068315415
如你所见,这两个值之间几乎没有区别。
在将数学翻译成代码的尝试中是否还有错误?
这是更新后的代码:
import java.util.*; public class GradientDescent { private double[][] trainingData; private double[] means; private double[] scale; private double[] parameters; private double learningRate; GradientDescent() { this.learningRate = 0D; } public double predict(double[] inp) { return predict(inp, this.parameters); } private double predict(double[] inp, double[] parameters){ double[] features = concatenate(new double[]{1}, inp); double prediction = 0; for(int j = 0; j < features.length; j++) { prediction += parameters[j] * features[j]; } return prediction; } public void train(){ readjustLearningRate(); double costFunctionDelta = Math.abs(costFunction() - costFunction(iterateGradient())); while(costFunctionDelta > 0.0000000001) { System.out.println("Old cost function : " + costFunction()); System.out.println("New cost function : " + costFunction(iterateGradient())); System.out.println("Delta: " + costFunctionDelta); parameters = iterateGradient(); costFunctionDelta = Math.abs(costFunction() - costFunction(iterateGradient())); readjustLearningRate(); } } private double[] iterateGradient() { double[] nextParameters = new double[parameters.length]; for(int r = 0; r < parameters.length; r++) { nextParameters[r] = parameters[r] - learningRate * partialDerivative(r); } return nextParameters; } private double partialDerivative(int index) { double sum = 0; for(int i = 0; i < trainingData.length; i++) { int indexOfResult = trainingData[i].length - 1; double[] input = Arrays.copyOfRange(trainingData[i], 0, indexOfResult); sum += ((predict(input) - trainingData[i][indexOfResult]) * trainingData[i][index]); } return sum/trainingData.length ; } private void readjustLearningRate() { while(costFunction(iterateGradient()) > costFunction()) { System.out.print("Learning rate: " + learningRate + " is too big, readjusted to: "); learningRate = learningRate/2; System.out.println(learningRate); } } public double[][] getTrainingData() { return trainingData; } public void setTrainingData(double[][] data) { this.trainingData = data; this.means = new double[this.trainingData[0].length-1]; this.scale = new double[this.trainingData[0].length-1]; for(int j = 0; j < data[0].length-1; j++) { double min = data[0][j], max = data[0][j]; double sum = 0; for(int i = 0; i < data.length; i++) { if(data[i][j] < min) min = data[i][j]; if(data[i][j] > max) max = data[i][j]; sum += data[i][j]; } scale[j] = max - min; means[j] = sum / data.length; } } public double[] getParameters() { return parameters; } public void setParameters(double[] parameters) { this.parameters = parameters; } public double getLearningRate() { return learningRate; } public void setLearningRate(double learningRate) { this.learningRate = learningRate; } /** 1 m i i 2 * J(theta) = ----- * SUM( h (x ) - y ) * 2*m i=1 theta */ public double costFunction() { return costFunction(this.parameters); } private double costFunction(double[] parameters) { int m = trainingData.length; double sum = 0; for(int i = 0; i < m; i++) { int indexOfResult = trainingData[i].length - 1; double[] input = Arrays.copyOfRange(trainingData[i], 0, indexOfResult); sum += Math.pow(predict(input, parameters) - trainingData[i][indexOfResult], 2); } double factor = 1D/(2*m); return factor * sum; } private double[] normalize(double[] input) { double[] normalized = new double[input.length]; for(int i = 0; i < input.length; i++) { normalized[i] = (input[i] - means[i]) / scale[i]; } return normalized; } private double[] concatenate(double[] a, double[] b) { int size = a.length + b.length; double[] concatArray = new double[size]; int index = 0; for(double d : a) { concatArray[index++] = d; } for(double d : b) { concatArray[index++] = d; } return concatArray; } @Override public String toString() { StringBuilder sb = new StringBuilder("hypothesis: "); int i = 0; sb.append(parameters[i++] + " + "); for(; i < parameters.length-1; i++) { sb.append(parameters[i] + "*x" + i + " + "); } sb.append(parameters[i] + "*x" + i); sb.append("\n Feature scale: "); for(i = 0; i < scale.length-1; i++) { sb.append(scale[i] + " "); } sb.append(scale[i]); sb.append("\n Feature means: "); for(i = 0; i < means.length-1; i++) { sb.append(means[i] + " "); } sb.append(means[i]); sb.append("\n Cost function: " + costFunction()); return sb.toString(); } public static void main(String[] args) { final double[][] TDATA = { {2, 200, 200000}, {3, 300, 300000}, {4, 400, 400000}, {5, 500, 500000}, {8, 800, 800000}, {9, 900, 900000} }; GradientDescent gd = new GradientDescent(); gd.setTrainingData(TDATA); gd.setParameters(new double[]{0D, 0D, 0D}); gd.setLearningRate(0.1); gd.train(); System.out.println(gd); System.out.println("PREDICTION: " + gd.predict(new double[]{3, 600})); } }
回答: