I adapted the following gradient descent algorithm to regress the y variable stored in data[,4] on the x variable stored in data[,1]. However, gradient descent appears to diverge. I would appreciate some help figuring out what I am doing wrong.
# define the sum of squared residuals
ssquares <- function(x) {
  t <- 0
  for (i in 1:200) {
    t <- t + (data[i,4] - x[1] - x[2]*data[i,1])^2
  }
  t/200
}

# define the derivatives
derivative <- function(x) {
  t1 <- 0
  for (i in 1:200) {
    t1 <- t1 - 2*(data[i,4] - x[1] - x[2]*data[i,1])
  }
  t2 <- 0
  for (i in 1:200) {
    t2 <- t2 - 2*data[i,1]*(data[i,4] - x[1] - x[2]*data[i,1])
  }
  c(t1/200, t2/200)
}

# definition of the gradient descent method in 2D
gradient_descent <- function(func, derv, start, step=0.05, tol=1e-8) {
  pt1 <- start
  grdnt <- derv(pt1)
  pt2 <- c(pt1[1] - step*grdnt[1], pt1[2] - step*grdnt[2])
  while (abs(func(pt1) - func(pt2)) > tol) {
    pt1 <- pt2
    grdnt <- derv(pt1)
    pt2 <- c(pt1[1] - step*grdnt[1], pt1[2] - step*grdnt[2])
    print(func(pt2)) # print progress
  }
  pt2 # return the last point
}

# locate the minimum of the function using the gradient descent method
result <- gradient_descent(
  ssquares,   # the function to optimize
  derivative, # the gradient of the function
  c(1,1),     # start point of the search
  0.05,       # step size (alpha)
  1e-8)       # relative tolerance for one step

# display a summary of the results
print(result)           # coordinates of the function minimum
print(ssquares(result)) # value of the function at the minimum
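A quick way to rule out a mistake in the hand-coded gradient is to compare derivative() against a finite-difference approximation of ssquares(). Below is a minimal sketch; the numeric_grad helper is illustrative and not part of the original code, and it assumes data is already loaded:

numeric_grad <- function(func, x, eps = 1e-6) {
  # central finite differences, one coordinate at a time
  sapply(seq_along(x), function(j) {
    e <- rep(0, length(x)); e[j] <- eps
    (func(x + e) - func(x - e)) / (2 * eps)
  })
}

x0 <- c(1, 1)
print(derivative(x0))              # analytic gradient at the start point
print(numeric_grad(ssquares, x0))  # should agree to several decimal places

If the two vectors match, the gradient itself is not the problem and the step size becomes the main suspect.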
Answer:
You can vectorize the objective/gradient functions for faster execution. As you can see, it actually converges on randomly generated data, and the coefficients obtained are very close to those from R's lm():
ssquares <- function(x) {
  n <- nrow(data) # 200
  sum((data[,4] - cbind(1, data[,1]) %*% x)^2) / n
}

# define the derivatives
derivative <- function(x) {
  n <- nrow(data) # 200
  c(sum(-2*(data[,4] - cbind(1, data[,1]) %*% x)),
    sum(-2*(data[,1])*(data[,4] - cbind(1, data[,1]) %*% x))) / n
}

set.seed(1)
data <- matrix(rnorm(800), nrow=200) # 200 x 4 randomly generated data

# locate the minimum of the function using the gradient descent method
result <- gradient_descent(
  ssquares,   # the function to optimize
  derivative, # the gradient of the function
  c(1,1),     # start point of the search
  0.05,       # step size (alpha)
  1e-8)       # relative tolerance for one step
# [1] 2.511904
# [1] 2.263448
# [1] 2.061456
# [1] 1.89721
# [1] 1.763634
# [1] 1.654984
# [1] 1.566592
# [1] 1.494668
# ...

# display a summary of the results
print(result) # coefficients obtained with gradient descent
# [1] -0.10248356  0.08068382

lm(data[,4]~data[,1])$coef # coefficients from R lm()
# (Intercept)   data[, 1]
# -0.10252181  0.08045722

# use a new dataset: this time it takes quite some time to converge, but the
# values GD converges to are pretty accurate, as you can see below
data <- read.csv('Advertising.csv') # advertising data, with the first rownames column removed

# locate the minimum of the function using the gradient descent method
result <- gradient_descent(
  ssquares,   # the function to optimize
  derivative, # the gradient of the function
  c(1,1),     # start point of the search
  0.00001,    # step size (alpha), decreasing the learning rate
  1e-8)       # relative tolerance for one step
# ...
# [1] 10.51364
# [1] 10.51364
# [1] 10.51364

print(result) # coordinates of the function minimum
# [1] 6.97016852 0.04785365

lm(data[,4]~data[,1])$coef
# (Intercept)   data[, 1]
#  7.03259355  0.04753664
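The slow convergence on the Advertising data is largely a scaling issue: the predictor spans a much wider range than the standard-normal random data, so only a very small step size keeps the iterations stable. One common remedy is to standardize the predictor, run gradient descent with a larger step size, and then map the coefficients back to the original scale. A minimal sketch, assuming the gradient_descent, ssquares and derivative definitions above and that Advertising.csv has already been read into data (the names x_raw, x_mean, x_sd and result_std are illustrative, not from the original answer):

x_raw  <- data[,1]
x_mean <- mean(x_raw)
x_sd   <- sd(x_raw)
data[,1] <- (x_raw - x_mean) / x_sd # overwrite column 1 with the standardized predictor

result_std <- gradient_descent(
  ssquares,   # the function to optimize
  derivative, # the gradient of the function
  c(1,1),     # start point of the search
  0.1,        # a much larger step size is now stable
  1e-8)       # relative tolerance for one step

# transform the coefficients back to the original x scale
slope     <- result_std[2] / x_sd
intercept <- result_std[1] - result_std[2] * x_mean / x_sd
print(c(intercept, slope)) # should be close to lm(data[,4] ~ x_raw)$coef

data[,1] <- x_raw # restore the original column

Because ssquares and derivative read data from the global environment, overwriting data[,1] is enough for the scaled run; restoring the column afterwards keeps any later code unaffected.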