我在尝试实现一个k-NN算法,但结果始终是非常低的准确性值。一定存在逻辑错误,但我不确定错误在哪里。以下是代码:
start <- Sys.time()
AccEuc <- NULL
AccMan <- NULL
for(K in grid){
cvAccEuc <- NULL
cvAccMan <- NULL
for (fold in 1:10){
split = kfoldsplit(dfmerged,10,fold)
train <- split[[1]][,-c(1,2)]
valid <- split[[2]][,-c(1,2)]
trainclass <- split[[1]][,2]
validclass <- split[[2]][,2]
combined=rbind(valid,train)
eucdistcombined = as.matrix(dist(combined, method = "euclidian")) # 欧几里得距离
mandistcombined = as.matrix(dist(combined, method = "manhattan")) # 曼哈顿距离
lnvalid = dim(valid)[1]; lntrain = dim(train)[1]; lnall = lnvalid + lntrain
eucdistcombined = eucdistcombined[1:lnvalid,(lnvalid+1):lnall]
mandistcombined = mandistcombined[1:lnvalid,(lnvalid+1):lnall]
neighbors_euc = t(apply(eucdistcombined, 1, order))
neighbors_man = t(apply(mandistcombined, 1, order))
idxeuc = t(apply(neighbors_euc, 1, function(x)( x <= K) ))
idxman = t(apply(neighbors_man, 1, function(x)( x <= K) ))
predseuc <- apply(idxeuc, 1, function(x) as.numeric(getmode( trainclass[as.vector(x)] )) )
predsman <- apply(idxman, 1, function(x) as.numeric(getmode( trainclass[as.vector(x)] )) )
cvAccEuc <- c( cvAccEuc, sum(validclass == predseuc)/dim(validclass)[1] )
cvAccMan <- c( cvAccMan, sum(validclass == predsman)/dim(validclass)[1] )
}
AccEuc <- rbind(AccEuc, t(c(K,cvAccEuc)))
AccMan <- rbind(AccMan, t(c(K,cvAccMan)))
}
Sys.time() - start
回答:
( x <= K)
应该替换为 x[1:K]
。这里的 x
是包含 eucdistcombined
/mandistcombined
行顺序值的行。 ( x <= K)
仅提供值小于K的索引,然而所需的是最小距离值的索引。应该使用 x[1:K]
来获取K个最近邻居。