我想计算两个数据框中各行之间的距离(不相似度),以便为每个观测值找到最接近的聚类。由于我的数据包含了因子和数值变量,我选择使用Gower距离。因为我想比较两个不同的数据框(而不是同一矩阵中各行之间的不相似度),所以我需要使用gower.dist函数。然而,当我实现它时,我发现结果与使用daisy计算Gower距离得到的结果不同——后者是先将两个数据框的行合并在一起,再从不相似度矩阵中取出感兴趣的部分得到的。
我在这里只提供了一部分数据,但在使用所有数据计算不相似度时,gower.dist经常会得到为零的不相似度,尽管相应的行并不相等。这是为什么呢?结果不同的原因又是什么呢?在我看来,daisy的Gower距离计算是正确的,而gower.dist在这个例子中是不正确的。
library(cluster)
library(StatMatch)

# Gower distance via daisy(): stack both data frames (df first, then cent),
# compute the full dissimilarity matrix, and keep only the block whose rows
# are the cent rows and whose columns are the df rows.
daisyDist <- daisy(rbind(df, cent), metric = "gower")
daisyDist <- as.matrix(daisyDist)
daisyDist <- daisyDist[(nrow(df) + 1):nrow(daisyDist), seq_len(nrow(df))]

# Gower distance via gower.dist(): compare the cent rows with the df rows
# directly, without stacking them first.
gowerDist <- gower.dist(cent, df)
以下是数据
# Sample observations: 5 rows, 11 columns (factors and numerics, with NAs).
df <- structure(
  list(
    searchType = structure(c(NA, 1L, 1L, 1L, 1L),
      .Label = c("1", "2"), class = "factor"),
    roomMin = structure(c(4L, 1L, 1L, 6L, 6L),
      .Label = c("10", "100", "150", "20", "255", "30", "40", "50", "60",
                 "70", "Missing[NoInput]"), class = "factor"),
    roomMax = structure(c(8L, 8L, NA, 10L, 9L),
      .Label = c("10", "100", "120", "150", "160", "20", "255", "30", "40",
                 "50", "60", "70", "80", "90", "Missing[NoInput]"),
      class = "factor"),
    priceMin = c(NA, 73, 60, 29, 11),
    priceMax = c(35, 11, 1, 62, 23),
    sizeMin = structure(c(5L, 5L, 5L, 6L, 6L),
      .Label = c("100", "125", "150", "250", "50", "75", "Missing[NoInput]"),
      class = "factor"),
    sizeMax = structure(c(1L, 6L, 5L, 3L, 1L),
      .Label = c("100", "125", "150", "250", "50", "75", "Missing[NoInput]"),
      class = "factor"),
    longitude = c(6.6306, 7.47195, 8.5562, NA, 8.569),
    latitude = c(46.52425, 46.9512, 47.37515, NA, 47.3929),
    specificSearch = structure(c(1L, 1L, 1L, 1L, 1L),
      .Label = c("0", "1"), class = "factor"),
    objectType = structure(c(NA, 2L, 2L, 2L, 2L),
      .Label = c("1", "2", "3", "Missing[]"), class = "factor")
  ),
  .Names = c("searchType", "roomMin", "roomMax", "priceMin", "priceMax",
             "sizeMin", "sizeMax", "longitude", "latitude", "specificSearch",
             "objectType"),
  row.names = c(112457L, 94601L, 78273L, 59172L, 117425L),
  class = "data.frame"
)

# Rows to compare against df: 3 rows with the same 11 columns and the same
# factor levels (presumably the cluster centres mentioned in the question).
cent <- structure(
  list(
    searchType = structure(c(1L, 1L, 1L),
      .Label = c("1", "2"), class = "factor"),
    roomMin = structure(c(1L, 4L, 4L),
      .Label = c("10", "100", "150", "20", "255", "30", "40", "50", "60",
                 "70", "Missing[NoInput]"), class = "factor"),
    roomMax = structure(c(6L, 9L, 8L),
      .Label = c("10", "100", "120", "150", "160", "20", "255", "30", "40",
                 "50", "60", "70", "80", "90", "Missing[NoInput]"),
      class = "factor"),
    priceMin = c(60, 33, 73),
    priceMax = c(103, 46, 23),
    sizeMin = structure(c(1L, 5L, 5L),
      .Label = c("100", "125", "150", "250", "50", "75", "Missing[NoInput]"),
      class = "factor"),
    sizeMax = structure(c(1L, 2L, 1L),
      .Label = c("100", "125", "150", "250", "50", "75", "Missing[NoInput]"),
      class = "factor"),
    longitude = c(8.3015, 7.42765, 7.6104),
    latitude = c(47.05485, 46.9469, 46.75125),
    specificSearch = structure(c(1L, 1L, 1L),
      .Label = c("0", "1"), class = "factor"),
    objectType = structure(c(2L, 2L, 2L),
      .Label = c("1", "2", "3", "Missing[]"), class = "factor")
  ),
  .Names = c("searchType", "roomMin", "roomMax", "priceMin", "priceMax",
             "sizeMin", "sizeMax", "longitude", "latitude", "specificSearch",
             "objectType"),
  row.names = c(60656L, 66897L, 130650L),
  class = "data.frame"
)
谢谢!
编辑:似乎错误/差异发生是因为数值列中有NA值,这些NA值似乎被两个函数以不同的方式处理了。我如何让gower.dist采用与daisy相同的NA处理方式?
回答:
这是由于您的数据框中数值列的NA值引起的。考虑以下代码,看看当数值列有NA值时,这两个函数的行为完全不同(daisy比gower.dist更健壮):
df1 <- rbind(df, cent)
head(df1)
#>        searchType roomMin roomMax priceMin priceMax sizeMin sizeMax longitude latitude specificSearch objectType
#> 112457       <NA>      20      30       NA       35      50     100   6.63060 46.52425              0       <NA>
#> 94601           1      10      30       73       11      50      75   7.47195 46.95120              0          2
#> 78273           1      10    <NA>       60        1      50      50   8.55620 47.37515              0          2
#> 59172           1      30      50       29       62      75     150        NA       NA              0          2
#> 117425          1      30      40       11       23      75     100   8.56900 47.39290              0          2
#> 60656           1      10      20       60      103     100     100   8.30150 47.05485              0          2

# Use only the numeric column priceMin (the 4th column) to compute distances.
class(df1[, 4])
#> [1] "numeric"
df2 <- df1[4]

# daisy output
as.matrix(daisy(df2, metric = "gower"))
#>        112457     94601     78273      59172    117425     60656      66897    130650
#> 112457      0        NA        NA         NA        NA        NA         NA        NA
#> 94601      NA 0.0000000 0.2096774 0.70967742 1.0000000 0.2096774 0.64516129 0.0000000
#> 78273      NA 0.2096774 0.0000000 0.50000000 0.7903226 0.0000000 0.43548387 0.2096774
#> 59172      NA 0.7096774 0.5000000 0.00000000 0.2903226 0.5000000 0.06451613 0.7096774
#> 117425     NA 1.0000000 0.7903226 0.29032258 0.0000000 0.7903226 0.35483871 1.0000000
#> 60656      NA 0.2096774 0.0000000 0.50000000 0.7903226 0.0000000 0.43548387 0.2096774
#> 66897      NA 0.6451613 0.4354839 0.06451613 0.3548387 0.4354839 0.00000000 0.6451613
#> 130650     NA 0.0000000 0.2096774 0.70967742 1.0000000 0.2096774 0.64516129 0.0000000

# gower.dist output
gower.dist(df2)
#>      [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8]
#> [1,]  NaN  NaN  NaN  NaN  NaN  NaN  NaN  NaN
#> [2,]  NaN    0    0    0    0    0    0    0
#> [3,]  NaN    0    0    0    0    0    0    0
#> [4,]  NaN    0    0    0    0    0    0    0
#> [5,]  NaN    0    0    0    0    0    0    0
#> [6,]  NaN    0    0    0    0    0    0    0
#> [7,]  NaN    0    0    0    0    0    0    0
#> [8,]  NaN    0    0    0    0    0    0    0
使用gower.dist函数的rngs参数,显式传入忽略NA后计算出的数值范围,即可修复这个问题:
# Pass the range of the numeric column explicitly, computed with NAs removed.
gower.dist(df2, rngs = max(df2, na.rm = TRUE) - min(df2, na.rm = TRUE))
#>      [,1]      [,2]      [,3]       [,4]      [,5]      [,6]       [,7]      [,8]
#> [1,]  NaN       NaN       NaN        NaN       NaN       NaN        NaN       NaN
#> [2,]  NaN 0.0000000 0.2096774 0.70967742 1.0000000 0.2096774 0.64516129 0.0000000
#> [3,]  NaN 0.2096774 0.0000000 0.50000000 0.7903226 0.0000000 0.43548387 0.2096774
#> [4,]  NaN 0.7096774 0.5000000 0.00000000 0.2903226 0.5000000 0.06451613 0.7096774
#> [5,]  NaN 1.0000000 0.7903226 0.29032258 0.0000000 0.7903226 0.35483871 1.0000000
#> [6,]  NaN 0.2096774 0.0000000 0.50000000 0.7903226 0.0000000 0.43548387 0.2096774
#> [7,]  NaN 0.6451613 0.4354839 0.06451613 0.3548387 0.4354839 0.00000000 0.6451613
#> [8,]  NaN 0.0000000 0.2096774 0.70967742 1.0000000 0.2096774 0.64516129 0.0000000
因此,当数值变量中有NA值时,可以像下面这样使gower.dist函数的运行方式与daisy类似:
df1 <- rbind(df, cent)

# Compute the ranges of the numeric variables with NAs removed.
# Non-numeric columns keep a placeholder range of 1.
cols <- which(vapply(df1, is.numeric, logical(1)))
rngs <- rep(1, ncol(df1))
rngs[cols] <- vapply(
  df1[cols],
  function(x) max(x, na.rm = TRUE) - min(x, na.rm = TRUE),
  numeric(1)
)

daisyDist <- as.matrix(daisy(df1, metric = "gower"))
# The NA-safe ranges must actually be handed to gower.dist(); without
# `rngs = rngs` the call falls back to its default range computation and
# the result no longer matches daisy().
gowerDist <- gower.dist(df1, rngs = rngs)

daisyDist
#>           112457     94601     78273     59172    117425     60656     66897    130650
#> 112457 0.0000000 0.3951059 0.6151851 0.7107843 0.6397059 0.6424374 0.3756990 0.1105551
#> 94601  0.3951059 0.0000000 0.2355126 0.5788530 0.5629176 0.4235379 0.3651002 0.2199324
#> 78273  0.6151851 0.2355126 0.0000000 0.5122549 0.4033046 0.3500130 0.3951874 0.3631533
#> 59172  0.7107843 0.5788530 0.5122549 0.0000000 0.2969639 0.5446623 0.4690421 0.5657812
#> 117425 0.6397059 0.5629176 0.4033046 0.2969639 0.0000000 0.4638003 0.4256891 0.4757460
#> 60656  0.6424374 0.4235379 0.3500130 0.5446623 0.4638003 0.0000000 0.5063082 0.4272755
#> 66897  0.3756990 0.3651002 0.3951874 0.4690421 0.4256891 0.5063082 0.0000000 0.2900150
#> 130650 0.1105551 0.2199324 0.3631533 0.5657812 0.4757460 0.4272755 0.2900150 0.0000000

gowerDist
#>           [,1]      [,2]      [,3]      [,4]      [,5]      [,6]      [,7]      [,8]
#> [1,] 0.0000000 0.3951059 0.6151851 0.7107843 0.6397059 0.6424374 0.3756990 0.1105551
#> [2,] 0.3951059 0.0000000 0.2355126 0.5788530 0.5629176 0.4235379 0.3651002 0.2199324
#> [3,] 0.6151851 0.2355126 0.0000000 0.5122549 0.4033046 0.3500130 0.3951874 0.3631533
#> [4,] 0.7107843 0.5788530 0.5122549 0.0000000 0.2969639 0.5446623 0.4690421 0.5657812
#> [5,] 0.6397059 0.5629176 0.4033046 0.2969639 0.0000000 0.4638003 0.4256891 0.4757460
#> [6,] 0.6424374 0.4235379 0.3500130 0.5446623 0.4638003 0.0000000 0.5063082 0.4272755
#> [7,] 0.3756990 0.3651002 0.3951874 0.4690421 0.4256891 0.5063082 0.0000000 0.2900150
#> [8,] 0.1105551 0.2199324 0.3631533 0.5657812 0.4757460 0.4272755 0.2900150 0.0000000