我正在尝试为24维和32维的特征向量实现GMM(高斯混合模型)聚类,其中初始参数由K-means算法给出(K-means只提供聚类中心,即均值 μ)。我参考了这个链接,但那里的实现只针对2维特征向量,并且使用的是预先给定的 μ 和 Σ。
如果有人有GMM聚类的代码,请发布出来。
sklearn中也有现成的GMM实现(参见 sklearn GMM),但它不提供每次迭代的似然值。
回答:
from collections import namedtuple

import numpy as np

# Result record returned by gaussian_Mix_Model.calculate_Exp_Maxim.
_Params = namedtuple('params', ['mu', 'Cov', 'w', 'LLhoods', 'num_iters'])


def _nearest_center(dataSet, centers):
    """Return, for every row of ``dataSet``, the index of its closest center."""
    # Squared Euclidean distance has the same argmin as the distance itself.
    sq_dists = np.stack(
        [np.sum((dataSet - center) ** 2, axis=1) for center in centers],
        axis=1,
    )
    return np.argmin(sq_dists, axis=1)


def kmeans(dataSet, k, c, max_iters=300):
    """Cluster ``dataSet`` with Lloyd's k-means algorithm.

    Args:
        dataSet: array-like of shape (n, d), one sample per row.
        k: number of clusters, ``1 <= k <= n``.
        c: seed for ``np.random.RandomState``; it selects the ``k`` samples
            used as initial centers.
        max_iters: upper bound on the number of update rounds, so the loop
            always terminates.

    Returns:
        A 4-tuple ``(centers, proportions, covs, cluster_data)``:
        the (k, d) array of centers, the fraction of samples per cluster,
        the per-cluster (d, d) sample covariance matrices, and the list of
        per-cluster sample arrays.

    Raises:
        ValueError: if ``dataSet`` is not 2-D or ``k`` is out of range.
    """
    dataSet = np.asarray(dataSet, dtype=float)
    if dataSet.ndim != 2:
        raise ValueError("dataSet must be a 2-D array of shape (n, d)")
    n, d = dataSet.shape
    if not 1 <= k <= n:
        raise ValueError("k must satisfy 1 <= k <= number of samples")

    # 1. Pick k distinct samples at random as the initial centers.
    rng = np.random.RandomState(c)
    centers = dataSet[rng.permutation(n)[:k]]

    # 2. Alternate assignment and center update until nothing moves.
    for _ in range(max_iters):
        labels = _nearest_center(dataSet, centers)
        new_centers = centers.copy()
        for i in range(k):
            members = dataSet[labels == i]
            # An empty cluster keeps its previous center; averaging zero
            # rows would yield NaN and the equality test below would then
            # never succeed.
            if len(members):
                new_centers[i] = members.mean(0)
        if np.all(centers == new_centers):
            break
        centers = new_centers

    # Make sure the labels match the centers that are actually returned.
    labels = _nearest_center(dataSet, centers)
    cluster_data = [dataSet[labels == i] for i in range(k)]

    # A covariance needs at least two samples; smaller clusters fall back to
    # the covariance of the whole data set (identity for a single sample).
    fallback_cov = np.atleast_2d(np.cov(dataSet.T)) if n > 1 else np.eye(d)
    proportions = []
    covs = []
    for members in cluster_data:
        proportions.append(len(members) * 1.0 / n)
        if len(members) > 1:
            covs.append(np.atleast_2d(np.cov(members.T)))
        else:
            covs.append(fallback_cov.copy())
    return centers, proportions, covs, cluster_data


class gaussian_Mix_Model:
    """Gaussian mixture model fitted with EM, initialised by k-means."""

    def __init__(self, k=8, eps=0.0000001, reg_covar=1e-6):
        """Store the model settings.

        Args:
            k: number of mixture components (clusters).
            eps: stopping threshold on the change in log-likelihood between
                two consecutive iterations.
            reg_covar: value added to the diagonal of every covariance
                matrix to keep it positive definite.
        """
        self.k = k  # number of clusters
        self.eps = eps  # stopping threshold `epsilon`
        self.reg_covar = reg_covar
        self.params = None

    @staticmethod
    def _log_density(X, mean, cov):
        """Return the log of the Gaussian density N(x; mean, cov) per row of X."""
        d = X.shape[1]
        chol = np.linalg.cholesky(cov)
        # With cov = L L^T the Mahalanobis term is ||L^-1 (x - mean)||^2 and
        # log|cov| = 2 * sum(log(diag(L))).
        z = np.linalg.solve(chol, (X - mean).T)
        maha = np.sum(z ** 2, axis=0)
        log_det = 2.0 * np.sum(np.log(np.diag(chol)))
        return -0.5 * (d * np.log(2.0 * np.pi) + log_det + maha)

    def calculate_Exp_Maxim(self, X, max_iters=1000, seed=0):
        """Fit the mixture to ``X`` by expectation-maximisation.

        Means and covariances start from a k-means clustering of ``X``;
        the mixing weights start uniform. The E-step is evaluated in log
        space so that high-dimensional densities do not underflow.

        Args:
            X: array-like of shape (n, d), one sample per row.
            max_iters: maximum number of EM iterations.
            seed: seed forwarded to ``kmeans`` for the initial centers.

        Returns:
            A named tuple with fields ``mu`` (list of k mean vectors),
            ``Cov`` (list of k covariance matrices), ``w`` (list of k mixing
            weights), ``LLhoods`` (log-likelihood recorded at every
            iteration) and ``num_iters`` (``len(LLhoods)``). The same object
            is stored in ``self.params``.

        Raises:
            ValueError: if ``X`` is not 2-D or has fewer rows than ``self.k``.
        """
        X = np.asarray(X, dtype=float)
        if X.ndim != 2:
            raise ValueError("X must be a 2-D array of shape (n, d)")
        # n = number of data points, d = dimension of a data point
        n, d = X.shape
        ridge = self.reg_covar * np.eye(d)
        tiny = np.finfo(float).eps

        # Initialise means and covariances from a single k-means run.
        centers, _, covs, _ = kmeans(X, self.k, seed)
        mu = [center.copy() for center in centers]
        Cov = [cov + ridge for cov in covs]
        # Initialise the weights uniformly.
        w = [1. / self.k] * self.k

        log_R = np.empty((n, self.k))
        # Log-likelihood history.
        LLhoods = []

        # Iterate until convergence or until max_iters is reached.
        while len(LLhoods) < max_iters:
            # E-step: log of (weight * density) for every sample/component.
            for j in range(self.k):
                log_R[:, j] = np.log(w[j]) + self._log_density(X, mu[j], Cov[j])

            # Log-sum-exp over components gives each sample's log-likelihood.
            peak = log_R.max(axis=1, keepdims=True)
            log_norm = peak[:, 0] + np.log(np.sum(np.exp(log_R - peak), axis=1))
            LLhood = np.sum(log_norm)
            LLhoods.append(LLhood)

            # Responsibilities and the effective number of points per cluster.
            R = np.exp(log_R - log_norm[:, None])
            # Floor the counts so a collapsed component cannot divide by zero.
            N_ks = np.maximum(np.sum(R, axis=0), tiny)

            # M-step: re-estimate the parameters.
            for j in range(self.k):
                # New mean.
                mu[j] = np.dot(R[:, j], X) / N_ks[j]
                x_mu = X - mu[j]
                # New covariance (ridge keeps it positive definite).
                Cov[j] = np.dot((x_mu * R[:, [j]]).T, x_mu) / N_ks[j] + ridge
                # New mixing weight.
                w[j] = 1. / n * N_ks[j]

            # Convergence check; needs at least two recorded values.
            if len(LLhoods) >= 2 and np.abs(LLhood - LLhoods[-2]) < self.eps:
                break

        self.params = _Params(
            mu=mu, Cov=Cov, w=w, LLhoods=LLhoods, num_iters=len(LLhoods)
        )
        return self.params

    # The driver code calls the fit under this name.
    fit_EM = calculate_Exp_Maxim


def plot_log_likelihoods(LLhoods,
                         name_pattern='Dataset_2A_GMM_Class_%d_K_16.png'):
    """Save one log-likelihood-versus-iteration plot per entry of ``LLhoods``.

    Args:
        LLhoods: sequence of log-likelihood traces, one per class.
        name_pattern: ``%``-format pattern for the output file; it receives
            the 1-based class number.
    """
    # Imported here so the model code can be used without matplotlib.
    import matplotlib.pyplot as plt

    for class_no, trace in enumerate(LLhoods, start=1):
        plt.plot(trace)
        plt.savefig(name_pattern % class_no)
        plt.clf()


def main(class_datasets):
    """Fit a 3-component GMM to each data set and plot its log-likelihood.

    Args:
        class_datasets: iterable of (n, d) arrays, one per class.

    Returns:
        The list of log-likelihood traces, one per data set.
    """
    LLhoods = []
    for X in class_datasets:
        # Call the GMM to find the model.
        gmm = gaussian_Mix_Model(3, 0.000001)
        params = gmm.fit_EM(X, max_iters=150)
        LLhoods.append(params.LLhoods)
    # Plot log-likelihood against the iteration number.
    plot_log_likelihoods(LLhoods)
    return LLhoods


if __name__ == "__main__":
    import sys

    # NOTE(review): the original snippet never showed how the data was
    # loaded; reading one whitespace-delimited text matrix per class from the
    # command line is an assumption -- adapt to the real data source.
    if len(sys.argv) < 2:
        sys.exit("usage: python gmm.py CLASS1.txt [CLASS2.txt ...]")
    main([np.loadtxt(path, ndmin=2) for path in sys.argv[1:]])