K-均值聚类算法
测试程序
import numpy as np


def loadDataSet(fileName):
    """
    Load a tab-delimited numeric data file.

    :param fileName: path to a file whose lines are tab-separated numbers
    :return: list of rows, each a list of floats
    """
    dataMat = []
    # "with" guarantees the file handle is closed (the original leaked it)
    with open(fileName) as fr:
        for line in fr:
            curLine = line.strip().split('\t')
            fltLine = list(map(float, curLine))  # Python 3: materialize the map
            dataMat.append(fltLine)
    return dataMat


def distEclud(vecA, vecB):
    """
    Euclidean distance between two row vectors.

    :param vecA: first vector (matrix row)
    :param vecB: second vector (matrix row)
    :return: scalar distance
    """
    return np.sqrt(np.sum(np.power(vecA - vecB, 2)))


def randCent(dataSet, k):
    """
    Build k random centroids inside the bounding box of the data.

    :param dataSet: data matrix (m rows, n columns)
    :param k: number of centroids to create
    :return: k x n matrix of centroids
    """
    n = np.shape(dataSet)[1]  # number of features (columns)
    centroids = np.mat(np.zeros((k, n)))  # start from a k x n zero matrix
    for j in range(n):  # one column (dimension) at a time
        minJ = np.min(dataSet[:, j])  # column minimum
        rangeJ = float(np.max(dataSet[:, j]) - minJ)  # column span
        # uniform random coordinates within [min, max] of column j
        centroids[:, j] = minJ + rangeJ * np.random.rand(k, 1)
    return centroids


def kMeans(dataSet, k, distMeas=distEclud, createCent=randCent):
    """
    Standard k-means clustering.

    :param dataSet: data matrix (m rows = data points)
    :param k: number of clusters
    :param distMeas: distance function between two row vectors
    :param createCent: function producing the initial k centroids
    :return: (centroids, clusterAssment) where row i of clusterAssment holds
             [assigned cluster index, squared distance to that centroid];
             the squared error can later be used to judge clustering quality
    """
    m = np.shape(dataSet)[0]  # total number of data points
    # column 0: cluster index, column 1: squared error for that point
    clusterAssment = np.mat(np.zeros((m, 2)))
    centroids = createCent(dataSet, k)
    clusterChanged = True
    # iterate until no point changes its cluster assignment
    while clusterChanged:
        clusterChanged = False
        for i in range(m):
            minDist = np.inf  # smallest distance seen so far
            minIndex = -1
            # find the centroid nearest to point i
            for j in range(k):
                distJI = distMeas(centroids[j, :], dataSet[i, :])
                if distJI < minDist:
                    minDist = distJI
                    minIndex = j
            # flag another pass if any assignment changed
            if clusterAssment[i, 0] != minIndex:
                clusterChanged = True
            clusterAssment[i, :] = minIndex, minDist ** 2
        print("centroids: \n", centroids)
        # recompute each centroid as the mean of its assigned points
        for cent in range(k):
            # boolean filter: rows currently assigned to cluster `cent`
            ptsInClust = dataSet[np.nonzero(clusterAssment[:, 0].A == cent)[0]]
            # guard against an empty cluster: np.mean over zero rows yields
            # NaN and would poison the centroid (a bug in the original)
            if ptsInClust.shape[0] > 0:
                centroids[cent, :] = np.mean(ptsInClust, axis=0)
    return centroids, clusterAssment


def plotPoint(dataSet, centroids, clustAssing, k):
    """
    Scatter-plot each cluster in its own color/marker, plus its centroid.

    :param dataSet: data matrix
    :param centroids: k x n centroid matrix
    :param clustAssing: cluster assignment matrix returned by kMeans
    :param k: number of clusters (at most 4 with the palettes below)
    """
    # imported lazily so the clustering code runs without matplotlib installed
    import matplotlib.pyplot as plt

    colors = ['r', 'g', 'y', 'b']
    markers = ['o', '+', '*', 's']
    for cent in range(k):
        ptsInClust = dataSet[np.nonzero(clustAssing[:, 0].A == cent)[0]]
        x = ptsInClust.A[:, 0]
        y = ptsInClust.A[:, 1]
        plt.scatter(x, y, c=colors[cent], marker=markers[cent])
        # centroid drawn as a black diamond
        plt.scatter(centroids.A[cent, :][0], centroids.A[cent, :][1],
                    c='black', marker='D')
    plt.show()


if __name__ == "__main__":
    # Demo: cluster the sample data set and visualize the result.
    # Guarded so importing this module does not trigger file I/O / plotting.
    dataMat = np.mat(loadDataSet('testSet.txt'))
    k = 4
    myCentroids, clustAssing = kMeans(dataMat, k)
    plotPoint(dataMat, myCentroids, clustAssing, k)
测试数据:
https://raw.githubusercontent.com/Jack-Cherish/Machine-Learning/master/Logistic/testSet.txt
测试结果:
参考:
https://github.com/apachecn/AiLearning/blob/master/docs/ml/10.k-means%E8%81%9A%E7%B1%BB.md
https://blog.csdn.net/loveliuzz/article/details/78783773
https://www.zhihu.com/question/31296149
https://wizardforcel.gitbooks.io/dm-algo-top10/content/k-means.html
https://www.cnblogs.com/jerrylead/archive/2011/04/06/2006910.html
https://my.oschina.net/u/3473376/blog/895294
https://www.qingtingip.com/h_210651.html
https://www.cnblogs.com/pinard/p/6164214.html
https://www.cnblogs.com/pinard/p/6156009.html
https://blog.csdn.net/doulinxi115413/article/details/80382066
https://blog.csdn.net/sun_shengyun/article/details/54616386
https://blog.csdn.net/niutianzhuang/article/details/79781353
- 点赞
- 收藏
- 关注作者
评论(0)