K-均值聚类算法
【摘要】 本文给出 K-均值聚类算法的 Python 测试程序(基于 numpy 与 matplotlib),包括:从文件加载数据(loadDataSet)、计算欧式距离(distEclud)、随机生成质心(randCent)、K-均值聚类主循环(kMeans)以及绘制聚类结果散点图(plotPoint)。
测试程序
import numpy as np
import matplotlib.pyplot as plt
def loadDataSet(fileName):
    """Load a tab-separated numeric text file into a list of rows.

    :param fileName: path of the text file to read; every non-blank line
        holds float values separated by tab characters
    :return: data set as a list of rows, each row being a list of floats
    :raises ValueError: if a field cannot be parsed as a float
    """
    dataMat = []
    # The context manager closes the file even when parsing a line fails.
    with open(fileName) as fr:
        for line in fr:
            stripped = line.strip()
            if not stripped:
                # A blank (e.g. trailing) line carries no data point;
                # float('') would otherwise raise ValueError.
                continue
            dataMat.append([float(field) for field in stripped.split('\t')])
    return dataMat
def distEclud(vecA, vecB):
    """Compute the Euclidean distance between two vectors.

    :param vecA: first vector (list, tuple, NumPy array or matrix)
    :param vecB: second vector, with a shape compatible with vecA
    :return: square root of the sum of squared element-wise differences
    """
    # asarray lets plain Python sequences be passed in addition to the
    # NumPy arrays/matrices the rest of this file uses.
    diff = np.asarray(vecA) - np.asarray(vecB)
    return np.sqrt(np.sum(np.power(diff, 2)))
def randCent(dataSet, k):
    """Build a set of k random centroids for the given data set.

    Each coordinate is drawn uniformly between the minimum and the maximum
    of the corresponding column of dataSet.

    :param dataSet: data set matrix, one point per row
    :param k: number of centroids to generate
    :return: k-by-n centroid matrix, n being the column count of dataSet
    """
    n = np.shape(dataSet)[1]  # number of columns, i.e. dimensions
    # np.asmatrix is used instead of np.mat: the np.mat alias was removed
    # from the main namespace in NumPy 2.0.
    centroids = np.asmatrix(np.zeros((k, n)))
    for j in range(n):  # one pass per dimension
        minJ = np.min(dataSet[:, j])  # smallest value in column j
        rangeJ = float(np.max(dataSet[:, j]) - minJ)  # spread of column j
        # rand(k, 1) is in [0, 1), so every coordinate stays inside the
        # [min, max] interval of its column.
        centroids[:, j] = minJ + rangeJ * np.random.rand(k, 1)
    return centroids
def kMeans(dataSet, k, distMeas=distEclud, createCent=randCent):
    """Cluster the data set with the K-means algorithm.

    :param dataSet: data set matrix, one point per row
    :param k: number of clusters
    :param distMeas: distance function called as distMeas(centroid, point)
    :param createCent: function called as createCent(dataSet, k) that
        returns the initial centroid matrix
    :return: tuple (centroids, clusterAssment); clusterAssment has one row
        per point holding the assigned cluster index and the squared
        distance from the point to that cluster's centroid
    """
    m = np.shape(dataSet)[0]  # number of data points
    # Column 0: cluster index, column 1: squared distance to the centroid
    # (kept as the error measure of the clustering).
    # np.asmatrix replaces np.mat, which NumPy 2.0 removed.
    clusterAssment = np.asmatrix(np.zeros((m, 2)))
    # Start from an index no cluster can have, so that a first-pass
    # assignment to cluster 0 is still detected as a change.
    clusterAssment[:, 0] = -1
    centroids = createCent(dataSet, k)
    clusterChanged = True
    # Repeat until no point changes cluster.
    while clusterChanged:
        clusterChanged = False
        for i in range(m):
            minDist = np.inf
            minIndex = -1
            # Find the centroid closest to point i.
            for j in range(k):
                distJI = distMeas(centroids[j, :], dataSet[i, :])
                if distJI < minDist:
                    minDist = distJI
                    minIndex = j
            if clusterAssment[i, 0] != minIndex:
                clusterChanged = True
            clusterAssment[i, :] = minIndex, minDist ** 2
        print("centroids: \n", centroids)
        # Move every centroid to the mean of the points assigned to it.
        for cent in range(k):
            ptsInClust = dataSet[np.nonzero(clusterAssment[:, 0].A == cent)[0]]
            if np.shape(ptsInClust)[0] == 0:
                # The mean of an empty cluster would be NaN; keep the
                # previous centroid instead.
                continue
            centroids[cent, :] = np.mean(ptsInClust, axis=0)
    return centroids, clusterAssment
def plotPoint(dataSet, centroids, clustAssing, k):
    """Draw a scatter plot of the clustered points and their centroids.

    Only the first two columns of the data are plotted. The figure is
    displayed with plt.show().

    :param dataSet: data set matrix, one point per row
    :param centroids: centroid coordinate matrix, one centroid per row
    :param clustAssing: cluster assignment result; column 0 holds the
        cluster index of each point
    :param k: number of clusters
    :return: None
    """
    colors = ['r', 'g', 'y', 'b']
    markers = ['o', '+', '*', 's']
    # Plain ndarrays allow boolean-mask row selection and accept both
    # np.matrix and ndarray inputs.
    points = np.asarray(dataSet)
    centers = np.asarray(centroids)
    labels = np.asarray(clustAssing)[:, 0]
    for cent in range(k):
        ptsInClust = points[labels == cent]
        # Wrap around the style lists so k > 4 no longer raises IndexError.
        # The marker index is shifted on every wrap so that up to 16
        # clusters get distinct color/marker pairs; for cent < 4 the pair
        # is (colors[cent], markers[cent]).
        color = colors[cent % len(colors)]
        marker = markers[(cent + cent // len(colors)) % len(markers)]
        plt.scatter(ptsInClust[:, 0], ptsInClust[:, 1], c=color, marker=marker)
        plt.scatter(centers[cent, 0], centers[cent, 1], c='black', marker='D')
    plt.show()
# Driver: load the sample data, cluster it into k groups and plot the result.
# np.asmatrix is used instead of np.mat, which was removed in NumPy 2.0.
dataMat = np.asmatrix(loadDataSet('testSet.txt'))
k = 4
myCentroids, clustAssing = kMeans(dataMat, k)
plotPoint(dataMat, myCentroids, clustAssing, k)
测试数据:
https://raw.githubusercontent.com/Jack-Cherish/Machine-Learning/master/Logistic/testSet.txt
测试结果:
参考:
https://github.com/apachecn/AiLearning/blob/master/docs/ml/10.k-means%E8%81%9A%E7%B1%BB.md
https://blog.csdn.net/loveliuzz/article/details/78783773
https://www.zhihu.com/question/31296149
https://wizardforcel.gitbooks.io/dm-algo-top10/content/k-means.html
https://www.cnblogs.com/jerrylead/archive/2011/04/06/2006910.html
https://my.oschina.net/u/3473376/blog/895294
https://www.qingtingip.com/h_210651.html
https://www.cnblogs.com/pinard/p/6164214.html
https://www.cnblogs.com/pinard/p/6156009.html
https://blog.csdn.net/doulinxi115413/article/details/80382066
https://blog.csdn.net/sun_shengyun/article/details/54616386
https://blog.csdn.net/niutianzhuang/article/details/79781353
【版权声明】本文为华为云社区用户转载文章,如果您发现本社区中有涉嫌抄袭的内容,欢迎发送邮件进行举报,并提供相关证据,一经查实,本社区将立刻删除涉嫌侵权内容,举报邮箱:
cloudbbs@huaweicloud.com
- 点赞
- 收藏
- 关注作者
作者其他文章
飞翔的大象2020/09/27 06:47:171楼编辑删除举报