1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77
| import numpy as np
def loadDataSet(fileName):
    """Load a tab-separated numeric data file.

    Each non-blank line is split on tab characters and every field is
    converted to ``float``.

    Arguments:
        fileName {str} -- path of the data file to read

    Returns:
        list -- one list of floats per non-blank line of the file

    Raises:
        ValueError -- if a field cannot be parsed as a float
    """
    dataMat = []
    # The context manager closes the file even when parsing raises.
    with open(fileName) as fr:
        for line in fr:
            curLine = line.strip()
            if not curLine:
                # Skip blank lines (e.g. a trailing empty line at end of file).
                continue
            # split (not strip) on tabs yields the individual fields; list()
            # materialises the row because map() is a one-shot iterator on
            # Python 3.
            dataMat.append(list(map(float, curLine.split('\t'))))
    return dataMat
def distEclud(vecA, vecB):
    """Return the Euclidean distance between two vectors.

    Arguments:
        vecA {np.array} -- first vector (the centroid when called by K-Means)
        vecB {np.array} -- second vector (the sample being classified)

    Returns:
        double -- square root of the summed squared element differences
    """
    difference = vecA - vecB
    squared = np.power(difference, 2)
    total = np.sum(squared)
    return np.sqrt(total)
def randCent(dataSet, k):
    """Build k random centroids inside the bounding box of dataSet.

    For every column j, k values are drawn uniformly from
    [min_j, min_j + range_j), where min_j is the column minimum and range_j
    is the column maximum minus the column minimum.

    Arguments:
        dataSet -- 2-D data, one sample per row (np.matrix, np.ndarray or a
            nested list of numbers)
        k {int} -- number of centroids to generate

    Returns:
        np.matrix -- k x n matrix with one centroid per row, n being the
            number of columns of dataSet
    """
    # Work on a plain float ndarray: column min/max are then real scalars
    # (no 1x1 matrices to convert) and nested lists are accepted as well.
    data = np.asarray(dataSet, dtype=float)
    n = data.shape[1]
    # np.asmatrix instead of np.mat: the np.mat alias no longer exists in
    # NumPy 2.x, while np.asmatrix is available in old and new releases.
    centroids = np.asmatrix(np.zeros((k, n)))
    for j in range(n):
        column = data[:, j]
        minJ = column.min()
        rangeJ = column.max() - minJ
        # One rand(k, 1) draw per column, in column order, so results under a
        # fixed np.random seed stay the same as before.
        centroids[:, j] = minJ + rangeJ * np.random.rand(k, 1)
    return centroids
def KMeans(dataSet, k, distMeans=distEclud, createCent=randCent):
    """Cluster the rows of dataSet into k groups with the K-Means algorithm.

    Every sample is repeatedly assigned to its nearest centroid and every
    centroid is then moved to the mean of the samples assigned to it, until
    no assignment changes during a full pass. The current centroids are
    printed once per pass.

    Arguments:
        dataSet -- 2-D data, one sample per row (np.matrix, np.ndarray or a
            nested list of numbers)
        k {int} -- number of clusters

    Keyword Arguments:
        distMeans {function} -- distance function called as
            distMeans(centroid_row, sample_row) (default: {distEclud})
        createCent {function} -- initial-centroid factory called as
            createCent(dataSet, k) (default: {randCent})

    Returns:
        tuple -- (centroids, clusterAssment): the k x n centroid matrix and
            an m x 2 matrix whose row i holds the cluster index of sample i
            and its squared distance to that cluster's centroid
    """
    if not isinstance(dataSet, np.ndarray):
        # Nested lists do not support the [i, :] indexing used below.
        dataSet = np.asmatrix(dataSet, dtype=float)
    m = np.shape(dataSet)[0]
    # np.asmatrix instead of np.mat: the np.mat alias no longer exists in
    # NumPy 2.x, while np.asmatrix is available in old and new releases.
    clusterAssment = np.asmatrix(np.zeros((m, 2)))
    # Start from an invalid cluster index so that the very first assignment
    # always counts as a change, even when a sample lands in cluster 0.
    clusterAssment[:, 0] = -1
    centroids = createCent(dataSet, k)
    clusterChanged = True
    while clusterChanged:
        clusterChanged = False
        # Assignment step: attach every sample to its nearest centroid.
        for i in range(m):
            minDist = np.inf
            minIndex = -1
            for j in range(k):
                distJI = distMeans(centroids[j, :], dataSet[i, :])
                if distJI < minDist:
                    minDist = distJI
                    minIndex = j
            if clusterAssment[i, 0] != minIndex:
                clusterChanged = True
            clusterAssment[i, :] = minIndex, minDist ** 2
        # Single-argument call form prints the same on Python 2 and Python 3.
        print(centroids)
        # Update step: move every centroid to the mean of its members.
        for cent in range(k):
            members = np.nonzero(clusterAssment[:, 0].A == cent)[0]
            ptsInClust = dataSet[members]
            if len(ptsInClust) > 0:
                centroids[cent, :] = np.mean(ptsInClust, axis=0)
            # An empty cluster keeps its previous centroid: the mean of zero
            # rows is NaN, and a NaN centroid can never win a sample again.
    return centroids, clusterAssment
|