美文网首页
k-means聚类算法研究

k-means聚类算法研究

作者: hexg1016 | 来源:发表于2018-11-15 16:06 被阅读0次

            聚类是机器学习中一种重要的无监督算法,它可以将数据点归结为一系列特定的组合。理论上归为一类的数据点具有相同的特性,而不同类别的数据点则具有各不相同的属性。在数据科学中聚类会从数据中发掘出很多分析和理解的视角,让我们更深入的把握数据资源的价值、并据此指导生产生活。

    K均值聚类

    这一最著名的聚类算法主要基于数据点之间的均值和与聚类中心的聚类迭代而成。它主要的优点是十分的高效,由于只需要计算数据点与聚类中心的距离,其计算复杂度只有O(n)。其工作原理主要分为以下四步:

    1.首先我们需要预先给定聚类的数目同时随机初始化聚类中心。我们可以粗略地观察数据并给出较为准确的聚类数目;

    2.每一个数据点通过计算与聚类中心的距离来分类到最邻近的一类中;

    3.根据分类结果,利用分类后的数据点重新计算聚类中心;

    4.重复步骤二三直到聚类中心不再变化。(可以随机初始化不同的聚类中心以选取最好的结果)

    python实现

    1 point.py

    class Point:

    def __init__(self, latit_, longit_):

    #self.id = id_  #an id which uniquely identifies a point

            self.latit = latit_

    self.longit = longit_

    2 clustering.py

    import randomas rand

    import mathas math

    from pointimport Point

    #import pkg_resources

    #pkg_resources.require("matplotlib")

    import numpyas np

    from mpl_toolkits.mplot3dimport Axes3D

    import matplotlib.pyplotas plt

    class clustering:

    def __init__(self, geo_locs_, k_):

    self.geo_locations = geo_locs_

    self.k = k_

    self.clusters = []#clusters of nodes

            self.means = []#means of clusters

            self.debug =True  #debug flag

    #this method returns the next random node

        def next_random(self, index, points, clusters):

    #pick next node that has the maximum distance from other nodes

            print 'index: %d ' % (index)

    dist = {}

    for point_1in points:

    #if self.debug:

    #print 'point_1: %f %f' % (point_1.latit, point_1.longit)

    #compute this node distance from all other points in cluster

                for clusterin clusters.values():

    point_2 = cluster[0]

    #if self.debug:

    # print 'point_2: %f %f' % (point_2.latit, point_2.longit)

                    if point_1not in dist:

    dist[point_1] = math.sqrt(math.pow(point_1.latit - point_2.latit,2.0) + math.pow(point_1.longit - point_2.longit,2.0))

    else:

    dist[point_1] += math.sqrt(math.pow(point_1.latit - point_2.latit,2.0) + math.pow(point_1.longit - point_2.longit,2.0))

    #if self.debug:

    #for key, value in dist.items():

    #print "(%f, %f) ==> %f" % (key.latit,key.longit,value)

    #now let's return the point that has the maximum distance from previous nodes

            count_ =0

            max_ =0

            for key, valuein dist.items():

    if count_ ==0:

    max_ = value

    max_point = key

    count_ +=1

                else:

    if value > max_:

    max_ = value

    max_point = key

    return max_point

    #this method computes the initial means

        def initial_means(self, points):

    #pick the first node at random

            point_ = rand.choice(points)

    if self.debug:

    print 'point#0: %f %f' % (point_.latit, point_.longit)

    clusters =dict()

    clusters.setdefault(0, []).append(point_)

    points.remove(point_)

    #now let's pick k-1 more random points

            for iin range(1,self.k):

    point_ =self.next_random(i, points, clusters)

    if self.debug:

    print 'point#%d: %f %f' % (i, point_.latit, point_.longit)

    #clusters.append([point_])

                clusters.setdefault(i, []).append(point_)

    points.remove(point_)

    #compute mean of clusters

    #self.print_clusters(clusters)

            self.means =self.compute_mean(clusters)

    if self.debug:

    print "initial means:"

                self.print_means(self.means)

    def compute_mean(self, clusters):

    means = []

    for clusterin clusters.values():

    mean_point = Point(0.0,0.0)

    cnt =0.0

                for pointin cluster:

    #print "compute: point(%f,%f)" % (point.latit, point.longit)

                    mean_point.latit += point.latit

    mean_point.longit += point.longit

    cnt +=1.0

                mean_point.latit = mean_point.latit/cnt

    mean_point.longit = mean_point.longit/cnt

    means.append(mean_point)

    return means

    #this method assign nodes to the cluster with the smallest mean

        def assign_points(self, points):

    if self.debug:

    print "assign points"

            clusters =dict()

    for pointin points:

    dist = []

    if self.debug:

    print "point(%f,%f)" % (point.latit, point.longit)

    #find the best cluster for this node

                for meanin self.means:

    dist.append(math.sqrt(math.pow(point.latit - mean.latit,2.0) + math.pow(point.longit - mean.longit,2.0)))

    #let's find the smallest mean

                if self.debug:

    print dist

    cnt_ =0

                index =0

                min_ = dist[0]

    for din dist:

    if d < min_:

    min_ = d

    index = cnt_

    cnt_ +=1

                if self.debug:

    print "index: %d" % index

    clusters.setdefault(index, []).append(point)

    return clusters

    def update_means(self, means, threshold):

    #check the current mean with the previous one to see if we should stop

            for iin range(len(self.means)):

    mean_1 =self.means[i]

    mean_2 = means[i]

    if self.debug:

    print "mean_1(%f,%f)" % (mean_1.latit, mean_1.longit)

    print "mean_2(%f,%f)" % (mean_2.latit, mean_2.longit)

    if math.sqrt(math.pow(mean_1.latit - mean_2.latit,2.0) + math.pow(mean_1.longit - mean_2.longit,2.0)) > threshold:

    return False

            return True

        #debug function: print cluster points

        def print_clusters(self, clusters):

    cluster_cnt =1

            for clusterin clusters.values():

    print "nodes in cluster #%d" % cluster_cnt

    cluster_cnt +=1

                for pointin cluster:

    print "point(%f,%f)" % (point.latit, point.longit)

    #print means

        def print_means(self, means):

    for pointin means:

    print "%f %f" % (point.latit, point.longit)

    #k_means algorithm

        def k_means(self, plot_flag):

    if len(self.geo_locations)

    return -1  #error

            points_ = [pointfor pointin self.geo_locations]

    #compute the initial means

            self.initial_means(points_)

    stop =False

            while not stop:

    #assignment step: assign each node to the cluster with the closest mean

                points_ = [pointfor pointin self.geo_locations]

    clusters =self.assign_points(points_)

    if self.debug:

    self.print_clusters(clusters)

    means =self.compute_mean(clusters)

    if self.debug:

    print "means:"

                    print self.print_means(means)

    print "update mean:"

                stop =self.update_means(means,0.01)

    if not stop:

    self.means = []

    self.means = means

    self.clusters = clusters

    #plot cluster for evluation

            if plot_flag:

    fig = plt.figure()

    ax = fig.add_subplot(111)

    markers = ['o','d','x','h','H',7,4,5,6,'8','p',',','+','.','s','*',3,0,1,2]

    colors = ['r','k','b', [0,0,0], [0,0,1], [0,1,0], [0,1,1], [1,0,0], [1,0,1], [1,1,0], [1,1,1]]

    cnt =0

                for clusterin clusters.values():

    latits = []

    longits = []

    for pointin cluster:

    latits.append(point.latit)

    longits.append(point.longit)

    ax.scatter(longits, latits,s=60,c=colors[cnt],marker=markers[cnt])

    cnt +=1

                plt.show()

    return 0

    3 main.py

    import randomas rand

    from clusteringimport clustering

    from pointimport Point

    import csv

    geo_locs = []

    #loc_ = Point(0.0, 0.0)  #tuples for location

    #geo_locs.append(loc_)

    #read the fountains location from the csv input file and store each fountain location as a Point(latit,longit) object

    f =open('./drinking_fountains.csv','r')

    reader = csv.reader(f,delimiter=",")

    for linein reader:

    loc_ = Point(float(line[0]),float(line[1]))#tuples for location

        geo_locs.append(loc_)

    #print len(geo_locs)

    #for p in geo_locs:

    #    print "%f %f" % (p.latit, p.longit)

    #let's run k_means clustering. the second parameter is the no of clusters

    cluster = clustering(geo_locs,8 )

    flag = cluster.k_means(True)

    if flag == -1:

    print "Error in arguments!"

    else:

    #the clustering results is a list of lists where each list represents one cluster

        print "clustering results:"

        cluster.print_clusters(cluster.clusters)

    4 drinking_fountains.csv 示例数据

    1,1

    2,2

    3,3

    4,4

    5,5

    6,6

    7,7

    8,8

    9,9

    10,10

    11,11

    12,12

    13,13

    14,14

    15,15

    运行main.py ,图片如下:

    相关文章

      网友评论

          本文标题:k-means聚类算法研究

          本文链接:https://www.haomeiwen.com/subject/zcbyfqtx.html