美文网首页
c语言实现K均值算法

c语言实现K均值算法

作者: 一路向后 | 来源:发表于2021-02-02 20:45 被阅读0次

    1.算法简介

       k均值聚类算法(k-means clustering algorithm)是一种迭代求解的聚类分析算法,其步骤是,预将数据分为K组,则随机选取K个对象作为初始的聚类中心,然后计算每个对象与各个种子聚类中心之间的距离,把每个对象分配给距离它最近的聚类中心。聚类中心以及分配给它们的对象就代表一个聚类。每分配一个样本,聚类的聚类中心会根据聚类中现有的对象被重新计算。这个过程将不断重复直到满足某个终止条件。终止条件可以是没有(或最小数目)对象被重新分配给不同的聚类,没有(或最小数目)聚类中心再发生变化,误差平方和局部最小。

    2.源码实现

    #include <stdio.h>
    #include <stdlib.h>
    #include <math.h>
    
    #define N   11
    #define k   3
    
    typedef struct {
        float x;
        float y;
    } Point;
    
    Point point[N] = {
        { 2.0, 10.0 }, { 2.0, 5.0 }, { 8.0, 4.0 }, { 5.0, 8.0 }, { 7.0, 5.0 },
        { 6.0,  4.0 }, { 1.0, 2.0 }, { 4.0, 9.0 }, { 7.0, 3.0 }, { 1.0, 3.0 },
        { 3.0,  9.0 }
    };
    
    Point mean[k];
    int center[N];
    
    float getdistance(Point point1, Point point2);
    void cluster();
    float gete();
    void getmean(int center[N]);
    
    int main()
    {
        int number = 0;
        float temp1, temp2;
    
        //初始化k个中心点,这里选择给定中心点,而不是随机生成,需要更多的先验知识
        //若没有相关先验知识,可选择随机生成初始中心点
        mean[0].x = point[0].x;
        mean[0].y = point[0].y;
    
        mean[1].x = point[3].x;
        mean[1].y = point[3].y;
    
        mean[2].x = point[6].x;
        mean[2].y = point[6].y;
    
        //第一次聚类
        cluster();
    
        //number统计进行了几次聚类
        number++;
    
        //对第一次聚类的结果进行误差平方和的计算   
        temp1 = gete();
        printf("the error1 is:%f\n", temp1);
    
        //针对第一次聚类的结果,重新计算聚类中心
        getmean(center);
    
        //第二次聚类
        cluster();
        number++;
        temp2 = gete();
        printf("the error2 is:%f\n", temp2);
    
        //迭代循环,直到两次迭代误差的差值在一定阈值范围内,则迭代停止
        while(fabs(temp1 - temp2) > 0.1)
        {
            temp1 = temp2;
            getmean(center);
            cluster();
            temp2 = gete();
            number++;
    
            printf("the error%d is:%f\n", number,temp2);
        }
    
        printf("the total number of cluster is:%d\n", number);
    
        return 0;
    }
    
    //计算距离函数,欧式距离
    float getdistance(Point point1, Point point2)
    {
        float d;
    
        d = sqrt((point1.x - point2.x)*(point1.x - point2.x) + (point1.y - point2.y)*(point1.y - point2.y));
    
        return d;
    }
    
    //聚类函数
    void cluster()
    {
        float distance[N][k];
        float min;
        int i, j;
    
        for(i=0; i<N; i++)
        {
            for(j=0; j<k; j++)
            {
                distance[i][j] = getdistance(point[i], mean[j]);
            }
    
            min = 9999.0;
    
            for(j=0; j<k; j++)
            {
                if(distance[i][j] < min)
                {
                    min = distance[i][j];
                    center[i] = j;
                }
            }
    
            printf("(%.0f,%.0f)\t in cluster-%d\n", point[i].x, point[i].y, center[i] + 1);
        }
    }
    
    //聚类后误差计算函数
    float gete()
    {
        float cnt=0, sum=0;
        int i, j;
    
        for(i=0; i<N; i++)
        {
            for(j=0; j<k; j++)
            {
                if(center[i] == j)
                {
                    cnt = getdistance(point[i], mean[j]);
                }
            }
    
            sum += cnt;     
        }
    
        return sum;
    }
    
    //重新计算聚类中心
    void getmean(int center[N])
    {
        Point sum;
        int count;
        int i, j;
    
        for(i=0; i<k; i++)
        {
            sum.x = 0.0;
            sum.y = 0.0;
            count = 0;
    
            for(j=0; j<N; j++)
            {
                if(center[j] == i)
                {
                    sum.x += point[j].x;
                    sum.y += point[j].y;
                    count++;
                }
            }
    
            mean[i].x = sum.x / count;
            mean[i].y = sum.y / count;
        }
    
        for(i=0; i<k; i++)
        {
            printf("the new center point of %d is:\t(%f,%f)\n", i + 1, mean[i].x, mean[i].y);
        }
    }
    

    3.编译源码

    $ gcc -o example example.c -lm
    

    4.运行程序及其结果

    ./example
    (2,10)   in cluster-1
    (2,5)    in cluster-3
    (8,4)    in cluster-2
    (5,8)    in cluster-2
    (7,5)    in cluster-2
    (6,4)    in cluster-2
    (1,2)    in cluster-3
    (4,9)    in cluster-2
    (7,3)    in cluster-2
    (1,3)    in cluster-3
    (3,9)    in cluster-1
    the error1 is:25.104525
    the new center point of 1 is:   (2.500000,9.500000)
    the new center point of 2 is:   (6.166667,5.500000)
    the new center point of 3 is:   (1.333333,3.333333)
    (2,10)   in cluster-1
    (2,5)    in cluster-3
    (8,4)    in cluster-2
    (5,8)    in cluster-2
    (7,5)    in cluster-2
    (6,4)    in cluster-2
    (1,2)    in cluster-3
    (4,9)    in cluster-1
    (7,3)    in cluster-2
    (1,3)    in cluster-3
    (3,9)    in cluster-1
    the error2 is:16.880072
    the new center point of 1 is:   (3.000000,9.333333)
    the new center point of 2 is:   (6.600000,4.800000)
    the new center point of 3 is:   (1.333333,3.333333)
    (2,10)   in cluster-1
    (2,5)    in cluster-3
    (8,4)    in cluster-2
    (5,8)    in cluster-1
    (7,5)    in cluster-2
    (6,4)    in cluster-2
    (1,2)    in cluster-3
    (4,9)    in cluster-1
    (7,3)    in cluster-2
    (1,3)    in cluster-3
    (3,9)    in cluster-1
    the error3 is:13.537380
    the new center point of 1 is:   (3.500000,9.000000)
    the new center point of 2 is:   (7.000000,4.000000)
    the new center point of 3 is:   (1.333333,3.333333)
    (2,10)   in cluster-1
    (2,5)    in cluster-3
    (8,4)    in cluster-2
    (5,8)    in cluster-1
    (7,5)    in cluster-2
    (6,4)    in cluster-2
    (1,2)    in cluster-3
    (4,9)    in cluster-1
    (7,3)    in cluster-2
    (1,3)    in cluster-3
    (3,9)    in cluster-1
    the error4 is:12.246379
    the new center point of 1 is:   (3.500000,9.000000)
    the new center point of 2 is:   (7.000000,4.000000)
    the new center point of 3 is:   (1.333333,3.333333)
    (2,10)   in cluster-1
    (2,5)    in cluster-3
    (8,4)    in cluster-2
    (5,8)    in cluster-1
    (7,5)    in cluster-2
    (6,4)    in cluster-2
    (1,2)    in cluster-3
    (4,9)    in cluster-1
    (7,3)    in cluster-2
    (1,3)    in cluster-3
    (3,9)    in cluster-1
    the error5 is:12.246379
    the total number of cluster is:5
    

    相关文章

      网友评论

          本文标题:c语言实现K均值算法

          本文链接:https://www.haomeiwen.com/subject/incutltx.html