1.算法简介
k均值聚类算法(k-means clustering algorithm)是一种迭代求解的聚类分析算法,其步骤是,预将数据分为K组,则随机选取K个对象作为初始的聚类中心,然后计算每个对象与各个种子聚类中心之间的距离,把每个对象分配给距离它最近的聚类中心。聚类中心以及分配给它们的对象就代表一个聚类。每分配一个样本,聚类的聚类中心会根据聚类中现有的对象被重新计算。这个过程将不断重复直到满足某个终止条件。终止条件可以是没有(或最小数目)对象被重新分配给不同的聚类,没有(或最小数目)聚类中心再发生变化,误差平方和局部最小。
2.源码实现
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#define N 11
#define k 3
typedef struct {
float x;
float y;
} Point;
Point point[N] = {
{ 2.0, 10.0 }, { 2.0, 5.0 }, { 8.0, 4.0 }, { 5.0, 8.0 }, { 7.0, 5.0 },
{ 6.0, 4.0 }, { 1.0, 2.0 }, { 4.0, 9.0 }, { 7.0, 3.0 }, { 1.0, 3.0 },
{ 3.0, 9.0 }
};
Point mean[k];
int center[N];
float getdistance(Point point1, Point point2);
void cluster();
float gete();
void getmean(int center[N]);
int main()
{
int number = 0;
float temp1, temp2;
//初始化k个中心点,这里选择给定中心点,而不是随机生成,需要更多的先验知识
//若没有相关先验知识,可选择随机生成初始中心点
mean[0].x = point[0].x;
mean[0].y = point[0].y;
mean[1].x = point[3].x;
mean[1].y = point[3].y;
mean[2].x = point[6].x;
mean[2].y = point[6].y;
//第一次聚类
cluster();
//number统计进行了几次聚类
number++;
//对第一次聚类的结果进行误差平方和的计算
temp1 = gete();
printf("the error1 is:%f\n", temp1);
//针对第一次聚类的结果,重新计算聚类中心
getmean(center);
//第二次聚类
cluster();
number++;
temp2 = gete();
printf("the error2 is:%f\n", temp2);
//迭代循环,直到两次迭代误差的差值在一定阈值范围内,则迭代停止
while(fabs(temp1 - temp2) > 0.1)
{
temp1 = temp2;
getmean(center);
cluster();
temp2 = gete();
number++;
printf("the error%d is:%f\n", number,temp2);
}
printf("the total number of cluster is:%d\n", number);
return 0;
}
//计算距离函数,欧式距离
float getdistance(Point point1, Point point2)
{
float d;
d = sqrt((point1.x - point2.x)*(point1.x - point2.x) + (point1.y - point2.y)*(point1.y - point2.y));
return d;
}
//聚类函数
void cluster()
{
float distance[N][k];
float min;
int i, j;
for(i=0; i<N; i++)
{
for(j=0; j<k; j++)
{
distance[i][j] = getdistance(point[i], mean[j]);
}
min = 9999.0;
for(j=0; j<k; j++)
{
if(distance[i][j] < min)
{
min = distance[i][j];
center[i] = j;
}
}
printf("(%.0f,%.0f)\t in cluster-%d\n", point[i].x, point[i].y, center[i] + 1);
}
}
//聚类后误差计算函数
float gete()
{
float cnt=0, sum=0;
int i, j;
for(i=0; i<N; i++)
{
for(j=0; j<k; j++)
{
if(center[i] == j)
{
cnt = getdistance(point[i], mean[j]);
}
}
sum += cnt;
}
return sum;
}
//重新计算聚类中心
void getmean(int center[N])
{
Point sum;
int count;
int i, j;
for(i=0; i<k; i++)
{
sum.x = 0.0;
sum.y = 0.0;
count = 0;
for(j=0; j<N; j++)
{
if(center[j] == i)
{
sum.x += point[j].x;
sum.y += point[j].y;
count++;
}
}
mean[i].x = sum.x / count;
mean[i].y = sum.y / count;
}
for(i=0; i<k; i++)
{
printf("the new center point of %d is:\t(%f,%f)\n", i + 1, mean[i].x, mean[i].y);
}
}
3.编译源码
$ gcc -o example example.c -lm
4.运行程序及其结果
./example
(2,10) in cluster-1
(2,5) in cluster-3
(8,4) in cluster-2
(5,8) in cluster-2
(7,5) in cluster-2
(6,4) in cluster-2
(1,2) in cluster-3
(4,9) in cluster-2
(7,3) in cluster-2
(1,3) in cluster-3
(3,9) in cluster-1
the error1 is:25.104525
the new center point of 1 is: (2.500000,9.500000)
the new center point of 2 is: (6.166667,5.500000)
the new center point of 3 is: (1.333333,3.333333)
(2,10) in cluster-1
(2,5) in cluster-3
(8,4) in cluster-2
(5,8) in cluster-2
(7,5) in cluster-2
(6,4) in cluster-2
(1,2) in cluster-3
(4,9) in cluster-1
(7,3) in cluster-2
(1,3) in cluster-3
(3,9) in cluster-1
the error2 is:16.880072
the new center point of 1 is: (3.000000,9.333333)
the new center point of 2 is: (6.600000,4.800000)
the new center point of 3 is: (1.333333,3.333333)
(2,10) in cluster-1
(2,5) in cluster-3
(8,4) in cluster-2
(5,8) in cluster-1
(7,5) in cluster-2
(6,4) in cluster-2
(1,2) in cluster-3
(4,9) in cluster-1
(7,3) in cluster-2
(1,3) in cluster-3
(3,9) in cluster-1
the error3 is:13.537380
the new center point of 1 is: (3.500000,9.000000)
the new center point of 2 is: (7.000000,4.000000)
the new center point of 3 is: (1.333333,3.333333)
(2,10) in cluster-1
(2,5) in cluster-3
(8,4) in cluster-2
(5,8) in cluster-1
(7,5) in cluster-2
(6,4) in cluster-2
(1,2) in cluster-3
(4,9) in cluster-1
(7,3) in cluster-2
(1,3) in cluster-3
(3,9) in cluster-1
the error4 is:12.246379
the new center point of 1 is: (3.500000,9.000000)
the new center point of 2 is: (7.000000,4.000000)
the new center point of 3 is: (1.333333,3.333333)
(2,10) in cluster-1
(2,5) in cluster-3
(8,4) in cluster-2
(5,8) in cluster-1
(7,5) in cluster-2
(6,4) in cluster-2
(1,2) in cluster-3
(4,9) in cluster-1
(7,3) in cluster-2
(1,3) in cluster-3
(3,9) in cluster-1
the error5 is:12.246379
the total number of cluster is:5
网友评论