K-means是最基本的一种无监督学习聚类模型,原理非常简单。下面分享两种K-means使用方法的例子。
本章所有源码和数据都在如下github地址能下载:https://github.com/fredfeng0326/Machine_learning
1.从基本数学模型写出算法
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd
import scipy.io as sio
# Load the ex7data2 data set (matrix "X": m samples x 2 features) and
# wrap it in a DataFrame with named columns.
mat = sio.loadmat('./data/ex7data2.mat')
data2 = pd.DataFrame(mat.get('X'), columns=['X1', 'X2'])
# print(data2.head())
"""
X1 X2
0 1.842080 4.607572
1 5.658583 4.799964
2 6.352579 3.290854
3 2.904017 4.612204
4 3.231979 4.939894
"""
sns.set(context="notebook", style="white")
# x/y must be passed as keywords: positional column names were removed
# from lmplot in seaborn >= 0.12.
sns.lmplot(x='X1', y='X2', data=data2, fit_reg=False)
plt.show()
"""
0. random init
"""
def combine_data_C(data, C):
    """Return a copy of *data* with the cluster assignments attached as column 'C'.

    The input DataFrame is left untouched.
    """
    labeled = data.copy()
    labeled['C'] = C
    return labeled
def random_init(data, k):
    """Choose k samples from the data set as initial centroids.

    Args:
        data: DataFrame of shape (m, n).
        k: int, number of centroids to draw.

    Returns:
        ndarray of shape (k, n) holding the sampled rows.
    """
    # DataFrame.sample returns k distinct random rows; .to_numpy() replaces
    # the deprecated .as_matrix(), which was removed in pandas 0.25.
    return data.sample(k).to_numpy()
def _find_your_cluster(x, centroids):
"""find the right cluster for x with respect to shortest distance
np.linalg.norm 求范数
Args:
x: ndarray (n, ) -> n features
centroids: ndarray (k, n)
Returns:
k: int
"""
distances = np.apply_along_axis(func1d=np.linalg.norm, # this give you l2 norm
axis=1,
arr=centroids - x) # use ndarray's broadcast
return np.argmin(distances)
def assign_cluster(data, centroids):
    """Assign the nearest centroid index to every row of data.

    Args:
        data: DataFrame (m, n).
        centroids: ndarray (k, n).

    Returns:
        ndarray (m,) of cluster indices.
    """
    # .to_numpy() replaces the deprecated .as_matrix(), removed in pandas 0.25.
    return np.apply_along_axis(lambda x: _find_your_cluster(x, centroids),
                               axis=1,
                               arr=data.to_numpy())
def new_centroids(data, C):
    """Recompute each centroid as the mean of the samples assigned to it.

    Args:
        data: DataFrame (m, n).
        C: ndarray (m,) of cluster indices.

    Returns:
        ndarray of centroids ordered by cluster index. Clusters that
        received no samples simply drop out of the result.
    """
    data_with_c = combine_data_C(data, C)
    # Group by assignment, average each group, then strip the label column.
    # .to_numpy() replaces the deprecated .as_matrix(), removed in pandas 0.25.
    return data_with_c.groupby('C', as_index=False).\
        mean().\
        sort_values(by='C').\
        drop('C', axis=1).\
        to_numpy()
def cost(data, centroids, C):
    """Mean distance from every sample to its assigned centroid (distortion).

    Args:
        data: DataFrame (m, n).
        centroids: ndarray (k, n).
        C: ndarray (m,) of cluster indices.

    Returns:
        float -- average L2 distance.
    """
    m = data.shape[0]
    # Fancy-index the centroid belonging to each sample, then take the
    # row-wise L2 norm of the residuals. .to_numpy() replaces the
    # deprecated .as_matrix(), removed in pandas 0.25.
    expand_C_with_centroids = centroids[C]
    distances = np.linalg.norm(data.to_numpy() - expand_C_with_centroids,
                               axis=1)
    return distances.sum() / m
def _k_means_iter(data, k, epoch=100, tol=0.0001):
    """Run k-means once from a single random initialisation.

    Iterates assign/update steps, stopping early once the relative cost
    improvement between consecutive epochs drops below tol.

    Returns:
        (C, centroids, final_cost)
    """
    centroids = random_init(data, k)
    history = []
    for step in range(epoch):
        print('running epoch {}'.format(step))
        C = assign_cluster(data, centroids)
        centroids = new_centroids(data, C)
        history.append(cost(data, centroids, C))
        if step == 0:
            # Need two cost values before convergence can be judged.
            continue
        if np.abs(history[-1] - history[-2]) / history[-1] < tol:
            break
    return C, centroids, history[-1]
def k_means(data, k, epoch=100, n_init=10):
    """Run k-means n_init times with different random inits; keep the best.

    Args:
        data (pd.DataFrame)
        k: number of clusters.
        epoch: max iterations per run.
        n_init: number of random restarts.

    Returns:
        (C, centroids, least_cost) of the lowest-cost run.
    """
    # The original np.array([...]) over (C, centroids, cost) tuples built a
    # ragged object array, which modern NumPy refuses to create implicitly
    # (raises ValueError). Selecting the minimum-cost tuple directly is both
    # safe and clearer.
    tries = [_k_means_iter(data, k, epoch) for _ in range(n_init)]
    return min(tries, key=lambda result: result[-1])
# --- Demo: visualise one random initialisation and one assignment pass ---
init_centroids = random_init(data2, 3)
"""
[[2.85962615 5.26041997]
[6.28438193 3.17360643]
[3.18412176 1.41410799]]
"""
# A probe point, plotted alongside the three initial centroids.
x = np.array([1, 1])
fig, ax = plt.subplots(figsize=(6, 4))
ax.scatter(x=init_centroids[:, 0], y=init_centroids[:, 1])
for i, node in enumerate(init_centroids):
    ax.annotate('{}: ({},{})'.format(i, node[0], node[1]), node)
ax.scatter(x[0], x[1], marker='x', s=200)
plt.show()
"""
epoch cluster assigning
"""
C = assign_cluster(data2, init_centroids)
data_with_c = combine_data_C(data2, C)
data_with_c.head()
# x/y must be keywords: positional column names were removed from lmplot
# in seaborn >= 0.12.
sns.lmplot(x='X1', y='X2', hue='C', data=data_with_c, fit_reg=False)
plt.show()
结果1
2.调用sklearn K_means的包直接使用
# --- Example 2: the same clustering done with scikit-learn's KMeans ---
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd
import scipy.io as sio
# Load the same ex7data2 matrix ("X": m samples x 2 features) as example 1.
mat = sio.loadmat('./data/ex7data2.mat')
data2 = pd.DataFrame(mat.get('X'), columns=['X1', 'X2'])
# Fit 3 clusters, then predict a cluster index for every sample.
sk_kmeans = KMeans(n_clusters=3)
sk_kmeans.fit(data2)
sk_C = sk_kmeans.predict(data2)
def combine_data_C(data, C):
    """Return a copy of *data* extended with the cluster labels as column 'C'."""
    result = data.copy()
    result['C'] = C
    return result
# Attach the predicted labels as an extra column 'C' and plot the clusters.
data_with_c = combine_data_C(data2, sk_C)
# print(data_with_c)
# x/y must be keywords: positional column names were removed from lmplot
# in seaborn >= 0.12.
sns.lmplot(x='X1', y='X2', hue='C', data=data_with_c, fit_reg=False)
plt.show()
结果2
网友评论