美文网首页机器学习笔记
K_means算法和调用sklearn中的k_means包

K_means算法和调用sklearn中的k_means包

作者: fred_33c7 | 来源:发表于2019-06-26 15:02 被阅读0次

    K_means是最基本的一种无监督学习聚类模型。原理非常简单。下面分享两种K_means使用方法的例子。
    本章所有源码和数据都在如下github地址能下载:https://github.com/fredfeng0326/Machine_learning

    1.从基本数学模型写出算法

    import matplotlib.pyplot as plt
    import seaborn as sns
    
    import numpy as np
    import pandas as pd
    import scipy.io as sio
    
    
    # Load the 2-D demo data set (ex7data2.mat) into a DataFrame with
    # named columns so the K-means helpers below can operate on it.
    mat = sio.loadmat('./data/ex7data2.mat')
    data2 = pd.DataFrame(mat.get('X'), columns=['X1', 'X2'])
    # print(data2.head())
    """
             X1        X2
    0  1.842080  4.607572
    1  5.658583  4.799964
    2  6.352579  3.290854
    3  2.904017  4.612204
    4  3.231979  4.939894
    """
    sns.set(context="notebook", style="white")
    # seaborn >= 0.12 made lmplot's x/y keyword-only, so pass them by
    # name instead of positionally.
    sns.lmplot(x='X1', y='X2', data=data2, fit_reg=False)
    plt.show()
    
    """
    0. random init
    """
    
    def combine_data_C(data, C):
        """Return a copy of *data* with the cluster assignment attached.

        Args:
            data: pd.DataFrame of samples.
            C: cluster index per row (any sequence alignable to data).
        Returns:
            DataFrame with an extra 'C' column; *data* itself is untouched.
        """
        labelled = data.copy()
        labelled['C'] = C
        return labelled
    
    
    def random_init(data, k):
        """choose k sample from data set as init centroids
        Args:
            data: DataFrame
            k: int
        Returns:
            k samples: ndarray (k, n)
        """
        # data.sample(k) returns k randomly chosen rows.
        # DataFrame.as_matrix() was removed in pandas 0.25 —
        # to_numpy() is the supported replacement.
        return data.sample(k).to_numpy()
    
    
    def _find_your_cluster(x, centroids):
        """find the right cluster for x with respect to shortest distance
        np.linalg.norm  求范数
        Args:
            x: ndarray (n, ) -> n features
            centroids: ndarray (k, n)
        Returns:
            k: int
        """
        distances = np.apply_along_axis(func1d=np.linalg.norm,  # this give you l2 norm
                                        axis=1,
                                        arr=centroids - x)  # use ndarray's broadcast
        return np.argmin(distances)
    
    
    def assign_cluster(data, centroids):
        """Assign each sample in *data* to its nearest centroid.

        Args:
            data: pd.DataFrame (m, n)
            centroids: ndarray (k, n)
        Returns:
            C: ndarray (m,) of cluster indices
        """
        # DataFrame.as_matrix() was removed in pandas 0.25; to_numpy()
        # is the supported replacement.
        return np.apply_along_axis(lambda x: _find_your_cluster(x, centroids),
                                   axis=1,
                                   arr=data.to_numpy())
    
    
    def new_centroids(data, C):
        """Recompute each cluster's centroid as the mean of its members.

        Args:
            data: pd.DataFrame (m, n)
            C: ndarray (m,) cluster index per sample
        Returns:
            ndarray (k, n) of centroids ordered by cluster index
        """
        data_with_c = combine_data_C(data, C)

        # Group rows by cluster, average each group, keep the rows in
        # cluster order, then drop the label column.
        # as_matrix() was removed in pandas 0.25 — use to_numpy().
        return data_with_c.groupby('C', as_index=False).\
                           mean().\
                           sort_values(by='C').\
                           drop('C', axis=1).\
                           to_numpy()
    
    
    def cost(data, centroids, C):
        """Average distance from each sample to its assigned centroid.

        Args:
            data: pd.DataFrame (m, n)
            centroids: ndarray (k, n)
            C: ndarray (m,) cluster index per sample
        Returns:
            float — the k-means objective divided by m
        """
        m = data.shape[0]

        # Fancy indexing expands C into the centroid assigned to each row.
        expand_C_with_centroids = centroids[C]

        # as_matrix() was removed in pandas 0.25 — use to_numpy().
        distances = np.apply_along_axis(func1d=np.linalg.norm,
                                        axis=1,
                                        arr=data.to_numpy() - expand_C_with_centroids)
        return distances.sum() / m
    
    
    def _k_means_iter(data, k, epoch=100, tol=0.0001):
        """Run a single k-means attempt from one random initialisation.

        Stops early once the relative cost improvement falls below *tol*.

        Args:
            data: pd.DataFrame
            k: number of clusters
            epoch: maximum number of assign/update rounds
            tol: relative cost-change threshold for early stopping
        Returns:
            (C, centroids, final_cost)
        """
        centroids = random_init(data, k)
        history = []

        for round_no in range(epoch):
            print('running epoch {}'.format(round_no))

            C = assign_cluster(data, centroids)
            centroids = new_centroids(data, C)
            history.append(cost(data, centroids, C))

            # Early break: relative improvement below tol.
            if len(history) > 1 and abs(history[-1] - history[-2]) / history[-1] < tol:
                break

        return C, centroids, history[-1]
    
    
    def k_means(data, k, epoch=100, n_init=10):
        """do multiple random init and pick the best one to return
        Args:
            data (pd.DataFrame)
            k (int): number of clusters
            epoch (int): max iterations per run
            n_init (int): number of random restarts
        Returns:
            (C, centroids, least_cost) of the cheapest run
        """
        # Keep the runs in a plain list: wrapping the (ndarray, ndarray,
        # float) tuples in np.array() built a ragged object array, which
        # NumPy >= 1.24 rejects with a ValueError.
        tries = [_k_means_iter(data, k, epoch) for _ in range(n_init)]

        # The cost is the last element of each (C, centroids, cost) tuple.
        return min(tries, key=lambda t: t[-1])
    
    
    # Demo: draw 3 random initial centroids, then run one cluster
    # assignment pass and plot the result.
    init_centroids = random_init(data2, 3)
    """
    [[2.85962615 5.26041997]
     [6.28438193 3.17360643]
     [3.18412176 1.41410799]]
    """
    
    x = np.array([1, 1])
    
    fig, ax = plt.subplots(figsize=(6, 4))
    ax.scatter(x=init_centroids[:, 0], y=init_centroids[:, 1])
    
    # Label every centroid with its index and coordinates.
    for i, node in enumerate(init_centroids):
        ax.annotate('{}: ({},{})'.format(i, node[0], node[1]), node)
    
    ax.scatter(x[0], x[1], marker='x', s=200)
    plt.show()
    """
     epoch cluster assigning
    """
    
    C = assign_cluster(data2, init_centroids)
    data_with_c = combine_data_C(data2, C)
    data_with_c.head()
    
    # seaborn >= 0.12 made lmplot's x/y keyword-only; pass them by name.
    sns.lmplot(x='X1', y='X2', hue='C', data=data_with_c, fit_reg=False)
    plt.show()
    
    结果1

    2.调用sklearn K_means的包直接使用

    from sklearn.cluster import KMeans
    import matplotlib.pyplot as plt
    import seaborn as sns
    
    import numpy as np
    import pandas as pd
    import scipy.io as sio
    
    # Load the same demo data and cluster it with scikit-learn's KMeans.
    mat = sio.loadmat('./data/ex7data2.mat')
    data2 = pd.DataFrame(mat.get('X'), columns=['X1', 'X2'])
    
    # KMeans.fit returns the fitted estimator, so the calls chain.
    sk_kmeans = KMeans(n_clusters=3).fit(data2)
    sk_C = sk_kmeans.predict(data2)
    
    
    def combine_data_C(data, C):
        """Attach cluster labels *C* as a new 'C' column on a copy of *data*."""
        out = data.copy()
        out['C'] = C
        return out
    
    # Attach the predicted labels as an extra 'C' column for plotting.
    data_with_c = combine_data_C(data2, sk_C)
    
    # print(data_with_c)
    # seaborn >= 0.12 made lmplot's x/y keyword-only; pass them by name.
    sns.lmplot(x='X1', y='X2', hue='C', data=data_with_c, fit_reg=False)
    plt.show()
    
    结果2

    相关文章

      网友评论

        本文标题:K_means算法和调用sklearn中的k_means包

        本文链接:https://www.haomeiwen.com/subject/bpyccctx.html