美文网首页
构建Kmeans模型

构建Kmeans模型

作者: Chaweys | 来源:发表于2020-12-19 16:25 被阅读0次

    import pandas as pd
    import numpy as np
    
    # Read the RFM table from the CSV file in the working directory.
    df = pd.read_csv('rfm.csv')

    # Preview the first rows to confirm the file parsed as expected.
    df.head()
    # Output recorded in the original run:
    #     CustomerID  Recency  Frequency  Monetary
    # 0   12347.0     2        7          2866.77
    # 1   12348.0     248      1          17.00
    # 2   12349.0     18       1          1155.75
    # 3   12350.0     310      1          274.00
    # 4   12352.0     36       7          1147.44
    
    
    from sklearn.cluster import KMeans
    from sklearn.metrics import silhouette_score
    import matplotlib.pyplot as plt
    
    # Choose the data source: every column except the first one (CustomerID),
    # which leaves Recency, Frequency and Monetary as the features.
    X = df.iloc[:, 1:]

    X.head()
    # Output recorded in the original run:
    #     Recency  Frequency  Monetary
    # 0   2        7          2866.77
    # 1   248      1          17.00
    # 2   18       1          1155.75
    # 3   310      1          274.00
    # 4   36       7          1147.44

    # A quick first look at how a model is built.
    # model = algorithm + preprocessed data
    #
    # `n_jobs` is intentionally not passed: it was deprecated in scikit-learn
    # 0.23 and removed in 1.0, where passing it raises TypeError.
    # `n_init=10` pins the historical default so the result does not depend on
    # the installed scikit-learn version (newer releases default to 'auto').
    # NOTE(review): the features are used unscaled, so Monetary (values in the
    # thousands) probably dominates the Euclidean distances -- consider
    # standardising the columns first; confirm with the analysis owner.
    kmeans = KMeans(n_clusters=2, random_state=666, n_init=10)
    y_predict = kmeans.fit_predict(X)
    silhouette_score(X, y_predict)
    # Result recorded in the original run: 0.9727905413052251
    # A score close to 1 suggests the clustering is workable.

    # Model attribute labels_: the cluster index assigned to each training
    # row (the same assignments fit_predict returned above).
    kmeans.labels_

    # Model attribute cluster_centers_: one row per cluster, columns in the
    # same order as X (Recency, Frequency, Monetary).
    kmeans.cluster_centers_
    # Output recorded in the original run:
    # array([[9.27459745e+01, 3.87262677e+00, 8.82704240e+02],
    #        [2.50000000e+00, 1.03000000e+02, 4.30544575e+04]])

    # Assign every row of X to its nearest learned centroid. predict() only
    # uses the already-fitted centroids; it does not retrain the model.
    kmeans.predict(X)
    # Output recorded in the original run:
    # array([0, 0, 0, ..., 0, 0, 0], dtype=int32)
    
    
    # Build a learning curve of silhouette coefficients in order to choose the
    # most suitable number of clusters.
    # One score is recorded per candidate cluster count.
    s_list = []

    # The right number of clusters is unknown up front, so try 2 through 20
    # and compare the silhouette coefficient of each fit.
    #   n_clusters   -- number of clusters for this candidate
    #   random_state -- fixed seed so the initialisation is reproducible
    #   n_init       -- pinned to 10 (the historical default) so results do
    #                   not vary with the installed scikit-learn version
    # `n_jobs` is not passed: it was removed from KMeans in scikit-learn 1.0.
    for i in range(2, 21):
        k = KMeans(n_clusters=i, random_state=666, n_init=10)
        y_predict = k.fit_predict(X)

        score = silhouette_score(X, y_predict)
        s_list.append(score)

    # Plot the learning curve with one x-axis tick per integer cluster count.
    # MultipleLocator is imported from matplotlib.ticker, where it is defined
    # and documented, rather than relying on pyplot re-exporting it.
    from matplotlib.ticker import MultipleLocator

    figure, ax = plt.subplots()

    ax.plot(range(2, 21), s_list)
    ax.xaxis.set_major_locator(MultipleLocator(1))
    
    # [图] 学习曲线1.png

    # Narrow the search: look for the best score among 2 to 5 clusters.
    s_list = []

    # `n_jobs` is not passed (removed from KMeans in scikit-learn 1.0) and
    # `n_init=10` pins the historical default so scores are comparable across
    # scikit-learn versions.
    for i in range(2, 6):
        k = KMeans(n_clusters=i, random_state=666, n_init=10)
        y_predict = k.fit_predict(X)

        score = silhouette_score(X, y_predict)
        s_list.append(score)

    # Reading the plot, the best-scoring cluster count is 2.
    # In practice more than two clusters are usually wanted, so 3 is chosen
    # as the cluster count from here on.
    # MultipleLocator is imported from matplotlib.ticker, where it is defined
    # and documented, rather than relying on pyplot re-exporting it.
    from matplotlib.ticker import MultipleLocator

    figure, ax = plt.subplots()

    ax.plot(range(2, 6), s_list)
    ax.xaxis.set_major_locator(MultipleLocator(1))
    
    # [图] 学习曲线2.png

    # Per-customer cluster labels, read from the fitted model's labels_.
    # `n_jobs` is not passed (removed from KMeans in scikit-learn 1.0) and
    # `n_init=10` pins the historical default so the labels do not depend on
    # the installed scikit-learn version.
    k = KMeans(n_clusters=3, random_state=666, n_init=10)
    k.fit(X)
    # fit() returns the fitted estimator, so a notebook echoes its repr here;
    # the exact text of that repr depends on the scikit-learn version.

    k.labels_
    # Output recorded in the original run:
    # array([2, 0, 0, ..., 0, 0, 0], dtype=int32)

    # Attach the label of each row to the original table as a new column.
    df['cluster'] = k.labels_
    df.head()
    # Output recorded in the original run:
    #     CustomerID  Recency  Frequency  Monetary  cluster
    # 0   12347.0     2        7          2866.77   2
    # 1   12348.0     248      1          17.00     0
    # 2   12349.0     18       1          1155.75   0
    # 3   12350.0     310      1          274.00    0
    # 4   12352.0     36       7          1147.44   0
    

    相关文章

      网友评论

          本文标题:构建Kmeans模型

          本文链接:https://www.haomeiwen.com/subject/owrcnktx.html