美文网首页
聚类分析_客户群聚类分析

聚类分析_客户群聚类分析

作者: a_big_cat | 来源:发表于2021-03-28 11:07 被阅读0次

    聚类是一种非监督学习方法。本文使用k-means聚类算法实现客户细分,并说明如何将聚类结果应用于实际业务中的营销策略。

    1.导入数据

    import pandas as pd
    import numpy as np
    import matplotlib.pyplot as plt
    import matplotlib.ticker as ticker
    import seaborn as sns
    from sklearn.cluster import KMeans
    
    # Load the mall customer dataset from a CSV file located in the working directory.
    data = pd.read_csv('./Mall_Customers.csv')
    

    2.数据探索

    
    # Preview the first rows of the table to check the columns and sample values.
    data.head()
    
    CustomerID Gender Age Annual Income (k$) Spending Score (1-100)
    0 1 Male 19 15 39
    1 2 Male 21 15 81
    2 3 Female 20 16 6
    3 4 Female 23 16 77
    4 5 Female 31 17 40
    
    
    # Print the column names, non-null counts and dtypes of the DataFrame.
    data.info()
    
    
    
    <class 'pandas.core.frame.DataFrame'>
    RangeIndex: 200 entries, 0 to 199
    Data columns (total 5 columns):
     #   Column                  Non-Null Count  Dtype 
    ---  ------                  --------------  ----- 
     0   CustomerID              200 non-null    int64 
     1   Gender                  200 non-null    object
     2   Age                     200 non-null    int64 
     3   Annual Income (k$)      200 non-null    int64 
     4   Spending Score (1-100)  200 non-null    int64 
    dtypes: int64(4), object(1)
    memory usage: 7.9+ KB
    
    
    
    # For each column, report whether it contains at least one missing value.
    data.isnull().any()
    
    
    
    CustomerID                False
    Gender                    False
    Age                       False
    Annual Income (k$)        False
    Spending Score (1-100)    False
    dtype: bool
    
    
    
    # Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
    data.describe()
    
    
    
    CustomerID Age Annual Income (k$) Spending Score (1-100)
    count 200.000000 200.000000 200.000000 200.000000
    mean 100.500000 38.850000 60.560000 50.200000
    std 57.879185 13.969007 26.264721 25.823522
    min 1.000000 18.000000 15.000000 1.000000
    25% 50.750000 28.750000 41.500000 34.750000
    50% 100.500000 36.000000 61.500000 50.000000
    75% 150.250000 49.000000 78.000000 73.000000
    max 200.000000 70.000000 137.000000 99.000000
    
    
    # Number of customers per gender: count the CustomerID entries in each Gender group.
    data[['Gender','CustomerID']].groupby('Gender').count()
    
    
    
    CustomerID

    |Gender||
    |Female|112|
    |Male|88|

    
    
    
    # Pie chart of the customer split by gender.
    # value_counts() orders the categories by descending frequency, not by name.
    gender = data['Gender'].value_counts()
    # Take the wedge labels from the counted index so that every label is
    # guaranteed to match its value, instead of hard-coding ['Female', 'Male'].
    labels = gender.index.tolist()
    colors = ['c', 'coral']
    # Pull the second wedge slightly out of the pie (expects exactly two categories).
    explode = [0, 0.05]
    plt.figure(figsize=(8, 8))
    plt.title('Total of customers by gender', fontsize=16, fontweight='bold')
    plt.pie(gender, colors=colors, autopct='%1.0f%%', labels=labels, explode=explode, startangle=90, textprops={'fontsize': 16})
    # Save before show(): show() may clear the current figure.
    plt.savefig('Total of customers by gender.png', bbox_inches='tight')
    plt.show()
    
    
    
    output_11_0.png
    
    
    # Distributions of spending score and annual income, side by side.
    # sns.distplot is deprecated; histplot with a KDE overlay on a density scale
    # is the replacement recommended by seaborn (requires seaborn >= 0.11).
    plt.figure(figsize=(16, 6))
    plt.subplot(1, 2, 1)
    sns.histplot(data['Spending Score (1-100)'], color='green', kde=True, stat='density', kde_kws={'cut': 3})
    plt.title('Distribution of Spending Score')
    plt.subplot(1, 2, 2)
    sns.histplot(data['Annual Income (k$)'], color='green', kde=True, stat='density', kde_kws={'cut': 3})
    plt.title('Distribution of Annual Income (k$)')
    plt.show()
    
    
    
    output_12_0.png
    
    
    # Pairwise scatter plots of the three numeric features, with KDE curves on the diagonal.
    pair_columns = ['Spending Score (1-100)', 'Annual Income (k$)', 'Age']
    sns.pairplot(data=data[pair_columns], diag_kind="kde")
    plt.savefig('Distribution.png', bbox_inches='tight')
    
    
    
    output_13_0.png
    
    
    # Scatter plot of annual income (x) against spending score (y).
    income = data['Annual Income (k$)']
    score = data['Spending Score (1-100)']
    plt.figure(figsize=(8, 6))
    plt.title('Annual Income vs Spending Score', fontsize=16, fontweight='bold')
    plt.scatter(income, score, color='indianred', edgecolors='crimson')
    plt.xlabel('Annual Income', fontsize=14)
    plt.ylabel('Spending Score', fontsize=14)
    # Write the figure to disk before displaying it.
    plt.savefig('Annual Income vs Spending Score.png', bbox_inches='tight')
    plt.show()
    
    
    
    output_14_0.png

    3.模型开发

    
    
    # Build the two 2-column feature matrices used for clustering.
    # Columns are selected by name instead of by position, so the selection
    # stays correct even if the column order of the CSV changes.
    X1_Matrix = data[['Age', 'Spending Score (1-100)']].values  # Age & Spending Score
    X2_Matrix = data[['Annual Income (k$)', 'Spending Score (1-100)']].values  # Annual Income & Spending Score
    
    
    
    
    
    # Elbow search on (Age, Spending Score): fit k-means for 1..19 clusters
    # and record the inertia reported by each fitted model.
    inertias_1 = []
    for n_clusters in range(1, 20):
        model = KMeans(n_clusters=n_clusters, init='k-means++', max_iter=300, n_init=10, random_state=0).fit(X1_Matrix)
        inertias_1.append(model.inertia_)
        print('For n_cluster =', n_clusters, 'The inertia is:', model.inertia_)
    
    
    
    For n_cluster = 1 The inertia is: 171535.5
    For n_cluster = 2 The inertia is: 75949.15601023017
    For n_cluster = 3 The inertia is: 45840.67661610867
    For n_cluster = 4 The inertia is: 28165.58356662934
    For n_cluster = 5 The inertia is: 23830.24505228459
    For n_cluster = 6 The inertia is: 19502.407839362204
    For n_cluster = 7 The inertia is: 15523.684014328752
    For n_cluster = 8 The inertia is: 13020.084512948222
    For n_cluster = 9 The inertia is: 11517.231348351697
    For n_cluster = 10 The inertia is: 10299.698359250398
    For n_cluster = 11 The inertia is: 9404.802904325206
    For n_cluster = 12 The inertia is: 8659.542579270144
    For n_cluster = 13 The inertia is: 7896.277200074606
    For n_cluster = 14 The inertia is: 7223.8088214073505
    For n_cluster = 15 The inertia is: 6691.75644045497
    For n_cluster = 16 The inertia is: 6160.592835350923
    For n_cluster = 17 The inertia is: 5552.953625949214
    For n_cluster = 18 The inertia is: 5356.265766259883
    For n_cluster = 19 The inertia is: 4869.198509239299
    
    
    
    # Elbow curve: inertia plotted against the number of clusters K (1..19).
    k_values = np.arange(1, 20)
    figure = plt.figure(1, figsize=(15, 6), dpi=300)
    plt.plot(k_values, inertias_1, alpha=0.8, marker='o')
    plt.xlabel("K")
    plt.ylabel("Inertia ")
    
    
    
    Text(0, 0.5, 'Inertia ')
    
    output_18_1.png
    
    
    # Fit the final 5-cluster model on (Age, Spending Score) and keep the
    # per-customer cluster labels and the cluster centroids.
    Kmeans = KMeans(n_clusters=5, init='k-means++', max_iter=300, n_init=10, random_state=0)
    labels = Kmeans.fit_predict(X1_Matrix)
    centroids1 = Kmeans.cluster_centers_
    # Draw every cluster (ids 0..4, in order) with its own colour and marker.
    cluster_styles = [('red', 'o'), ('blue', '^'), ('grey', 's'), ('orange', 'p'), ('green', '*')]
    for cluster_id, (colour, shape) in enumerate(cluster_styles):
        members = X1_Matrix[labels == cluster_id]
        plt.scatter(x=members[:, 0], y=members[:, 1], s=20, c=colour, marker=shape)
    # Mark the centroid of each cluster with a large '+'.
    plt.scatter(x=centroids1[:, 0], y=centroids1[:, 1], s=300, alpha=0.8, marker='+', label='Centroids')
    # Title, axis labels and legend.
    plt.title("Cluster Of Customers", fontsize=20)
    plt.xlabel("Age")
    plt.ylabel("Spending Score (1-100)")
    plt.legend(loc=0)
    
    
    
    <matplotlib.legend.Legend at 0x228401f81c8>
    
    output_19_1.png
    
    
    # Number of customers assigned to each cluster label.
    pd.Series(labels).value_counts()
    
    
    
    0    57
    1    41
    2    37
    3    34
    4    31
    dtype: int64
    
    
    
    # Elbow search on (Annual Income, Spending Score): fit k-means for 1..7
    # clusters and record the inertia reported by each fitted model.
    inertias_2 = []
    for n_clusters in range(1, 8):
        model = KMeans(n_clusters=n_clusters, init='k-means++', max_iter=300, n_init=10, random_state=1).fit(X2_Matrix)
        inertias_2.append(model.inertia_)
        print('For n_cluster =', n_clusters, 'The inertia is:', model.inertia_)
    
    
    
    For n_cluster = 1 The inertia is: 269981.28
    For n_cluster = 2 The inertia is: 181363.59595959596
    For n_cluster = 3 The inertia is: 106348.37306211118
    For n_cluster = 4 The inertia is: 73679.78903948834
    For n_cluster = 5 The inertia is: 44448.45544793371
    For n_cluster = 6 The inertia is: 37233.81451071001
    For n_cluster = 7 The inertia is: 30227.606513152015
    
    
    
    # Elbow curve: inertia plotted against the number of clusters K (1..7).
    k_values = np.arange(1, 8)
    figure = plt.figure(1, figsize=(15, 6), dpi=80)
    plt.plot(k_values, inertias_2, alpha=0.8, marker='o')
    plt.xlabel("K")
    plt.ylabel("Inertia ")
    # Fit the final 5-cluster model on (Annual Income, Spending Score) and keep
    # the per-customer cluster labels and the cluster centroids.
    Kmeans = KMeans(n_clusters=5, init='k-means++', max_iter=300, n_init=10, random_state=1)
    labels = Kmeans.fit_predict(X2_Matrix)
    centroids2 = Kmeans.cluster_centers_
    
    
    
    output_22_0.png
    
    
    # Visualize the 5 clusters found on (Annual Income, Spending Score).
    # Both coordinates are read from X2_Matrix, the matrix the model was fitted
    # on. The previous version took the y values from X1_Matrix, which only
    # worked because both matrices share the spending score as their second column.
    cluster_styles = [('red', 'o'), ('blue', '^'), ('grey', 's'), ('orange', 'p'), ('green', '*')]
    for cluster_id, (colour, shape) in enumerate(cluster_styles):
        members = X2_Matrix[labels == cluster_id]
        plt.scatter(x=members[:, 0], y=members[:, 1], s=20, c=colour, marker=shape)
    # Mark the centroid of each cluster with a large '+'.
    plt.scatter(x=centroids2[:, 0], y=centroids2[:, 1], s=300, alpha=0.8, marker='+', label='Centroids')
    # Title, axis labels and legend.
    plt.title("Cluster Of Customers", fontsize=20)
    plt.xlabel("Annual Income (k$)")
    plt.ylabel("Spending Score (1-100)")
    plt.legend(loc=7)
    
    
    
    <matplotlib.legend.Legend at 0x22840569d88>
    
    output_23_1.png

    4.总结

    聚类结果显示:
    在年龄方面,我们可以将客户分为5类,其中一类年轻人消费能力特别强,需要重点关注。
    在年收入方面,我们可以将客户分为5类:高收入低消费、高收入高消费、中等收入中端消费、低收入低消费以及低收入高消费,可以针对他们制定有针对性的营销策略。

    相关文章

      网友评论

          本文标题:聚类分析_客户群聚类分析

          本文链接:https://www.haomeiwen.com/subject/wrlphltx.html