import pandas as pd
import numpy as np
# Load the RFM table from 'rfm.csv' (path is relative to the working directory).
df = pd.read_csv(filepath_or_buffer='rfm.csv')
# Preview the first five rows; the string below is the output recorded in
# the original write-up.
df.head(n=5)
'''
CustomerID Recency Frequency Monetary
0 12347.0 2 7 2866.77
1 12348.0 248 1 17.00
2 12349.0 18 1 1155.75
3 12350.0 310 1 274.00
4 12352.0 36 7 1147.44
'''
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
import matplotlib.pyplot as plt
# Feature matrix: the three RFM columns, selected by name so that CustomerID
# (and any column added to df later, such as a cluster label when a notebook
# cell is re-run) can never end up among the features.
X = df[['Recency', 'Frequency', 'Monetary']]
X.head()
# Output recorded in the original write-up:
#    Recency  Frequency  Monetary
# 0        2          7   2866.77
# 1      248          1     17.00
# 2       18          1   1155.75
# 3      310          1    274.00
# 4       36          7   1147.44

# A first look at building a model: model = algorithm + preprocessed data.
# `n_jobs` is deliberately not passed: KMeans deprecated it in scikit-learn
# 0.23 and removed it in 1.0, where passing it raises TypeError.
# `n_init=10` pins the number of restarts to the former default, because
# newer releases changed that default and could otherwise give other results.
kmeans = KMeans(n_clusters=2, random_state=666, n_init=10)
y_predict = kmeans.fit_predict(X)
silhouette_score(X, y_predict)
# Recorded result: 0.9727905413052251. The value is close to 1, which the
# write-up takes as a sign that the model is workable.
# NOTE(review): the features are not scaled, so Monetary presumably
# dominates the distances and inflates this score -- worth confirming.

# Model attribute labels_: the cluster index assigned to every row of X.
kmeans.labels_

# Model attribute cluster_centers_: one row per cluster, columns in the
# order of X.
kmeans.cluster_centers_
# Output recorded in the original write-up. It was listed under labels_,
# but a 2x3 float array is what cluster_centers_ returns:
# array([[9.27459745e+01, 3.87262677e+00, 8.82704240e+02],
#        [2.50000000e+00, 1.03000000e+02, 4.30544575e+04]])

# predict() does not retrain the model; it assigns each row of X to the
# nearest centre learned by the fit above.
kmeans.predict(X)
# Output recorded in the original write-up:
# array([0, 0, 0, ..., 0, 0, 0], dtype=int32)
# Learning curve: collect the silhouette coefficient for each candidate
# number of clusters so the most suitable one can be chosen.
s_list = []
# The right number of clusters is not known in advance, so try 2 through 20.
# KMeans arguments used here:
#   n_clusters   -- the candidate cluster count
#   random_state -- fixed seed so the scores are reproducible
#   n_init       -- pinned to 10, the former default, since newer
#                   scikit-learn releases changed it
# `n_jobs` is not passed: it was deprecated in scikit-learn 0.23 and removed
# in 1.0, where it raises TypeError.
for i in range(2, 21):
    k = KMeans(n_clusters=i, random_state=666, n_init=10)
    y_predict = k.fit_predict(X)
    score = silhouette_score(X, y_predict)
    s_list.append(score)
# Plot the learning curve: silhouette coefficient against cluster count.
# MultipleLocator is imported from matplotlib.ticker, the module that
# defines and documents it, instead of relying on pyplot's namespace.
from matplotlib.ticker import MultipleLocator

figure, ax = plt.subplots()
ax.plot(range(2, 21), s_list)
# One major tick per integer so every candidate cluster count is readable.
ax.xaxis.set_major_locator(MultipleLocator(1))
ax.set_xlabel('n_clusters')
ax.set_ylabel('silhouette score')
![学习曲线1:簇数 2–20 对应的轮廓系数](https://img.haomeiwen.com/i13857104/fdd516cbdd797e81.png)
学习曲线1.png
# Narrow the search: look for the best silhouette score between 2 and 5
# clusters.
# `n_jobs` is not passed (deprecated in scikit-learn 0.23, removed in 1.0,
# where it raises TypeError); `n_init=10` pins the former default number of
# restarts, which newer releases changed.
s_list = []
for i in range(2, 6):
    k = KMeans(n_clusters=i, random_state=666, n_init=10)
    y_predict = k.fit_predict(X)
    score = silhouette_score(X, y_predict)
    s_list.append(score)
# Author's reading of the plot drawn below: the best-scoring cluster count
# is 2. The write-up nevertheless picks 3, on the reasoning that a
# segmentation generally uses more than two clusters.
# MultipleLocator is imported from matplotlib.ticker, the module that
# defines and documents it, instead of relying on pyplot's namespace.
from matplotlib.ticker import MultipleLocator

figure, ax = plt.subplots()
ax.plot(range(2, 6), s_list)
# One major tick per integer so every candidate cluster count is readable.
ax.xaxis.set_major_locator(MultipleLocator(1))
ax.set_xlabel('n_clusters')
ax.set_ylabel('silhouette score')
![学习曲线2:簇数 2–5 对应的轮廓系数](https://img.haomeiwen.com/i13857104/9631587cd9fe3dd4.png)
学习曲线2.png
# Final model: fit 3 clusters and attach each customer's cluster label
# (the labels_ attribute) to the original table.
# `n_jobs` is not passed: KMeans deprecated it in scikit-learn 0.23 and
# removed it in 1.0, where it raises TypeError. `n_init=10` keeps the number
# of restarts shown in the recorded repr below, since newer releases changed
# the default.
k = KMeans(n_clusters=3, random_state=666, n_init=10)
k.fit(X)
# repr recorded in the original write-up (produced by an older scikit-learn
# release, hence the n_jobs / precompute_distances entries):
# KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
#        n_clusters=3, n_init=10, n_jobs=-1, precompute_distances='auto',
#        random_state=666, tol=0.0001, verbose=0)
k.labels_
# Recorded result: array([2, 0, 0, ..., 0, 0, 0], dtype=int32)

# Store the label next to each customer row.
df['cluster'] = k.labels_
df.head()
# Output recorded in the original write-up:
#    CustomerID  Recency  Frequency  Monetary  cluster
# 0     12347.0        2          7   2866.77        2
# 1     12348.0      248          1     17.00        0
# 2     12349.0       18          1   1155.75        0
# 3     12350.0      310          1    274.00        0
# 4     12352.0       36          7   1147.44        0
网友评论