import numpy as np
import pandas as pd
df=pd.read_csv("data.csv")
df.head()
'''
RFM
最近一次消费 Recency
消费频率 Frequency
消费金额 Monetary
为了构建用户聚类模型,需要处理的数据有:
InvoiceDate 日期间隔
InvociceNo 消费次数
UnitPrice 消费金额
InvoiceNo StockCode Description Quantity InvoiceDate UnitPrice CustomerID Country
0 536365 85123A WHITE HANGING HEART T-LIGHT HOLDER 6 12/1/2010 8:26 2.55 17850.0 United Kingdom
1 536365 71053 WHITE METAL LANTERN 6 12/1/2010 8:26 3.39 17850.0 United Kingdom
2 536365 84406B CREAM CUPID HEARTS COAT HANGER 8 12/1/2010 8:26 2.75 17850.0 United Kingdom
3 536365 84029G KNITTED UNION FLAG HOT WATER BOTTLE 6 12/1/2010 8:26 3.39 17850.0 United Kingdom
4 536365 84029E RED WOOLLY HOTTIE WHITE HEART. 6 12/1/2010 8:26 3.39 17850.0 United Kingdom
InvoiceNo: 代表订单的编号
StockCode: 代表具体商品的编号
Description:代表对商品的描述
Quantity: 代表购买的数量
InvoiceDate:代表生成此订单的日期
UnitPrice: 代表此商品的单价
CustomerID: 代表用户ID
Country: 代表购买用户国家
'''
#预处理数据:
#1、将用户ID(CustomerID)类型转为字符串类型
df.info()
'''
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 541909 entries, 0 to 541908
Data columns (total 8 columns):
InvoiceNo 541909 non-null object
StockCode 541909 non-null object
Description 540455 non-null object
Quantity 541909 non-null int64
InvoiceDate 541909 non-null object
UnitPrice 541909 non-null float64
CustomerID 406829 non-null float64
Country 541909 non-null object
dtypes: float64(2), int64(1), object(5)
memory usage: 33.1+ MB
'''
df.CustomerID=df.CustomerID.astype('str')
df.info()
'''
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 541909 entries, 0 to 541908
Data columns (total 8 columns):
InvoiceNo 541909 non-null object
StockCode 541909 non-null object
Description 540455 non-null object
Quantity 541909 non-null int64
InvoiceDate 541909 non-null object
UnitPrice 541909 non-null float64
CustomerID 541909 non-null object
Country 541909 non-null object
dtypes: float64(1), int64(1), object(6)
memory usage: 33.1+ MB
'''
#2、处理异常值
df.describe()
'''
由结果得知,describe()只对数值类型的列进行处理显示。
且由结果得知,Quantity数量和UnitPrice单价存在小数,这在现实中是不存在的,因此需要排除掉小于0的值
'''
'''
Quantity UnitPrice
count 541909.000000 541909.000000
mean 9.552250 4.611114
std 218.081158 96.759853
min -80995.000000 -11062.060000
25% 1.000000 1.250000
50% 3.000000 2.080000
75% 10.000000 4.130000
max 80995.000000 38970.000000
'''
df=df.loc[(df.Quantity>0) & (df.UnitPrice>0)]
df.describe()
'''
Quantity UnitPrice
count 530104.000000 530104.000000
mean 10.542037 3.907625
std 155.524124 35.915681
min 1.000000 0.001000
25% 1.000000 1.250000
50% 3.000000 2.080000
75% 10.000000 4.130000
max 80995.000000 13541.330000
'''
'''
发现 Quantity 以及 UnitPrice 最大值,远远大于正常值
简单处理:最大值 = q3 + 1.5 IQR
即
Quantity_max = df.Quantity.quantile(0.75) + 1.5*(df.Quantity.quantile(0.75) - df.Quantity.quantile(0.25))
UnitPrice_max = df.UnitPrice.quantile(0.75)+ 1.5*(df.UnitPrice.quantile(0.75) - df.UnitPrice.quantile(0.25))
'''
![](https://img.haomeiwen.com/i13857104/f4bd6b14dbd4e56f.png)
箱型图.png
#上四分位数值:
df.Quantity.quantile(0.75)
#结果:10.0
#下四分位数值:
df.Quantity.quantile(0.25)
#结果:1.0
#Quantity 的IQR值
Quantity_IQR=df.Quantity.quantile(0.75)-df.Quantity.quantile(0.25)
#UnitPrice 的IQR值
UnitPrice_IQR=df.UnitPrice.quantile(0.75)-df.UnitPrice.quantile(0.25)
#Quantity的 上界值
Quantity的 上界值_max = df.Quantity.quantile(0.75) + 1.5*Quantity_IQR
#UnitPrice的 上界值
UnitPrice_max = df.UnitPrice.quantile(0.75) + 1.5*UnitPrice_IQR
Quantity_max
#结果:23.5
UnitPrice_max
#结果:8.45
#重新筛选数据,过滤出不合理的超大值
df=df.loc[(df.Quantity<Quantity_max) & (df.UnitPrice<UnitPrice_max)]
df.describe()
'''
Quantity UnitPrice
count 436166.000000 436166.000000
mean 4.938260 2.699178
std 4.508586 1.927294
min 1.000000 0.001000
25% 1.000000 1.250000
50% 3.000000 2.080000
75% 8.000000 3.750000
max 23.000000 8.330000
'''
#处理缺失值
#查看是否存在缺失值 df.isna().sum()
df.isna().sum()
'''
由结果得知,没有存在缺失值的情况,但是请注意:
之前做过操作,将CustomeID的值强改为字符串类型了,因此也将控制改为了字符串类型,所以此方法不够严谨
InvoiceNo 0
StockCode 0
Description 0
Quantity 0
InvoiceDate 0
UnitPrice 0
CustomerID 0
Country 0
dtype: int64
'''
#使用 nan Null None null 重新判断CustomerID的值
df.loc[df.CustomerID.isin(['nan','Null','None','null'])]
#添加波浪线,即取反
df=df.loc[~df.CustomerID.isin(['nan','Null','None','null'])]
#双重保险,保证df中再无空值
df=df.loc[~df.CustomerID.isna()]
处理重复值
df.shape
#结果:(321007, 8)
df.drop_duplicates().shape
#结果:(316093, 8),由结果得知,明显存在重复行
#删除重复行
df=df.drop_duplicates()
df.shape
#结果:(316093, 8)
处理时间
#将日期转为 date 类型
#DatetimeIndex(df.列名).date
#即将InvoiceDate列的时间值转为"年月日"的日期类型
df['date']=pd.DatetimeIndex(df.InvoiceDate).date
df.head()
'''
InvoiceNo StockCode Description Quantity InvoiceDate UnitPrice CustomerID Country date
0 536365 85123A WHITE HANGING HEART T-LIGHT HOLDER 6 12/1/2010 8:26 2.55 17850.0 United Kingdom 2010-12-01
1 536365 71053 WHITE METAL LANTERN 6 12/1/2010 8:26 3.39 17850.0 United Kingdom 2010-12-01
2 536365 84406B CREAM CUPID HEARTS COAT HANGER 8 12/1/2010 8:26 2.75 17850.0 United Kingdom 2010-12-01
3 536365 84029G KNITTED UNION FLAG HOT WATER BOTTLE 6 12/1/2010 8:26 3.39 17850.0 United Kingdom 2010-12-01
4 536365 84029E RED WOOLLY HOTTIE WHITE HEART. 6 12/1/2010 8:26 3.39 17850.0 United Kingdom 2010-12-01
'''
处理购买总金额
total_bill=UnitPrice * Quantity
df['total_bill']=df.UnitPrice * df.Quantity
df.head()
'''
InvoiceNo StockCode Description Quantity InvoiceDate UnitPrice CustomerID Country date total_bill
0 536365 85123A WHITE HANGING HEART T-LIGHT HOLDER 6 12/1/2010 8:26 2.55 17850.0 United Kingdom 2010-12-01 15.30
1 536365 71053 WHITE METAL LANTERN 6 12/1/2010 8:26 3.39 17850.0 United Kingdom 2010-12-01 20.34
2 536365 84406B CREAM CUPID HEARTS COAT HANGER 8 12/1/2010 8:26 2.75 17850.0 United Kingdom 2010-12-01 22.00
3 536365 84029G KNITTED UNION FLAG HOT WATER BOTTLE 6 12/1/2010 8:26 3.39 17850.0 United Kingdom 2010-12-01 20.34
4 536365 84029E RED WOOLLY HOTTIE WHITE HEART. 6 12/1/2010 8:26 3.39 17850.0 United Kingdom 2010-12-01 20.34
'''
需求:以用户为分组,计算距上次购买的间隔天数,总购买次数,总购买金额
df.groupby('CustomerID').agg({'date':'max','InvoiceNo':'nunique','total_bill':'sum'})
#重置行索引,即将原来的行索引变为普通列,以默认的数字作为行索引
rfm=df.groupby('CustomerID').agg({'date':'max','InvoiceNo':'nunique','total_bill':'sum'}).reset_index()
rfm.head()
'''
CustomerID date InvoiceNo total_bill
0 12347.0 2011-12-07 7 2866.77
1 12348.0 2011-04-05 1 17.00
2 12349.0 2011-11-21 1 1155.75
3 12350.0 2011-02-02 1 274.00
4 12352.0 2011-11-03 7 1147.44
'''
#为了演示效果设置rfm中的最大date值作为now
now=rfm.date.max()
now
datetime.date(2011, 12, 9)
#接下来求距上次购买的间隔天数
#(now - x).days
rfm['Recency']=rfm.date.apply(lambda x:(now - x).days)
rfm.head()
'''
CustomerID date InvoiceNo total_bill Recency
0 12347.0 2011-12-07 7 2866.77 2
1 12348.0 2011-04-05 1 17.00 248
2 12349.0 2011-11-21 1 1155.75 18
3 12350.0 2011-02-02 1 274.00 310
4 12352.0 2011-11-03 7 1147.44 36
'''
#更改列名
InvoiceNo 改名为 Frquency
total_bill 改名为 Monetary
rfm.rename(columns={'total_bill':'Monetary','InvoiceNo':'Frquency'},inplace=True)
rfm.head()
'''
CustomerID date Frquency Monetary Recency
0 12347.0 2011-12-07 7 2866.77 2
1 12348.0 2011-04-05 1 17.00 248
2 12349.0 2011-11-21 1 1155.75 18
3 12350.0 2011-02-02 1 274.00 310
4 12352.0 2011-11-03 7 1147.44 36
'''
#更改列名顺序
rfm=rfm.loc[:,['CustomerID','Recency','Frquency','Monetary']]
rfm.head()
'''
CustomerID Recency Frquency Monetary
0 12347.0 2 7 2866.77
1 12348.0 248 1 17.00
2 12349.0 18 1 1155.75
3 12350.0 310 1 274.00
4 12352.0 36 7 1147.44
'''
#最后将数据写入csv文件保存
rfm.to_csv('rfm.csv')
网友评论