本章内容包括:
用户个体消费分析
pandas 的 DataFrame.query 相当于 SQL 的 WHERE;切比雪夫不等式:至少 96%(1 − 1/5²)的数据落在 mean ± 5std 之内。
//input1
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
# Use the ggplot look for every chart drawn below.
plt.style.use('ggplot')
# Column names are supplied explicitly through `names`.
columns = ['user_id', 'order_dt', 'order_products', 'order_amount']
# Fields are split on runs of whitespace. The separator is a raw string so
# the regex escape `\s` is not parsed as an (invalid) string escape.
df = pd.read_table('CDNOW_master.txt', names=columns, sep=r'\s+')
//user_id:用户ID
//order_dt:购买日期
//order_products:购买产品数
//order_amount:购买金额
# Parse the YYYYMMDD purchase date into a proper datetime column.
df['order_dt'] = pd.to_datetime(df['order_dt'], format='%Y%m%d')
# Truncate each purchase date to month precision for monthly grouping.
df['month'] = df['order_dt'].to_numpy().astype('datetime64[M]')
# Print column dtypes and non-null counts as a sanity check.
df.info()
//output1
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 69659 entries, 0 to 69658
Data columns (total 5 columns):
user_id 69659 non-null int64
order_dt 69659 non-null datetime64[ns]
order_products 69659 non-null int64
order_amount 69659 non-null float64
month 69659 non-null datetime64[ns]
dtypes: datetime64[ns](2), float64(1), int64(2)
memory usage: 2.7 MB
//input2
# Preview the first five rows of the order table.
df.head(n=5)
//output2
user_id order_dt order_products order_amount month
0 1 1997-01-01 1 11.77 1997-01-01
1 2 1997-01-12 1 12.00 1997-01-01
2 2 1997-01-12 5 77.00 1997-01-01
3 3 1997-01-02 2 20.76 1997-01-01
4 3 1997-03-30 2 20.76 1997-03-01
//input3
# Group every order row by the user who placed it.
grouped_user = df.groupby('user_id')
# Summarise per-user totals. Only the two numeric columns are summed:
# `order_dt` and `month` are datetimes, which cannot be summed and make
# `sum()` raise on pandas versions that no longer drop such columns silently.
grouped_user[['order_products', 'order_amount']].sum().describe()
//output3
order_products order_amount
count 23570.000000 23570.000000
mean 7.122656 106.080426
std 16.983531 240.925195
min 1.000000 0.000000
25% 1.000000 19.970000
50% 3.000000 43.395000
75% 7.000000 106.475000
max 1033.000000 13990.930000
//input4
# Scatter of per-user total spend against total products bought, keeping
# only users whose total spend is below 4000.
# The numeric columns are selected before summing because the datetime
# columns (`order_dt`, `month`) cannot be summed.
(
    grouped_user[['order_products', 'order_amount']]
    .sum()
    .query('order_amount < 4000')
    .plot.scatter(x='order_amount', y='order_products')
)
//output4
1
//input5
# Histogram (10 bins) of per-user total products bought, keeping only users
# with fewer than 60 products in total.
# The numeric columns are selected before summing because the datetime
# columns (`order_dt`, `month`) cannot be summed.
(
    grouped_user[['order_products', 'order_amount']]
    .sum()
    .query('order_products <60')
    .order_products
    .hist(bins=10)
)
//output5
2
//input6
# Cumulative share of the totals, with users ordered from lowest to highest
# total spend: each column becomes running_sum / column_sum.
# The numeric columns are selected before summing because the datetime
# columns (`order_dt`, `month`) cannot be summed.
user_cumsum = (
    grouped_user[['order_products', 'order_amount']]
    .sum()
    .sort_values('order_amount')
    .apply(lambda x: x.cumsum() / x.sum())
)
# reset_index() replaces the user_id index with 0..n-1, so the x axis is the
# user's rank by spend and the y axis is the cumulative share of spend.
user_cumsum.reset_index().order_amount.plot()
//output6
3
网友评论