注:将Jupyter notebook中的代码下载下来,用Spyder打开;下文的代码都是Python代码。
//input1
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
plt.style.use('ggplot')
# Column names for the transaction file, which is read without a header row.
columns = ['user_id', 'order_dt', 'order_products', 'order_amount']
# The separator is a regex (one or more whitespace characters). It is written
# as a raw string so the backslash reaches pandas untouched; a plain '\s+'
# literal is an invalid escape sequence and triggers a warning on modern Python.
df = pd.read_table('CDNOW_master.txt', names=columns, sep=r'\s+')
//user_id:用户ID
//order_dt:购买日期
//order_products:购买产品数
//order_amount:购买金额
# Parse the order_dt column into timestamps using the %Y%m%d layout.
df['order_dt'] = pd.to_datetime(df['order_dt'], format='%Y%m%d')
# Truncate every order date to month precision to obtain a per-month bucket.
df['month'] = df['order_dt'].values.astype('datetime64[M]')
# Print the column/dtype summary of the frame.
df.info()
//output1
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 69659 entries, 0 to 69658
Data columns (total 5 columns):
user_id 69659 non-null int64
order_dt 69659 non-null datetime64[ns]
order_products 69659 non-null int64
order_amount 69659 non-null float64
month 69659 non-null datetime64[ns]
dtypes: datetime64[ns](2), float64(1), int64(2)
memory usage: 2.7 MB
//input2
# Preview the first five rows of the order table.
df.head(n=5)
//output2
user_id order_dt order_products order_amount month
0 1 1997-01-01 1 11.77 1997-01-01
1 2 1997-01-12 1 12.00 1997-01-01
2 2 1997-01-12 5 77.00 1997-01-01
3 3 1997-01-02 2 20.76 1997-01-01
4 3 1997-03-30 2 20.76 1997-03-01
//input3
# Build a user x month table whose cells hold the number of orders placed.
# Combinations with no orders come back as NaN, so they are filled with 0.
pivoted_counts = pd.pivot_table(
    df,
    index='user_id',
    columns='month',
    values='order_dt',
    aggfunc='count',
).fillna(0)
pivoted_counts.head()
//output3
1997-01-01 00:00:00 1997-02-01 00:00:00 1997-03-01 00:00:00 1997-04-01 00:00:00 1997-05-01 00:00:00 1997-06-01 00:00:00 1997-07-01 00:00:00 1997-08-01 00:00:00 1997-09-01 00:00:00 1997-10-01 00:00:00 1997-11-01 00:00:00 1997-12-01 00:00:00 1998-01-01 00:00:00 1998-02-01 00:00:00 1998-03-01 00:00:00 1998-04-01 00:00:00 1998-05-01 00:00:00 1998-06-01 00:00:00
user_id
1 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
2 2.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
3 1.0 0.0 1.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 2.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0
4 2.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0
5 2.0 1.0 0.0 1.0 1.0 1.0 1.0 0.0 1.0 0.0 0.0 2.0 1.0 0.0 0.0 0.0 0.0 0.0
//input4
# Collapse the order counts into a 0/1 "purchased this month" flag.
# A vectorized comparison yields the same values as the element-wise
# applymap(lambda x: 1 if x > 0 else 0) while avoiding a Python-level call
# per cell and the applymap API, which newer pandas releases deprecate.
df_purchase = (pivoted_counts > 0).astype('int64')
df_purchase.tail()
//output4
month 1997-01-01 00:00:00 1997-02-01 00:00:00 1997-03-01 00:00:00 1997-04-01 00:00:00 1997-05-01 00:00:00 1997-06-01 00:00:00 1997-07-01 00:00:00 1997-08-01 00:00:00 1997-09-01 00:00:00 1997-10-01 00:00:00 1997-11-01 00:00:00 1997-12-01 00:00:00 1998-01-01 00:00:00 1998-02-01 00:00:00 1998-03-01 00:00:00 1998-04-01 00:00:00 1998-05-01 00:00:00 1998-06-01 00:00:00
user_id
23566 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
23567 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
23568 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0
23569 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
23570 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
//input5
def active_status(data):
    """Label each month of one user's purchase history with a status.

    Args:
        data: Iterable of per-month purchase flags in chronological order.
            A value equal to 0 means no purchase in that month; any other
            value means at least one purchase. The length is not fixed, and
            the values are read by iteration rather than by integer lookup,
            so a pandas Series with a non-integer index is handled as well.

    Returns:
        A list of strings, one per month:
            'unreg'    - no purchase has happened yet,
            'new'      - first purchase (directly after 'unreg' or at month 0),
            'active'   - purchase following a month that also had a purchase,
            'unactive' - no purchase, but the user has purchased before,
            'return'   - purchase following an 'unactive' month.
    """
    status = []
    for bought in data:
        # Status of the preceding month, or None for the very first month.
        previous = status[-1] if status else None
        not_registered_yet = previous is None or previous == 'unreg'

        if bought == 0:
            # No purchase this month.
            status.append('unreg' if not_registered_yet else 'unactive')
        elif not_registered_yet:
            # First purchase ever.
            status.append('new')
        elif previous == 'unactive':
            status.append('return')
        else:
            status.append('active')
    return status
# Label every user's months with a lifecycle status.
# Each list returned by active_status is wrapped in a Series indexed by the
# month columns, so apply() assembles a user x month DataFrame. Returning the
# bare list instead gives a Series whose cells are Python lists; column-wise
# operations such as replace() and value_counts() cannot look inside those
# lists, and the result ends up with one column per user instead of per month.
purchase_stats = df_purchase.apply(
    lambda row: pd.Series(active_status(row), index=df_purchase.columns),
    axis=1,
)
purchase_stats.head()
//output5
user_id
1 [new, unactive, unactive, unactive, unactive, ...
2 [new, unactive, unactive, unactive, unactive, ...
3 [new, unactive, return, active, unactive, unac...
4 [new, unactive, unactive, unactive, unactive, ...
5 [new, active, unactive, return, active, active...
dtype: object
//input6
# Blank out the 'unreg' label, then tally how often each remaining status
# occurs in every element that apply() hands to the lambda.
# - np.nan is used because the np.NaN alias no longer exists in NumPy 2.0.
# - pd.Series(x).value_counts() replaces the deprecated top-level
#   pd.value_counts and accepts both a plain list and a Series.
purchases_stats_ct = purchase_stats.replace('unreg', np.nan).apply(
    lambda x: pd.Series(x).value_counts()
)
purchases_stats_ct.fillna(0).T.head()
//output6
user_id 1 2 3 4 5 6 7 8 9 10 ... 23561 23562 23563 23564 23565 23566 23567 23568 23569 23570
unactive 17.0 17.0 13.0 15.0 9.0 17.0 15.0 11.0 15.0 17.0 ... 13.0 15.0 14.0 13.0 15.0 15.0 15.0 14.0 15.0 15.0
new 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 ... 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0
return 0.0 0.0 3.0 2.0 3.0 0.0 2.0 3.0 2.0 0.0 ... 2.0 0.0 1.0 2.0 0.0 0.0 0.0 0.0 0.0 0.0
active 0.0 0.0 1.0 0.0 5.0 0.0 0.0 3.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0
unreg 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 2.0 2.0 2.0 2.0 2.0 2.0 2.0 2.0 2.0 2.0
5 rows × 23570 columns
//input7
# Draw the status counts as a stacked area chart (missing counts shown as 0).
purchases_stats_ct.fillna(0).T.plot(kind='area')
//output7
//计算不出来,笔记本计算能力不够
网友评论