美文网首页
python基础-25-数据分析python——pandas——

python基础-25-数据分析python——pandas——

作者: 比特跃动 | 来源:发表于2019-04-02 15:50 被阅读0次

注:将 Jupyter notebook 的代码下载下来,用 Spyder 打开;今后的代码都是 Python 代码。文中以 //inputN、//outputN 分别标记各单元格的输入与输出。


//input1
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
plt.style.use('ggplot')

# Column names assigned to CDNOW_master.txt when it is read:
#   user_id        - customer ID
#   order_dt       - purchase date
#   order_products - number of products purchased
#   order_amount   - purchase amount
columns = ['user_id', 'order_dt', 'order_products', 'order_amount']

# Fields are separated by runs of whitespace. The pattern is a raw string
# because '\s' in an ordinary literal is an invalid escape sequence.
df = pd.read_table('CDNOW_master.txt', names=columns, sep=r'\s+')

# Parse the YYYYMMDD purchase dates into datetime values.
df['order_dt'] = pd.to_datetime(df.order_dt, format='%Y%m%d')

# Truncate every purchase date to month precision so orders can be
# grouped by calendar month.
df['month'] = df.order_dt.values.astype('datetime64[M]')
df.info()



//output1
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 69659 entries, 0 to 69658
Data columns (total 5 columns):
user_id           69659 non-null int64
order_dt          69659 non-null datetime64[ns]
order_products    69659 non-null int64
order_amount      69659 non-null float64
month             69659 non-null datetime64[ns]
dtypes: datetime64[ns](2), float64(1), int64(2)
memory usage: 2.7 MB








//input2
# Preview the first five rows of the order table.
df.head(n=5)



//output2
user_id order_dt    order_products  order_amount    month
0   1   1997-01-01  1   11.77   1997-01-01
1   2   1997-01-12  1   12.00   1997-01-01
2   2   1997-01-12  5   77.00   1997-01-01
3   3   1997-01-02  2   20.76   1997-01-01
4   3   1997-03-30  2   20.76   1997-03-01








//input3
# One row per user, one column per month; each cell is the number of orders
# the user placed in that month. User/month pairs without any order come out
# of the pivot as NaN and are filled with 0.
pivoted_counts = pd.pivot_table(
    df,
    values='order_dt',
    index='user_id',
    columns='month',
    aggfunc='count',
).fillna(0)
pivoted_counts.head()



//output3
1997-01-01 00:00:00 1997-02-01 00:00:00 1997-03-01 00:00:00 1997-04-01 00:00:00 1997-05-01 00:00:00 1997-06-01 00:00:00 1997-07-01 00:00:00 1997-08-01 00:00:00 1997-09-01 00:00:00 1997-10-01 00:00:00 1997-11-01 00:00:00 1997-12-01 00:00:00 1998-01-01 00:00:00 1998-02-01 00:00:00 1998-03-01 00:00:00 1998-04-01 00:00:00 1998-05-01 00:00:00 1998-06-01 00:00:00
user_id                                                                     
1   1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
2   2.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
3   1.0 0.0 1.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 2.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0
4   2.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0
5   2.0 1.0 0.0 1.0 1.0 1.0 1.0 0.0 1.0 0.0 0.0 2.0 1.0 0.0 0.0 0.0 0.0 0.0








//input4
# Collapse the monthly order counts into a purchase flag:
# 1 if the user ordered at least once in that month, otherwise 0.
# A vectorised comparison replaces the element-wise applymap/lambda
# (DataFrame.applymap is deprecated since pandas 2.1).
df_purchase = (pivoted_counts > 0).astype('int64')
df_purchase.tail()



//output4
month   1997-01-01 00:00:00 1997-02-01 00:00:00 1997-03-01 00:00:00 1997-04-01 00:00:00 1997-05-01 00:00:00 1997-06-01 00:00:00 1997-07-01 00:00:00 1997-08-01 00:00:00 1997-09-01 00:00:00 1997-10-01 00:00:00 1997-11-01 00:00:00 1997-12-01 00:00:00 1998-01-01 00:00:00 1998-02-01 00:00:00 1998-03-01 00:00:00 1998-04-01 00:00:00 1998-05-01 00:00:00 1998-06-01 00:00:00
user_id                                                                     
23566   0   0   1   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
23567   0   0   1   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
23568   0   0   1   1   0   0   0   0   0   0   0   0   0   0   0   0   0   0
23569   0   0   1   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
23570   0   0   1   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0








//input5
def active_status(data):
    """Label every month of one user's purchase flags with a status.

    Parameters
    ----------
    data : iterable of numbers
        One flag per month in chronological order (a list, a NumPy array
        or a pandas Series row). ``0`` means no purchase in that month,
        any other value means at least one purchase.

    Returns
    -------
    list of str
        One status per input month:

        * ``'unreg'``    - no purchase this month and none in any earlier month
        * ``'new'``      - first month with a purchase
        * ``'active'``   - purchase this month and in the previous month
        * ``'unactive'`` - no purchase this month, but purchased earlier
        * ``'return'``   - purchase this month after an ``'unactive'`` month
    """
    status = []
    previous = None  # status of the preceding month; None before the first one

    # Iterate over the values themselves instead of range(18) + data[i]:
    # this works for any number of months and avoids positional indexing
    # on a Series whose labels are month timestamps.
    for bought in data:
        not_yet_customer = previous is None or previous == 'unreg'
        if bought == 0:
            # No purchase this month.
            current = 'unreg' if not_yet_customer else 'unactive'
        elif not_yet_customer:
            # First purchase ever.
            current = 'new'
        elif previous == 'unactive':
            # Purchase after at least one idle month.
            current = 'return'
        else:
            # Purchase in consecutive months.
            current = 'active'
        status.append(current)
        previous = current
    return status
# Compute the monthly status of every user (one row of df_purchase per user).
# result_type='expand' turns each returned list into columns, giving a
# user x month DataFrame; without it apply() returns a Series whose cells
# are whole lists, and element-wise operations such as replace('unreg', ...)
# no longer reach the individual statuses.
purchase_stats = df_purchase.apply(active_status, axis=1, result_type='expand')
# 'expand' numbers the new columns 0..n-1; restore the month labels.
purchase_stats.columns = df_purchase.columns
purchase_stats.head()




//output5
user_id
1    [new, unactive, unactive, unactive, unactive, ...
2    [new, unactive, unactive, unactive, unactive, ...
3    [new, unactive, return, active, unactive, unac...
4    [new, unactive, unactive, unactive, unactive, ...
5    [new, active, unactive, return, active, active...
dtype: object








//input6
# Replace 'unreg' with NaN, then tally the statuses of each item passed to
# the lambda (value_counts skips NaN).
# np.nan is used because the np.NaN alias was removed in NumPy 2.0;
# pd.Series(x).value_counts() replaces the top-level pd.value_counts,
# deprecated since pandas 2.1, and accepts both a list and a Series.
purchases_stats_ct = purchase_stats.replace('unreg', np.nan).apply(
    lambda x: pd.Series(x).value_counts()
)
purchases_stats_ct.fillna(0).T.head()


//output6
user_id 1   2   3   4   5   6   7   8   9   10  ... 23561   23562   23563   23564   23565   23566   23567   23568   23569   23570
unactive    17.0    17.0    13.0    15.0    9.0 17.0    15.0    11.0    15.0    17.0    ... 13.0    15.0    14.0    13.0    15.0    15.0    15.0    14.0    15.0    15.0
new 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 ... 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0
return  0.0 0.0 3.0 2.0 3.0 0.0 2.0 3.0 2.0 0.0 ... 2.0 0.0 1.0 2.0 0.0 0.0 0.0 0.0 0.0 0.0
active  0.0 0.0 1.0 0.0 5.0 0.0 0.0 3.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0
unreg   0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 2.0 2.0 2.0 2.0 2.0 2.0 2.0 2.0 2.0 2.0
5 rows × 23570 columns



//input7
# Stacked area chart of the transposed status counts (missing counts drawn as 0).
purchases_stats_ct.fillna(0).T.plot(kind='area', stacked=True)



//output7
//计算不出来:output6 中转置后的表有 23570 列(每个用户一列),面积图需要同时绘制 23570 条序列,笔记本计算能力不够

相关文章

网友评论

      本文标题:python基础-25-数据分析python——pandas——

      本文链接:https://www.haomeiwen.com/subject/yycgbqtx.html