四、用户行为分析
这里我们需要使用订单明细表:ods_sales_orders,ods_customer用户表
#读取数据库客户信息表
engine = create_engine('mysql://frogdata001:Frogdata!123@106.13.128.83:3306/adventure_ods?charset=gbk')
datafrog=engine
df_CUSTOMER = pd.read_sql_query("select customer_key,birth_date,gender,marital_status from ods_customer where create_date < '2019-12-1'",con = datafrog)
#读取数据库销售订单表
engine = create_engine('mysql://frogdata001:Frogdata!123@106.13.128.83:3306/adventure_ods?charset=gbk')
datafrog=engine
df_sales_orders_11 = pd.read_sql_query("select * from ods_sales_orders where create_date>='2019-11-1' and create_date<'2019-12-1'",con = datafrog)
#销售订单表中仅客户编号,无客户年龄性别等信息,需要将销售订单表和客户信息表合并
#pd.merge
sales_customer_order_11=pd.merge(df_sales_orders_11,df_CUSTOMER,on='customer_key',how='left')
sales_customer_order_11.head(3)
作业 4.1.1 根据sales_customer_order_11['birth_date'],获取客人的年份作为新的一列,要求以字符串类型存储,最终效果如图
提示:
1.可以利用split
2.利用切片
3.请注意birth_date的数据类型
customer_birth_year = sales_customer_order_11.birth_date.str[0:4].rename('birth_year') 或者
customer_birth_year = sales_customer_order_11.birth_date.str.split('-').str[0].rename('birth_year')
#改名以避免重复列名
sales_customer_order_11[sales_customer_order_11['birth_year'].isnull()].head()
#查看是否有NaN
4.1 用户年龄分析
作业 4.1.2 sales_customer_order_11['birth_year']字段要求修改为int类型
#修改出生年为int数据类型
sales_customer_order_11['birth_year'] = sales_customer_order_11['birth_year'].apply(lambda x:np.NaN if pd.isnull(x) else int(x))
# 计算用户年龄
sales_customer_order_11['customer_age'] = 2019 - sales_customer_order_11['birth_year']
sales_customer_order_11
作业 4.1.3 请利用customer_age字段,进行年龄分层,划分层次为"30-34","35-39","40-44","45-49","50-54","55-59","60-64",最终形成age_level字段,具体如图。提示:尝试使用pd.cut
#新增'age_level'分层区间列
sales_customer_order_11['age_level'] = pd.cut(sales_customer_order_11['customer_age'],bins=[30,35,40,45,50,55,60,65],right=False,labels=["30-34","35-39","40-44","45-49","50-54","55-59","60-64"])
#筛选销售订单为自行车的订单信息
df_customer_order_bycle = sales_customer_order_11.loc[sales_customer_order_11['cplb_zw'] == '自行车']
作业 4.1.4 计算年龄比例,最终形成df_customer_order_bycle['age_level_rate']
# 计算年龄比率
df_customer_order_bycle['age_level_rate'] = 1/df_customer_order_bycle.age_level.count()
作业 4.1.5 将年龄分为3个层次,分别为'<=29'、'30-39'、'>=40'
#将年龄分为3个层次
df_customer_order_bycle['age_level2']=pd.cut(df_customer_order_bycle['customer_age'],bins=[0,30,40,200],right=False,labels=['<=29','30-39','>=40'])
# 求每个年龄段人数
age_level2_count = df_customer_order_bycle.groupby(by = 'age_level2').sales_order_key.count().reset_index()
age_level2_count
4.2、用户性别
gender_count = df_customer_order_bycle.groupby(by = 'gender').cplb_zw.count().reset_index()
gender_count
df_customer_order_bycle = pd.merge(df_customer_order_bycle,age_level2_count,on = 'age_level2').rename(columns = {'sales_order_key_y':'age_level2_count'})
df_customer_order_bycle['age_level2_rate'] = 1/df_customer_order_bycle['age_level2_count']
df_customer_order_bycle = pd.merge(df_customer_order_bycle,gender_count,on = 'gender').rename(columns = {'cplb_zw_y':'gender_count'})
df_customer_order_bycle['gender_rate'] = 1/df_customer_order_bycle['gender_count']
df_customer_order_bycle
网友评论