本项目基于淘宝用户行为数据,探索用户行为规律,寻找高价值用户,具体指标包括:日PV和日UV分析,支付率分析,复购行为分析,漏斗转化率分析和用户价值RFM分析。
分析步骤
在MySQL关系型数据库,利用SQL对淘宝用户行为进行数据分析。分析步骤如下:
提出问题
数据准备
数据预处理
建模分析
提出问题
本次分析的目的是想通过对淘宝用户行为数据分析,为以下问题提供解释和改进建议:
1.基于AARRR模型的电商分析指标,确定各个环节的转换率,找到需要改进的环节
2.时间维度分析:研究用户在不同时间维度下的行为规律
3.商品分析:研究用户对不同种类商品的偏好,找到针对不同商品的营销策略
4.基于RFM模型找出核心付费用户群,对这部分用户进行精准营销
数据准备
数据来源:略
利用Python导入数据,至mysql(略)
查看数据、查看列字段属性
SELECT * FROM data.tianchi_mobile_recommend_train_user;
SELECT count(1) FROM data.tianchi_mobile_recommend_train_user; -- 1048575
desc tianchi_mobile_recommend_train_user;
1.PNG
理解数据:数据集导入1048575条数据;包括6个字段,分别是
2.PNG
- user_id:用户ID,脱敏
- item_id:商品ID,脱敏
- behavior_type:用户行为类型,取值1,2,3,4,分别表示用户的点击,收藏,加入购物车,购买四种行为
- user_geohash:地理位置
- item_category:品类ID,商品所属的品类
- time:用户行为发生的时间
数据预处理
数据类型处理:增加新列date_time、dates和hours,便于后续时间维度分析;删列
set sql_safe_updates=0;
-- 增加新列date_time、dates和hours
alter table tianchi_mobile_recommend_train_user add column date_time datetime null;
update tianchi_mobile_recommend_train_user
set date_time =str_to_date(time,'%Y-%m-%d %H')
-- %H可以表示0-23;而%h表示0-12.注意空格导致的报错
alter table tianchi_mobile_recommend_train_user add column dates char(10) null;
update tianchi_mobile_recommend_train_user
set dates=date(date_time);
alter table data.tianchi_mobile_recommend_train_user add column hours char(10) null;
update data.tianchi_mobile_recommend_train_user
set hours=time (date_time);
缺失值处理:因缺失值过多,删去地理信息user_geohash;删去time列
-- 因缺失值过多,删去地理信息user_geohash;删去time列
alter table data.tianchi_mobile_recommend_train_user drop column user_geohash;
alter table data.tianchi_mobile_recommend_train_user drop column time;
SELECT * FROM data.tianchi_mobile_recommend_train_user;
3.PNG
重复值处理:创建新表a,并插入前50000条无重复数据
create table a insert into a select distinct * from XXX #创建新表a,并将重复值去除
-- 创建新表a,并插入前50000条数据。【limit 50000不需要加括号!!】
create table a like data.tianchi_mobile_recommend_train_user;
insert into a select distinct * from data.tianchi_mobile_recommend_train_user limit 50000;
SELECT count(1) FROM data.a;
4.时间跨度:从2014-11-18到2014-12-18
建模分析
基于AARRR模型分析用户购物情况
-- aquisition用户获取:【日新增用户数分析:详见下文day_0】
-- activation用户激活:【流量分析:从整体、局部(每日)分别分析】
-- 整体了解:uv、pv以及pv/uv(人均页面访问数)各是多少
select count(distinct user_id) as 'UV',
(select count(*) from data.a where behavior_type ='1') as 'PV',
(select count(*) from data.a where behavior_type ='1')/count(distinct user_id) as 'PV/UV'
from data.a;
5.PNG
-- 每日了解:日PV、日uv、人均页面访问数是多少?
select b1.dates,b1.日uv,b2.日pv,b2.日pv /b1.日uv as'每天的人均页面访问数' from
(select dates,count(distinct user_id) as '日uv'
from a
group by dates) b1
left join (
select dates,count(user_id) as'日pv'
from a
where behavior_type ='1'
group by dates) b2
on b1.dates=b2.dates
6.PNG
-- 每日的分析(1-4,分别表示点击pv、购买buy、加购物车cart、喜欢fav)
select dates,count(1) as '每日的总数',
sum(case when behavior_type=1 then 1 else 0 end) as'pv',
sum(case when behavior_type=2 then 1 else 0 end) as'buy',
sum(case when behavior_type=3 then 1 else 0 end) as'cart',
sum(case when behavior_type=4 then 1 else 0 end) as'fav'
from data.a
group by dates
7.PNG
-- 每时刻的分析
select hours,count(1) as '该时刻的总数',
sum(case when behavior_type=1 then 1 else 0 end) as'pv',
sum(case when behavior_type=2 then 1 else 0 end) as'buy',
sum(case when behavior_type=3 then 1 else 0 end) as'cart',
sum(case when behavior_type=4 then 1 else 0 end) as'fav'
from a
group by hours
8.PNG
-- # 跳失率计算:只有点击行为的用户/总用户数
select count(distinct user_id) as '只有点击行为的用户'
,concat(cast(count(distinct user_id) /410*100 as DECIMAL(18,2)),'%') as '跳失率'
from data.a
where user_id not in(select distinct user_id from data.a where behavior_type ='2')
and user_id not in(select distinct user_id from data.a where behavior_type ='3')
and user_id not in(select distinct user_id from data.a where behavior_type ='4')
-- 结果显示只有点击1pv行为没有收藏4fav、加购物车3cart以及购买2buy行为的总用户数是71,除以总用户数UV 410为17%
-- 跳出率为17%
9.concat(cast(count(distinct user_id) /410*100 as DECIMAL(18,2)),'%') as '跳失率'
-- retention用户留存:【留存分析】
-- 获取每个用户的使用时间与第一次使用时间
create view v0
as
select a0.user_id , a0.dates ,a1.firstday
from
(select user_id , dates from a
group by user_id , dates) as a0
inner join (
select user_id , min(dates) as firstday from a
group by user_id) as a1
on a0.user_id = a1.user_id
order by a0.user_id , a0.dates
-- 第一次使用时间和使用时间的间隔by_day
create view v1
as
select user_id ,dates,firstday,datediff(dates,firstday) as by_day
from v0
-- 提取字段作为列名,计算留存日的用户数量
create view v2
as
select firstday,
sum(case when by_day=0 then 1 else 0 end) as 'day_0',
sum(case when by_day=1 then 1 else 0 end) as 'day_1',sum(case when by_day=2 then 1 else 0 end) as 'day_2',
sum(case when by_day=3 then 1 else 0 end) as 'day_3',sum(case when by_day=4 then 1 else 0 end) as 'day_4',
sum(case when by_day=5 then 1 else 0 end) as 'day_5',sum(case when by_day=6 then 1 else 0 end) as 'day_6',
sum(case when by_day=7 then 1 else 0 end) as 'day_7',sum(case when by_day=15 then 1 else 0 end) as 'day_15',
sum(case when by_day=30 then 1 else 0 end) as 'day_30'
from v1
group by firstday
order by firstday;
-
day_0即当天新增人数
9【时间跨度:从2014-11-18到2014-12-18(day_0即当天新增人数)】.PNG
-- 留存率
select firstday, day_0,
concat(cast((day_1/day_0)*100 as DECIMAL(18,2)),'%') as 'day_1%',
concat(cast((day_2/day_0)*100 as DECIMAL(18,2)),'%') as 'day_2%',
concat(cast((day_3/day_0)*100 as DECIMAL(18,2)),'%') as 'day_3%',
concat(cast((day_4/day_0)*100 as DECIMAL(18,2)),'%') as 'day_4%',
concat(cast((day_5/day_0)*100 as DECIMAL(18,2)),'%') as 'day_5%',
concat(cast((day_6/day_0)*100 as DECIMAL(18,2)),'%') as 'day_6%',
concat(cast((day_7/day_0)*100 as DECIMAL(18,2)),'%') as 'day_7%',
concat(cast((day_15/day_0)*100 as DECIMAL(18,2)),'%') as 'day_15%',
concat(cast((day_30/day_0)*100 as DECIMAL(18,2)),'%') as 'day_30%'
from v2
10.PNG
-- revenue用户收益
-- 【复购分析】复购率是多少?
DROP VIEW IF EXISTS f;
create view f
as
select user_id,count(1) as '购买次数'
from a
where behavior_type='2'
group by user_id
having count(behavior_type) >=2 -- 此处使用【'购买次数'>=2】会报错;正确方式:购买次数>=2(不需要加‘’)
order by 购买次数 desc
11.PNG
-- 有购买行为的消费用户数
select count(distinct user_id)
from a
where behavior_type='2'
-- 计算复购率【复购率为70%】
select (select count(1) from f)/(select count(distinct user_id) from a where behavior_type='2') as '复购率'
from a limit 1; -- 此处要加limit 1
12.PNG
-- 复购频数分类
select 购买次数,count(user_id) as 人数
from f
group by 购买次数 -- 【购买次数】不需要加''
order by 购买次数 desc
13.PNG
-- refer用户推荐:【转化率分析 + 漏斗分析】购物车cart-3转化率、收藏fav-4转化率、购买buy-2转化率各是多少?
转化类型函数:CAST (expression AS data_type)
ROUND(expression,2)
select concat(cast(
(select count(1) from a where behavior_type='3')/
(select count(1) from a where behavior_type='1')*100 as Decimal(18,2)),'%') as '购物车转化率',
concat(cast(
(select count(1) from a where behavior_type='4')/
(select count(1) from a where behavior_type='1')*100 as Decimal(18,2)),'%') as '收藏转化率',
concat(cast(
(select count(1) from a where behavior_type='2')/
(select count(1) from a where behavior_type='1')*100 as Decimal(18,2)),'%') as 购买转化率
from a limit 1 ;
14.PNG
最终得到【购物车转化率, 收藏转化率, 购买转化率分别为2.74%, 0.90%, 2.30%】。
用户行为的漏斗:以行为发生与否为研究对象,有此行为即+1;
UV(独立访客)转化的漏斗:以每个用户为研究对象。
-- 用户行为的漏斗计算:
select behavior_type,count(user_id)
from a
group by behavior_type
-- UV(独立访客)转化的漏斗计算:
select behavior_type,count(distinct user_id) AS DIS_user
from a
group by behavior_type
15-1.PNG
15-2.PNG
-- 商品分析:统计出受欢迎的产品item_id以及类目item_categary,优化产品销售。
-- 按产品item_id分析
select product_buytimes,count(*) as product_count from(
select item_id,count(user_id) as product_buytimes
from a
where behavior_type='2'
group by item_id) as product_buypool
group by product_buytimes
order by product_buytimes desc;
16.PNG
-- 类目item_categary分析
select cat_buytimes,count(*) as cat_count from(
select item_category,count(user_id) as cat_buytimes
from a
where behavior_type='2'
group by item_category) as cat_buypool
group by cat_buytimes
order by cat_buytimes desc;
17.PNG
基于RFM模型进行用户分类
-- R等级分类
-- 1.建立r视图,将近期购买时间提取到R临时表中
create view r
as
select user_id ,max(dates) as '近期购买时间'
from a
where behavior_type='2'
group by user_id
select max(dates)
from a -- 2014-12-18
-- 2.建立R等级划分视图:将客户近期购买时间进行等级划分,越接近2017-12-04号R越大;
-- 构建R数值型(用于后续的计算全体用户的平均值)和'R'字符串型(用于计算每个等级的数量)
create view R等级划分
as
select user_id,近期购买时间,datediff('2014-12-18',近期购买时间)as 距今天数,
(case
when datediff('2014-12-18',近期购买时间)<=2 then 5
when datediff('2014-12-18',近期购买时间)<=4 then 4
when datediff('2014-12-18',近期购买时间)<=6 then 3
when datediff('2014-12-18',近期购买时间)<=8 then 2
else 1 end) as R,
(case
when datediff('2014-12-18',近期购买时间)<=2 then '5'
when datediff('2014-12-18',近期购买时间)<=4 then '4'
when datediff('2014-12-18',近期购买时间)<=6 then '3'
when datediff('2014-12-18',近期购买时间)<=8 then '2'
else '1' end) as R值
from r
18.PNG
-- F等级划分
-- 1.建立f视图
create view f
as
select user_id ,count(user_id) as '购买次数'
from a
where behavior_type='2'
group by user_id
-- 建立F等级划分
create view F等级划分
as
select user_id,购买次数,
(case when 购买次数<=2 then 1
when 购买次数<=4 then 2
when 购买次数<=6 then 3
when 购买次数<=8 then 4
else 5 end) as 'F',
(case when 购买次数<=2 then '1'
when 购买次数<=4 then '2'
when 购买次数<=6 then '3'
when 购买次数<=8 then '4'
else '5' end) as 'F值'
from f
19.PNG
-- 建立RFM模型
-- 1.R平均值(因为不区分大小写:【R等级划分】=【r等级划分】)
SELECT avg(R) as 'R平均值' FROM R等级划分; -- 2.8462
-- 2.F平均值
select avg(F) as 'F平均值' from F等级划分; -- 3.2353
-- 3.用户八大类等级划分,由于该数据没有M值,故只建立了4个分类
create view RFM汇总
as
select a.*,b.F,b.F值,
(case
when a.R>2.8462 and b.F>3.2353 then '重要高价值客户'
when a.R<2.8462 and b.F>3.2353 then '重要唤回客户'
when a.R>2.8462 and b.F<3.2353 then '重要深耕客户'
when a.R<2.8462 and b.F<3.2353 then '重要挽留客户'
END
) as 客户分类
from R等级划分 a, F等级划分 b
where a.user_id=b.user_id
20.PNG
select 客户分类,count(1) from RFM汇总
group by 客户分类
21.PNG
结论:
1.基于AARRR模型:跳出率为17%;复购率70%;购物车转化率, 收藏转化率, 购买转化率分别为2.74%, 0.90%, 2.30%。针对收藏商品但还没购买的用户,可通过短信、APP弹窗push等方式提醒用户去购物车去完成支付,进一步提高购买率。
2.时间维度分析:用户活跃时间集中在19点到23点(峰值在21点),可在该活跃时间推送新品、商家折扣优惠及促销活动,提高购买率。
3.商品分析:对于热销的商品,可以提高曝光率以及多推出一些和该商品相关的其他商品捆绑 / 交叉销售,提高商品的销量。多种方式进行客户关系维护:购买升级、交叉销售、追加销售等。
4.基于RFM模型:通过RFM模型对用户进行划分等级,对每个用户进行精准化营销;同时可以对R和F的数据进行监控,推测客户消费的异常情况,挽回流失客户。
网友评论