流量分析
1. 基础指标统计分析:
--PageView 浏览次数(PV):
select count(*) as pvs from dw_weblog_detail where datestr ="20181101";
select count(*) as pvs from dw_weblog_detail where datestr ="20181101" and valid = "true"; --过滤非法请求
--Unique Visitor 独立访客(UV):
select count(distinct remote_user) as uvs from dw_weblog_detail where datestr ="20181101"; --以cookie统计 精确度高
select count(distinct remote_addr) as uvs from dw_weblog_detail where datestr ="20181101"; --以ip统计,精确度低
--访问次数(VV):
select count(distinct session) as vvs from ods_click_stream_visit where datestr ="20181101";
--IP:
select count(distinct remote_addr) as ips from dw_weblog_detail where datestr ="20181101";
基础指标结果保存入库:
#创建一个关于uv,pv,vv,ip的表
drop table dw_webflow_basic_info;
create table dw_webflow_basic_info(month string,day string,
pv bigint,uv bigint,ip bigint,vv bigint) partitioned by(datestr string);
insert into table dw_webflow_basic_info partition(datestr="20181101")
select '201811','01',a.*,b.* from
(select count(*) as pv,count(distinct remote_addr) as uv,count(distinct remote_addr) as ips
from dw_weblog_detail
where datestr ='20181101') a join
(select count(distinct session) as vvs from ods_click_stream_visit where datestr ="20181101") b;
2. 多维度统计分析:注意gruop by语句的语法
多维度统计PV总量
第一种方式:直接在dw_weblog_detail单表上进行查询
计算该处理批次(一天)中的各小时pvs
drop table dw_pvs_everyhour_oneday;
create table dw_pvs_everyhour_oneday(month string,day string,hour string,pvs bigint) partitioned by(datestr string);
insert into table dw_pvs_everyhour_oneday partition(datestr='20181101')
select a.month as month,a.day as day,a.hour as hour,count(*) as pvs from dw_weblog_detail a
where a.datestr='20181101' group by a.month,a.day,a.hour;
--计算每天的pvs
drop table dw_pvs_everyday;
create table dw_pvs_everyday(pvs bigint,month string,day string);
insert into table dw_pvs_everyday
select count(*) as pvs,a.month as month,a.day as day from dw_weblog_detail a
group by a.month,a.day;
第二种方式:与时间维表关联查询
--维度:日
drop table dw_pvs_everyday;
create table dw_pvs_everyday(pvs bigint,month string,day string);
insert into table dw_pvs_everyday
select count(*) as pvs,a.month as month,a.day as day from (select distinct month, day from t_dim_time) a
join dw_weblog_detail b
on a.month=b.month and a.day=b.day
group by a.month,a.day;
--维度:月
drop table dw_pvs_everymonth;
create table dw_pvs_everymonth (pvs bigint,month string);
insert into table dw_pvs_everymonth
select count(*) as pvs,a.month from (select distinct month from t_dim_time) a
join dw_weblog_detail b on a.month=b.month group by a.month;
--另外,也可以直接利用之前的计算结果。比如从之前算好的小时结果中统计每一天的
Insert into table dw_pvs_everyday
Select sum(pvs) as pvs,month,day from dw_pvs_everyhour_oneday group by month,day having day='18';
按照来访维度统计pv
--统计每小时各来访url产生的pv量,查询结果存入:( "dw_pvs_referer_everyhour" )
drop table dw_pvs_referer_everyhour;
create table dw_pvs_referer_everyhour(referer_url string,referer_host string,month string,day string,hour string,pv_referer_cnt bigint) partitioned by(datestr string);
insert into table dw_pvs_referer_everyhour partition(datestr='20181101')
select http_referer,ref_host,month,day,hour,count(1) as pv_referer_cnt
from dw_weblog_detail
group by http_referer,ref_host,month,day,hour
having ref_host is not null
order by hour asc,day asc,month asc,pv_referer_cnt desc;
--统计每小时各来访host的产生的pv数并排序
drop table dw_pvs_refererhost_everyhour;
create table dw_pvs_refererhost_everyhour(ref_host string,month string,day string,hour string,ref_host_cnts bigint) partitioned by(datestr string);
insert into table dw_pvs_refererhost_everyhour partition(datestr='20181101')
select ref_host,month,day,hour,count(1) as ref_host_cnts
from dw_weblog_detail
group by ref_host,month,day,hour
having ref_host is not null
order by hour asc,day asc,month asc,ref_host_cnts desc;
3. 复合指标统计分析
人均浏览页数(平均访问深度)
- 需求描述:统计今日所有来访者平均请求的页面数。
- 总页面请求数pv/去重总人数uv
drop table dw_avgpv_user_everyday;
create table dw_avgpv_user_everyday(
day string,
avgpv string);
方式一:
insert into table dw_avgpv_user_everyday
select '20181101',pv/uv from dw_webflow_basic_info;
方式二:
insert into table dw_avgpv_user_everyday
select '20181101',sum(b.pvs)/count(b.remote_addr) from
(select remote_addr,count(*) as pvs from dw_weblog_detail where datestr='20181101' group by remote_addr) b;
平均访问频度
- 平均每个独立访客一天内访问网站的次数(产生的session个数)。
- 计算方式:访问次数vv/独立访客数uv
select count(session)/ count(distinct remote_addr) from ods_click_stream_visit where datestr ="20181101"; --符合逻辑
平均访问时长
- 平均每次访问(会话)在网站上的停留时间。
- 体现网站对访客的吸引程度。
- 平均访问时长=访问总时长/访问次数。
先计算每次会话的停留时长
select session, sum(page_staylong) as web_staylong from ods_click_pageviews where datestr ="20181101" group by session;
计算平均访问时长
select
sum(a.web_staylong)/count(a.session)
from
(select session, sum(page_staylong) as web_staylong from ods_click_pageviews where datestr ="20181101"
group by session) a;
跳出率
- 跳出率是指用户到网站上仅浏览了一个页面就离开的访问次数与所有访问次数的百分比。
- 是评价网站性能的重要指标。
/hadoop-mahout-roadmap/ 页面跳出率
总的访问次数vv
select count(*) from ods_click_stream_visit where datestr ="20181101";
--或者通过基础指标信息表直接获取
select vv from dw_webflow_basic_info where datestr ="20181101";
浏览/hadoop-mahout-roadmap/一个页面 并且离开的人数
select count(*) from ods_click_stream_visit where datestr ="20181101"
and pagevisits = 1 and outpage = "/hadoop-mahout-roadmap/";
合并计算结果:
select
(b.nums/a.vv)*100
from dw_webflow_basic_info a join (select count(*) as nums from ods_click_stream_visit where datestr ="20181101"
and pagevisits = 1 and outpage = "/hadoop-mahout-roadmap/") b;
4. 流量分析
统计pv总量最大的来源TOPN
- 需求:按照时间维度,统计一天内各小时产生最多pvs的来源topN
row_number函数
select ref_host,ref_host_cnts,concat(month,day,hour),
row_number() over (partition by concat(month,day,hour) order by ref_host_cnts desc) as od
from dw_pvs_refererhost_everyhour;
综上可以得出一张新表
drop table dw_pvs_refhost_topn_everyhour;
create table (
hour string,
toporder string,
ref_host string,
ref_host_cnts string
)partitioned by(datestr string);
取前三位的Top3
insert into table dw_pvs_refhost_topn_everyhour partition(datestr='20181101')
select t.hour,t.od,t.ref_host,t.ref_host_cnts from
(select ref_host,ref_host_cnts,concat(month,day,hour) as hour,
row_number() over (partition by concat(month,day,hour) order by ref_host_cnts desc) as od
from dw_pvs_refererhost_everyhour) t where od<=3;
4.受访分析
- 各页面访问统计
各页面PV
select request as request,count(request) as request_counts from
dw_weblog_detail group by request having request is not null order by request_counts desc limit 20;
- 热门页面统计
统计每日最热门的页面top10
drop table dw_hotpages_everyday;
create table dw_hotpages_everyday(day string,url string,pvs string);
insert into table dw_hotpages_everyday
select '20181101',a.request,a.request_counts from
(select request as request,count(request) as request_counts from dw_weblog_detail where datestr='20181101' group by request having request is not null) a
order by a.request_counts desc limit 10;
5. 访客分析
- 独立访客
需求:按照时间维度来统计独立访客及其产生的pv量
时间维度:时
drop table dw_user_dstc_ip_h;
create table dw_user_dstc_ip_h(
remote_addr string,
pvs bigint,
hour string);
insert into table dw_user_dstc_ip_h
select remote_addr,count(1) as pvs,concat(month,day,hour) as hour
from dw_weblog_detail
Where datestr='20181101'
group by concat(month,day,hour),remote_addr
order by hour asc,pvs desc;
在上述基础之上,可以继续分析,比如每小时独立访客总数
select count(1) as dstc_ip_cnts,hour from dw_user_dstc_ip_h group by hour;
时间维度:日
select remote_addr,count(1) as counts,concat(month,day) as day
from dw_weblog_detail
Where datestr='20181101'
group by concat(month,day),remote_addr;
时间维度: 月
select remote_addr,count(1) as counts,month
from dw_weblog_detail
group by month,remote_addr;
- 每日新访客
需求:将每天的新访客统计出来。
--历日去重访客累积表
drop table dw_user_dsct_history;
create table dw_user_dsct_history(
day string,
ip string
)
partitioned by(datestr string);
--每日新访客表
drop table dw_user_new_d;
create table dw_user_new_d (
day string,
ip string
)
partitioned by(datestr string);
--每日新用户插入新访客表
insert into table dw_user_new_d partition(datestr='20181101')
select tmp.day as day,tmp.today_addr as new_ip from
(
select today.day as day,today.remote_addr as today_addr,old.ip as old_addr
from
(select distinct remote_addr as remote_addr,"20181101" as day from dw_weblog_detail where datestr="20181101") today
left outer join
dw_user_dsct_history old
on today.remote_addr=old.ip
) tmp
where tmp.old_addr is null;
--每日新用户追加到累计表
insert into table dw_user_dsct_history partition(datestr='20181101')
select day,ip from dw_user_new_d where datestr='20181101';
6. 访客visit分析
回头/单次访客统计
drop table dw_user_returning;
create table dw_user_returning(
day string,
remote_addr string,
acc_cnt string)
partitioned by (datestr string);
insert overwrite table dw_user_returning partition(datestr='20181101')
select tmp.day,tmp.remote_addr,tmp.acc_cnt
from
(select '20181101' as day,remote_addr,count(session) as acc_cnt from ods_click_stream_visit group by remote_addr) tmp
where tmp.acc_cnt>1;
网友评论