id dt lowcarbon
1001 2021-12-12 123
1002 2021-12-12 45
1001 2021-12-13 43
1001 2021-12-13 45
1001 2021-12-13 23
1002 2021-12-14 45
1001 2021-12-14 230
1002 2021-12-15 45
1001 2021-12-15 23
找出连续3天及以上减少碳排放量在100以上的用户
1)按照用户ID及时间字段分组,计算每个用户单日减少的碳排放量
select
id,
dt,
sum(lowcarbon) lowcarbon
from test1
group by id,dt
having lowcarbon>100;t1
1001 2021-12-12 123
1001 2021-12-13 111
1001 2021-12-14 230
等差数列法:两个等差数列如果等差相同,则相同位置的数据相减等到的结果相同
2)按照用户分组,同时按照时间排序,计算每条数据的Rank值
select
id,
dt,
lowcarbon,
rank() over(partition by id order by dt) rk
from t1;t2
3)将每行数据中的日期减去Rank值
select
id,
dt,
lowcarbon,
date_sub(dt,rk) flag
from t2;t3
4)按照用户及Flag分组,求每个组有多少条数据,并找出大于等于3条的数据
select
id,
flag,
count(*) ct
from t3
group by id,flag
having ct>=3;
5)最终HQL
select
id,
flag,
count(*) ct
from
(select
id,
dt,
lowcarbon,
date_sub(dt,rk) flag
from
(select
id,
dt,
lowcarbon,
rank() over(partition by id order by dt) rk
from
(select
id,
dt,
sum(lowcarbon) lowcarbon
from test1
group by id,dt
having lowcarbon>100)t1)t2)t3
group by id,flag
having ct>=3;
image.png
id ts
1001 17523641234
1001 17523641256
1002 17523641278
1001 17523641334
1002 17523641434
1001 17523641534
1001 17523641544
1002 17523641634
1001 17523641638
1001 17523641654
时间间隔小于60秒,则分为同一个组
1001 17523641234 1
1001 17523641256 1
1001 17523641334 2
1001 17523641534 3
1001 17523641544 3
1001 17523641638 4
1001 17523641654 4
1002 17523641278 1
1002 17523641434 2
1002 17523641634 3
1)将上一行时间数据下移
lead:领导
lag:延迟
select
id,
ts,
lag(ts,1,0) over(partition by id order by ts) lagts
from
test2;t1
1001 17523641234 0
1001 17523641256 17523641234
1001 17523641334 17523641256
1001 17523641534 17523641334
1001 17523641544 17523641534
1001 17523641638 17523641544
1001 17523641654 17523641638
1002 17523641278 0
1002 17523641434 17523641278
1002 17523641634 17523641434
2)将当前行时间数据减去上一行时间数据
select
id,
ts,
ts-lagts tsdiff
from
t1;t2
select
id,
ts,
ts-lagts tsdiff
from
(select
id,
ts,
lag(ts,1,0) over(partition by id order by ts) lagts
from
test2)t1;t2
1001 17523641234 17523641234
1001 17523641256 22
1001 17523641334 78
1001 17523641534 200
1001 17523641544 10
1001 17523641638 94
1001 17523641654 16
1002 17523641278 17523641278
1002 17523641434 156
1002 17523641634 200
3)计算每个用户范围内从第一行到当前行tsdiff大于等于60的总个数(分组号)
select
id,
ts,
sum(if(tsdiff>=60,1,0)) over(partition by id order by ts) groupid
from
t2;
4)最终HQL
select
id,
ts,
sum(if(tsdiff>=60,1,0)) over(partition by id order by ts) groupid
from
(select
id,
ts,
ts-lagts tsdiff
from
(select
id,
ts,
lag(ts,1,0) over(partition by id order by ts) lagts
from
test2)t1)t2;
image.png
id dt
1001 2021-12-12
1001 2021-12-13
1001 2021-12-14
1001 2021-12-16
1001 2021-12-19
1001 2021-12-20
1002 2021-12-12
1002 2021-12-16
1002 2021-12-17
结果
1001 5
1002 2
思路一:等差数列
1001 2021-12-12 1
1001 2021-12-13 2
1001 2021-12-14 3
1001 2021-12-16 4
1001 2021-12-19 5
1001 2021-12-20 6
1001 2021-12-12 1 2021-12-11
1001 2021-12-13 2 2021-12-11
1001 2021-12-14 3 2021-12-11
1001 2021-12-16 4 2021-12-12
1001 2021-12-19 5 2021-12-14
1001 2021-12-20 6 2021-12-14
1001 2021-12-11 3
1001 2021-12-12 1
1001 2021-12-14 1
1001 2021-12-11 3 1
1001 2021-12-12 1 2
1001 2021-12-14 1 3
1001 2021-12-11 3 1 2021-12-10
1001 2021-12-12 1 2 2021-12-10
1001 2021-12-14 1 3 2021-12-11
思路二:分组
1001 2021-12-12
1001 2021-12-13
1001 2021-12-14
1001 2021-12-16
1001 2021-12-19
1001 2021-12-20
1)将上一行时间数据下移
1001 2021-12-12 1970-01-01
1001 2021-12-13 2021-12-12
1001 2021-12-14 2021-12-13
1001 2021-12-16 2021-12-14
1001 2021-12-19 2021-12-16
1001 2021-12-20 2021-12-19
select
id,
dt,
lag(dt,1,'1970-01-01') over(partition by id order by dt) lagdt
from
test3;t1
2)将当前行时间减去上一行时间数据(datediff(dt1,dt2))
1001 2021-12-12 564564
1001 2021-12-13 1
1001 2021-12-14 1
1001 2021-12-16 2
1001 2021-12-19 3
1001 2021-12-20 1
select
id,
dt,
datediff(dt,lagdt) flag
from
t1;t2
3)按照用户分组,同时按照时间排序,计算从第一行到当前行大于2的数据的总条数(sum(if(flag>2,1,0)))
1001 2021-12-12 1
1001 2021-12-13 1
1001 2021-12-14 1
1001 2021-12-16 1
1001 2021-12-19 2
1001 2021-12-20 2
select
id,
dt,
sum(if(flag>2,1,0)) over(partition by id order by dt) flag
from
t2;t3
4)按照用户和flag分组,求最大时间减去最小时间并加上1
select
id,
flag,
datediff(max(dt),min(dt)) days
from
t3
group by id,flag;t4
5)取连续登录天数的最大值
select
id,
max(days)+1
from
t4
group by id;
6)最终HQL
select
id,
max(days)+1
from
(select
id,
flag,
datediff(max(dt),min(dt)) days
from
(select
id,
dt,
sum(if(flag>2,1,0)) over(partition by id order by dt) flag
from
(select
id,
dt,
datediff(dt,lagdt) flag
from
(select
id,
dt,
lag(dt,1,'1970-01-01') over(partition by id order by dt) lagdt
from
test3)t1)t2)t3
group by id,flag)t4
group by id;
image.png
id stt edt
oppo 2021-06-05 2021-06-09
oppo 2021-06-11 2021-06-21
vivo 2021-06-05 2021-06-15
vivo 2021-06-09 2021-06-21
redmi 2021-06-05 2021-06-21
redmi 2021-06-09 2021-06-15
redmi 2021-06-17 2021-06-26
huawei 2021-06-05 2021-06-26
huawei 2021-06-09 2021-06-15
huawei 2021-06-17 2021-06-21
1)将当前行以前的数据中最大的edt放置当前行
select
id,
stt,
edt,
max(edt) over(partition by id order by stt rows between UNBOUNDED PRECEDING and 1 PRECEDING) maxEdt
from test4;t1
redmi 2021-06-05 2021-06-21 null
redmi 2021-06-09 2021-06-15 2021-06-21
redmi 2021-06-17 2021-06-26 2021-06-21
2)比较开始时间与移动下来的数据,如果开始时间大,则不需要操作,
反之则需要将移动下来的数据加一替换当前行的开始时间
如果是第一行数据,maxEDT为null,则不需要操作
select
id,
if(maxEdt is null,stt,if(stt>maxEdt,stt,date_add(maxEdt,1))) stt,
edt
from t1;t2
redmi 2021-06-05 2021-06-21
redmi 2021-06-22 2021-06-15
redmi 2021-06-22 2021-06-26
3)将每行数据中的结束日期减去开始日期
select
id,
datediff(edt,stt) days
from
t2;t3
redmi 16
redmi -4
redmi 4
4)按照品牌分组,计算每条数据加一的总和
select
id,
sum(if(days>=0,days+1,0)) days
from
t3
group by id;
redmi 22
5)最终HQL
select
id,
sum(if(days>=0,days+1,0)) days
from
(select
id,
datediff(edt,stt) days
from
(select
id,
if(maxEdt is null,stt,if(stt>maxEdt,stt,date_add(maxEdt,1))) stt,
edt
from
(select
id,
stt,
edt,
max(edt) over(partition by id order by stt rows between UNBOUNDED PRECEDING and 1 PRECEDING) maxEdt
from test4)t1)t2)t3
group by id;
image.png
image.png
网友评论