美文网首页
2022-03-07 开窗函数

2022-03-07 开窗函数

作者: Denholm | 来源:发表于2022-03-08 08:14 被阅读0次
    ---------开窗函数(窗口函数)-分组排序开窗函数------------------------------
    -- 序号函数:row_number,rank,dense_rank
    
    -- 1.1准备数据test1.txt
    /*
    cookie1,2018-04-10,1
    cookie1,2018-04-11,5
    cookie1,2018-04-12,7
    cookie1,2018-04-13,3
    cookie1,2018-04-14,2
    cookie1,2018-04-15,4
    cookie1,2018-04-16,4
    cookie2,2018-04-10,2
    cookie2,2018-04-11,3
    cookie2,2018-04-12,5
    cookie2,2018-04-13,6
    cookie2,2018-04-14,3
    cookie2,2018-04-15,9
    cookie2,2018-04-16,7
    */
    
    create table test_ordinal_func
    (
        cookie_id   string comment 'cookie id',
        create_time string comment '创建时间',
        pv          int comment '访问次数'
    ) row format delimited fields terminated by ',';
    
    load data local inpath '/export/data/test_window.txt' into table test_ordinal_func;
    
    select *
    from test_ordinal_func;
    
    -- 使用开窗函数来实现分组并组内排序
    -- 需求1:按照cookie_id进行分组,并且在每一组中按照pv进行降序排序
    -- partition by:分组
    select *,
           -- 1 2 3 4 5
           row_number() over (partition by cookie_id order by pv desc ) row_number,
           -- 1 2 3 3 5
           rank() over (partition by cookie_id order by pv desc )       rank,
           -- 1 2 3 3 4
           dense_rank() over (partition by cookie_id order by pv desc ) dense_rank
    from test_ordinal_func;
    
    -- 需求:按照cookie_id进行分组,并且在每一组中按照pv进行降序排序,选出每一组排名前三的信息(TopN问题)
    
    -- 以下写法报错,因为SQL的执行顺序:from where select,where的时候,别名还没有出现
    /*
    select *,
           dense_rank() over (partition by cookie_id order by pv desc ) as rk3 -- 1 2 3 3 4
    from test_ordinal_func
    where rk3 <= 3;
    */
    
    -- 通过子查询解决
    select *
    from (
             select *,
                    dense_rank() over (partition by cookie_id order by pv desc ) as rk3 -- 1 2 3 3 4
             from test_ordinal_func
         ) as dense_rank
    where dense_rank.rk3 <= 3;
    
    
    -- 需求:查询新冠疫情数据表中每一个州确诊人数最多的县TopN
    select *
    from (
             select *,
                    -- partition by:后面可跟多个字段(同州同县分一组)
                    -- order by:后面可跟多个字段(主要条件相同比较次要条件。按确诊病例降序,确证病例相同则按死亡病例降序)
                    dense_rank() over (partition by state, county order by cases desc, deaths desc) as rk
             from covid2
         ) as dense_rank
    where dense_rank.rk = 1;
    
    -- 分组和排序的后边都可以指定多个字段
    /*
    select *,
           dense_rank() over (partition by cookie_id,pv order by cookie_id,create_time desc ) as rk3 -- 1 2 3 3 4
    from test_ordinal_func;
    */
    
    
    ---------开窗函数(窗口函数)-指定区间进行统计开窗函数(聚合开窗函数)------------------------------
    
    -- 3、使用聚合开窗函数进行统计
    -- 累加区间:从第1行累加到当前行(默认)
    select cookie_id,
           create_time,
           pv,
           sum(pv) over (partition by cookie_id order by create_time) as pv1
    from test_ordinal_func;
    
    -- 等价上边的写法
    select cookie_id,
           create_time,
           pv,
           sum(pv) over (partition by cookie_id order by create_time
               rows between unbounded preceding and current row) as pv2
    from test_ordinal_func;
    
    -- 累加区间:从前3行累加到当前行
    select cookie_id,
           create_time,
           pv,
           sum(pv) over (partition by cookie_id order by create_time
               rows between 3 preceding and current row)
    from test_ordinal_func;
    
    -- 累加区间:从前3行累加到下1行
    select cookie_id,
           create_time,
           pv,
           sum(pv) over (partition by cookie_id order by create_time
               rows between 3 preceding and 1 following) as pv5
    from test_ordinal_func;
    
    -- 累加区间:从当前行加到组的最后
    select cookie_id,
           create_time,
           pv,
           sum(pv) over (partition by cookie_id order by create_time
               rows between current row and unbounded following) as pv6
    from test_ordinal_func;
    
    -- 以上的sum可以替换为avg,max,min
    select cookie_id,
           create_time,
           pv,
           max(pv) over (partition by cookie_id order by create_time) as pv1
    from test_ordinal_func;
    
    select cookie_id,
           create_time,
           pv,
           min(pv) over (partition by cookie_id order by create_time) as pv1
    from test_ordinal_func;
    
    select cookie_id,
           create_time,
           pv,
           avg(pv) over (partition by cookie_id order by create_time) as pv1
    from test_ordinal_func;
    
    
    -- 前后函数:lag lead
    -- lag
    -- 将上1行数据放在当前行
    select cookie_id,
           create_time,
           pv,
           lag(create_time, 1) over (partition by cookie_id order by create_time)
    from test_ordinal_func;
    
    -- 将上2行数据放在当前行
    select cookie_id,
           create_time,
           pv,
           lag(create_time, 2) over (partition by cookie_id order by create_time)
    from test_ordinal_func;
    
    -- lead
    -- 将下1行数据放在当前行
    select cookie_id,
           create_time,
           pv,
           lead(create_time, 1) over (partition by cookie_id order by create_time)
    from test_ordinal_func;
    
    -- 将下2数据放在当前行
    select cookie_id,
           create_time,
           pv,
           lead(create_time, 2) over (partition by cookie_id order by create_time)
    from test_ordinal_func;
    

    相关文章

      网友评论

          本文标题:2022-03-07 开窗函数

          本文链接:https://www.haomeiwen.com/subject/ciynrrtx.html