美文网首页
数据仓库之拉链表(hive实现)

数据仓库之拉链表(hive实现)

作者: 幽蓝鑫晨 | 来源:发表于2018-04-24 16:07 被阅读0次

    预备知识:

    了解hive窗口函数:LAG 和 LEAD

    数据准备:

    cookie1,2015-04-10 10:00:02,url2

    cookie1,2015-04-10 10:00:00,url1

    cookie1,2015-04-10 10:03:04,1url3

    cookie1,2015-04-10 10:50:05,url6

    cookie1,2015-04-10 11:00:00,url7

    cookie1,2015-04-10 10:10:00,url4

    cookie1,2015-04-10 10:50:01,url5

    cookie2,2015-04-10 10:00:02,url22

    cookie2,2015-04-10 10:00:00,url11

    cookie2,2015-04-10 10:03:04,1url33

    cookie2,2015-04-10 10:50:05,url66

    cookie2,2015-04-10 11:00:00,url77

    cookie2,2015-04-10 10:10:00,url44

    cookie2,2015-04-10 10:50:01,url55

    CREATE EXTERNAL TABLE lxw1234 (

    cookieid string,

    createtime string,  --页面访问时间

    url STRING      --被访问页面

    ) ROW FORMAT DELIMITED

    FIELDS TERMINATED BY ','

    stored as textfile location '/tmp/lxw11/';

    hive> select * from lxw1234;

    OK

    cookie1 2015-04-10 10:00:02    url2

    cookie1 2015-04-10 10:00:00    url1

    cookie1 2015-04-10 10:03:04    1url3

    cookie1 2015-04-10 10:50:05    url6

    cookie1 2015-04-10 11:00:00    url7

    cookie1 2015-04-10 10:10:00    url4

    cookie1 2015-04-10 10:50:01    url5

    cookie2 2015-04-10 10:00:02    url22

    cookie2 2015-04-10 10:00:00    url11

    cookie2 2015-04-10 10:03:04    1url33

    cookie2 2015-04-10 10:50:05    url66

    cookie2 2015-04-10 11:00:00    url77

    cookie2 2015-04-10 10:10:00    url44

    cookie2 2015-04-10 10:50:01    url55

    LAG

    LAG(col,n,DEFAULT) 用于统计窗口内往上第n行值

    第一个参数为列名,第二个参数为往上第n行(可选,默认为1),第三个参数为默认值(当往上第n行为NULL时候,取默认值,如不指定,则为NULL)

    SELECT cookieid,

    createtime,

    url,

    ROW_NUMBER() OVER(PARTITION BY cookieid ORDER BY createtime) AS rn,

    LAG(createtime,1,'1970-01-01 00:00:00') OVER(PARTITION BY cookieid ORDER BY createtime) AS last_1_time,

    LAG(createtime,2) OVER(PARTITION BY cookieid ORDER BY createtime) AS last_2_time

    FROM lxw1234;

    cookieid createtime            url    rn      last_1_time            last_2_time

    -------------------------------------------------------------------------------------------

    cookie1 2015-04-10 10:00:00    url1    1      1970-01-01 00:00:00    NULL

    cookie1 2015-04-10 10:00:02    url2    2      2015-04-10 10:00:00    NULL

    cookie1 2015-04-10 10:03:04    1url3  3      2015-04-10 10:00:02    2015-04-10 10:00:00

    cookie1 2015-04-10 10:10:00    url4    4      2015-04-10 10:03:04    2015-04-10 10:00:02

    cookie1 2015-04-10 10:50:01    url5    5      2015-04-10 10:10:00    2015-04-10 10:03:04

    cookie1 2015-04-10 10:50:05    url6    6      2015-04-10 10:50:01    2015-04-10 10:10:00

    cookie1 2015-04-10 11:00:00    url7    7      2015-04-10 10:50:05    2015-04-10 10:50:01

    cookie2 2015-04-10 10:00:00    url11  1      1970-01-01 00:00:00    NULL

    cookie2 2015-04-10 10:00:02    url22  2      2015-04-10 10:00:00    NULL

    cookie2 2015-04-10 10:03:04    1url33  3      2015-04-10 10:00:02    2015-04-10 10:00:00

    cookie2 2015-04-10 10:10:00    url44  4      2015-04-10 10:03:04    2015-04-10 10:00:02

    cookie2 2015-04-10 10:50:01    url55  5      2015-04-10 10:10:00    2015-04-10 10:03:04

    cookie2 2015-04-10 10:50:05    url66  6      2015-04-10 10:50:01    2015-04-10 10:10:00

    cookie2 2015-04-10 11:00:00    url77  7      2015-04-10 10:50:05    2015-04-10 10:50:01

    last_1_time: 指定了往上第1行的值,default为'1970-01-01 00:00:00' 

                cookie1第一行,往上1行为NULL,因此取默认值 1970-01-01 00:00:00

                cookie1第三行,往上1行值为第二行值,2015-04-10 10:00:02

                cookie1第六行,往上1行值为第五行值,2015-04-10 10:50:01

    last_2_time: 指定了往上第2行的值,为指定默认值

    cookie1第一行,往上2行为NULL

    cookie1第二行,往上2行为NULL

    cookie1第四行,往上2行为第二行值,2015-04-10 10:00:02

    cookie1第七行,往上2行为第五行值,2015-04-10 10:50:01

    LEAD

    与LAG相反

    LEAD(col,n,DEFAULT) 用于统计窗口内往下第n行值

    第一个参数为列名,第二个参数为往下第n行(可选,默认为1),第三个参数为默认值(当往下第n行为NULL时候,取默认值,如不指定,则为NULL)

    SELECT cookieid,

    createtime,

    url,

    ROW_NUMBER() OVER(PARTITION BY cookieid ORDER BY createtime) AS rn,

    LEAD(createtime,1,'1970-01-01 00:00:00') OVER(PARTITION BY cookieid ORDER BY createtime) AS next_1_time,

    LEAD(createtime,2) OVER(PARTITION BY cookieid ORDER BY createtime) AS next_2_time

    FROM lxw1234;

    cookieid createtime            url    rn      next_1_time            next_2_time

    -------------------------------------------------------------------------------------------

    cookie1 2015-04-10 10:00:00    url1    1      2015-04-10 10:00:02    2015-04-10 10:03:04

    cookie1 2015-04-10 10:00:02    url2    2      2015-04-10 10:03:04    2015-04-10 10:10:00

    cookie1 2015-04-10 10:03:04    1url3  3      2015-04-10 10:10:00    2015-04-10 10:50:01

    cookie1 2015-04-10 10:10:00    url4    4      2015-04-10 10:50:01    2015-04-10 10:50:05

    cookie1 2015-04-10 10:50:01    url5    5      2015-04-10 10:50:05    2015-04-10 11:00:00

    cookie1 2015-04-10 10:50:05    url6    6      2015-04-10 11:00:00    NULL

    cookie1 2015-04-10 11:00:00    url7    7      1970-01-01 00:00:00    NULL

    cookie2 2015-04-10 10:00:00    url11  1      2015-04-10 10:00:02    2015-04-10 10:03:04

    cookie2 2015-04-10 10:00:02    url22  2      2015-04-10 10:03:04    2015-04-10 10:10:00

    cookie2 2015-04-10 10:03:04    1url33  3      2015-04-10 10:10:00    2015-04-10 10:50:01

    cookie2 2015-04-10 10:10:00    url44  4      2015-04-10 10:50:01    2015-04-10 10:50:05

    cookie2 2015-04-10 10:50:01    url55  5      2015-04-10 10:50:05    2015-04-10 11:00:00

    cookie2 2015-04-10 10:50:05    url66  6      2015-04-10 11:00:00    NULL

    cookie2 2015-04-10 11:00:00    url77  7      1970-01-01 00:00:00    NULL

    --逻辑与LAG一样,只不过LAG是往上,LEAD是往下。

    进入主题,hive实现拉链表示例:

    -----目标表

    create external table existing_time_series_table

    (

    primary_key string, ---业务主键(字段个数不限)

    effective_dt bigint, ----开始日期

    expired_dt bigint,  ----失效日期

    event_value string----业务员度量值

      )

      stored as parquet

      location 

       'hdfs://nameservice/it/ods/erp/existing_time_series_table';

     ----增量结果集   

    create external table new_time_series_table

    (

    primary_key string,---业务主键(字段个数不限)

    effective_dt bigint, ----开始日期

    event_value string----业务员度量值

      )

      stored as parquet

      location 

       'hdfs://nameservice/it/ods/erp/new_time_series_table';

    -----逻辑实现:lead函数实现了取下个日期作为本记录的失效日期

    insert overwrite table existing_time_series_table

     select primary_key,

             effective_dt,

     case

               when lead(effective_dt, 1, null)

                over(partition by primary_key order by effective_dt) is null then

                null

               else

                lead(effective_dt, 1, null)

                over(partition by primary_key order by effective_dt)

             end as expired_dt,

             event_value

        from (select primary_key, effective_dt, event_value

                from existing_time_series_table

               where expired_dt is null

              union all

              select primary_key, effective_dt, event_value

                from new_time_series_table) sub_1

      union all   

    -----历史已经失效的记录

      select primary_key, effective_dt, expired_dt, event_value

        from existing_time_series_table

       where expired_dt is not null

    相关文章

      网友评论

          本文标题:数据仓库之拉链表(hive实现)

          本文链接:https://www.haomeiwen.com/subject/abyjlftx.html