美文网首页数仓实战2(美团架构)
数仓实战07:数仓搭建-ADS层

数仓实战07:数仓搭建-ADS层

作者: 勇于自信 | 来源:发表于2020-08-13 02:13 被阅读0次
    1.设备主题

    1.1 活跃设备数(日、周、月)
    需求定义:
    日活:当日活跃的设备数
    周活:当周活跃的设备数
    月活:当月活跃的设备数
    1)建表语句

    hive (gmall) > DROP TABLE
    IF EXISTS ads_uv_count;
    
    CREATE external TABLE ads_uv_count (
        `dt` string COMMENT '统计日期',
        `day_count` BIGINT COMMENT '当日用户数量',
        `wk_count` BIGINT COMMENT '当周用户数量',
        `mn_count` BIGINT COMMENT '当月用户数量',
        `is_weekend` string COMMENT 'Y,N 是否是周末,用于得到本周最终结果',
        `is_monthend` string COMMENT 'Y,N 是否是月末,用于得到本月最终结果'
    ) COMMENT '活跃设备数' ROW format delimited FIELDS TERMINATED BY '\t' 
    location '/warehouse/gmall/ads/ads_uv_count/';
    
    

    2)导入数据

    hive (gmall) > INSERT INTO TABLE ads_uv_count SELECT
        '2020-03-10' dt,
        daycount.ct,
        wkcount.ct,
        mncount.ct,
    
    IF (
        date_add(
            next_day ('2020-03-10', 'MO') ,- 1
        ) = '2020-03-10',
        'Y',
        'N'
    ),
    
    IF (
        last_day('2020-03-10') = '2020-03-10',
        'Y',
        'N'
    )
    FROM
        (
            SELECT
                '2020-03-10' dt,
                count(*) ct
            FROM
                dwt_uv_topic
            WHERE
                login_date_last = '2020-03-10'
        ) daycount
    JOIN (
        SELECT
            '2020-03-10' dt,
            count(*) ct
        FROM
            dwt_uv_topic
        WHERE
            login_date_last >= date_add(
                next_day ('2020-03-10', 'MO') ,- 7
            )
        AND login_date_last <= date_add(
            next_day ('2020-03-10', 'MO') ,- 1
        )
    ) wkcount ON daycount.dt = wkcount.dt
    JOIN (
        SELECT
            '2020-03-10' dt,
            count(*) ct
        FROM
            dwt_uv_topic
        WHERE
            date_format(login_date_last, 'yyyy-MM') = date_format('2020-03-10', 'yyyy-MM')
    ) mncount ON daycount.dt = mncount.dt;
    

    3)查询导入结果
    hive (gmall)> select * from ads_uv_count;

    1.2 每日新增设备

    1)建表语句
    hive (gmall) > DROP TABLE
    IF EXISTS ads_new_mid_count;
    
    CREATE external TABLE ads_new_mid_count (
        `create_date` string COMMENT '创建时间',
        `new_mid_count` BIGINT COMMENT '新增设备数量'
    ) COMMENT '每日新增设备信息数量' 
    ROW format delimited FIELDS TERMINATED BY '\t' 
    location '/warehouse/gmall/ads/ads_new_mid_count/';
    

    2)导入数据

    hive (gmall) > INSERT INTO TABLE ads_new_mid_count SELECT
        login_date_first,
        count(*)
    FROM
        dwt_uv_topic
    WHERE
        login_date_first = '2020-03-10'
    GROUP BY
        login_date_first;
    

    3)查询导入数据
    hive (gmall)> select * from ads_new_mid_count;

    1.3 沉默用户数
    需求定义:
    沉默用户:只在安装当天启动过,且启动时间是在 7 天前
    1)建表语句

    hive (gmall) > DROP TABLE
    IF EXISTS ads_silent_count;
    
    CREATE external TABLE ads_silent_count (
        `dt` string COMMENT '统计日期',
        `silent_count` BIGINT COMMENT '沉默设备数'
    ) ROW format delimited FIELDS TERMINATED BY '\t' 
    location '/warehouse/gmall/ads/ads_silent_count';
    

    2)导入 2020-03-20 数据

    hive (gmall) > INSERT INTO TABLE ads_silent_count SELECT
        '2020-03-15',
        count(*)
    FROM
        dwt_uv_topic
    WHERE
        login_date_first = login_date_last
    AND login_date_last <= date_add('2020-03-15' ,- 7);
    

    3)查询导入数据
    hive (gmall)> select * from ads_silent_count;

    1.4 本周回流用户数
    需求定义:
    本周回流用户:上周未活跃,本周活跃的设备,且不是本周新增设备
    1)建表语句

    hive (gmall) > DROP TABLE
    IF EXISTS ads_back_count;
    
    CREATE external TABLE ads_back_count (
        `dt` string COMMENT '统计日期',
        `wk_dt` string COMMENT '统计日期所在周',
        `wastage_count` BIGINT COMMENT '回流设备数'
    ) ROW format delimited FIELDS TERMINATED BY '\t' 
    location '/warehouse/gmall/ads/ads_back_count';
    

    2)导入数据:

    hive (gmall) > INSERT INTO TABLE ads_back_count SELECT
        '2020-03-15',
        count(*)
    FROM
        (
            SELECT
                mid_id
            FROM
                dwt_uv_topic
            WHERE
                login_date_last >= date_add(
                    next_day ('2020-03-15', 'MO') ,- 7
                )
            AND login_date_last <= date_add(
                next_day ('2020-03-15', 'MO') ,- 1
            )
            AND login_date_first < date_add(
                next_day ('2020-03-15', 'MO') ,- 7
            )
        ) current_wk
    LEFT JOIN (
        SELECT
            mid_id
        FROM
            dws_uv_detail_daycount
        WHERE
            dt >= date_add(
                next_day ('2020-03-15', 'MO') ,- 7 * 2
            )
        AND dt <= date_add(
            next_day ('2020-03-15', 'MO') ,- 7 - 1
        )
        GROUP BY
            mid_id
    ) last_wk ON current_wk.mid_id = last_wk.mid_id
    WHERE
        last_wk.mid_id IS NULL;
    

    3)查询结果
    hive (gmall)> select * from ads_back_count;

    1.5 流失用户数
    需求定义:
    流失用户:最近 7 天未活跃的设备
    1)建表语句

    hive (gmall) > DROP TABLE
    IF EXISTS ads_wastage_count;
    
    CREATE external TABLE ads_wastage_count (
        `dt` string COMMENT '统计日期',
        `wastage_count` BIGINT COMMENT '流失设备数'
    ) ROW format delimited FIELDS TERMINATED BY '\t' 
    location '/warehouse/gmall/ads/ads_wastage_count';
    

    2)导入 2020-03-20 数据

    hive (gmall) > INSERT INTO TABLE ads_wastage_count SELECT
        '2020-03-20',
        count(*)
    FROM
        (
            SELECT
                mid_id
            FROM
                dwt_uv_topic
            WHERE
                login_date_last <= date_add('2020-03-20' ,- 7)
            GROUP BY
                mid_id
        ) t1;
    

    3)查询结果
    hive (gmall)> select * from ads_wastage_count;

    1.6 留存率
    需求定义


    1)建表语句
    hive (gmall) > DROP TABLE
    IF EXISTS ads_user_retention_day_rate;
    
    CREATE external TABLE ads_user_retention_day_rate (
        `stat_date` string COMMENT '统计日期',
        `create_date` string COMMENT '设备新增日期',
        `retention_day` INT COMMENT '截止当前日期留存天数',
        `retention_count` BIGINT COMMENT '留存数量',
        `new_mid_count` BIGINT COMMENT '设备新增数量',
        `retention_ratio` DECIMAL (10, 2) COMMENT '留存率'
    ) COMMENT '每日用户留存情况' 
    ROW format delimited FIELDS TERMINATED BY '\t' 
    location '/warehouse/gmall/ads/ads_user_retention_day_rate/';
    

    2)导入数据

    hive (gmall) > INSERT INTO TABLE ads_user_retention_day_rate SELECT
        '2020-03-10' ,-- 统计日期 date_add('2020-03-10' ,- 1) ,-- 新增日期 1 ,-- 留存天数 sum(
    
            IF (
                login_date_first = date_add('2020-03-10' ,- 1)
                AND login_date_last = '2020-03-10',
                1,
                0
            )
        ) ,-- 2020 - 03 - 09 的 1 日留存数 sum(
    
            IF (
                login_date_first = date_add('2020-03-10' ,- 1),
                1,
                0
            )
        ) ,-- 2020 - 03 - 09 新增 sum(
    
            IF (
                login_date_first = date_add('2020-03-10' ,- 1)
                AND login_date_last = '2020-03-10',
                1,
                0
            )
        ) / sum(
    
            IF (
                login_date_first = date_add('2020-03-10' ,- 1),
                1,
                0
            )
        ) * 100
    FROM
        dwt_uv_topicunion ALL SELECT
            '2020-03-10' ,-- 统计日期 date_add('2020-03-10' ,- 2) ,-- 新增日期 2 ,-- 留存天数 sum(
    
                IF (
                    login_date_first = date_add('2020-03-10' ,- 2)
                    AND login_date_last = '2020-03-10',
                    1,
                    0
                )
            ) ,-- 2020 - 03 - 08 的 2 日留存数 sum(
    
                IF (
                    login_date_first = date_add('2020-03-10' ,- 2),
                    1,
                    0
                )
            ) ,-- 2020 - 03 - 08 新增 sum(
    
                IF (
                    login_date_first = date_add('2020-03-10' ,- 2)
                    AND login_date_last = '2020-03-10',
                    1,
                    0
                )
            ) / sum(
    
                IF (
                    login_date_first = date_add('2020-03-10' ,- 2),
                    1,
                    0
                )
            ) * 100
        FROM
            dwt_uv_topic
        UNION ALL
            SELECT
                '2020-03-10' ,-- 统计日期 date_add('2020-03-10' ,- 3) ,-- 新增日期 3 ,-- 留存天数 sum(
    
                    IF (
                        login_date_first = date_add('2020-03-10' ,- 3)
                        AND login_date_last = '2020-03-10',
                        1,
                        0
                    )
                ) ,-- 2020 - 03 - 07 的 3 日留存数 sum(
    
                    IF (
                        login_date_first = date_add('2020-03-10' ,- 3),
                        1,
                        0
                    )
                ) ,-- 2020 - 03 - 07 新增 sum(
    
                    IF (
                        login_date_first = date_add('2020-03-10' ,- 3)
                        AND login_date_last = '2020-03-10',
                        1,
                        0
                    )
                ) / sum(
    
                    IF (
                        login_date_first = date_add('2020-03-10' ,- 3),
                        1,
                        0
                    )
                ) * 100
            FROM
                dwt_uv_topic;
    

    3)查询导入数据
    hive (gmall)>select * from ads_user_retention_day_rate;

    1.7 最近连续三周活跃用户数
    1)建表语句

    hive (gmall) > DROP TABLE
    IF EXISTS ads_continuity_wk_count;
    
    CREATE external TABLE ads_continuity_wk_count (
        `dt` string COMMENT '统计日期,一般用结束周周日日期,如果每天计算一次,可用当天日 期',
        `wk_dt` string COMMENT '持续时间',
        `continuity_count` BIGINT COMMENT '活跃次数'
    ) ROW format delimited FIELDS TERMINATED BY '\t' location '/warehouse/gmall/ads/ads_continuity_wk_count';
    

    2)导入 2020-03-20 所在周的数据

    hive (gmall) > INSERT INTO TABLE ads_continuity_wk_count SELECT
        '2020-03-15',
        concat(
            date_add(
                next_day ('2020-03-15', 'MO') ,- 7 * 3
            ),
            '_',
            date_add(
                next_day (' 2020-03-15', 'MO') ,- 1
            )
        ),
        count(*)
    FROM
        (
            SELECT
                mid_id
            FROM
                (
                    selectmid_id
                    FROM
                        dws_uv_detail_daycount
                    WHERE
                        dt >= date_add(
                            next_day ('2020-03-10', 'monday') ,- 7
                        )
                    AND dt <= date_add(
                        next_day ('2020-03-10', 'monday') ,- 1
                    )
                    GROUP BY
                        mid_id
                    UNION ALL
                        SELECT
                            mid_id
                        FROM
                            dws_uv_detail_daycount
                        WHERE
                            dt >= date_add(
                                next_day ('2020-03-10', 'monday') ,- 7 * 2
                            )
                        AND dt <= date_add(
                            next_day ('2020-03-10', 'monday') ,- 7 - 1
                        )
                        GROUP BY
                            mid_id
                        UNION ALL
                            SELECT
                                mid_id
                            FROM
                                dws_uv_detail_daycount
                            WHERE
                                dt >= date_add(
                                    next_day ('2020-03-10', 'monday') ,- 7 * 3
                                )
                            AND dt <= date_add(
                                next_day ('2020-03-10', 'monday') ,- 7 * 2 - 1
                            )
                            GROUP BY
                                mid_id
                ) t1
            GROUP BY
                mid_id
            HAVING
                count(*) = 3
        ) t2
    

    3)查询
    hive (gmall)> select * from ads_continuity_wk_count;

    1.8 最近七天内连续三天活跃用户数
    1)建表语句

    hive (gmall) > DROP TABLE
    IF EXISTS ads_continuity_uv_count;
    
    CREATE external TABLE ads_continuity_uv_count (
        `dt` string COMMENT '统计日期',
        `wk_dt` string COMMENT '最近 7 天日期',
        `continuity_count` BIGINT
    ) COMMENT '连续活跃设备数' 
    ROW format delimited FIELDS TERMINATED BY '\t' 
    location '/warehouse/gmall/ads/ads_continuity_uv_count';
    

    2)写出导入数据的 SQL 语句

    2)写出导入数据的 SQL 语句 hive (gmall) > INSERT INTO TABLE ads_continuity_uv_count SELECT
        '2020-03-12',
        concat(
            date_add('2020-03-12' ,- 6),
            '_',
            '2020-03-12'
        ),
        count(*)
    FROM
        (
            SELECT
                mid_id
            FROM
                (
                    SELECT
                        mid_id
                    FROM
                        (
                            SELECT
                                mid_id,
                                date_sub(dt, rank) date_dif
                            FROM
                                (
                                    SELECT
                                        mid_id,
                                        dt,
                                        rank () over (PARTITION BY mid_id ORDER BY dt) rank
                                    FROM
                                        dws_uv_detail_daycount
                                    WHERE
                                        dt >= date_add('2020-03-12' ,- 6)
                                    AND dt <= '2020-03-12'
                                ) t1
                        ) t2
                    GROUP BY
                        mid_id,
                        date_dif
                    HAVING
                        count(*) >= 3
                ) t3
            GROUP BY
                mid_id
        ) t4;
    

    3)查询
    hive (gmall)> select * from ads_continuity_uv_count;

    2.会员主题

    2.1 会员主题信息
    1)建表语句

    hive (gmall) > DROP TABLE
    IF EXISTS ads_user_topic;
    
    CREATE external TABLE ads_user_topic (
        `dt` string COMMENT '统计日期',
        `day_users` string COMMENT '活跃会员数',
        `day_new_users` string COMMENT '新增会员数',
        `day_new_payment_users` string COMMENT '新增消费会员数',
        `payment_users` string COMMENT '总付费会员数',
        `users` string COMMENT '总会员数',
        `day_users2users` DECIMAL (10, 2) COMMENT '会员活跃率',
        `payment_users2users` DECIMAL (10, 2) COMMENT '会员付费率',
        `day_new_users2users` DECIMAL (10, 2) COMMENT '会员新鲜度'
    ) COMMENT '会员主题信息表' 
    ROW format delimited FIELDS TERMINATED BY '\t' 
    location '/warehouse/gmall/ads/ads_user_topic';
    

    2)导入数据

    hive (gmall) > INSERT INTO TABLE ads_user_topic SELECT
        '2020-03-10',
        sum(
    
            IF (
                login_date_last = '2020-03-10',
                1,
                0
            )
        ),
        sum(
    
            IF (
                login_date_first = '2020-03-10',
                1,
                0
            )
        ),
        sum(
    
            IF (
                payment_date_first = '2020-03-10',
                1,
                0
            )
        ),
        sum(IF(payment_count > 0, 1, 0)),
        count(*),
        sum(
    
            IF (
                login_date_last = '2020-03-10',
                1,
                0
            )
        ) / count(*),
        sum(IF(payment_count > 0, 1, 0)) / count(*),
        sum(
    
            IF (
                login_date_first = '2020-03-10',
                1,
                0
            )
        ) / sum(
    
            IF (
                login_date_last = '2020-03-10',
                1,
                0
            )
        )
    FROM
        dwt_user_topic
    

    3)查询导入结果
    hive (gmall)> select * from ads_user_topic;
    4)在/home/atguigu/bin 目录下编写脚本 ads_user_topic.sh
    [atguigu@hadoop102 bin]$ vim ads_user_topic.sh
    添加如下内容:

    #!/bin/bash 
    APP=gmall 
    hive=/opt/module/hive/bin/hive 
    # 如果是输入的日期按照取输入日期;如果没输入日期取当前时间的前一天 
    if [ -n "$1" ] ;
      then 
    do
      _date=$1 
    else
    do
    _date=`date -d "-1 day" +%F` 
    fisql="
    WITH tmp_day_users AS (
        SELECT
            '$do_date' dt,
            count(*) day_users from$ { APP }.dwt_user_topic wherelogin_date_last = '$do_date'
    ),
     tmp_day_new_users AS (
        SELECT
            '$do_date' dt,
            count(*) day_new_users from$ { APP }.dwt_user_topic wherelogin_date_last = '$do_date'
        AND login_date_first = '$do_date'
    ),
     tmp_day_new_payment_users AS (
        SELECT
            '$do_date' dt,
            count(*) day_new_payment_users from$ { APP }.dwt_user_topic wherepayment_date_first = '$do_date'
    ),
     tmp_payment_users AS (
        SELECT
            '$do_date' dt,
            count(*) payment_users from$ { APP }.dwt_user_topic wherepayment_date_first IS NOT NULL
    ),
     tmp_users AS (
        SELECT
            '$do_date' dt,
            count(*) users from$ { APP }.dwt_user_topic tmp_users
    ) INSERT INTO TABLE $ { APP }.ads_user_topic SELECT
        '$do_date' dt,
        day_users,
        day_new_users,
        day_new_payment_users,
        payment_users,
        users,
        day_users / users,
        payment_users / users,
        day_new_users / users fromtmp_day_users
    JOIN tmp_day_new_users ON tmp_day_users.dt = tmp_day_new_users.dt
    JOIN tmp_day_new_payment_users ON tmp_day_users.dt = tmp_day_new_payment_users.dt
    JOIN tmp_payment_users ON tmp_day_users.dt = tmp_payment_users.dt
    JOIN tmp_users ontmp_day_users.dt = tmp_users.dt;
    "
    $hive -e "$sql"
    

    5)增加脚本执行权限

    [atguigu@hadoop102 bin]$ chmod 777 ads_user_topic.sh 
    

    6)执行脚本导入数据

    [atguigu@hadoop102 bin]$ ads_user_topic.sh 2020-03-11 
    

    7)查看导入数据
    hive (gmall)> select * from ads_user_topic;
    2.2 漏斗分析
    统计“浏览->购物车->下单->支付”的转化率
    思路:统计各个行为的人数,然后计算比值。
    1)建表语句

    hive (gmall) > DROP TABLE
    IF EXISTS ads_user_action_convert_day;
    
    CREATE external TABLE ads_user_action_convert_day (
        `dt` string COMMENT '统计日期',
        `total_visitor_m_count` BIGINT COMMENT '总访问人数',
        `cart_u_count` BIGINT COMMENT '加入购物车的人数',
        `visitor2cart_convert_ratio` DECIMAL (10, 2) COMMENT '访问到加入购物车转化率',
        `order_u_count` BIGINT COMMENT '下单人数',
        `cart2order_convert_ratio` DECIMAL (10, 2) COMMENT '加入购物车到下单转化率',
        `payment_u_count` BIGINT COMMENT '支付人数',
        `order2payment_convert_ratio` DECIMAL (10, 2) COMMENT '下单到支付的转化率'
    ) COMMENT '用户行为漏斗分析' 
    ROW format delimited FIELDS TERMINATED BY '\t' 
    location '/warehouse/gmall/ads/ads_user_action_convert_day/';
    

    2)数据装载

    hive (gmall) > INSERT INTO TABLE ads_user_action_convert_day SELECT
        '2020-03-10',
        uv.day_count,
        ua.cart_count,
        cast(
            ua.cart_count / uv.day_count AS DECIMAL (10, 2)
        ) visitor2cart_convert_ratio,
        ua.order_count,
        cast(
            ua.order_count / ua.cart_count AS DECIMAL (10, 2)
        ) visitor2order_convert_ratio,
        ua.payment_count,
        cast(
            ua.payment_count / ua.order_count AS DECIMAL (10, 2)
        ) order2payment_convert_ratio
    FROM
        (
            SELECT
                dt,
                sum(IF(cart_count > 0, 1, 0)) cart_count,
                sum(IF(order_count > 0, 1, 0)) order_count,
                sum(IF(payment_count > 0, 1, 0)) payment_count
            FROM
                dws_user_action_daycount
            WHERE
                dt = '2020-03-10'
            GROUP BY
                dt
        ) ua
    JOIN ads_uv_count uv ON uv.dt = ua.dt;
    

    3)查询导入结果
    hive (gmall)> select * from ads_user_action_convert_day;

    3.商品主题

    3.1 商品个数信息
    1)建表语句

    hive (gmall) > DROP TABLE
    IF EXISTS ads_product_info;
    
    CREATE external TABLE ads_product_info (
        `dt` string COMMENT '统计日期',
        `sku_num` string COMMENT 'sku 个数',
        `spu_num` string COMMENT 'spu 个数'
    ) COMMENT '商品个数信息' 
    ROW format delimited FIELDS TERMINATED BY '\t' 
    location '/warehouse/gmall/ads/ads_product_info';
    

    2)导入数据

    hive (gmall) > INSERT INTO TABLE ads_product_infoselect '2020-03-10' dt,
     sku_num,
     spu_num
    FROM
        (
            SELECT
                '2020-03-10' dt,
                count(*) sku_num fromdwt_sku_topic
        ) tmp_sku_num
    JOIN (
        SELECT
            '2020-03-10' dt,
            count(*) spu_num
        FROM
            (
                SELECT
                    spu_id fromdwt_sku_topic
                GROUP BY
                    spu_id
            ) tmp_spu_id
    ) tmp_spu_num ON tmp_sku_num.dt = tmp_spu_num.dt;
    

    3)查询导入结果
    hive (gmall)> select * from ads_product_info;

    3.2 商品销量排名
    1)建表语句

    hive (gmall) > DROP TABLE
    IF EXISTS ads_product_sale_topN;
    
    CREATE external TABLE ads_product_sale_topN (
        `dt` string COMMENT '统计日期',
        `sku_id` string COMMENT '商品 ID',
        `payment_amount` BIGINT COMMENT '销量'
    ) COMMENT '商品个数信息' 
    ROW format delimited FIELDS TERMINATED BY '\t' 
    location '/warehouse/gmall/ads/ads_product_sale_topN';
    

    2)导入数据

    hive (gmall) > INSERT INTO TABLE ads_product_sale_topN SELECT
        '2020-03-10' dt,
        sku_id,
        payment_amount fromdws_sku_action_daycount wheredt = '2020-03-10'
    ORDER BY
        payment_amount DESC
    LIMIT 10;
    

    3)查询导入结果
    hive (gmall)> select * from ads_product_sale_topN;

    3.3 商品收藏排名
    1)建表语句

    hive (gmall) > DROP TABLE
    IF EXISTS ads_product_favor_topN;
    
    CREATE external TABLE ads_product_favor_topN (
        `dt` string COMMENT '统计日期',
        `sku_id` string COMMENT '商品 ID',
        `favor_count` BIGINT COMMENT '收藏量'
    ) COMMENT '商品收藏 TopN' 
    ROW format delimited FIELDS TERMINATED BY '\t' 
    location '/warehouse/gmall/ads/ads_product_favor_topN';
    

    2)导入数据

    hive (gmall) > INSERT INTO TABLE ads_product_favor_topN SELECT
        '2020-03-10' dt,
        sku_id,
        favor_count from
    dws_sku_action_daycount wheredt = '2020-03-10'
    ORDER BY
        favor_count DESC
    LIMIT 10;
    

    3)查询导入结果
    hive (gmall)> select * from ads_product_favor_topN;

    3.4 商品加入购物车排名
    1)建表语句

    DROP TABLE
    IF EXISTS ads_product_cart_topN;
    
    CREATE external TABLE ads_product_cart_topN (
        `dt` string COMMENT '统计日期',
        `sku_id` string COMMENT '商品 ID',
        `cart_num` BIGINT COMMENT '加入购物车数量'
    ) COMMENT '商品加入购物车 TopN' 
    ROW format delimited FIELDS TERMINATED BY '\t' 
    location '/warehouse/gmall/ads/ads_product_cart_topN';
    

    2)导入数据

    hive (gmall) > INSERT INTO TABLE ads_product_cart_topN SELECT
        '2020-03-10' dt,
        sku_id,
        cart_num
    FROM
        dws_sku_action_daycount wheredt = '2020-03-10'
    ORDER BY
        cart_num DESC
    LIMIT 10;
    

    3)查询导入结果
    hive (gmall)> select * from ads_product_cart_topN;

    3.5 商品退款率排名(最近 30 天)
    1)建表语句

    hive (gmall) > DROP TABLE
    IF EXISTS ads_product_refund_topN;
    
    CREATE external TABLE ads_product_refund_topN (
        `dt` string COMMENT '统计日期',
        `sku_id` string COMMENT '商品 ID',
        `refund_ratio` DECIMAL (10, 2) COMMENT '退款率'
    ) COMMENT '商品退款率 TopN' 
    ROW format delimited FIELDS TERMINATED BY '\t' 
    location '/warehouse/gmall/ads/ads_product_refund_topN';
    

    2)导入数据

    hive (gmall) > INSERT INTO TABLE ads_product_refund_topN SELECT
        '2020-03-10',
        sku_id,
        refund_last_30d_count / payment_last_30d_count * 100 refund_ratio
    FROM
        dwt_sku_topic
    ORDER BY
        refund_ratio DESC
    LIMIT 10;
    

    3)查询导入结果
    hive (gmall)> select * from ads_product_refund_topN;

    3.6 商品差评率
    1)建表语句

    hive (gmall) > DROP TABLE
    IF EXISTS ads_appraise_bad_topN;
    
    CREATE external TABLE ads_appraise_bad_topN (
        `dt` string COMMENT '统计日期',
        `sku_id` string COMMENT '商品 ID',
        `appraise_bad_ratio` DECIMAL (10, 2) COMMENT '差评率'
    ) COMMENT '商品差评率 TopN' 
    ROW format delimited FIELDS TERMINATED BY '\t' 
    location '/warehouse/gmall/ads/ads_appraise_bad_topN';
    

    2)导入数据

    hive (gmall) > INSERT INTO TABLE ads_appraise_bad_topN SELECT
        '2020-03-10' dt,
        sku_id,
        appraise_bad_count / (
            appraise_good_count + appraise_mid_count + appraise_bad_coun t + appraise_default_count
        ) appraise_bad_ratio fromdws_sku_action_daycount
    WHERE
        dt = '2020-03-10'
    ORDER BY
        appraise_bad_ratio DESC
    LIMIT 10;
    

    3)查询导入结果
    hive (gmall)> select * from ads_appraise_bad_topN;

    4.营销主题(用户+商品+购买行为)

    4.1 下单数目统计
    需求分析:统计每日下单数,下单金额及下单用户数。

    hive (gmall) > DROP TABLE
    IF EXISTS ads_order_daycount;
    
    CREATE external TABLE ads_order_daycount (
        dt string COMMENT '统计日期',
        order_count BIGINT COMMENT '单日下单笔数',
        order_amount BIGINT COMMENT '单日下单金额',
        order_users BIGINT COMMENT '单日下单用户数'
    ) COMMENT '每日订单总计表' 
    ROW format delimited FIELDS TERMINATED BY '\t' 
    location '/warehouse/gmall/ads/ads_order_daycount';
    

    2)导入数据

    hive (gmall) > INSERT INTO TABLE ads_order_daycount SELECT
        '2020-03-10',
        sum(order_count),
        sum(order_amount),
        sum(IF(order_count > 0, 1, 0))
    FROM
        dws_user_action_daycount
    WHERE
        dt = '2020-03-10';
    

    3)查询导入结果
    hive (gmall)> select * from ads_order_daycount;

    4.2 支付信息统计
    每日支付金额、支付人数、支付商品数、支付笔数以及下单到支付的平均时长(取自 DWD)
    1)建表语句

    hive (gmall) > DROP TABLE
    IF EXISTS ads_payment_daycount;
    
    CREATE external TABLE ads_payment_daycount (
        dt string COMMENT '统计日期',
        order_count BIGINT COMMENT '单日支付笔数',
        order_amount BIGINT COMMENT '单日支付金额',
        payment_user_count BIGINT COMMENT '单日支付人数',
        payment_sku_count BIGINT COMMENT '单日支付商品数',
        payment_avg_time DOUBLE COMMENT '下单到支付的平均时长,取分钟数'
    ) COMMENT '每日订单总计表' 
    ROW format delimited FIELDS TERMINATED BY '\t' 
    location '/warehouse/gmall/ads/ads_payment_daycount';
    

    2)导入数据

    hive (gmall) > INSERT INTO TABLE ads_payment_daycount SELECT
        tmp_payment.dt,
        tmp_payment.payment_count,
        tmp_payment.payment_amount,
        tmp_payment.payment_user_count,
        tmp_skucount.payment_sku_count,
        tmp_time.payment_avg_time
    FROM
        (
            SELECT
                '2020-03-15' dt,
                sum(payment_count) payment_count,
                sum(payment_amount) payment_amount,
                sum(IF(payment_count > 0, 1, 0)) payment_user_count
            FROM
                dws_user_action_daycount
            WHERE
                dt = '2020-03-15'
        ) tmp_payment
    JOIN (
        SELECT
            '2020-03-15' dt,
            sum(IF(payment_count > 0, 1, 0)) payment_sku_count
        FROM
            dws_sku_action_daycount
        WHERE
            dt = '2020-03-15'
    ) tmp_skucount ON tmp_payment.dt = tmp_skucount.dt
    JOIN (
        SELECT
            '2020-03-15' dt,
            sum(
                unix_timestamp(payment_time) - unix_timestamp(create_time)
            ) / count(*) / 60 payment_avg_time
        FROM
            dwd_fact_order_info
        WHERE
            dt = '2020-03-15'
        AND payment_time IS NOT NULL
    ) tmp_time ON tmp_payment.dt = tmp_time.dt
    

    3)查询导入结果
    hive (gmall)> select * from ads_payment_daycount;

    4.3 复购率

    1)建表语句

    hive (gmall) > DROP TABLE ads_sale_tm_category1_stat_mn;
    
    CREATE external TABLE ads_sale_tm_category1_stat_mn (
        tm_id string COMMENT '品牌 id',
        category1_id string COMMENT '1 级品类 id ',
        category1_name string COMMENT '1 级品类名称 ',
        buycount BIGINT COMMENT '购买人数',
        buy_twice_last BIGINT COMMENT '两次以上购买人数',
        buy_twice_last_ratio DECIMAL (10, 2) COMMENT '单次复购率',
        buy_3times_last BIGINT COMMENT '三次以上购买人数',
        buy_3times_last_ratio DECIMAL (10, 2) COMMENT '多次复购率',
        stat_mn string COMMENT '统计月份',
        stat_date string COMMENT '统计日期'
    ) COMMENT '复购率统计' 
    ROW format delimited FIELDS TERMINATED BY '\t' 
    location '/warehouse/gmall/ads/ads_sale_tm_category1_stat_mn/';
    

    2)导入数据

    hive (gmall) > INSERT INTO TABLE ads_sale_tm_category1_stat_mn selectmn.sku_tm_id,
     mn.sku_category1_id,
     mn.sku_category1_name,
     sum(IF(mn.order_count >= 1, 1, 0)) buycount,
     sum(IF(mn.order_count >= 2, 1, 0)) buyTwiceLast,
     sum(IF(mn.order_count >= 2, 1, 0)) / sum(IF(mn.order_count >= 1, 1, 0)) buyTwiceLastRatio,
     sum(IF(mn.order_count >= 3, 1, 0)) buy3timeLast,
     sum(IF(mn.order_count >= 3, 1, 0)) / sum(IF(mn.order_count >= 1, 1, 0)) buy3timeLastRatio,
     date_format('2019-02-10', 'yyyy-MM') stat_mn,
     '2019-02-10' stat_date
    FROM
        (
            SELECT
                user_id,
                sd.sku_tm_id,
                sd.sku_category1_id,
                sd.sku_category1_name,
                sum(order_count) order_count
            FROM
                dws_sale_detail_daycount sd
            WHERE
                date_format(dt, 'yyyy-MM') = date_format('2019-02-10', 'yyyy-MM')
            GROUP BY
                user_id,
                sd.sku_tm_id,
                sd.sku_category1_id,
                sd.sku_category1_name
        ) mn
    GROUP BY
        mn.sku_tm_id,
        mn.sku_category1_id,
        mn.sku_category1_name;
    

    3)查询导入结果
    hive (gmall)> select * from ads_sale_tm_category1_stat_mn;

    5.ADS 层导入脚本

    1)在/home/atguigu/bin 目录下创建脚本 dwt_to_ads.sh

    [atguigu@hadoop102 bin]$ vim dwt_to_ads.sh
    

    在脚本中填写如下内容

    #!/bin/bash 
    hive=/opt/module/hive/bin/hive 
    # 如果是输入的日期按照取输入日期;如果没输入日期取当前时间的前一天 
    if [ -n "$1" ] ;
    then 
    do
    _date=$1 
    else
    do_date=`date -d "-1 day" +%F` 
    fisql="use gmall;
    INSERT INTO TABLE ads_uv_count SELECT
        '$do_date',
        sum(
    
            IF (
                login_date_last = '$do_date',
                1,
                0
            )
        ),
        sum(
    
            IF (
                login_date_last >= date_add(
                    next_day ('$do_date', 'monday') ,- 7
                )
                AND login_date_last <= date_add(
                    next_day ('$do_date', 'monday') ,- 1
                ),
                1,
                0
            )
        ),
        sum(
    
            IF (
                date_format(login_date_last, 'yyyy-MM') = date_format('$do_date', 'yyyy-M M'),
                1,
                0
            )
        ),
    
    IF (
        '$do_date' = date_add(
            next_day ('$do_date', 'monday') ,- 1
        ),
        'Y',
        'N'
    ),
    
    IF (
        '$do_date' = last_day('$do_date'),
        'Y',
        'N'
    )
    FROM
        dwt_uv_topic;
    
    INSERT INTO TABLE ads_new_mid_count SELECT
        '$do_date',
        count(*)
    FROM
        dwt_uv_topic
    WHERE
        login_date_first = '$do_date';
    
    INSERT INTO TABLE ads_silent_count SELECT
        '$do_date',
        count(*)
    FROM
        dwt_uv_topic
    WHERE
        login_date_first = login_date_last
    AND login_date_last <= date_add('$do_date' ,- 7);
    
    INSERT INTO TABLE ads_back_count SELECT
        '$do_date',
        concat(
            date_add(
                next_day ('2020-03-10', 'MO') ,- 7
            ),
            '_',
            date_add(
                next_day ('2020- 03-10', 'MO') ,- 1
            )
        ),
        count(*)
    FROM
        (
            SELECT
                mid_id
            FROM
                dwt_uv_topic
            WHERE
                login_date_last >= date_add(
                    next_day ('$do_date', 'MO') ,- 7
                )
            AND login_date_last <= date_add(
                next_day ('$do_date', 'MO') ,- 1
            )
            AND login_date_first < date_add(
                next_day ('$do_date', 'MO') ,- 7
            )
        ) current_wk
    LEFT JOIN (
        SELECT
            mid_id
        FROM
            dws_uv_detail_daycount
        WHERE
            dt >= date_add(
                next_day ('$do_date', 'MO') ,- 7 * 2
            )
        AND dt <= date_add(
            next_day ('$do_date', 'MO') ,- 7 - 1
        )
        GROUP BY
            mid_id
    ) last_wk ON current_wk.mid_id = last_wk.mid_id
    WHERE
        last_wk.mid_id IS NULL;
    
    INSERT INTO TABLE ads_wastage_count SELECT
        '$do_date',
        count(*)
    FROM
        dwt_uv_topic
    WHERE
        login_date_last <= date_add('$do_date' ,- 7);
    
    INSERT INTO TABLE ads_user_retention_day_rate SELECT
        '$do_date',
        date_add('$do_date' ,- 3),
        3,
        sum(
    
            IF (
                login_date_first = date_add('$do_date' ,- 3)
                AND login_date_last = '$do_date',
                1,
                0
            )
        ),
        sum(
    
            IF (
                login_date_first = date_add('$do_date' ,- 3),
                1,
                0
            )
        ),
        sum(
    
            IF (
                login_date_first = date_add('$do_date' ,- 3)
                AND login_date_last = '$do_date',
                1,
                0
            )
        ) / sum(
    
            IF (
                login_date_first = date_add('$do_date', - 3),
                1,
                0
            )
        ) * 100
    FROM
        dwt_uv_topic
    UNION ALL
        SELECT
            '$do_date',
            date_add('$do_date' ,- 2),
            2,
            sum(
    
                IF (
                    login_date_first = date_add('$do_date' ,- 2)
                    AND login_date_last = '$do_date',
                    1,
                    0
                )
            ),
            sum(
    
                IF (
                    login_date_first = date_add('$do_date' ,- 2),
                    1,
                    0
                )
            ),
            sum(
    
                IF (
                    login_date_first = date_add('$do_date' ,- 2)
                    AND login_date_last = '$do_date',
                    1,
                    0
                )
            ) / sum(
    
                IF (
                    login_date_first = date_add('$do_date', - 2),
                    1,
                    0
                )
            ) * 100
        FROM
            dwt_uv_topic
        UNION ALL
            SELECT
                '$do_date',
                date_add('$do_date' ,- 1),
                1,
                sum(
    
                    IF (
                        login_date_first = date_add('$do_date' ,- 1)
                        AND login_date_last = '$do_date',
                        1,
                        0
                    )
                ),
                sum(
    
                    IF (
                        login_date_first = date_add('$do_date' ,- 1),
                        1,
                        0
                    )
                ),
                sum(
    
                    IF (
                        login_date_first = date_add('$do_date' ,- 1)
                        AND login_date_last = '$do_date',
                        1,
                        0
                    )
                ) / sum(
    
                    IF (
                        login_date_first = date_add('$do_date', - 1),
                        1,
                        0
                    )
                ) * 100
            FROM
                dwt_uv_topic;
    
    INSERT INTO TABLE ads_continuity_wk_count SELECT
        '$do_date',
        concat(
            date_add(
                next_day ('$do_date', 'MO') ,- 7 * 3
            ),
            '_',
            date_add(
                next_day ('$do_d ate', 'MO') ,- 1
            )
        ),
        count(*)
    FROM
        (
            SELECT
                mid_id
            FROM
                (
                    SELECT
                        mid_id
                    FROM
                        dws_uv_detail_daycount
                    WHERE
                        dt >= date_add(
                            next_day ('$do_date', 'monday') ,- 7
                        )
                    AND dt <= date_add(
                        next_day ('$do_date', 'monday') ,- 1
                    )
                    GROUP BY
                        mid_id
                    UNION ALL
                        SELECT
                            mid_id
                        FROM
                            dws_uv_detail_daycount
                        WHERE
                            dt >= date_add(
                                next_day ('$do_date', 'monday') ,- 7 * 2
                            )
                        AND dt <= date_add(
                            next_day ('$do_date', 'monday') ,- 7 - 1
                        )
                        GROUP BY
                            mid_id
                        UNION ALL
                            SELECT
                                mid_id
                            FROM
                                dws_uv_detail_daycount
                            WHERE
                                dt >= date_add(
                                    next_day ('$do_date', 'monday') ,- 7 * 3
                                )
                            AND dt <= date_add(
                                next_day ('$do_date', 'monday') ,- 7 * 2 - 1
                            )
                            GROUP BY
                                mid_id
                ) t1
            GROUP BY
                mid_id
            HAVING
                count(*) = 3
        ) t2;
    
    INSERT INTO TABLE ads_continuity_uv_count SELECT
        '$do_date',
        concat(
            date_add('$do_date' ,- 6),
            '_',
            '$do_date'
        ),
        count(*)
    FROM
        (
            SELECT
                mid_id
            FROM
                (
                    SELECT
                        mid_id
                    FROM
                        (
                            SELECT
                                mid_id,
                                date_sub(dt, rank) date_dif
                            FROM
                                (
                                    SELECT
                                        mid_id,
                                        dt,
                                        rank () over (PARTITION BY mid_id ORDER BY dt) rank
                                    FROM
                                        dws_uv_detail_daycount
                                    WHERE
                                        dt >= date_add('$do_date' ,- 6)
                                    AND dt <= '$do_date'
                                ) t1
                        ) t2
                    GROUP BY
                        mid_id,
                        date_dif
                    HAVING
                        count(*) >= 3
                ) t3
            GROUP BY
                mid_id
        ) t4;
    
    INSERT INTO TABLE ads_user_topic SELECT
        '$do_date',
        sum(
    
            IF (
                login_date_last = '$do_date',
                1,
                0
            )
        ),
        sum(
    
            IF (
                login_date_first = '$do_date',
                1,
                0
            )
        ),
        sum(
    
            IF (
                payment_date_first = '$do_date',
                1,
                0
            )
        ),
        sum(IF(payment_count > 0, 1, 0)),
        count(*),
        sum(
    
            IF (
                login_date_last = '$do_date',
                1,
                0
            )
        ) / count(*),
        sum(IF(payment_count > 0, 1, 0)) / count(*),
        sum(
    
            IF (
                login_date_first = '$do_date',
                1,
                0
            )
        ) / sum(
    
            IF (
                login_date_last = '$do_date',
                1,
                0
            )
        )
    FROM
        dwt_user_topic;
    
    INSERT INTO TABLE ads_user_action_convert_day SELECT
        '$do_date',
        uv.day_count,
        ua.cart_count,
        ua.cart_count / uv.day_count * 100 visitor2cart_convert_ratio,
        ua.order_count,
        ua.order_count / ua.cart_count * 100 visitor2order_convert_ratio,
        ua.payment_count,
        ua.payment_count / ua.order_count * 100 order2payment_convert_ratio
    FROM
        (
            SELECT
                '$do_date' dt,
                sum(IF(cart_count > 0, 1, 0)) cart_count,
                sum(IF(order_count > 0, 1, 0)) order_count,
                sum(IF(payment_count > 0, 1, 0)) payment_count
            FROM
                dws_user_action_daycount
            WHERE
                dt = '$do_date'
        ) ua
    JOIN ads_uv_count uv ON uv.dt = ua.dt;
    
    INSERT INTO TABLE ads_product_info SELECT
        '$do_date' dt,
        sku_num,
        spu_num
    FROM
        (
            SELECT
                '$do_date' dt,
                count(*) sku_num fromdwt_sku_topic
        ) tmp_sku_num
    JOIN (
        SELECT
            '$do_date' dt,
            count(*) spu_num
        FROM
            (
                SELECT
                    spu_id fromdwt_sku_topic
                GROUP BY
                    spu_id
            ) tmp_spu_id
    ) tmp_spu_num ON tmp_sku_num.dt = tmp_spu_num.dt;
    
    INSERT INTO TABLE ads_product_sale_topN SELECT
        '$do_date',
        sku_id,
        payment_amount
    FROM
        dws_sku_action_daycount
    WHERE
        dt = '$do_date'
    ORDER BY
        payment_amount DESC
    LIMIT 10;
    
    INSERT INTO TABLE ads_product_favor_topN SELECT
        '$do_date',
        sku_id,
        favor_count
    FROM
        dws_sku_action_daycount
    WHERE
        dt = '$do_date'
    ORDER BY
        favor_count
    LIMIT 10;
    
    INSERT INTO TABLE ads_product_cart_topN SELECT
        '$do_date' dt,
        sku_id,
        cart_num
    FROM
        dws_sku_action_daycount
    WHERE
        dt = '$do_date'
    ORDER BY
        cart_num
    LIMIT 10;
    
    INSERT INTO TABLE ads_product_refund_topN SELECT
        '$do_date',
        sku_id,
        refund_last_30d_count / payment_last_30d_count * 100 refund_ratio
    FROM
        dwt_sku_topic
    ORDER BY
        refund_ratio DESC
    LIMIT 10;
    
    INSERT INTO TABLE ads_appraise_bad_topN SELECT
        '$do_date' dt,
        sku_id,
        appraise_bad_count / (
            appraise_bad_count + appraise_good_count + appraise_mid_coun t + appraise_default_count
        ) * 100 appraise_bad_ratio
    FROM
        dws_sku_action_daycount
    WHERE
        dt = '$do_date'
    ORDER BY
        appraise_bad_ratio DESC
    LIMIT 10;
    
    INSERT INTO TABLE ads_order_daycount SELECT
        '$do_date',
        sum(order_count),
        sum(order_amount),
        sum(IF(order_count > 0, 1, 0))
    FROM
        dws_user_action_daycount
    WHERE
        dt = '$do_date';
    
    INSERT INTO TABLE ads_payment_daycount SELECT
        tmp_payment.dt,
        tmp_payment.payment_count,
        tmp_payment.payment_amount,
        tmp_payment.payment_user_count,
        tmp_skucount.payment_sku_count,
        tmp_time.payment_avg_time
    FROM
        (
            SELECT
                '$do_date' dt,
                sum(payment_count) payment_count,
                sum(payment_amount) payment_amount,
                sum(IF(payment_count > 0, 1, 0)) payment_user_count
            FROM
                dws_user_action_daycount
            WHERE
                dt = '$do_date'
        ) tmp_payment
    JOIN (
        SELECT
            '$do_date' dt,
            sum(IF(payment_count > 0, 1, 0)) payment_sku_count
        FROM
            dws_sku_action_daycount
        WHERE
            dt = '$do_date'
    ) tmp_skucount ON tmp_payment.dt = tmp_skucount.dt
    JOIN (
        SELECT
            '$do_date' dt,
            sum(
                unix_timestamp(payment_time) - unix_timestamp(create_time)
            ) / count(*) / 60 payment_avg_time
        FROM
            dwd_fact_order_info
        WHERE
            dt = '$do_date'
        AND payment_time IS NOT NULL
    ) tmp_time ON tmp_payment.dt = tmp_time.dt;
    
    INSERT INTO TABLE ads_sale_tm_category1_stat_mn SELECT
        mn.sku_tm_id,
        mn.sku_category1_id,
        mn.sku_category1_name,
        sum(IF(mn.order_count >= 1, 1, 0)) buycount,
        sum(IF(mn.order_count >= 2, 1, 0)) buyTwiceLast,
        sum(IF(mn.order_count >= 2, 1, 0)) / sum(IF(mn.order_count >= 1, 1, 0)) buyTwiceLastRatio,
        sum(IF(mn.order_count >= 3, 1, 0)) buy3timeLast,
        sum(IF(mn.order_count >= 3, 1, 0)) / sum(IF(mn.order_count >= 1, 1, 0)) buy3timeLastRatio,
        date_format('$do_date', 'yyyy-MM') stat_mn,
        '$do_date' stat_date
    FROM
        (
            SELECT
                user_id,
                sd.sku_tm_id,
                sd.sku_category1_id,
                sd.sku_category1_name,
                sum(order_count) order_count
            FROM
                dws_sale_detail_daycount sd
            WHERE
                date_format(dt, 'yyyy-MM') = date_format('$do_date', 'yyyy-MM')
            GROUP BY
                user_id,
                sd.sku_tm_id,
                sd.sku_category1_id,
                sd.sku_category1_name
        ) mn
    GROUP BY
        mn.sku_tm_id,
        mn.sku_category1_id,
        mn.sku_category1_name;
    "
    $hive -e "$sql"
    

    相关文章

      网友评论

        本文标题:数仓实战07:数仓搭建-ADS层

        本文链接:https://www.haomeiwen.com/subject/ejgzrktx.html