用户行为分析 SQL 整理

作者: abbxyz_1223 | 来源:发表于2021-01-07 14:53 被阅读0次

    笔者从事大数据行业快三年时间,在用户行为分析方向也有所沉淀。现在整理一些常用的 SQL ,希望本文对用户行为分析的初学者的 SQL 学习有所帮助。

    行业内用的数据系统各式各样,在 《华为数据之道》一书中描述的 “华为” 的数据中台的五种数据中台联接方式:以业务流(事件)为中心联接、以对象(主体)为中心联接、智能标签、报告数据、算法模型。大部分企业的最基础的数据模型就是 “用户行为” 和 "用户主体"。

    现在假设大家的数据系统里都有两张大宽表,一张记录 “用户行为”,我们称 events 表,一张记录 “用户主体”,我们称 users 表。表的结构大致是:

    Events 表

    user_id event date time
    1 visit 2021-01-01 2021-01-01 01:31:07.474
    2 visit 2021-01-01 2021-01-01 01:32:03.674
    3 payorder 2021-01-01 2021-01-01 01:33:12.444

    Users 表

    id first_id second_id age sex city
    1 android_1 red 13 合肥,淮北
    2 android_1 green 14 北京,上海
    3 ios_1 blue 22 深圳

    日访问量( PV ):

    SELECT  
        COUNT(*) AS "今日总访问次数"
    FROM  
        EVENTS 
    WHERE 
        date=CURRENT_DATE() AND event='visit'
    
    

    日活跃用户数( UV ):

    SELECT
        COUNT(DISTINCT user_id) AS "今日独立用户数"
    FROM  
        EVENTS 
    WHERE 
        date=CURRENT_DATE() AND event='visit'
    

    最近七天日活:

    SELECT  
         date,COUNT(DISTINCT user_id) AS "今日独立用户数"
    FROM  
        EVENTS 
    WHERE 
        event='visit'  AND date  BETWEEN CURRENT_DATE() - INTERVAL '7' DAY AND CURRENT_DATE() 
    GROUP BY date
    

    分时活跃:

    SELECT
        HOUR(time) AS "小时数",
        COUNT(DISTINCT user_id) AS "独立用户数"
    FROM  
        EVENTS 
    WHERE 
        event='visit' AND date=CURRENT_DATE()
    GROUP BY HOUR(time)
    

    查询每天上午 10 点至 11 点的下单用户数

    SELECT 
        COUNT(*) AS "独立用户数"
    FROM 
        EVENTS
    WHERE 
        EXTRACT(HOUR FROM time) IN (10, 11) AND event = 'payorder'
    GROUP BY 1
    

    List 类型的查询,包含 xx 的 List 有哪些:

    SELECT 
        city  AS "城市"
    FROM
        users 
    WHERE
        CONTAINS('合肥', city);
    

    List 里的元素个数:

    SELECT 
        city,
        length(city)-length(replace(city, '\n',''))+1 AS "元素个数"
    FROM
        users  
    WHERE
        city IS NOT NULL
    

    根据生日得到年龄

    SELECT 
        id, 
        YEAR(NOW())-YEAR(TO_TIMESTAMP(birthday)) AS "年龄"
    FROM 
        users
    

    漏斗用户:

    visit(访问)—addtocart(加购)—payorder(支付)(窗口期 48 小时且严格满足事件先后顺序)

    SELECT
      COUNT(DISTINCT s3.user_id) AS "漏斗独立用户数"
    FROM
      events s3
    INNER JOIN (
        SELECT
          s2.user_id as user_id,
          s2.time as time,
          s1.endtime
        FROM
          EVENTS s2//s2 得到第二步用户、加购时间、窗口期时间
        INNER JOIN (
            SELECT user_id,time,HOURS_ADD(time, 48) as endtime //窗口期 2 天(+ 48 小时) FROM EVENTS
            WHERE event = 'visit' AND date between '2021-01-01' and '2021-01-02' //时间范围是 1 号到 3 号
            ) s1 //s1 得到用户、访问时间、访问时间+48 小时
        ON s2.user_id = s1.user_id AND s2.time > s1.time AND s2.time <= endtime
        WHERE
          s2.event = 'addtocart'
      ) a 
    ON
      s3.user_id = a.user_id AND s3.time > a.time AND s3.time <= a.endtime
    WHERE
      s3.event = 'payorder' 
    

    连续 n 天访问的用户(n=1,2,3,4,...):

    
    SELECT 
        COUNT(DISTINCT user_id) AS "连续 n 天访问的用户"
    FROM 
        (SELECT 
             user_id,
             date,
             TO_DATE(DAY_TO_DATE(LEAD(date,n-1,null)OVER(PARTITION BY user_id ORDER BY date ASC))) AS newdate
         FROM 
            (SELECT  user_id, date FROM  EVENTS  WHERE event='visit' GROUP BY user_id,date) a
         ) b
    WHERE DATEDIFF(newdate,date)=n-1
    
    

    PS:注意把 SQL 里的 n 替换为具体的天数

    用户连续 n 天做某事(主要不同用户得 n)--感谢同事提供:

    SELECT 
      user_id,
      continuous_days  AS "连续访问天数"
    FROM
      (-- 原理:得到用户 ymd1-ROW_NUMBER1=ymd2-ROW_NUMBER2 的数量,则为连续天数
       SELECT
         user_id,
         continuous_group,
         COUNT(1) AS continuous_days
       FROM
        (-- 原理:ROW_NUMBER 和日期天一样是步长为 1 的数列
         -- 如果用户是连续日期做这件事,日期减去ROW_NUMBER一定是相等的,以此构建分组
         SELECT
            ymd - ROW_NUMBER() OVER (PARTITION BY user_id ORDER BY ymd ASC) AS continuous_group,
            user_id,
            event
         FROM
            (-- 把用户做这个事件的日期取出来
             SELECT
               DISTINCT
               -- 由于数字格式在跨月时步长不相等,如 20191101-20191031>1,需要把日期换算成连续天数列
               -- FROM_UNIXTIME(UNIX_TIMESTAMP(date),'yyyyMMdd') AS ymd,
               DATEDIFF(date,'1970-01-01') AS ymd,
               user_id,
               event
             FROM
               events
             WHERE
               event='visit'
            ) event_log
       ) event_group
       GROUP BY
         user_id,
         continuous_group
    ) continuous_groups
    WHERE  
      continuous_days>1
    
    

    查询一段时间内的用户下单次数分布情况

    SELECT 
        CASE
            WHEN c < 10 THEN '<10'
            WHEN c < 20 THEN '<20'
            WHEN c < 100 THEN '<100'
            ELSE '>100'
        END,
        COUNT(*)
    FROM (
        SELECT user_id, COUNT(*) AS c FROM events
        WHERE date BETWEEN '2015-09-01' AND '2015-09-20' AND event = 'payorderr'
        GROUP BY 1
    )a 
    GROUP BY 1
    

    做过 addtocart (加购) 且没有 payorder (支付)的用户:

    SELECT 
        COUNT(DISTINCT a.user_id) AS "做过 addtocart (加购) 且没有 payorder (支付)的用户"
    FROM 
        (SELECT user_id FROM EVENTS WHERE event='addtocart') a
    LEFT JOIN 
        (SELECT user_id FROM EVENTS WHERE event='payorder') b
    ON 
        a.user_id=b.user_id 
    WHERE 
        b.user_id IS NULL
    
    

    做过 addtocart (加购) 且没有 payorder (支付)的用户和商品:

    SELECT  
        COUNT(DISTINCT addcart.mapid)  AS "做过 addtocart (加购) 且没有 payorder (支付)的用户和商品"
    FROM 
        (SELECT user_id, CONCAT(CAST(user_id AS STRING), commodityID) AS mapid FROM EVENTS WHERE event = 'addtocart')  addcart
    LEFT JOIN
        (SELECT user_id, CONCAT(CAST(user_id AS STRING), commodityID) AS mapid FROM  EVENTS WHERE  event = 'payorder')  pay 
    ON 
        addcart.mapid = pay.mapid
    WHERE  
        pay.user_id is null
    

    不同支付方式的支付金额总和(微信支付+支付宝支付):

    SELECT
        user_id,
        SUM(value) AS "支付金额总和"
    FROM
        (
        -- 支付宝支付金额
        SELECT
            user_id,
            SUM(events.alipay_amount) AS value
        FROM
            events 
        WHERE
            date BETWEEN '[baseTime]'-INTERVAL 30 DAY AND '[baseTime]'
            AND events.event='alipay_detail'
        GROUP BY
            user_id
    
        UNION ALL
       -- 微信支付金额
        SELECT
            user_id,
            SUM(events.wechat_amount) AS value
        FROM
            events 
        WHERE
            date BETWEEN '[baseTime]'-INTERVAL 30 DAY AND '[baseTime]'
            AND events.event='wechat_pay_detail'
        GROUP BY
            user_id
        ) t
    GROUP BY
        user_id
    

    正则表达式之使用 QQ 邮箱为邮件的用户数

    SELECT
        COUNT(*)  AS "独立用户数"
    FROM 
        users
    WHERE 
        regexp_like(email, '@qq.com$')
    

    正则表达式之匹配 ID 规则

    -- users 表的 first_id 记录用户的设备 id,second_id 来记录用户的登录 id
    -- 匹配 first_id 为 Android_id,Android_id 一般是 16 位字母和数字的组合
    SELECT
        id,
        first_id AS "安卓设备 ID"
    FROM 
        users
    WHERE 
        REGEXP_LIKE(first_id, '^([0-9a-z]{1,16})$')
    
    -- 匹配 first_id 为 IDFA/IDFV,一般是 32 位字母和数字的组合
    SELECT
        id,
        first_id AS "🍎 设备 ID"
    FROM 
        users
    WHERE 
        REGEXP_LIKE(first_id, '^([0-9A-Z]{8})(([/\s-][0-9A-Z]{4}){3})([/\s-][0-9A-Z]{12})$')
    
    -- 匹配 first_id 为小程序的 open_id
    SELECT
        id,
        first_id AS "小程序设备 ID"
    FROM 
        users
    WHERE 
        REGEXP_LIKE(first_id, '^o[0-9a-zA-Z_-]{27}$')
    
    --这里不再一一列举,只列举几个常用的
    ^\d{n}$   (验证 n 位数字,n 输入具体的值)
    ^[0-9a-zA-Z]{n,m}$ (n ~m 个数字、字母组成的字符串)
    ^[a-zA-Z]{n,m}$(n ~m 个字母组成的字符串)
    ^([A-Za-z0-9_\-\.])+\@[a-zA-Z0-9_\-]+([a-zA-Z0-9_\-\.])+$(验证是否是邮箱)
    
    

    间隔计算,计算两个事件的间隔时间(超过 10 分钟则不计算)

    SELECT
        user_id,
        SUM(
          CASE WHEN
           end_time - begin_time < 600
          THEN
           end_time - begin_time
          ELSE
           0
          END
        ) AS "间隔时长(秒)"
    FROM (
        SELECT
            user_id,
            EXTRACT(EPOCH FROM time) AS end_time,
            LAG(EXTRACT(EPOCH FROM time), 1, NULL) OVER (PARTITION BY user_id ORDER BY time ASC) AS begin_time 
        FROM events ) a
    GROUP BY 1
    

    计算支付行为间隔天数

    SELECT 
        user_id,
        datex,
        DATEDIFF(datex, LAG(datex,1,NULL) OVER(PARTITION BY user_id ORDER BY datex ASC)) AS diff
    FROM 
        (SELECT user_id,trunc(time, 'DD') AS datex FROM events WHERE event = 'payorder' GROUP BY user_id,datex) a
    

    超级微笑曲线

    SELECT 
        visit_days AS "访问天数", 
        COUNT(user_id) AS "独立用户数"
    FROM
       (SELECT user_id, COUNT(DISTINCT date) AS visit_days FROM events WHERE date BETWEEN CURRENT_DATE() - INTERVAL '30' DAY AND CURRENT_DATE() - INTERVAL '1' DAY GROUP BY 1) a
    GROUP BY 1
    

    用户首个购买和第二次购买的日期间隔

    SELECT 
        user_id,
        DATEDIFF(first_time_value(time,next_time),MIN(time)) AS "首次购买和第二次购买的时间差"
    FROM  
        (SELECT user_id,time,LEAD(time, 1, NULL) OVER (PARTITION BY user_id ORDER BY time asc) AS next_time FROM events WHERE  event='payorder' ) a
    GROUP BY 
        user_id
    

    PS:first_time_value 是自定义的函数,使用 first_time_value(time, 其他属性) 聚合函数来获取第一次发生某行为时的相关属性,建议把这个函数内置。如果没有的话,需要参考:

    SELECT 
        user_id,
        time,
        next_time,
        DATEDIFF(next_time,time) 
    FROM
        (SELECT user_id,time,next_time,ROW_NUMBER() OVER (PARTITION BY user_id ORDER BY time ASC)     AS rank FROM  (SELECT
        user_id,time,distinct_id,
        LEAD(time, 1, NULL) OVER (PARTITION BY user_id ORDER BY time asc) AS next_time
        FROM events WHERE  event='payOrder') a
        WHERE  next_time IS NOT NULL)b
    WHERE rank=1
    

    寻找流失用户:最近两次访问时间间隔在 30 天以上

    SELECT
        COUNT(user_id) AS "独立用户数"
    FROM (
    SELECT user_id,(UNIX_TIMESTAMP(MAX(t2)) - UNIX_TIMESTAMP(MAX(t1))) / 86400 AS d1
    FROM (
    SELECT user_id,LAG(time,1) OVER(PARTITION BY user_id ORDER BY time ASC) AS t1,time AS t2 FROM events
    WHERE  event='visit' ) t
    GROUP  BY user_id ) r
    

    过去 7 天浏览偏好的商品类型(前 3)

    /* 假设当前基准时间为 2019-06-19 */
    /* 集合类型标签*/ 
    
    SELECT id, MAX(distinct_id) AS distinct_id,
        GROUP_CONCAT(product_type, '\n') AS value
    FROM (
        SELECT id, distinct_id, product_type, 
            RANK() OVER (PARTITION BY id ORDER BY cnt DESC) AS rank_num
        FROM (
            SELECT user_id AS id, product_type,
                MAX(distinct_id) AS distinct_id, COUNT(*) AS cnt
            FROM events
            WHERE date BETWEEN '[baseTime]' - INTERVAL '7' DAY AND '[baseTime]' - INTERVAL '1'DAY
                AND event = 'payorder'
            GROUP BY 1, 2
        ) a
    ) b
    WHERE rank_num <= 3
    GROUP BY 1
    
    /* 其中 group_concat(product_type, '\n') 表示用户前三的商品类型。 */
    /* 返回值是 list 类型,需要创建为集合类型的标签 */
    

    过去 7 天中用户最近一次访问距今时间(天)

    /* 假设当前基准时间为 2019-06-19 */
    /* 数值类型标签 */
    
    SELECT id, distinct_id, DATEDIFF(now(), time) AS value
    FROM (
        SELECT user_id AS id, MAX(distinct_id) AS distinct_id, MAX(time) AS time
        FROM events
        WHERE date BETWEEN '[baseTime]' - INTERVAL '7' DAY AND '[baseTime]' - INTERVAL '1' DAY
            AND event = 'View'
        GROUP BY 1
    ) a
    
    /* 其中 View 为用户访事件,datediff(now(), time) as value 表示事件发生的距今天数 */
    

    过去 7 天浏览最多的商品类型

    /* 字符串类型标签 */
    
    SELECT id, distinct_id, product_type AS value
    FROM (
        SELECT id, distinct_id, product_type, 
            ROW_NUMBER() OVER (PARTITION BY id ORDER BY cnt DESC) AS row_num
        FROM (
            SELECT user_id AS id, product_type, MAX(distinct_id) AS distinct_id, COUNT(*) AS cnt
            FROM events
            WHERE date BETWEEN '[baseTime]' - INTERVAL '7' DAY AND '[baseTime]' - INTERVAL '1' DAY
                AND event = 'ProductDetails'
            GROUP BY 1, 2
        ) a
    ) b
    WHERE row_num <= 1
    

    暂时就这么多,我会不定期补充的~

    相关文章

      网友评论

        本文标题:用户行为分析 SQL 整理

        本文链接:https://www.haomeiwen.com/subject/ufmnoktx.html