美文网首页
MySQL求连续登陆天数-鹅厂面试题

MySQL求连续登陆天数-鹅厂面试题

作者: 只是甲 | 来源:发表于2020-12-28 11:28 被阅读0次

    一.需求

    求每个用户的最长连续登录天数,两个日期的间隔小于或等于 3 均视为连续登录。比如 01-01 号登录,最近的下一次登录是 01-04 号,两个日期的间隔等于 3 天,因此这两个日期之间的天数都算作活跃天数,一共 4 天。

    因为考虑MySQL版本问题,暂时不能使用分析函数。

    二.解决方案

    测试数据:

    create table test_login(user_id int,login_date timestamp);
    
    insert into test_login values (1,'2020-01-01 00:01:00');
    insert into test_login values (1,'2020-01-01 00:02:00');
    insert into test_login values (1,'2020-01-01 00:03:00');
    insert into test_login values (1,'2020-01-02 00:03:00');
    insert into test_login values (1,'2020-01-05 00:03:00');
    insert into test_login values (1,'2020-01-07 00:03:00');
    insert into test_login values (1,'2020-01-11 00:03:00');
    insert into test_login values (1,'2020-01-12 00:03:00');
    insert into test_login values (1,'2020-01-13 00:03:00');
    insert into test_login values (1,'2020-01-14 00:03:00');
    insert into test_login values (1,'2020-01-17 00:03:00');
    insert into test_login values (1,'2020-01-18 00:03:00');
    insert into test_login values (1,'2020-01-19 00:03:00');
    insert into test_login values (1,'2020-01-22 00:03:00');
    
    insert into test_login values (2,'2020-01-01 00:04:00');
    insert into test_login values (2,'2020-01-01 00:05:00');
    insert into test_login values (2,'2020-01-01 00:06:00');
    insert into test_login values (2,'2020-01-02 00:06:00');
    insert into test_login values (2,'2020-01-07 00:06:00');
    insert into test_login values (2,'2020-01-11 00:06:00');
    insert into test_login values (2,'2020-01-12 00:06:00');
    insert into test_login values (2,'2020-01-14 00:06:00');
    insert into test_login values (2,'2020-01-18 00:06:00');
    insert into test_login values (2,'2020-01-19 00:06:00');
    insert into test_login values (2,'2020-01-22 00:06:00');
    

    代码

    SELECT user_id,
           max(diff_days) + 1 max_diff_days
      from 
    (
    SELECT user_id,
           flag2,
           min(login_date) min_login_date,
           max(login_date) max_login_date,
           datediff(max(login_date),min(login_date)) diff_days
      from 
    (
    SELECT tmp3.user_id,
           tmp3.login_date,
           tmp3.flag1,
           sum(tmp4.flag1) flag2
      from 
    (
    SELECT user_id,
           login_date,
           case when up_login_date is null then 1 else case when DATEDIFF(login_date,up_login_date) <= 3 then 0 else 1 end end as flag1
      from 
    (
    SELECT  user_id,
            login_date,
            (select max(date_format(login_date,'%Y-%m-%d')) from test_login t2 where t2.user_id = tmp1.user_id and t2.login_date < tmp1.login_date ) as up_login_date
    from 
    (
    SELECT user_id,date_format(login_date,'%Y-%m-%d') login_date
      from test_login 
     group by user_id,date_format(login_date,'%Y-%m-%d')
    ) tmp1  -- tmp1表 分组去重 一天多次登陆的算一条
    ) tmp2  -- 通过标量子查询实现上一天的登陆日期
    ) tmp3  -- 打标记 如上下间隔<=3 则为1  否则为0
    left join 
    (
    SELECT user_id,
           login_date,
           case when up_login_date is null then 1 else case when DATEDIFF(login_date,up_login_date) <= 3 then 0 else 1 end end as flag1
      from 
    (
    SELECT  user_id,
            login_date,
            (select max(date_format(login_date,'%Y-%m-%d')) from test_login t2 where t2.user_id = tmp1.user_id and t2.login_date < tmp1.login_date ) as up_login_date
    from 
    (
    SELECT user_id,date_format(login_date,'%Y-%m-%d') login_date
      from test_login 
     group by user_id,date_format(login_date,'%Y-%m-%d')
    ) tmp1  -- tmp1表 分组去重 一天多次登陆的算一条
    ) tmp2  -- 通过标量子查询实现上一天的登陆日期
    ) tmp4  -- 功能同tmp3 
    on  tmp3.user_id = tmp4.user_id
    and tmp4.login_date <= tmp3.login_date
    group by tmp3.user_id,tmp3.login_date,tmp3.flag1
    ) tmp5 -- 每一个用户  每一个连续区间设置为不同的flag  标记为flag2
    group by user_id,flag2
    ) tmp6 -- 计算出每个连续区间 最大值和最小值  以及连续间隔天数
    group by user_id
    

    这样看起来是不是觉得非常的复杂,下面我们拆解开

    2.1 同一天多次登陆去重

    考虑同一天会有多次登陆的,此处用group by语句进行去重,同一天只保留一条记录

    代码:

    SELECT user_id,date_format(login_date,'%Y-%m-%d') login_date
      from test_login 
     group by user_id,date_format(login_date,'%Y-%m-%d')
    

    测试记录:

    mysql> SELECT user_id,date_format(login_date,'%Y-%m-%d') login_date
        ->   from test_login 
        ->  group by user_id,date_format(login_date,'%Y-%m-%d');
    +---------+------------+
    | user_id | login_date |
    +---------+------------+
    |       1 | 2020-01-01 |
    |       1 | 2020-01-02 |
    |       1 | 2020-01-05 |
    |       1 | 2020-01-07 |
    |       1 | 2020-01-11 |
    |       1 | 2020-01-12 |
    |       1 | 2020-01-13 |
    |       1 | 2020-01-14 |
    |       1 | 2020-01-17 |
    |       1 | 2020-01-18 |
    |       1 | 2020-01-19 |
    |       1 | 2020-01-22 |
    |       2 | 2020-01-01 |
    |       2 | 2020-01-02 |
    |       2 | 2020-01-07 |
    |       2 | 2020-01-11 |
    |       2 | 2020-01-12 |
    |       2 | 2020-01-14 |
    |       2 | 2020-01-18 |
    |       2 | 2020-01-19 |
    |       2 | 2020-01-22 |
    +---------+------------+
    21 rows in set (0.00 sec)
    
    mysql> 
    

    2.2 求上一次的登陆日期

    我们需要找到上一次登陆日期,与此次的登陆日期对比,才可以判断是否符合小于等于3天的条件
    此处,我们可以使用标量子查询来实现
    需要记录的每个用户的第一条登陆信息上一天为null

    代码:

    SELECT  user_id,
            login_date,
            (select max(date_format(login_date,'%Y-%m-%d')) from test_login t2 where t2.user_id = tmp1.user_id and t2.login_date < tmp1.login_date ) as up_login_date
    from 
    (
    SELECT user_id,date_format(login_date,'%Y-%m-%d') login_date
      from test_login 
     group by user_id,date_format(login_date,'%Y-%m-%d')
    ) tmp1  -- tmp1表 分组去重 一天多次登陆的算一条
    

    测试记录:

    mysql> SELECT  user_id,
        ->         login_date,
        ->         (select max(date_format(login_date,'%Y-%m-%d')) from test_login t2 where t2.user_id = tmp1.user_id and t2.login_date < tmp1.login_date ) as up_login_date
        -> from 
        -> (
        -> SELECT user_id,date_format(login_date,'%Y-%m-%d') login_date
        ->   from test_login 
        ->  group by user_id,date_format(login_date,'%Y-%m-%d')
        -> ) tmp1  -- tmp1表 分组去重 一天多次登陆的算一条
        -> ;
    +---------+------------+---------------+
    | user_id | login_date | up_login_date |
    +---------+------------+---------------+
    |       1 | 2020-01-01 | NULL          |
    |       1 | 2020-01-02 | 2020-01-01    |
    |       1 | 2020-01-05 | 2020-01-02    |
    |       1 | 2020-01-07 | 2020-01-05    |
    |       1 | 2020-01-11 | 2020-01-07    |
    |       1 | 2020-01-12 | 2020-01-11    |
    |       1 | 2020-01-13 | 2020-01-12    |
    |       1 | 2020-01-14 | 2020-01-13    |
    |       1 | 2020-01-17 | 2020-01-14    |
    |       1 | 2020-01-18 | 2020-01-17    |
    |       1 | 2020-01-19 | 2020-01-18    |
    |       1 | 2020-01-22 | 2020-01-19    |
    |       2 | 2020-01-01 | NULL          |
    |       2 | 2020-01-02 | 2020-01-01    |
    |       2 | 2020-01-07 | 2020-01-02    |
    |       2 | 2020-01-11 | 2020-01-07    |
    |       2 | 2020-01-12 | 2020-01-11    |
    |       2 | 2020-01-14 | 2020-01-12    |
    |       2 | 2020-01-18 | 2020-01-14    |
    |       2 | 2020-01-19 | 2020-01-18    |
    |       2 | 2020-01-22 | 2020-01-19    |
    +---------+------------+---------------+
    21 rows in set (0.00 sec)
    

    2.3 打标记判断是否间隔天数<=3

    求出上一个登陆日期,与本次登陆日期比较,如果<=3,则标记为0,否则标记为1
    需要注意的是上一步的空值直接标记为1
    这样通过flag1 我们就可以看到连续登陆的区间了

    代码:

    SELECT user_id,
           login_date,
           case when up_login_date is null then 1 else case when DATEDIFF(login_date,up_login_date) <= 3 then 0 else 1 end end as flag1
      from 
    (
    SELECT  user_id,
            login_date,
            (select max(date_format(login_date,'%Y-%m-%d')) from test_login t2 where t2.user_id = tmp1.user_id and t2.login_date < tmp1.login_date ) as up_login_date
    from 
    (
    SELECT user_id,date_format(login_date,'%Y-%m-%d') login_date
      from test_login 
     group by user_id,date_format(login_date,'%Y-%m-%d')
    ) tmp1  -- tmp1表 分组去重 一天多次登陆的算一条
    ) tmp2  -- 通过标量子查询实现上一天的登陆日期
    

    测试记录:

    mysql> SELECT user_id,
        ->        login_date,
        ->        case when up_login_date is null then 1 else case when DATEDIFF(login_date,up_login_date) <= 3 then 0 else 1 end end as flag1
        ->   from 
        -> (
        -> SELECT  user_id,
        ->         login_date,
        ->         (select max(date_format(login_date,'%Y-%m-%d')) from test_login t2 where t2.user_id = tmp1.user_id and t2.login_date < tmp1.login_date ) as up_login_date
        -> from 
        -> (
        -> SELECT user_id,date_format(login_date,'%Y-%m-%d') login_date
        ->   from test_login 
        ->  group by user_id,date_format(login_date,'%Y-%m-%d')
        -> ) tmp1  -- tmp1表 分组去重 一天多次登陆的算一条
        -> ) tmp2  -- 通过标量子查询实现上一天的登陆日期
        -> ;
    +---------+------------+-------+
    | user_id | login_date | flag1 |
    +---------+------------+-------+
    |       1 | 2020-01-01 |     1 |
    |       1 | 2020-01-02 |     0 |
    |       1 | 2020-01-05 |     0 |
    |       1 | 2020-01-07 |     0 |
    |       1 | 2020-01-11 |     1 |
    |       1 | 2020-01-12 |     0 |
    |       1 | 2020-01-13 |     0 |
    |       1 | 2020-01-14 |     0 |
    |       1 | 2020-01-17 |     0 |
    |       1 | 2020-01-18 |     0 |
    |       1 | 2020-01-19 |     0 |
    |       1 | 2020-01-22 |     0 |
    |       2 | 2020-01-01 |     1 |
    |       2 | 2020-01-02 |     0 |
    |       2 | 2020-01-07 |     1 |
    |       2 | 2020-01-11 |     1 |
    |       2 | 2020-01-12 |     0 |
    |       2 | 2020-01-14 |     0 |
    |       2 | 2020-01-18 |     1 |
    |       2 | 2020-01-19 |     0 |
    |       2 | 2020-01-22 |     0 |
    +---------+------------+-------+
    21 rows in set (0.01 sec)
    

    2.4 打分组的标记

    因为同一个用户存在多个连续登陆的区间,根据上一个步骤的flag1没办法区分开,此时需要区分开同一个用户的不同连续登陆区间。

    因为MySQL版本不支持分析函数,只能通过临时表表连接的方式实现,再考虑去除重复,需要用到分组语句进行去重。

    其实flag2是这个解决方案核心所在,同一个用户第一次连续登陆区间标记为1,第二次则累加为2,以此类推。

    代码:

    SELECT tmp3.user_id,
           tmp3.login_date,
           tmp3.flag1,
           sum(tmp4.flag1) flag2
      from 
    (
    SELECT user_id,
           login_date,
           case when up_login_date is null then 1 else case when DATEDIFF(login_date,up_login_date) <= 3 then 0 else 1 end end as flag1
      from 
    (
    SELECT  user_id,
            login_date,
            (select max(date_format(login_date,'%Y-%m-%d')) from test_login t2 where t2.user_id = tmp1.user_id and t2.login_date < tmp1.login_date ) as up_login_date
    from 
    (
    SELECT user_id,date_format(login_date,'%Y-%m-%d') login_date
      from test_login 
     group by user_id,date_format(login_date,'%Y-%m-%d')
    ) tmp1  -- tmp1表 分组去重 一天多次登陆的算一条
    ) tmp2  -- 通过标量子查询实现上一天的登陆日期
    ) tmp3  -- 打标记 如上下间隔<=3 则为1  否则为0
    left join 
    (
    SELECT user_id,
           login_date,
           case when up_login_date is null then 1 else case when DATEDIFF(login_date,up_login_date) <= 3 then 0 else 1 end end as flag1
      from 
    (
    SELECT  user_id,
            login_date,
            (select max(date_format(login_date,'%Y-%m-%d')) from test_login t2 where t2.user_id = tmp1.user_id and t2.login_date < tmp1.login_date ) as up_login_date
    from 
    (
    SELECT user_id,date_format(login_date,'%Y-%m-%d') login_date
      from test_login 
     group by user_id,date_format(login_date,'%Y-%m-%d')
    ) tmp1  -- tmp1表 分组去重 一天多次登陆的算一条
    ) tmp2  -- 通过标量子查询实现上一天的登陆日期
    ) tmp4  -- 功能同tmp3 
    on  tmp3.user_id = tmp4.user_id
    and tmp4.login_date <= tmp3.login_date
    group by tmp3.user_id,tmp3.login_date,tmp3.flag1
    

    测试记录:

    mysql> SELECT tmp3.user_id,
        ->        tmp3.login_date,
        ->        tmp3.flag1,
        ->        sum(tmp4.flag1) flag2
        ->   from 
        -> (
        -> SELECT user_id,
        ->        login_date,
        ->        case when up_login_date is null then 1 else case when DATEDIFF(login_date,up_login_date) <= 3 then 0 else 1 end end as flag1
        ->   from 
        -> (
        -> SELECT  user_id,
        ->         login_date,
        ->         (select max(date_format(login_date,'%Y-%m-%d')) from test_login t2 where t2.user_id = tmp1.user_id and t2.login_date < tmp1.login_date ) as up_login_date
        -> from 
        -> (
        -> SELECT user_id,date_format(login_date,'%Y-%m-%d') login_date
        ->   from test_login 
        ->  group by user_id,date_format(login_date,'%Y-%m-%d')
        -> ) tmp1  -- tmp1表 分组去重 一天多次登陆的算一条
        -> ) tmp2  -- 通过标量子查询实现上一天的登陆日期
        -> ) tmp3  -- 打标记 如上下间隔<=3 则为1  否则为0
        -> left join 
        -> (
        -> SELECT user_id,
        ->        login_date,
        ->        case when up_login_date is null then 1 else case when DATEDIFF(login_date,up_login_date) <= 3 then 0 else 1 end end as flag1
        ->   from 
        -> (
        -> SELECT  user_id,
        ->         login_date,
        ->         (select max(date_format(login_date,'%Y-%m-%d')) from test_login t2 where t2.user_id = tmp1.user_id and t2.login_date < tmp1.login_date ) as up_login_date
        -> from 
        -> (
        -> SELECT user_id,date_format(login_date,'%Y-%m-%d') login_date
        ->   from test_login 
        ->  group by user_id,date_format(login_date,'%Y-%m-%d')
        -> ) tmp1  -- tmp1表 分组去重 一天多次登陆的算一条
        -> ) tmp2  -- 通过标量子查询实现上一天的登陆日期
        -> ) tmp4  -- 功能同tmp3 
        -> on  tmp3.user_id = tmp4.user_id
        -> and tmp4.login_date <= tmp3.login_date
        -> group by tmp3.user_id,tmp3.login_date,tmp3.flag1;
    +---------+------------+-------+-------+
    | user_id | login_date | flag1 | flag2 |
    +---------+------------+-------+-------+
    |       1 | 2020-01-01 |     1 |     1 |
    |       1 | 2020-01-02 |     0 |     1 |
    |       1 | 2020-01-05 |     0 |     1 |
    |       1 | 2020-01-07 |     0 |     1 |
    |       1 | 2020-01-11 |     1 |     2 |
    |       1 | 2020-01-12 |     0 |     2 |
    |       1 | 2020-01-13 |     0 |     2 |
    |       1 | 2020-01-14 |     0 |     2 |
    |       1 | 2020-01-17 |     0 |     2 |
    |       1 | 2020-01-18 |     0 |     2 |
    |       1 | 2020-01-19 |     0 |     2 |
    |       1 | 2020-01-22 |     0 |     2 |
    |       2 | 2020-01-01 |     1 |     1 |
    |       2 | 2020-01-02 |     0 |     1 |
    |       2 | 2020-01-07 |     1 |     2 |
    |       2 | 2020-01-11 |     1 |     3 |
    |       2 | 2020-01-12 |     0 |     3 |
    |       2 | 2020-01-14 |     0 |     3 |
    |       2 | 2020-01-18 |     1 |     4 |
    |       2 | 2020-01-19 |     0 |     4 |
    |       2 | 2020-01-22 |     0 |     4 |
    +---------+------------+-------+-------+
    21 rows in set (0.01 sec)
    

    2.5 分组求最大的间隔天数

    其实有了上一个步骤的flag2,就可以判断区分开每一个用户的每一个连续登陆区间,直接进行分组就可以求出该连续区间 最大和最小的登陆日期,两个日期差值就是间隔天数。最后根据user_id进行分组,求最大的间隔天数就是最终结果。

    需要审题 01-01到01-04 间隔是3天,但是算4天连续登陆 所以datediff函数的结果要+1

    代码:

    SELECT user_id,
           max(diff_days) +1 max_diff_days
      from 
    (
    SELECT user_id,
           flag2,
           min(login_date) min_login_date,
           max(login_date) max_login_date,
           datediff(max(login_date),min(login_date)) diff_days
      from 
    (
    SELECT tmp3.user_id,
           tmp3.login_date,
           tmp3.flag1,
           sum(tmp4.flag1) flag2
      from 
    (
    SELECT user_id,
           login_date,
           case when up_login_date is null then 1 else case when DATEDIFF(login_date,up_login_date) <= 3 then 0 else 1 end end as flag1
      from 
    (
    SELECT  user_id,
            login_date,
            (select max(date_format(login_date,'%Y-%m-%d')) from test_login t2 where t2.user_id = tmp1.user_id and t2.login_date < tmp1.login_date ) as up_login_date
    from 
    (
    SELECT user_id,date_format(login_date,'%Y-%m-%d') login_date
      from test_login 
     group by user_id,date_format(login_date,'%Y-%m-%d')
    ) tmp1  -- tmp1表 分组去重 一天多次登陆的算一条
    ) tmp2  -- 通过标量子查询实现上一天的登陆日期
    ) tmp3  -- 打标记 如上下间隔<=3 则为1  否则为0
    left join 
    (
    SELECT user_id,
           login_date,
           case when up_login_date is null then 1 else case when DATEDIFF(login_date,up_login_date) <= 3 then 0 else 1 end end as flag1
      from 
    (
    SELECT  user_id,
            login_date,
            (select max(date_format(login_date,'%Y-%m-%d')) from test_login t2 where t2.user_id = tmp1.user_id and t2.login_date < tmp1.login_date ) as up_login_date
    from 
    (
    SELECT user_id,date_format(login_date,'%Y-%m-%d') login_date
      from test_login 
     group by user_id,date_format(login_date,'%Y-%m-%d')
    ) tmp1  -- tmp1表 分组去重 一天多次登陆的算一条
    ) tmp2  -- 通过标量子查询实现上一天的登陆日期
    ) tmp4  -- 功能同tmp3 
    on  tmp3.user_id = tmp4.user_id
    and tmp4.login_date <= tmp3.login_date
    group by tmp3.user_id,tmp3.login_date,tmp3.flag1
    ) tmp5 -- 每一个用户  每一个连续区间设置为不同的flag  标记为flag2
    group by user_id,flag2
    ) tmp6 -- 计算出每个连续区间 最大值和最小值  以及连续间隔天数
    group by user_id
    

    测试记录:

    mysql> SELECT user_id,
        ->        flag2,
        ->        min(login_date) min_login_date,
        ->        max(login_date) max_login_date,
        ->        datediff(max(login_date),min(login_date)) diff_days
        ->   from 
        -> (
        -> SELECT tmp3.user_id,
        ->        tmp3.login_date,
        ->        tmp3.flag1,
        ->        sum(tmp4.flag1) flag2
        ->   from 
        -> (
        -> SELECT user_id,
        ->        login_date,
        ->        case when up_login_date is null then 1 else case when DATEDIFF(login_date,up_login_date) <= 3 then 0 else 1 end end as flag1
        ->   from 
        -> (
        -> SELECT  user_id,
        ->         login_date,
        ->         (select max(date_format(login_date,'%Y-%m-%d')) from test_login t2 where t2.user_id = tmp1.user_id and t2.login_date < tmp1.login_date ) as up_login_date
        -> from 
        -> (
        -> SELECT user_id,date_format(login_date,'%Y-%m-%d') login_date
        ->   from test_login 
        ->  group by user_id,date_format(login_date,'%Y-%m-%d')
        -> ) tmp1  -- tmp1表 分组去重 一天多次登陆的算一条
        -> ) tmp2  -- 通过标量子查询实现上一天的登陆日期
        -> ) tmp3  -- 打标记 如上下间隔<=3 则为1  否则为0
        -> left join 
        -> (
        -> SELECT user_id,
        ->        login_date,
        ->        case when up_login_date is null then 1 else case when DATEDIFF(login_date,up_login_date) <= 3 then 0 else 1 end end as flag1
        ->   from 
        -> (
        -> SELECT  user_id,
        ->         login_date,
        ->         (select max(date_format(login_date,'%Y-%m-%d')) from test_login t2 where t2.user_id = tmp1.user_id and t2.login_date < tmp1.login_date ) as up_login_date
        -> from 
        -> (
        -> SELECT user_id,date_format(login_date,'%Y-%m-%d') login_date
        ->   from test_login 
        ->  group by user_id,date_format(login_date,'%Y-%m-%d')
        -> ) tmp1  -- tmp1表 分组去重 一天多次登陆的算一条
        -> ) tmp2  -- 通过标量子查询实现上一天的登陆日期
        -> ) tmp4  -- 功能同tmp3 
        -> on  tmp3.user_id = tmp4.user_id
        -> and tmp4.login_date <= tmp3.login_date
        -> group by tmp3.user_id,tmp3.login_date,tmp3.flag1
        -> ) tmp5 -- 每一个用户  每一个连续区间设置为不同的flag  标记为flag2
        -> group by user_id,flag2;
    +---------+-------+----------------+----------------+-----------+
    | user_id | flag2 | min_login_date | max_login_date | diff_days |
    +---------+-------+----------------+----------------+-----------+
    |       1 |     1 | 2020-01-01     | 2020-01-07     |         6 |
    |       1 |     2 | 2020-01-11     | 2020-01-22     |        11 |
    |       2 |     1 | 2020-01-01     | 2020-01-02     |         1 |
    |       2 |     2 | 2020-01-07     | 2020-01-07     |         0 |
    |       2 |     3 | 2020-01-11     | 2020-01-14     |         3 |
    |       2 |     4 | 2020-01-18     | 2020-01-22     |         4 |
    +---------+-------+----------------+----------------+-----------+
    6 rows in set (0.01 sec)
    
    mysql>
    
    
    mysql> SELECT user_id,
        ->        max(diff_days) +1 max_diff_days
        ->   from 
        -> (
        -> SELECT user_id,
        ->        flag2,
        ->        min(login_date) min_login_date,
        ->        max(login_date) max_login_date,
        ->        datediff(max(login_date),min(login_date)) diff_days
        ->   from 
        -> (
        -> SELECT tmp3.user_id,
        ->        tmp3.login_date,
        ->        tmp3.flag1,
        ->        sum(tmp4.flag1) flag2
        ->   from 
        -> (
        -> SELECT user_id,
        ->        login_date,
        ->        case when up_login_date is null then 1 else case when DATEDIFF(login_date,up_login_date) <= 3 then 0 else 1 end end as flag1
        ->   from 
        -> (
        -> SELECT  user_id,
        ->         login_date,
        ->         (select max(date_format(login_date,'%Y-%m-%d')) from test_login t2 where t2.user_id = tmp1.user_id and t2.login_date < tmp1.login_date ) as up_login_date
        -> from 
        -> (
        -> SELECT user_id,date_format(login_date,'%Y-%m-%d') login_date
        ->   from test_login 
        ->  group by user_id,date_format(login_date,'%Y-%m-%d')
        -> ) tmp1  -- tmp1表 分组去重 一天多次登陆的算一条
        -> ) tmp2  -- 通过标量子查询实现上一天的登陆日期
        -> ) tmp3  -- 打标记 如上下间隔<=3 则为1  否则为0
        -> left join 
        -> (
        -> SELECT user_id,
        ->        login_date,
        ->        case when up_login_date is null then 1 else case when DATEDIFF(login_date,up_login_date) <= 3 then 0 else 1 end end as flag1
        ->   from 
        -> (
        -> SELECT  user_id,
        ->         login_date,
        ->         (select max(date_format(login_date,'%Y-%m-%d')) from test_login t2 where t2.user_id = tmp1.user_id and t2.login_date < tmp1.login_date ) as up_login_date
        -> from 
        -> (
        -> SELECT user_id,date_format(login_date,'%Y-%m-%d') login_date
        ->   from test_login 
        ->  group by user_id,date_format(login_date,'%Y-%m-%d')
        -> ) tmp1  -- tmp1表 分组去重 一天多次登陆的算一条
        -> ) tmp2  -- 通过标量子查询实现上一天的登陆日期
        -> ) tmp4  -- 功能同tmp3 
        -> on  tmp3.user_id = tmp4.user_id
        -> and tmp4.login_date <= tmp3.login_date
        -> group by tmp3.user_id,tmp3.login_date,tmp3.flag1
        -> ) tmp5 -- 每一个用户  每一个连续区间设置为不同的flag  标记为flag2
        -> group by user_id,flag2
        -> ) tmp6 -- 计算出每个连续区间 最大值和最小值  以及连续间隔天数
        -> group by user_id
        -> ;
    +---------+---------------+
    | user_id | max_diff_days |
    +---------+---------------+
    |       1 |            12 |
    |       2 |             5 |
    +---------+---------------+
    2 rows in set (0.00 sec)
    
    mysql> 
     
    

    三.MySQL 8.0写法

    MySQL 8.0的with语句以及分析窗口函数,可以使上面的解决方案的代码简洁度大大提升

    代码:

    with tmp1 AS
    -- tmp1临时表 一天多次登陆算一次
    (
    SELECT user_id,date_format(login_date,'%Y-%m-%d') login_date
      from test_login 
     group by user_id,date_format(login_date,'%Y-%m-%d')
    ),
    tmp2 as
    -- tmp2临时表  通过lag获取上一次登陆时间  如是第一条给默认值
    (
    select user_id,
           login_date,
           lag(login_date,1,'1900-01-01') over(partition by user_id order by login_date) up_login_date
      from tmp1
    ),
    tmp3 AS
    -- 判断是否符合3天内标准  打标记flag1
    (
    select  user_id,
            login_date,
            case when DATEDIFF(login_date,up_login_date) <= 3 then 0 else 1  end as flag1
      from  tmp2
    ),
    tmp4 AS
    -- 通过分析函数将每个用户 每一个连续登陆期间进行标记 flag2
    (
    select  user_id,
            login_date,
            flag1,
            sum(flag1) over(partition by user_id order by login_date) as flag2
      from tmp3
    ),
    tmp5 AS
    -- 通过user_id flag2进行分组
    (
    SELECT user_id,
           flag2,
           min(login_date) min_login_date,
           max(login_date) max_login_date,
           datediff(max(login_date),min(login_date)) diff_days
      from tmp4
     group by user_id,flag2
    )
    SELECT user_id,
           max(diff_days) + 1 as diff_days
      from tmp5
     group by user_id;
    

    测试记录:

    mysql> select version();
    +-----------+
    | version() |
    +-----------+
    | 8.0.19    |
    +-----------+
    1 row in set (0.00 sec)
    
    mysql> with tmp1 AS
        -> -- tmp1临时表 一天多次登陆算一次
        -> (
        -> SELECT user_id,date_format(login_date,'%Y-%m-%d') login_date
        ->   from test_login
        ->  group by user_id,date_format(login_date,'%Y-%m-%d')
        -> ),
        -> tmp2 as
        -> -- tmp2临时表  通过lag获取上一次登陆时间  如是第一条给默认值
        -> (
        -> select user_id,
        ->        login_date,
        ->        lag(login_date,1,'1900-01-01') over(partition by user_id order by login_date) up_login_d
        ->   from tmp1
        -> ),
        -> tmp3 AS
        -> -- 判断是否符合3天内标准  打标记flag1
        -> (
        -> select  user_id,
        ->         login_date,
        ->         case when DATEDIFF(login_date,up_login_date) <= 3 then 0 else 1  end as flag1
        ->   from  tmp2
        -> ),
        -> tmp4 AS
        -> -- 通过分析函数将每个用户 每一个连续登陆期间进行标记 flag2
        -> (
        -> select  user_id,
        ->         login_date,
        ->         flag1,
        ->         sum(flag1) over(partition by user_id order by login_date) as flag2
        ->   from tmp3
        -> ),
        -> tmp5 AS
        -> -- 通过user_id flag2进行分组
        -> (
        -> SELECT user_id,
        ->        flag2,
        ->        min(login_date) min_login_date,
        ->        max(login_date) max_login_date,
        ->        datediff(max(login_date),min(login_date)) diff_days
        ->   from tmp4
        ->  group by user_id,flag2
        -> )
        -> SELECT user_id,
        ->        max(diff_days) + 1 as diff_days
        ->   from tmp5
        ->  group by user_id;
    +---------+-----------+
    | user_id | diff_days |
    +---------+-----------+
    |       1 |        12 |
    |       2 |         5 |
    +---------+-----------+
    2 rows in set (0.00 sec)
    
    

    相关文章

      网友评论

          本文标题:MySQL求连续登陆天数-鹅厂面试题

          本文链接:https://www.haomeiwen.com/subject/ismznktx.html