#原始数据
##要求使用SQL统计出每个用户的累积访问次数,如下表所示:
## 建表语句
1. 创建动态分区表
create table test01_visit(userId string, visitData string, visitCount int) partitioned by(x string);
2. 设置动态分区属性
set hive.exec.dynamic.partition=true;
set hive.exec.dynamic.partition.mode=nonstrict;
3. 创建临时表并加载数据
create table test_visit(userId string, visitData string, visitCount int) row format delimited fields terminated by '\t' location '/data';
+--------------------+-----------------------+------------------------+--+
| test_visit.userid | test_visit.visitdata | test_visit.visitcount |
+--------------------+-----------------------+------------------------+--+
| u01 | 2017/1/21 | 5 |
| u02 | 2017/1/23 | 6 |
| u03 | 2017/1/22 | 8 |
| u04 | 2017/1/20 | 3 |
| u01 | 2017/1/23 | 6 |
| u01 | 2017/2/21 | 8 |
| U02 | 2017/1/23 | 6 |
| U01 | 2017/2/22 | 4 |
+--------------------+-----------------------+------------------------+--+
4. 将临时表中的数据插入到动态分区表中
insert overwrite table test01_visit partition(x) select userId, visitData, visitCount, visitData from test_visit;
+----------------------+-------------------------+--------------------------+-----------------+--+
| test01_visit.userid | test01_visit.visitdata | test01_visit.visitcount | test01_visit.x |
+----------------------+-------------------------+--------------------------+-----------------+--+
| u04 | 2017/1/20 | 3 | 2017/1/20 |
| u01 | 2017/1/21 | 5 | 2017/1/21 |
| u03 | 2017/1/22 | 8 | 2017/1/22 |
| u02 | 2017/1/23 | 6 | 2017/1/23 |
| u01 | 2017/1/23 | 6 | 2017/1/23 |
| U02 | 2017/1/23 | 6 | 2017/1/23 |
| u01 | 2017/2/21 | 8 | 2017/2/21 |
| U01 | 2017/2/22 | 4 | 2017/2/22 |
+----------------------+-------------------------+--------------------------+-----------------+--+
5. 将时间字段格式替换
select date_format(regexp_replace(visitData,'/','-'),'yyyy-MM')from test01_visit;
+----------+--+
| _c0 |
+----------+--+
| 2017-01 |
| 2017-01 |
| 2017-01 |
| 2017-01 |
| 2017-01 |
| 2017-01 |
| 2017-02 |
| 2017-02 |
+----------+--+
select from_unixtime(unix_timestamp(visitdata,'yyyy/MM/dd'),'yyyy-MM') from test01_visit;
+----------+--+
| _c0 |
+----------+--+
| 2017-01 |
| 2017-01 |
| 2017-01 |
| 2017-01 |
| 2017-01 |
| 2017-01 |
| 2017-02 |
| 2017-02 |
+----------+--+
6.字符串格式化
select lower(userId) from test01_visit;
+------+--+
| _c0 |
+------+--+
| u04 |
| u01 |
| u03 |
| u02 |
| u01 |
| u02 |
| u01 |
| u01 |
+------+--+
7.统计出每个用户的每个月的访问次数
select lower(userId) as uid, date_format(regexp_replace(visitdata,'/','-'),'yyyy-MM') as date_month, sum(visitcount) as visitcount from test01_visit group by lower(userId), date_format(regexp_replace(visitdata,'/','-'),'yyyy-MM');
+------+-------------+-------------+--+
| uid | date_month | visitcount |
+------+-------------+-------------+--+
| u01 | 2017-01 | 11 |
| u01 | 2017-02 | 12 |
| u02 | 2017-01 | 12 |
| u03 | 2017-01 | 8 |
| u04 | 2017-01 | 3 |
+------+-------------+-------------+--+
8.统计出每个用户的累积访问次数
select *, sum(t1.visitcount) over(partition by uid order by date_month rows between unbounded preceding and current row) from (select lower(userId) as uid, date_format(regexp_replace(visitdata,'/','-'),'yyyy-MM') as date_month, sum(visitcount) as visitcount from test01_visit group by lower(userId), date_format(regexp_replace(visitdata,'/','-'),'yyyy-MM')) t1;
+---------+----------------+----------------+---------------+--+
| t1.uid | t1.date_month | t1.visitcount | sum_window_0 |
+---------+----------------+----------------+---------------+--+
| u01 | 2017-01 | 11 | 11 |
| u01 | 2017-02 | 12 | 23 |
| u02 | 2017-01 | 12 | 12 |
| u03 | 2017-01 | 8 | 8 |
| u04 | 2017-01 | 3 | 3 |
+---------+----------------+----------------+---------------+--+
网友评论