1.HiveServer 和 HiveServer2 服务
Hive 内置了 HiveServer 和 HiveServer2 服务,两者都允许客户端使用多种编程语言进行连接,但是 HiveServer 不能处理多个客户端的并发请求,因此产生了 HiveServer2。HiveServer2(HS2)允许远程客户端可以使用各种编程语言向 Hive 提交请求并检索结果,支持多客户端并发访问和身份验证。HS2 是由多个服务组成的单个进程,其包括基于 Thrift 的 Hive 服务(TCP 或 HTTP)和用于 Web UI 的 Jetty Web 服务。
修改 hadoop 集群的 core-site.xml 配置文件,增加如下配置,指定 hadoop 的 root 用户可以代理本机上所有的用户。
<property>
<name>hadoop.proxyuser.root.hosts</name>
<value>*</value>
</property>
<property>
<name>hadoop.proxyuser.root.groups</name>
<value>*</value>
</property>
//复习窗口函数,发现窗口函数忘得差不多的了........
20191020,11111,85
20191020,22222,83
20191020,33333,86
20191021,11111,87
20191021,22222,65
20191021,33333,98
20191022,11111,67
20191022,22222,34
20191022,33333,88
20191023,11111,99
20191023,22222,33
create table demo_2
(logday string,
userid string,
score int)
ROW FORMAT DELIMITED FIELDS TERMINATED BY ',';
#加载数据
load data local inpath '/root/2.txt' into table demo_2;
1、使用 over() 函数进行数据统计, 统计每个用户及表中数据的总数
//犯了一个错误? count函数是分组后求,窗口函数和聚合函数的区别在于聚合和不聚合
select
count() over(partition by userid) as num
from
demo_2;
//注意sum()是聚合函数
select
userid,
sum(1) over() as num
from
demo_2;
2、求用户明细并统计每天的用户总数
//这个题目没什么意义
select
userid,
count() over(partition by logday) as num
from
demo_2;
3.计算从第一天到现在的所有 score 大于80分的用户总数
//分析思路
select
count(userid)
from
demo_2
where score > 80
//group by userid;
//采用博客中的写法 不分组全局排序
select
*,
count() over(order by logday rows between unbounded preceding and current row) as num
from
demo_2
where score > 80;
//范围的总结
rows between current preceding/following and
4.计算每个用户到当前日期分数大于80的天数
select
*,
count() over(partition by userid order by logday rows between unbounded preceding and current row )
from
demo_2
where score > 80
order by userid,logday;
jack,2017-01-01,10
tony,2017-01-02,15
jack,2017-02-03,23
tony,2017-01-04,29
jack,2017-01-05,46
jack,2017-04-06,42
tony,2017-01-07,50
jack,2017-01-08,55
mart,2017-04-08,62
mart,2017-04-09,68
neil,2017-05-10,12
mart,2017-04-11,75
neil,2017-06-12,80
mart,2017-04-13,94
create table demo_3
(
name string,
orderdate string,
cost int
)ROW FORMAT DELIMITED FIELDS TERMINATED BY ',';
#加载数据
load data local inpath "/root/3.txt" into table demo_3;
1.查询在2017年4月份购买过的顾客及总人数
//substr()函数应用
select
*,
count() over(order by orderdate)
from
demo_3
where substr(orderdate,1,7) = '2017-04'
order by orderdate;
2.查询顾客的购买明细及月购买总额
//理解 partition by分组的标准 order by 可以理解为函数操作的窗口标准 窗口大小可以理解为对字段排序的的范围标准
select
*,
sum(cost) over(partition by name order by substr(orderdate,1,7) rows between unbounded preceding and current row ) as result
from
demo_3;
3.查询顾客的购买明细及到目前为止每个顾客购买总金额
select
*,
sum(cost) over(partition by name order by orderdate rows between unbounded preceding and current row) as result
from
demo_3;
4.查询顾客上次的购买时间----lag()over()偏移量分析函数的运用
//函数偏移量分析的总结:注意之间的区别
//没有排序按默认的顺序偏移
select
*,
lag(orderdate,1) over(partition by name )
from
demo_3;
//按照排序之后的顺序偏移
select
*,
lag(orderdate,1) over(partition by name order by orderdate )
from
demo_3;
//这是错误的写法
select
*,
lag(orderdate,1) over(partition by name order by orderdate rows between unbounded preceding and current row ) num
from
demo_3;
5.查询前20%时间的订单信息
// ntile() 函数的应用 按照某个字段排序获取前%
with x as (
select
* ,
ntile(5) over(order by cost) as t1
from
demo_3)
select *
from x
where x.t1 = 1;
孙悟空,语文,87
孙悟空,数学,95
孙悟空 英语,68
大海,语文,94
大海,数学,56
大海,英语,84
宋宋,语文,64
宋宋,数学,86
宋宋,英语,84
婷婷,语文,65
婷婷,数学,85
婷婷,英语,78
create table demo_4
(
name string,
subject string,
score int
) row format delimited fields terminated by ",";
#加载数据
load data local inpath '/root/4.txt' into table demo_4;
1.每门学科学生成绩排名(是否并列排名、空位排名三种实现)
对排序函数的简单应用
row_number() 连续值相同的不同序号
rank() 不连续
dense_rank() 连续
2.每门学科成绩排名top n的学生 使用排序函数
with x as (
select
*,
row_number() over( partition by subject order by score) as num
from
demo_4)
select
*
from x
where x.num <= 3
a,2017-02-05,200
a,2017-02-06,300
a,2017-02-07,200
a,2017-02-08,400
a,2017-02-10,600
b,2017-02-05,200
b,2017-02-06,300
b,2017-02-08,200
b,2017-02-09,400
b,2017-02-10,600
c,2017-01-31,200
c,2017-02-01,300
c,2017-02-02,200
c,2017-02-03,400
c,2017-02-10,600
a,2017-03-01,200
a,2017-03-02,300
a,2017-03-03,200
a,2017-03-04,400
a,2017-03-05,600
create table demo_5 (
name string ,
dt string ,
money double
)
row format delimited fields terminated by "," ;
load data local inpath "/root/5.txt" into table demo_5 ;
1.连续n天销售记录的店铺案例(7天)
//使用dense_rank() 函数
//date_sub() 日期函数相减 select date_sub(cast("2020-04-09",1)) 日期函数
select
distinct t2.name
from
(
select
t1.name
from
(
with x as (
select
*,
dense_rank() over(partition by name order by dt ) as num
from
demo_5)
select
*,
date_sub(cast(x.dt as Date ),x.num) as result
from x) as t1
group by t1.result,t1.name
having count(result) >= 3
) as t2
//方法二 使用lag偏移?忘了
case when 函数的使用
悟空,A,男
娜娜,A,男
宋宋,B,男
凤姐,A,女
热巴,B,女
慧慧,B,女
create table demo_6(
name string ,
dname string ,
gender string
)
row format delimited fields terminated by "," ;
load data local inpath "/root/6.txt" into table demo_6 ;
dpt 男 女
A 2 1
B 1 2
select
dname ,gender ,count(gender)
from
demo_6
group by dname ,gender
//使用聚合函数
select
dname,
sum(case gender when "男" then 1 else 0 end ) as nan,
sum(case gender when "女" then 1 else 0 end ) as nv
from
demo_6
group by dname;
CONCAT()的使用 split()切分的使用 explode()炸裂函数的使用
网友评论