美文网首页
2020-11-08-Hive-8(复习hive的基本概念与窗口

2020-11-08-Hive-8(复习hive的基本概念与窗口

作者: 冰菓_ | 来源:发表于2020-11-10 07:51 被阅读0次

1.HiveServer 和 HiveServer2 服务

Hive 内置了 HiveServer 和 HiveServer2 服务,两者都允许客户端使用多种编程语言进行连接,但是 HiveServer 不能处理多个客户端的并发请求,因此产生了 HiveServer2。HiveServer2(HS2)允许远程客户端可以使用各种编程语言向 Hive 提交请求并检索结果,支持多客户端并发访问和身份验证。HS2 是由多个服务组成的单个进程,其包括基于 Thrift 的 Hive 服务(TCP 或 HTTP)和用于 Web UI 的 Jetty Web 服务。
修改 hadoop 集群的 core-site.xml 配置文件,增加如下配置,指定 hadoop 的 root 用户可以代理本机上所有的用户。

<property>
 <name>hadoop.proxyuser.root.hosts</name>
 <value>*</value>
</property>
<property>
 <name>hadoop.proxyuser.root.groups</name>
 <value>*</value>
</property>
//复习窗口函数,发现窗口函数忘得差不多的了........

20191020,11111,85
20191020,22222,83
20191020,33333,86
20191021,11111,87
20191021,22222,65
20191021,33333,98
20191022,11111,67
20191022,22222,34
20191022,33333,88
20191023,11111,99
20191023,22222,33

create table demo_2
(logday string,    
userid string, 
score int)
ROW FORMAT DELIMITED FIELDS TERMINATED BY ',';

#加载数据
load data local inpath '/root/2.txt' into table demo_2;

1、使用 over() 函数进行数据统计, 统计每个用户及表中数据的总数

//犯了一个错误? count函数是分组后求,窗口函数和聚合函数的区别在于聚合和不聚合
select 
count() over(partition by userid) as num
from 
demo_2;

//注意sum()是聚合函数 
select 
userid,
sum(1) over() as num
from 
demo_2;

2、求用户明细并统计每天的用户总数

//这个题目没什么意义
select
userid,
count() over(partition by logday) as num
from 
demo_2;

3.计算从第一天到现在的所有 score 大于80分的用户总数
//分析思路

select 
count(userid) 
from 
demo_2
where  score > 80
//group by  userid;

//采用博客中的写法 不分组全局排序
select 
*,
count() over(order by logday  rows between unbounded preceding and current row) as num
from
demo_2
where score > 80;

//范围的总结
rows  between  current  preceding/following  and  


4.计算每个用户到当前日期分数大于80的天数
select
*,
count() over(partition by  userid order by logday rows between unbounded preceding and current row )
from 
demo_2
where score > 80
order by userid,logday;

jack,2017-01-01,10
tony,2017-01-02,15
jack,2017-02-03,23
tony,2017-01-04,29
jack,2017-01-05,46
jack,2017-04-06,42
tony,2017-01-07,50
jack,2017-01-08,55
mart,2017-04-08,62
mart,2017-04-09,68
neil,2017-05-10,12
mart,2017-04-11,75
neil,2017-06-12,80
mart,2017-04-13,94

create table demo_3
(
name string, 
orderdate string,
cost int
)ROW FORMAT DELIMITED FIELDS TERMINATED BY ',';

#加载数据
load  data local inpath "/root/3.txt" into table demo_3;


1.查询在2017年4月份购买过的顾客及总人数
//substr()函数应用
select 
*,
count() over(order by orderdate)
from
demo_3
where substr(orderdate,1,7) = '2017-04'
order by orderdate;

2.查询顾客的购买明细及月购买总额

//理解 partition by分组的标准 order by 可以理解为函数操作的窗口标准 窗口大小可以理解为对字段排序的的范围标准
select 
*,
sum(cost) over(partition by name order by  substr(orderdate,1,7) rows between unbounded preceding and current row ) as result   
from
demo_3;

3.查询顾客的购买明细及到目前为止每个顾客购买总金额

select 
*,
sum(cost)  over(partition by name order by orderdate rows between unbounded preceding and current row) as  result 
from
demo_3;

4.查询顾客上次的购买时间----lag()over()偏移量分析函数的运用
//函数偏移量分析的总结:注意之间的区别

//没有排序按默认的顺序偏移
select 
*,
lag(orderdate,1) over(partition by name )
from
demo_3;

//按照排序之后的顺序偏移
select 
*,
lag(orderdate,1) over(partition by name order by orderdate )
from
demo_3;

//这是错误的写法
select 
*,
lag(orderdate,1) over(partition by name order by orderdate rows between unbounded preceding and current row  ) num 
from
demo_3;

5.查询前20%时间的订单信息
// ntile() 函数的应用 按照某个字段排序获取前%
with x as (
select 
* ,
ntile(5) over(order  by  cost) as t1
from
demo_3)
select *
from  x
where  x.t1 = 1;



孙悟空,语文,87
孙悟空,数学,95
孙悟空 英语,68
大海,语文,94
大海,数学,56
大海,英语,84
宋宋,语文,64
宋宋,数学,86
宋宋,英语,84
婷婷,语文,65
婷婷,数学,85
婷婷,英语,78

create table demo_4
(
name string,
subject string, 
score int
) row format delimited fields terminated by ",";

#加载数据
load data local inpath '/root/4.txt' into table demo_4;

1.每门学科学生成绩排名(是否并列排名、空位排名三种实现)

对排序函数的简单应用
row_number()  连续值相同的不同序号
rank()       不连续 
dense_rank() 连续

2.每门学科成绩排名top n的学生  使用排序函数
with x as (
select 
*,
row_number() over( partition by subject order by score) as num
from
demo_4)
select
* 
from x
where x.num <= 3

a,2017-02-05,200
a,2017-02-06,300
a,2017-02-07,200
a,2017-02-08,400
a,2017-02-10,600
b,2017-02-05,200
b,2017-02-06,300
b,2017-02-08,200
b,2017-02-09,400
b,2017-02-10,600
c,2017-01-31,200
c,2017-02-01,300
c,2017-02-02,200
c,2017-02-03,400
c,2017-02-10,600
a,2017-03-01,200
a,2017-03-02,300
a,2017-03-03,200
a,2017-03-04,400
a,2017-03-05,600

create  table  demo_5 (
name string ,
dt string ,
money double 
)
row format delimited fields terminated by "," ;
load data local inpath "/root/5.txt"  into table demo_5 ;


1.连续n天销售记录的店铺案例(7天)
//使用dense_rank() 函数
//date_sub() 日期函数相减  select date_sub(cast("2020-04-09",1)) 日期函数 

select 
distinct t2.name
from 
(
select 
t1.name
from
(
with x as (
select 
*,
dense_rank() over(partition by name order by dt ) as num
from
demo_5)
select 
*,
date_sub(cast(x.dt as Date ),x.num) as result
from x) as t1
group by t1.result,t1.name
having  count(result)  >= 3  
) as t2 

//方法二 使用lag偏移?忘了


case when 函数的使用

悟空,A,男
娜娜,A,男
宋宋,B,男
凤姐,A,女
热巴,B,女
慧慧,B,女
create table demo_6(
name string ,
dname string ,
gender string 
)
row format delimited fields terminated by "," ;
 
load data local inpath "/root/6.txt" into table demo_6 ;

dpt   男      女
A     2       1
B     1       2

select 
dname ,gender ,count(gender)
from
demo_6
group by dname ,gender

//使用聚合函数
select 
dname,
sum(case gender when  "男"  then 1 else  0 end  ) as  nan,
sum(case gender when   "女"  then 1 else  0 end  ) as  nv
from
demo_6
group by dname;


CONCAT()的使用  split()切分的使用  explode()炸裂函数的使用

相关文章

网友评论

      本文标题:2020-11-08-Hive-8(复习hive的基本概念与窗口

      本文链接:https://www.haomeiwen.com/subject/xrbcbktx.html