select
聚合函数
max/min/count/sum/avg 多进一出
分组函数 group by
求每个部门的平均工资
-- Average salary per department.
SELECT
    deptno,
    avg(salary)
FROM ruozedata_emp
GROUP BY deptno;
Expression not in GROUP BY key 'ename'
select中出现的字段,如果没有出现在组函数/聚合函数中,必须出现在group by里面
求每个部门(deptno)、工作岗位(job)的最高工资(salary)
求每个部门的平均工资大于2000的部门
-- Departments whose average salary exceeds 2000.
-- Fix: the original placed WHERE after GROUP BY, which is a syntax error,
-- and WHERE cannot reference an aggregate anyway (it filters rows BEFORE
-- grouping). Filtering on an aggregated value requires HAVING, which
-- filters groups AFTER aggregation.
SELECT
    deptno,
    avg(salary)
FROM ruozedata_emp
GROUP BY deptno
HAVING avg(salary) > 2000;
where需要写在group by之前;对聚合结果(如avg(salary))进行过滤必须使用having,写在group by之后
where和having的执行分别在什么位置?
where在分组之前执行,having在分组之后执行;能用where过滤的条件尽量写在where里,先筛选完再分组,需要分组的数据就变少了,效率会高
case when then
-- Label each employee's salary with a band.
-- Fix: the original first branch was `salary>1 AND salary<=1000`, so any
-- salary <= 1 fell through every WHEN and was mislabeled 'highest' by the
-- ELSE. CASE evaluates branches in order, so each branch only needs an
-- upper bound; the first branch now covers everything up to 1000.
SELECT
    ename,
    salary,
    CASE
        WHEN salary <= 1000 THEN 'lower'
        WHEN salary <= 2000 THEN 'middle'
        WHEN salary <= 4000 THEN 'high'
        ELSE 'highest'
    END
FROM ruozedata_emp;
join
1 ruoze
2 j
3 k
1 30
2 29
4 21
-- Left side of the join demo: id -> name, loaded from a tab-separated file.
-- IF NOT EXISTS makes the script safe to re-run.
CREATE TABLE IF NOT EXISTS a (
    id   int,
    name string
)
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t';
-- Right side of the join demo: id -> age, loaded from a tab-separated file.
-- IF NOT EXISTS makes the script safe to re-run.
CREATE TABLE IF NOT EXISTS b (
    id  int,
    age int
)
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t';
-- Load the tab-separated sample files into the join demo tables,
-- replacing any existing contents (OVERWRITE).
LOAD DATA LOCAL INPATH '/home/hadoop/data/join_a.txt' OVERWRITE INTO TABLE a;
LOAD DATA LOCAL INPATH '/home/hadoop/data/join_b.txt' OVERWRITE INTO TABLE b;
inner join = join
1 ruoze 30
2 j 29
outer join : left right full
1 ruoze 30
2 j 29
3 k NULL
分区表 partition
who when what
click_log
click_log_yyyyMMdd
hive hdfs + partition <== where partition
==> reduce io
分区表存在的意义
静态分区
-- Single-level static partition table: event_month is a partition column,
-- stored as a directory level on HDFS rather than inside the data files.
-- IF NOT EXISTS makes the DDL safe to re-run.
CREATE TABLE IF NOT EXISTS order_partition (
    ordernumber string,
    eventtime   string
)
PARTITIONED BY (event_month string)
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t';

-- Static partition load: the target partition value is supplied
-- explicitly in the statement.
LOAD DATA LOCAL INPATH '/home/hadoop/data/order.txt'
OVERWRITE INTO TABLE order_partition
PARTITION (event_month='2014-05');
-- NOTE(review): the next two statements are NOT HiveQL. They look like
-- MySQL DDL meant to be run against the Hive metastore database (its
-- PARTITIONS and PARTITION_KEYS tables), presumably to switch those
-- tables to latin1 so partition operations don't fail on character-set
-- issues -- confirm against the actual metastore setup before running.
alter table PARTITIONS convert to character set latin1;
alter table PARTITION_KEYS convert to character set latin1;
-- Explicitly register a (possibly empty) partition on the table;
-- IF NOT EXISTS makes the statement safe to re-run.
ALTER TABLE order_partition ADD IF NOT EXISTS
PARTITION (event_month='2014-07') ;
-- Staging table with the same column layout as order_partition but no
-- partition columns; it feeds the INSERT ... SELECT below.
CREATE TABLE order_4_partition (
    ordernumber string,
    eventtime   string
)
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t';

LOAD DATA LOCAL INPATH '/home/hadoop/data/order.txt'
OVERWRITE INTO TABLE order_4_partition;

-- Populate a static partition from another table's rows.
INSERT OVERWRITE TABLE order_partition
PARTITION (event_month='2014-08')
SELECT * FROM order_4_partition;
以上讲解的是静态分区中的单级分区
多级分区
-- Multi-level static partitioning: two partition columns (month, day).
-- (The table name keeps the original "mulit" spelling so existing
-- references still resolve.)
CREATE TABLE order_mulit_partition (
    ordernumber string,
    eventtime   string
)
PARTITIONED BY (event_month string, event_day string)
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t';

-- A static load into a multi-level partition must supply a value for
-- EVERY partition column.
LOAD DATA LOCAL INPATH '/home/hadoop/data/order.txt'
OVERWRITE INTO TABLE order_mulit_partition
PARTITION (event_month='2014-05', event_day='01');
动态分区
-- Static-partition demo table: the partition value is hard-coded by the
-- writer at insert time.
CREATE TABLE ruozedata_static_emp (
    empno    int,
    ename    string,
    job      string,
    mgr      int,
    hiredate string,
    salary   double,
    comm     double
)
PARTITIONED BY (deptno string)
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t';

-- Static partition insert: the target partition (deptno='20') is fixed
-- in the statement, and the SELECT filters to matching rows only.
INSERT INTO TABLE ruozedata_static_emp PARTITION (deptno='20')
SELECT empno, ename, job, mgr, hiredate, salary, comm
FROM ruozedata_emp
WHERE deptno = 20;
-- Dynamic-partition demo table: partition values come from the data
-- itself instead of being hard-coded in the INSERT.
CREATE TABLE ruozedata_dynamic_emp (
    empno    int,
    ename    string,
    job      string,
    mgr      int,
    hiredate string,
    salary   double,
    comm     double
)
PARTITIONED BY (deptno string)
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t';
动态分区明确要求:分区字段写在select的最后面
-- Fix: the mode switch originally appeared AFTER the insert. Hive's
-- default strict mode rejects an insert whose partition columns are all
-- dynamic, so the setting must be applied first for the statement below
-- to succeed in a fresh session.
set hive.exec.dynamic.partition.mode=nonstrict;

-- Dynamic partition insert: the partition column (deptno) must be the
-- LAST column in the SELECT list; Hive routes each row to the partition
-- named by that value.
INSERT INTO TABLE ruozedata_dynamic_emp PARTITION (deptno)
SELECT empno, ename, job, mgr, hiredate, salary, comm, deptno
FROM ruozedata_emp;
这是hive中常用的设置key=value的方式
语法格式:
set key=value; 设置
set key; 取值
函数(内置函数):function
官网地址:
https://cwiki.apache.org/confluence/display/Hive/LanguageManual+UDF#LanguageManualUDF-DateFunctions
build-in
show functions
cast
作业:
1) hadoop fs -ls vs hadoop fs -ls /
hadoop fs -ls 列出的是当前用户在HDFS上的家目录(/user/&lt;用户名&gt;)下的内容;而 hadoop fs -ls / 列出的是HDFS根目录下的内容
2) left semi join
参考:
https://blog.csdn.net/bbbeoy/article/details/62236729
网友评论