Hive2

作者: lufaqiang | 来源:发表于2017-11-19 08:29 被阅读0次

beeline的连接
HDP版本hive升级
安全知识&kerberos初识
Hive2
Hive2
Hive2
oozie-hive-ssh调用
Impala常见问题笔记
hive jdbc 连接串
hive2的客户端连接

Hive的部署
解压缩、重命名、设置环境变量
在目录$HIVE_HOME/conf/下，执行命令mv hive-default.xml.template hive-site.xml重命名
hive.metastore.warehouse.dir： Hive数据存储目录
hive.exec.scratchdir：Hive数据临时文件目录
在目录$HIVE_HOME/conf/下，执行命令mv hive-env.sh.template hive-env.sh重命名
修改hadoop的配置文件hadoop-env.sh，修改内容如下：
Export HADOOP_CLASSPATH=.:$CLASSPATH:$HADOOP_CLASSPATH:$HADOOP_HOME/bin
在目录$HIVE_HOME/bin下面，修改文件hive-config.sh（可跳过）
hive-site.xml

<property>
<name>javax.jdo.option.ConnectionURL</name>
<value>jdbc:mysql://localhost:3306/hive?createDatabaseIfNotExist=true</value>
<description>JDBC connect string for a JDBC metastore</description>
</property>
<property>
<name>javax.jdo.option.ConnectionUserName</name>
<value>hive</value>
<description>username to use against metastore database</description>
</property>
<property>
<name>javax.jdo.option.ConnectionPassword</name>
<value>123456</value>
<description>password to use against metastore database</description>
</property>

查看hive版本：
mysql> select * from VERSION;

查看有哪些表：
mysql> select * from TBLS \G;

查看表对应的hdfs目录的metedata：
mysql> select * from SDS \G;

查看某个表的partitions：
mysql> select * from PARTITIONS where TBL_ID=1 \G;

查看某个表的列：
mysql> select * from COLUMNS_V2;

查看某个表的partition：
mysql> select * from PARTITION_KEYS;

连接bigdata 元数据库

从/usr/local/hive/conf/hive-site.xml找ConnectionUserName和ConnectionPassword（hive/hive2017）

跳到bigdata003

mysql -u hive -phive2017 -D hive

Hive与HBase对比
共同点
1.HBase与Hive都架构在Hadoop之上，都是用hdfs作为底层存储。
2.职能上都能对外提供表形式的数据查询等服务。

区别
1.Hive是建立在Hadoop之上为了减少MapReduce jobs编写工作的批处理系统，HBase是为了支持弥补Hadoop对实时操作的缺陷的项目。
2.Hive本身不存储和计算数据，它完全依赖于HDFS和MapReduce，Hive中的表是纯逻辑表，是对hdfs文件的一种记录方式。
3.Hbase是物理表，不是逻辑表，提供一个超大的内存hash表，搜索引擎通过它来存储索引，方便查询操作。

create table city(
province_code INT,
province_name string,
city_code INT,
city_name string
)
row FORMAT delimited
fields terminated by ','
lines terminated by '\n';

load data local inpath '/home/bigdata/hive/city.txt' into table city; //将数据从‘’中取出来，放到city表中。

alter table city rename to city_new;

select * from city_new limit 10; //查询十条数据

select count(*) from city_new; //查询一共多少条数据

select * from city_new order by city_code desc limit 10 ; //最大值

select province_name,count(*) as cnt from city_new group by province_name order by cnt desc limit 10;
//聚合函数在后面跟着order by的话，必须将聚合函数，设置别名。

select count(distinct province_name) from city_new ; //查询省份的个数

select
count(*)
from
(
select
province_name
from
city_new
group by
province_name
)a; //查询省份的个数

//查询只有一个市的省份
1.首先，查出来所有省份及其市的个数
2.查询市的个数为1的省份

select province_name,count(*) as su from city_new group by province_name having su = 1;

select province_name,su from (
select province_name,count(*) as su from city_new group by province_name

)a
where su = 1;

外表与内表
//创建一个外表，将'/user/cll/city/'这个路径下的文件，倒进来
create EXTERNAl table city_ex(
province_code int,
province_name String,
city_code int,
city_name String

)
row FORMAT delimited
fields terminated by ','
lines terminated by '\n'
location '/user/cll/city/';

//创建一个内表，将'/user/cll/city/'这个路径下的文件，倒进来
//'/user/cll/city/'这个路径是将city.txt文件put到服务器上的
create table city_in(
province_code int,
province_name String,
city_code int,
city_name String

)
row FORMAT delimited
fields terminated by ','
lines terminated by '\n'
location '/user/cll/city/';

show create table city_ex;
show create table city_in;
//将外表删除以后，倒进来的文件的路径，去服务器上查询，仍然存在；hadoop fs -ls /user/cll/city 文件下依然有文件
//将内表删除以后，倒进来的文件的路径，去服务器上查询，文件也不存在。

分区：
CREATE TABLE user(
uid INT,
city_code INT,
model string,
access string
)
row FORMAT delimited
fields terminated by ','
lines terminated by '\n';

load data local inpath '/home/bigdata/tanqi/hive/user.txt' into table user;

创建分区表：
create table user_daily(
uid int,
city_code int,
model String,
access String

)
partitioned by (p_date String);

//从user表中查出来数据，然后将其插入到表user_daily 2017.09.01这一天里面去
//静态分区
insert OVERWRITE table user_daily partition (p_date = '2017-09-01')
select * from user;

//插进来以后，可以展示查询知道，有几个分区
show partitions user_daily;

//动态分区（不指定日期）
set hive.exec.dynamic.partition.mode=nonstrict;
INSERT OVERWRITE TABLE user_daily PARTITION (p_date)
SELECT *,'2017-09-02' FROM user
UNION ALL
SELECT *,'2017-09-03' FROM user
;

ALTER TABLE user_daily PARTITION (p_date='2017-01-01') RENAME TO PARTITION (p_date='20170101');

insert OVERWRITE table user_daily partition (p_date = '2017-01-01')
select * from user;

alter table user_daily drop partition (p_date='20170101');

select * from user_daily where p_date='2017-09-02';

select p_date,count(*) from user_daily group by p_date;

作业1：

a)统计WIFI环境下用户最多的5个城市
select city_code, count(uid) as cun from user where access='WIFI' group by city_code order by cun desc limit 5; //在前面加一个explain，可以查看执行过程
b)统计用户数>=3的access和city_code组合

select city_code,access,count(uid) as cun from user group by city_code,access having cun>=3;
c)机型中带有ne（不区分大小写），不同access及对应用户数
selectaccess,count(uid) from user where lower(model) like '%n%e%' group by access order by access desc;

select sum(if(access='2G',1,0))/count(*) from user ; //求占比（WIFI人数与总数的比值）

//计算尾数为[0-3]、[4-7]、[8-9]的人数
select
case
when uid % 10 in (0,1,2,3) then '0-3'
when uid % 10 in (4,5,6,7) then '4-7'
else '8-9'
end as interval,
count(*) as cun
from user
group by
case
when uid % 10 in (0,1,2,3) then '0-3'
when uid % 10 in (4,5,6,7) then '4-7'
else '8-9'
end;

//集合
select collect_set(access) from user; //去重
select collect_list(access) from user; //不去重

//join

select user.uid, user.city_code, city_new.city_name
from
(select * from user where uid <= 100) user
right join
(select * from city_new where province_code <= 30) city_new
on (user.city_code = city_new.city_code)
limit 20;

select user.uid, user.city_code, city_new.city_name
from
(select * from user where uid <= 100) user
inner join
(select * from city_new where province_code <= 30) city_new
on (user.city_code = city_new.city_code)
limit 20;

select user.uid, user.city_code, city_new.city_name
from
(select * from user where uid <= 100) user
full join
(select * from city_new where province_code <= 30) city_new
on (user.city_code = city_new.city_code)
limit 20;

窗口函数：(a\、b)
a\分组topN：(窗口函数)
1、分组 access
2、排序序号
3、top1 序号=1
select access,city_code,uid
from
(
select uid,access,city_code,
row_number() over (partition by access order by
city_code desc) as row_num
from user
)a
where row_num = 1;

b\累积和
select p_date,
sum(cun) over (order by p_date asc rows between unbounded preceding and current row)
from
(
select p_date,count(*) as cun
from user_daily
where p_date between '2017-09-01' and '2017-09-30'
group by p_date
)a
;

create table phone (
model String,
os String,
brand String,
resolution String,
launch_date String
)
row FORMAT delimited
fields terminated by ','
lines terminated by '\n'
location '/user/cll/phone/';

select * from phone limit 10;
set hive.cli.print.header=true;

select user.uid, user.city_code, city.province_name
from user left join city
on (user.city_code = city.city_code)
limit 20;

select user.model,user.access, user.city_code, phone.launch_date
from user left join phone
on (user.model = phone.model)
limit 20;

create table user_info(
uid int,
access String,
city_code int,
province_name String,
model String,
launch_date String
)
row FORMAT delimited
fields terminated by ','
lines terminated by '\n'

insert into //如果插入错了，在来一遍的话，需要覆盖式插入(insert overwrite) table user_info
select user.uid, user.access,user.city_code, city.province_name,user.model,phone.launch_date
from user
left join city on (user.city_code = city.city_code)
left join phone on (user.model = phone.model)
;

select * from phone;

select * from user_info limit 10;

作业2
create table phone (
model string,
os string,
brand string,
resolution string,
launch_data string
)
row FORMAT delimited
fields terminated by ','
lines terminated by '\n';
load data local inpath '/home/bigdata/tanqi/hive/phone.txt' into table phone;
set hive.cli.print.header=true;

create table user_info (
uid int,
access string,
city_code string,
provice_name string,
model string,
launch_data string
)
row FORMAT delimited
fields terminated by ','
lines terminated by '\n'
;
加载数据
以user为主表，分别左连接phone和city表，生成新表user_info
a)user和city连接时，利用city_code相等；user和phone连接时，利用model相等
b)user_info表包含字段：uid, access, city_code, province_name, model,launch_date

insert into table user_info
select user.uid, user.access, user.city_code, city.provice_name, user.model,phone.launch_data
from
user
left join
city
on (user.city_code = city.city_code)
left join
phone
on (user.model = phone.model);

a)统计不同省份下uid最大时的：province_name,uid,model
select provice_name,uid,model from
(
select provice_name,uid,model,
row_number() over (partition by provice_name order by uid desc) as row_num
from user_info
)a
where row_num=1;

b)按launch_date统计每年对应的累计总用户数。结果类似：
select launch_data,
sum(cnt) over (order by launch_data asc rows between unbounded preceding and current row)
from
(
select launch_data,count(*) as cnt from user_info
where launch_data between '2002' and '2016'
group by launch_data
)a
;