[toc]
Syntax
https://cwiki.apache.org/confluence/display/Hive/Home#Home-UserDocumentation
lead window function: given a source table with columns (userid, url, ref), produce for each row the next url the same user visited (next_url); ordering within a user is by a time column.
Input (userid, url, ref):
user1 url1 url0
user1 url2 url1
user1 url3 url2
select ref, url, lead(url,1,0) over (partition by userid order by time) as next_url
Output (ref, url, next_url):
url0 url1 url2
url1 url2 url3
url2 url3 0
explode
explode(ARRAY): generates one row per element of the array
explode(MAP): generates one row per key-value pair in the map, with the key in one column and the value in another
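A minimal sketch of both forms; the table tb_exp and its columns in the last query are hypothetical and only show how lateral view ties the exploded rows back to their source row:
select explode(array('a','b','c')) as item;          -- one row per array element
select explode(map('k1','v1','k2','v2')) as (k, v);  -- one row per key-value pair, key and value as two columns
select t.id, e.item
from tb_exp t
lateral view explode(t.arr) e as item;               -- hypothetical table tb_exp(id string, arr array<string>)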
External tables
external: the keyword that marks a table as external
Difference:
If a table is external, dropping it removes only the metadata and leaves the data files in place. If a table is managed, dropping it removes both the data files and the metadata.
Typical use:
external table + partitioned table
Create a table
create [external] table [ if not exists ] tb_name(
col1 type comment ''
……
)
partitioned by (col type)
row format delimited fields terminated by '\001'
lines terminated by '\n'
stored as textfile
location 'hdfs_path'; -- specifies the HDFS directory backing the table
load data [local] inpath 'path' [overwrite] into table tb_name partition (daystr='20180722')
Manually creating a partition and loading data
Manually create a partition directory
daystr=20180722
Manually put the data files into the partition directory
Two ways to register the partition (see the sketch below)
Option 1
alter table tb_name add partition (daystr='20180722')
Option 2
msck repair table tb_name
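A minimal sketch of the whole manual flow, run from the Hive CLI; the warehouse path and data file are hypothetical:
dfs -mkdir -p /user/hive/warehouse/tb_name/daystr=20180722;                       -- create the partition directory by hand
dfs -put /opt/datas/20180722.log /user/hive/warehouse/tb_name/daystr=20180722/;   -- drop the data file into it
alter table tb_name add partition (daystr='20180722');                            -- option 1: register this one partition
msck repair table tb_name;                                                        -- option 2: scan the directory and repair the partition metadata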
SQL
select … from … where … group by … having … order by … [desc] limit …
Four kinds of sorting
order by: global sort across all output
sort by: local sort; with multiple reducers, the data inside each reducer is sorted but there is no global order
distribute by: decides which reducer each row is sent to (partitioning)
cluster by: shorthand for the case where sort by and distribute by use the same column
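A minimal sketch contrasting the four, on a hypothetical table tb_name(userid, time, url):
select userid, time, url from tb_name order by time;                      -- one global order, forced through a single reducer
select userid, time, url from tb_name sort by time;                       -- each reducer's output is sorted, no global order
select userid, time, url from tb_name distribute by userid sort by time;  -- all rows of a user go to the same reducer, sorted by time there
select userid, time, url from tb_name cluster by userid;                  -- same as distribute by userid sort by userid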
Number of reduce tasks not specified. Estimated from input data size: 1
In order to change the average load for a reducer (in bytes):
Sets the maximum amount of data (in bytes) each reducer handles
set hive.exec.reducers.bytes.per.reducer=<number>
In order to limit the maximum number of reducers:
Sets the maximum number of reducers
set hive.exec.reducers.max=<number>
In order to set a constant number of reducers:
Sets a fixed number of reducers
set mapreduce.job.reduces=<number>
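A minimal sketch of how these knobs are typically combined before running a query; the values are only illustrative:
set hive.exec.reducers.bytes.per.reducer=256000000;  -- at most ~256 MB of input per reducer
set hive.exec.reducers.max=20;                       -- never launch more than 20 reducers, whatever the estimate says
set mapreduce.job.reduces=4;                         -- or pin the job to exactly 4 reducers, overriding the estimate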
Hive Data Loading
Data source
logs/20180722/2018072200.log
2018072201.log
……
2018072223.log
Hive table
log_source
day
hour
Requirement: starting at 00:00 every day, load the previous day's data into the corresponding partitions of the Hive table
Approach
Get the data source
The directory changes from day to day
Target
Hive table
Implementation
bin/hive -e/-f "load"
Load the data into the Hive table (see the sketch below)
load
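A minimal sketch of the statement such a bin/hive -e/-f script would issue for one hourly file, assuming the files already sit in HDFS under /logs/20180722/ (use load data local inpath for local files) and the partition columns are (day, hour) as outlined above:
load data inpath '/logs/20180722/2018072200.log'
into table log_source partition (day='20180722', hour='00');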
insert
Save query results into a Hive table
insert into/overwrite table tb_name [partition ()]
select ……
Save query results to the file system
insert overwrite [local] directory ''
create (create table … as select …)
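A minimal sketch of the insert and create variants, reusing log_source from the loading example above; the target table names and export path are hypothetical:
insert overwrite table tb_result
select url, count(*) as pv from log_source group by url;        -- results land in the (hypothetical) table tb_result
insert overwrite local directory '/opt/datas/export'
row format delimited fields terminated by '\t'
select url, guid from log_source;                               -- results land in a local directory as delimited text
create table tb_copy as select url, guid from log_source;       -- create-table-as-select builds the table and fills it in one step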
3. Hive Clients and Metadata Management
Hive clients
hive cli
beeline: a generic JDBC client; to connect it needs the JDBC driver, hostname:port, a username, and a password
Start hiveserver2
bin/hiveserver2
Start beeline
bin/beeline
!connect jdbc:hive2://hostname:10000
jdbc
https://cwiki.apache.org/confluence/display/Hive/HiveServer2+Clients#HiveServer2Clients-JDBC
Metadata service
derby
mysql
beeline -> hiveserver2 -> mysql (metadata)
metastore: a service that provides unified metadata management
beeline -> hiveserver2 -> metastore -> mysql
Configure the metastore service
hive.metastore.uris
thrift://hostname:9083
Start the metastore service
bin/hive --service metastore
Startup order
start the metastore
start hiveserver2
start the client
Note: once the hive.metastore.uris parameter is configured, the metastore service must be started before anything else
4. Hive Storage Formats and Compression
Compression
snappy / lzo / lz4
Setting compression in Hive
Same as in MapReduce: set the MapReduce parameters so that map output is compressed
mapreduce.map.output.compress=true
mapreduce.map.output.compress.codec=org.apache.hadoop.io.compress.Lz4Codec
hive.exec.compress.intermediate=true
When one SQL statement is executed as several chained MapReduce jobs, this controls whether the data handed from one job to the next is compressed
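A minimal sketch of turning this on for a session; Snappy is used here, and Lz4Codec or another Hadoop codec can be swapped in the same way:
set mapreduce.map.output.compress=true;                                             -- compress map output inside each job
set mapreduce.map.output.compress.codec=org.apache.hadoop.io.compress.SnappyCodec;
set hive.exec.compress.intermediate=true;                                           -- compress data passed between the jobs of a multi-stage query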
File formats
create table tb_name(
)
stored as textfile;
1 2 3
a b c
d e f
……
Row-oriented file format: textfile
a b c d e f ……   (rows are written one after another)
Column-oriented file formats: orc / parquet
a d …… b e …… c f ……   (all values of a column are stored together)
userid time url
HGCM21587IUT 2018-07-22 17:00:00 www.baidu.com
([^ ]*) ([^\?]*) ([^ ]*)
Solution: load the columns with a regular expression (see the sketch below)
https://cwiki.apache.org/confluence/display/Hive/GettingStarted#GettingStarted-ApacheWeblogData
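A minimal sketch using Hive's built-in RegexSerDe with the regex quoted above (the unnecessary backslash before ? dropped); the table and column names are hypothetical:
create table if not exists weblog_regex(
userid string,
visit_time string,
url string
)
row format serde 'org.apache.hadoop.hive.serde2.RegexSerDe'
with serdeproperties ("input.regex" = "([^ ]*) ([^?]*) ([^ ]*)")
stored as textfile;
-- the middle group can swallow the space inside "2018-07-22 17:00:00" because the last group cannot contain a space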
create database if not exists file_test;
use file_test;
create table if not exists source_file(
id string,
url string,
referer string,
keyword string,
type string,
guid string,
pageId string,
moduleId string,
linkId string,
attachedInfo string,
sessionId string,
trackerU string,
trackerType string,
ip string,
trackerSrc string,
cookie string,
orderCode string,
trackTime string,
endUserId string,
firstLink string,
sessionViewNo string,
productId string,
curMerchantId string,
provinceId string,
cityId string,
fee string,
edmActivity string,
edmEmail string,
edmJobId string,
ieVersion string,
platform string,
internalKeyword string,
resultSum string,
currentPage string,
linkPosition string,
buttonPosition string
)
row format delimited fields terminated by '\t';
load data local inpath '/opt/datas/2015082818' into table source_file;
parquet
create table if not exists source_file_p(
id string,
url string,
referer string,
keyword string,
type string,
guid string,
pageId string,
moduleId string,
linkId string,
attachedInfo string,
sessionId string,
trackerU string,
trackerType string,
ip string,
trackerSrc string,
cookie string,
orderCode string,
trackTime string,
endUserId string,
firstLink string,
sessionViewNo string,
productId string,
curMerchantId string,
provinceId string,
cityId string,
fee string,
edmActivity string,
edmEmail string,
edmJobId string,
ieVersion string,
platform string,
internalKeyword string,
resultSum string,
currentPage string,
linkPosition string,
buttonPosition string
)
row format delimited fields terminated by '\t'
stored as PARQUET;
insert overwrite table source_file_p
select * from source_file;
orc
create table if not exists source_file_o(
id string,
url string,
referer string,
keyword string,
type string,
guid string,
pageId string,
moduleId string,
linkId string,
attachedInfo string,
sessionId string,
trackerU string,
trackerType string,
ip string,
trackerSrc string,
cookie string,
orderCode string,
trackTime string,
endUserId string,
firstLink string,
sessionViewNo string,
productId string,
curMerchantId string,
provinceId string,
cityId string,
fee string,
edmActivity string,
edmEmail string,
edmJobId string,
ieVersion string,
platform string,
internalKeyword string,
resultSum string,
currentPage string,
linkPosition string,
buttonPosition string
)
row format delimited fields terminated by '\t'
stored as orc;
insert into table source_file_o
select * from source_file;
create table Addresses (
name string,
street string,
city string,
state string,
zip int
) stored as orc tblproperties ("orc.compress"="SNAPPY");
create table if not exists source_file_o_c(
id string,
url string,
referer string,
keyword string,
type string,
guid string,
pageId string,
moduleId string,
linkId string,
attachedInfo string,
sessionId string,
trackerU string,
trackerType string,
ip string,
trackerSrc string,
cookie string,
orderCode string,
trackTime string,
endUserId string,
firstLink string,
sessionViewNo string,
productId string,
curMerchantId string,
provinceId string,
cityId string,
fee string,
edmActivity string,
edmEmail string,
edmJobId string,
ieVersion string,
platform string,
internalKeyword string,
resultSum string,
currentPage string,
linkPosition string,
buttonPosition string
)
row format delimited fields terminated by '\t'
stored as orc tblproperties ("orc.compress"="SNAPPY");
insert into table source_file_o_c
select * from source_file;
Note: Parquet compression:
set parquet.compression=SNAPPY;
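A minimal sketch applying that setting, building a Snappy-compressed Parquet copy of source_file with create table ... as select; the target table name is hypothetical:
set parquet.compression=SNAPPY;
create table if not exists source_file_p_snappy
stored as parquet
as select * from source_file;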
5. Complex Case Implementation
6. Joins and Data Skew in Hive
Homework:
1. Implement PV and UV statistics for the e-commerce log analysis and export the results to MySQL
Create the database and the table
create database if not exists log_analysis;
use log_analysis;
create table if not exists log_source(
id string,
url string,
referer string,
keyword string,
type string,
guid string,
pageId string,
moduleId string,
linkId string,
attachedInfo string,
sessionId string,
trackerU string,
trackerType string,
ip string,
trackerSrc string,
cookie string,
orderCode string,
trackTime string,
endUserId string,
firstLink string,
sessionViewNo string,
productId string,
curMerchantId string,
provinceId string,
cityId string,
fee string,
edmActivity string,
edmEmail string,
edmJobId string,
ieVersion string,
platform string,
internalKeyword string,
resultSum string,
currentPage string,
linkPosition string,
buttonPosition string
) partitioned by (daystr string, hour string)
row format delimited fields terminated by '\t';
Save the daily PV and UV into a result table
daystr pv uv
Export the result to MySQL
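A minimal sketch of the aggregation step, assuming guid identifies a unique visitor and a hypothetical result table log_result(daystr, pv, uv); the export to MySQL itself (for example with a Sqoop job) is not shown:
create table if not exists log_result(
daystr string,
pv bigint,
uv bigint
)
row format delimited fields terminated by '\t';
insert overwrite table log_result
select daystr, count(url) as pv, count(distinct guid) as uv
from log_source
group by daystr;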