Hive is a data warehouse framework built on top of Hadoop.
![](https://img.haomeiwen.com/i11797539/988228b9adb1baac.png)
![](https://img.haomeiwen.com/i11797539/07947f47ac2bbf6f.png)
The metastore is the central repository for Hive metadata. It consists of two parts: a service and the backing store for the data.
Hive data types
![](https://img.haomeiwen.com/i11797539/5a0d2d1a52df2403.png)
![](https://img.haomeiwen.com/i11797539/bc56719ec0a205d4.png)
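The images above list Hive's primitive and complex types. A quick sketch of the complex types (ARRAY, MAP, STRUCT) in use; the table and column names are illustrative:
CREATE TABLE complex (
  c1 ARRAY<INT>,
  c2 MAP<STRING, INT>,
  c3 STRUCT<a:STRING, b:INT, c:DOUBLE>
);
-- Index with [] for arrays, ['key'] for maps, and dot notation for structs
SELECT c1[0], c2['b'], c3.c FROM complex;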
Managed tables and external tables
With a managed table, Hive moves the data into its warehouse directory and deletes both the data and the metadata when the table is dropped; with an external table, Hive only records the data's location, and dropping the table removes the metadata but leaves the data in place.
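A minimal sketch of the two kinds of table (the HDFS path is illustrative):
-- Managed: Hive takes ownership of the data under its warehouse directory
CREATE TABLE managed_table (dummy STRING);
-- External: Hive only records the location; the data is neither moved nor deleted
CREATE EXTERNAL TABLE external_table (dummy STRING)
LOCATION '/user/tom/external_table';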
Storage formats
-- The default storage format is delimited text (the default field delimiter is Ctrl-A; overridden here with a tab)
CREATE TABLE things (id INT, name STRING)
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t';
-- Binary storage formats: sequence files, Avro, Parquet, RCFile, ORCFile
CREATE TABLE … STORED AS AVRO;
CREATE TABLE … STORED AS PARQUET;
-- Using a SerDe: RegexSerDe
-- Read fixed-width station metadata from a text file with a regular expression
CREATE TABLE stations (usaf STRING, wban STRING, name STRING)
ROW FORMAT SERDE 'org.apache.hadoop.hive.contrib.serde2.RegexSerDe'
WITH SERDEPROPERTIES (
"input.regex" = "(\\d{6}) (\\d{5}) (.{29}) .*"
);
-- Load data into the table with LOAD DATA
LOAD DATA LOCAL INPATH "input/ncdc/metadata/stations-fixed-width.txt"
INTO TABLE stations;
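The SerDe deserializes rows when the table is queried, so a quick check that the regular expression parsed the fields:
SELECT * FROM stations LIMIT 4;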
-- Create a table stored as Avro and populate it with INSERT
CREATE TABLE users_avro (id INT, name STRING)
STORED AS AVRO;
INSERT OVERWRITE TABLE users_avro
SELECT * FROM users;
-- Create a Parquet table and populate it directly from a query result (CTAS)
CREATE TABLE users_parquet STORED AS PARQUET
AS
SELECT * FROM users;
-- Create an ORCFile table and populate it directly from a query result
CREATE TABLE users_orc STORED AS ORCFILE
AS
SELECT * FROM users;
-- Create a sequence-file table and populate it directly from a query result
CREATE TABLE users_seqfile STORED AS SEQUENCEFILE
AS
SELECT id, name FROM users;
Importing data
-- Multi-table insert
CREATE TABLE records2 (station STRING, year STRING, temperature INT, quality INT)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY '\t';
FROM records2
INSERT OVERWRITE TABLE stations_by_year
SELECT year, COUNT(DISTINCT station)
GROUP BY year
INSERT OVERWRITE TABLE records_by_year
SELECT year, COUNT(1)
GROUP BY year
INSERT OVERWRITE TABLE good_records_by_year
SELECT year, COUNT(1)
WHERE temperature != 9999 AND quality IN (0, 1, 4, 5, 9)
GROUP BY year;
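Note that the three target tables must already exist for the multi-insert to run; a minimal sketch, with assumed column names:
CREATE TABLE stations_by_year (year STRING, num_stations INT);
CREATE TABLE records_by_year (year STRING, num_records INT);
CREATE TABLE good_records_by_year (year STRING, num_good_records INT);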
Altering tables
-- Add a new column to an existing table
ALTER TABLE XXX ADD COLUMNS (col3 STRING);
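Renaming is similar; for a managed table this also moves the table's directory in the warehouse (table names here are placeholders):
ALTER TABLE XXX RENAME TO YYY;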
Dropping tables
-- TRUNCATE deletes a table's data but keeps its definition. It does not work on external tables; in that case, remove the files with dfs -rmr from the Hive shell
TRUNCATE TABLE XXX;
-- Use LIKE to create a new, empty table with the same schema as an existing one
CREATE TABLE AAA LIKE XXX;
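For completeness, DROP TABLE itself: on a managed table it removes both metadata and data; on an external table only the metadata is removed:
DROP TABLE XXX;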
MapReduce scripts
# Python: filter out low-quality temperature readings
import re
import sys

for line in sys.stdin:
    (year, temp, q) = line.strip().split()
    if (temp != "9999" and re.match("[01459]", q)):
        print("%s\t%s" % (year, temp))
-- Using the script from Hive with TRANSFORM
ADD FILE /Users/tom/book-workspace/hadoop-book/ch17-hive/src/main/python/is_good_quality.py;
FROM records2
SELECT TRANSFORM(year, temperature, quality)
USING 'is_good_quality.py'
AS year, temperature;
# Python script: reduce to the maximum temperature for each key
#!/usr/bin/env python
import sys
(last_key, max_val) = (None, 0)
for line in sys.stdin:
    (key, val) = line.strip().split("\t")
    if last_key and last_key != key:
        print("%s\t%s" % (last_key, max_val))
        (last_key, max_val) = (key, int(val))
    else:
        (last_key, max_val) = (key, max(max_val, int(val)))

if last_key:
    print("%s\t%s" % (last_key, max_val))
ADD FILE ch17-hive/src/main/python/max_temperature_reduce.py;
FROM (
FROM records2
MAP year, temperature, quality
USING 'is_good_quality.py'
AS year, temperature) map_output
REDUCE year, temperature
USING 'max_temperature_reduce.py'
AS year, temperature;
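This query relies on all rows for a given year reaching the same reducer invocation in sorted order; to make that explicit, a CLUSTER BY can be added to the inner query (a sketch, not part of the original notes):
FROM (
FROM records2
MAP year, temperature, quality
USING 'is_good_quality.py'
AS year, temperature
CLUSTER BY year) map_output
REDUCE year, temperature
USING 'max_temperature_reduce.py'
AS year, temperature;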
User-defined functions (UDFs)
// A UDF that strips characters from the ends of strings
package com.hadoopbook.hive;

import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.hive.ql.exec.UDF;
import org.apache.hadoop.io.Text;

public class Strip extends UDF {
    private Text result = new Text();

    // Strip leading and trailing whitespace from the input
    public Text evaluate(Text str) {
        if (str == null) {
            return null;
        }
        result.set(StringUtils.strip(str.toString()));
        return result;
    }

    // Strip any of the characters in the given set from both ends of the string
    public Text evaluate(Text str, String stripChars) {
        if (str == null) {
            return null;
        }
        result.set(StringUtils.strip(str.toString(), stripChars));
        return result;
    }
}
Using the custom function
-- Register the function permanently (recorded in the metastore)
CREATE FUNCTION strip AS 'com.hadoopbook.hive.Strip' USING JAR '/path/to/hive-examples.jar';
SELECT strip(' bee ') FROM dummy;  -- returns 'bee'
SELECT strip('banana', 'ab') FROM dummy;  -- returns 'nan'
-- Or register a temporary function, valid only for the current session
ADD JAR /path/to/hive-examples.jar;
CREATE TEMPORARY FUNCTION strip AS 'com.hadoopbook.hive.Strip';
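To remove the function when it is no longer needed (a minimal sketch):
DROP FUNCTION strip;
-- or, for the session-scoped version:
DROP TEMPORARY FUNCTION strip;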