- Hive 总:https://cwiki.apache.org/confluence/display/Hive/Home
- 本文参考:https://cwiki.apache.org/confluence/display/Hive/GettingStarted
- functions: https://cwiki.apache.org/confluence/display/Hive/LanguageManual+UDF
安装
$ tar -xzvf hive-x.y.z.tar.gz
$ cd hive-x.y.z
$ export HIVE_HOME=$(pwd)
$ export PATH=$HIVE_HOME/bin:$PATH
你也可以选择编译hive源码
$ git clone https://git-wip-us.apache.org/repos/asf/hive.git
$ cd hive
$ mvn clean package -Pdist
$ cd packaging/target/apache-hive-{version}-SNAPSHOT-bin/apache-hive-{version}-SNAPSHOT-bin
$ ls
LICENSE
NOTICE
README.txt
RELEASE_NOTES.txt
bin/ (all the shell scripts)
lib/ (required jar files)
conf/ (configuration files)
examples/ (sample input and query files)
hcatalog/ (hcatalog installation)
scripts/ (upgrade scripts for hive-metastore)
#hive+hadoop一起编译
mvn clean package -Phadoop-1,dist
mvn clean package -Phadoop-2,dist
运行hive
hive有两种运行方式:第一种是CLI模式,直接以shell交互式运行;第二种是C-S(客户端-服务端)模式,即通过hiveserver2提供服务,客户端(如beeline)远程连接。
环境变量
#必要的环境变量
#hadoop home
$ export HADOOP_HOME=<hadoop-install-dir>
$ $HADOOP_HOME/bin/hadoop fs -mkdir /tmp
$ $HADOOP_HOME/bin/hadoop fs -mkdir /user/hive/warehouse
$ $HADOOP_HOME/bin/hadoop fs -chmod g+w /tmp
$ $HADOOP_HOME/bin/hadoop fs -chmod g+w /user/hive/warehouse
#hive_HOME
export HIVE_HOME=<hive-install-dir>
第一种模式:运行hive CLI
#直接运行命令
$ $HIVE_HOME/bin/hive
第二种模式
#初始化metastore的schema(元数据库表结构)
$ $HIVE_HOME/bin/schematool -dbType <db type> -initSchema
#启动hive Server
$ $HIVE_HOME/bin/hiveserver2
#启动hive 客户端
$ $HIVE_HOME/bin/beeline -u jdbc:hive2://$HS2_HOST:$HS2_PORT
#默认jdbc:hive2://localhost:10000
$ $HIVE_HOME/bin/beeline -u jdbc:hive2://
HCatalog
#server
$HIVE_HOME/hcatalog/sbin/hcat_server.sh
#client
$ $HIVE_HOME/hcatalog/bin/hcat
WebHCat
$ $HIVE_HOME/hcatalog/sbin/webhcat_server.sh
hive的运行模式
分为两种,一种是mapreduce运行在本地,另外一种是运行在集群上
hive> SET mapreduce.framework.name=local;
hive> SET hive.exec.mode.local.auto=false;
DDL
详细参考DDL
hive> CREATE TABLE pokes (foo INT, bar STRING);
#分区的意思是hdfs存储时候的物理目录按照分区字符串来存取。
hive> CREATE TABLE invites (foo INT, bar STRING) PARTITIONED BY (ds STRING);
hive> CREATE TABLE t_hive (a int, b int, c int) ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t';
#表描述
hive> SHOW TABLES;
hive> SHOW TABLES '.*s';
hive> DESCRIBE invites;
#alter
hive> ALTER TABLE events RENAME TO 3koobecaf;
hive> ALTER TABLE pokes ADD COLUMNS (new_col INT);
hive> ALTER TABLE invites ADD COLUMNS (new_col2 INT COMMENT 'a comment');
hive> ALTER TABLE invites REPLACE COLUMNS (foo INT, bar STRING, baz INT COMMENT 'baz replaces new_col2');
#drop
DROP TABLE pokes;
DML
详细参考DML
LOCAL
代表着文件在本地磁盘上,如果没有LOCAL
那么代表着文件在HDFS上
OVERWRITE
代表如果记录已存在那么覆盖;如果没有这个关键字,那么追加(append)上去
#本地文件
hive> LOAD DATA LOCAL INPATH './examples/files/kv1.txt' OVERWRITE INTO TABLE pokes;
#两个partition
hive> LOAD DATA LOCAL INPATH './examples/files/kv2.txt' OVERWRITE INTO TABLE invites PARTITION (ds='2008-08-15');
hive> LOAD DATA LOCAL INPATH './examples/files/kv3.txt' OVERWRITE INTO TABLE invites PARTITION (ds='2008-08-08');
#HDFS
hive> LOAD DATA INPATH '/user/myname/kv2.txt' OVERWRITE INTO TABLE invites PARTITION (ds='2008-08-15');
SQL
SELECT
hive> SELECT a.foo FROM invites a WHERE a.ds='2008-08-15';
#HDFS
hive> INSERT OVERWRITE DIRECTORY '/tmp/hdfs_out' SELECT a.* FROM invites a WHERE a.ds='2008-08-15';
#本地
hive> INSERT OVERWRITE LOCAL DIRECTORY '/tmp/local_out' SELECT a.* FROM pokes a;
hive> INSERT OVERWRITE TABLE events SELECT a.* FROM profiles a;
hive> INSERT OVERWRITE TABLE events SELECT a.* FROM profiles a WHERE a.key < 100;
hive> INSERT OVERWRITE LOCAL DIRECTORY '/tmp/reg_3' SELECT a.* FROM events a;
hive> INSERT OVERWRITE DIRECTORY '/tmp/reg_4' select a.invites, a.pokes FROM profiles a;
hive> INSERT OVERWRITE DIRECTORY '/tmp/reg_5' SELECT COUNT(*) FROM invites a WHERE a.ds='2008-08-15';
hive> INSERT OVERWRITE DIRECTORY '/tmp/reg_5' SELECT a.foo, a.bar FROM invites a;
hive> INSERT OVERWRITE LOCAL DIRECTORY '/tmp/sum' SELECT SUM(a.pc) FROM pc1 a;
Group BY
#注意两种查询方式
hive> FROM invites a INSERT OVERWRITE TABLE events SELECT a.bar, count(*) WHERE a.foo > 0 GROUP BY a.bar;
hive> INSERT OVERWRITE TABLE events SELECT a.bar, count(*) FROM invites a WHERE a.foo > 0 GROUP BY a.bar;
join
hive> FROM pokes t1 JOIN invites t2 ON (t1.bar = t2.bar) INSERT OVERWRITE TABLE events SELECT t1.bar, t1.foo, t2.foo;
MULTITABLE INSERT
FROM src
INSERT OVERWRITE TABLE dest1 SELECT src.* WHERE src.key < 100
INSERT OVERWRITE TABLE dest2 SELECT src.key, src.value WHERE src.key >= 100 and src.key < 200
INSERT OVERWRITE TABLE dest3 PARTITION(ds='2008-04-08', hr='12') SELECT src.key WHERE src.key >= 200 and src.key < 300
INSERT OVERWRITE LOCAL DIRECTORY '/tmp/dest4.out' SELECT src.value WHERE src.key >= 300;
STREAMING
hive> FROM invites a INSERT OVERWRITE TABLE events SELECT TRANSFORM(a.foo, a.bar) AS (oof, rab) USING '/bin/cat' WHERE a.ds > '2008-08-09';
网友评论