1. Download and install Hive
https://mirrors.tuna.tsinghua.edu.cn/apache/hive/hive-3.1.1/
Unpack the archive:
tar -zxvf apache-hive-3.1.1-bin.tar.gz
2. Start the Hadoop daemons
Hive stores its data on HDFS, so the Hadoop services must be running first:
$HADOOP_HOME/sbin/start-all.sh
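To confirm the daemons came up, jps should list the HDFS and YARN processes (the exact set depends on your deployment):
jps # expect NameNode, DataNode, SecondaryNameNode, ResourceManager, NodeManager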
3. Configure Hive with MySQL as the metastore
Set the environment variables:
vim /etc/profile
export HIVE_HOME=/home/hadoop/apache-hive-3.1.1-bin
export PATH=$HIVE_HOME/bin:$PATH
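Reload the profile so the new variables take effect in the current shell, then verify the Hive binary resolves:
source /etc/profile
hive --version # should report 3.1.1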
Configure MySQL:
cd $HIVE_HOME/scripts/metastore/upgrade/mysql # enter the SQL script directory so the schema script can be sourced later
service mysqld start
mysql -u root -p # log in to the MySQL server as root
>create user 'hive'@'localhost' identified by 'password'; # create the hive user
>create database hive; # create the metastore database
>grant all on hive.* to 'hive'@'localhost'; # grant privileges on the metastore database
>flush privileges; # make the privileges take effect
>use hive; # switch to the metastore database
>source hive-schema-3.1.0.mysql.sql # initialize the metastore table structure
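If the script ran without errors, the metastore tables now exist; a quick check from the same mysql session:
>show tables; # should list metastore tables such as DBS, TBLS, SDS, COLUMNS_V2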
Configure Hive:
Since MySQL backs the Hive metastore, the JDBC connector library is required. Taking CentOS 7 as an example, download the JDBC jar:
wget https://dev.mysql.com/downloads/file/?id=485761 # download page
yum install mysql-connector-java-8.0.16-1.el7.noarch.rpm # install the JDBC connector
ln -s /usr/share/java/mysql-connector-java.jar $HIVE_HOME/lib/mysql-connector-java.jar # symlink the connector into Hive's lib directory
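A quick sanity check that the symlink resolves (the path assumes the rpm installed the jar under /usr/share/java):
ls -l $HIVE_HOME/lib/mysql-connector-java.jar # should point at /usr/share/java/mysql-connector-java.jar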
vim $HIVE_HOME/conf/hive-site.xml
<configuration>
<property>
<name>javax.jdo.option.ConnectionURL</name>
<value>jdbc:mysql://localhost/hive?createDatabaseIfNotExist=true</value>
<description>metadata is stored in a MySQL server</description>
</property>
<property>
<name>javax.jdo.option.ConnectionDriverName</name>
<value>com.mysql.cj.jdbc.Driver</value>
<description>MySQL JDBC driver class</description>
</property>
<property>
<name>javax.jdo.option.ConnectionUserName</name>
<value>hive</value>
<description>user name for connecting to mysql server</description>
</property>
<property>
<name>javax.jdo.option.ConnectionPassword</name>
<value>$hive_password</value>
<description>password for connecting to mysql server</description>
</property>
<property>
<name>datanucleus.autoCreateSchema</name>
<value>true</value>
</property>
<property>
<name>datanucleus.fixedDatastore</name>
<value>true</value>
</property>
</configuration>
The XML above supplies the MySQL connection details; the three values to adapt to your own setup are the database name, the account, and the password.
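As an alternative to sourcing the schema script by hand, Hive 3 ships a schematool that reads hive-site.xml; a minimal sketch (use only one of the two initialization paths):
$HIVE_HOME/bin/schematool -dbType mysql -initSchema # initialize the schema (skip if you already sourced the script manually)
$HIVE_HOME/bin/schematool -dbType mysql -info # print and validate the schema version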
Start Hive and create an external table:
hdfs dfs -mkdir -p /dbtaobao/dataset/user_log # create the table directory
hdfs dfs -put ~/tmp.log /dbtaobao/dataset/user_log # upload the CSV file as Hive's external data source
[hadoop@hadoop1 mysql]$ head -n 10 ~/tmp.log
328862,323294,833,2882,2661,08,29,0,0,1,内蒙古
328862,844400,1271,2882,2661,08,29,0,1,1,山西
328862,575153,1271,2882,2661,08,29,0,2,1,山西
328862,996875,1271,2882,2661,08,29,0,1,1,内蒙古
328862,1086186,1271,1253,1049,08,29,0,0,2,浙江
328862,623866,1271,2882,2661,08,29,0,0,2,黑龙江
328862,542871,1467,2882,2661,08,29,0,5,2,四川
328862,536347,1095,883,1647,08,29,0,7,1,吉林
328862,364513,1271,2882,2661,08,29,0,1,2,贵州
328862,575153,1271,2882,2661,08,29,0,0,0,陕西
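The DDL below creates the table inside a dbtaobao database, so create that database first from the Hive CLI:
hive # start the Hive CLI
hive> create database if not exists dbtaobao; -- the table below lives in this database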
Create the Hive table:
-- create table DDL
create external table dbtaobao.user_log(
    user_id INT, item_id INT, cat_id INT, merchant_id INT, brand_id INT,
    month STRING, day STRING, action INT, age_range INT, gender INT,
    province STRING)
comment "first test table"
row format delimited fields terminated by ','
stored as textfile
location '/dbtaobao/dataset/user_log';
The user_log DDL above looks a lot like MySQL's; a few clauses are worth calling out:
- row format delimited tells Hive to parse each line of the CSV file as one table row
- fields terminated by ',' sets the field delimiter within a row
- stored as textfile keeps the data as plain text; SequenceFile is another storage format
- location specifies the storage path in HDFS
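With the table in place, Hive queries the CSV directly from its HDFS location; for example, against the sample rows shown above:
hive> select * from dbtaobao.user_log limit 3; -- spot-check a few rows
hive> select province, count(*) from dbtaobao.user_log group by province; -- aggregate over the external CSV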