一、环境准备
1、版本选择
HBase 2.0目前不能与Hadoop 3.0搭配使用:官方不推荐,在官方兼容性列表中Hadoop 3.0仍是NT(未测试)状态,而Hadoop 2.8.3是S(已支持)状态。
2、机器配置
配置固定ip(根据主机设置)
/etc/network/interfaces.d
#auto eth0
#iface eth0 inet dhcp
auto eth0
iface eth0 inet static
address 192.168.1.141
gateway 192.168.1.1
netmask 255.255.255.0
配置dns
/etc/resolvconf/resolv.conf.d
nameserver 119.29.29.29
nameserver 182.254.116.116
配置Host
/etc/hosts
#主机信息
192.168.1.141 hadoop01
#添加节点的信息
192.168.1.142 hadoop02
192.168.1.143 hadoop03
配置Hostname
/etc/hostname(根据主机设置)
hadoop01
修改最大线程数(略)
设置时区
sudo tzselect
sudo cp /usr/share/zoneinfo/Asia/Shanghai /etc/localtime
3、jdk和ntp(各主机都需要)
jdk1.8
sudo add-apt-repository ppa:webupd8team/java
sudo apt-get update
sudo apt-get install oracle-java8-installer
root@hadoop01:~# java -version
java version "1.8.0_171"
Java(TM) SE Runtime Environment (build 1.8.0_171-b11)
Java HotSpot(TM) 64-Bit Server VM (build 25.171-b11, mixed mode)
写入path到/etc/profile
export JAVA_HOME=/usr/lib/jvm/java-8-oracle
export JRE_HOME=$JAVA_HOME/jre
export CLASSPATH=.:${JAVA_HOME}/lib:${JRE_HOME}/lib
export PATH=${JAVA_HOME}/bin:$PATH
立即生效
source /etc/profile
ntp
sudo apt-get install ntp
service ntp start
二、配置各主机ssh免登陆
1、生成密钥(主机全部执行一遍)
ssh-keygen -t rsa
root@hadoop01:~# ssh-keygen -t rsa
Generating public/private rsa key pair.
Enter file in which to save the key (/root/.ssh/id_rsa):
Created directory '/root/.ssh'.
Enter passphrase (empty for no passphrase):
Enter same passphrase again:
Your identification has been saved in /root/.ssh/id_rsa.
Your public key has been saved in /root/.ssh/id_rsa.pub.
The key fingerprint is:
SHA256:ZZhKxPH5mFe1jglHPW24sSu/Ovb2yaTYWPqZGezPgps root@hadoop01
The key's randomart image is:
+---[RSA 2048]----+
| .o. ...o |
| ... + . .=.o|
| . = + o .* |
| . . * + +o |
| . S o o .. |
| . o . |
| .* . |
| .@.% .|
| E+&=B.|
+----[SHA256]-----+
2、指向互信主机
cd /root/.ssh
cat id_rsa.pub >>authorized_keys
scp ~/.ssh/authorized_keys hadoop01:/root/.ssh/authorized_keys
scp ~/.ssh/authorized_keys hadoop02:/root/.ssh/authorized_keys
scp ~/.ssh/authorized_keys hadoop03:/root/.ssh/authorized_keys
3、ssh测试
root@hadoop01:~/.ssh# ssh hadoop01
Welcome to Ubuntu 16.04.1 LTS (GNU/Linux 3.10.65 aarch64)
* Documentation: https://help.ubuntu.com
* Management: https://landscape.canonical.com
* Support: https://ubuntu.com/advantage
Last login: Sun May 13 10:12:08 2018 from 192.168.1.107
root@hadoop01:~# exit
logout
Connection to hadoop01 closed.
root@hadoop01:~/.ssh# ssh hadoop02
Welcome to Ubuntu 16.04.1 LTS (GNU/Linux 3.10.65 aarch64)
* Documentation: https://help.ubuntu.com
* Management: https://landscape.canonical.com
* Support: https://ubuntu.com/advantage
Last login: Sun May 13 10:26:55 2018 from 192.168.1.107
root@hadoop02:~# exit
logout
Connection to hadoop02 closed.
root@hadoop01:~/.ssh# ssh hadoop03
Welcome to Ubuntu 16.04.1 LTS (GNU/Linux 3.10.65 aarch64)
* Documentation: https://help.ubuntu.com
* Management: https://landscape.canonical.com
* Support: https://ubuntu.com/advantage
Last login: Sun May 13 10:24:10 2018 from 192.168.1.107
root@hadoop03:~# exit
logout
Connection to hadoop03 closed.
root@hadoop01:~/.ssh#
三、搭建Zookeeper
http://zookeeper.apache.org/
1、下载
wget http://mirrors.tuna.tsinghua.edu.cn/apache/zookeeper/zookeeper-3.4.12/zookeeper-3.4.12.tar.gz
tar zxvf zookeeper-3.4.12.tar.gz
2、部署
复制配置文件
mkdir /home/zookeeper-3.4.12/data
mkdir -p /home/zookeeper-3.4.12/datalog
cd /home/zookeeper-3.4.12/conf
cp zoo_sample.cfg zoo.cfg
zoo.cfg内容
# The number of milliseconds of each tick
tickTime=2000
# The number of ticks that the initial
# synchronization phase can take
initLimit=10
# The number of ticks that can pass between
# sending a request and getting an acknowledgement
syncLimit=5
# the directory where the snapshot is stored.
# do not use /tmp for storage, /tmp here is just
# example sakes.
dataDir=/home/zookeeper-3.4.12/data
dataLogDir=/home/zookeeper-3.4.12/datalog
# the port at which the clients will connect
clientPort=2181
# the maximum number of client connections.
# increase this if you need to handle more clients
#maxClientCnxns=60
#
# Be sure to read the maintenance section of the
# administrator guide before turning on autopurge.
#
# http://zookeeper.apache.org/doc/current/zookeeperAdmin.html#sc_maintenance
#
# The number of snapshots to retain in dataDir
#autopurge.snapRetainCount=3
# Purge task interval in hours
# Set to "0" to disable auto purge feature
#autopurge.purgeInterval=1
server.1=hadoop01:2888:3888
server.2=hadoop02:2888:3888
server.3=hadoop03:2888:3888
在zookeeper的data目录(/home/zookeeper-3.4.12/data)下创建myid文件:hadoop01的内容为1,hadoop02为2,hadoop03为3,与zoo.cfg中server.N的编号一一对应;(复制到其他主机后记得修改)
复制到slave主机
scp -r /home/zookeeper-3.4.12 hadoop02:/home/
scp -r /home/zookeeper-3.4.12 hadoop03:/home/
各主机/etc/profile
export ZOOKEEPER_HOME=/home/zookeeper-3.4.12
export PATH=$PATH:$ZOOKEEPER_HOME/bin:$ZOOKEEPER_HOME/conf
记得
source /etc/profile
3、启动
各主机启动
zkServer.sh start
root@hadoop01:/home# zkServer.sh start
ZooKeeper JMX enabled by default
Using config: /home/zookeeper-3.4.12/bin/../conf/zoo.cfg
Starting zookeeper ... STARTED
4、常用命令
启动
zkServer.sh start
停止
zkServer.sh stop
状态
zkServer.sh status
5、验证
leader主机由zookeeper自动选举产生,本次选出03为leader。
root@hadoop03:/# zkServer.sh status
ZooKeeper JMX enabled by default
Using config: /home/zookeeper-3.4.12/bin/../conf/zoo.cfg
Mode: leader
root@hadoop02:/# zkServer.sh status
ZooKeeper JMX enabled by default
Using config: /home/zookeeper-3.4.12/bin/../conf/zoo.cfg
Mode: follower
root@hadoop01:~# zkServer.sh status
ZooKeeper JMX enabled by default
Using config: /home/zookeeper-3.4.12/bin/../conf/zoo.cfg
Mode: follower
正常运行。
四、搭建Hadoop
http://hadoop.apache.org/
1、下载
下载二进制(bin)版本,src版本还需要自己编译。
wget http://mirrors.hust.edu.cn/apache/hadoop/common/hadoop-2.8.3/hadoop-2.8.3.tar.gz
tar zxvf hadoop-2.8.3.tar.gz
2、配置
在各主机上建立相关目录
mkdir /home/data
mkdir /home/data/journal
mkdir /home/data/tmp
mkdir /home/data/hdfs
mkdir /home/data/hdfs/data
mkdir /home/data/hdfs/name
配置core-site.xml
<!-- 指定hdfs的nameservice为ns -->
<property>
<name>fs.defaultFS</name>
<value>hdfs://ns</value>
</property>
<!--指定hadoop数据临时存放目录-->
<property>
<name>hadoop.tmp.dir</name>
<value>/home/data/tmp</value>
</property>
<property>
<name>io.file.buffer.size</name>
<value>4096</value>
</property>
<!--指定zookeeper地址-->
<property>
<name>ha.zookeeper.quorum</name>
<value>hadoop01:2181,hadoop02:2181,hadoop03:2181</value>
</property>
配置hdfs-site.xml
<configuration>
<!--指定hdfs的nameservice为ns,需要和core-site.xml中的保持一致 -->
<property>
<name>dfs.nameservices</name>
<value>ns</value>
</property>
<!-- ns下面有两个NameNode,分别是nn1,nn2 -->
<property>
<name>dfs.ha.namenodes.ns</name>
<value>nn1,nn2</value>
</property>
<!-- nn1的RPC通信地址 -->
<property>
<name>dfs.namenode.rpc-address.ns.nn1</name>
<value>hadoop01:9000</value>
</property>
<!-- nn1的http通信地址 -->
<property>
<name>dfs.namenode.http-address.ns.nn1</name>
<value>hadoop01:50070</value>
</property>
<!-- nn2的RPC通信地址 -->
<property>
<name>dfs.namenode.rpc-address.ns.nn2</name>
<value>hadoop02:9000</value>
</property>
<!-- nn2的http通信地址 -->
<property>
<name>dfs.namenode.http-address.ns.nn2</name>
<value>hadoop02:50070</value>
</property>
<!-- 指定NameNode的元数据在JournalNode上的存放位置 -->
<property>
<name>dfs.namenode.shared.edits.dir</name>
<value>qjournal://hadoop01;hadoop02;hadoop03/ns</value>
</property>
<!-- 指定JournalNode在本地磁盘存放数据的位置 -->
<property>
<name>dfs.journalnode.edits.dir</name>
<value>/home/data/journal</value>
</property>
<!-- 开启NameNode故障时自动切换 -->
<property>
<name>dfs.ha.automatic-failover.enabled</name>
<value>true</value>
</property>
<!-- 配置失败自动切换实现方式 -->
<property>
<name>dfs.client.failover.proxy.provider.ns</name>
<value>org.apache.hadoop.hdfs.server.namenode.ha.ConfiguredFailoverProxyProvider</value>
</property>
<!-- 配置隔离机制,如果ssh是默认22端口,value直接写sshfence即可(hadoop:22022) -->
<property>
<name>dfs.ha.fencing.methods</name>
<!-- <value>sshfence</value> -->
<value>
sshfence
shell(/bin/true)
</value>
</property>
<!-- 使用隔离机制时需要ssh免登陆 -->
<property>
<name>dfs.ha.fencing.ssh.private-key-files</name>
<value>/root/.ssh/id_rsa</value>
</property>
<property>
<name>dfs.namenode.name.dir</name>
<value>file:/home/data/hdfs/name</value>
</property>
<property>
<name>dfs.datanode.data.dir</name>
<value>file:/home/data/hdfs/data</value>
</property>
<property>
<name>dfs.replication</name>
<value>2</value>
</property>
<!-- 在NN和DN上开启WebHDFS (REST API)功能,不是必须 -->
<property>
<name>dfs.webhdfs.enabled</name>
<value>true</value>
</property>
</configuration>
配置mapred-site.xml
下面的内存相关配置,是因为主机只有1G内存;内存充足的话可以不用配置。
<configuration>
<property>
<name>mapreduce.framework.name</name>
<value>yarn</value>
</property>
<property>
<name>mapreduce.application.classpath</name>
<value>
/home/hadoop-2.8.3/etc/hadoop,
/home/hadoop-2.8.3/share/hadoop/common/*,
/home/hadoop-2.8.3/share/hadoop/common/lib/*,
/home/hadoop-2.8.3/share/hadoop/hdfs/*,
/home/hadoop-2.8.3/share/hadoop/hdfs/lib/*,
/home/hadoop-2.8.3/share/hadoop/mapreduce/*,
/home/hadoop-2.8.3/share/hadoop/mapreduce/lib/*,
/home/hadoop-2.8.3/share/hadoop/yarn/*,
/home/hadoop-2.8.3/share/hadoop/yarn/lib/*
</value>
</property>
<property>
<name>mapreduce.map.memory.mb</name>
<value>512</value>
</property>
<property>
<name>mapreduce.map.java.opts</name>
<value>-Xmx512M</value>
</property>
<property>
<name>mapreduce.reduce.memory.mb</name>
<value>512</value>
</property>
<property>
<name>mapreduce.reduce.java.opts</name>
<value>-Xmx256M</value>
</property>
</configuration>
配置yarn-site.xml
<configuration>
<property>
<name>yarn.resourcemanager.hostname</name>
<value>hadoop01</value>
</property>
<property>
<name>yarn.nodemanager.aux-services</name>
<value>mapreduce_shuffle</value>
</property>
<property>
<description>The address of the RM web application.</description>
<name>yarn.resourcemanager.webapp.address</name>
<value>hadoop01:18008</value>
</property>
</configuration>
配置slaves
hadoop01
hadoop02
hadoop03
配置hadoop-env.sh
export HADOOP_OPTS="$HADOOP_OPTS -Duser.timezone=GMT+08"
配置yarn-env.sh
YARN_OPTS="$YARN_OPTS -Duser.timezone=GMT+08"
配置path
/etc/profile
export HADOOP_HOME=/home/hadoop-2.8.3
export PATH=$PATH:$HADOOP_HOME/bin:$HADOOP_HOME/sbin
source /etc/profile
3、配置slave
复制到slave
cd /home
scp -r hadoop-2.8.3 hadoop02:/home/
scp -r hadoop-2.8.3 hadoop03:/home/
4、首次启动
1、首先启动各个节点的Zookeeper,在各个节点上执行以下命令:
zkServer.sh start
2、在某一个namenode节点执行如下命令,创建命名空间
hdfs zkfc -formatZK
3、在每个journalnode节点用如下命令启动journalnode
hadoop-daemon.sh start journalnode
4、在主namenode节点格式化namenode和journalnode目录
hdfs namenode -format ns
5、在主namenode节点启动namenode进程
hadoop-daemon.sh start namenode
6、在备namenode节点执行第一行命令,这个是把备namenode节点的目录格式化并把元数据从主namenode节点copy过来,并且这个命令不会把journalnode目录再格式化了!然后用第二个命令启动备namenode进程!
hdfs namenode -bootstrapStandby
hadoop-daemon.sh start namenode
7、在两个namenode节点都执行以下命令
hadoop-daemon.sh start zkfc
8、在所有datanode节点都执行以下命令启动datanode
hadoop-daemon.sh start datanode
5、常用命令
启动和停止
start-dfs.sh
start-yarn.sh
stop-yarn.sh
stop-dfs.sh
6、验证
看图示意
http://192.168.1.141:18008/cluster/nodes
http://192.168.1.142:50070/dfshealth.html#tab-overview
http://192.168.1.141:50070/dfshealth.html#tab-overview
访问正常
五、搭建Hbase
http://hbase.apache.org/
1、下载
带bin的不用编译。
wget http://mirrors.hust.edu.cn/apache/hbase/2.0.0/hbase-2.0.0-bin.tar.gz
tar -zvxf hbase-2.0.0-bin.tar.gz
2、配置
配置hbase-env.sh
export JAVA_HOME=/usr/lib/jvm/java-8-oracle
export HBASE_CLASSPATH=/home/hadoop-2.8.3/etc/hadoop
export HBASE_MANAGES_ZK=false
export TZ="Asia/Shanghai"
关闭hbase自带的zookeeper(HBASE_MANAGES_ZK=false):自带的zookeeper只能用于测试,不能用于生产环境。
classpath一定要改成hadoop的目录,不然不认识集群名称。
网上大部分教程都不是真正的分布式。
配置hbase-site.xml
<configuration>
<property>
<name>hbase.rootdir</name>
<value>hdfs://ns/hbase</value>
</property>
<!--启用分布式集群-->
<property>
<name>hbase.cluster.distributed</name>
<value>true</value>
</property>
<!--默认HMaster HTTP访问端口-->
<property>
<name>hbase.master.info.port</name>
<value>16010</value>
</property>
<!--默认HRegionServer HTTP访问端口-->
<property>
<name>hbase.regionserver.info.port</name>
<value>16030</value>
</property>
<property>
<name>hbase.zookeeper.quorum</name>
<value>hadoop01:2181,hadoop02:2181,hadoop03:2181</value>
</property>
<property>
<name>hbase.coprocessor.abortonerror</name>
<value>false</value>
</property>
</configuration>
ns是前面配置的namenode集群名称
配置regionservers
hadoop02
hadoop03
配置profile
export HBASE_HOME=/home/hbase-2.0.0
export PATH=$HBASE_HOME/bin:$PATH
source /etc/profile
3、启动
复制到slave
cd /home/
scp -r /home/hbase-2.0.0 hadoop02:/home/
scp -r /home/hbase-2.0.0 hadoop03:/home/
start-hbase.sh
root@hadoop01:/home# start-hbase.sh
SLF4J: Class path contains multiple SLF4J bindings.
SLF4J: Found binding in [jar:file:/home/hbase-2.0.0/lib/slf4j-log4j12-1.7.25.jar!/org/slf4j/impl/StaticLoggerBinder.class]
SLF4J: Found binding in [jar:file:/home/hadoop-2.8.3/share/hadoop/common/lib/slf4j-log4j12-1.7.10.jar!/org/slf4j/impl/StaticLoggerBinder.class]
SLF4J: See http://www.slf4j.org/codes.html#multiple_bindings for an explanation.
SLF4J: Actual binding is of type [org.slf4j.impl.Log4jLoggerFactory]
running master, logging to /home/hbase-2.0.0/logs/hbase-root-master-hadoop01.out
hadoop03: running regionserver, logging to /home/hbase-2.0.0/bin/../logs/hbase-root-regionserver-hadoop03.out
hadoop02: running regionserver, logging to /home/hbase-2.0.0/bin/../logs/hbase-root-regionserver-hadoop02.out
stop-hbase.sh
4、常用命令
启动(master机器)
start-hbase.sh
关闭
stop-hbase.sh
启动节点
hbase-daemon.sh start regionserver
5、验证
http://192.168.1.141:16010/master-status
http://192.168.1.142:16030/rs-status
http://192.168.1.143:16030/rs-status
六、测试系统
1、测试Namenode自动切换
在02上
root@Hadoop02:~# jps
3410 QuorumPeerMain
5636 DFSZKFailoverController
5765 NodeManager
5367 DataNode
5287 NameNode
5498 JournalNode
5979 Jps
kill namenode
root@Hadoop02:~# kill -9 5287
回到NameNode的Web页面,查看原先处于standby的NameNode是否已变成active;若已变成active,说明自动切换成功(见图)。
2、扩容量增加Datanode
3、Hadoop Wordcount Sample
4、Hbase Shell
# hbase shell
(1)创建表
create 'test','address'
(2)添加记录
put 'test','row1','address:province','zhejiang'
put 'test','row2','address:city','hangzhou'
(3)查看记录
get 'test','row1'
(4)查看表中的记录总数
count 'test'
(5)删除记录
delete 'test','row1','address'
(6)删除一张表
disable 'test'
drop 'test'
(7)查看所有记录(需在删除表之前执行)
scan 'test'
hbase(main):001:0> create 'test','address'
Created table test
Took 7.7403 seconds
=> Hbase::Table - test
hbase(main):002:0> put'test','row1','address:province','zhejiang'
Took 1.0868 seconds
hbase(main):003:0> put 'test','row2','address:city','hangzhou'
Took 0.0293 seconds
hbase(main):004:0> get 'test','row1'
COLUMN CELL
address:province timestamp=1526199666251, value=zhejiang
1 row(s)
Took 0.2447 seconds
hbase(main):005:0> count 'test'
2 row(s)
Took 0.2910 seconds
=> 2
hbase(main):006:0> scan 'test'
ROW COLUMN+CELL
row1 column=address:province, timestamp=1526199666251, value=zhejiang
row2 column=address:city, timestamp=1526199674131, value=hangzhou
2 row(s)
Took 0.0520 seconds
hbase(main):007:0>
从零开始搭建到集群可用,共耗时约10小时。
网友评论