Versions
hive 2.3.7
spark 2.4.5
hadoop 2.7.7
Add environment variables
export HADOOP_HOME=/opt/hadoop
export HADOOP_CONF_DIR=$HADOOP_HOME/etc/hadoop
export SPARK_HOME=/opt/spark
export HIVE_HOME=/opt/hive
export PATH=${HADOOP_HOME}/bin:${HADOOP_HOME}/sbin:${HIVE_HOME}/bin:$PATH
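After adding the exports (for example to /etc/profile or ~/.bashrc), a minimal sanity check, assuming Hadoop is already unpacked under /opt/hadoop:
# reload the profile and confirm the variables resolve
source /etc/profile
echo $HADOOP_HOME $SPARK_HOME $HIVE_HOME
hadoop version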
Build spark-without-hive
Prepare the build environment
- java 1.8
- mvn 3.6.3
Add a Maven mirror
vi $MAVEN_HOME/conf/settings.xml
<mirrors>
<mirror>
<id>nexus-aliyun</id>
<mirrorOf>*</mirrorOf>
<name>Nexus aliyun</name>
<url>http://maven.aliyun.com/nexus/content/groups/public</url>
</mirror>
</mirrors>
Download the Spark source
git clone https://gitclone.com/github.com/apache/spark.git
cd spark
git checkout v2.4.5
Build
./dev/make-distribution.sh --name hadoop2.7-without-hive --tgz -Pyarn -Phadoop-2.7 -Dhadoop.version=2.7.7 -Pparquet-provided -Porc-provided -DskipTests
Extract
mv spark-2.4.5-bin-hadoop2.7-without-hive.tgz /opt
cd /opt
tar -xzf spark-2.4.5-bin-hadoop2.7-without-hive.tgz
ln -s spark-2.4.5-bin-hadoop2.7-without-hive spark
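A quick check that the build really left the Hive classes out; for a correct -without-hive distribution the grep below should print nothing:
# the without-hive distribution should contain no hive-*.jar
ls $SPARK_HOME/jars | grep -i hive
# (expect no output)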
Add configuration (for a standalone cluster test, optional)
vi $SPARK_HOME/conf/spark-env.sh
chmod +x $SPARK_HOME/conf/spark-env.sh
#!/usr/bin/env bash
export SPARK_MASTER_HOST=__host__
export SPARK_WORKER_CORES=2
export SPARK_WORKER_MEMORY=4g
export SPARK_WORKER_INSTANCES=1
export SPARK_EXECUTOR_MEMORY=4g
export SPARK_DRIVER_MEMORY=4g
Test
$SPARK_HOME/sbin/start-all.sh
$SPARK_HOME/bin/spark-submit \
--class org.apache.spark.examples.SparkPi \
--master spark://__master_host__:7077 \
$SPARK_HOME/examples/jars/spark-examples*.jar \
10
$SPARK_HOME/sbin/stop-all.sh
Install HDFS
Omitted here.
Configure YARN
vi $HADOOP_HOME/etc/hadoop/yarn-site.xml
<configuration>
<property>
<name>yarn.resourcemanager.scheduler.class</name>
<value>org.apache.hadoop.yarn.server.resourcemanager.scheduler.fair.FairScheduler</value>
</property>
<property>
<name>yarn.scheduler.fair.preemption</name>
<value>true</value>
</property>
<property>
<name>yarn.resourcemanager.hostname</name>
<value>__host__</value>
</property>
<property>
<name>yarn.nodemanager.aux-services</name>
<value>spark_shuffle,mapreduce_shuffle</value>
</property>
<property>
<name>yarn.nodemanager.aux-services.spark_shuffle.class</name>
<value>org.apache.spark.network.yarn.YarnShuffleService</value>
</property>
<property>
<name>yarn.nodemanager.pmem-check-enabled</name>
<value>false</value>
</property>
<property>
<name>yarn.nodemanager.vmem-check-enabled</name>
<value>false</value>
</property>
<property>
<name>yarn.nodemanager.resource.memory-mb</name>
<value>4096</value>
</property>
<property>
<name>yarn.nodemanager.resource.cpu-vcores</name>
<value>2</value>
</property>
</configuration>
Configure fair-scheduler.xml
vi $HADOOP_HOME/etc/hadoop/fair-scheduler.xml
<?xml version="1.0"?>
<allocations>
<queue name="sample_queue">
<minResources>4000 mb,2vcores</minResources>
<maxResources>64000 mb,32vcores</maxResources>
<maxRunningApps>50</maxRunningApps>
<weight>2.0</weight>
<schedulingPolicy>fair</schedulingPolicy>
</queue>
<defaultFairSharePreemptionTimeout>600</defaultFairSharePreemptionTimeout>
<defaultMinSharePreemptionTimeout>60</defaultMinSharePreemptionTimeout>
<queueMaxAMShareDefault>0.5</queueMaxAMShareDefault>
<queuePlacementPolicy>
<rule name="specified" />
<rule name="primaryGroup" create="false" />
<rule name="nestedUserQueue">
<rule name="secondaryGroupExistingQueue" create="false" />
</rule>
<rule name="default" queue="sample_queue"/>
</queuePlacementPolicy>
</allocations>
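The FairScheduler reads fair-scheduler.xml from the Hadoop configuration directory by default; if you keep the allocation file somewhere else, point yarn-site.xml at it explicitly (the path below is an assumption about your layout):
<property>
<name>yarn.scheduler.fair.allocation.file</name>
<value>/opt/hadoop/etc/hadoop/fair-scheduler.xml</value>
</property>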
Change the YARN resource calculator so cluster resources are accounted for by both memory and vcores (optional)
vi $HADOOP_HOME/etc/hadoop/yarn-site.xml
<property>
<name>yarn.scheduler.capacity.resource-calculator</name>
<value>org.apache.hadoop.yarn.util.resource.DominantResourceCalculator</value>
</property>
Add the Spark shuffle service jar
cp $SPARK_HOME/yarn/spark-2.4.5-yarn-shuffle.jar $HADOOP_HOME/share/hadoop/yarn/lib
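The external shuffle service jar has to be on the classpath of every NodeManager, and the NodeManagers need a restart to load the aux-service. A sketch assuming passwordless SSH and the standard Hadoop 2.7 slaves file:
# copy the shuffle jar to every NodeManager host
for host in $(cat $HADOOP_HOME/etc/hadoop/slaves); do
  scp $SPARK_HOME/yarn/spark-2.4.5-yarn-shuffle.jar \
      $host:$HADOOP_HOME/share/hadoop/yarn/lib/
done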
Start YARN
$HADOOP_HOME/sbin/start-yarn.sh
Test
$SPARK_HOME/bin/spark-submit \
--class org.apache.spark.examples.SparkPi \
--master yarn \
$SPARK_HOME/examples/jars/spark-examples*.jar \
10
Install Hive
Download Hive
wget http://mirror.bit.edu.cn/apache/hive/hive-2.3.7/apache-hive-2.3.7-bin.tar.gz
mv apache-hive-2.3.7-bin.tar.gz /opt
cd /opt
tar -xzf apache-hive-2.3.7-bin.tar.gz
ln -s apache-hive-2.3.7-bin hive
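Because hive-site.xml below points the metastore at MySQL, the MySQL JDBC driver has to be on Hive's classpath, and the metastore schema must exist. A sketch, assuming a locally downloaded connector jar (the exact jar name is an assumption); since datanucleus.schema.autoCreateAll is set to true below, running schematool up front is optional:
# put the MySQL JDBC driver on Hive's classpath (jar name/version is an assumption)
cp mysql-connector-java-5.1.49.jar $HIVE_HOME/lib/
# optionally initialize the metastore schema (run after hive-site.xml below is in place)
$HIVE_HOME/bin/schematool -dbType mysql -initSchema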
Add the Spark jars to Hive's classpath
vi $HIVE_HOME/bin/hive
# Find the line that adds ${HIVE_LIB}/*.jar (around line 140) and insert the loop below it
for f in ${SPARK_HOME}/jars/*.jar; do
CLASSPATH=${CLASSPATH}:$f;
done
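As an alternative to editing bin/hive, the Hive on Spark guide suggests linking only the Spark jars Hive actually needs into $HIVE_HOME/lib; a sketch, with version globs assumed to match the 2.4.5 distribution:
# link just the required Spark jars instead of the whole jars/ directory
ln -s $SPARK_HOME/jars/scala-library-*.jar $HIVE_HOME/lib/
ln -s $SPARK_HOME/jars/spark-core_*.jar $HIVE_HOME/lib/
ln -s $SPARK_HOME/jars/spark-network-common_*.jar $HIVE_HOME/lib/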
Configure hive-site.xml
vi $HIVE_HOME/conf/hive-site.xml
<?xml version="1.0"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
<configuration>
<property>
<name>mapreduce.framework.name</name>
<value>yarn</value>
</property>
<property>
<name>hive.metastore.schema.verification</name>
<value>false</value>
</property>
<property>
<name>datanucleus.schema.autoCreateTables</name>
<value>true</value>
</property>
<property>
<name>datanucleus.schema.autoCreateAll</name>
<value>true</value>
<description>creates necessary schema on a startup if one doesn't exist. set this to false, after creating it once</description>
</property>
<!-- Use MySQL for the Hive metastore (optional) -->
<property>
<name>javax.jdo.option.ConnectionURL</name>
<value>jdbc:mysql://__host__:__port__/hive_metastore?createDatabaseIfNotExist=true&amp;useSSL=false</value>
<description>metadata is stored in a MySQL server</description>
</property>
<property>
<name>javax.jdo.option.ConnectionDriverName</name>
<value>com.mysql.jdbc.Driver</value>
<description>MySQL JDBC driver class</description>
</property>
<property>
<name>javax.jdo.option.ConnectionUserName</name>
<value>__username__</value>
</property>
<property>
<name>javax.jdo.option.ConnectionPassword</name>
<value>__password__</value>
</property>
<!-- Hive tuning options (optional) -->
<property>
<name>hive.merge.sparkfiles</name>
<value>true</value>
<description>Merge small files at the end of a Spark DAG transformation</description>
</property>
<property>
<name>hive.exec.dynamic.partition</name>
<value>true</value>
<description>Enable dynamic partitioning for Hive INSERTs</description>
</property>
<property>
<name>hive.exec.dynamic.partition.mode</name>
<value>nonstrict</value>
<description>Dynamic-partition mode for Hive INSERTs; nonstrict allows all partition columns to be dynamic</description>
</property>
<property>
<name>hive.exec.max.dynamic.partitions</name>
<value>10000</value>
<description>Maximum number of dynamic partitions allowed overall</description>
</property>
<property>
<name>hive.exec.max.dynamic.partitions.pernode</name>
<value>10000</value>
<description>Maximum number of dynamic partitions allowed per node</description>
</property>
<property>
<name>hive.log.explain.output</name>
<value>true</value>
<description>Log the EXPLAIN output for every query</description>
</property>
<property>
<name>hive.explain.user</name>
<value>true</value>
<description>Show EXPLAIN output at the user level</description>
</property>
<property>
<name>hive.spark.explain.user</name>
<value>true</value>
<description>Show EXPLAIN output at the user level when running on Spark</description>
</property>
<!-- Required setting for Hive on Spark -->
<property>
<name>hive.execution.engine</name>
<value>spark</value>
<description>Set Hive's execution engine to Spark</description>
</property>
<!-- Spark-related settings -->
<property>
<name>spark.master</name>
<value>yarn</value>
<description>Run Spark on YARN; for standalone mode, set this to the master URL instead</description>
</property>
<property>
<name>spark.serializer</name>
<value>org.apache.spark.serializer.KryoSerializer</value>
<description>Spark serializer class</description>
</property>
<property>
<name>spark.executor.extraClassPath</name>
<value>hdfs://__hdfs_host__:9000/hive-lib/*</value>
<description>Hive jars that Spark executors need on their classpath</description>
</property>
<property>
<name>spark.yarn.jars</name>
<value>hdfs://__hdfs_host__:9000/spark-jars/*</value>
<description>Location of the Spark jars on HDFS</description>
</property>
<!-- Spark executor resources -->
<property>
<name>spark.executor.memory</name>
<value>4g</value>
</property>
<property>
<name>spark.yarn.executor.memoryOverhead</name>
<value>1g</value>
</property>
<property>
<name>spark.driver.memory</name>
<value>4g</value>
</property>
<property>
<name>spark.yarn.driver.memoryOverhead</name>
<value>1g</value>
</property>
<property>
<name>spark.executor.cores</name>
<value>2</value>
</property>
<!-- Spark tuning options (optional) -->
<property>
<name>hive.spark.client.connect.timeout</name>
<value>10000ms</value>
</property>
<property>
<name>spark.eventLog.enabled</name>
<value>true</value>
<description>Enable Spark event logging</description>
</property>
<property>
<name>spark.eventLog.dir</name>
<value>hdfs://__hdfs_host__:9000/spark-eventlog</value>
</property>
<!-- Spark executor dynamic allocation -->
<property>
<name>spark.shuffle.service.enabled</name>
<value>true</value>
</property>
<property>
<name>spark.dynamicAllocation.enabled</name>
<value>true</value>
</property>
<property>
<name>spark.dynamicAllocation.minExecutors</name>
<value>0</value>
</property>
<property>
<name>spark.dynamicAllocation.maxExecutors</name>
<value>16</value>
</property>
<property>
<name>spark.dynamicAllocation.initialExecutors</name>
<value>2</value>
</property>
<property>
<name>spark.dynamicAllocation.executorIdleTimeout</name>
<value>60s</value>
</property>
<property>
<name>spark.dynamicAllocation.schedulerBacklogTimeout</name>
<value>1s</value>
</property>
</configuration>
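The hive-site.xml above references several HDFS paths (spark.yarn.jars, spark.executor.extraClassPath, spark.eventLog.dir) that nothing has created yet. A sketch of populating them, assuming HDFS is running at the __hdfs_host__:9000 placeholder used above:
# create the HDFS directories referenced in hive-site.xml
hdfs dfs -mkdir -p /spark-jars /hive-lib /spark-eventlog
# upload the Spark jars used by spark.yarn.jars
hdfs dfs -put $SPARK_HOME/jars/* /spark-jars/
# upload the Hive jars used by spark.executor.extraClassPath
hdfs dfs -put $HIVE_HOME/lib/* /hive-lib/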
Start Hive
$HIVE_HOME/bin/hive --service metastore &
$HIVE_HOME/bin/hive --service hiveserver2 &
$HIVE_HOME/bin/beeline -n op -u jdbc:hive2://localhost:10000
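A quick smoke test that queries actually run on the Spark engine; the table name below is a hypothetical example:
# check the engine and trigger a small Spark job via HiveServer2
$HIVE_HOME/bin/beeline -n op -u jdbc:hive2://localhost:10000 -e "
set hive.execution.engine;
create table if not exists smoke_test(id int);
insert into smoke_test values (1);
select count(*) from smoke_test;"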