
Hive on Spark: Installation and Deployment

Author: shaun_x | Published 2020-08-29 00:58

    Versions

    hive 2.3.7
    spark 2.4.5
    hadoop 2.7.7

    Add environment variables

    export HADOOP_HOME=/opt/hadoop
    export HADOOP_CONF_DIR=$HADOOP_HOME/etc/hadoop
    export SPARK_HOME=/opt/spark
    export HIVE_HOME=/opt/hive
    
    export PATH=${HADOOP_HOME}/bin:${HADOOP_HOME}/sbin:${HIVE_HOME}/bin:$PATH
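
    These exports can also be persisted so they survive new shell sessions; a minimal sketch, assuming a system-wide profile script at /etc/profile.d/bigdata.sh (the file name is an arbitrary choice):

    # persist the exports for new login shells (file name is an arbitrary choice)
    cat >> /etc/profile.d/bigdata.sh <<'EOF'
    export HADOOP_HOME=/opt/hadoop
    export HADOOP_CONF_DIR=$HADOOP_HOME/etc/hadoop
    export SPARK_HOME=/opt/spark
    export HIVE_HOME=/opt/hive
    export PATH=${HADOOP_HOME}/bin:${HADOOP_HOME}/sbin:${HIVE_HOME}/bin:$PATH
    EOF
    source /etc/profile.d/bigdata.sh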
    

    Build spark-without-hive

    Prepare the build environment

    • java 1.8
    • mvn 3.6.3
      Add a Maven mirror
    vi $MAVEN_HOME/conf/settings.xml
     
    <mirrors>
        <mirror>
           <id>nexus-aliyun</id>
           <mirrorOf>*</mirrorOf>
           <name>Nexus aliyun</name>
           <url>http://maven.aliyun.com/nexus/content/groups/public</url>
        </mirror>
      </mirrors>
    

    Download the Spark source

    git clone https://gitclone.com/github.com/apache/spark.git
    cd spark
    git checkout v2.4.5
    

    Build

    ./dev/make-distribution.sh --name hadoop2.7-without-hive --tgz -Pyarn -Phadoop-2.7 -Dhadoop.version=2.7.7 -Pparquet-provided -Porc-provided -DskipTests
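
    Once the build finishes, it is worth confirming that no Hive jars were bundled, which is the whole point of the "without-hive" build. A minimal check against the exploded dist/ directory the script leaves behind:

    # make-distribution.sh writes an exploded copy to ./dist and the .tgz to the project root
    ls dist/jars | grep -i hive    # should print nothing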
    

    Extract

    mv spark-2.4.5-bin-hadoop2.7-without-hive.tgz /opt
    cd /opt
    tar -xzf spark-2.4.5-bin-hadoop2.7-without-hive.tgz
    ln -s spark-2.4.5-bin-hadoop2.7-without-hive spark
    

    Add configuration (for a standalone cluster test; optional)

    vi $SPARK_HOME/conf/spark-env.sh
    chmod +x $SPARK_HOME/conf/spark-env.sh
     
    #!/usr/bin/env bash
    export SPARK_MASTER_HOST=__host__
    export SPARK_WORKER_CORES=2
    export SPARK_WORKER_MEMORY=4g
    export SPARK_WORKER_INSTANCES=1
    export SPARK_EXECUTOR_MEMORY=4g
    export SPARK_DRIVER_MEMORY=4g
    

    Test

    $SPARK_HOME/sbin/start-all.sh
    $SPARK_HOME/bin/spark-submit \
        --class org.apache.spark.examples.SparkPi \
        --master spark://__master_host__:7077 \
        $SPARK_HOME/examples/jars/spark-examples*.jar \
        10
     
    $SPARK_HOME/sbin/stop-all.sh
    

    Install HDFS


    Configure YARN

    vi $HADOOP_HOME/etc/hadoop/yarn-site.xml
     
    <configuration>
    <property>
        <name>yarn.resourcemanager.scheduler.class</name>
        <value>org.apache.hadoop.yarn.server.resourcemanager.scheduler.fair.FairScheduler</value>
    </property>
    <property>
        <name>yarn.scheduler.fair.preemption</name>
        <value>true</value>
    </property>
    <property>
        <name>yarn.resourcemanager.hostname</name>
        <value>__host__</value>
    </property>
    <property>
        <name>yarn.nodemanager.aux-services</name>
        <value>spark_shuffle,mapreduce_shuffle</value>
    </property>
    <property>
        <name>yarn.nodemanager.aux-services.spark_shuffle.class</name>
        <value>org.apache.spark.network.yarn.YarnShuffleService</value>
    </property>
        <property>
        <name>yarn.nodemanager.pmem-check-enabled</name>
        <value>false</value>
    </property>
    <property>
        <name>yarn.nodemanager.vmem-check-enabled</name>
        <value>false</value>
    </property>
    <property>
        <name>yarn.nodemanager.resource.memory-mb</name>
        <value>4096</value>
    </property>
    <property>
        <name>yarn.nodemanager.resource.cpu-vcores</name>
        <value>2</value>
    </property>
    </configuration>
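
    The aux-services and scheduler settings above must be present on every NodeManager host, not only on the ResourceManager. A hedged sketch for syncing the file (node1 and node2 are placeholder hostnames for your cluster):

    # copy the updated yarn-site.xml to every NodeManager host (placeholder hostnames)
    for h in node1 node2; do
        scp $HADOOP_HOME/etc/hadoop/yarn-site.xml $h:$HADOOP_HOME/etc/hadoop/
    done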
    

    Configure fair-scheduler.xml

    <?xml version="1.0"?>
    <allocations>
      <queue name="sample_queue">
        <minResources>4000 mb, 2 vcores</minResources>
        <maxResources>64000 mb, 32 vcores</maxResources>
        <maxRunningApps>50</maxRunningApps>
        <weight>2.0</weight>
        <schedulingPolicy>fair</schedulingPolicy>
      </queue>
    
      <defaultFairSharePreemptionTimeout>600</defaultFairSharePreemptionTimeout>
      <defaultMinSharePreemptionTimeout>60</defaultMinSharePreemptionTimeout>
      <queueMaxAMShareDefault>0.5</queueMaxAMShareDefault>
    
      <queuePlacementPolicy>
        <rule name="specified" />
        <rule name="primaryGroup" create="false" />
        <rule name="nestedUserQueue">
            <rule name="secondaryGroupExistingQueue" create="false" />
        </rule>
        <rule name="default" queue="sample_queue"/>
      </queuePlacementPolicy>
    </allocations>
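
    The FairScheduler looks for fair-scheduler.xml on the Hadoop classpath by default (unless yarn.scheduler.fair.allocation.file points elsewhere), so saving the file alongside yarn-site.xml is normally enough:

    # default lookup location for the FairScheduler allocation file
    vi $HADOOP_HOME/etc/hadoop/fair-scheduler.xml

    A job can also be steered to the queue explicitly by passing --queue sample_queue to spark-submit; otherwise the placement rules above decide where it lands.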
    

    Change how YARN accounts for cluster resources (optional)

    vi $HADOOP_HOME/etc/hadoop/yarn-site.xml
     
     
    <property>
        <name>yarn.scheduler.capacity.resource-calculator</name>
        <value>org.apache.hadoop.yarn.util.resource.DominantResourceCalculator</value>
    </property>
    

    Add the Spark YARN shuffle jar

    cp $SPARK_HOME/yarn/spark-2.4.5-yarn-shuffle.jar $HADOOP_HOME/share/hadoop/yarn/lib
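
    The YarnShuffleService is only loaded when a NodeManager starts, so after copying the jar (and updating yarn-site.xml) the NodeManagers need a restart. A sketch using the Hadoop 2.x daemon scripts:

    # restart each NodeManager so the spark_shuffle aux-service is picked up
    $HADOOP_HOME/sbin/yarn-daemon.sh stop nodemanager
    $HADOOP_HOME/sbin/yarn-daemon.sh start nodemanager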
    

    Start YARN

    $HADOOP_HOME/sbin/start-yarn.sh
    

    Test

    $SPARK_HOME/bin/spark-submit \
       --class org.apache.spark.examples.SparkPi \
       --master yarn \
       $SPARK_HOME/examples/jars/spark-examples*.jar \
       10
    

    Install Hive

    Download Hive

    wget http://mirror.bit.edu.cn/apache/hive/hive-2.3.7/apache-hive-2.3.7-bin.tar.gz
    mv apache-hive-2.3.7-bin.tar.gz /opt
    cd /opt
    tar -xzf apache-hive-2.3.7-bin.tar.gz
    ln -s apache-hive-2.3.7-bin hive
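
    hive-site.xml below configures a MySQL-backed metastore, so the MySQL JDBC driver must be on Hive's classpath. A hedged sketch; the connector file name is a placeholder, use whatever version matches your MySQL server:

    # place the MySQL JDBC driver on Hive's classpath (jar name is a placeholder)
    cp mysql-connector-java-5.1.49.jar $HIVE_HOME/lib/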
    

    Add the Spark jars to Hive's classpath

    vi $HIVE_HOME/bin/hive
     
    # Find the line containing ${HIVE_LIB}/*.jar (around line 140) and add the following below it
    for f in ${SPARK_HOME}/jars/*.jar; do
         CLASSPATH=${CLASSPATH}:$f;
    done
    

    Configure hive-site.xml

    vi $HIVE_HOME/conf/hive-site.xml
     
    <?xml version="1.0"?>
    <?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
     
    <configuration>
    <property>
            <name>mapreduce.framework.name</name>
            <value>yarn</value>
    </property>
    <property>
            <name>hive.metastore.schema.verification</name>
            <value>false</value>
    </property>
    <property>
            <name>datanucleus.schema.autoCreateTables</name>
            <value>true</value>
    </property>
    <property>
            <name>datanucleus.schema.autoCreateAll</name>
            <value>true</value>
            <description>creates necessary schema on a startup if one doesn't exist. set this to false, after creating it once</description>
    </property>
     
    <!-- Use MySQL to store the Hive metastore (optional) -->
    <property>
            <name>javax.jdo.option.ConnectionURL</name>
            <value>jdbc:mysql://__host__:__port__/hive_metastore?createDatabaseIfNotExist=true&amp;useSSL=false</value>
            <description>metadata is stored in a MySQL server</description>
    </property>
    <property>
            <name>javax.jdo.option.ConnectionDriverName</name>
            <value>com.mysql.jdbc.Driver</value>
            <description>MySQL JDBC driver class</description>
    </property>
    <property>
            <name>javax.jdo.option.ConnectionUserName</name>
            <value>__username__</value>
    </property>
    <property>
            <name>javax.jdo.option.ConnectionPassword</name>
            <value>__password__</value>
    </property>
     
    <!-- The following Hive tuning options are optional -->
    <property>
            <name>hive.merge.sparkfiles</name>
            <value>true</value>
            <description>Merge small files at the end of a Spark DAG transformation</description>
    </property>
    <property>
            <name>hive.exec.dynamic.partition</name>
            <value>true</value>
            <description>Enable dynamic partitioning for Hive INSERT statements</description>
    </property>
    <property>
            <name>hive.exec.dynamic.partition.mode</name>
            <value>nonstrict</value>
            <description>Allow all partition columns to be dynamic (no static partition required)</description>
    </property>
    <property>
            <name>hive.exec.max.dynamic.partitions</name>
            <value>10000</value>
            <description>Maximum total number of dynamic partitions allowed</description>
    </property>
    <property>
            <name>hive.exec.max.dynamic.partitions.pernode</name>
            <value>10000</value>
            <description>Maximum number of dynamic partitions allowed per node</description>
    </property>
    <property>
            <name>hive.log.explain.output</name>
            <value>true</value>
            <description>Log the EXPLAIN output of every query at the user level</description>
    </property>
    <property>
            <name>hive.explain.user</name>
            <value>true</value>
            <description>Show the EXPLAIN result at the user level</description>
    </property>
    <property>
            <name>hive.spark.explain.user</name>
            <value>true</value>
            <description>Show the EXPLAIN result at the user level when running on Spark</description>
    </property>
     
    <!-- Required setting for Hive on Spark -->
    <property>
            <name>hive.execution.engine</name>
            <value>spark</value>
            <description>Switch Hive's execution engine to Spark</description>
    </property>
     
    <!-- Spark-related settings -->
    <property>
            <name>spark.master</name>
            <value>yarn</value>
            <description>Run Spark on YARN; for standalone mode, set this to the master URL instead</description>
    </property>
    <property>
            <name>spark.serializer</name>
            <value>org.apache.spark.serializer.KryoSerializer</value>
            <description>Serializer used by Spark</description>
    </property>
    <property>
            <name>spark.executor.extraClassPath</name>
            <value>hdfs://__hdfs_host__:9000/hive-lib/*</value>
            <description>HDFS location of the Hive jars that Spark executors need</description>
    </property>
    <property>
            <name>spark.yarn.jars</name>
            <value>hdfs://__hdfs_host__:9000/spark-jars/*</value>
            <description>HDFS location of Spark's jars</description>
    </property>
     
    <!-- Spark executor resources -->
    <property>
            <name>spark.executor.memory</name>
            <value>4g</value>
    </property>
    <property>
            <name>spark.yarn.executor.memoryOverhead</name>
            <value>1g</value>
    </property>
    <property>
            <name>spark.driver.memory</name>
            <value>4g</value>
    </property>
    <property>
            <name>spark.yarn.driver.memoryOverhead</name>
            <value>1g</value>
    </property>
    <property>
            <name>spark.executor.cores</name>
            <value>2</value>
    </property>
     
     
    <!-- The following Spark tuning options are optional -->
    <property>
            <name>hive.spark.client.connect.timeout</name>
            <value>10000ms</value>
    </property>
    <property>
            <name>spark.eventLog.enabled</name>
            <value>true</value>
            <description>Enable the Spark event log</description>
    </property>
    <property>
            <name>spark.eventLog.dir</name>
            <value>hdfs://__hdfs_host__:9000/spark-eventlog</value>
    </property>
    <!-- Dynamic allocation of Spark executors -->
    <property>
            <name>spark.shuffle.service.enabled</name>
            <value>true</value>
    </property>
    <property>
            <name>spark.dynamicAllocation.enabled</name>
            <value>true</value>
    </property>
    <property>
            <name>spark.dynamicAllocation.minExecutors</name>
            <value>0</value>
    </property>
    <property>
            <name>spark.dynamicAllocation.maxExecutors</name>
            <value>16</value>
    </property>
    <property>
            <name>spark.dynamicAllocation.initialExecutors</name>
            <value>2</value>
    </property>
    <property>
            <name>spark.dynamicAllocation.executorIdleTimeout</name>
            <value>60000</value>
    </property>
    <property>
            <name>spark.dynamicAllocation.schedulerBacklogTimeout</name>
            <value>1000</value>
    </property>
    </configuration>
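
    spark.yarn.jars, spark.executor.extraClassPath and spark.eventLog.dir above all point at HDFS paths that do not exist yet. A sketch for creating them and uploading the jars, assuming HDFS is already running and the placeholders match the values used in hive-site.xml:

    # create the HDFS directories referenced in hive-site.xml
    hdfs dfs -mkdir -p /spark-jars /hive-lib /spark-eventlog
    # upload Spark's jars so spark.yarn.jars can resolve them
    hdfs dfs -put $SPARK_HOME/jars/*.jar /spark-jars/
    # upload Hive's jars to the path referenced by spark.executor.extraClassPath
    hdfs dfs -put $HIVE_HOME/lib/*.jar /hive-lib/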
    

    Start Hive

    # start the metastore service (keeps running in the foreground; use a separate terminal or run it in the background)
    $HIVE_HOME/bin/hive --service metastore
     
    # start HiveServer2
    $HIVE_HOME/bin/hive --service hiveserver2
     
    # connect with beeline
    $HIVE_HOME/bin/beeline -n op -u jdbc:hive2://localhost:10000
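
    Once beeline connects, a quick hedged check that queries really run on Spark; __some_table__ is a placeholder for any existing table:

    # print the active execution engine, then run a query that launches a Spark job on YARN
    $HIVE_HOME/bin/beeline -n op -u jdbc:hive2://localhost:10000 \
        -e "set hive.execution.engine;" \
        -e "select count(*) from __some_table__;"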
    
