- Poll HDFS until the file exists, then continue (a reusable variant is sketched right after this block)
${hadoop} fs -test -e /user/xxxx
while [ $? -ne 0 ]
do
  sleep 5m
  echo 'waiting'
  ${hadoop} fs -test -e /user/xxxx
done
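The same check can be wrapped in a small function with a retry limit so a missing upstream file does not block forever. A minimal sketch, assuming ${hadoop} points at the hadoop binary as above; the path, interval and retry count are placeholders:

# wait_for_hdfs_path <hdfs path> [sleep seconds] [max tries]
wait_for_hdfs_path() {
  local path="$1" interval="${2:-300}" max_tries="${3:-288}"
  local try=0
  until ${hadoop} fs -test -e "$path"; do
    try=$((try + 1))
    if [ "$try" -ge "$max_tries" ]; then
      echo "timed out waiting for $path" >&2
      return 1
    fi
    echo "waiting for $path (attempt $try)"
    sleep "$interval"
  done
}
# example: wait_for_hdfs_path /user/xxxx 300 288 || exit 1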
- spark-submit script (a sample log4j properties file is sketched after the script)
#!/bin/bash
set -x
HOME_PATH=$(cd $(dirname $0);pwd)
# warn: the log4j config files must be in the same directory as run.sh
log4j_setting="-Dlog4j.configuration=file:log4j-driver.properties"
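# note (not in the original script): in yarn-cluster mode the driver runs in a YARN container
# whose working directory contains the files shipped via --files below, so the relative
# file:log4j-driver.properties URL resolves inside that container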
# note: $output must be set to the HDFS output prefix before this point
hadoop fs -test -e ${output}.path
if [ $? -eq 0 ];then
  echo "output: ${output}.path exists!"
  echo "pls use: hadoop fs -rmr ${output}.path"
  exit 1
fi
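# optional alternative (not in the original script): clean up the old output automatically
# instead of aborting, e.g.:
#   hadoop fs -rm -r -skipTrash ${output}.path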
SPARKSUBMIT=/opt/meituan/spark-2.2/bin/spark-submit
$SPARKSUBMIT \
--name xx \
--master yarn-cluster \
--executor-cores 1 \
--executor-memory 8g \
--driver-memory 10g \
--queue xx \
--num-executors 150 \
--conf spark.yarn.executor.memoryOverhead=6000 \
--conf spark.yarn.driver.memoryOverhead=2048 \
--conf spark.shuffle.service.enabled=true \
--conf spark.shuffle.service.port=7337 \
--conf spark.shuffle.io.retryWait=60s \
--conf spark.shuffle.file.buffer=64k \
--conf spark.shuffle.consolidateFiles=true \
--conf spark.yarn.scheduler.heartbeat.interval-ms=7200000 \
--conf spark.executor.heartbeatInterval=7200000 \
--conf spark.network.timeout=7200000 \
--conf spark.sql.shuffle.partitions=1000 \
--conf spark.rdd.compress=true \
--conf spark.driver.maxResultSize=2g \
--conf spark.rpc.message.maxSize=150 \
--conf spark.default.parallelism=750 \
--conf spark.hadoop.validateOutputSpecs=false \
--conf spark.speculation=true \
--conf spark.speculation.multiplier=3 \
--conf spark.driver.extraJavaOptions="-Dlion.logType=null -Dsquirrel.log.dir=/var/tmp/lion.log" \
--conf spark.executor.extraJavaOptions="-Dlion.logType=null -Dsquirrel.log.dir=/var/tmp/lion.log" \
--conf spark.memory.fraction=0.75 \
--conf spark.memory.storageFraction=0.5 \
--files "$HOME_PATH/log4j-executor.properties,$HOME_PATH/log4j-d river.properties,/opt/meituan/spark-2.2/conf/hive-site.xml" \
--conf "spark.driver.extraJavaOptions=-Dlog4j.debug=true ${log4j_setting}" \
--conf "spark.executor.extraJavaOptions=-Dlog4j.debug=true ${log4j_setting}" \
--class com.navercorp.Main \
./target/node2vec-0.0.2-SNAPSHOT.jar
# application jar; application-specific arguments go right after the jar (keep a trailing \ on the jar line if you add them)
if [ $? -eq 0 ];then
  echo "spark run finished."
  echo "to fetch the embeddings: hadoop fs -getmerge ${output}.emb ../data/output"
else
  echo "spark-submit failed!"
fi
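The command above assumes log4j-driver.properties and log4j-executor.properties sit next to run.sh (they are shipped via --files and referenced through -Dlog4j.configuration). A minimal sketch of such a log4j 1.x file, written here with a heredoc; the appender, level and pattern are placeholder choices, not the original files:

cat > log4j-driver.properties <<'EOF'
# console-only logging; adjust level/pattern as needed
log4j.rootLogger=INFO, console
log4j.appender.console=org.apache.log4j.ConsoleAppender
log4j.appender.console.target=System.err
log4j.appender.console.layout=org.apache.log4j.PatternLayout
log4j.appender.console.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss} %p %c{1}: %m%n
EOF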
- spark-shell REPL launch script (bundle the dependencies the interactive session needs into one self-contained jar and pass it via --jars; a non-interactive usage sketch follows the command)
/opt/meituan/spark-2.2/bin/spark-shell \
--master yarn \
--executor-cores 5 \
--executor-memory 10g \
--queue x \
--num-executors 50 \
--jars ./target/spark_template-1.0-SNAPSHOT-selfcontained.jar \
--driver-memory 10g \
--name CLZ \
--conf spark.shuffle.service.enabled=true \
--conf spark.shuffle.service.port=7337 \
--conf spark.shuffle.io.retryWait=60s \
--conf spark.shuffle.file.buffer=64k \
--conf spark.shuffle.consolidateFiles=true \
--conf spark.yarn.scheduler.heartbeat.interval-ms=7200000 \
--conf spark.executor.heartbeatInterval=7200000 \
--conf spark.network.timeout=7200000 \
--conf spark.sql.shuffle.partitions=1000 \
--conf spark.rdd.compress=true \
--conf spark.driver.maxResultSize=2g \
--conf spark.rpc.message.maxSize=150 \
--conf spark.default.parallelism=750 \
--conf spark.hadoop.validateOutputSpecs=false \
--conf spark.speculation=true \
--conf spark.speculation.multiplier=3 \
--conf spark.memory.fraction=0.75 \
--conf spark.memory.storageFraction=0.5 \
--conf spark.yarn.executor.memoryOverhead=2048 \
--conf spark.yarn.driver.memoryOverhead=2048
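The same launch command can also be fed a Scala script non-interactively, which is handy for repeatable checks. A sketch, assuming SPARK_SHELL is a hypothetical variable holding the full spark-shell command above (without a trailing \) and the Scala statements are placeholders:

SPARK_SHELL="/opt/meituan/spark-2.2/bin/spark-shell --master yarn --queue x"   # placeholder; add the confs/--jars you need
$SPARK_SHELL <<'EOF'
// runs in the REPL and exits when the heredoc ends; classes from the --jars jar are on the classpath
val n = spark.range(100).count()
println(s"count = $n")
EOF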