- Poll HDFS until the file exists, then continue (a reusable variant is sketched right after this block)
${hadoop} fs -test -e /user/xxxx
while [ $? -ne 0 ]
do
  sleep 5m
  echo 'waiting'
  ${hadoop} fs -test -e /user/xxxx
done
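The same check can be wrapped in a small function with a retry limit so a missing upstream file does not block forever. A minimal sketch, assuming ${hadoop} points at the hadoop binary as above; the path, interval and retry count are placeholders:

# wait_for_hdfs_path <hdfs path> [sleep seconds] [max tries]
wait_for_hdfs_path() {
  local path="$1" interval="${2:-300}" max_tries="${3:-288}"
  local try=0
  until ${hadoop} fs -test -e "$path"; do
    try=$((try + 1))
    if [ "$try" -ge "$max_tries" ]; then
      echo "timed out waiting for $path" >&2
      return 1
    fi
    echo "waiting for $path (attempt $try)"
    sleep "$interval"
  done
}
# example: wait_for_hdfs_path /user/xxxx 300 288 || exit 1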
- spark-submit script (a sample log4j properties file is sketched after the script)
#!/bin/bash
set -x
HOME_PATH=$(cd $(dirname $0);pwd)
# warn: the log4j config files must be in the same directory as run.sh
log4j_setting="-Dlog4j.configuration=file:log4j-driver.properties"
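# note (not in the original script): in yarn-cluster mode the driver runs in a YARN container
# whose working directory contains the files shipped via --files below, so the relative
# file:log4j-driver.properties URL resolves inside that container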
# note: $output must be set to the HDFS output prefix before this point
hadoop fs -test -e ${output}.path
if [ $? -eq 0 ];then
  echo "output: ${output}.path exists!"
  echo "pls use: hadoop fs -rmr ${output}.path"
  exit 1
fi
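# optional alternative (not in the original script): clean up the old output automatically
# instead of aborting, e.g.:
#   hadoop fs -rm -r -skipTrash ${output}.path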
SPARKSUBMIT=/opt/meituan/spark-2.2/bin/spark-submit
$SPARKSUBMIT \
--name xx \
--master yarn-cluster \
--executor-cores 1 \
--executor-memory 8g \
--driver-memory 10g \
--queue xx \
--num-executors 150 \
--conf spark.yarn.executor.memoryOverhead=6000 \
--conf spark.yarn.driver.memoryOverhead=2048 \
--conf spark.shuffle.service.enabled=true \
--conf spark.shuffle.service.port=7337 \
--conf spark.shuffle.io.retryWait=60s \
--conf spark.shuffle.file.buffer=64k \
--conf spark.shuffle.consolidateFiles=true \
--conf spark.yarn.scheduler.heartbeat.interval-ms=7200000 \
--conf spark.executor.heartbeatInterval=7200000 \
--conf spark.network.timeout=7200000 \
--conf spark.sql.shuffle.partitions=1000 \
--conf spark.rdd.compress=true \
--conf spark.driver.maxResultSize=2g \
--conf spark.rpc.message.maxSize=150 \
--conf spark.default.parallelism=750 \
--conf spark.hadoop.validateOutputSpecs=false \
--conf spark.speculation=true \
--conf spark.speculation.multiplier=3 \
--conf spark.driver.extraJavaOptions="-Dlion.logType=null -Dsquirrel.log.dir=/var/tmp/lion.log" \
--conf spark.executor.extraJavaOptions="-Dlion.logType=null -Dsquirrel.log.dir=/var/tmp/lion.log" \
--conf spark.memory.fraction=0.75 \
--conf spark.memory.storageFraction=0.5 \
--files "$HOME_PATH/log4j-executor.properties,$HOME_PATH/log4j-d river.properties,/opt/meituan/spark-2.2/conf/hive-site.xml" \
--conf "spark.driver.extraJavaOptions=-Dlog4j.debug=true ${log4j_setting}" \
--conf "spark.executor.extraJavaOptions=-Dlog4j.debug=true ${log4j_setting}" \
--class com.navercorp.Main \
./target/node2vec-0.0.2-SNAPSHOT.jar
# application jar; application-specific arguments go right after the jar (keep a trailing \ on the jar line if you add them)
if [ $? -eq 0 ];then
  echo "spark run finished."
  echo "to fetch the embeddings: hadoop fs -getmerge ${output}.emb ../data/output"
else
  echo "spark-submit failed!"
fi
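The command above assumes log4j-driver.properties and log4j-executor.properties sit next to run.sh (they are shipped via --files and referenced through -Dlog4j.configuration). A minimal sketch of such a log4j 1.x file, written here with a heredoc; the appender, level and pattern are placeholder choices, not the original files:

cat > log4j-driver.properties <<'EOF'
# console-only logging; adjust level/pattern as needed
log4j.rootLogger=INFO, console
log4j.appender.console=org.apache.log4j.ConsoleAppender
log4j.appender.console.target=System.err
log4j.appender.console.layout=org.apache.log4j.PatternLayout
log4j.appender.console.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss} %p %c{1}: %m%n
EOF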
- spark-shell REPL launch script (bundle the dependencies the interactive session needs into one self-contained jar and pass it via --jars; a non-interactive usage sketch follows the command)
/opt/meituan/spark-2.2/bin/spark-shell \
--master yarn \
--executor-cores 5 \
--executor-memory 10g \
--queue x \
--num-executors 50 \
--jars ./target/spark_template-1.0-SNAPSHOT-selfcontained.jar \
--driver-memory 10g \
--name CLZ \
--conf spark.shuffle.service.enabled=true \
--conf spark.shuffle.service.port=7337 \
--conf spark.shuffle.io.retryWait=60s \
--conf spark.shuffle.file.buffer=64k \
--conf spark.shuffle.consolidateFiles=true \
--conf spark.yarn.scheduler.heartbeat.interval-ms=7200000 \
--conf spark.executor.heartbeatInterval=7200000 \
--conf spark.network.timeout=7200000 \
--conf spark.sql.shuffle.partitions=1000 \
--conf spark.rdd.compress=true \
--conf spark.driver.maxResultSize=2g \
--conf spark.rpc.message.maxSize=150 \
--conf spark.default.parallelism=750 \
--conf spark.hadoop.validateOutputSpecs=false \
--conf spark.speculation=true \
--conf spark.speculation.multiplier=3 \
--conf spark.memory.fraction=0.75 \
--conf spark.memory.storageFraction=0.5 \
--conf spark.yarn.executor.memoryOverhead=2048 \
--conf spark.yarn.driver.memoryOverhead=2048
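The same launch command can also be fed a Scala script non-interactively, which is handy for repeatable checks. A sketch, assuming SPARK_SHELL is a hypothetical variable holding the full spark-shell command above (without a trailing \) and the Scala statements are placeholders:

SPARK_SHELL="/opt/meituan/spark-2.2/bin/spark-shell --master yarn --queue x"   # placeholder; add the confs/--jars you need
$SPARK_SHELL <<'EOF'
// runs in the REPL and exits when the heredoc ends; classes from the --jars jar are on the classpath
val n = spark.range(100).count()
println(s"count = $n")
EOF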