shell - Common Scripts

Author: logi | Published 2020-05-11 13:29
    1. Loop until a file exists, then continue
    # poll HDFS every 5 minutes until the path exists, then fall through and continue
    ${hadoop} fs -test -e /user/xxxx
    while [ $? -ne 0 ]
    do
        sleep 5m
        echo 'waiting'
        ${hadoop} fs -test -e /user/xxxx
    done
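    If the upstream file can legitimately fail to appear, the loop above waits forever. A minimal sketch of the same check with a retry cap (the max_retries value is an illustrative assumption, not from the original):

    # give up after max_retries failed checks instead of waiting forever
    max_retries=24          # assumed limit: ~2 hours at 5-minute intervals
    retries=0
    until ${hadoop} fs -test -e /user/xxxx
    do
        retries=$((retries + 1))
        if [ ${retries} -ge ${max_retries} ]; then
            echo "still missing after ${max_retries} checks, giving up"
            exit 1
        fi
        echo 'waiting'
        sleep 5m
    done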
    
    2. spark-submit script
    #!/bin/bash
    set -x
    
    HOME_PATH=$(cd "$(dirname "$0")"; pwd)
    
    
    
    # warning: the log4j config files must be in the same directory as run.sh
    log4j_setting="-Dlog4j.configuration=file:log4j-driver.properties"
    
    # $output must be set to the HDFS output prefix beforehand; the job writes
    # its paths to ${output}.path and its embeddings to ${output}.emb
    hadoop fs -test -e ${output}.path
    if [ $? -eq 0 ];then
        echo "output: ${output}.path exists!"
        echo "pls use: hadoop fs -rmr ${output}.path"
        exit 1
    fi
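    # optional variant (a sketch; FORCE is a hypothetical flag, not part of the
    # original script): delete the stale output and continue instead of exiting:
    #   if [ "$FORCE" = "1" ]; then
    #       hadoop fs -rmr ${output}.path
    #   else
    #       exit 1
    #   fi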
    
    SPARKSUBMIT=/opt/meituan/spark-2.2/bin/spark-submit
        $SPARKSUBMIT \
            --name xx \
            --master yarn-cluster \
            --executor-cores 1 \
            --executor-memory 8g \
            --driver-memory 10g \
            --queue xx \
            --num-executors 150 \
            --conf spark.yarn.executor.memoryOverhead=6000 \
            --conf spark.yarn.driver.memoryOverhead=2048 \
            --conf spark.shuffle.service.enabled=true \
            --conf spark.shuffle.service.port=7337 \
            --conf spark.shuffle.io.retryWait=60s \
            --conf spark.shuffle.file.buffer=64k \
            --conf spark.shuffle.consolidateFiles=true \
            --conf spark.yarn.scheduler.heartbeat.interval-ms=7200000 \
            --conf spark.executor.heartbeatInterval=7200000 \
            --conf spark.network.timeout=7200000 \
            --conf spark.sql.shuffle.partitions=1000 \
            --conf spark.rdd.compress=true \
            --conf spark.driver.maxResultSize=2g \
            --conf spark.rpc.message.maxSize=150 \
            --conf spark.default.parallelism=750 \
            --conf spark.hadoop.validateOutputSpecs=false \
            --conf spark.speculation=true \
            --conf spark.speculation.multiplier=3 \
            --conf spark.driver.extraJavaOptions="-Dlion.logType=null -Dsquirrel.log.dir=/var/tmp/lion.log" \
            --conf spark.executor.extraJavaOptions="-Dlion.logType=null -Dsquirrel.log.dir=/var/tmp/lion.log" \
            --conf spark.memory.fraction=0.75 \
            --conf spark.memory.storageFraction=0.5 \
            --conf spark.network.timeout=720s \
            --files "$HOME_PATH/log4j-executor.properties,$HOME_PATH/log4j-d    river.properties,/opt/meituan/spark-2.2/conf/hive-site.xml" \
            --conf "spark.driver.extraJavaOptions=-Dlog4j.debug=true ${log4j_setting}" \
            --conf "spark.executor.extraJavaOptions=-Dlog4j.debug=true ${log4j_setting}" \
            --class com.navercorp.Main \
            ./target/node2vec-0.0.2-SNAPSHOT.jar
            # application jar above; program arguments, if any, go after the jar path
    
    
    if [ $? -eq 0 ];then
        echo "spark run finished."
        echo -e "get emb? \n hadoop fs -getmerge ${output}.emb ../data/output"
    else
        echo "error running spark!"
    fi
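    The launcher assumes log4j-driver.properties and log4j-executor.properties sit next to run.sh and ships them to the cluster via --files. Their contents are not shown here; a minimal sketch of a driver-side file, following the standard log4j 1.x console template that Spark ships as conf/log4j.properties.template:

    # log4j-driver.properties -- illustrative contents only
    log4j.rootLogger=INFO, console
    log4j.appender.console=org.apache.log4j.ConsoleAppender
    log4j.appender.console.target=System.err
    log4j.appender.console.layout=org.apache.log4j.PatternLayout
    log4j.appender.console.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss} %p %c{1}: %m%n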
    
    
    3. spark-shell REPL launch script
    /opt/meituan/spark-2.2/bin/spark-shell \
    --master yarn \
    --executor-cores 5 \
    --executor-memory 10g \
    --queue x \
    --num-executors 50 \
    # package the dependencies the shell session needs into one jar and pass it here
    --jars ./target/spark_template-1.0-SNAPSHOT-selfcontained.jar \
    --driver-memory 10g \
    --name CLZ \
    --conf spark.shuffle.service.enabled=true \
    --conf spark.shuffle.service.port=7337 \
    --conf spark.shuffle.io.retryWait=60s \
    --conf spark.shuffle.file.buffer=64k \
    --conf spark.shuffle.consolidateFiles=true \
    --conf spark.yarn.scheduler.heartbeat.interval-ms=7200000 \
    --conf spark.executor.heartbeatInterval=7200000 \
    --conf spark.network.timeout=7200000 \
    --conf spark.sql.shuffle.partitions=1000 \
    --conf spark.rdd.compress=true \
    --conf spark.driver.maxResultSize=2g \
    --conf spark.rpc.message.maxSize=150 \
    --conf spark.default.parallelism=750 \
    --conf spark.hadoop.validateOutputSpecs=false \
    --conf spark.speculation=true \
    --conf spark.speculation.multiplier=3 \
    --conf spark.memory.fraction=0.75 \
    --conf spark.memory.storageFraction=0.5 \
    --conf spark.yarn.executor.memoryOverhead=2048 \
    --conf spark.yarn.driver.memoryOverhead=2048
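    For a quick smoke test, the same launcher can also be driven non-interactively by piping commands into the REPL's stdin; a minimal sketch with a pared-down flag set (the spark.range count is just an illustrative sanity check):

    # pipe a one-liner into spark-shell; the REPL exits at end of input
    echo 'spark.range(100).count()' | /opt/meituan/spark-2.2/bin/spark-shell \
        --master yarn \
        --queue x \
        --num-executors 2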
    
