fairseq commands and conclusions


Author: VanJordan | Published 2019-05-04 18:44
    • en-fr: using the interactive decoder
    #!/usr/bin/env sh
    set -e
    
    # global flag
    FLAG=baseline
    SRC_LANG=en
    TGT_LANG=fr
    
    # user directory
    ROOT_DIR=/home/zhaoliang/fairseq-slim  
    TMP_DIR=/home/zhaoliang/fairseq-slim/data-raw/en-vi-pure
    DATA_DIR=$ROOT_DIR/data-bin/$SRC_LANG-$TGT_LANG  
    TRAIN_DIR=$ROOT_DIR/train/$FLAG-$SRC_LANG-$TGT_LANG
    # file path 
    RESULT_FILE=$ROOT_DIR/$FLAG-$SRC_LANG-$TGT_LANG.test.result
    
    # SRC_DICT_PATH=/home/zhaoliang/fairseq-master/pretrain_model/wmt16.en-de.joined-dict.transformer/dict.en.txt
    # TGT_DICT_PATH=/home/zhaoliang/fairseq-master/pretrain_model/wmt16.en-de.joined-dict.transformer/dict.de.txt
    # DECODER_TEXT_DIR=/home/zhaoliang/fairseq-master/data-bin/wmt16-en-de
    
    
    # for interactive decoder
    TEST_FILE=/home/zhaoliang/fairseq-slim/data-raw/en-vi-pure/test.en
    REF_FILE=/home/zhaoliang/fairseq-slim/data-raw/en-vi-pure/test.vi
    CODE_FILE=/home/zhaoliang/fairseq-slim/data-raw/en-vi-pure/data/code
    DICT_FILE=/home/zhaoliang/fairseq-slim/data-raw/en-vi-pure/data/dict.en-vi
    
    # hypeparameters 
    BEAM_SIZE=4
    TRAIN_BS=10000
    DECODER_BS=500
    GRADIENTS_ACCUMULATIONS=16
    DECODER_NUMWORKERS=4
    
    # set up for training 
    # export CUDA_VISIBLE_DEVICES=${2:-3}
    
    mkdir -p $DATA_DIR $TMP_DIR $TRAIN_DIR
    
    echo "will use GPU $CUDA_VISIBLE_DEVICES"
    
    if [[ $1 == 'train' ]]; then
    
        TEXT=/home/zhaoliang/fairseq-slim/data-raw/en-fr-pure
        # DATADIR=data-bin/en-fr-pure
    
        # # clean empty and long sentences, and sentences with high source-target ratio (training corpus only)
        # MOSESDECODER=/home/zhaoliang/fairseq-slim/mosesdecoder
        # # perl $MOSESDECODER/scripts/training/clean-corpus-n.perl $TEXT/train en fr $TEXT/train.clean 3 400
        # # perl $MOSESDECODER/scripts/training/clean-corpus-n.perl $TEXT/dev fr en $TEXT/valid.clean 3 400
    
        # # build subword vocab
        # SUBWORD_NMT=/home/zhaoliang/fairseq-slim/subword-nmt/subword_nmt
        NUM_OPS=32000
    
        # # learn codes and encode separately
        # CODES=codes.enfr.${NUM_OPS}.bpe
        # # echo "Encoding subword with BPE using ops=${NUM_OPS}"
        # # python3 $SUBWORD_NMT/learn_bpe.py -s ${NUM_OPS} < $TEXT/train.en > $TEXT/${CODES}.en
        # # python3 $SUBWORD_NMT/learn_bpe.py -s ${NUM_OPS} < $TEXT/train.fr > $TEXT/${CODES}.fr
    
        # # echo "Applying vocab to training"
        # # python3 $SUBWORD_NMT/apply_bpe.py -c $TEXT/${CODES}.en < $TEXT/train.clean.en > $TEXT/train.${NUM_OPS}.bpe.en
        # # python3 $SUBWORD_NMT/apply_bpe.py -c $TEXT/${CODES}.fr < $TEXT/train.clean.fr > $TEXT/train.${NUM_OPS}.bpe.fr
    
        # VOCAB=vocab.${NUM_OPS}.bpe
        # echo "Generating vocab: ${VOCAB}.en"
        # cat $TEXT/train.${NUM_OPS}.bpe.en | python3 $SUBWORD_NMT/get_vocab.py > $TEXT/${VOCAB}.en
    
        # echo "Generating vocab: ${VOCAB}.fr"
        # cat $TEXT/train.${NUM_OPS}.bpe.fr | python3 $SUBWORD_NMT/get_vocab.py > $TEXT/${VOCAB}.fr
    
        # # encode validation
        # echo "Applying vocab to valid"
        # python3 $SUBWORD_NMT/apply_bpe.py -c $TEXT/${CODES}.en --vocabulary $TEXT/${VOCAB}.en < $TEXT/valid.clean.en > $TEXT/valid.${NUM_OPS}.bpe.en
        # python3 $SUBWORD_NMT/apply_bpe.py -c $TEXT/${CODES}.fr --vocabulary $TEXT/${VOCAB}.fr < $TEXT/valid.clean.fr > $TEXT/valid.${NUM_OPS}.bpe.fr
    
    
        # generate training data
        # python3 preprocess.py \
        #   --source-lang en \
        #   --target-lang fr \
        #   --trainpref $TEXT/train.${NUM_OPS}.bpe \
        #   --validpref $TEXT/valid.${NUM_OPS}.bpe \
        #   --workers 12 \
        #   --destdir $DATA_DIR 
    
        mkdir -p  $TRAIN_DIR/ckpt $TRAIN_DIR/log
        python3 train.py $DATA_DIR \
            --arch transformer  \
            --clip-norm 0 --optimizer adam --lr 0.001 \
            --source-lang $SRC_LANG --target-lang $TGT_LANG --max-tokens $TRAIN_BS --no-progress-bar \
            --log-interval 100 --min-lr '1e-09' --weight-decay 0.0001 \
            --criterion label_smoothed_cross_entropy --label-smoothing 0.1 \
            --lr-scheduler inverse_sqrt \
            --update-freq $GRADIENTS_ACCUMULATIONS \
            --ddp-backend=no_c10d \
            --max-update 100000 --warmup-updates 4000 --warmup-init-lr '1e-07' \
            --adam-betas '(0.9, 0.98)' --keep-last-epochs 10 \
            --dropout 0.3 --attention-dropout 0.1 \
            --tensorboard-logdir $TRAIN_DIR/log \
            --save-dir $TRAIN_DIR/ckpt \
            --fp16  # use-fp16
            # --update-freq $GRADIENTS_ACCUMULATIONS \
        # python3 train.py $DATA_DIR --arch transformer_vaswani_wmt_en_de_big --optimizer adam --adam-betas '(0.9, 0.98)' \
        #   --clip-norm 0.0 \
        #   --lr-scheduler inverse_sqrt \
        #   --warmup-init-lr 1e-07 \
        #   --warmup-updates 4000 \
        #   --lr 0.001 \
        #   --min-lr 1e-09 \
        #   --dropout 0.3 \
        #   --weight-decay 0.0 \
        #   --criterion label_smoothed_cross_entropy \
        #   --label-smoothing 0.1 \
        #   --max-tokens $TRAIN_BS \  # bs
        #   --max-update 3000 \ # training step is max-update * update-freq
        #   --update-freq $GRADIENTS_ACCUMULATIONS \ # gradients accumulations is 32 then its equal to bs is 32 times than before
        #   --tensorboard-logdir $TRAIN_DIR/log \ 
        #   --log-format json --save-dir $TRAIN_DIR/log \
        #   --fp16  # use-fp16
        # Average 10 latest checkpoints:
    elif [[ $1 == 'avg' ]]; then
    
        mkdir -p $TRAIN_DIR/ckpt
        python3 scripts/average_checkpoints.py \
            --inputs $TRAIN_DIR/ckpt \
            --num-epoch-checkpoints 10  \
            --output $TRAIN_DIR/ckpt/model.pt
    
    elif [[ $1 == 'test' ]]; then
        
        if [ -f $TRAIN_DIR/ckpt/model.pt ]; then
            echo "will test use $TRAIN_DIR/ckpt/model.pt"
        else
            echo 'ckpt not found'
            exit 1
        fi
        python3 generate.py $DATA_DIR/ --path $TRAIN_DIR/ckpt/model.pt \
            --beam $BEAM_SIZE \
            --num-workers 12 \
            --remove-bpe \
            --batch-size $DECODER_BS | grep -P '^H' | cut -f3- | tee $TRAIN_DIR/$FLAG.translation.$TGT_LANG
    
        python3 score.py \
            --sys $TRAIN_DIR/$FLAG.translation.$TGT_LANG \
            --ref $REF_FILE  | tee $RESULT_FILE 
    
    elif [[ $1 == 'decoder' ]]; then
        # SUBWORD_NMT is the source directory of subword-nmt
        # CODE_FILE holds the BPE codes learned on the training data
        SUBWORD_NMT=/home/zhaoliang/fairseq-slim/subword-nmt
        TRAIN_DIR=/home/zhaoliang/fairseq-slim/train/baseline-en-vi
        cat $TEST_FILE | python3 $SUBWORD_NMT/apply_bpe.py -c $CODE_FILE | \
            python3 interactive.py $DATA_DIR/ --path $TRAIN_DIR/ckpt/model.pt \
                --beam $BEAM_SIZE \
                --source-lang $SRC_LANG \
                --target-lang $TGT_LANG  \
                --num-workers 12 \
                --remove-bpe \
                --batch-size $DECODER_BS \
                --buffer-size $DECODER_BS | grep -P '^H' | cut -f3- | tee $TRAIN_DIR/$FLAG.translation.$TGT_LANG
    
        python3 score.py \
            --sys $TRAIN_DIR/$FLAG.translation.$TGT_LANG \
            --ref $REF_FILE  | tee  $RESULT_FILE 
    else 
        echo "unknown argument: $1"
    fi
    
    # 17200 en   7800 vi
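
    The script above is driven entirely by its first positional argument. Assuming it is saved as run_enfr.sh (a hypothetical name; the post never names the file), the four entry points are:

    # hypothetical usage; pass the mode as the first argument
    bash run_enfr.sh train    # (re)train the transformer on the binarized en-fr data
    bash run_enfr.sh avg      # average the last 10 epoch checkpoints into ckpt/model.pt
    bash run_enfr.sh test     # decode the binarized test set with generate.py and score it
    bash run_enfr.sh decoder  # BPE-encode raw text and decode it with interactive.py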
    
    • en-vi
    
    TMP_DIR=data-raw/en-vi-pure
    SRC_LANG=en
    TGT_LANG=vi
    TRAIN_BS=3582
    TRAIN_DIR=train/${SRC_LANG}-${TGT_LANG}_fp16
    DATA_DIR=data-bin/iwslt_${SRC_LANG}_${TGT_LANG}
    result_file=$TRAIN_DIR/${SRC_LANG}-${TGT_LANG}.log
    mkdir -p $TRAIN_DIR/log $TRAIN_DIR/ckpt  # create TRAIN_DIR before the first write to $result_file
    echo 'preprocess ' > $result_file
    
    # python3 preprocess.py \
    #   --source-lang $SRC_LANG \
    #   --target-lang $TGT_LANG \
    #   --trainpref $TMP_DIR/train \
    #   --validpref $TMP_DIR/dev \
    #   --testpref $TMP_DIR/test \
    #   --nwordssrc 17200 \
    #   --nwordstgt 7800 \
    #   --workers 12 \
    #   --destdir $DATA_DIR
    
    mkdir -p $TRAIN_DIR/log $TRAIN_DIR/ckpt
    echo 'train ' >> $result_file
    python3 train.py $DATA_DIR \
        --arch transformer_iwslt_de_en  \
        --share-decoder-input-output-embed \
        --clip-norm 0 --optimizer adam --lr 0.001 \
        --source-lang $SRC_LANG --target-lang $TGT_LANG --max-tokens $TRAIN_BS --no-progress-bar \
        --log-interval 100 --min-lr '1e-09' --weight-decay 0.0001 \
        --criterion label_smoothed_cross_entropy --label-smoothing 0.1 \
        --lr-scheduler inverse_sqrt \
        --max-update 20000 --warmup-updates 4000 --warmup-init-lr '1e-07' --update-freq 4 \
        --adam-betas '(0.9, 0.98)' --keep-last-epochs 10 \
        --dropout 0.3 \
        --tensorboard-logdir $TRAIN_DIR/log --log-format simple \
        --save-dir $TRAIN_DIR/ckpt \
        --fp16  2>&1 | tee $TRAIN_DIR/train.log
    
    echo 'average checkpoints' >> $result_file
    python3 scripts/average_checkpoints.py \
        --inputs $TRAIN_DIR/ckpt \
        --num-epoch-checkpoints 10  \
        --output $TRAIN_DIR/ckpt/model.pt
    
    echo 'generate' >> $result_file
    # python3 generate.py $DATA_DIR/ --path $TRAIN_DIR/ckpt/model.pt \
    #     --beam 5 \
    #     --num-workers 12 \
    #     --batch-size 128 | grep -P '^H' |cut -f3-  | tee $TRAIN_DIR/$FLAG.translation.$TGT_LANG
    python3 generate.py $DATA_DIR/ --path $TRAIN_DIR/ckpt/model.pt \
        --source-lang $SRC_LANG --target-lang $TGT_LANG \
        --beam 5 \
        --num-workers 12 \
        --batch-size 128 2>&1 | tee $TRAIN_DIR/tmp.log
    
    # echo 'score' >> $result_file
    
    # grep ^H $TRAIN_DIR/tmp.log | cut -f3- > $TRAIN_DIR/gen.out.sys
    # grep ^T $TRAIN_DIR/tmp.log | cut -f2- > $TRAIN_DIR/gen.out.ref
    
    # python3 score.py --sys $TRAIN_DIR/gen.out.sys --ref $TRAIN_DIR/gen.out.ref | tee $result_file
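
    The commented-out scoring step relies on the layout of fairseq's generate.py log: hypothesis lines look like H-<id><tab><score><tab><text> (text from field 3 onward) and target lines look like T-<id><tab><text> (field 2 onward). Written out as a standalone sketch against the tmp.log produced above:

    # pull hypotheses and references out of the generate.py log, then score them
    grep '^H' $TRAIN_DIR/tmp.log | cut -f3- > $TRAIN_DIR/gen.out.sys
    grep '^T' $TRAIN_DIR/tmp.log | cut -f2- > $TRAIN_DIR/gen.out.ref
    python3 score.py --sys $TRAIN_DIR/gen.out.sys --ref $TRAIN_DIR/gen.out.ref | tee -a $result_file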
    
    • Averaging checkpoints is much better than not averaging, and checkpoint_last beats checkpoint_best.
    • A large batch size matters a lot: 1268 sentences come to 27923 tokens, so it is best to fill up GPU memory.
    • The command that reached 30.66 BLEU:
    
    TMP_DIR=data-raw/en-vi-pure
    SRC_LANG=en
    TGT_LANG=vi
    # TRAIN_BS=3582
    TRAIN_BS=10000
    MAX_UPDATE=5000
    TRAIN_DIR=train/${SRC_LANG}_${TGT_LANG}_fp16
    DATA_DIR=data-bin/iwslt_${SRC_LANG}_${TGT_LANG}
    result_file=$TRAIN_DIR/result.txt
    # python3 preprocess.py \
    #   --source-lang $SRC_LANG \
    #   --target-lang $TGT_LANG \
    #   --trainpref $TMP_DIR/train \
    #   --validpref $TMP_DIR/dev \
    #   --testpref $TMP_DIR/test \
    #   --nwordssrc 17200 \
    #   --nwordstgt 7800 \
    #   --workers 12 \
    #   --destdir $DATA_DIR
    mkdir -p $TRAIN_DIR/log $TRAIN_DIR/ckpt
    touch $result_file
    python3 train.py $DATA_DIR \
        --arch transformer_iwslt_de_en  \
        --share-decoder-input-output-embed \
        --clip-norm 0 --optimizer adam --lr 0.001 \
        --source-lang $SRC_LANG --target-lang $TGT_LANG --max-tokens $TRAIN_BS --no-progress-bar \
        --log-interval 100 --min-lr '1e-09' --weight-decay 0.0001 \
        --criterion label_smoothed_cross_entropy --label-smoothing 0.1 \
        --lr-scheduler inverse_sqrt \
        --max-update $MAX_UPDATE --warmup-updates 4000 --warmup-init-lr '1e-07' --update-freq 4 \
        --adam-betas '(0.9, 0.98)' --keep-last-epochs 10 \
        --dropout 0.3 \
        --tensorboard-logdir $TRAIN_DIR/log --log-format simple \
        --save-dir $TRAIN_DIR/ckpt \
        --fp16  2>&1 | tee $TRAIN_DIR/train.log
    
    python3 scripts/average_checkpoints.py \
        --inputs $TRAIN_DIR/ckpt \
        --num-epoch-checkpoints 10  \
        --output $TRAIN_DIR/ckpt/model.pt
    
    
    # python3 generate.py $DATA_DIR/ --path $TRAIN_DIR/ckpt/model.pt \
    #     --beam 5 \
    #     --num-workers 12 \
    #     --batch-size 128 | grep -P '^H' |cut -f3-  | tee $TRAIN_DIR/$FLAG.translation.$TGT_LANG
    echo 'average 10 checkpoints: ' | tee $result_file
    python3 generate.py $DATA_DIR/ --path $TRAIN_DIR/ckpt/model.pt \
        --source-lang $SRC_LANG --target-lang $TGT_LANG \
        --beam 5 \
        --num-workers 12 \
        --batch-size 128 2>&1 | tee $TRAIN_DIR/translation.$TGT_LANG | tail -2 | tee -a $result_file
    
    echo "best checkpoint:" | tee -a $result_file
    python3 generate.py $DATA_DIR/ --path $TRAIN_DIR/ckpt/checkpoint_best.pt \
        --source-lang $SRC_LANG --target-lang $TGT_LANG \
        --beam 5 \
        --num-workers 12 \
        --batch-size 128 2>&1 | tail -2 | tee -a $result_file
    
    echo "last checkpoint:" | tee -a $result_file
    python3 generate.py $DATA_DIR/ --path $TRAIN_DIR/ckpt/checkpoint_last.pt \
        --source-lang $SRC_LANG --target-lang $TGT_LANG \
        --beam 5 \
        --num-workers 12 \
        --batch-size 128 2>&1 | tail -2 | tee -a $result_file
    
    # grep ^H $TRAIN_DIR/tmp.log | cut -f3- > $TRAIN_DIR/gen.out.sys
    # grep ^T $TRAIN_DIR/tmp.log | cut -f2- > $TRAIN_DIR/gen.out.ref
    
    # python3 score.py --sys $TRAIN_DIR/gen.out.sys --ref $TRAIN_DIR/gen.out.ref | tee $result_file
    
    • When the batch size is raised to 25000, it turns out there are only about 8 fp16 overflows (see the loss-scaling sketch after this list).
    • fp32 is about 0.4 BLEU better than fp16 under the same setting: 10000 updates with a batch size of 7000.
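
    Those overflow messages come from fp16 dynamic loss scaling. If overflows become frequent at very large batch sizes, fairseq exposes a few loss-scale knobs; the sketch below uses illustrative values, and the flag names should be checked against python3 train.py --help for the fairseq version in use:

    # fp16 loss-scaling knobs (illustrative values); other arguments as in the
    # training commands elsewhere in this post
    python3 train.py $DATA_DIR \
        --fp16 \
        --fp16-init-scale 8 \
        --fp16-scale-window 256 \
        --min-loss-scale 0.0001 \
        --arch transformer_iwslt_de_en --optimizer adam --adam-betas '(0.9, 0.98)' \
        --lr 0.001 --lr-scheduler inverse_sqrt --warmup-updates 4000 --warmup-init-lr '1e-07' \
        --criterion label_smoothed_cross_entropy --label-smoothing 0.1 \
        --max-tokens 25000 --update-freq 4 --max-update 5000 \
        --save-dir $TRAIN_DIR/ckpt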

    On the relationship between batch size and the number of updates: a bigger batch size is not always better (the sketch after the list below makes the updates-per-epoch arithmetic explicit).

    • With gradient accumulation (--update-freq) of 4:
    • At max-tokens 3582, 70 epochs is 19075 updates; running to 20000 updates gives 30.57, takes 37947, and ends with a valid loss of 4.648.
    • At max-tokens 10000, 70 epochs is 7000 updates, but 5000 updates (50 epochs) is already enough: 30.66, with a training time of 27365 and a valid loss of 4.55.
    • At max-tokens 25000, 70 epochs is 3196 updates; at 2000 updates (about 44 epochs) the training time is 24184 and the valid loss is 4.975, giving only 25.5, while 3000 updates (about 66 epochs) gives only 28.95 with a training time of 34936.
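
    The update counts above are just arithmetic: tokens per optimizer step is roughly max-tokens x update-freq (x number of GPUs), so with a fixed corpus, raising max-tokens cuts the updates per epoch proportionally. A rough sketch, assuming a single GPU and the ~4M tokens per epoch implied by the max-tokens 10000 run (100 updates/epoch x 40000 tokens):

    # back-of-the-envelope updates-per-epoch for the runs above (single GPU, update-freq 4)
    TOKENS_PER_EPOCH=4000000   # assumption: inferred from 70 epochs ~ 7000 updates at max-tokens 10000
    UPDATE_FREQ=4
    for MAX_TOKENS in 3582 10000 25000 40000; do
        TOKENS_PER_UPDATE=$((MAX_TOKENS * UPDATE_FREQ))
        echo "max-tokens=$MAX_TOKENS -> ~$((TOKENS_PER_EPOCH / TOKENS_PER_UPDATE)) updates/epoch"
    done

    The real numbers land a little higher or lower because max-tokens is a cap, not the exact batch size.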

    With gradient accumulation of 8:

    • At max-tokens 10000: 5000 updates, and 3000 updates.
    • At max-tokens 40000, 70 epochs is 1067 updates: 1000 updates, and 1500 updates.
    • 40000 is roughly the limit of a 24GB RTX card: peak usage hits 23GB and it now sits around 22GB, and peak GPU utilization improves a lot, reaching 98% (at 10000 the peak utilization is only 84%); the monitoring snippet below shows one way to watch this.
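
    The memory and utilization figures can be checked by polling nvidia-smi in a second terminal while training runs (gpu_usage.csv is just an arbitrary output file):

    # log GPU memory and utilization every 2 seconds while train.py is running
    nvidia-smi --query-gpu=timestamp,memory.used,utilization.gpu \
        --format=csv -l 2 | tee $TRAIN_DIR/gpu_usage.csv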

    When the model is under-trained, checkpoint averaging is actually harmful.

    • A total of 43 epochs is clearly not enough training; with or without averaging, the result is far below the earlier 30.6, and the valid loss of 5.07 is far above the usual 4.6.

    fp32 vs fp16

    • en-vi, max-tokens 10000, 5000 updates, update-freq 4: fp32 gives 30.40, fp16 gives 30.66 (see the toggle sketch after this list).
    • vi-en
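
    The only thing that changes between the two runs is whether --fp16 is passed to train.py; everything else stays as in the 30.66 command above. A trimmed-down toggle sketch (the PRECISION variable is my own convenience and not part of the original scripts):

    PRECISION=fp16                      # set to fp32 to train in full precision
    FP16_FLAG=""
    if [[ $PRECISION == 'fp16' ]]; then
        FP16_FLAG="--fp16"
    fi
    python3 train.py $DATA_DIR $FP16_FLAG \
        --arch transformer_iwslt_de_en --share-decoder-input-output-embed \
        --clip-norm 0 --optimizer adam --adam-betas '(0.9, 0.98)' --lr 0.001 \
        --lr-scheduler inverse_sqrt --warmup-updates 4000 --warmup-init-lr '1e-07' \
        --criterion label_smoothed_cross_entropy --label-smoothing 0.1 \
        --max-tokens 10000 --update-freq 4 --max-update 5000 \
        --dropout 0.3 --save-dir $TRAIN_DIR/ckpt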


    Training speed is about the same with batch sizes of 40000 and 10000.

