- enfr: training script, decoding with the interactive decoder (interactive.py)
#!/usr/bin/env bash
# bash (not plain sh) is needed for the [[ ... ]] tests below
set -e
# global flag
FLAG=baseline
SRC_LANG=en
TGT_LANG=fr
# user directory
ROOT_DIR=/home/zhaoliang/fairseq-slim
TMP_DIR=/home/zhaoliang/fairseq-slim/data-raw/en-vi-pure
DATA_DIR=$ROOT_DIR/data-bin/$SRC_LANG-$TGT_LANG
TRAIN_DIR=$ROOT_DIR/train/$FLAG-$SRC_LANG-$TGT_LANG
# file path
RESULT_FILE=$ROOT_DIR/$FLAG-$SRC_LANG-$TGT_LANG.test.result
# SRC_DICT_PATH=/home/zhaoliang/fairseq-master/pretrain_model/wmt16.en-de.joined-dict.transformer/dict.en.txt
# TGT_DICT_PATH=/home/zhaoliang/fairseq-master/pretrain_model/wmt16.en-de.joined-dict.transformer/dict.de.txt
# DECODER_TEXT_DIR=/home/zhaoliang/fairseq-master/data-bin/wmt16-en-de
# for interactive decoder
TEST_FILE=/home/zhaoliang/fairseq-slim/data-raw/en-vi-pure/test.en
REF_FILE=/home/zhaoliang/fairseq-slim/data-raw/en-vi-pure/test.vi
CODE_FILE=/home/zhaoliang/fairseq-slim/data-raw/en-vi-pure/data/code
DICT_FILE=/home/zhaoliang/fairseq-slim/data-raw/en-vi-pure/data/dict.en-vi
# hyperparameters
BEAM_SIZE=4
TRAIN_BS=10000
DECODER_BS=500
GRADIENTS_ACCUMULATIONS=16
DECODER_NUMWORKERS=4
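# NOTE: the effective batch is roughly TRAIN_BS * GRADIENTS_ACCUMULATIONS tokens per update
# (fairseq's --update-freq accumulates gradients over that many batches before each optimizer step)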
# set up for training
# export CUDA_VISIBLE_DEVICES=${2:-3}
mkdir -p $DATA_DIR $TMP_DIR $TRAIN_DIR
echo "will use GPU $CUDA_VISIBLE_DEVICES"
if [[ $1 == 'train' ]]; then
TEXT=/home/zhaoliang/fairseq-slim/data-raw/en-fr-pure
# DATADIR=data-bin/en-fr-pure
# # clean empty and long sentences, and sentences with high source-target ratio (training corpus only)
# MOSESDECODER=/home/zhaoliang/fairseq-slim/mosesdecoder
# # perl $MOSESDECODER/scripts/training/clean-corpus-n.perl $TEXT/train en fr $TEXT/train.clean 3 400
# # perl $MOSESDECODER/scripts/training/clean-corpus-n.perl $TEXT/dev fr en $TEXT/valid.clean 3 400
# # build subword vocab
# SUBWORD_NMT=/home/zhaoliang/fairseq-slim/subword-nmt/subword_nmt
NUM_OPS=32000
# # learn codes and encode separately
# CODES=codes.enfr.${NUM_OPS}.bpe
# # echo "Encoding subword with BPE using ops=${NUM_OPS}"
# # python3 $SUBWORD_NMT/learn_bpe.py -s ${NUM_OPS} < $TEXT/train.en > $TEXT/${CODES}.en
# # python3 $SUBWORD_NMT/learn_bpe.py -s ${NUM_OPS} < $TEXT/train.fr > $TEXT/${CODES}.fr
# # echo "Applying vocab to training"
# # python3 $SUBWORD_NMT/apply_bpe.py -c $TEXT/${CODES}.en < $TEXT/train.clean.en > $TEXT/train.${NUM_OPS}.bpe.en
# # python3 $SUBWORD_NMT/apply_bpe.py -c $TEXT/${CODES}.fr < $TEXT/train.clean.fr > $TEXT/train.${NUM_OPS}.bpe.fr
# VOCAB=vocab.${NUM_OPS}.bpe
# echo "Generating vocab: ${VOCAB}.en"
# cat $TEXT/train.${NUM_OPS}.bpe.en | python3 $SUBWORD_NMT/get_vocab.py > $TEXT/${VOCAB}.en
# echo "Generating vocab: ${VOCAB}.fr"
# cat $TEXT/train.${NUM_OPS}.bpe.fr | python3 $SUBWORD_NMT/get_vocab.py > $TEXT/${VOCAB}.fr
# # encode validation
# echo "Applying vocab to valid"
# python3 $SUBWORD_NMT/apply_bpe.py -c $TEXT/${CODES}.en --vocabulary $TEXT/${VOCAB}.en < $TEXT/valid.clean.en > $TEXT/valid.${NUM_OPS}.bpe.en
# python3 $SUBWORD_NMT/apply_bpe.py -c $TEXT/${CODES}.fr --vocabulary $TEXT/${VOCAB}.fr < $TEXT/valid.clean.fr > $TEXT/valid.${NUM_OPS}.bpe.fr
# generate training data
# python3 preprocess.py \
# --source-lang en \
# --target-lang fr \
# --trainpref $TEXT/train.${NUM_OPS}.bpe \
# --validpref $TEXT/valid.${NUM_OPS}.bpe \
# --workers 12 \
# --destdir $DATA_DIR
mkdir -p $TRAIN_DIR/ckpt $TRAIN_DIR/log
python3 train.py $DATA_DIR \
--arch transformer \
--clip-norm 0 --optimizer adam --lr 0.001 \
--source-lang $SRC_LANG --target-lang $TGT_LANG --max-tokens $TRAIN_BS --no-progress-bar \
--log-interval 100 --min-lr '1e-09' --weight-decay 0.0001 \
--criterion label_smoothed_cross_entropy --label-smoothing 0.1 \
--lr-scheduler inverse_sqrt \
--update-freq $GRADIENTS_ACCUMULATIONS \
--ddp-backend=no_c10d \
--max-update 100000 --warmup-updates 4000 --warmup-init-lr '1e-07' \
--adam-betas '(0.9, 0.98)' --keep-last-epochs 10 \
--dropout 0.3 --attention-dropout 0.1 \
--tensorboard-logdir $TRAIN_DIR/log \
--save-dir $TRAIN_DIR/ckpt \
--fp16 # use-fp16
# --update-freq $GRADIENTS_ACCUMULATIONS \
# python3 train.py $DATA_DIR --arch transformer_vaswani_wmt_en_de_big --optimizer adam --adam-betas '(0.9, 0.98)' \
# --clip-norm 0.0 \
# --lr-scheduler inverse_sqrt \
# --warmup-init-lr 1e-07 \
# --warmup-updates 4000 \
# --lr 0.001 \
# --min-lr 1e-09 \
# --dropout 0.3 \
# --weight-decay 0.0 \
# --criterion label_smoothed_cross_entropy \
# --label-smoothing 0.1 \
# --max-tokens $TRAIN_BS \ # bs
# --max-update 3000 \ # training step is max-update * update-freq
# --update-freq $GRADIENTS_ACCUMULATIONS \ # gradients accumulations is 32 then its equal to bs is 32 times than before
# --tensorboard-logdir $TRAIN_DIR/log \
# --log-format json --save-dir $TRAIN_DIR/log \
# --fp16 # use-fp16
# Average 10 latest checkpoints:
elif [[ $1 == 'avg' ]]; then
mkdir -p $TRAIN_DIR/ckpt
python3 scripts/average_checkpoints.py \
--inputs $TRAIN_DIR/ckpt \
--num-epoch-checkpoints 10 \
--output $TRAIN_DIR/ckpt/model.pt
elif [[ $1 == 'test' ]]; then
if [ -f $TRAIN_DIR/ckpt/model.pt ]; then
echo "will test using $TRAIN_DIR/ckpt/model.pt"
else
echo 'ckpt not found, run avg first'
exit 1
fi
python3 generate.py $DATA_DIR/ --path $TRAIN_DIR/ckpt/model.pt \
--beam $BEAM_SIZE \
--num-workers 12 \
--remove-bpe \
--batch-size $DECODER_BS | grep -P '^H' | cut -f3- | tee $TRAIN_DIR/$FLAG.translation.$TGT_LANG
python3 score.py \
--sys $TRAIN_DIR/$FLAG.translation.$TGT_LANG \
--ref $REF_FILE | tee $RESULT_FILE
elif [[ $1 == 'decoder' ]]; then
# SUBWORD_NMT is the source code dir of subword_nmt
# subword_path is the code path of training data
SUBWORD_NMT=/home/zhaoliang/fairseq-slim/subword-nmt
TRAIN_DIR=/home/zhaoliang/fairseq-slim/train/baseline-en-vi
cat $TEST_FILE | python3 $SUBWORD_NMT/apply_bpe.py -c $CODE_FILE | \
python3 interactive.py $DATA_DIR/ --path $TRAIN_DIR/ckpt/model.pt \
--beam $BEAM_SIZE \
--source-lang $SRC_LANG \
--target-lang $TGT_LANG \
--num-workers 12 \
--remove-bpe \
--batch-size $DECODER_BS \
--buffer-size $DECODER_BS | grep -P '^H' | cut -f3- | tee $TRAIN_DIR/$FLAG.translation.$TGT_LANG
python3 score.py \
--sys $TRAIN_DIR/$FLAG.translation.$TGT_LANG \
--ref $REF_FILE | tee $RESULT_FILE
else
echo "unknown argment 1"
fi
# vocab sizes for en-vi preprocessing: 17200 (en), 7800 (vi)
- envi
TMP_DIR=data-raw/en-vi-pure
SRC_LANG=en
TGT_LANG=vi
TRAIN_BS=3582
TRAIN_DIR=train/${SRC_LANG}-${TGT_LANG}_fp16
DATA_DIR=data-bin/iwslt_${SRC_LANG}_${TGT_LANG}
result_file=$TRAIN_DIR/${SRC_LANG}-${TGT_LANG}.log
echo 'preprocess ' > $result_file
# python3 preprocess.py \
# --source-lang $SRC_LANG \
# --target-lang $TGT_LANG \
# --trainpref $TMP_DIR/train \
# --validpref $TMP_DIR/dev \
# --testpref $TMP_DIR/test \
# --nwordssrc 17200 \
# --nwordstgt 7800 \
# --workers 12 \
# --destdir $DATA_DIR
mkdir -p $TRAIN_DIR/log $TRAIN_DIR/ckpt
echo 'train ' >> $result_file
python3 train.py $DATA_DIR \
--arch transformer_iwslt_de_en \
--share-decoder-input-output-embed \
--clip-norm 0 --optimizer adam --lr 0.001 \
--source-lang $SRC_LANG --target-lang $TGT_LANG --max-tokens $TRAIN_BS --no-progress-bar \
--log-interval 100 --min-lr '1e-09' --weight-decay 0.0001 \
--criterion label_smoothed_cross_entropy --label-smoothing 0.1 \
--lr-scheduler inverse_sqrt \
--max-update 20000 --warmup-updates 4000 --warmup-init-lr '1e-07' --update-freq 4 \
--adam-betas '(0.9, 0.98)' --keep-last-epochs 10 \
--dropout 0.3 \
--tensorboard-logdir $TRAIN_DIR/log --log-format simple \
--save-dir $TRAIN_DIR/ckpt \
--fp16 2>&1 | tee $TRAIN_DIR/train.log
echo 'average checkpoints' >> $result_file
python3 scripts/average_checkpoints.py \
--inputs $TRAIN_DIR/ckpt \
--num-epoch-checkpoints 10 \
--output $TRAIN_DIR/ckpt/model.pt
echo 'generate' >> $result_file
# python3 generate.py $DATA_DIR/ --path $TRAIN_DIR/ckpt/model.pt \
# --beam 5 \
# --num-workers 12 \
# --batch-size 128 | grep -P '^H' |cut -f3- | tee $TRAIN_DIR/$FLAG.translation.$TGT_LANG
python3 generate.py $DATA_DIR/ --path $TRAIN_DIR/ckpt/model.pt \
--source-lang $SRC_LANG --target-lang $TGT_LANG \
--beam 5 \
--num-workers 12 \
--batch-size 128 2>&1 | tee $TRAIN_DIR/tmp.log
# echo 'score' >> $result_file
# grep ^H $TRAIN_DIR/tmp.log | cut -f3- > $TRAIN_DIR/gen.out.sys
# grep ^T $TRAIN_DIR/tmp.log | cut -f2- > $TRAIN_DIR/gen.out.ref
# python3 score.py --sys $TRAIN_DIR/gen.out.sys --ref $TRAIN_DIR/gen.out.ref | tee $result_file
- Averaging checkpoints (avg) is much better than not averaging; the last checkpoint is better than the best checkpoint.
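A minimal way to check this directly, reusing the variables from the script above and assuming model.pt is the averaged checkpoint written by scripts/average_checkpoints.py:
for CKPT in model.pt checkpoint_best.pt checkpoint_last.pt; do
echo "=== $CKPT ==="
python3 generate.py $DATA_DIR/ --path $TRAIN_DIR/ckpt/$CKPT \
--source-lang $SRC_LANG --target-lang $TGT_LANG \
--beam 5 --batch-size 128 2>&1 | grep -i 'bleu'
done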
- A large bs is crucial: the 1268 test sentences contain 27923 tokens (about 22 tokens per sentence), so it is best to fill GPU memory (see the token-count sketch after the script below).
- The command that produced 30.66:
TMP_DIR=data-raw/en-vi-pure
SRC_LANG=en
TGT_LANG=vi
# TRAIN_BS=3582
TRAIN_BS=10000
MAX_UPDATE=5000
TRAIN_DIR=train/${SRC_LANG}_${TGT_LANG}_fp16
DATA_DIR=data-bin/iwslt_${SRC_LANG}_${TGT_LANG}
result_file=$TRAIN_DIR/result.txt
# python3 preprocess.py \
# --source-lang $SRC_LANG \
# --target-lang $TGT_LANG \
# --trainpref $TMP_DIR/train \
# --validpref $TMP_DIR/dev \
# --testpref $TMP_DIR/test \
# --nwordssrc 17200 \
# --nwordstgt 7800 \
# --workers 12 \
# --destdir $DATA_DIR
mkdir -p $TRAIN_DIR/log $TRAIN_DIR/ckpt
touch $result_file
python3 train.py $DATA_DIR \
--arch transformer_iwslt_de_en \
--share-decoder-input-output-embed \
--clip-norm 0 --optimizer adam --lr 0.001 \
--source-lang $SRC_LANG --target-lang $TGT_LANG --max-tokens $TRAIN_BS --no-progress-bar \
--log-interval 100 --min-lr '1e-09' --weight-decay 0.0001 \
--criterion label_smoothed_cross_entropy --label-smoothing 0.1 \
--lr-scheduler inverse_sqrt \
--max-update $MAX_UPDATE --warmup-updates 4000 --warmup-init-lr '1e-07' --update-freq 4 \
--adam-betas '(0.9, 0.98)' --keep-last-epochs 10 \
--dropout 0.3 \
--tensorboard-logdir $TRAIN_DIR/log --log-format simple \
--save-dir $TRAIN_DIR/ckpt \
--fp16 2>&1 | tee $TRAIN_DIR/train.log
python3 scripts/average_checkpoints.py \
--inputs $TRAIN_DIR/ckpt \
--num-epoch-checkpoints 10 \
--output $TRAIN_DIR/ckpt/model.pt
# python3 generate.py $DATA_DIR/ --path $TRAIN_DIR/ckpt/model.pt \
# --beam 5 \
# --num-workers 12 \
# --batch-size 128 | grep -P '^H' |cut -f3- | tee $TRAIN_DIR/$FLAG.translation.$TGT_LANG
echo 'average 10 checkpoints: ' | tee $result_file
python3 generate.py $DATA_DIR/ --path $TRAIN_DIR/ckpt/model.pt \
--source-lang $SRC_LANG --target-lang $TGT_LANG \
--beam 5 \
--num-workers 12 \
--batch-size 128 2>&1 | tee $TRAIN_DIR/translation.$TGT_LANG | tail -2 | tee -a $result_file
echo "best checkpoint:" | tee -a $result_file
python3 generate.py $DATA_DIR/ --path $TRAIN_DIR/ckpt/checkpoint_best.pt \
--source-lang $SRC_LANG --target-lang $TGT_LANG \
--beam 5 \
--num-workers 12 \
--batch-size 128 2>&1 | tail -2 | tee -a $result_file
echo "last checkpoint:" | tee -a $result_file
python3 generate.py $DATA_DIR/ --path $TRAIN_DIR/ckpt/checkpoint_last.pt \
--source-lang $SRC_LANG --target-lang $TGT_LANG \
--beam 5 \
--num-workers 12 \
--batch-size 128 2>&1 | tail -2 | tee -a $result_file
# grep ^H $TRAIN_DIR/tmp.log | cut -f3- > $TRAIN_DIR/gen.out.sys
# grep ^T $TRAIN_DIR/tmp.log | cut -f2- > $TRAIN_DIR/gen.out.ref
# python3 score.py --sys $TRAIN_DIR/gen.out.sys --ref $TRAIN_DIR/gen.out.ref | tee $result_file
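The sentence/token counts quoted above can be reproduced directly; a small sketch, assuming the tokenized test files live under $TMP_DIR as in the preprocess step:
for L in $SRC_LANG $TGT_LANG; do
echo "$L: $(wc -l < $TMP_DIR/test.$L) sentences, $(wc -w < $TMP_DIR/test.$L) tokens"
done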
- When bs was increased to 25000 (previously there was only overflow 8)
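If fp16 overflow becomes frequent at large batch sizes, fairseq's loss scaling can be tuned; a hedged sketch of the relevant flags (exact names and defaults depend on the fairseq version), to be appended to the --fp16 train.py command above:
# illustrative values only, not taken from the original notes
EXTRA_FP16_FLAGS="--fp16-init-scale 128 --fp16-scale-window 256 --min-loss-scale 1e-4"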
- fp32 is about 0.4 BLEU better than fp16, with the same 10000 updates and a BS of 7000. On the relationship between BS and update count: a larger BS is not always better (see the updates-per-epoch arithmetic after the list below).
- With gradient accumulation (--update-freq) of 4:
  - max-tokens 3582: 70 epochs ≈ 19075 updates; 20000 updates give 30.57 BLEU, training time 3794, 71 epochs, valid loss 4.648
  - max-tokens 10000: 70 epochs ≈ 7000 updates, but 5000 updates are already good enough: 30.66 BLEU, time 2736, 50 epochs, valid loss 4.55
  - max-tokens 25000: 70 epochs ≈ 3196 updates; 2000 updates take 2418 of training time, 44 epochs, valid loss 4.975, only 25.5 BLEU; 3000 updates reach only 28.95, training time 3493, 66 epochs
- With gradient accumulation (--update-freq) of 8:
  - max-tokens 10000: 5000 updates and 3000 updates
  - max-tokens 40000: 70 epochs ≈ 1067 updates; 1000 updates and 1500 updates
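The epoch/update relation above is plain arithmetic: updates per epoch ≈ tokens per epoch / (max-tokens × update-freq), and the runs above imply roughly 4M training tokens per epoch for en-vi (e.g. 19075 / 70 × 3582 × 4 ≈ 3.9M). A quick sketch under that assumption; real runs come out somewhat higher for large batches because of padding and partially filled batches:
TOKENS_PER_EPOCH=4000000   # rough estimate derived from the runs above
for MT in 3582 10000 25000 40000; do
for UF in 4 8; do
echo "max-tokens=$MT update-freq=$UF -> ~$((TOKENS_PER_EPOCH / (MT * UF))) updates/epoch"
done
done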
- max-tokens 40000 is about the limit of a 24GB RTX card: peak memory reaches 23GB and it now sits around 22GB, and peak GPU utilization improves a lot, reaching 98%; with 10000 the peak utilization is only around 84%.
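A simple way to capture those memory/utilization figures during a run, using nvidia-smi's query mode (the 5 s interval is arbitrary):
# log GPU memory and utilization every 5 s in the background while train.py runs
nvidia-smi --query-gpu=timestamp,memory.used,utilization.gpu --format=csv -l 5 \
> $TRAIN_DIR/gpu_usage.csv &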
When the model is under-trained, checkpoint averaging is actually harmful
- Trained for only 43 epochs in total, which is clearly insufficient; with or without averaging, the result is nowhere near the earlier 30.6, and the valid loss of 5.07 is far above the normal 4.6.
fp32 vs fp16
- envi, max-tokens 10000, 5000 updates, update-freq 4: fp32 gives 30.40, fp16 gives 30.66
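The only intended difference between those two runs is the --fp16 flag; a minimal sketch of the comparison, reusing the en-vi settings from the 30.66 script above (the save directories here are illustrative):
for PREC in fp16 fp32; do
if [ "$PREC" = "fp16" ]; then PREC_FLAG="--fp16"; else PREC_FLAG=""; fi
SAVE=train/en_vi_${PREC}
mkdir -p $SAVE/ckpt
python3 train.py $DATA_DIR \
--arch transformer_iwslt_de_en --share-decoder-input-output-embed \
--source-lang $SRC_LANG --target-lang $TGT_LANG \
--clip-norm 0 --optimizer adam --adam-betas '(0.9, 0.98)' --lr 0.001 \
--lr-scheduler inverse_sqrt --warmup-updates 4000 --warmup-init-lr '1e-07' --min-lr '1e-09' \
--criterion label_smoothed_cross_entropy --label-smoothing 0.1 --weight-decay 0.0001 \
--max-tokens 10000 --update-freq 4 --max-update 5000 \
--dropout 0.3 --keep-last-epochs 10 --no-progress-bar \
--save-dir $SAVE/ckpt $PREC_FLAG
done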
- vien