首先做个总结汇总一下
1.检查自己下载的sra是否是完整的,validate.shu
运行这个脚本,还在跑着
2.制作循环生成fq文件的list-sra2case.txt,运行脚本make_sra2case.sh
3.sra转fq文件,运行脚本sra2fq.sh
正式开始
cd /data1/jiarongf/learning/cancer-WES/run
vim validate.sh
cd ../0.sra/
## SRA数据验证
touch validate.out
ls SRR* | grep -v "vdb" | xargs vdb-validate - >> validate.out 2>&1
cat validate.out | grep consistent | wc
q
sh validate.sh
这个代码包含操作步骤,比如那个q,
下面是validate.sh
的内容
cd ../0.sra/
touch validate.out
ls SRR* | grep -v "vdb" | xargs vdb-validate - >> validate.out 2>&1
cat validate.out | grep consistent | wc
我觉得这块是不是因为教程太久远了,发现做出来和教程不太一样,教程最后的结果
validate.out
然而我做出来的
validate.out
这个也要跑好久,就先放着把,先往下做
第二天做完了的
(/home/jiarongf/my-envs/wes) 15:35:42 jiarongf@172.16.10.223:/data1/jiarongf/learning/cancer-WES/run
$
sh validate.sh
30 210 2580
跑完了也检查了validate.out文件,没什么问题
制作sra2case.txt
15:56:28 jiarongf@172.16.10.223:/data1/jiarongf/learning/cancer-WES/run
$
cat make_sra2case.sh
cd ../data
cat SraRunTable.txt | grep "WXS" | cut -d ',' -f 1 > config
cat SraRunTable.txt | grep "WXS" | cut -d ',' -f 27 | sed s/_1// > case
paste -d ' ' config case > sra2case.txt
15:56:31 jiarongf@172.16.10.223:/data1/jiarongf/learning/cancer-WES/run
$
sh make_sra2case.sh
15:57:24 jiarongf@172.16.10.223:/data1/jiarongf/learning/cancer-WES/run
$
ls ../data
case config sra2case.txt SraRunTable.txt SRR_Acc_List.txt
生成成功
cat config
SRR3182418
SRR3182419
SRR3182420
SRR3182421
SRR3182422
SRR3182423
SRR3182424
SRR3182425
SRR3182426
SRR3182427
SRR3182428
SRR3182429
SRR3182430
SRR3182431
SRR3182432
SRR3182433
SRR3182434
SRR3182435
SRR3182436
SRR3182437
SRR3182438
SRR3182439
SRR3182440
SRR3182441
SRR3182442
SRR3182443
SRR3182444
SRR3182445
SRR3182446
SRR3182447
cat case
case5_germline_WES
case2_germline_WES
case4_germline_WES
case3_germline_WES
case6_germline_WES
case1_germline_WES
case4_biorep_A_WES
case4_biorep_B_techrep_WES
case4_biorep_C_WES
case3_biorep_A_WES
case3_biorep_B_WES
case3_biorep_C_techrep_WES
case6_biorep_A_techrep_WES
case6_biorep_B_WES
case6_biorep_C_WES
case1_biorep_A_techrep_WES
case1_biorep_B_WES
case1_biorep_C_WES
case2_biorep_A_WES
case2_biorep_B_techrep_WES
case2_biorep_C_WES
case5_biorep_A_WES
case5_biorep_B_techrep_WES
case4_techrep_2_WES
case3_techrep_2_WES
case6_techrep_2_WES
case1_techrep_2_WES
case2_techrep_2_WES
case5_techrep_2_WES
case5_biorep_C_WES
cat ../data/sra2case.txt
SRR3182418 case5_germline_WES
SRR3182419 case2_germline_WES
SRR3182420 case4_germline_WES
SRR3182421 case3_germline_WES
SRR3182422 case6_germline_WES
SRR3182423 case1_germline_WES
SRR3182424 case4_biorep_A_WES
SRR3182425 case4_biorep_B_techrep_WES
SRR3182426 case4_biorep_C_WES
SRR3182427 case3_biorep_A_WES
SRR3182428 case3_biorep_B_WES
SRR3182429 case3_biorep_C_techrep_WES
SRR3182430 case6_biorep_A_techrep_WES
SRR3182431 case6_biorep_B_WES
SRR3182432 case6_biorep_C_WES
SRR3182433 case1_biorep_A_techrep_WES
SRR3182434 case1_biorep_B_WES
SRR3182435 case1_biorep_C_WES
SRR3182436 case2_biorep_A_WES
SRR3182437 case2_biorep_B_techrep_WES
SRR3182438 case2_biorep_C_WES
SRR3182439 case5_biorep_A_WES
SRR3182440 case5_biorep_B_techrep_WES
SRR3182441 case4_techrep_2_WES
SRR3182442 case3_techrep_2_WES
SRR3182443 case6_techrep_2_WES
SRR3182444 case1_techrep_2_WES
SRR3182445 case2_techrep_2_WES
SRR3182446 case5_techrep_2_WES
SRR3182447 case5_biorep_C_WES
SRA转FQ(fasterq-dump)生成脚本sra2fq.sh
## 不建议还使用fastq-dump,速度很慢,但是fasterq-dump的参数不如fastq-dump丰富,没有“-A”参数,所以需要自己修改name,建议最后使用pigz压缩,fasterq-dump和pigz都可以设置多线程
touch sra2fq.log
## sra2fq.sh
cat ../sra2case.txt | while read id
do
arr=(${id})
sample=${arr[0]}.sra
case=${arr[1]}
echo "time fasterq-dump -p -x -3 -N -P -f --skip-technical -e 16 ${sample} -O . >> sra2fq.log 2>&1"
echo "sed s/${sample}/${case}/ ${sample}_1.fastq > ${case}_1.fastq "
echo "pigz -p 16 ${case}_1.fastq"
echo "sed s/${sample}/${case}/ ${sample}_2.fastq > ${case}_2.fastq "
echo "pigz -p 16 -f ${case}_2.fastq"
done > sra2fq.sh
nohup bash sra2fq.sh &
上述是教程的脚本
因为报错了
进行小幅度更改
现在我的目录是/data1/jiarongf/learning/cancer-WES/run
因为我所有运行的脚本都在这里放着
touch sra2fq.log
cd /data1/jiarongf/learning/cancer-WES/0.sra
## sra2fq.sh
cat ../data/sra2case.txt | while read id
do
arr=(${id})
sample=${arr[0]}.sra
case=${arr[1]}
echo "fasterq-dump"
time fasterq-dump -p -x -3 -N -P -f --skip-technical -e 16 /data1/jiarongf/learning/cancer-WES/0.sra/${sample} -O /data1/jiarongf/learning/cancer-WES/0.sra/ >> sra2fq.log 2>&1
echo "sed s/${sample}/${case}/ ${sample}_1.fastq > ${case}_1.fastq "
sed s/${sample}/${case}/ ${sample}_1.fastq > ${case}_1.fastq
echo "pigz -p 16 ${case}_1.fastq"
pigz -p 16 ${case}_1.fastq
echo "sed s/${sample}/${case}/ ${sample}_2.fastq > ${case}_2.fastq "
sed s/${sample}/${case}/ ${sample}_2.fastq > ${case}_2.fastq
echo "pigz -p 16 -f ${case}_2.fastq"
pigz -p 16 -f ${case}_2.fastq
done
上面的代码是sra2fq.sh
的
直接运行,需要开后台,不然跑的很慢
17:04:03 jiarongf@172.16.10.223:/data1/jiarongf/learning/cancer-WES/run
$
screen -ls
There are screens on:
32942.wes_fq (04/18/2022 04:04:49 PM) (Attached)
11537.wes (04/18/2022 03:35:08 PM) (Detached)
2 Sockets in /run/screen/S-jiarongf.
17:06:03 jiarongf@172.16.10.223:/data1/jiarongf/learning/cancer-WES/run
$
source activate wes
(/home/jiarongf/my-envs/wes) 17:06:17 jiarongf@172.16.10.223:/data1/jiarongf/learning/cancer-WES/run
$
l
creat-wes-envs.sh download-aspera.sh ibm-aspera-connect_4.1.3.93_linux.tar.gz prefetch.sh sra2fq.sh validate.sh
download-aspera.log ibm-aspera-connect_4.1.3.93_linux.sh* make_sra2case.sh sra2fq.log sra2fq.sh.log
(/home/jiarongf/my-envs/wes) 17:06:21 jiarongf@172.16.10.223:/data1/jiarongf/learning/cancer-WES/run
$
bash sra2fq.sh
fasterq-dump
不知道能不能跑成功,先试试看,后面慢慢改
跑成功了,是这样的
0.sra
(/home/jiarongf/my-envs/wes) 10:03:46 jiarongf@172.16.10.223:/data1/jiarongf/learning/cancer-WES/0.sra
$
ls *.gz
case1_biorep_A_techrep_WES_1.fastq.gz case4_biorep_A_WES_1.fastq.gz
case1_biorep_A_techrep_WES_2.fastq.gz case4_biorep_A_WES_2.fastq.gz
case1_biorep_B_WES_1.fastq.gz case4_biorep_B_techrep_WES_1.fastq.gz
case1_biorep_B_WES_2.fastq.gz case4_biorep_B_techrep_WES_2.fastq.gz
case1_biorep_C_WES_1.fastq.gz case4_biorep_C_WES_1.fastq.gz
case1_biorep_C_WES_2.fastq.gz case4_biorep_C_WES_2.fastq.gz
case1_germline_WES_1.fastq.gz case4_germline_WES_1.fastq.gz
case1_germline_WES_2.fastq.gz case4_germline_WES_2.fastq.gz
case1_techrep_2_WES_1.fastq.gz case4_techrep_2_WES_1.fastq.gz
case1_techrep_2_WES_2.fastq.gz case4_techrep_2_WES_2.fastq.gz
case2_biorep_A_WES_1.fastq.gz case5_biorep_A_WES_1.fastq.gz
case2_biorep_A_WES_2.fastq.gz case5_biorep_A_WES_2.fastq.gz
case2_biorep_B_techrep_WES_1.fastq.gz case5_biorep_B_techrep_WES_1.fastq.gz
case2_biorep_B_techrep_WES_2.fastq.gz case5_biorep_B_techrep_WES_2.fastq.gz
case2_biorep_C_WES_1.fastq.gz case5_biorep_C_WES_1.fastq.gz
case2_biorep_C_WES_2.fastq.gz case5_biorep_C_WES_2.fastq.gz
case2_germline_WES_1.fastq.gz case5_germline_WES_1.fastq.gz
case2_germline_WES_2.fastq.gz case5_germline_WES_2.fastq.gz
case2_techrep_2_WES_1.fastq.gz case5_techrep_2_WES_1.fastq.gz
case2_techrep_2_WES_2.fastq.gz case5_techrep_2_WES_2.fastq.gz
case3_biorep_A_WES_1.fastq.gz case6_biorep_A_techrep_WES_1.fastq.gz
case3_biorep_A_WES_2.fastq.gz case6_biorep_A_techrep_WES_2.fastq.gz
case3_biorep_B_WES_1.fastq.gz case6_biorep_B_WES_1.fastq.gz
case3_biorep_B_WES_2.fastq.gz case6_biorep_B_WES_2.fastq.gz
case3_biorep_C_techrep_WES_1.fastq.gz case6_biorep_C_WES_1.fastq.gz
case3_biorep_C_techrep_WES_2.fastq.gz case6_biorep_C_WES_2.fastq.gz
case3_germline_WES_1.fastq.gz case6_germline_WES_1.fastq.gz
case3_germline_WES_2.fastq.gz case6_germline_WES_2.fastq.gz
case3_techrep_2_WES_1.fastq.gz case6_techrep_2_WES_1.fastq.gz
case3_techrep_2_WES_2.fastq.gz case6_techrep_2_WES_2.fastq.gz
网友评论