美文网首页生信
文章复现-全外显子数据分析学习2sra转fq文件

文章复现-全外显子数据分析学习2sra转fq文件

作者: jiarf | 来源:发表于2022-04-19 10:04 被阅读0次

    首先做个总结汇总一下


    1.检查自己下载的sra是否是完整的,validate.shu运行这个脚本,还在跑着
    2.制作循环生成fq文件的list-sra2case.txt,运行脚本make_sra2case.sh
    3.sra转fq文件,运行脚本sra2fq.sh


    正式开始

    cd /data1/jiarongf/learning/cancer-WES/run
    vim validate.sh
    cd ../0.sra/
    ## SRA数据验证
    touch validate.out
    ls  SRR* | grep -v "vdb" | xargs vdb-validate - >> validate.out 2>&1
    cat validate.out | grep consistent | wc
    q
    sh validate.sh
    

    这个代码包含操作步骤,比如那个q,
    下面是validate.sh的内容

    cd ../0.sra/
    touch validate.out
    ls  SRR* | grep -v "vdb" | xargs vdb-validate - >> validate.out 2>&1
    cat validate.out | grep consistent | wc
    

    我觉得这块是不是因为教程太久远了,发现做出来和教程不太一样,教程最后的结果


    validate.out

    然而我做出来的


    validate.out
    这个也要跑好久,就先放着把,先往下做
    第二天做完了的
    (/home/jiarongf/my-envs/wes) 15:35:42 jiarongf@172.16.10.223:/data1/jiarongf/learning/cancer-WES/run
    $
    sh validate.sh
         30     210    2580
    
    跑完了也检查了validate.out文件,没什么问题

    制作sra2case.txt

    15:56:28 jiarongf@172.16.10.223:/data1/jiarongf/learning/cancer-WES/run
    $
    cat make_sra2case.sh
    cd ../data
    cat SraRunTable.txt | grep "WXS" | cut -d ',' -f 1 > config
    cat SraRunTable.txt | grep "WXS" | cut -d ',' -f 27 | sed s/_1// > case
    paste -d ' ' config case > sra2case.txt
    
    15:56:31 jiarongf@172.16.10.223:/data1/jiarongf/learning/cancer-WES/run
    $
    sh make_sra2case.sh
    15:57:24 jiarongf@172.16.10.223:/data1/jiarongf/learning/cancer-WES/run
    $
    ls ../data
    case  config  sra2case.txt  SraRunTable.txt  SRR_Acc_List.txt
    

    生成成功

    cat config
    SRR3182418
    SRR3182419
    SRR3182420
    SRR3182421
    SRR3182422
    SRR3182423
    SRR3182424
    SRR3182425
    SRR3182426
    SRR3182427
    SRR3182428
    SRR3182429
    SRR3182430
    SRR3182431
    SRR3182432
    SRR3182433
    SRR3182434
    SRR3182435
    SRR3182436
    SRR3182437
    SRR3182438
    SRR3182439
    SRR3182440
    SRR3182441
    SRR3182442
    SRR3182443
    SRR3182444
    SRR3182445
    SRR3182446
    SRR3182447
    
    cat case
    case5_germline_WES
    case2_germline_WES
    case4_germline_WES
    case3_germline_WES
    case6_germline_WES
    case1_germline_WES
    case4_biorep_A_WES
    case4_biorep_B_techrep_WES
    case4_biorep_C_WES
    case3_biorep_A_WES
    case3_biorep_B_WES
    case3_biorep_C_techrep_WES
    case6_biorep_A_techrep_WES
    case6_biorep_B_WES
    case6_biorep_C_WES
    case1_biorep_A_techrep_WES
    case1_biorep_B_WES
    case1_biorep_C_WES
    case2_biorep_A_WES
    case2_biorep_B_techrep_WES
    case2_biorep_C_WES
    case5_biorep_A_WES
    case5_biorep_B_techrep_WES
    case4_techrep_2_WES
    case3_techrep_2_WES
    case6_techrep_2_WES
    case1_techrep_2_WES
    case2_techrep_2_WES
    case5_techrep_2_WES
    case5_biorep_C_WES
    
    
    cat ../data/sra2case.txt
    SRR3182418 case5_germline_WES
    SRR3182419 case2_germline_WES
    SRR3182420 case4_germline_WES
    SRR3182421 case3_germline_WES
    SRR3182422 case6_germline_WES
    SRR3182423 case1_germline_WES
    SRR3182424 case4_biorep_A_WES
    SRR3182425 case4_biorep_B_techrep_WES
    SRR3182426 case4_biorep_C_WES
    SRR3182427 case3_biorep_A_WES
    SRR3182428 case3_biorep_B_WES
    SRR3182429 case3_biorep_C_techrep_WES
    SRR3182430 case6_biorep_A_techrep_WES
    SRR3182431 case6_biorep_B_WES
    SRR3182432 case6_biorep_C_WES
    SRR3182433 case1_biorep_A_techrep_WES
    SRR3182434 case1_biorep_B_WES
    SRR3182435 case1_biorep_C_WES
    SRR3182436 case2_biorep_A_WES
    SRR3182437 case2_biorep_B_techrep_WES
    SRR3182438 case2_biorep_C_WES
    SRR3182439 case5_biorep_A_WES
    SRR3182440 case5_biorep_B_techrep_WES
    SRR3182441 case4_techrep_2_WES
    SRR3182442 case3_techrep_2_WES
    SRR3182443 case6_techrep_2_WES
    SRR3182444 case1_techrep_2_WES
    SRR3182445 case2_techrep_2_WES
    SRR3182446 case5_techrep_2_WES
    SRR3182447 case5_biorep_C_WES
    
    

    SRA转FQ(fasterq-dump)生成脚本sra2fq.sh

    ## 不建议还使用fastq-dump,速度很慢,但是fasterq-dump的参数不如fastq-dump丰富,没有“-A”参数,所以需要自己修改name,建议最后使用pigz压缩,fasterq-dump和pigz都可以设置多线程
    
    touch sra2fq.log
    
    ## sra2fq.sh
    cat ../sra2case.txt | while read id
    do
       arr=(${id})
       sample=${arr[0]}.sra
       case=${arr[1]}
       echo "time fasterq-dump -p -x -3 -N -P -f --skip-technical -e 16 ${sample} -O . >> sra2fq.log 2>&1"
       echo "sed s/${sample}/${case}/ ${sample}_1.fastq > ${case}_1.fastq "
       echo "pigz -p 16 ${case}_1.fastq"
       echo "sed s/${sample}/${case}/ ${sample}_2.fastq > ${case}_2.fastq "
       echo "pigz -p 16 -f ${case}_2.fastq"
    done > sra2fq.sh
    
    nohup bash sra2fq.sh &
    

    上述是教程的脚本
    因为报错了
    进行小幅度更改
    现在我的目录是/data1/jiarongf/learning/cancer-WES/run
    因为我所有运行的脚本都在这里放着

    touch sra2fq.log
    cd /data1/jiarongf/learning/cancer-WES/0.sra
    ## sra2fq.sh
    cat ../data/sra2case.txt | while read id
    do
       arr=(${id})
       sample=${arr[0]}.sra
       case=${arr[1]}
       echo "fasterq-dump"
       time fasterq-dump -p -x -3 -N -P -f --skip-technical -e 16 /data1/jiarongf/learning/cancer-WES/0.sra/${sample} -O /data1/jiarongf/learning/cancer-WES/0.sra/ >> sra2fq.log 2>&1
       echo "sed s/${sample}/${case}/ ${sample}_1.fastq > ${case}_1.fastq "
       sed s/${sample}/${case}/ ${sample}_1.fastq > ${case}_1.fastq
       echo "pigz -p 16 ${case}_1.fastq"
       pigz -p 16 ${case}_1.fastq
       echo "sed s/${sample}/${case}/ ${sample}_2.fastq > ${case}_2.fastq "
       sed s/${sample}/${case}/ ${sample}_2.fastq > ${case}_2.fastq
       echo "pigz -p 16 -f ${case}_2.fastq"
       pigz -p 16 -f ${case}_2.fastq
    done
    

    上面的代码是sra2fq.sh
    直接运行,需要开后台,不然跑的很慢

    17:04:03 jiarongf@172.16.10.223:/data1/jiarongf/learning/cancer-WES/run
    $
    screen -ls
    There are screens on:
            32942.wes_fq    (04/18/2022 04:04:49 PM)        (Attached)
            11537.wes       (04/18/2022 03:35:08 PM)        (Detached)
    2 Sockets in /run/screen/S-jiarongf.
    17:06:03 jiarongf@172.16.10.223:/data1/jiarongf/learning/cancer-WES/run
    $
    source activate wes
    (/home/jiarongf/my-envs/wes) 17:06:17 jiarongf@172.16.10.223:/data1/jiarongf/learning/cancer-WES/run
    $
    l
    creat-wes-envs.sh    download-aspera.sh                     ibm-aspera-connect_4.1.3.93_linux.tar.gz  prefetch.sh  sra2fq.sh      validate.sh
    download-aspera.log  ibm-aspera-connect_4.1.3.93_linux.sh*  make_sra2case.sh                          sra2fq.log   sra2fq.sh.log
    (/home/jiarongf/my-envs/wes) 17:06:21 jiarongf@172.16.10.223:/data1/jiarongf/learning/cancer-WES/run
    $
    bash sra2fq.sh
    fasterq-dump
    

    不知道能不能跑成功,先试试看,后面慢慢改
    跑成功了,是这样的


    0.sra
    (/home/jiarongf/my-envs/wes) 10:03:46 jiarongf@172.16.10.223:/data1/jiarongf/learning/cancer-WES/0.sra
    $
    ls *.gz
    case1_biorep_A_techrep_WES_1.fastq.gz  case4_biorep_A_WES_1.fastq.gz
    case1_biorep_A_techrep_WES_2.fastq.gz  case4_biorep_A_WES_2.fastq.gz
    case1_biorep_B_WES_1.fastq.gz          case4_biorep_B_techrep_WES_1.fastq.gz
    case1_biorep_B_WES_2.fastq.gz          case4_biorep_B_techrep_WES_2.fastq.gz
    case1_biorep_C_WES_1.fastq.gz          case4_biorep_C_WES_1.fastq.gz
    case1_biorep_C_WES_2.fastq.gz          case4_biorep_C_WES_2.fastq.gz
    case1_germline_WES_1.fastq.gz          case4_germline_WES_1.fastq.gz
    case1_germline_WES_2.fastq.gz          case4_germline_WES_2.fastq.gz
    case1_techrep_2_WES_1.fastq.gz         case4_techrep_2_WES_1.fastq.gz
    case1_techrep_2_WES_2.fastq.gz         case4_techrep_2_WES_2.fastq.gz
    case2_biorep_A_WES_1.fastq.gz          case5_biorep_A_WES_1.fastq.gz
    case2_biorep_A_WES_2.fastq.gz          case5_biorep_A_WES_2.fastq.gz
    case2_biorep_B_techrep_WES_1.fastq.gz  case5_biorep_B_techrep_WES_1.fastq.gz
    case2_biorep_B_techrep_WES_2.fastq.gz  case5_biorep_B_techrep_WES_2.fastq.gz
    case2_biorep_C_WES_1.fastq.gz          case5_biorep_C_WES_1.fastq.gz
    case2_biorep_C_WES_2.fastq.gz          case5_biorep_C_WES_2.fastq.gz
    case2_germline_WES_1.fastq.gz          case5_germline_WES_1.fastq.gz
    case2_germline_WES_2.fastq.gz          case5_germline_WES_2.fastq.gz
    case2_techrep_2_WES_1.fastq.gz         case5_techrep_2_WES_1.fastq.gz
    case2_techrep_2_WES_2.fastq.gz         case5_techrep_2_WES_2.fastq.gz
    case3_biorep_A_WES_1.fastq.gz          case6_biorep_A_techrep_WES_1.fastq.gz
    case3_biorep_A_WES_2.fastq.gz          case6_biorep_A_techrep_WES_2.fastq.gz
    case3_biorep_B_WES_1.fastq.gz          case6_biorep_B_WES_1.fastq.gz
    case3_biorep_B_WES_2.fastq.gz          case6_biorep_B_WES_2.fastq.gz
    case3_biorep_C_techrep_WES_1.fastq.gz  case6_biorep_C_WES_1.fastq.gz
    case3_biorep_C_techrep_WES_2.fastq.gz  case6_biorep_C_WES_2.fastq.gz
    case3_germline_WES_1.fastq.gz          case6_germline_WES_1.fastq.gz
    case3_germline_WES_2.fastq.gz          case6_germline_WES_2.fastq.gz
    case3_techrep_2_WES_1.fastq.gz         case6_techrep_2_WES_1.fastq.gz
    case3_techrep_2_WES_2.fastq.gz         case6_techrep_2_WES_2.fastq.gz
    

    相关文章

      网友评论

        本文标题:文章复现-全外显子数据分析学习2sra转fq文件

        本文链接:https://www.haomeiwen.com/subject/mhitertx.html