WDL学习笔记

作者: 生信摆渡 | 来源:发表于2019-12-16 18:13 被阅读0次

    官网是用来学习的最好的地方(然而我也没怎么看...)!

    WDL是什么?

    • WDL是由Broad Institute开发的一种流程开发语言,全称workflow description language

    WDL编辑器

    使用Sublime Txet就可以,安装WDL插件就可以漂亮地显示WDL代码了。

    一个WDL代码示例

    这是一个我老板用来跑RRBS的pipeline,以此来学习。

    workflow bismarkRRBS {
    
        String file_basename
        String masterFolder
        File Fastq_R1
        File? Fastq_R2
        Boolean paired
        Boolean MspI
        Int clip_R1
        Int? clip_R2
        Int three_prime_clip_R1
        Int? three_prime_clip_R2
        String genome_folder
        Int threadN
    
        String targetFolder = masterFolder + "/" + file_basename + "/"
    
        call makeDir as md1 {
            input:
                targetFolder = targetFolder
        }
    
        call makeDir as md2 {
            input:
                targetFolder = md1.outFolder + "report"
        }
    
        call trim_galore {
            input:
                Fastq_R1 = Fastq_R1,
                Fastq_R2 = Fastq_R2,
                clip_R1  = clip_R1,
                clip_R2  = clip_R2,
                three_prime_clip_R1 = three_prime_clip_R1,
                three_prime_clip_R2 = three_prime_clip_R2,
                paired   = paired,
                MspI     = MspI,
                targetFolder = md1.outFolder,
                reportFolder = md2.outFolder,
                file_basename = file_basename
        }
    
        call bismark {
            input:
                R1_trimmed = trim_galore.R1_trimmed,
                R2_trimmed = trim_galore.R2_trimmed,
                paired  = paired,
                threadN = threadN,
                file_basename = file_basename,
                genome_folder = genome_folder,
                targetFolder  = md1.outFolder,
                reportFolder  = md2.outFolder
        }
    
        call processBam {
            input:
                file_basename = file_basename,
                alignedBam    = bismark.alignedBam,
                genome_folder = genome_folder,
                alignedReport = bismark.alignedReport,
                paired  = paired,
                threadN = threadN,
                targetFolder = md1.outFolder,
                reportFolder = md2.outFolder
        }
    
        output {
    
            File DedupBam = processBam.DedupBam
            File DedupBam_index = processBam.DedupBam_index
        }
    
    
    }
    
    
    task trim_galore {
    
        File Fastq_R1
        File? Fastq_R2
        Boolean paired
        Boolean MspI
        Int clip_R1
        Int? clip_R2
        Int three_prime_clip_R1
        Int? three_prime_clip_R2
        String targetFolder
        String reportFolder
        String file_basename
    
        String s1 = if paired then "--paired" else ""
        String s2 = s1 + if (clip_R1 > 0) then " --clip_R1 " + clip_R1 else ""
        String s3 = s2 + if (three_prime_clip_R1 > 0) then " --three_prime_clip_R1 " + three_prime_clip_R1 else ""
        String s4 = s3 + if(paired && clip_R2 > 0) then " --clip_R2 " + clip_R2 else ""
        String s5 = s4 + if(MspI) then " --rrbs " else ""
        String commandText = s5 + if(paired && three_prime_clip_R2 > 0) then " --three_prime_clip_R2 " + three_prime_clip_R2 else ""
    
        command {
    
            mkdir trimmed 
            trim_galore ${commandText} -o trimmed --basename ${file_basename} ${Fastq_R1} ${Fastq_R2}
    
            cp trimmed/*report.txt ${reportFolder}
        }
    
        output{
            File R1_trimmed = if paired then "trimmed/${file_basename}_R1_val_1.fq.gz" else "trimmed/${file_basename}_trimmed.fq.gz"
            File R2_trimmed = if paired then "trimmed/${file_basename}_R2_val_2.fq.gz" else "/dev/null"
        }
    }
    
    task bismark {
    
        File R1_trimmed
        File? R2_trimmed
        Boolean paired
        Int threadN
        String file_basename
        String genome_folder
        String targetFolder
        String reportFolder
    
        String commandText = if paired then  " -1 " + R1_trimmed + " -2 " + R2_trimmed else R1_trimmed
    
        command {
    
            bismark --genome_folder ${genome_folder} \
                ${commandText} \
                -p ${threadN} \
                --rg_id ${file_basename} \
                --rg_sample ${file_basename} \
                --basename ${file_basename} \
                --temp_dir temp_dir \
                -o aligned
    
            cp aligned/*report.txt ${reportFolder}
        }
    
        output {
    
            File alignedBam = glob("aligned/*.bam")[0]
            File alignedReport = glob("aligned/*report.txt")[0]
        }
    }
    
    task processBam {
    
        File alignedBam
        File alignedReport
        File genome_folder
        String file_basename
        Boolean paired
        Int threadN
        String targetFolder
        String reportFolder
    
        String commandText  = if paired then "-p" else "-s"
    
        command {
    
            mkdir Dedup
            mkdir Call
    
            sambamba sort --natural-sort -m 8GB --tmpdir temp_dir -t ${threadN} -o ${file_basename}_nSorted.bam ${alignedBam}
            
            bismark_methylation_extractor ${commandText} --comprehensive --bedGraph --gzip --genome_folder ${genome_folder} \
                --multicore ${threadN} \
                -o Call ${file_basename}_nSorted.bam
    
            mv Call/*splitting_report.txt ${file_basename}_splitting_report.txt
            mv Call/*M-bias.txt ${file_basename}_M-bias.txt
    
            bismark2report --alignment_report ${alignedReport} \
                --mbias_report ${file_basename}_M-bias.txt \
                --splitting_report ${file_basename}_splitting_report.txt \
                -o ${file_basename}_report.html
    
            sambamba sort -m 8GB --tmpdir temp_dir -t ${threadN} -o ${file_basename}.bam ${file_basename}_nSorted.bam
    
            cp ${file_basename}.bam ${targetFolder}
            cp ${file_basename}.bam.bai ${targetFolder}
            cp ${file_basename}*.txt ${reportFolder}
            cp ${file_basename}_report.html ${targetFolder}
            cp -R Call ${targetFolder}
        }
    
        output {
    
            File DedupBam = "${file_basename}.bam"
            File DedupBam_index = "${file_basename}.bam.bai"
        }
    }
    
    task makeDir {
    
        String targetFolder
    
        command {
            
            if [ ! -d $targetFolder ] ; then
                mkdir ${targetFolder}
            else
                rm -rf ${targetFolder}
                mkdir ${targetFolder}
            fi
        }
    
        output {
            String outFolder = "${targetFolder}"
        }
    }
    

    WDL使用流程

    WDL 脚本核心结构包括 5 个基本组件:workflow, task, call, command 以及 outputworkflow 即是描述整个流程的框架,其中调用( call )了不同的 task 。每一个 task 中又定义了各自的命令 command 和输出结果 output,位于workflow 外部。通过json文件传入参数。

    结构概览:

    # workflow部分
    workflow your_workflow_name {
        String var1
        Int var2
        File var3
        ...
        
        call task_name1 {
            input:
                var1 = ...
                ...
        }
        
        ...
        
        output {
            ...
        }
    }
    
    # task部分
    task task_name1 {
        String var1
        Int var2
        File var3
        ...
        
        command {
            ...
        }
        
        output{
        ...
        {
    {
    
    ...
    

    编写workflow部分

    这一部分就是编写你的整个流程,包括声明变量、调用(call)任务(task),定义输出等部分。

    语法结构为:

    workflow workflow_name {
        var_type1 var1
        var_type2 var2
        ...
        
        call task1 {
            input:
                ...
        }
        
        call task1 {
            input:
                ...
        }
        
        output {
            File ...
        # 其实不太理解workflow部分的output有什么作用
    }
    

    在 WDL 中共有五种基本变量类型:

    • String 表示字符串
    • Float 为浮点数
    • Int 为整数
    • Boolean 为布尔型
    • File 表示文件

    创建变量时,必须声明变量类型:String str1 = "hello"

    另外,WDL 还支持复合类型:

    • Array 为数组
    • Map 为字典
    • Object 为对象

    有些声明加上了一个 ? ,这表示该变量为可选变量。

    使用变量:${var_name}

    字符可以使用+直接相加,合并后的字符之间无空格,相对应R语言的paste0()函数

    if语句:if xxx then xxx else,可读性很高

    按照示例代码解释一遍:

    # 创建名为bismarkRRBS的workflow
    workflow bismarkRRBS {
    
    # 声明所有要使用的变量
        String file_basename
        String masterFolder
        File Fastq_R1
        File? Fastq_R2
        Boolean paired
        Boolean MspI
        Int clip_R1
        Int? clip_R2
        Int three_prime_clip_R1
        Int? three_prime_clip_R2
        String genome_folder
        Int threadN
    
        String targetFolder = masterFolder + "/" + file_basename + "/"
    
    # 调用任务。要读懂call部分,还是要先看定义的task部分
        call makeDir as md1 {
            input:
                targetFolder = targetFolder
        }
    
        call makeDir as md2 {
            input:
                targetFolder = md1.outFolder + "report"
        }
    
        call trim_galore {
            input:
                Fastq_R1 = Fastq_R1,
                Fastq_R2 = Fastq_R2,
                clip_R1  = clip_R1,
                clip_R2  = clip_R2,
                three_prime_clip_R1 = three_prime_clip_R1,
                three_prime_clip_R2 = three_prime_clip_R2,
                paired   = paired,
                MspI     = MspI,
                targetFolder = md1.outFolder,
                reportFolder = md2.outFolder,
                file_basename = file_basename
        }
    
        call bismark {
            input:
                R1_trimmed = trim_galore.R1_trimmed,
                R2_trimmed = trim_galore.R2_trimmed,
                paired  = paired,
                threadN = threadN,
                file_basename = file_basename,
                genome_folder = genome_folder,
                targetFolder  = md1.outFolder,
                reportFolder  = md2.outFolder
        }
    
        call processBam {
            input:
                file_basename = file_basename,
                alignedBam    = bismark.alignedBam,
                genome_folder = genome_folder,
                alignedReport = bismark.alignedReport,
                paired  = paired,
                threadN = threadN,
                targetFolder = md1.outFolder,
                reportFolder = md2.outFolder
        }
    
        output {
    
            File DedupBam = processBam.DedupBam
            File DedupBam_index = processBam.DedupBam_index
        }
    
    }
    

    编写task部分

    task部分语法:

    task task_name {
        var_type1 var1
        ...
        
        command {
            ...
        }
        output {
            ...
        }
    }       
    

    task的output部分可以被后面的task所调用,不用output声明则不能被调用。

    以结构最简单的创建目录task为例:

    # 创建名为makeDir的task
    task makeDir {
    
    # 声明该task中要使用的变量
        String targetFolder
    # 编写该task的命令
        command {
            
            if [ ! -d $targetFolder ] ; then
                mkdir ${targetFolder}
            else
                rm -rf ${targetFolder}
                mkdir ${targetFolder}
            fi
        }
        
    # 该task的定义了名为outFolder内容为"${targetFolder}"的一个变量
        output {
            String outFolder = "${targetFolder}"
        }
    }
    

    创建json文件

    编写好的wdl脚本只是一个pipeline,没有具体(specific)的参数。所用使用json文件进行传参。json文件以.json为后缀,其格式为:

    {
        “workflow_name.var1":"specific_var1",
        “workflow_name.var1":"specific_var2",
        ...
    }
    

    需要注意的是,除了最后一行不需要加逗号,其余行必须加逗号。

    创建json文件的代码:

    # 创建包含配置信息的list
    xList = list(var1 = ..., 
                 var2 = ...,
                 var3 = ...,
                 ...)
    
    # 编写writeJSON函数
    writeJSON <- function(xList, mainTag, fileOut){
    
        names(xList) = paste0(mainTag, ".", names(xList))
        NM  = names(xList)
        OUT = file(fileOut, "w")
        cat("{\n", file = OUT, append = FALSE, sep = "")
    
        for(i in 1:length(xList)){
    
            value = xList[[i]]
            if(i < length(xList))
                cat("\t\"", NM[i], "\":\"", value, "\",\n", file=OUT, append = TRUE, sep = "")
            else
                cat("\t\"", NM[i], "\":\"", value, "\"\n", file=OUT, append = TRUE, sep = "")
        }
        cat("}\n", file = OUT, append = TRUE, sep = "")
    
        close(OUT)
    }
    
    JSON = paste0("JSON/", Tag, ".json")
    writeJSON(xList, "workflow_name", JSON)
    

    Tag指的是数据样本的Tag。

    writeJSON函数可以写到另外一个文件中,然后再source调用比较方便。

    运行WDL脚本

    不需要安装,只需要用调用java包即可。

    wget https://github.com/broadinstitute/cromwell/releases/download/47/cromwell-47.jar
    java -jar cromwell-47.jar run test.wdl --inputs test.json 
    

    递交任务

    通过使用qsub命令来对资源使用限量进行设置,并重定向stderr和stdout。

    cmd = "java -jar cromwell-47.jar run test.wdl --inputs test.json"
    cmd = paste0("echo ", "\"", cmd, "\"")
    cmd = paste(cmd, "| qsub -N", jobName)
    cmd = paste(cmd, "-l h_cpu=720:00:00 -V -m n -cwd -pe smp", Ncpu)
    cmd = paste0(cmd, " -l h_vmem=", Memory, "G")
    cmd = paste(cmd, "-o", outFile, "-e", errFile)
    systerm(cmd)
    

    同样可以写成一个函数,source调用。

    多样本处理

    以上讲的是单个或单组样本的workflow,但一般情况下,会得到多个raw_data,这样就需要循环上面的workflow。写个for循环即可,从1样本的数量循环,提取每个样本的Tag,并以此作为jobName和要保存的文件的文件夹名。

    相关文章

      网友评论

        本文标题:WDL学习笔记

        本文链接:https://www.haomeiwen.com/subject/nhbrnctx.html