Hisat2+Stringtie+DESeq2 workflow

作者: 547可是贼帅的547 | 来源:发表于2020-01-18 14:41 被阅读0次

Hisat2+Stringtie+DESeq2 workflow
Workflow
AWS SWF
比特币地址查询
J2EE VS Spring website workflow
2020-11-10 Workflow can't be tri
【iOS】效率提升工具--Workflow
flowable 流程图的 Vue 库
【UiPath Studio 中文文档】26 参数 - 使用参数
【UiPath 问题 02 】WorkFlow File 参数的

前言

因为以前的分析流程要在Shell 和 R 之间切换，然后可控性还差，有点烦，正好最近有点数据要分析，干脆就重新建立一个 all in one 的流程，方便以后使用。

运行环境

Ubuntu 18.04
R 3.6.1

用到的软件

fastp
bowtie2
hisat2
stringtie
DESeq2

Refference data

rRNA index:用bowtie2去除fastq文件中的rRNA
tRNA index:http://gtrnadb.ucsc.edu/genomes/eukaryota/Hsapi38/hg38-tRNAs.fa
Hisat2 index:https://cloud.biohpc.swmed.edu/index.php/s/grch38_tran/download
GRCh38.96.gtf:ftp://ftp.ensembl.org/pub/release-96/gtf/homo_sapiens/Homo_sapiens.GRCh38.96.gtf.gz
Homo_sapiens.GRCh38.dna.primary_assembly.fa:ftp://ftp.ensembl.org/pub/release-96/fasta/homo_sapiens/dna/Homo_sapiens.GRCh38.dna_sm.primary_assembly.fa.gz

代码

############## Functions #####################
#Hisat2 参数
grch38_tran_index = "/home/zhou/RNASEQ/index/grch38_tran/genome_tran"
rRNA_index = "/home/zhou/RNASEQ/index/rRNA_index/human_rRNA"
tRNA_index = "/home/zhou/RNASEQ/index/tRNA_index_hg38/hg38_tRNA"
GRCh38.96.gtf = "/home/zhou/RNASEQ/index/grch38_tran/Homo_sapiens.GRCh38.96.gtf"
fasta = "/home/zhou/RNASEQ/index/grch38_tran/Homo_sapiens.GRCh38.dna.primary_assembly.fa"

#fastp preprocess
##Single version
fastp_SE <- function(input_fastq = input_fastq) {
  fastp_PATH = "/home/zhou/anaconda2/bin/fastp"
  output_fastq = paste0(gsub(pattern = "\\.fastq\\.gz",replacement = "",x = input_fastq),"_fastpedited.fastq.gz")
  fastp_cmd = paste(fastp_PATH, "-i", input_fastq, "-o", output_fastq, "-w 16", "-z 9")
  system(fastp_cmd,wait = T)
  return(output_fastq)
}
##Paired version
fastp_PE <- function(input_fastq_F = input_fastq_F, input_fastq_R = input_fastq_R) {
  fastp_PATH = "/home/zhou/anaconda2/bin/fastp"
  output_fastq_F = paste0(gsub(pattern = "\\.fq\\.gz",replacement = "",x = input_fastq_F),"_fastpedited.fastq.gz")
  output_fastq_R = paste0(gsub(pattern = "\\.fq\\.gz",replacement = "",x = input_fastq_R),"_fastpedited.fastq.gz")
  fastp_cmd = paste(fastp_PATH, "-i", input_fastq_F, "-o", output_fastq_F, "-I", input_fastq_R, "-O", output_fastq_R, "-w 16", "-z 9")
  system(fastp_cmd,wait = T)
  return(c(output_fastq_F,output_fastq_R))
}

#rRNA remove
##Single version
remove_rRNA_SE <- function(input_fastq = input_fastq, index = index) {
  bowtie2_PATH = "/home/zhou/anaconda2/bin/bowtie2"
  output_fastq = paste0(gsub(pattern = "\\.fastq\\.gz",replacement = "",x = input_fastq),"_rmrRNA.fastq.gz")
  bowtie2_cmd = paste(bowtie2_PATH, "-x", index, "--un-gz", output_fastq, "-U", input_fastq, "-p 20 -S tmp.sam")
  system(bowtie2_cmd,wait = T)
  system("rm tmp.sam")
  return(output_fastq)
}

##Paired version
remove_rRNA_PE <- function(input_fastq_F = input_fastq_F, input_fastq_R = input_fastq_R, index = index) {
  bowtie2_PATH = "/home/zhou/anaconda2/bin/bowtie2"
  output_fastq = paste0(gsub(pattern = "\\.fastq\\.gz",replacement = "",x = input_fastq_F),"_rmrRNA.fastq.gz")
  output_fastq_F = gsub(pattern = "\\.fastq\\.gz",replacement = "\\.fastq\\.1\\.gz",x = output_fastq)
  output_fastq_R = gsub(pattern = "\\.fastq\\.gz",replacement = "\\.fastq\\.2\\.gz",x = output_fastq)
  output_fastq_F_reset = paste0(gsub(pattern = "\\.fastq\\.gz",replacement = "",x = input_fastq_F),"_rmrRNA.fastq.gz")
  output_fastq_R_reset = paste0(gsub(pattern = "\\.fastq\\.gz",replacement = "",x = input_fastq_R),"_rmrRNA.fastq.gz")
  bowtie2_cmd = paste(bowtie2_PATH, "-x", index, "--un-conc-gz", output_fastq, "-1", input_fastq_F, "-2", input_fastq_R, "-p 20 -S tmp.sam")
  rename_cmd_F = paste("mv", output_fastq_F, output_fastq_F_reset)
  rename_cmd_R = paste("mv", output_fastq_R, output_fastq_R_reset)
  system(bowtie2_cmd,wait = T)
  system("rm tmp.sam",wait = T)
  system(rename_cmd_F,wait = T)
  system(rename_cmd_R,wait = T)
  return(c(output_fastq_F_reset,output_fastq_R_reset))
}
#tRNA remove
##Single version
remove_tRNA_SE <- function(input_fastq = input_fastq, index = index) {
  bowtie2_PATH = "/home/zhou/anaconda2/bin/bowtie2"
  output_fastq = paste0(gsub(pattern = "\\.fastq\\.gz",replacement = "",x = input_fastq),"_rmtRNA.fastq.gz")
  bowtie2_cmd = paste(bowtie2_PATH, "-x", index, "--un-gz", output_fastq, "-U", input_fastq, "-p 20 -S tmp.sam")
  system(bowtie2_cmd,wait = T)
  system("rm tmp.sam")
  return(output_fastq)
}

##Paired version
remove_tRNA_PE <- function(input_fastq_F = input_fastq_F, input_fastq_R = input_fastq_R, index = index) {
  bowtie2_PATH = "/home/zhou/anaconda2/bin/bowtie2"
  output_fastq = paste0(gsub(pattern = "\\.fastq\\.gz",replacement = "",x = input_fastq_F),"_rmtRNA.fastq.gz")
  output_fastq_F = gsub(pattern = "\\.fastq\\.gz",replacement = "\\.fastq\\.1\\.gz",x = output_fastq)
  output_fastq_R = gsub(pattern = "\\.fastq\\.gz",replacement = "\\.fastq\\.2\\.gz",x = output_fastq)
  output_fastq_F_reset = paste0(gsub(pattern = "\\.fastq\\.gz",replacement = "",x = input_fastq_F),"_rmtRNA.fastq.gz")
  output_fastq_R_reset = paste0(gsub(pattern = "\\.fastq\\.gz",replacement = "",x = input_fastq_R),"_rmtRNA.fastq.gz")
  bowtie2_cmd = paste(bowtie2_PATH, "-x", index, "--un-conc-gz", output_fastq, "-1", input_fastq_F, "-2", input_fastq_R, "-p 20 -S tmp.sam")
  rename_cmd_F = paste("mv", output_fastq_F, output_fastq_F_reset)
  rename_cmd_R = paste("mv", output_fastq_R, output_fastq_R_reset)
  system(bowtie2_cmd,wait = T)
  system("rm tmp.sam",wait = T)
  system(rename_cmd_F,wait = T)
  system(rename_cmd_R,wait = T)
  return(c(output_fastq_F_reset,output_fastq_R_reset))
}
#hisat2 
##Single version
hisat2_SE <- function(index = index, input_fastq = input_fastq, output_prefix = output_prefix) {
  bam_filename = paste0(output_prefix,".bam")
  hisat2_PATH = "/home/zhou/bin/hisat2"
  hisat2_cmd = paste(hisat2_PATH, "--dta -x",index, "-p 20 -U", input_fastq, " | samtools sort -@ 20 -o", bam_filename)
  system(hisat2_cmd,wait = T)
  return(bam_filename)
}

##Paired version
hisat2_PE <- function(index = index, input_fastq_F = input_fastq_F,input_fastq_R = input_fastq_R, output_prefix = output_prefix) {
  bam_filename = paste0(output_prefix,".bam")
  hisat2_PATH = "/home/zhou/bin/hisat2"
  hisat2_cmd = paste(hisat2_PATH, "--dta -x",index, "-p 20 -1", input_fastq_F, "-2", input_fastq_R, " | samtools sort -@ 20 -o", bam_filename)
  system(hisat2_cmd,wait = T)
  return(bam_filename)
}

#stringtie
stringtie <- function(input_bam = input_bam, gtf = gtf) {
  output_prefix = gsub(pattern = "\\.bam",replacement = "",x = input_bam)
  tab_filename = paste0(output_prefix,".tab")
  gtf_filename = paste0(output_prefix,".gtf")
  stringtie_PATH = "/home/zhou/bin/stringtie"
  stringtie_cmd = paste(stringtie_PATH,"-p 20 -G", gtf, "-o", gtf_filename, "-e -l", output_prefix, input_bam)
  system(stringtie_cmd,wait = T)
  return(gtf_filename)
}

#generate gene\transcript count matrix
run.prepDE.py <- function(inputfile = inputfile) {
  run.cmd = paste("python ~/Software/prepDE.py -i",inputfile)
  system(run.cmd,wait = T)
}
################ test data ############################
setwd("~/ribosome/mTOR_Liver_cell/testdata/")
sample_pattern = unique(gsub(x = list.files(path = "./"), pattern = "_R1_test.fq.gz|_R2_test.fq.gz",replacement = ""))

for (i in sample_pattern) {
  print(i)
  ### fastp preprocess
  fastp.fastq = fastp_PE(input_fastq_F = paste0(i,"_R1_test.fq.gz"),input_fastq_R = paste0(i,"_R2_test.fq.gz"))
  ### remove rRNA
  rmrRNA.fastq = remove_rRNA_PE(input_fastq_F = fastp.fastq[1],input_fastq_R = fastp.fastq[2],index = rRNA_index)
  ### remove tRNA
  rmtRNA.fastq = remove_tRNA_PE(input_fastq_F = rmrRNA.fastq[1],input_fastq_R = rmrRNA.fastq[2],index = tRNA_index)
  ### hisat2 mapping
  hisat2.bam = hisat2_PE(index = grch38_tran_index,input_fastq_F = rmtRNA.fastq[1],input_fastq_R = rmtRNA.fastq[2],output_prefix = i )
  ### stringtie assemble
  stringtie(input_bam = hisat2.bam,gtf = GRCh38.96.gtf)
}


################ real data ############################
#rnaseq
setwd("~/ribosome/mTOR_Liver_cell/RNA-Rawdata/")
sample_pattern = unique(gsub(x = list.files(path = "./"), pattern = "_R1.fq.gz|_R2.fq.gz",replacement = ""))

for (i in sample_pattern) {
  print(i)
  ### fastp preprocess
  fastp.fastq = fastp_PE(input_fastq_F = paste0(i,"_R1.fq.gz"),input_fastq_R = paste0(i,"_R2.fq.gz"))
  ### remove rRNA
  rmrRNA.fastq = remove_rRNA_PE(input_fastq_F = fastp.fastq[1],input_fastq_R = fastp.fastq[2],index = rRNA_index)
  ### remove tRNA
  rmtRNA.fastq = remove_tRNA_PE(input_fastq_F = rmrRNA.fastq[1],input_fastq_R = rmrRNA.fastq[2],index = tRNA_index)
  ### hisat2 mapping
  hisat2.bam = hisat2_PE(index = grch38_tran_index,input_fastq_F = rmtRNA.fastq[1],input_fastq_R = rmtRNA.fastq[2],output_prefix = i )
  ### stringtie assemble
  stringtie(input_bam = hisat2.bam,gtf = GRCh38.96.gtf)
}

## generate counts matrix
mergelist = data.frame(sample_pattern,paste0("./",sample_pattern,".gtf"))
write.table(mergelist,file = "mergelist", sep = "\t",row.names = F,col.names = F,quote = F)
run.prepDE.py(inputfile = "mergelist")

## DEA
### load count_matrix
gene_matrix = read.csv(file = "gene_count_matrix.csv",header = T,row.names = 1)

library(DESeq2)
library(BiocParallel)
register(MulticoreParam(24))

database = gene_matrix
condition <- factor(c(rep("T",3),rep("UT",3)))
coldata <- data.frame(row.names = colnames(database), condition)
dds <- DESeqDataSetFromMatrix(countData=database, colData=coldata, design=~condition)
dds <- dds[ rowSums(counts(dds)) > 1, ]
nrow(dds)
dds <- DESeq(dds,parallel = T)
res <- results(dds)
summary(res) 
table(res$padj<0.1)
res <- res[order(res$padj),]
resdata <- merge(as.data.frame(res), as.data.frame(counts(dds, normalized=TRUE)),by="row.names",sort=FALSE)
total.RNA.signresdata<-na.omit(resdata[resdata$padj<0.1 & abs(resdata$log2FoldChange) > 1,])

#riboseq
setwd("~/ribosome/mTOR_Liver_cell/RIBO-Rawdata/")
sample_pattern = unique(gsub(x = list.files(path = "./"), pattern = "_R1.fq.gz|_R2.fq.gz",replacement = ""))

for (i in sample_pattern) {
  print(i)
  ### fastp preprocess
  fastp.fastq = fastp_PE(input_fastq_F = paste0(i,"_R1.fq.gz"),input_fastq_R = paste0(i,"_R2.fq.gz"))
  ### remove rRNA
  rmrRNA.fastq = remove_rRNA_PE(input_fastq_F = fastp.fastq[1],input_fastq_R = fastp.fastq[2],index = rRNA_index)
  ### remove tRNA
  rmtRNA.fastq = remove_tRNA_PE(input_fastq_F = rmrRNA.fastq[1],input_fastq_R = rmrRNA.fastq[2],index = tRNA_index)
  ### hisat2 mapping
  hisat2.bam = hisat2_PE(index = grch38_tran_index,input_fastq_F = rmtRNA.fastq[1],input_fastq_R = rmtRNA.fastq[2],output_prefix = i )
  ### stringtie assemble
  stringtie(input_bam = hisat2.bam,gtf = GRCh38.96.gtf)
}

## generate counts matrix
mergelist = data.frame(sample_pattern,paste0("./",sample_pattern,".gtf"))
write.table(mergelist,file = "mergelist", sep = "\t",row.names = F,col.names = F,quote = F)
run.prepDE.py(inputfile = "mergelist")

## DEA
### load count_matrix
gene_matrix = read.csv(file = "gene_count_matrix.csv",header = T,row.names = 1)

library(DESeq2)
library(BiocParallel)
register(MulticoreParam(24))

database = gene_matrix
condition <- factor(c(rep("T",3),rep("UT",3)))
coldata <- data.frame(row.names = colnames(database), condition)
dds <- DESeqDataSetFromMatrix(countData=database, colData=coldata, design=~condition)
dds <- dds[ rowSums(counts(dds)) > 1, ]
nrow(dds)
dds <- DESeq(dds,parallel = T)
res <- results(dds)
summary(res) 
table(res$padj<0.1)
res <- res[order(res$padj),]
resdata <- merge(as.data.frame(res), as.data.frame(counts(dds, normalized=TRUE)),by="row.names",sort=FALSE)
Ribo.RNA.signresdata<-na.omit(resdata[resdata$padj<0.1 & abs(resdata$log2FoldChange) > 1,])

结语

把所有功能都模块化了，真舒服啊！
此外，有些function可以增加参数来进一步增加自由度，比如说线程数，我懒得写了，其实可以慢慢完善的，以后看心情把，毕业季实在太忙了。。。

Hisat2+Stringtie+DESeq2 workflow
前言因为以前的分析流程要在Shell 和 R 之间切换，然后可控性还差，有点烦，正好最近有点数据要分析，干脆就重...
Workflow
Workflow入门Workflow 教程(一)Workflow 教程（二）
AWS SWF
Concepts workflow. A workflow is a set of activities that...
比特币地址查询
利用Workflow一键查询比特币地址余额说明：1.下载Workflow：https://workflow.is...
J2EE VS Spring website workflow
J2EE website workflow Spring website workflow
2020-11-10 Workflow can't be tri
Q：Workflow 不能被启动A: 因为 Service Catalog Request workflow ...
【iOS】效率提升工具--Workflow
workflow是什么？干嘛用的？ workflow是iOS上一款工具App，workflow本意是工作流，里面内...
flowable 流程图的 Vue 库
workflow-bpmn-modeler workflow-bpmn-modeler 基于 Vue 和 bpmn...
【UiPath Studio 中文文档】26 参数 - 使用参数
由于参数的特性，你将在 Invoke Workflow File 和 Launch Workflow Intera...
【UiPath 问题 02 】WorkFlow File 参数的
如果你要为 WorkFlow 设置参数（Arguments），你应该：首先，现在 Workflow 文件的参数...