RNAseq pipline: Hisat2 samtools

作者: wangsb_2020 | 来源:发表于2020-03-21 18:51 被阅读0次

RNAseq pipline: Hisat2 samtools
RNASEQ分析入门笔记1-软件安装
转录组分析所需软件
RNAseq转录组分析流程：fastp+hisat2+samto
conda环境报错： Solving environment:
RNAseq下游分析--edgeR +cluserprofile
用shell脚本实现转录组测序中解压-比对-计算基因表达量一
hisat2的使用, samtools
samtools使用大全
Samtools为何会报错：不能共享文件libbz2.so.1.

花了两天整理的RNAseq pipline 分享出来，供学习参考，同时也听听大家的建议。
RNAseq pipline: Hisat2 -> samtools -> featureCounts -> DESeq2

Step1.数据准备

genome

git clone https://github.com/moold/Genome-data-of-Hanfu-apple.git

reads

wget -c ftp://ftp.sra.ebi.ac.uk/vol1/srr/SRR217/009/SRR2176359
wget -c ftp://ftp.sra.ebi.ac.uk/vol1/srr/SRR217/000/SRR2176360
wget -c ftp://ftp.sra.ebi.ac.uk/vol1/srr/SRR217/001/SRR2176361
wget -c ftp://ftp.sra.ebi.ac.uk/vol1/srr/SRR217/002/SRR2176362
wget -c ftp://ftp.sra.ebi.ac.uk/vol1/srr/SRR217/003/SRR2176363
wget -c ftp://ftp.sra.ebi.ac.uk/vol1/srr/SRR217/004/SRR2176364
wget -c ftp://ftp.sra.ebi.ac.uk/vol1/srr/SRR217/005/SRR2176365
wget -c ftp://ftp.sra.ebi.ac.uk/vol1/srr/SRR217/006/SRR2176366
wget -c ftp://ftp.sra.ebi.ac.uk/vol1/srr/SRR217/007/SRR2176367
wget -c ftp://ftp.sra.ebi.ac.uk/vol1/srr/SRR217/008/SRR2176368
wget -c ftp://ftp.sra.ebi.ac.uk/vol1/srr/SRR217/009/SRR2176369
wget -c ftp://ftp.sra.ebi.ac.uk/vol1/srr/SRR217/000/SRR2176370
wget -c ftp://ftp.sra.ebi.ac.uk/vol1/srr/SRR217/001/SRR2176371
wget -c ftp://ftp.sra.ebi.ac.uk/vol1/srr/SRR217/002/SRR2176372
wget -c ftp://ftp.sra.ebi.ac.uk/vol1/srr/SRR217/003/SRR2176373
wget -c ftp://ftp.sra.ebi.ac.uk/vol1/srr/SRR217/004/SRR2176374
wget -c ftp://ftp.sra.ebi.ac.uk/vol1/srr/SRR217/005/SRR2176375
wget -c ftp://ftp.sra.ebi.ac.uk/vol1/srr/SRR217/006/SRR2176376
wget -c ftp://ftp.sra.ebi.ac.uk/vol1/srr/SRR217/007/SRR2176377
wget -c ftp://ftp.sra.ebi.ac.uk/vol1/srr/SRR217/008/SRR2176378
wget -c ftp://ftp.sra.ebi.ac.uk/vol1/srr/SRR217/009/SRR2176379
wget -c ftp://ftp.sra.ebi.ac.uk/vol1/srr/SRR217/000/SRR2176380
wget -c ftp://ftp.sra.ebi.ac.uk/vol1/srr/SRR217/001/SRR2176381
#转为fasq文件
for n in SSR* ; do fasterq-dump -e 4 --split-files $n ; done
#gtf注释文件
gffread -T -o genes.gtf HFTH1.gene.gff3

Step2.比对到参考基因组

conda install hisat2
conda install samtools
#为参考基因组构建index
~/miniconda3/bin/hisat2-build genome.fasta genome 1>hisat2-build.log 2>&1
#Hisat2比对
#SE
for n in *fastq ; do ~/miniconda3/bin/hisat2 --new-summary -p 20 -x ../genome/genome $n -S ${n}.sam --rna-strandness R 1> ${n}.log 2>&1 ; done
#比对结果排序 index
for n in *sam ; do ~/miniconda3/bin/samtools sort -@ 30 -o ${n}.bam $n ; ~/miniconda3/bin/samtools index ${n}.bam & done

Step3.表达定量

#featureCounts in subread
conda install subread -y
for n in *bam ; do ~/miniconda3/bin/featureCounts -a ../genome/genes.gtf -o ${n}_featurecounts.txt -t exon -g gene_id $n -T 10 1> ${n}.log 2>&1; done
for n in *txt ; do cat $n | cut -f 1,7 | grep -v ^# > ${n}.readscount ; done
paste *readscount | awk '{printf $1"\t"; for(i=2;i<=NF;i+=2){printf $i"\t"};print ""}' > gene.counts.matrix
for n in `cat sample | sed 's/\t/#/' ` ; do echo ${n%#*} ${n#*#} ;sed -i "s/${n%#*}.fastq.sam.bam/${n#*#}/" gene.counts.matrix ; done

Step4.差异表达分析 DESeq2

#R
conda install R -y

#R script:  # R version >= 3.6
install.packages("ggplot2")
if (!requireNamespace("BiocManager", quietly = TRUE))
    install.packages("BiocManager")
BiocManager::install("DESeq2")
library('DESeq2')
install.packages('latticeExtra')
if (!requireNamespace("BiocManager", quietly = TRUE))
  install.packages("BiocManager")
BiocManager::install("GenomeInfoDbData")
BiocManager::install("DESeq2")
library('DESeq2')
BiocManager::install('apeglm')
library(apeglm)


# 表达量矩阵
setwd('1.DESeq2')
countdata = read.csv('gene.counts.matrix', header = T, sep = '\t', row.names = 1)
countdata = countdata[7:12]
head(countdata)
# 控制对照
coldata = data.frame(row.names = colnames(countdata), Sample = factor(c(rep("BPL",3), rep("KPL", 3)), levels = c("BPL", "KPL")))
head(coldata)
#coldata = read.csv('sample.csv', header = T, sep = ',')
# 构建dds
dds <- DESeqDataSetFromMatrix(countData = countdata,
                              colData = coldata,
                              design= ~ Sample)
# DESeq分析dds
dds = DESeq(dds)
# 输出结果
# res = results(dds)
# result = as.data.frame(res)
# head(result)
# result
# 使用apeglm对log2FoldChange的值进行校正
resLFC = lfcShrink(dds, coef = resultsNames(dds)[2], type = 'apeglm')
#resLFC

# sort by padj
# res = res[order(res$padj),]

diff_gene = subset(resLFC, padj <= 0.05 & (log2FoldChange >= 1 | log2FoldChange <= -1))
resdata = merge(as.data.frame(diff_gene), as.data.frame(counts(dds, normalized = F)), by="row.names", sort=FALSE)
resdata$significant = "unchange"
resdata$significant[resdata$padj <= 0.05 & resdata$log2FoldChange >= 1] = 'upregulated'
resdata$significant[resdata$padj <= 0.05 & resdata$log2FoldChange <= -1] = 'downregulated'
resdata