Bulk RNAseq上游比对1:大致流程与conda环境 - 简书 (jianshu.com)
Bulk RNAseq上游比对2:下载数据、质控 - 简书 (jianshu.com)
Bulk RNAseq上游比对3:比对mapping - 简书 (jianshu.com)
Step3:比对
- 主要包括三个子步骤:mapping ---→ sam2bam ---→ featurecount
- 如下列出了常见的比对软件的用法,包括hisat2、star、bowtie2、bwa以及salmon
- 比对软件的命令调用均主要包括四大类参数:指定索引文件、参考基因组、pair fatsq.gz文件、以及线程数
3.1 以其中一对fatsq.gz文件为例,总结各个比对软件的用法
conda activate fq_map
pare_dir=/home/data/****/mapping
fq1=${pare_dir}/trim/SRR12720999_1_val_1.fq.gz
fq2=${pare_dir}/trim/SRR12720999_2_val_2.fq.gz
#hisat2
ref_idx_hisat2=$(refgenie seek hg38/hisat2_index -c ~/refgenie/genome_config.yaml)
time hisat2 -t -p 10 -x $ref_idx_hisat2 -1 $fq1 -2 $fq2 -S test.sam
#STAR
ref_idx_star=$(refgenie seek hg38/star_index -c ~/refgenie/genome_config.yaml)
time STAR --genomeDir $ref_idx_star \
--runThreadN 10 --readFilesCommand zcat \
--readFilesIn $fq1 $fq2 \
--outSAMtype SAM --outFileNamePrefix test
#Bowtie2
ref_idx_bowtie2=$(refgenie seek hg38/bowtie2_index -c ~/refgenie/genome_config.yaml)
time bowtie2 -p 10 -x $ref_idx_bowtie2 -1 $fq1 -2 $fq2 -S test.sam
#BWA
ref_idx_bwa=$(refgenie seek hg38/bwa_index -c ~/refgenie/genome_config.yaml)
time bwa mem -t 10 $ref_idx_bwa $fq1 $fq2 -o test.sam
#salmon
ref_idx_salmon=$(refgenie seek hg38_cdna/salmon_index -c ~/refgenie/genome_config.yaml)
time salmon quant -i $ref_idx_salmon -l A \
-1 $fq1 -2 $fq2 \
-p 10 -o test_quant
3.2 以每个比对软件的全部数据、步骤实操
(1) hisat2
#1、定义变量
ref_idx_hisat2=$(refgenie seek hg38/hisat2_index -c ~/refgenie/genome_config.yaml)
ref_gtf=$(refgenie seek hg38/gencode_gtf -c ~/refgenie/genome_config.yaml)
pare_dir=/home/data/****/mapping
#2、批量比对
cat ${pare_dir}/SraAccList.txt | while read id
do
echo $id
#首先进行比对,生成sam文件
echo "Start hisat mapping...."
hisat2 -p 10 -x $ref_idx_hisat2 \
-1 ${pare_dir}/trim/${id}_1_val_1.fq.gz \
-2 ${pare_dir}/trim/${id}_2_val_2.fq.gz \
-S ${pare_dir}/hisat2/${id}.sam
#然后sam转bam,同时删除内存较大的bam
echo "Start sam2bam...."
samtools view -S ${pare_dir}/hisat2/${id}.sam \
-@ 10 -b > ${pare_dir}/hisat2/${id}.bam
rm ${pare_dir}/hisat2/${id}.sam
done
#3、featureCounts提取表达信息
featureCounts -p -T 10 -t exon -g gene_name -a $ref_gtf -o hisat2_exp_counts.txt *.bam
#第1列为基因名,第7至最后1列为各个样本的表达信息
#4、各个样本比对率概况
multiqc ./ -n hisat2_multiqc_report.html
(2) STAR
#1、定义变量
ref_idx_star=$(refgenie seek hg38/star_index -c ~/refgenie/genome_config.yaml)
ref_gtf=$(refgenie seek hg38/gencode_gtf -c ~/refgenie/genome_config.yaml)
pare_dir=/home/data/****/mapping
#2、批量比对
# 尝试换用for循环,本质都一样
for sample in $(cat ${pare_dir}/SraAccList.txt)
do
echo "START sample ${sample}"
echo "Start STAR mapping..."
#首先进行比对,生成sam文件
STAR --genomeDir $ref_idx_star \
--runThreadN 10 --readFilesCommand zcat \
--readFilesIn ${pare_dir}/trim/${sample}_1_val_1.fq.gz ${pare_dir}/trim/${sample}_2_val_2.fq.gz \
--outSAMtype SAM --outFileNamePrefix ${sample}
echo "Start sam2bam..."
#然后sam转bam,同时删除内存较大的bam
samtools view -S ${pare_dir}/star/${sample}Aligned.out.sam \
-@ 10 -b > ${pare_dir}/star/${sample}.bam
rm ${pare_dir}/star/${sample}Aligned.out.sam
done
#3、featureCounts提取表达信息
featureCounts -p -T 10 -t exon -g gene_name -a $ref_gtf -o star_exp_counts.txt *.bam
#第1列为基因名,第7至最后1列为各个样本的表达信息
#4、各个样本比对率概况
multiqc ./ -n star_multiqc_report.html
(3) Bowtie2
#1、定义变量
ref_idx_bowtie2=$(refgenie seek hg38/bowtie2_index -c ~/refgenie/genome_config.yaml)
ref_gtf=$(refgenie seek hg38/gencode_gtf -c ~/refgenie/genome_config.yaml)
pare_dir=/home/data/****/mapping
#2、批量比对
for sample in $(cat ${pare_dir}/SraAccList.txt)
do
echo "START sample ${sample}"
echo "Start Bowtie2 mapping..."
bowtie2 -p 10 -x $ref_idx_bowtie2 \
-1 ${pare_dir}/trim/${sample}_1_val_1.fq.gz \
-2 ${pare_dir}/trim/${sample}_2_val_2.fq.gz \
-S ${pare_dir}/bowtie2/${sample}.sam
echo "Start sam2bam..."
samtools view -S ${pare_dir}/bowtie2/${sample}.sam \
-@ 10 -b > ${pare_dir}/bowtie2/${sample}.bam
rm ${pare_dir}/bowtie2/${sample}.sam
done
#3、featureCounts提取表达信息
featureCounts -p -T 10 -t exon -g gene_name -a $ref_gtf -o bowtie2_exp_counts.txt *.bam
#第1列为基因名,第7至最后1列为各个样本的表达信息
#4、各个样本比对率概况
multiqc ./ -n bowtie2_multiqc_report.html
(4) BWA
#1、定义变量
ref_idx_bwa=$(refgenie seek hg38/bwa_index -c ~/refgenie/genome_config.yaml)
ref_gtf=$(refgenie seek hg38/gencode_gtf -c ~/refgenie/genome_config.yaml)
pare_dir=/home/data/****/mapping
#2、批量比对
for sample in $(cat ${pare_dir}/SraAccList.txt)
do
echo "START sample ${sample}"
echo "Start BWA mapping..."
bwa mem -t 10 $ref_idx_bwa \
${pare_dir}/trim/${sample}_1_val_1.fq.gz \
${pare_dir}/trim/${sample}_2_val_2.fq.gz \
-S -o ${pare_dir}/bwa/${sample}.sam
echo "Start sam2bam..."
samtools view -S ${pare_dir}/bwa/${sample}.sam \
-@ 10 -b > ${pare_dir}/bwa/${sample}.bam
rm ${pare_dir}/bwa/${sample}.sam
done
#3、featureCounts提取表达信息
featureCounts -p -T 10 -t exon -g gene_name -a $ref_gtf -o bwa_exp_counts.txt *.bam
#第1列为基因名,第7至最后1列为各个样本的表达信息
#4、各个样本比对率概况
multiqc ./ -n bwa_multiqc_report.html
(5) salmon
#1、定义变量
refgenie list -g hg38_cdna -c ~/refgenie/genome_config.yaml
ref_idx_salmon=$(refgenie seek hg38_cdna/salmon_index -c ~/refgenie/genome_config.yaml)
pare_dir=/home/data/****/mapping
#2、批量比对
cat ${pare_dir}/SraAccList.txt | while read id
do
echo $id
salmon quant -i $ref_idx_salmon -l A \
-1 ${pare_dir}/trim/${id}_1_val_1.fq.gz \
-2 ${pare_dir}/trim/${id}_2_val_2.fq.gz \
-p 10 -o ${pare_dir}/salmon/${id}_quant
done
#3、tximport R包提取表达矩阵
R
library(tximport)
library(readr)
library(biomaRt)
ensembl = useMart("ensembl",dataset="hsapiens_gene_ensembl")
attributes = listAttributes(ensembl)
attributes[1:5,]
# library(httr)
# httr::set_config(config(ssl_verifypeer = 0L))
gene_ids <- getBM(attributes= c("hgnc_symbol","ensembl_transcript_id"),
mart= ensembl)
gene_ids = gene_ids[!duplicated(gene_ids[,2]),]
colnames(gene_ids) = c("gene_id","tx_id")
gene_ids = gene_ids[,c("tx_id","gene_id")]
files <- list.files(pattern = '*sf',recursive = T, full.names=T)
txi <- tximport(files, type = "salmon", tx2gene = gene_ids, ignoreTxVersion = T, ignoreAfterBar=T)
class(txi)
names(txi)
head(txi$length)
head(txi$counts)
srrs = stringr::str_extract(files, "SRR[:digit:]+")
salmon_expr <- txi$counts
salmon_expr <- apply(salmon_expr, 2, as.integer)
rownames(salmon_expr) <- rownames(txi$counts)
colnames(salmon_expr) <- srrs
save(salmon_expr, file="./salmon_expr.rda")
以上是总结的RNA-seq上游比对的流程以及命令用法,如上可以看出只是调用了每种比对软件的基本用法,如果想要深入了解每个比对软件的进阶用法,可以参考官方文档,或者相关笔记教程,还是有很多的,这里就不细致学习了。提供一个流程框架,供自己以后方便调用。
网友评论