题目链接:http://www.bio-info-trainee.com/3920.html
#此仅作为学习笔记,记录自己运行使用的代码和过程,并不与题目完全相符
#我是菜鸡,还在理解学习记录的代码
#正在复习学过的代码,这着学过的代码,能回忆起来就不错了,看着之前的代码想怎么回答问题中
#勿复制勿参考,免得被带偏,谢谢
Q1: 参考基因组及注释文件下载地址
列出人,小鼠,拟南芥的基因组序列,转录组cDNA序列,基因组注释gtf文件下载地址
GATK
Ensembl
NCBI
UCSC
Q2: 找到文章的测序数据
2018年12月的NC文章:Spatially and functionally distinct subclasses of breast cancer-associated fibroblasts revealed by single cell RNA sequencing 使用成熟的单细胞转录组( Smart-seq2 )
手段探索了癌相关的成纤维细胞 CAFs的功能和空间异质性。
#GSE111229 SRA文件存放位置
https://www.ncbi.nlm.nih.gov/sra?term=SRP133642
Q3:下载测序数据
主要是理解GEO链接: GSE111229 和原始测序数据:SRP133642 两个链接
source activate rna
#配置小环境rna
#安装aspera软件,调整环境变量,加快下载速度
cat SRR_Acc_List.txt | while read id; do (prefetch ${id} -O ~);done
#循环下载
Q4: 任意挑选6个样本走标准的RNA-SEQ上游流程
即 sra → fastq→bam→counts
注意每个步骤的质控细节,注意每个步骤的文件格式转换背后的生物学意义。
代码参考在:code
conda activate rna
#
prefetch SRR1039510 -O ~
fastq-dump --gzip --split-3 -O ~/ /teach/rna_testdata/project/1.rna/1.sra_data/SRR1039510.sra
multiqc ./*zip
#
hisat2 -p 1 -x ~/rna_testdata/database/index/hisat/hg38/genome -1 ~/rna_testdata/project/1.rna/3.raw_fq_25000reads/SRR1039510_1_100000.rawfq.gz -2 ~/rna_testdata/project/1.rna/3.raw_fq_25000reads/SRR1039510_2_100000.rawfq.gz |samtools sort -@ 1 -o ~/SRR1039510.sort.bam -
#
samtools index SRR1039510.sort.bam
#
featureCounts -T 5 -p -t exon -g gene_id -a /teach/database/gtf/gencode.v29.annotation.gtf.gz \ -o ~/all.id.txt /teach/project/1.rna/5.sort.bam/*sort.bam
#
eatureCounts -T 5 -p -t exon -g gene_id -a /teach/rna_testdata/database/gtf/gencode.v29.annotation.gtf.gz -o ~/all.id.txt ~/rna_testdata/project/1.rna/5.sort.bam/*bam
#
featureCounts -T 5 -p -t exon -g gene_id -a /teach/rna_testdata/database/gtf/gencode.v29.annotation.gtf.gz -o ~/all.id.txt ~/rna_testdata/project/1.rna/5.sort.bam/*bam
#
less -S all.id.txt
less -S all.id.txt|cut -f1,7-9|less -S
#得到counts
Q5: 理解RNA-SEQ上游流程得到的表达矩阵的多种形式
包括 每个基因比对到的reads数量
的counts矩阵,以及去除了每个细胞测序数据量(文库大小)差异
后的 rpm 矩阵,以及去除了基因长度效应
的 rpkm矩阵,以及最近比较流行的 tpm 矩阵
。
~
Q6: 任取6个样本表达矩阵随意分成2组走差异分析代码
代码参考:https://github.com/jmzeng1314/GEO/tree/master/airway_RNAseq
需要汇总PCA,heatmap,火山图,MA图,CV图等等
#PCA
rm(list = ls())
options(stringsAsFactors = F)
load(file = '1.airway_output.Rdata')
dat=exprSet
dat[1:4,1:4]
dim(dat)
dat=t(dat)
dat=as.data.frame(dat)
dat=cbind(dat,group_list)
library("FactoMineR")
library("factoextra")
ncol(dat)
dat[,ncol(dat)]
dat.pca <- PCA(dat[,-ncol(dat)], graph = FALSE)
plot(dat.pca,choix="ind")
fviz_pca_ind(dat.pca,
geom.ind = "point",
col.ind = dat$group_list, # color by groups
palette = c("#00AFBB", "#E7B800"),
addEllipses = TRUE,
legend.title = "Groups"
)
ggsave('7.PCA_all_samples.png')
#heatmap
load("1.airway_output.Rdata")
nrDEG=nrDEG3
library(pheatmap)
# 画多少可以调节~
choose_gene=head(rownames(nrDEG),50) ## 50 maybe better
choose_matrix=exprSet[choose_gene,]
choose_matrix=t(scale(t(choose_matrix)))
pheatmap(choose_matrix)
pheatmap(choose_matrix,filename = '7.pheatmap.png')
dev.off()
## heatmap
load("1.airway_output.Rdata")
nrDEG=nrDEG3
library(pheatmap)
# 画多少可以调节~
choose_gene=head(rownames(nrDEG),50) ## 50 maybe better
choose_matrix=exprSet[choose_gene,]
choose_matrix=t(scale(t(choose_matrix)))
pheatmap(choose_matrix)
pheatmap(choose_matrix,filename = '7.pheatmap.png')
dev.off()
## heatmap
{
dat=exprSet
dat[1:4,1:4]
table(group_list)
deg=nrDEG3
x=deg$logFC
names(x)=rownames(deg)
class(x)
x
cg=c(names(head(sort(x),100)),names(tail(sort(x),100)))
head(cg)
library(pheatmap)
pheatmap(dat[cg,],show_colnames =F,show_rownames = F)
n=t(scale(t(dat[cg,])))
n[n>2]=2
n[n<-2]= -2
n[1:4,1:4]
pheatmap(n,show_colnames =F,show_rownames = F)
ac=data.frame(g=group_list)
rownames(ac)=colnames(n)
pheatmap(n,show_colnames =F,show_rownames = F, cluster_cols = T,
annotation_col=ac,filename = "7.pheatmap_group.png",
color = colorRampPalette(c("navy", "white", "firebrick3"))(50) # 增加color
)
dev.off()
}
#火山图
library(ggplot2)
DEG=nrDEG3
colnames(DEG)
plot(DEG$logFC,-log2(DEG$P.Value))
logFC_cutoff=1.26
DEG$change = as.factor(ifelse(DEG$P.Value < 0.05 & abs(DEG$logFC) > logFC_cutoff,
ifelse(DEG$logFC > logFC_cutoff ,'UP','DOWN'),'NOT')
)
table(DEG$change)
this_tile <- paste0('Cutoff for logFC is ',round(logFC_cutoff,3),
'\nThe number of up gene is ',nrow(DEG[DEG$change =='UP',]) ,
'\nThe number of down gene is ',nrow(DEG[DEG$change =='DOWN',])
)
g = ggplot(data=DEG, aes(x=logFC, y=-log10(P.Value),color=change)) + geom_point(alpha=0.4, size=1.75) +
theme_set(theme_set(theme_bw(base_size=20)))+ xlab("log2 fold change") + ylab("-log10 p-value") +
ggtitle( this_tile ) + theme(plot.title = element_text(size=15,hjust = 0.5)) +
scale_colour_manual(values = c('blue','black','red'))
print(g)
ggsave(g,filename = '7.volcano.png')
#MA
~
#CV
~
Q7:挑选差异分析结果的统计学显著上调下调基因集
在R里面,对统计学显著上调下调基因集,进行GO/KEGG等数据库的超几何分布检验分析,原理参考:https://mp.weixin.qq.com/s/M6CRe39xmQ_lSQqeM99kow
#提取上调和下调的数据集
{
gene_up = nrDEG[ nrDEG$change == 'UP', 'ENTREZID' ]
gene_down = nrDEG[ nrDEG$change == 'DOWN', 'ENTREZID' ]
gene_diff = c( gene_up, gene_down )
gene_all = as.character(nrDEG[ ,'ENTREZID'] )
}
{
geneList = nrDEG$logFC
names( geneList ) = nrDEG$ENTREZID
geneList = sort( geneList, decreasing = T )
}
Q8: 直接对任取6个样本表达矩阵做GSVA分析
~
网友评论