# ---- Setup and data loading ----
# The workspace is intentionally NOT cleared here (no rm(list = ls())): run the
# script in a fresh R session instead of deleting the user's objects.
# NOTE(review): the working directory is machine-specific; every relative path
# used below (input CSV and all output files) resolves against it.
setwd("/Users/zhangjuxiang/Desktop/R time seq/")

# Install edgeR from Bioconductor only when it is missing, rather than forcing
# a re-install (and a network round-trip) on every run.
if (!requireNamespace("edgeR", quietly = TRUE)) {
  if (!requireNamespace("BiocManager", quietly = TRUE)) {
    install.packages("BiocManager")
  }
  BiocManager::install("edgeR")
}
library(edgeR)

# Read the expression matrix. The first CSV column holds the gene identifiers
# and becomes the row names; the remaining columns are the samples.
targets <- read.csv("rawdata.csv", row.names = 1)

# Sample grouping. The order of the labels must match the sample (column)
# order of the expression matrix one-to-one: the first 121 columns are "BC",
# the following 192 are "EC". The levels are set explicitly so that "BC" is
# the reference level in the design matrix.
group <- factor(rep(c("BC", "EC"), times = c(121, 192)), levels = c("BC", "EC"))
if (length(group) != ncol(targets)) {
  stop(
    "Number of group labels (", length(group),
    ") does not match number of samples (", ncol(targets), ")",
    call. = FALSE
  )
}
# ---- edgeR: preprocessing ----
# (1) Wrap the counts and the sample grouping in a DGEList object.
dgelist <- DGEList(counts = targets, group = group)

# (2) Filter low-count genes: keep a gene when its CPM exceeds 1 in at least
#     two samples; keep.lib.sizes = FALSE requests recomputed library sizes.
expressed <- cpm(dgelist) > 1
keep <- rowSums(expressed) >= 2
dgelist <- dgelist[keep, , keep.lib.sizes = FALSE]

# (3) Compute normalisation factors with the TMM method.
dgelist_norm <- calcNormFactors(dgelist, method = "TMM")

# ---- edgeR: differential expression ----
# Design matrix with an intercept plus one group coefficient. The reference is
# the first level of `group` (alphabetical order when `group` is a character
# vector), not the position of the samples in the matrix.
design <- model.matrix(~group)

# (1) Estimate the dispersions.
dge <- estimateDisp(dgelist_norm, design, robust = TRUE)

# Every gene that survived filtering is reported in the result tables.
n_genes <- nrow(dgelist$counts)

# (2a) Negative binomial GLM followed by a likelihood-ratio test.
# NOTE(review): `robust` does not appear to be a documented glmFit() argument
# and may be silently ignored here - confirm against the edgeR reference.
fit <- glmFit(dge, design, robust = TRUE)
lrt_test <- glmLRT(fit)
lrt <- topTags(lrt_test, n = n_genes)
write.table(
  lrt,
  "control_treat.glmLRT.txt",
  sep = "\t", col.names = NA, quote = FALSE
)

# (2b) Quasi-likelihood negative binomial GLM followed by a QL F-test.
fit <- glmQLFit(dge, design, robust = TRUE)
qlf_test <- glmQLFTest(fit)
lrt <- topTags(qlf_test, n = n_genes)
write.table(
  lrt,
  "control_treat.glmQLFit.txt",
  sep = "\t", col.names = NA, quote = FALSE
)
# ---- Select differentially expressed genes ----
# Read back the fold-change table written by the likelihood-ratio test.
# NOTE(review): the input is the glmLRT table, yet the output files below carry
# "glmQLFit" in their names - confirm which result set is intended. The file
# names are left unchanged so existing downstream consumers keep working.
gene_diff <- read.delim(
  "control_treat.glmLRT.txt",
  row.names = 1, sep = "\t", check.names = FALSE
)

# Sort by FDR ascending; ties in FDR are broken by logFC descending.
gene_diff <- gene_diff[order(gene_diff$FDR, -gene_diff$logFC), ]

# Classification thresholds.
sig_fdr_cutoff <- 0.05
sig_logfc_cutoff <- 1

# logFC >=  1 & FDR < 0.05 -> "up"   (significantly up-regulated)
# logFC <= -1 & FDR < 0.05 -> "down" (significantly down-regulated)
# everything else          -> "none"
# "none" is assigned first, so a gene sitting exactly on the |logFC| = 1
# boundary keeps its up/down label instead of being overwritten, and rows
# with missing values end up as "none" rather than NA.
is_significant <- gene_diff$FDR < sig_fdr_cutoff
gene_diff$sig <- "none"
gene_diff$sig[which(is_significant & gene_diff$logFC >= sig_logfc_cutoff)] <- "up"
gene_diff$sig[which(is_significant & gene_diff$logFC <= -sig_logfc_cutoff)] <- "down"

# Write the combined table of selected genes.
gene_diff_select <- gene_diff[gene_diff$sig %in% c("up", "down"), ]
write.table(
  gene_diff_select,
  file = "control_treat.glmQLFit.select.txt",
  sep = "\t", col.names = NA, quote = FALSE
)

# Write the up- and down-regulated genes to separate files.
gene_diff_up <- gene_diff[gene_diff$sig == "up", ]
gene_diff_down <- gene_diff[gene_diff$sig == "down", ]
write.table(
  gene_diff_up,
  file = "control_treat.glmQLFit.up.txt",
  sep = "\t", col.names = NA, quote = FALSE
)
write.table(
  gene_diff_down,
  file = "control_treat.glmQLFit.down.txt",
  sep = "\t", col.names = NA, quote = FALSE
)
# ---- Heatmap of the most strongly changed genes ----
# Install pheatmap only when it is missing instead of on every run.
if (!requireNamespace("pheatmap", quietly = TRUE)) {
  install.packages("pheatmap")
}
library(pheatmap)

# Number of genes taken from each end of the logFC ranking.
n_top <- 100

# Restrict the selected genes by p-value first; which() drops NA p-values.
tmp <- gene_diff_select[which(gene_diff_select$PValue < 0.05), ]
nrDEG_Z <- tmp[order(tmp$logFC), ]   # ascending: most negative logFC first
nrDEG_F <- tmp[order(-tmp$logFC), ]  # descending: most positive logFC first

# head() never pads with NA when fewer than n_top genes are available, and
# unique() prevents a gene from being selected from both ends of a short list.
choose_gene <- unique(c(
  head(rownames(nrDEG_Z), n_top),
  head(rownames(nrDEG_F), n_top)
))

if (length(choose_gene) >= 2) {
  choose_matrix <- targets[choose_gene, ]
  # Row-wise z-score: scale() works on columns, hence the double transpose.
  choose_matrix <- t(scale(t(choose_matrix)))
  # Clamp the z-scores to [-2, 2].
  choose_matrix[choose_matrix > 2] <- 2
  choose_matrix[choose_matrix < -2] <- -2
  # Drop rows that contain missing values after scaling (for example genes
  # whose values are constant across samples, which scale() turns into NaN).
  choose_matrix <- na.omit(choose_matrix)
} else {
  choose_matrix <- NULL
}

# Column annotation: one group label per sample, keyed by sample name.
annotation_col <- data.frame(CellType = factor(group))
rownames(annotation_col) <- colnames(targets)

if (!is.null(choose_matrix) && nrow(choose_matrix) >= 2) {
  pheatmap(
    choose_matrix,
    fontsize = 2,
    annotation_col = annotation_col,
    show_rownames = FALSE,
    annotation_legend = FALSE,
    filename = "heatmap_BRCA_medianexp2.png"
  )
} else {
  warning(
    "Fewer than two genes available for the heatmap; skipping it.",
    call. = FALSE
  )
}
# ---- Volcano plot ----
# Install ggplot2 only when it is missing instead of on every run.
if (!requireNamespace("ggplot2", quietly = TRUE)) {
  install.packages("ggplot2")
}
library("ggplot2")

nrDEG <- gene_diff

# Data-driven cutoff (mean + 2 SD of |logFC|), shown for reference only ...
logFC_cutoff <- with(nrDEG, mean(abs(logFC)) + 2 * sd(abs(logFC)))
logFC_cutoff
# ... the plot itself uses a fixed cutoff of 1.
logFC_cutoff <- 1

# Classify each gene: PValue < 0.01 and |logFC| above the cutoff -> UP / DOWN
# depending on the sign of logFC; everything else -> NOT.
nrDEG$change <- as.factor(
  ifelse(
    nrDEG$PValue < 0.01 & abs(nrDEG$logFC) > logFC_cutoff,
    ifelse(nrDEG$logFC > logFC_cutoff, "UP", "DOWN"),
    "NOT"
  )
)
save(nrDEG, file = "nrDEG_array_medianexp.Rdata")

# sum() over the logical comparison counts matches directly and, unlike
# nrow(df[cond, ]), does not count rows whose class is NA.
n_up <- sum(nrDEG$change == "UP", na.rm = TRUE)
n_down <- sum(nrDEG$change == "DOWN", na.rm = TRUE)
this_tile <- paste0(
  'Cutoff for logFC is ', round(logFC_cutoff, 3),
  ' The number of up M/Z is ', n_up,
  ' The number of down M/Z is ', n_down
)

# Colours are keyed by class name so that each class keeps its colour even
# when one of the classes is absent from the data.
change_colours <- c(DOWN = "green", NOT = "black", UP = "red")

volcano <- ggplot(
  data = nrDEG,
  aes(x = logFC, y = -log10(PValue), color = change)
) +
  geom_point(alpha = 0.4, size = 1.75) +
  # Add the theme object to the plot directly; the global default theme is
  # left untouched.
  theme_minimal(base_size = 15) +
  xlab("log2 fold change") +
  ylab("-log10 p-value") +
  ggtitle(this_tile) +
  scale_colour_manual(values = change_colours) +
  theme(
    legend.title = element_text(colour = "black", size = 6, face = "bold"),
    legend.text = element_text(colour = "black", size = 7, face = "bold"),
    axis.title.x = element_text(size = 9, color = "black", face = "bold"),
    axis.title.y = element_text(size = 9, color = "black", face = "bold"),
    plot.title = element_text(size = 8, hjust = 0.5, face = "bold"),
    # NOTE(review): an x coordinate above 1 puts the legend outside the panel;
    # confirm this placement is intended.
    legend.position = c(1.2, 0.8),
    aspect.ratio = 1,
    panel.grid.major = element_line(colour = "white", linetype = "blank"),
    panel.grid.minor = element_line(colour = "white"),
    panel.background = element_rect(fill = "aliceblue", colour = "white"),
    plot.background = element_rect(colour = "azure1")
  )

print(volcano)
ggsave(filename = "volcano_BRCA_medianexp.tiff", plot = volcano)

# Close the device opened by print(), but only if one is actually open:
# dev.off() raises an error when only the null device (number 1) is active.
if (dev.cur() > 1L) {
  invisible(dev.off())
}
# (Web-page residue: "网友评论" / "Reader comments" heading - not part of the script.)