rm( list = ls() ) #一键清空,好习惯
options( stringsAsFactors = F )
library( "GEOquery" ) #加载R包,用于下载GEO数据
GSE_name = 'GSE108565'
options( 'download.file.method.GEOquery' = 'libcurl' ) #长城外解决方法之一
gset <- getGEO( GSE_name, getGPL = T ) #获取GEO数据,包括注释信息GPL
save( gset, file = 'gset_2.Rdata' ) #保存
load("gset_2.Rdata") #加载
gset <- gset[[1]] #降级处理
pdata <- pData(gset) #获取样本信息
exprSet <- exprs(gset) #获取表达矩阵
GPL = gset@featureData@data #获取注释信息
gset
chl <- length(colnames(pdata)) #看一下样本信息的列数
group_list <- as.character(pdata[,"neo-adjuvant chemotherapy:ch1"]) #得到分组信息 即对治疗抵抗和敏感
dim(exprSet)
exprSet[1:5,1:5] #看一下前几行
table(group_list) #统计分组信息 7抵抗 7敏感
save(exprSet,file = "final_exprSet.Rdata") #随时保存 下次直接load
## 下面就是画图plot的过程
library( "ggfortify" ) #ggfortify包基于ggplot2 可以Plotting PCA
## 聚类
{
colnames( exprSet ) = paste( group_list, 1:ncol( exprSet ), sep = '_' ) #根据分组信息表达矩阵改变列名
nodePar <- list( lab.cex = 0.4, pch = c( NA, 19 ), cex = 0.6, col = "red" ) #调整参数
hc = hclust( dist( t( exprSet ) ) ) #聚类表达矩阵
png('hclust.png', res = 100, height = 1800) #生成画板 设置参数
plot( as.dendrogram( hc ), nodePar = nodePar, horiz = TRUE ) #在画板上水平plot聚类结果的dendrogram即系统树图
dev.off() #关闭指定的绘图(默认情况下为当前设备)
}

## PCA主成分分析
data = as.data.frame( t( exprSet ) ) #transpose exprSet
data$group = group_list
png( 'pca_plot.png', res=100 )
autoplot( prcomp( data[ , 1:( ncol( data ) - 1 ) ] ), data = data, colour = 'group') + theme_bw() #进行PCA分析 图片保存在'pca_plot.png',也可以先plot再保存,可选png或pdf格式 ;theme_bw():A theme with white background and black gridlines.
dev.off() #关闭

### 差异分析
load( "./final_exprSet.Rdata" )
library( "limma" ) #使用limma包找出表达矩阵的差异表达基因DEGs 看说明书 GEO2R的差异分析就是基于limma包
{
design <- model.matrix( ~0 + factor( group_list ) ) #建立合适的设计矩阵
colnames( design ) = levels( factor( group_list ) ) #改变列名
rownames( design ) = colnames( exprSet ) #改变行名
}
design
## 为了使两组之间的成对比较,可以创建适当的对比度矩阵
contrast.matrix <- makeContrasts( "resistant-sensitive", levels = design )
contrast.matrix
{
fit <- lmFit( exprSet, design )
fit2 <- contrasts.fit( fit, contrast.matrix )
fit2 <- eBayes( fit2 )
nrDEG = topTable( fit2, coef = 1, n = Inf ) #获取差异表达基因
write.table( nrDEG, file = "nrDEG.out") #output
}
head(nrDEG)
results <- decideTests(fit2) #每个假设检验的结果可以用fit2获得
vennDiagram(fit2) #在每一个比较中都可以得到一个显示有意义的基因数量的维恩图

## heatmap 画热图
library( "pheatmap" )
{
nrDEG_Z = nrDEG[ order( nrDEG$logFC ), ] #按照logFC由小到大排列 nrDEG生成nrDEG_Z
nrDEG_F = nrDEG[ order( -nrDEG$logFC ), ] #同上
choose_gene = c( rownames( nrDEG_Z )[1:100], rownames( nrDEG_F )[1:100] ) #在差异表达基因中选择前100上调top和前100下调down
choose_matrix = exprSet[ choose_gene, ] #组成新的选择矩阵
choose_matrix = t( scale( t( choose_matrix ) ) ) #t(scale(t()))标准化 行的值
choose_matrix[choose_matrix > 1] = 1
choose_matrix[choose_matrix < -1] = -1
annotation_col = data.frame( CellType = factor( group_list ) )
rownames( annotation_col ) = colnames( exprSet )
pheatmap( fontsize = 2, choose_matrix, annotation_col = annotation_col, show_rownames = F,
annotation_legend = F, cluster_cols = F,filename = "heatmap.png") #画热图 图的基本字体大小为2 不显示行名 不显示图例 不显示列的聚类 具体参数可适当修改
}

## volcano plot 画火山图
library( "ggplot2" )
logFC_cutoff <- with( nrDEG, mean( abs( logFC ) ) + 2 * sd( abs( logFC ) ) )
logFC_cutoff #注意理解含义
{
nrDEG$change = as.factor( ifelse( nrDEG$P.Value < 0.01 & abs(nrDEG$logFC) > logFC_cutoff,
ifelse( nrDEG$logFC > logFC_cutoff , 'UP', 'DOWN' ), 'NOT' ) ) #新建change列,满足nrDEG$P.Value < 0.01 & abs(nrDEG$logFC)则change为up,否则,若满足nrDEG$logFC > logFC_cutoff则change为down,否则为not
save( nrDEG, file = "nrDEG.Rdata" )
this_tile <- paste0( 'Cutoff for logFC is ', round( logFC_cutoff, 3 ),
'\nThe number of up gene is ', nrow(nrDEG[ nrDEG$change =='UP', ] ),
'\nThe number of down gene is ', nrow(nrDEG[ nrDEG$change =='DOWN', ] ) ) #round( logFC_cutoff, 3 ) 保留小数点后3位
volcano = ggplot(data = nrDEG, aes( x = logFC, y = -log10(P.Value), color = change)) +
geom_point( alpha = 0.4, size = 1.75) +
theme_set( theme_set( theme_bw( base_size = 15 ) ) ) +
xlab( "log2 fold change" ) + ylab( "-log10 p-value" ) +
ggtitle( this_tile ) + theme( plot.title = element_text( size = 15, hjust = 0.5)) +
scale_colour_manual( values = c('blue','black','red') )
print( volcano )
ggsave( volcano, filename = 'volcano.png' )
dev.off()
}

rm( list = ls() )
load( "./nrDEG.Rdata" )
## 注释
library( "clusterProfiler" ) #clusterProfiler implements methods to analyze and visualize functional profiles of genomic coordinates (supported by ChIPseeker), gene and gene clusters.
if (F) {browseVignettes("clusterProfiler")
} #可查看帮助文档
library( "org.Hs.eg.db" ) #加载注释包
## 需要注意这个数据的行名是ACCNUM
ls("package:org.Hs.eg.db") #看一下包里包含的内容
df <- bitr( rownames( nrDEG ), fromType = "ACCNUM", toType = c( "ENTREZID" ), OrgDb = org.Hs.eg.db ) #根据org.Hs.eg.db包,将rownames( nrDEG )即ACCNUM转化为ENTREZID形式,生成data.frame
head( df )
{
nrDEG$SYMBOL = rownames( nrDEG )
nrDEG = merge( nrDEG, df, by.y='ACCNUM', by.x='SYMBOL')#将nrDEG与df融合,注意x和y分别对应df和nrDEG
}
head( nrDEG )
{
gene_up = nrDEG[ nrDEG$change == 'UP', 'ENTREZID' ] #提取上调基因的ENTREZID
gene_down = nrDEG[ nrDEG$change == 'DOWN', 'ENTREZID' ] #提取下调基因的ENTREZID
gene_diff = c( gene_up, gene_down ) #合并
gene_all = as.character(nrDEG[ ,'ENTREZID'] ) #提取所有ENTREZID,UP DOWN NOT
}
{
geneList = nrDEG$logFC #numeric vector
names( geneList ) = nrDEG$ENTREZID #named vector
geneList = sort( geneList, decreasing = T ) #decreasing order
}
{
## KEGG pathway analysis
kk.up <- enrichKEGG( gene = gene_up ,
organism = 'hsa' ,
universe = gene_all ,
pvalueCutoff = 0.8 ,
qvalueCutoff = 0.8 ) # 对上调基因做KEGG分析;hsa = Homo sapiens (human)
kk.down <- enrichKEGG( gene = gene_down ,
organism = 'hsa' ,
universe = gene_all ,
pvalueCutoff = 0.8 ,
qvalueCutoff = 0.8 ) # 对上调基因做KEGG分析
}
head( kk.up )[ ,1:6 ] #凡事head
head( kk.down )[ ,1:6 ]

library( "ggplot2" ) #可视化
{
kegg_down_dt <- as.data.frame( kk.down ) #转化data.frame
kegg_up_dt <- as.data.frame( kk.up )
down_kegg <- kegg_down_dt[ kegg_down_dt$pvalue < 0.05, ] #设置p值
down_kegg$group = -1
up_kegg <- kegg_up_dt[ kegg_up_dt$pvalue < 0.05, ] #同上
up_kegg$group = 1
dat = rbind( up_kegg, down_kegg ) #合并
dat$pvalue = -log10( dat$pvalue ) #取对数
dat$pvalue = dat$pvalue * dat$group #up正 down负
dat = dat[ order( dat$pvalue, decreasing = F ), ]
g_kegg <- ggplot( dat,
aes(x = reorder( Description, order( pvalue, decreasing=F ) ), y = pvalue, fill = group)) +
geom_bar( stat = "identity" ) +
scale_fill_gradient( low = "blue", high = "red", guide = FALSE ) +
scale_x_discrete( name = "Pathway names" ) +
scale_y_continuous( name = "log10P-value" ) +
coord_flip() + theme_bw() + theme( plot.title = element_text( hjust = 0.5 ) ) +
ggtitle( "Pathway Enrichment" ) #生成图
print( g_kegg )
ggsave( g_kegg, filename = 'kegg_up_down.png' ) #保存到本地
}

### GO database analysis 道理和KEGG分析是相似的
g_list = list( gene_up = gene_up, gene_down = gene_down, gene_diff = gene_diff)
go_enrich_results <- lapply( g_list, function( gene ) {
lapply( c( 'BP', 'MF', 'CC' ) , function( ont ) {
cat( paste( 'Now process', ont ) )
ego <- enrichGO( gene = gene,
universe = gene_all,
OrgDb = org.Hs.eg.db,
ont = ont ,
pAdjustMethod = "BH",
pvalueCutoff = 0.99,
qvalueCutoff = 0.99,
readable = TRUE)
print( head( ego ) )
return( ego )
})
}) #运行需要一定的时间
save( go_enrich_results, file = 'go_enrich_results.Rdata' )
load("go_enrich_results.Rdata")
n1 = c( 'gene_up', 'gene_down', 'gene_diff' )
n2 = c( 'BP', 'MF', 'CC' )
for ( i in 1:3 ){
for ( j in 1:3 ){
fn = paste0( 'dotplot_', n1[i], '_', n2[j], '.png' )
cat( paste0( fn, '\n' ) )
png( fn, res = 150, width = 1080 )
print( dotplot( go_enrich_results[[i]][[j]] ) )
dev.off()
}
} #一下子把BP、MF、CC的up down diff 图全部生成并保存

参考来源:生信技能树
友情链接:
课程分享
生信技能树全球公益巡讲
(https://mp.weixin.qq.com/s/E9ykuIbc-2Ja9HOY0bn_6g)
B站公益74小时生信工程师教学视频合辑
(https://mp.weixin.qq.com/s/IyFK7l_WBAiUgqQi8O7Hxw)
招学徒:
(https://mp.weixin.qq.com/s/KgbilzXnFjbKKunuw7NVfw)
欢迎关注公众号:青岛生信菜鸟团
网友评论