美文网首页
GEO代码片段(芯片差异分析及绘图)

GEO代码片段(芯片差异分析及绘图)

作者: 郭师傅 | 来源:发表于2022-05-03 11:57 被阅读0次

1.读入保存的表达矩阵数据

rm(list = ls())
path <- "F:\\myresearch\\paper_recurrence\\GSE129409"
setwd(path)

library(sva)
library(dplyr)
library(stringr)
library(GEOquery)
library(limma)
library(styler)
select <- dplyr::select

gse_id <- "GSE129409"

gse <- readRDS(paste0(".\\data\\",gse_id,"_gse.rds"))

exprSet <- exprs(gse[[1]]) # 基因表达矩阵
dim(exprSet)
head(exprSet)
pdata<- pData(gse[[1]])    # 分组信息,原始文件地址等

gpl_id <- gse$GSE129409_series_matrix.txt.gz@annotation

2.获取GPL文件并注释

gpl <- getGEO(GEO = gpl_id,
              destdir = "F:\\myresearch\\GEO\\GPLfiles")

gpl <- gpl@dataTable@table 

probe2symbol_df <- gpl %>% mutate(probe_id = .$ID,
                      symbol = .$circRNA) %>% 
  select(probe_id,symbol)

# colnames(probe2symbol_df) <- c("probe_id","symbol")
exprSet <- as.data.frame(exprSet)  #change express matrix to dataframe
exprSet$probe_id <- rownames(exprSet)  #make a new column of probe_id by rowname
exprSet$probe_id <- as.character(exprSet$probe_id)
#match probe_id and gene symbol
exprSet <- exprSet %>% 
  inner_join(probe2symbol_df,by="probe_id") %>% #合并探针的信息
  dplyr::select(-probe_id) %>% #去掉多余信息
  dplyr::select(symbol, everything()) %>% #重新排列,
  mutate(rowMean =rowMeans(.[grep("GSM", names(.))])) %>% #求出平均数(这边的.真的是画龙点睛)
  arrange(desc(rowMean)) %>% #把表达量的平均值按从大到小排序
  distinct(symbol,.keep_all = T) %>% # symbol留下第一个
  dplyr::select(-rowMean) %>% #反向选择去除rowMean这一列
  tibble::column_to_rownames(colnames(.)[1]) # 把第一列变成行名并删除

# save(gpl,probe2symbol_df,file = "F:\\myresearch\\GEO\\GPLfiles\\GPL21825")

3.数据检验和质控

##--------------------------------------------数据检验和聚类------------------------------
exprSet_L <- melt(exprSet)
colnames(exprSet_L) = c('sample','value')
exprSet_L$group = rep(pdata$group,each = nrow(exprSet))

exprSet_L[1:10,]
##箱线图
p21 <-  ggplot(exprSet_L,aes(x = sample, y = value,fill = group)) +geom_boxplot()+
  theme(axis.title.x  = element_text(face = 'italic'),
        axis.text.x =  element_text(angle = 45 , vjust = 0.5),
        legend.position = "top")
p21            #数据不太规整
#ggsave(p21,filename = ".\\plots\\bar_before.pdf",width = 18,height = 9,units = "cm")
##标准化后再图
exprSet_1 <- normalizeBetweenArrays(exprSet) %>% data.frame()
exprSet_L <- melt(exprSet_1)
colnames(exprSet_L) = c('sample','value')
exprSet_L$group = rep(pdata$group,each = nrow(exprSet))
p22 <-  ggplot(exprSet_L,aes(x = sample, y = value,fill = group)) +geom_boxplot()+
  theme(axis.title.x  = element_text(face = 'italic'),
        axis.text.x =  element_text(angle = 45 , vjust = 0.5),
        legend.position = 0)
p22            #表准化后规整多了,但知道效果咋样

plot_qc <- plot_grid(p21,p22,labels = "AUTO",nrow = 2)
plot_qc
dir.create("plots")
ggsave(plot_qc,filename = ".\\plots\\plot_qc.pdf",width = 18,height = 18,units = "cm")
#把标准化后的数据赋值给原始数据,暂时不做
#exprSet <- exprSet_1

4.聚类图和PCA图

# 聚类图-------------------------------------------
colnames(exprSet) <- paste(pdata$title, sep = "")    ## 样本名称改变
# hc <- hclust(dist(t(exprSet)))                      #hclust需要用plot画图,此例不用
dist.r <- dist(t(exprSet))
res <- hcut(dist.r, k = 2, stand = TRUE)
# Visualize
cluster_unbatch <- fviz_dend(res,
                             # 加边框
                             rect = TRUE,
                             # 边框颜色
                             rect_border = "cluster",
                             # 边框线条类型
                             rect_lty = 2,
                             # 边框线条粗细
                             lwd = 1.2,
                             # 边框填充
                             rect_fill = T,
                             # 字体大小
                             cex = 0.8,
                             # 字体颜色
                             color_labels_by_k = T,
                             # 平行放置
                             horiz = T,
                             k_colors = "lancet",      #指定色板
                             labels_track_height = 7,
                             main = element_blank()
)+theme(text = element_text(size = 8))
cluster_unbatch

# 把样本名改回来
colnames(exprSet) <- pdata$geo_accession
# pca图-------------------------------------------------------
dat.pca <- PCA(as.data.frame(t(exprSet)), graph = FALSE)
pca_plot <- fviz_pca_ind(dat.pca,
                         geom.ind = "point", # show points only (nbut not "text")
                         col.ind = group_list, # color by groups
                         #palette = c("#00AFBB", "#E7B800"),  #修改颜色
                         addEllipses = TRUE, # Concentration ellipses
                         # legend.title = "GROUP"
)+theme(legend.position = c(0.80,1),
        text = element_text(size = 8),
        legend.key.size = unit(3,"mm"),
        legend.title = element_blank()
)
pca_plot
ggsave(plot = pca_plot,,width = 89,height = 89,units = "mm",filename = paste0(".\\plots\\",gse_id,"PCA.pdf"))

5.差异分析

##--------------------------------------------开始差异分析----------------------------------------
#表达矩阵:exprSet
#分组列表:group_list

design <- model.matrix(~0+factor(group_list))
# design <- model.matrix(~group)
colnames(design) <- levels(factor(group_list))

fit <- lmFit(exprSet,design)

# makeContrasts里边组的顺序会影响结果
# 根据目前资料,实验组写左,对照组写右
group_list
cont.matrix<-makeContrasts(atrial_fibrillation-healthy_control,levels = design)
cont.matrix
#做完后实验组是1,对照组是-1,此处注释有待分析
fit2=contrasts.fit(fit,cont.matrix)
fit2 <- eBayes(fit2)  ## default no trend !!!
##eBayes() with trend=TRUE
#通过修改p.value,改变输出基因,过小可能无输出。实际影响的是adj.p,输出基因的不同可能影响火山图的起点(根部)
tempOutput = topTable(fit2,coef=1,n=Inf,adjust="BH",p.value = 0.5)    
#tempOutput1 = topTable(fit2,coef=1,n=Inf,adjust="fdr",p.value = 0.05) 
nrDEG = na.omit(tempOutput) 

allDiff <- nrDEG

6.基因筛选

# 基因筛选-------------------------------------------------
# 定义加上下调标记函数,适用于limma包结果
add_sign_p <- function(df){
  df <- df %>% mutate(sign = dplyr::case_when(df$P.Value < p_cutoff & df$logFC> logFC_cutoff ~ "up",
                                              df$P.Value < p_cutoff & df$logFC< -logFC_cutoff ~ "down",
                                              TRUE ~ "stable")
  )
  return(df)
}

add_sign <- function(df){
  df <- df %>% mutate(sign = dplyr::case_when(df$adj.P.Value < p_cutoff & df$logFC> logFC_cutoff ~ "up",
                                              df$adj.P.Value < p_cutoff & df$logFC< -logFC_cutoff ~ "down",
                                              TRUE ~ "stable")
  )
  return(df)
}

logFC_cutoff=1                    #fold change=1意思是差异是两倍
p_cutoff = 0.05                     #padj=0.05意思是矫正后P值小于0.05

diff <- add_sign_p(allDiff )

diffSig <- diff %>% filter(sign == "down" | sign == "up")
save(diff,diffSig,diffUp,diffDown,file = paste0(".\\data\\",gse_id,"deg.RData"))

7.绘制火山图

#-------------------------------------------绘制火山图---------------------------------------
von_1 <- ggplot(
  #设置数据
  diff, 
  aes(x = logFC, y = -log10(P.Value), colour=change)) +                 #P.value或adj.P.Val需要根据情况修改
  geom_point(alpha=0.4, size=2) +
  scale_color_manual(values=c("#546de5", "#d2dae2","#ff4757"))+
  
  # 辅助线,根据具体情况修改intercept数值
  geom_vline(xintercept=c(-2,2),lty=4,col="black",lwd=0.6) +
  geom_hline(yintercept = -log10(pvalue),lty=4,col="black",lwd=0.6) +
  
  # 坐标轴
  labs(x="log2(fold change)",
       y="-log10 (p-value)")+
  theme_bw()+
  # 图例
  theme(plot.title = element_text(hjust = 0.5), 
        legend.position = c(0.85,0.85), 
        legend.title = element_blank(),
        legend.background = element_blank(),
        legend.box.background = element_blank(),
        text = element_text(size = 8),
        
        
  )
von_1
# ggsave(von_1,filename = str_c(".\\plots\\",gse_id,"_volcano_padj.pdf"),
#        width = 9,height = 9,units = "cm")

# 定义显示的标记基因
# 将需要标记的基因放置在label列
# 这里设置logFC值大于3的差异基因来标记
# !!!需要注意的是标记的基因不能太多,Rstudio容易卡死,提前用summary()查看
# P.value或adj.P.Val需要根据情况修改,包括pvalue或padj
diff$label = ifelse(diff$P.Value < pvalue & abs(diff$logFC) >=3, as.character(rownames(diff)),"")

von_labeled <- von_1+geom_text_repel(data = diff, aes(x = logFC, 
                                                      y = -log10(P.Value), 
                                                      label = label),
                                     size = 2,box.padding = unit(0.5, "lines"),
                                     point.padding = unit(0.8, "lines"), 
                                     segment.color = "black", 
                                     show.legend = FALSE)
von_labeled
ggsave(von_labeled,
       width = 9,height = 9,units = "cm",
       filename = str_c(".\\plots\\",gse_id,"_volcano_padj_labeled.pdf"))

8.绘制热图

##-----------------------------------------heatmap plot-----------------------------------------
library(pheatmap)
## 设定差异基因阈值,减少差异基因用于提取表达矩阵
allDiff_pair=topTable(fit2,adjust='BH',coef= 1,number=Inf,p.value=0.05,lfc =1) 
##提前部分数据用作热图绘制
heatdata <- exprSet[rownames(diffSig),]
top100 <- heatdata

##制作一个分组信息用于注释,行名是标本号,值是分组
annotation_col <- data.frame(group_list)

rownames(annotation_col) <- colnames(exprSet)

##如果注释出界,可以通过调整格子比例和字体修正
heatmap <- pheatmap(top100, #热图的数据
                    cluster_rows = T,#行聚类
                    cluster_cols = TRUE,#列聚类,可以看出样本之间的区分度
                    annotation_col = annotation_col, #标注样本分类
                    annotation_legend = TRUE, # 显示注释
                    show_rownames = T,# 显示行名
                    show_colnames = T,# 显示列名
                    scale = "row", #以行来标准化,这个功能很不错
                    color = colorRampPalette(c("navy", "white", "firebrick3"))(100),
                    border_color = FALSE,
                    fontsize = 7,
                    legend = T) 
dim(top100)

ggsave(heatmap,filename = str_c(".\\plots\\",gse_id,"_deg_heatmap.pdf"),
       width = 13,height = 9,units = "cm")

相关文章

网友评论

      本文标题:GEO代码片段(芯片差异分析及绘图)

      本文链接:https://www.haomeiwen.com/subject/spvbyrtx.html