1.读入保存的表达矩阵数据
rm(list = ls())
path <- "F:\\myresearch\\paper_recurrence\\GSE129409"
setwd(path)
library(sva)
library(dplyr)
library(stringr)
library(GEOquery)
library(limma)
library(styler)
select <- dplyr::select
gse_id <- "GSE129409"
gse <- readRDS(paste0(".\\data\\",gse_id,"_gse.rds"))
exprSet <- exprs(gse[[1]]) # 基因表达矩阵
dim(exprSet)
head(exprSet)
pdata<- pData(gse[[1]]) # 分组信息,原始文件地址等
gpl_id <- gse$GSE129409_series_matrix.txt.gz@annotation
2.获取GPL文件并注释
gpl <- getGEO(GEO = gpl_id,
destdir = "F:\\myresearch\\GEO\\GPLfiles")
gpl <- gpl@dataTable@table
probe2symbol_df <- gpl %>% mutate(probe_id = .$ID,
symbol = .$circRNA) %>%
select(probe_id,symbol)
# colnames(probe2symbol_df) <- c("probe_id","symbol")
exprSet <- as.data.frame(exprSet) #change express matrix to dataframe
exprSet$probe_id <- rownames(exprSet) #make a new column of probe_id by rowname
exprSet$probe_id <- as.character(exprSet$probe_id)
#match probe_id and gene symbol
exprSet <- exprSet %>%
inner_join(probe2symbol_df,by="probe_id") %>% #合并探针的信息
dplyr::select(-probe_id) %>% #去掉多余信息
dplyr::select(symbol, everything()) %>% #重新排列,
mutate(rowMean =rowMeans(.[grep("GSM", names(.))])) %>% #求出平均数(这边的.真的是画龙点睛)
arrange(desc(rowMean)) %>% #把表达量的平均值按从大到小排序
distinct(symbol,.keep_all = T) %>% # symbol留下第一个
dplyr::select(-rowMean) %>% #反向选择去除rowMean这一列
tibble::column_to_rownames(colnames(.)[1]) # 把第一列变成行名并删除
# save(gpl,probe2symbol_df,file = "F:\\myresearch\\GEO\\GPLfiles\\GPL21825")
3.数据检验和质控
##--------------------------------------------数据检验和聚类------------------------------
exprSet_L <- melt(exprSet)
colnames(exprSet_L) = c('sample','value')
exprSet_L$group = rep(pdata$group,each = nrow(exprSet))
exprSet_L[1:10,]
##箱线图
p21 <- ggplot(exprSet_L,aes(x = sample, y = value,fill = group)) +geom_boxplot()+
theme(axis.title.x = element_text(face = 'italic'),
axis.text.x = element_text(angle = 45 , vjust = 0.5),
legend.position = "top")
p21 #数据不太规整
#ggsave(p21,filename = ".\\plots\\bar_before.pdf",width = 18,height = 9,units = "cm")
##标准化后再图
exprSet_1 <- normalizeBetweenArrays(exprSet) %>% data.frame()
exprSet_L <- melt(exprSet_1)
colnames(exprSet_L) = c('sample','value')
exprSet_L$group = rep(pdata$group,each = nrow(exprSet))
p22 <- ggplot(exprSet_L,aes(x = sample, y = value,fill = group)) +geom_boxplot()+
theme(axis.title.x = element_text(face = 'italic'),
axis.text.x = element_text(angle = 45 , vjust = 0.5),
legend.position = 0)
p22 #表准化后规整多了,但知道效果咋样
plot_qc <- plot_grid(p21,p22,labels = "AUTO",nrow = 2)
plot_qc
dir.create("plots")
ggsave(plot_qc,filename = ".\\plots\\plot_qc.pdf",width = 18,height = 18,units = "cm")
#把标准化后的数据赋值给原始数据,暂时不做
#exprSet <- exprSet_1
4.聚类图和PCA图
# 聚类图-------------------------------------------
colnames(exprSet) <- paste(pdata$title, sep = "") ## 样本名称改变
# hc <- hclust(dist(t(exprSet))) #hclust需要用plot画图,此例不用
dist.r <- dist(t(exprSet))
res <- hcut(dist.r, k = 2, stand = TRUE)
# Visualize
cluster_unbatch <- fviz_dend(res,
# 加边框
rect = TRUE,
# 边框颜色
rect_border = "cluster",
# 边框线条类型
rect_lty = 2,
# 边框线条粗细
lwd = 1.2,
# 边框填充
rect_fill = T,
# 字体大小
cex = 0.8,
# 字体颜色
color_labels_by_k = T,
# 平行放置
horiz = T,
k_colors = "lancet", #指定色板
labels_track_height = 7,
main = element_blank()
)+theme(text = element_text(size = 8))
cluster_unbatch
# 把样本名改回来
colnames(exprSet) <- pdata$geo_accession
# pca图-------------------------------------------------------
dat.pca <- PCA(as.data.frame(t(exprSet)), graph = FALSE)
pca_plot <- fviz_pca_ind(dat.pca,
geom.ind = "point", # show points only (nbut not "text")
col.ind = group_list, # color by groups
#palette = c("#00AFBB", "#E7B800"), #修改颜色
addEllipses = TRUE, # Concentration ellipses
# legend.title = "GROUP"
)+theme(legend.position = c(0.80,1),
text = element_text(size = 8),
legend.key.size = unit(3,"mm"),
legend.title = element_blank()
)
pca_plot
ggsave(plot = pca_plot,,width = 89,height = 89,units = "mm",filename = paste0(".\\plots\\",gse_id,"PCA.pdf"))
5.差异分析
##--------------------------------------------开始差异分析----------------------------------------
#表达矩阵:exprSet
#分组列表:group_list
design <- model.matrix(~0+factor(group_list))
# design <- model.matrix(~group)
colnames(design) <- levels(factor(group_list))
fit <- lmFit(exprSet,design)
# makeContrasts里边组的顺序会影响结果
# 根据目前资料,实验组写左,对照组写右
group_list
cont.matrix<-makeContrasts(atrial_fibrillation-healthy_control,levels = design)
cont.matrix
#做完后实验组是1,对照组是-1,此处注释有待分析
fit2=contrasts.fit(fit,cont.matrix)
fit2 <- eBayes(fit2) ## default no trend !!!
##eBayes() with trend=TRUE
#通过修改p.value,改变输出基因,过小可能无输出。实际影响的是adj.p,输出基因的不同可能影响火山图的起点(根部)
tempOutput = topTable(fit2,coef=1,n=Inf,adjust="BH",p.value = 0.5)
#tempOutput1 = topTable(fit2,coef=1,n=Inf,adjust="fdr",p.value = 0.05)
nrDEG = na.omit(tempOutput)
allDiff <- nrDEG
6.基因筛选
# 基因筛选-------------------------------------------------
# 定义加上下调标记函数,适用于limma包结果
add_sign_p <- function(df){
df <- df %>% mutate(sign = dplyr::case_when(df$P.Value < p_cutoff & df$logFC> logFC_cutoff ~ "up",
df$P.Value < p_cutoff & df$logFC< -logFC_cutoff ~ "down",
TRUE ~ "stable")
)
return(df)
}
add_sign <- function(df){
df <- df %>% mutate(sign = dplyr::case_when(df$adj.P.Value < p_cutoff & df$logFC> logFC_cutoff ~ "up",
df$adj.P.Value < p_cutoff & df$logFC< -logFC_cutoff ~ "down",
TRUE ~ "stable")
)
return(df)
}
logFC_cutoff=1 #fold change=1意思是差异是两倍
p_cutoff = 0.05 #padj=0.05意思是矫正后P值小于0.05
diff <- add_sign_p(allDiff )
diffSig <- diff %>% filter(sign == "down" | sign == "up")
save(diff,diffSig,diffUp,diffDown,file = paste0(".\\data\\",gse_id,"deg.RData"))
7.绘制火山图
#-------------------------------------------绘制火山图---------------------------------------
von_1 <- ggplot(
#设置数据
diff,
aes(x = logFC, y = -log10(P.Value), colour=change)) + #P.value或adj.P.Val需要根据情况修改
geom_point(alpha=0.4, size=2) +
scale_color_manual(values=c("#546de5", "#d2dae2","#ff4757"))+
# 辅助线,根据具体情况修改intercept数值
geom_vline(xintercept=c(-2,2),lty=4,col="black",lwd=0.6) +
geom_hline(yintercept = -log10(pvalue),lty=4,col="black",lwd=0.6) +
# 坐标轴
labs(x="log2(fold change)",
y="-log10 (p-value)")+
theme_bw()+
# 图例
theme(plot.title = element_text(hjust = 0.5),
legend.position = c(0.85,0.85),
legend.title = element_blank(),
legend.background = element_blank(),
legend.box.background = element_blank(),
text = element_text(size = 8),
)
von_1
# ggsave(von_1,filename = str_c(".\\plots\\",gse_id,"_volcano_padj.pdf"),
# width = 9,height = 9,units = "cm")
# 定义显示的标记基因
# 将需要标记的基因放置在label列
# 这里设置logFC值大于3的差异基因来标记
# !!!需要注意的是标记的基因不能太多,Rstudio容易卡死,提前用summary()查看
# P.value或adj.P.Val需要根据情况修改,包括pvalue或padj
diff$label = ifelse(diff$P.Value < pvalue & abs(diff$logFC) >=3, as.character(rownames(diff)),"")
von_labeled <- von_1+geom_text_repel(data = diff, aes(x = logFC,
y = -log10(P.Value),
label = label),
size = 2,box.padding = unit(0.5, "lines"),
point.padding = unit(0.8, "lines"),
segment.color = "black",
show.legend = FALSE)
von_labeled
ggsave(von_labeled,
width = 9,height = 9,units = "cm",
filename = str_c(".\\plots\\",gse_id,"_volcano_padj_labeled.pdf"))
8.绘制热图
##-----------------------------------------heatmap plot-----------------------------------------
library(pheatmap)
## 设定差异基因阈值,减少差异基因用于提取表达矩阵
allDiff_pair=topTable(fit2,adjust='BH',coef= 1,number=Inf,p.value=0.05,lfc =1)
##提前部分数据用作热图绘制
heatdata <- exprSet[rownames(diffSig),]
top100 <- heatdata
##制作一个分组信息用于注释,行名是标本号,值是分组
annotation_col <- data.frame(group_list)
rownames(annotation_col) <- colnames(exprSet)
##如果注释出界,可以通过调整格子比例和字体修正
heatmap <- pheatmap(top100, #热图的数据
cluster_rows = T,#行聚类
cluster_cols = TRUE,#列聚类,可以看出样本之间的区分度
annotation_col = annotation_col, #标注样本分类
annotation_legend = TRUE, # 显示注释
show_rownames = T,# 显示行名
show_colnames = T,# 显示列名
scale = "row", #以行来标准化,这个功能很不错
color = colorRampPalette(c("navy", "white", "firebrick3"))(100),
border_color = FALSE,
fontsize = 7,
legend = T)
dim(top100)
ggsave(heatmap,filename = str_c(".\\plots\\",gse_id,"_deg_heatmap.pdf"),
width = 13,height = 9,units = "cm")
网友评论