1、log2转换判断执行
# 差异分析前,判断是否需要LOG2转换------------------------------------------
ex <- exprSet
qx <- as.numeric(quantile(ex, c(0., 0.25, 0.5, 0.75, 0.99, 1.0), na.rm=T))
LogC <- (qx[5] > 100) ||
(qx[6]-qx[1] > 50 && qx[2] > 0) ||
(qx[2] > 0 && qx[2] < 1 && qx[4] > 1 && qx[4] < 2)
if (LogC) { ex[which(ex <= 0)] <- NaN
exprSet <- log2(ex)
print("log2 transform finished")}else{print("log2 transform not needed")}
2、差异分析结果标记(1)
# 定义加上下调标记函数,适用于limma包结果
add_sign_p <- function(df,lfc = 2,p = 0.05){
df <- df %>% mutate(sign = dplyr::case_when(df$P.Value < p & df$logFC> lfc ~ "up",
df$P.Value < p & df$logFC< -lfc ~ "down",
TRUE ~ "stable")
)
return(df)
}
add_sign <- function(df,lfc = 2,padj = 0.05){
df <- df %>% mutate(sign = dplyr::case_when(df$adj.P.Val < padj & df$logFC> lfc ~ "up",
df$adj.P.Val < padj & df$logFC< -lfc ~ "down",
TRUE ~ "stable"
)
)
return(df)
}
diff_padj <- add_sign(df = diff_1,lfc = 1,padj = 0.05)
table(diff_padj$sign)
diffSig_padj <- diff_padj %>% filter(sign == "down" | sign == "up")
3、差异分析结果标记(2)
# 定义加上下调标记函数,适用于DEseq2包结果
add_sign_p <- function(df){
df <- df %>% mutate(sign = dplyr::case_when(df$pvalue < p_cutoff & df$log2FoldChange> logFC_cutoff ~ "up",
df$pvalue < p_cutoff & df$log2FoldChange< -logFC_cutoff ~ "down",
TRUE ~ "stable")
)
return(df)
}
add_sign <- function(df){
df <- df %>% mutate(sign = dplyr::case_when(df$padj < p_cutoff & df$log2FoldChange> logFC_cutoff ~ "up",
df$padj < p_cutoff & df$log2FoldChange< -logFC_cutoff ~ "down",
TRUE ~ "stable")
)
return(df)
}
4.表达矩阵PCA绘图
library(FactoMineR)
library(factoextra)
array_qc_pcaplot <- function(exprSet,group_list){
dat.pca <- PCA(as.data.frame(t(exprSet)), graph = FALSE)
fviz_pca_ind(dat.pca,
geom.ind = "point", # show points only (nbut not "text")
col.ind = group_list, # color by groups
# palette = c("#00AFBB", "#E7B800"), #修改颜色
addEllipses = TRUE, # Concentration ellipses
# legend.title = "GROUP"
) + theme(
legend.position = c(0.80, 1),
text = element_text(size = 8),
legend.key.size = unit(3, "mm"),
legend.title = element_blank()
)
}
5.基因注释
# 定义注释函数
# 两个参数:exprSet:表达矩阵,行名是探针
# probe2symbol:注释文件,第一列探针,第二列sybol
annot <- function(exprSet,probe2symbol_df){
# 开始注释
colnames(probe2symbol_df) <- c("probe_id","symbol")
exprSet <- as.data.frame(exprSet) #change express matrix to dataframe
exprSet$probe_id <- rownames(exprSet) #make a new column of probe_id by rowname
exprSet$probe_id <- as.character(exprSet$probe_id)
#match probe_id and gene symbol
exprSet <- exprSet %>%
inner_join(probe2symbol_df,by="probe_id") %>% #合并探针的信息
dplyr::select(-probe_id) %>% #去掉多余信息
dplyr::select(symbol, everything()) %>% #重新排列,
mutate(rowMean =rowMeans(.[grep("GSM", names(.))])) %>% #求出平均数(这边的.真的是画龙点睛)
arrange(desc(rowMean)) %>% #把表达量的平均值按从大到小排序
distinct(symbol,.keep_all = T) %>% # symbol留下第一个
dplyr::select(-rowMean) %>% #反向选择去除rowMean这一列
tibble::column_to_rownames(colnames(.)[1]) # 把第一列变成行名并删除
return(exprSet)
}
6.limma包差异基因分析
#表达矩阵:exprSet
#分组列表:group_list
# 1.制作design
group_list <- pdata$group
design <- model.matrix(~0+factor(group_list))
# design <- model.matrix(~group)
colnames(design) <- levels(factor(group_list))
# 2.制作比较矩阵
# makeContrasts里边组的顺序会影响结果
# 根据目前资料,实验组写左,对照组写右
group_list
cont.matrix<-makeContrasts(T2D-control,levels = design)
cont.matrix
#做完后实验组是1,对照组是-1,此处注释有待分析
# 定义DEG函数,适用于array,limma包
deg_limma <- function(exprSet,design = design,cont.matrix = cont.matrix){
fit <- lmFit(exprSet_4,design)
fit2=contrasts.fit(fit,cont.matrix)
fit2 <- eBayes(fit2) ## default no trend !!!
##eBayes() with trend=TRUE
#通过修改p.value,改变输出基因,过小可能无输出。实际影响的是adj.p,输出基因的不同可能影响火山图的起点(根部)
tempOutput = topTable(fit2,coef=1,n=Inf,adjust="BH")
#tempOutput1 = topTable(fit2,coef=1,n=Inf,adjust="fdr",p.value = 0.05)
nrDEG = na.omit(tempOutput)
return(nrDEG)
}
diff_1 <- deg_limma(exprSet = exprSet,
design = design,
cont.matrix = cont.matrix)
网友评论