美文网首页
表达矩阵log后进行差异分析

表达矩阵log后进行差异分析

作者: 桃浪桃浪 | 来源:发表于2019-07-16 14:37 被阅读0次

关于limma包差异分析结果的logFC解释

http://www.bio-info-trainee.com/1209.html

首先,我们要明白,limma接受的输入参数就是一个表达矩阵,而且是log后的表达矩阵(以2为底)。

那么最后计算得到的logFC这一列的值,其实就是输入的表达矩阵中case一组的平均表达量减去control一组的平均表达量的值,那么就会有正负之分,代表了case相当于control组来说,该基因是上调还是下调。

我之前总是有疑问,明明是case一组的平均表达量和control一组的平均表达量差值呀,跟log foldchange没有什么关系呀。

后来,我终于想通了,因为我们输入的是log后的表达矩阵,那么case一组的平均表达量和control一组的平均表达量都是log了的,那么它们的差值其实就是log的foldchange

首先,我们要理解foldchange的意义,如果case是平均表达量是8,control是2,那么foldchange就是4,logFC就是2咯

那么在limma包里面,输入的时候case的平均表达量被log后是3,control是1,那么差值是2,就是说logFC就是2。

这不是巧合,只是一个很简单的数学公式log(x)-log(y)=log(x/y)

即FC(foldchange)=x/y
进行log处理后再做一次

> ###log处理
> exp_log <- log2(exp+1)
> ###挑出数据
> exp1 <-exp_log[,7:11]#即三组control,两组EGF 12
> ###查看数据
> dim(exp1)
[1] 31099     5
> dat1 <- as.data.frame(t(exp1))###转置列为基因名,行为样本
> dat1[1:4,1:4]
         1367452_at 1367453_at 1367454_at 1367455_at
GSM75272   14.07837   13.20202   14.19491   14.29942
GSM75273   14.11096   13.09994   13.97388   14.36622
GSM75274   14.27705   12.81412   14.30077   14.30550
GSM75275   13.82473   13.70316   14.69665   14.11910
> dim(exp1)
[1] 31099     5
> dim(dat1)
[1]     5 31099
> dat1 <- cbind(dat1,group_list_1)###将分组信息加入
> dim(dat1)
[1]     5 31100
> library(dplyr)
> ###PCA的统一操作走起
> library(FactoMineR)#画主成分分析图需要加载这两个包
Warning message:
程辑包‘FactoMineR’是用R版本3.6.1 来建造的 
> library(factoextra)
载入需要的程辑包:ggplot2
Welcome! Related Books: `Practical Guide To Cluster Analysis in R` at https://goo.gl/13EFCZ
> ###进行主成分分析
> dat1.pca <- PCA(dat1[,-ncol(dat1)], graph = FALSE)
> ###画PCA图
> fviz_pca_ind(dat1.pca,
+              geom.ind = "point", # show points only (nbut not "text")
+              col.ind = dat1$group_list_1, # color by groups
+              palette = c("#00AFBB", "#E7B800"),##https://www.114la.com/other/rgb.htm
+              addEllipses = TRUE, # Concentration ellipses
+              legend.title = "Groups"
+ )
Too few points to calculate an ellipse
Too few points to calculate an ellipse
> ?fviz_pca_ind
> ggsave('all_samples_PCA.png')
Saving 9.03 x 5.6 in image
Too few points to calculate an ellipse
Too few points to calculate an ellipse
PCA图

单个基因差异分析

> table(group_list_1)
group_list_1
control  EGF 12 
      3       2 
> exp1[1:4,1:4]
           GSM75272 GSM75273 GSM75274 GSM75275
1367452_at    14.08    14.11    14.28    13.82
1367453_at    13.20    13.10    12.81    13.70
1367454_at    14.19    13.97    14.30    14.70
1367455_at    14.30    14.37    14.31    14.12
> boxplot(exp1[1,]~group_list_1)
image.png
> boxplot(exp1[1,]~group_list_1)
> bp=function(g){         #定义一个函数g,函数为{}里的内容
+   library(ggpubr)
+   df=data.frame(gene=g,group=group_list_1)
+   p <- ggboxplot(df, x = "group", y = "gene",
+                  color = "group", palette = "jco",
+                  add = "jitter")
+   #  Add p-value
+   p + stat_compare_means(label.y = 8)
+ }
> bp(exp1[1,]) ## 调用上面定义好的函数,避免同样的绘图代码重复多次敲。
Loading required package: magrittr
image.png
> #差异分析,用limma包来做
> #需要表达矩阵和group_list,其他都不要动
> library(limma)
> design=model.matrix(~factor(group_list_1))
> fit=lmFit(exp1,design)
> fit=eBayes(fit)
> dim(fit)
[1] 31099     2
> #差异基因排名
> deg=topTable(fit,coef=2,number = Inf)
> head(deg)
              logFC AveExpr      t   P.Value adj.P.Val     B
1370019_at   -5.522  11.145 -34.63 1.736e-06   0.01261 5.299
1376711_at   -5.240  13.389 -32.54 2.274e-06   0.01261 5.170
1387958_at    6.431  10.722  28.22 4.221e-06   0.01261 4.835
1370387_at    5.767   9.032  27.89 4.438e-06   0.01261 4.805
1370982_at   -4.201   6.493 -27.84 4.477e-06   0.01261 4.800
1388054_a_at  4.544  10.726  27.78 4.521e-06   0.01261 4.794
> bp(exp1[rownames(deg)[1],])
image.png
> #为deg数据框添加几列
> #1.加probe_id列,把行名变成一列
> library(dplyr)
> deg <- mutate(deg,probe_id=rownames(deg))
> #tibble::rownames_to_column()
> head(deg)
   logFC AveExpr      t   P.Value adj.P.Val     B     probe_id
1 -5.522  11.145 -34.63 1.736e-06   0.01261 5.299   1370019_at
2 -5.240  13.389 -32.54 2.274e-06   0.01261 5.170   1376711_at
3  6.431  10.722  28.22 4.221e-06   0.01261 4.835   1387958_at
4  5.767   9.032  27.89 4.438e-06   0.01261 4.805   1370387_at
5 -4.201   6.493 -27.84 4.477e-06   0.01261 4.800   1370982_at
6  4.544  10.726  27.78 4.521e-06   0.01261 4.794 1388054_a_at
> #2.加symbol列,火山图要用
#id转换,查找芯片平台对应的包
http://www.bio-info-trainee.com/1399.html
 "GPL1355"###注释包为rat2302.db
[1] "GPL1355"
> library(rat2302.db)
> ls("package:rat2302.db")
 [1] "rat2302"              "rat2302.db"           "rat2302_dbconn"      
 [4] "rat2302_dbfile"       "rat2302_dbInfo"       "rat2302_dbschema"    
 [7] "rat2302ACCNUM"        "rat2302ALIAS2PROBE"   "rat2302CHR"          
[10] "rat2302CHRLENGTHS"    "rat2302CHRLOC"        "rat2302CHRLOCEND"    
[13] "rat2302ENSEMBL"       "rat2302ENSEMBL2PROBE" "rat2302ENTREZID"     
[16] "rat2302ENZYME"        "rat2302ENZYME2PROBE"  "rat2302GENENAME"     
[19] "rat2302GO"            "rat2302GO2ALLPROBES"  "rat2302GO2PROBE"     
[22] "rat2302MAPCOUNTS"     "rat2302ORGANISM"      "rat2302ORGPKG"       
[25] "rat2302PATH"          "rat2302PATH2PROBE"    "rat2302PFAM"         
[28] "rat2302PMID"          "rat2302PMID2PROBE"    "rat2302PROSITE"      
[31] "rat2302REFSEQ"        "rat2302SYMBOL"        "rat2302UNIGENE"      
[34] "rat2302UNIPROT"      
> ids <- toTable(rat2302SYMBOL)
> head(ids)
    probe_id symbol
1 1367452_at  Sumo2
2 1367453_at  Cdc37
3 1367454_at  Copb2
4 1367455_at    Vcp
5 1367457_at  Becn1
6 1367458_at Lypla2
#3.加change列:上调或下调,火山图要用
logFC_t <- 1 #不同的阈值,筛选到的差异基因数量就不一样,后面的超几何分布检验结果就大相径庭。
> change=ifelse(deg$P.Value>0.01,'stable', 
+               ifelse( deg$logFC >logFC_t,'up', 
+                       ifelse( deg$logFC < -logFC_t,'down','stable') )
+ )
> deg <- mutate(deg,change)
> table(deg$change)

  down stable     up 
  1254  18732    932 

火山图

plot(deg$logFC,-log10(deg$P.Value))
image.png
> library(ggpubr)
> ggscatter(dat1, x = "logFC", y = "v",size=0.5,color = "change")
image.png
> ggscatter(dat1, x = "logFC", y = "v", color = "change",size = 0.5,
+           label = "symbol", repel = T,
+           label.select = dat1$symbol[1:30] ,
+           #label.select = c('CD36','DUSP6'), #挑选一些基因在图中显示出来
+           palette = c("#00AFBB", "#999999", "#FC4E07") )
image.png

绘制热图

> library(pheatmap)
> table(deg$change)

  down stable     up 
  1254  18732    932 
> table(x=='up')

FALSE  TRUE 
19986   932 
> table(x[x=='up'])

 up 
932 
> cg <- names(x[x=='up'])

归一化画热图

> n=t(scale(t(exp1[cg,])))
> thr=2
> n[n>thr]=thr
> n[n< -thr]= -thr
> n[1:4,1:4]
             GSM75272 GSM75273 GSM75274 GSM75275
1387958_at    -0.6468  -0.7954  -0.7451    1.066
1370387_at    -0.6985  -0.8093  -0.6800    1.122
1388054_a_at  -0.6861  -0.7605  -0.7429    1.125
1388142_at    -0.7516  -0.6713  -0.7660    1.124
> pheatmap(n,show_colnames =F,show_rownames = F)
> #显示分组
> ac=data.frame(group=group_list_1)
> rownames(ac)=colnames(n)
> pheatmap(n,show_colnames =T,show_rownames = F,cluster_col = F,annotation_col = ac,color = colorRampPalette(c("navy", "white", "firebrick3"))(50))
#保存
>pdf(file = "heatmap.pdf")

相关文章

网友评论

      本文标题:表达矩阵log后进行差异分析

      本文链接:https://www.haomeiwen.com/subject/sclzkctx.html