美文网首页
PCA 和火山图

PCA 和火山图

作者: 余绕 | 来源:发表于2022-12-20 22:56 被阅读0次

1.PCA

  1. 加载R包
BiocManager::install("PCAtools")

library(PCAtools)
library(tidyverse)
```

#读取数据
```
gene_expression<-read.table(file="rnaseq-apple/gene_exp.txt",header = T)
head(gene_expression)
    BLO_S1_LD1 BLO_S1_LD2 BLO_S1_LD3 BLO_S2_LD1 BLO_S2_LD2 BLO_S2_LD3 BLO_S3_LD1 BLO_S3_LD2 BLO_S3_LD3 BLO_S4_LD1 BLO_S4_LD2 BLO_S4_LD3 KID_S1_LD1 KID_S1_LD2 KID_S1_LD3
HF36249      0.000      0.000      0.000      0.000      0.000      0.000      0.000      0.000      0.000      0.000      0.000      0.000      0.000      0.000      0.000
HF06454      0.000      0.000      0.000      0.000      0.000      0.000      0.000      0.000      0.000      0.000      0.000      0.000      0.000      0.000      5.231
HF31517     54.105     54.628     72.646     34.335     66.884     70.510     28.292     22.236     61.755     24.227     22.919     16.880     63.790     67.738     88.882
HF07207      0.000      0.000      0.000      0.000      0.000      0.000      0.000      0.000      0.000      0.000      0.000      0.000      0.000      0.000      0.000
HF16138      0.000      0.000      0.000      0.000      0.000      0.000      0.000      0.000      0.000      0.000      0.000      0.000      0.000      0.000      0.000
HF24764     37.845     31.335     32.200     51.658     33.046     40.531     26.209     36.140     28.605     25.200     35.759     14.814     37.978     44.917     42.374
        KID_S2_LD1 KID_S2_LD2 KID_S2_LD3 KID_S3_LD1 KID_S3_LD2 KID_S3_LD3 KID_S4_LD1 KID_S4_LD2 KID_S4_LD3
HF36249      0.000      0.000      0.000      0.000      0.000      0.000      0.000      0.000      0.000
HF06454      0.000      0.000      0.000      0.000      0.000      0.000      0.000      0.000      0.000
HF31517     75.407     55.568     73.206      7.496     29.949     31.430     24.733     11.832     15.213
HF07207      0.000      0.000      0.000      0.000      0.000      0.000      0.000      0.000      0.000
HF16138      0.000      0.000      0.000      0.000      0.000      0.000      0.000      0.000      0.000
HF24764     37.583     41.682     23.796     27.777     32.126     35.417     20.903     29.229     19.286

samole_infor<-read.table(file="rnaseq-apple/sample_info.txt",header = T,row.names = 1)
head(samole_infor)
           strain stage Antho Cy.3.Gal Fruit_weight
BLO_S1_LD1    BLO    S1   3.0      3.0           40
BLO_S1_LD2    BLO    S1   3.0      3.0           40
BLO_S1_LD3    BLO    S1   3.0      3.0           40
BLO_S2_LD1    BLO    S2  12.2     12.2           90
BLO_S2_LD2    BLO    S2  12.2     12.2           90
BLO_S2_LD3    BLO    S2  12.2     12.2           90

tail(samole_infor)
           strain stage Antho Cy.3.Gal Fruit_weight
KID_S3_LD1    KID    S3 137.2    115.9          144
KID_S3_LD2    KID    S3 137.2    115.9          144
KID_S3_LD3    KID    S3 137.2    115.9          144
KID_S4_LD1    KID    S4 406.7    346.3          165
KID_S4_LD2    KID    S4 406.7    346.3          165
KID_S4_LD3    KID    S4 406.7    346.3          165

计算PCA

pac<-pca(gene_expression,metadata=samole_infor)
rotated<-pac$rotated
> head(rotated)
                 PC1       PC2        PC3         PC4        PC5       PC6       PC7       PC8        PC9        PC10       PC11       PC12      PC13      PC14       PC15
BLO_S1_LD1 -49469.14  15057.16  -454.5192   8411.3724  7552.6960 -1220.861  -443.726 -466.1595   310.5622  1217.96952 -1947.7771  824.33618 -531.3273  522.7237  -96.24148
BLO_S1_LD2 -45633.87  15543.93 13245.5139 -10940.2559 -3788.9435  7916.307 -3299.722 -165.4762  1312.4755  -258.09290   105.2893  -99.42380  228.7940  207.9851  446.01438
BLO_S1_LD3 -46717.66  20209.53  8234.3625   3985.0197 11576.3309  1204.673 -1103.596  525.1471  -226.3431  -891.70579  1045.9973 -631.17628  257.8611 -181.2148 -167.83820
BLO_S2_LD1 -13144.99 -30803.63 16205.3486  11338.7275 -5255.6342  2002.183 -2341.521 1178.7140 -1076.0024  -974.26857  -360.8252  -15.80409 -158.5351 -263.9794 -663.84528
BLO_S2_LD2 -12322.07 -14888.44  5275.2716   -818.2086   614.6884  1486.736  3250.971 -677.2021   534.4907    43.48976  1893.9821 1371.67688 -792.4302  450.5443   84.75676
BLO_S2_LD3 -15290.96 -14108.23 -4526.2833  -7139.3094  4992.1368  2273.598  3079.258  494.1480  -496.2651 -1215.31600  -380.2841 1003.34225 -559.9832 -282.6319 -754.21886
                  PC16       PC17      PC18      PC19       PC20       PC21       PC22      PC23         PC24
BLO_S1_LD1  439.564694 -633.14314  310.2185  676.8497 -150.60109 -319.31067  110.00396  253.8511 6.883389e-11
BLO_S1_LD2  164.993336  -52.55215  345.6621  491.3560  -16.93068 -348.21336 -100.91814 -348.5413 6.883389e-11
BLO_S1_LD3 -619.697283  537.75110 -395.9370 -893.2291  106.45542  151.39062  -27.51704  181.5122 6.883389e-11
BLO_S2_LD1    3.851858  -39.14639  412.5201  -47.6036   37.82040   17.66335 -135.07773 -130.2348 6.883389e-11
BLO_S2_LD2  335.251504  510.19947 -496.5087  660.7700  370.71522  174.25787  629.00796  646.1405 6.883389e-11
BLO_S2_LD3  485.540092 -725.04285 -723.8823 -465.4147 -176.28882 -361.28678 -264.05864 -754.1549 6.883389e-11

ggplot2绘图

整理数据

1.去掉行名

rotated_tidy<-rownames_to_column(rotated,var="sample_name")
head(rotated_tidy)
  sample_name       PC1       PC2        PC3         PC4        PC5       PC6       PC7       PC8        PC9        PC10       PC11       PC12      PC13      PC14       PC15
1  BLO_S1_LD1 -49469.14  15057.16  -454.5192   8411.3724  7552.6960 -1220.861  -443.726 -466.1595   310.5622  1217.96952 -1947.7771  824.33618 -531.3273  522.7237  -96.24148
2  BLO_S1_LD2 -45633.87  15543.93 13245.5139 -10940.2559 -3788.9435  7916.307 -3299.722 -165.4762  1312.4755  -258.09290   105.2893  -99.42380  228.7940  207.9851  446.01438
3  BLO_S1_LD3 -46717.66  20209.53  8234.3625   3985.0197 11576.3309  1204.673 -1103.596  525.1471  -226.3431  -891.70579  1045.9973 -631.17628  257.8611 -181.2148 -167.83820
4  BLO_S2_LD1 -13144.99 -30803.63 16205.3486  11338.7275 -5255.6342  2002.183 -2341.521 1178.7140 -1076.0024  -974.26857  -360.8252  -15.80409 -158.5351 -263.9794 -663.84528
5  BLO_S2_LD2 -12322.07 -14888.44  5275.2716   -818.2086   614.6884  1486.736  3250.971 -677.2021   534.4907    43.48976  1893.9821 1371.67688 -792.4302  450.5443   84.75676
6  BLO_S2_LD3 -15290.96 -14108.23 -4526.2833  -7139.3094  4992.1368  2273.598  3079.258  494.1480  -496.2651 -1215.31600  -380.2841 1003.34225 -559.9832 -282.6319 -754.21886
         PC16       PC17      PC18      PC19       PC20       PC21       PC22      PC23         PC24
1  439.564694 -633.14314  310.2185  676.8497 -150.60109 -319.31067  110.00396  253.8511 6.883389e-11
2  164.993336  -52.55215  345.6621  491.3560  -16.93068 -348.21336 -100.91814 -348.5413 6.883389e-11
3 -619.697283  537.75110 -395.9370 -893.2291  106.45542  151.39062  -27.51704  181.5122 6.883389e-11
4    3.851858  -39.14639  412.5201  -47.6036   37.82040   17.66335 -135.07773 -130.2348 6.883389e-11
5  335.251504  510.19947 -496.5087  660.7700  370.71522  174.25787  629.00796  646.1405 6.883389e-11
6  485.540092 -725.04285 -723.8823 -465.4147 -176.28882 -361.28678 -264.05864 -754.1549 6.883389e-11

samole_infor_tidy<-rownames_to_column(samole_infor,var="sample_name")
head(samole_infor_tidy)
 sample_name strain stage Antho Cy.3.Gal Fruit_weight
1  BLO_S1_LD1    BLO    S1   3.0      3.0           40
2  BLO_S1_LD2    BLO    S1   3.0      3.0           40
3  BLO_S1_LD3    BLO    S1   3.0      3.0           40
4  BLO_S2_LD1    BLO    S2  12.2     12.2           90
5  BLO_S2_LD2    BLO    S2  12.2     12.2           90

解释度 比如PC1解释度为8.224552e+01,PC2为1.023421e+01等

pac$variance
        PC1          PC2          PC3          PC4          PC5          PC6          PC7          PC8          PC9         PC10         PC11         PC12         PC13 
8.224552e+01 1.023421e+01 3.175249e+00 1.454057e+00 1.141373e+00 8.234302e-01 3.700784e-01 1.207162e-01 9.389095e-02 7.164297e-02 4.812836e-02 4.039241e-02 2.888659e-02 
        PC14         PC15         PC16         PC17         PC18         PC19         PC20         PC21         PC22         PC23         PC24 
2.386747e-02 2.170546e-02 1.904344e-02 1.556694e-02 1.426426e-02 1.313013e-02 1.249111e-02 1.190933e-02 1.082498e-02 9.622137e-03 2.797783e-28
解释度绘图
screeplot(pac)
image.png

合数据加注释,好进一步利用ggplot画图

pca_results<-left_join(rotated_tidy,samole_infor_tidy,by ="sample_name") #俩名字一样一个即可

自定义形状绘图

ggplot(data = pca_results,aes(x=pca_results$PC1,pca_results$PC2))+
  geom_point(size=8,aes(shape=strain,fill=stage))+
  scale_shape_manual(values = 21:22)+theme_bw()+
  stat_ellipse(aes(color=stage))+          #加圆圈
  labs(x="PC1(82% variance explained)", y="PC2(10% variance explained)")
image.png

scale_shape_manual(values = 21:22)#选取点的形状,具体每个数字对应的点的形状见下图:

image.png

2火山图

读取书
de_result <- read_excel("de_result.xlsx", 
                         col_types = c("numeric", "text", "text", 
                                       "text", "text", "numeric", "numeric", 
                                        "numeric", "numeric", "text", "text", 
                                         "text", "text", "text", "text", "text", 
                                         "text", "text"))

展示列名,方便后面画图用

colnames(de_result)
 [1] "Rank"           "GENE_NAME"      "EntrezId"       "YeastOrtholog"  "Description"    "log2FoldChange" "lfcSE"         
 [8] "pvalue"         "padj"           "ORF19_ID"       "A21_region"     "ASSEMBLY22_ID"  "A22_A_region"   "A22_B_ID"      
[15] "A22_B_region"   "CGDID"          "CGD_Primary_ID" "KEGG_ID" 

加注释(上下调基因)

res<-select(de_result,Rank,GENE_NAME,log2FoldChange,pvalue) %>% mutate(direction=if_else(pvalue > 0.05| abs(log2FoldChange)<1,"NS", if_else(log2FoldChange>=1,"UP","DOWN")))

选取子集,用于后续加标签的基因

selected_genes<-c('FMP27','ERG251','NOT5','ERG3','MVB12','ERG28','ERG25','ERG27')

reselected<-filter(res,GENE_NAME%in%selected_genes)

绘图

library(ggrepel)
ggplot(data=res,aes(x=log2FoldChange,y=-log10(pvalue)))+
  geom_point(size=3,aes(color=direction),show.legend = F)+
  geom_point(data = reselected,size=3,shape=21,stroke=3,color=“yellow”)+对加标签的点加用21号圈突出出来,stroke=1是指的黑圈圈的线条粗细
  geom_text_repel(data=reselected,aes(label=GENE_NAME))+ #加标签
  geom_hline(yintercept = -log10(0.05),linetype="dotdash",color='gray')+ 
  geom_vline(xintercept = c(-1,1),linetype="dotdash",color='gray')+
  scale_color_manual(values = c("blue","gray","red"))+
  theme_bw()
image.png

相关文章

网友评论

      本文标题:PCA 和火山图

      本文链接:https://www.haomeiwen.com/subject/ulipqdtx.html