美文网首页单细胞学习
gtex数据下载和整理

gtex数据下载和整理

作者: 小洁忘了怎么分身 | 来源:发表于2023-01-26 08:42 被阅读0次

    数据下载

    官网访问有障碍,直接去xena吧。

    1.读取表达矩阵

    rm(list = ls())
    dat = data.table::fread("gtex_RSEM_gene_tpm.gz",data.table = F)
    dat[1:4,1:4]
    
    ##               sample GTEX-S4Q7-0003-SM-3NM8M GTEX-QV31-1626-SM-2S1QC
    ## 1  ENSG00000242268.2                 -3.4580                 -9.9658
    ## 2  ENSG00000259041.1                 -9.9658                 -9.9658
    ## 3  ENSG00000270112.3                 -3.6259                 -2.1779
    ## 4 ENSG00000167578.16                  4.5988                  4.6294
    ##   GTEX-13QIC-0011-R1a-SM-5O9CJ
    ## 1                      -9.9658
    ## 2                      -9.9658
    ## 3                      -1.8314
    ## 4                       6.4989
    
    library(tidyverse)
    exp = column_to_rownames(dat,"sample") %>% as.matrix()
    rownames(exp) = rownames(exp) %>% str_split("\\.",simplify = T) %>% .[,1]
    exp[1:4,1:4]
    
    ##                 GTEX-S4Q7-0003-SM-3NM8M GTEX-QV31-1626-SM-2S1QC
    ## ENSG00000242268                 -3.4580                 -9.9658
    ## ENSG00000259041                 -9.9658                 -9.9658
    ## ENSG00000270112                 -3.6259                 -2.1779
    ## ENSG00000167578                  4.5988                  4.6294
    ##                 GTEX-13QIC-0011-R1a-SM-5O9CJ GTEX-ZPCL-0126-SM-4WWC8
    ## ENSG00000242268                      -9.9658                 -9.9658
    ## ENSG00000259041                      -9.9658                 -9.9658
    ## ENSG00000270112                      -1.8314                 -9.9658
    ## ENSG00000167578                       6.4989                  5.5358
    
    # 转换行名
    library(AnnoProbe)
    library(tinyarray)
    an = annoGene(rownames(exp),ID_type = "ENSEMBL")
    exp = trans_array(exp,ids = an,from = "ENSEMBL",to = "SYMBOL")
    exp[1:4,1:4]
    
    ##             GTEX-S4Q7-0003-SM-3NM8M GTEX-QV31-1626-SM-2S1QC
    ## DDX11L1                     -5.0116                 -9.9658
    ## WASH7P                       4.4283                  3.8370
    ## MIR6859-1                   -9.9658                 -9.9658
    ## MIR1302-2HG                 -9.9658                 -9.9658
    ##             GTEX-13QIC-0011-R1a-SM-5O9CJ GTEX-ZPCL-0126-SM-4WWC8
    ## DDX11L1                          -9.9658                 -9.9658
    ## WASH7P                            3.5863                  3.6793
    ## MIR6859-1                        -9.9658                 -9.9658
    ## MIR1302-2HG                      -9.9658                 -9.9658
    

    2. 读取注释信息

    clinical = data.table::fread("GTEX_phenotype.gz")
    table(clinical$`_primary_site`)
    
    ## 
    ##  <not provided>  Adipose Tissue   Adrenal Gland         Bladder           Blood 
    ##               5             621             161              13             595 
    ##    Blood Vessel     Bone Marrow           Brain          Breast    Cervix Uteri 
    ##             753             102            1426             221              11 
    ##           Colon       Esophagus  Fallopian Tube           Heart          Kidney 
    ##             384             805               7             493              38 
    ##           Liver            Lung          Muscle           Nerve           Ovary 
    ##             141             381             478             335             112 
    ##        Pancreas       Pituitary        Prostate  Salivary Gland            Skin 
    ##             203             126             122              71             977 
    ## Small Intestine          Spleen         Stomach          Testis         Thyroid 
    ##             106             121             209             208             366 
    ##          Uterus          Vagina 
    ##              93              99
    
    clinical = clinical[clinical$`_primary_site`!="<not provided>",]
    colnames(clinical)[3] = "site"
    

    3.表达矩阵和临床信息对应起来

    s = intersect(colnames(exp),clinical$Sample)
    clinical = clinical[match(s,clinical$Sample),]
    exp = exp[,s]
    identical(clinical$Sample,colnames(exp))
    
    ## [1] TRUE
    
    exp[1:4,1:4]
    
    ##             GTEX-S4Q7-0003-SM-3NM8M GTEX-QV31-1626-SM-2S1QC
    ## DDX11L1                     -5.0116                 -9.9658
    ## WASH7P                       4.4283                  3.8370
    ## MIR6859-1                   -9.9658                 -9.9658
    ## MIR1302-2HG                 -9.9658                 -9.9658
    ##             GTEX-13QIC-0011-R1a-SM-5O9CJ GTEX-ZPCL-0126-SM-4WWC8
    ## DDX11L1                          -9.9658                 -9.9658
    ## WASH7P                            3.5863                  3.6793
    ## MIR6859-1                        -9.9658                 -9.9658
    ## MIR1302-2HG                      -9.9658                 -9.9658
    

    4. 单基因表达量画图

    library(dplyr)
    #"METTL3","SETD2","TP53"
    g = "METTL3"
    pdat = cbind(gene = exp[g,],clinical[,c(1,3)])
    library(tidyr)
    pdat = drop_na(pdat,site)
    su = group_by(pdat,site) %>% 
      summarise(a = median(gene)) %>% 
      arrange(desc(a))
    pdat$site = factor(pdat$site,levels = su$site)
    library(ggplot2)
    library(RColorBrewer)
    mypalette <- colorRampPalette(brewer.pal(8,"Set1"))
    ggplot(pdat,aes(x = site,y = gene,fill = site))+
      geom_boxplot()+
      theme_bw()+
      theme(axis.text.x = element_text(vjust = 1,hjust = 1,angle = 70),legend.position = "bottom")+
      scale_fill_manual(values = mypalette(31))+
      guides (fill=guide_legend (nrow=3, byrow=TRUE))
    
    image.png

    5.两个基因的相关性图

    g2 = "SETD2"
    pdat2 = cbind(cbind(gene1 = exp[g,],
                        gene2 = exp[g2,],
                        clinical[,c(1,3)]))
    pdat2 = drop_na(pdat2,site)
    k = count(pdat2,site) %>% 
      filter(n>2)
    pdat2 = filter(pdat2,site %in% k$site)
    ggplot(pdat2,aes(gene1,gene2))+
      geom_point()+
      geom_smooth(method=lm)+
      geom_rug()+
      theme_minimal()+
      labs(x = g,y = g2)+
      facet_wrap(~site,scales = "free")
    
    image.png

    6.炫酷的泛癌相关性图

    横纵坐标分别是-log10pvalue和相关系数r

    a <- cor.test(exp[g,],exp[g2,])
    a$estimate
    
    ##       cor 
    ## 0.9251489
    
    a$p.value
    
    ## [1] 0
    
    b = pdat2 %>% 
      group_by(site) %>% 
      summarise(cor = cor.test(gene1,gene2)$estimate,
                nlogp = -log10(cor.test(gene1,gene2)$p.value))
    head(b)
    
    ## # A tibble: 6 × 3
    ##   site             cor   nlogp
    ##   <chr>          <dbl>   <dbl>
    ## 1 Adipose Tissue 0.640  59.9  
    ## 2 Adrenal Gland  0.972  80.6  
    ## 3 Bladder        0.344   0.438
    ## 4 Blood          0.948 221\.   
    ## 5 Blood Vessel   0.883 200\.   
    ## 6 Bone Marrow    0.402   3.25
    
    library(ggplot2)
    options(scipen = 5)
    # 横坐标起始位置不可以设为0,因为这是对数坐标系,log(0)无意义。
    # 统一设置0.01或者0.1这样,会使一些p值太大的点无法呈现在图上。因此,根据-logp最小值计算起始坐标。
    blm = 10^-str_count(min(b$nlogp),"0");blm
    
    ## [1] 0.1
    
    bk = 10^((-str_count(min(b$nlogp),"0")):3);bk
    
    ## [1]    0.1    1.0   10.0  100.0 1000.0
    
    ggplot(b,aes(nlogp,cor))+
      geom_point(size=6,color="#bf77b6")+
      scale_y_continuous(expand = c(0,0),limits = c(-1,1),breaks = seq(-1,1,0.2))+
      scale_x_log10(expand = c(0,0),limits = c(blm, 1000),breaks = bk)+
      geom_hline(yintercept = 0)+
      geom_vline(xintercept = -log10(0.05))+
      theme_bw()
    
    image.png

    相关文章

      网友评论

        本文标题:gtex数据下载和整理

        本文链接:https://www.haomeiwen.com/subject/zjnjhdtx.html