ccle数据下载和整理

作者: 小洁忘了怎么分身 | 来源:发表于2023-01-24 09:32 被阅读0次

    0.数据下载

    网址:https://sites.broadinstitute.org/ccle

    依次点击 Datasets → Broad DepMap Portal → CCLE 2019 → All Files,下载 TPM 表达矩阵文件(CCLE_RNAseq_rsem_genes_tpm_20180929.txt.gz)和细胞系注释文件(Cell_lines_annotations_20181226.txt)。

    1.读取表达矩阵

    # Load the CCLE RSEM gene-level TPM table as a plain data.frame.
    # The original `rm(list = ls())` was removed: silently wiping the caller's
    # workspace is a side effect a script should not impose (restart R if a
    # clean session is wanted).
    dat <- data.table::fread(
      "CCLE_RNAseq_rsem_genes_tpm_20180929.txt.gz",
      data.table = FALSE # spelled out: `F` is an ordinary, reassignable variable
    )
    # Peek at the top-left corner: gene_id, transcript_ids, then one column
    # per cell line.
    dat[1:4, 1:4]
    
    ##              gene_id
    ## 1 ENSG00000000003.10
    ## 2  ENSG00000000005.5
    ## 3  ENSG00000000419.8
    ## 4  ENSG00000000457.9
    ##                                                                                                                  transcript_ids
    ## 1                                                                         ENST00000373020.4,ENST00000494424.1,ENST00000496771.1
    ## 2                                                                                           ENST00000373031.4,ENST00000485971.1
    ## 3 ENST00000371582.4,ENST00000371583.5,ENST00000371584.4,ENST00000371588.5,ENST00000413082.1,ENST00000466152.1,ENST00000494752.1
    ## 4                                     ENST00000367770.1,ENST00000367771.6,ENST00000367772.4,ENST00000423670.1,ENST00000470238.1
    ##   22RV1_PROSTATE 2313287_STOMACH
    ## 1           5.28            7.01
    ## 2           0.00            0.00
    ## 3          73.38          108.99
    ## 4           9.76           16.76
    
    # Keep only the per-sample TPM columns; the first two columns of `dat`
    # are gene_id and transcript_ids.
    exp <- as.matrix(dat[, -(1:2)])
    library(stringr)
    # Strip the version suffix ("ENSG00000000003.10" -> "ENSG00000000003") by
    # keeping the part before the first dot. `TRUE` replaces the reassignable `T`.
    rownames(exp) <- str_split(dat$gene_id, "\\.", simplify = TRUE)[, 1]
    # log2(TPM + 1): the +1 keeps zero-expression entries at 0.
    exp <- log2(exp + 1)
    exp[1:4, 1:4]
    
    ##                 22RV1_PROSTATE 2313287_STOMACH 253JBV_URINARY_TRACT
    ## ENSG00000000003       2.650765        3.001802             4.572890
    ## ENSG00000000005       0.000000        0.000000             0.000000
    ## ENSG00000000419       6.216843        6.781229             5.845741
    ## ENSG00000000457       3.427606        4.150560             1.839960
    ##                 253J_URINARY_TRACT
    ## ENSG00000000003           4.577731
    ## ENSG00000000005           0.000000
    ## ENSG00000000419           5.535742
    ## ENSG00000000457           2.087463
    
    # Convert the row names from Ensembl gene IDs to gene symbols.
    library(AnnoProbe)
    library(tinyarray)
    # Annotation table for the Ensembl IDs currently used as row names.
    an <- annoGene(rownames(exp), ID_type = "ENSEMBL")
    # Re-key the matrix via the ENSEMBL -> SYMBOL columns of `an`.
    # NOTE(review): how unmapped or duplicated IDs are handled is decided
    # inside trans_array - confirm against the tinyarray documentation.
    exp <- trans_array(
      exp,
      ids = an,
      from = "ENSEMBL",
      to = "SYMBOL"
    )
    exp[seq_len(4), seq_len(4)]
    
    ##             22RV1_PROSTATE 2313287_STOMACH 253JBV_URINARY_TRACT
    ## DDX11L1          0.1634987       0.0000000           0.02856915
    ## WASH7P           4.5422580       4.1667154           3.79285535
    ## MIR1302-2HG      0.0000000       0.1505597           0.00000000
    ## FAM138A          0.0000000       0.0000000           0.95605665
    ##             253J_URINARY_TRACT
    ## DDX11L1              0.0000000
    ## WASH7P               3.5861642
    ## MIR1302-2HG          0.0000000
    ## FAM138A              0.5753123
    

    2. 读取注释信息

    # Read the tab-delimited cell-line annotation table.
    clinical <- read.delim("Cell_lines_annotations_20181226.txt")
    # Give columns 1 and 5 the short names used by the later steps.
    # presumably column 1 is the cell-line ID and column 5 the primary site -
    # TODO confirm against the file header
    names(clinical)[c(1, 5)] <- c("id", "site")
    

    3. 将表达矩阵与注释信息(即代码中的 clinical)按细胞系一一对应

    # Cell lines present both as expression columns and as annotation ids.
    a <- intersect(colnames(exp), clinical$id)
    # Restrict the matrix to those cell lines, in the order given by `a`.
    exp <- exp[, a]
    # Reorder the annotation rows to follow the same order.
    clinical <- clinical[match(a, clinical$id), ]
    # Sanity check: matrix columns and annotation rows must line up exactly.
    identical(colnames(exp), clinical$id)
    
    ## [1] TRUE
    

    4. 单基因表达量画图

    library(dplyr)
    library(tidyr)
    library(ggplot2)
    library(RColorBrewer)
    # Gene to plot; other examples: "METTL3", "SETD2", "TP53"
    g <- "METTL3"
    # One row per cell line: expression of `g` plus the id and site columns
    # (selected by name rather than by position).
    pdat <- cbind(gene = exp[g, ], clinical[, c("id", "site")])
    # Drop cell lines whose site is missing.
    pdat <- drop_na(pdat, site)
    # Rank sites by median expression, highest first, so the boxes appear
    # sorted along the x axis.
    su <- group_by(pdat, site) %>%
      summarise(a = median(gene)) %>%
      arrange(desc(a))
    pdat$site <- factor(pdat$site, levels = su$site)
    # Interpolate the Set1 colours to however many sites are present. The
    # original hard-coded 25, which makes scale_fill_manual() fail when there
    # are more sites than colours.
    mypalette <- colorRampPalette(brewer.pal(8, "Set1"))
    ggplot(pdat, aes(x = site, y = gene, fill = site)) +
      geom_boxplot() +
      theme_bw() +
      theme(
        axis.text.x = element_text(vjust = 1, hjust = 1, angle = 70),
        legend.position = "bottom"
      ) +
      scale_fill_manual(values = mypalette(nlevels(pdat$site))) +
      guides(fill = guide_legend(nrow = 3, byrow = TRUE))
    

    相关文章

      网友评论

        本文标题:ccle数据下载和整理

        本文链接:https://www.haomeiwen.com/subject/otuhhdtx.html