ccle数据下载和整理

作者: 小洁忘了怎么分身 | 来源:发表于2023-01-24 09:32 被阅读0次

ccle数据下载和整理
Depmap（CCLE）数据库使用介绍
CCLE数据提取
gtex数据下载和整理
GDCRNATools的安装与使用---TCGA数据下载与分析工
用MCP法计算TCGA样本中的免疫浸润
TCGA数据挖掘（3）：miRNA数据的下载
2019-12-06
GTF（gene transfer format）-Go To
TCGA数据下载整理

0.数据下载

网址：https://sites.broadinstitute.org/ccle

依次进入 Datasets → Broad DepMap Portal → CCLE 2019 → All Files，下载 TPM 表达矩阵（CCLE_RNAseq_rsem_genes_tpm_20180929.txt.gz）和细胞系注释文件（Cell_lines_annotations_20181226.txt）。

1.读取表达矩阵

# Read the gzip-compressed CCLE RSEM gene-level TPM table.
# `data.table = FALSE` makes fread() return a plain data.frame, so the base-R
# indexing used in the following steps behaves as expected.
# NOTE: the workspace is intentionally not wiped with rm(list = ls()) -- that
# would silently delete the reader's own objects. Restart the R session if a
# clean environment is needed.
dat <- data.table::fread(
  "CCLE_RNAseq_rsem_genes_tpm_20180929.txt.gz",
  data.table = FALSE
)
# Peek at the first rows/columns: gene_id, transcript_ids, then sample columns
dat[1:4, 1:4]

##              gene_id
## 1 ENSG00000000003.10
## 2  ENSG00000000005.5
## 3  ENSG00000000419.8
## 4  ENSG00000000457.9
##                                                                                                                  transcript_ids
## 1                                                                         ENST00000373020.4,ENST00000494424.1,ENST00000496771.1
## 2                                                                                           ENST00000373031.4,ENST00000485971.1
## 3 ENST00000371582.4,ENST00000371583.5,ENST00000371584.4,ENST00000371588.5,ENST00000413082.1,ENST00000466152.1,ENST00000494752.1
## 4                                     ENST00000367770.1,ENST00000367771.6,ENST00000367772.4,ENST00000423670.1,ENST00000470238.1
##   22RV1_PROSTATE 2313287_STOMACH
## 1           5.28            7.01
## 2           0.00            0.00
## 3          73.38          108.99
## 4           9.76           16.76

# Drop the first two columns (gene_id, transcript_ids) and keep the remaining
# sample columns as a numeric matrix.
exp <- as.matrix(dat[, -(1:2)])
library(stringr)
# Use the part of gene_id before the first "." as the row name
# (e.g. "ENSG00000000003.10" -> "ENSG00000000003").
rownames(exp) <- str_split(dat$gene_id, "\\.", simplify = TRUE)[, 1]
# log2(x + 1) transform; the +1 keeps zero values at 0
exp <- log2(exp + 1)
exp[1:4, 1:4]

##                 22RV1_PROSTATE 2313287_STOMACH 253JBV_URINARY_TRACT
## ENSG00000000003       2.650765        3.001802             4.572890
## ENSG00000000005       0.000000        0.000000             0.000000
## ENSG00000000419       6.216843        6.781229             5.845741
## ENSG00000000457       3.427606        4.150560             1.839960
##                 253J_URINARY_TRACT
## ENSG00000000003           4.577731
## ENSG00000000005           0.000000
## ENSG00000000419           5.535742
## ENSG00000000457           2.087463

# Convert the row names from Ensembl gene IDs to gene symbols
library(AnnoProbe)
library(tinyarray)
# Annotation table for the Ensembl IDs currently used as row names
an <- annoGene(
  rownames(exp),
  ID_type = "ENSEMBL"
)
# Re-key the matrix rows using the annotation table
exp <- trans_array(
  exp,
  ids = an,
  from = "ENSEMBL",
  to = "SYMBOL"
)
exp[1:4, 1:4]

##             22RV1_PROSTATE 2313287_STOMACH 253JBV_URINARY_TRACT
## DDX11L1          0.1634987       0.0000000           0.02856915
## WASH7P           4.5422580       4.1667154           3.79285535
## MIR1302-2HG      0.0000000       0.1505597           0.00000000
## FAM138A          0.0000000       0.0000000           0.95605665
##             253J_URINARY_TRACT
## DDX11L1              0.0000000
## WASH7P               3.5861642
## MIR1302-2HG          0.0000000
## FAM138A              0.5753123

2. 读取注释信息

# Load the tab-delimited cell-line annotation table, then give the 1st and 5th
# columns the short names ("id", "site") that the later steps refer to.
clinical <- read.delim("Cell_lines_annotations_20181226.txt")
names(clinical)[c(1, 5)] <- c("id", "site")

3. 将表达矩阵与细胞系注释信息（clinical）对应起来

# Sample IDs present in both the expression matrix columns and the annotation
a <- intersect(colnames(exp), clinical$id)
# drop = FALSE keeps `exp` a matrix even if only a single sample overlaps
exp <- exp[, a, drop = FALSE]
# Reorder/subset the annotation rows to follow the column order of `exp`
clinical <- clinical[match(a, clinical$id), ]
# Sanity check: should print TRUE when both objects are in the same order
identical(clinical$id, colnames(exp))

## [1] TRUE

4. 单基因表达量画图

library(dplyr)
library(tidyr)
library(ggplot2)
library(RColorBrewer)

# Gene to plot; other examples: "METTL3", "SETD2", "TP53"
g <- "METTL3"
# Fail early with a readable message instead of "subscript out of bounds"
if (!g %in% rownames(exp)) {
  stop("Gene '", g, "' is not present in the expression matrix", call. = FALSE)
}

# One row per cell line: expression of `g` plus the id and site columns
pdat <- cbind(gene = exp[g, ], clinical[, c("id", "site")])
# Remove cell lines whose site is missing
pdat <- drop_na(pdat, site)

# Median expression per site, highest first; used to order the x axis
su <- pdat %>%
  group_by(site) %>%
  summarise(a = median(gene)) %>%
  arrange(desc(a))
pdat$site <- factor(pdat$site, levels = su$site)

# Interpolate the Set1 palette to exactly as many colours as there are sites,
# rather than a hard-coded count that breaks when the number of sites changes.
mypalette <- colorRampPalette(brewer.pal(8, "Set1"))
ggplot(pdat, aes(x = site, y = gene, fill = site)) +
  geom_boxplot() +
  theme_bw() +
  theme(
    axis.text.x = element_text(vjust = 1, hjust = 1, angle = 70),
    legend.position = "bottom"
  ) +
  scale_fill_manual(values = mypalette(nlevels(pdat$site))) +
  guides(fill = guide_legend(nrow = 3, byrow = TRUE))