在下载,读取得到目标肿瘤组学数据的
SummarizedExperiment
对象时,我们就可以接下来的分析了。主要分为差异分析和生存分析两部分,分别对各自的结果可进行富集分析、基因互作网络分析等。
0、前期数据准备
- 胆管癌的转录组数据(hg19)
- 下载数据时,优先选择未进行表转化的raw counts数据
query <- GDCquery(project = "TCGA-CHOL",
legacy = TRUE,
data.category = "Gene expression",
data.type = "Gene expression quantification",
platform = "Illumina HiSeq",
file.type = "normalized_results")
TP = TCGAquery_SampleTypes(getResults(query)$sample.submitter_id,"TP")
NT = TCGAquery_SampleTypes(getResults(query)$sample.submitter_id,"NT")
query <- GDCquery(project = "TCGA-CHOL",
legacy = TRUE,
data.category = "Gene expression",
data.type = "Gene expression quantification",
platform = "Illumina HiSeq",
file.type = "results", #未标准化的原始数据
barcode = c(TP,NT))
dim(getResults(query))
GDCdownload(query)
data <- GDCprepare(query, save = T, save.filename = "CHOL_count.rda")
assays(data)
#List of length 2
#names(2): raw_count scaled_estimate
assay(data, "raw_count")[1:4,1:4]
# TCGA-3X-AAV9-01A-72R-A41I-07 TCGA-3X-AAVC-01A-21R-A41I-07
# A1BG|1 130.35 55.00
# A2M|2 44062.75 14915.96
# NAT1|9 133.00 97.00
# NAT2|10 16.00 0.00
# TCGA-ZH-A8Y4-01A-11R-A41I-07 TCGA-W5-AA2R-01A-11R-A41I-07
# A1BG|1 3065.17 124.59
# A2M|2 14693.94 15423.86
# NAT1|9 158.00 173.00
# NAT2|10 48.00 1.00
1、数据预处理
- 这一步主要完成样本间相关性检验(组间差异)、count文库标准化,以及过滤基因。
1.1 TCGAanalyze_Preprocessing
CorOutliers <- TCGAanalyze_Preprocessing(data,cor.cut = 0)
dim(CorOutliers) #因为设定的阈值是0,所以未进行过滤
#[1] 19947 45
如下图结果,两个组,各自组内的样本相似性都很高,符合预期。不进行样本的过滤。
data(raw count / normalized count)
- TCGAanalyze_Preprocessing 样本相关性(分组间差距)是否明显
- TCGAanalyze_Normalization :raw count标准化(未进行 log转换)
- TCGAanalyze_Filtering:过滤基因
1.2 TCGAanalyze_Normalization
-
TCGAanalyze_Normalization()
提供两种分别基于gcContent / geneLength的过滤手段,默认method = "geneLength"
dataNorm <- TCGAanalyze_Normalization(tabDF = data, geneInfo = geneInfo) #没有log处理
class(dataNorm) # matrix
#[1] "matrix" "array"
dataNorm[1:4, 1:4]
# TCGA-3X-AAV9-01A-72R-A41I-07 TCGA-3X-AAVC-01A-21R-A41I-07
# A1BG 130 55
# A2M 44063 14916
# NAT1 133 97
# NAT2 16 0
# TCGA-ZH-A8Y4-01A-11R-A41I-07 TCGA-W5-AA2R-01A-11R-A41I-07
# A1BG 3065 125
# A2M 14694 15424
# NAT1 158 173
# NAT2 48 1
assay(data, "raw_count")[1:4,1:4]
# TCGA-3X-AAV9-01A-72R-A41I-07 TCGA-3X-AAVC-01A-21R-A41I-07
# A1BG|1 130.35 55.00
# A2M|2 44062.75 14915.96
# NAT1|9 133.00 97.00
# NAT2|10 16.00 0.00
# TCGA-ZH-A8Y4-01A-11R-A41I-07 TCGA-W5-AA2R-01A-11R-A41I-07
# A1BG|1 3065.17 124.59
# A2M|2 14693.94 15423.86
# NAT1|9 158.00 173.00
# NAT2|10 48.00 1.00
1.2 TCGAanalyze_Filtering
-
TCGAanalyze_Filtering()
默认将低表达的25%的基因全部过滤掉
dataFilt <- TCGAanalyze_Filtering(tabDF = dataNorm,
method = "quantile",
qnt.cut = 0.25)
dim(dataFilt)
2、差异分析
2.1 数据检查
- 表达矩阵
dim(dataFilt)
#[1] 14893 45
dataFilt[1:4,1:4]
# TCGA-3X-AAV9-01A-72R-A41I-07 TCGA-3X-AAVC-01A-21R-A41I-07
# A1BG 130 55
# A2M 44063 14916
# NAT1 133 97
# NAT2 16 0
# TCGA-ZH-A8Y4-01A-11R-A41I-07 TCGA-W5-AA2R-01A-11R-A41I-07
# A1BG 3065 125
# A2M 14694 15424
# NAT1 158 173
# NAT2 48 1
- 样本分组信息
# selection of normal samples "NT"
samplesNT <- TCGAquery_SampleTypes(barcode = colnames(dataFilt),
typesample = c("NT"))
str(samplesNT)
#chr [1:9] "TCGA-W5-AA2R-11A-11R-A41I-07" ...
# selection of tumor samples "TP"
samplesTP <- TCGAquery_SampleTypes(barcode = colnames(dataFilt),
typesample = c("TP"))
str(samplesTP)
#chr [1:36] "TCGA-3X-AAV9-01A-72R-A41I-07" ...
2.2 差异分析
-
TCGAanalyze_DEA()
默认按照edgeR pipeine流程进行分析,也支持limma pipeline。 - mat1与mat2参数交代两组的表达矩阵,Cond1type与Cond2type参数交代两组的样本名(列名)
dataDEGs <- TCGAanalyze_DEA(mat1 = dataFilt[,samplesNT],
mat2 = dataFilt[,samplesTP],
Cond1type = "Normal",
Cond2type = "Tumor",
fdr.cut = 0.01 , #FDR过滤阈值
logFC.cut = 2, #logFC过滤阈值
method = "glmLRT")
dataDEGs = dataDEGs[order(dataDEGs$FDR),]
dim(dataDEGs)
#[1] 5628 5
head(dataDEGs)
# logFC logCPM LR PValue FDR
# USH2A -5.260544 2.701101 230.6313 4.341863e-52 6.466336e-48
# KCNN2 -4.343924 2.429990 183.5307 8.214329e-42 6.116800e-38
# LCAT -3.682702 5.460159 157.4718 4.036993e-36 2.004098e-32
# IQCE 4.230650 5.927193 150.5665 1.303603e-34 4.853640e-31
# DCXR -4.069958 7.676658 148.5685 3.563336e-34 1.061375e-30
# PRSS16 6.948546 4.804090 140.8095 1.770919e-32 3.559063e-29
-
dataDEGsFiltLevel()
可将上述的结果更将友好地展示
dataDEGsFiltLevel <- TCGAanalyze_LevelTab(dataDEGs,"Tumor","Normal",
dataFilt[,samplesTP],dataFilt[,samplesNT])
dataDEGsFiltLevel = dataDEGsFiltLevel[order(dataDEGsFiltLevel$FDR),]
dim(dataDEGsFiltLevel)
#[1] 5628 6
head(dataDEGsFiltLevel)
# mRNA logFC FDR Tumor Normal Delta
# USH2A USH2A -5.260544 6.466336e-48 28.69444 1207.88889 150.9484
# KCNN2 KCNN2 -4.343924 6.116800e-38 43.33333 893.77778 188.2367
# LCAT LCAT -3.682702 2.004098e-32 509.11111 6640.22222 1874.9046
# IQCE IQCE 4.230650 4.853640e-31 2946.19444 153.33333 12464.3167
# DCXR DCXR -4.069958 1.061375e-30 1819.86111 32284.22222 7406.7590
# PRSS16 PRSS16 6.948546 3.559063e-29 1319.05556 10.22222 9165.5183
可视化:火山图
TCGAVisualize_volcano()
TCGAVisualize_volcano(x = dataDEGs$logFC,
y = dataDEGs$FDR,
filename = "volcanoexp.png",
x.cut = 3,
y.cut = 10^-5,
names = rownames(dataDEGs),
color = c("black","red","darkgreen"),
names.size = 2,
xlab = " Gene expression fold change (Log2)",
legend = "State",
title = "Volcano plot",
width = 10)
# Warning message:
# ggrepel: 2268 unlabeled data points (too many overlaps). Consider increasing max.overlaps
2.3 富集分析
Genelist <- rownames(dataDEGsFiltLevel)
str(Genelist)
#chr [1:5628] "USH2A" "KCNN2" "LCAT" "IQCE" "DCXR" "PRSS16" "ZDHHC13" "C20orf117" ...
ansEA <- TCGAanalyze_EAcomplete(TFname="DEA genes Normal Vs Tumor",Genelist)
class(ansEA) #list
TCGAvisualize_EAbarplot(tf = rownames(ansEA$ResBP),
GOBPTab = ansEA$ResBP,
GOCCTab = ansEA$ResCC,
GOMFTab = ansEA$ResMF,
PathTab = ansEA$ResPat,
nRGTab = Genelist,
nBar = 10, text.size = 1.5)
2.3 生存分析
(1) TCGAanalyze_survival
- 首先需要下载病人的临床信息,必须包括这三列内容:"vital_status", "days_to_death", "days_to_last_follow_up",如没有,可仔细查看clinical meta,有可能是几个字母的差别,手动修改下即可。
#must = c("vital_status","days_to_death","days_to_last_follow_up")
#table(must %in% colnames(colData(data)))
# TRUE
# 3
#也可手动另行下载病人的临床信息(推荐)
dataClin <- GDCquery_clinic(project = "TCGA-CHOL","clinical")
- 选择临床信息中感兴趣的分组指标,进行生存分析
TCGAanalyze_survival(clin.chol,"gender",
main = "TCGA Set\n CHOL",
height = 7, width=10,
risk.table = F,
filename = "CHOL_gender_survival.pdf")
(2) TCGAanalyze_SurvivalKM()
- 可根据一个基因在病人(tumor)的表达量高低进行分组,然后进行基于基因高低表达的生存分析
- 如下代码:对最显著的100个表达差异基因做生存分析。在分别对每一个基因做单变量生存分析时,根据基因表达量,将病人划分为若干组。如下代码将67%以上高表达的样本作为高表达组;33%以下的低表达作为低表达组,根据这两组的生存信息,判断这个基因表达对于生存的影响。
dataDEGs = dataDEGs[order(dataDEGs$FDR),]
dim(dataDEGs)
group1 <- TCGAquery_SampleTypes(colnames(dataFilt), typesample = c("NT"))
group2 <- TCGAquery_SampleTypes(colnames(dataFilt), typesample = c("TP"))
dataSurv <- TCGAanalyze_SurvivalKM(clinical_patient = dataClin,
dataGE = dataFilt,
Genelist = rownames(dataDEGs)[1:100],
Survresult = FALSE,
ThreshTop = 0.67,
ThreshDown = 0.33,
p.cut = 0.5, group1, group2)
dim(dataSurv)
#[1] 32 7
head(dataSurv)
# pvalue Group2 Deaths Group2 Deaths with Top Group2 Deaths with Down Mean Group2 Top Mean Group2 Down Mean Group1
# IGSF3 0.05514067 13 5 8 6137.9167 1688.75000 75.111111
# YEATS2 0.08756778 10 2 8 2810.5833 1193.08333 215.222222
# C1orf170 0.09716126 10 4 6 202.0833 54.16667 4.222222
# EXO1 0.12244120 12 8 4 485.8333 103.75000 3.333333
# PSMC3IP 0.17962868 13 9 4 404.8333 120.58333 5.000000
# C20orf117 0.20058812 12 5 7 680.4167 273.58333 28.666667
- 如下代码,分批对所有基因都做一遍生存分析。值得注意的是每次最好不要超过100个基因,否则会报错。
tokenStop<- 1
tabSurvKMcomplete <- NULL
for( i in 1: round(nrow(dataFilt)/100)){
# i = 1
message( paste( i, "of ", round(nrow(dataNorm)/100)))
tokenStart <- tokenStop
tokenStop <-100*i
tabSurvKM<-TCGAanalyze_SurvivalKM(dataClin,
dataFilt,
Genelist = rownames(dataFilt)[tokenStart:tokenStop],
ThreshTop=0.67,
ThreshDown=0.33,
group1 = samplesNT, #control group
group2 = samplesTP) #disease group
# 3 groups (High, intermediate, low)
# performs SA between High and low groups
tabSurvKMcomplete <- rbind(tabSurvKMcomplete,tabSurvKM)
}
dim(tabSurvKMcomplete)
#[1] 467 7
(3) TCGAvisualize_SurvivalCoxNET()
-
TCGAvisualize_SurvivalCoxNET()
可结合上一步,将生存分析显著的基因里,存在互作关系的基因group标示出来。基因的互作关系主要来自STRING database. -
Genelist
主要根据上一步的生存分析结果,或者与差异基因相结合,但总数最好也不要超过100个。还有一个注意事项,临床信息的列名可能存在细节上的差异,需要我们核对,必要时做出调整。
It shows in the end a network build with community of genes with similar range of pvalues from Cox regression (same color) and that interaction among those genes is already validated in literatures using the STRING database (version 9.1)
library(dnet)
org.Hs.string <- dRDataLoader(RData = "org.Hs.string")
# "bcr_patient_barcode", "days_to_last_followup",
# "days_to_death", "vital_status",
# "age_at_initial_pathologic_diagnosis", "gender"
colnames(dataClin)[c(9,11)]
#[1] "days_to_last_follow_up" "age_at_diagnosis"
colnames(dataClin)[c(9,11)] = c("days_to_last_followup","age_at_initial_pathologic_diagnosis")
TabCoxNet <- TCGAvisualize_SurvivalCoxNET(dataClin,
dataFilt,
Genelist = rownames(tabSurvKMcomplete)[1:100],
scoreConfidence = 700,
org.Hs.string = org.Hs.string,
titlePlot = "CHOL dnet")
网友评论