美文网首页
2022-03-08 TCGA差异基因处理

2022-03-08 TCGA差异基因处理

作者: 千容安 | 来源:发表于2022-03-08 22:59 被阅读0次

从PRAD开始重新得到差异基因数目(n=2852),与铁死亡基因(n=255)的交集,得到19个基因做热图

library(ComplexHeatmap)
library(limma )
library(edgeR)
library(TCGAbiolinks)
library(EDASeq)
setwd("C:\\Users\\Administrator.DESKTOP-4UQ3Q0K\\Desktop\\国奖")
seq<-read.csv("mk_fm.csv")
setwd("C:\\Users\\Administrator.DESKTOP-4UQ3Q0K\\Desktop")

# 标准化
query <- GDCquery(project ="TCGA-PRAD",
                  data.category = "Transcriptome Profiling",
                  data.type ="Gene Expression Quantification" ,
                  workflow.type="HTSeq - Counts")
# 551个barcode                 
samplesDown <- getResults(query,cols=c("cases"))
# samplesDown中筛选出TP(主要实体瘤)样本的barcode
# 498个肿瘤样本barcode
dataSmTP <- TCGAquery_SampleTypes(barcode = samplesDown,
                                  typesample = "TP")
# 52个正常组织barcode
dataSmNT <- TCGAquery_SampleTypes(barcode = samplesDown,
                                  typesample = "NT")
dataSmTP_short <- dataSmTP[1:498]
dataSmNT_short <- dataSmNT[1:52]
# 根据前面的筛选,再次请求数据
queryDown <- GDCquery(project = "TCGA-PRAD", 
                      data.category = "Transcriptome Profiling",
                      data.type = "Gene Expression Quantification", 
                      workflow.type = "HTSeq - Counts", 
                      barcode = c(dataSmTP, dataSmNT))
#读取下载的数据并将其准备到R对象中,在工作目录生成brca_case1.rda文件,
#同时还产生Human_genes__GRCh38_p12_.rda文件
GDCdownload(query = queryDown)
dataPrep1 <- GDCprepare(query = queryDown, save = TRUE, save.filename = "prad_case1.rda")
#数据处理
# 去除dataPrep1中的异常值,dataPrep数据中含有肿瘤组织和正常组织的数据
dataPrep <- TCGAanalyze_Preprocessing(object = dataPrep1, 
                                      cor.cut = 0.6,
                                      datatype = "HTSeq - Counts")
#使用EDASeq进行标准化
dataNorm <- TCGAanalyze_Normalization(tabDF = dataPrep,
                                      geneInfo = geneInfoHT,
                                      method = "gcContent")
# 使用分位数来过滤基因
dataFilt <- TCGAanalyze_Filtering(tabDF = dataNorm, method = "quantile", qnt.cut =  0.25)
# 挑选正常样本:NT
samplesNT <- TCGAquery_SampleTypes(barcode = colnames(dataFilt), typesample = c("NT"))
# 挑选肿瘤样本:TP
samplesTP <- TCGAquery_SampleTypes(barcode = colnames(dataFilt), typesample = c("TP"))
# 差异表达分析
dataDEGs <- TCGAanalyze_DEA(
  mat1 = dataFilt[, samplesNT],
  mat2 = dataFilt[, samplesTP],
  Cond1type = "Normal",
  Cond2type = "Tumor",
  fdr.cut = 0.01 ,
  logFC.cut = 1,
  method = "glmLRT"
)
# 获取差异表达基因的表达水平
dataDEGsFiltLevel <- TCGAanalyze_LevelTab(
  dataDEGs, "Tumor", "Normal",
  dataFilt[, samplesTP], dataFilt[, samplesNT]
)
Ensembl_ID<- data.frame(Ensembl_ID=row.names(dataDEGsFiltLevel))
rownames(Ensembl_ID)<-Ensembl_ID[,1]
dataDEGsFiltLevel<-cbind(Ensembl_ID,dataDEGsFiltLevel)

get_map=function(input){
  if(is.character(input)){
    if(!file.exists(input)) stop("Bad input file.")
    message('Treat input as file')
    input=data.table::fread(input,header=FALSE)
  }else{
    data.table::setDT(input)
  }
  input=input[input[[3]]=='gene',]
  pattern_id=".*gene_id \"([^;]+)\";.*"
  pattern_name=".*gene_name \"([^;]+)\";.*"
  gene_id=sub(pattern_id,"\\1",input[[9]])
  gene_name=sub(pattern_name,"\\1",input[[9]])
  Ensembl_ID_TO_Genename<-data.frame(gene_id=gene_id,
                                     gene_name=gene_name,
                                     stringsAsFactors = FALSE)
  return(Ensembl_ID_TO_Genename)
}
Ensembl_ID_TO_Genename<-get_map("gencode.v39.annotation.gtf.gz")

gtf_Ensembl_ID<-substr(Ensembl_ID_TO_Genename[,1],1,15)
Ensembl_ID_TO_Genename<-data.frame(gtf_Ensembl_ID,Ensembl_ID_TO_Genename[,2])
colnames(Ensembl_ID_TO_Genename)<-c("Ensembl_ID","gene_id")

#融合数据
mergeRawCounts<-merge(Ensembl_ID_TO_Genename,dataDEGsFiltLevel,by="Ensembl_ID")
dataDEGsFiltLevel<-mergeRawCounts[,-c(1,3)]
#差异基因筛选
etSig <-dataDEGsFiltLevel[which(dataDEGsFiltLevel$FDR<0.05&abs(dataDEGsFiltLevel$logFC)>1),]
# 加入一列,up_down 体现上下调信息
etSig[which(etSig$logFC>0),"up_down"]<-"Up"
etSig[which(etSig$logFC<0),"up_down"]<-"Down" 
# 保存数据到本地
options(scipen = 200)
write.table(etSig,".\\基因差异表达水平.xls",sep ="\t",col.names=TRUE, row.names = FALSE, quote =FALSE, na="")




#绘制热图
library(ComplexHeatmap)
library('circlize')
library(dplyr)
library(ggplot2)
library(ggrepel)
library(pheatmap)
# 去除name列里面的引号
data= gsub('["]', '', data)
# 去除name列里面的逗号
data= gsub('[,]', '', data)
#data<-etSig[c(-2,-3,-6,-7,-8,-9,-10)]
data<-read.table("txt.txt",sep="\t")
data<-data[c(-2,-3,-6,-7,-8,-9,-10)]
rownames(data)<-data[,1]
colnames(data)<-data[1,]
data<-data[,-1]
data<-data[-1,]
data<-apply(data, c(1,2),as.numeric)

Group = factor("Tumor", "Normal")
anndf <- data.frame(Group)

由于只有一组数据,一开始热图颜色没有渐变,不太美观,尝试了多种颜色方案,选择了最后一种

htmap<- pheatmap(data, scale = "row",
                 border="white",
                 cellwidth = 18,
                 cellheight = 6,
                 color = mycol,
                 fontsize = 7,
                 fontsize_col = 6,
                 fontsize_row = 6,
                 treeheight_row=30,
                 treeheight_col=30,
                 annotation_col=anndf,
                 annotation_colors=anncol,
                 annotation_legend=F,
                 legend_breaks=c(-2.5,-1.5,0,1.5,2.5),
                 legend_labels=c("-2.5","-1.5","0","1.5","2.5"),
                 angle_col = 90)
order <- htmap$tree_row$order
genelist <- row.names(data)[order]
cluster <- htmap$tree_row
#绘制聚类树;
plot(cluster,hang = -1,cex=0.6,axes=FALSE,ann=FALSE)

df_scaled <- t(scale(t(data)))
range(df_scaled)
col_fun = colorRamp2(c(-0.8,0,0.7), c("greenyellow","white", "red"))
class = anno_block(gp = gpar(fill = c("#c77cff","#FF9999","#99CC00","#FF9900"),
                             col="white"),height = unit(5, "mm"),
                   labels = c("Tumor", "Normal"),
                   labels_gp = gpar(col = "white", fontsize = 8,fontface="bold"))
group= HeatmapAnnotation(group=class)
Heatmap(df_scaled,
        rect_gp = gpar(col="white"),
        row_names_gp = gpar(fontsize = 6),
        column_names_gp = gpar(fontsize = 8),
        col = col_fun,
        column_split = 4,
        row_split = 4,
        column_title = NULL,
        row_title = NULL,
        cluster_rows = TRUE,
        show_row_dend = TRUE,   #当基因数较多时可以隐藏行方向的聚类树 = FALSE;
        top_annotation =group,
        name = "Exp")

library(RColorBrewer)
pheatmap(data, color = colorRampPalette(brewer.pal(9, "PiYG"))(100),display_numbers = TRUE,number_color = "white")

也可为色块标上数值:

pheatmap(data, color = colorRampPalette(brewer.pal(9, "PiYG"))(100),display_numbers = TRUE,number_color = "white")

相关文章

网友评论

      本文标题:2022-03-08 TCGA差异基因处理

      本文链接:https://www.haomeiwen.com/subject/tmpzrrtx.html