转录组专题：DESeq2以raw count数据为起点做差异表达

作者: 挽山 | 来源:发表于2020-06-30 08:00 被阅读0次

转录组专题：DESeq2以raw count数据为起点做差异表达
转录组专题：edgeR以raw count数据为起点做差异表达分
count,FPKM,TPM,CPM
TCGA差异分析——limma, DEseq2, edgeR
数据分析：转录组差异分析（DESeq2+limma+edgeR+
2.2转录组 | 差异表达分析（DESeq,edgeR,limm
转录组表达定量- Read count？CPM? RPKM? F
DEseq2 PCA plot
DeSeq2包——学习
转录组数据标准化--Normalization

一、DESeq2 正常情况

参考：

https://www.jianshu.com/p/6d385b729b27

#原始数据为count表
source("https://bioconductor.org/biocLite.R")
#options(BioC_mirror="http://mirrors.ustc.edu.cn/bioc/")
#biocLite("DESeq2")

exprSet<-read.csv(file.choose(),header = T,row.names= 1, sep = ",") #file="12_gene_count_matrix.csv"
head(exprSet)
#header=T表示将文件中第一行设为列名字;row.names= 1表示第一列设为行名
#row.names(exprSet)<-exprSet[,1]
#exprSet<-exprSet[,-1]
#head(exprSet)

condition<-factor(c(rep("ASD",2),rep("Healthy",4),rep("ASD",1),rep("Healthy",2),rep("ASD",3)),levels = c("ASD","Healthy"))

#把第一列作为exprset的行名（基因），列名为样本号
colData<-data.frame(row.names = colnames(exprSet),condition)

library(DESeq2)
dds<-DESeqDataSetFromMatrix(countData=exprSet, colData=colData, design=~condition)
dds<-dds [rowSums(counts(dds)) > 1, ] #初步过滤
dds<-DESeq(dds)

# 标准化矩阵
rld<-rlogTransformation(dds)#<50小样本
exprSet_new=assay(rld)
write.csv(exprSet_new,"1_CC_normlization_matrix_re.csv")

vst<-varianceStabilizingTransformation(dds)#>=50大样本
exprSet_new=assay(vst)
write.csv(exprSet_new,"1_PFC_normlization_matrix_re.csv")

# 内置程序PCA
plotPCA(rld)

#查看差异表达结果 ----------------------------------------------------------------
res<-results(dds)#contrast=c("condition","ASD","Healthy")) 
summary(res)

pvalue<-subset(res,pvalue < 0.05)

write.csv(pvalue,"3-CC_diff_result_pvalue0.05.csv",row.names = T)

#差异基因注释（excel改列名）-------------------------------------------------
pvalue<-read.csv(file = file.choose(),header =T,sep = ',')
head(pvalue)
colnames(pvalue)[1]<-"ENSG"
#library(tidyr)
#x<-separate(DEG, col=ENSG1,into=c("ENSG","dot"),sep="\\.",remove = T);head(x)

ensg2pvalue<-merge(pvalue,ensembl2symbol,by.x="ENSG",by.y="Gene.stable.ID.version",all=F,sort=F)
head(ensg2pvalue);dim(ensg2pvalue)

#symbol2id
symbol2id<-read.table(file = file.choose(),header = T,sep = '\t')
head(symbol2id);colnames(symbol2id)<-c('gene_symbol','gene_id','gene_symbol2')
symbol2id<-symbol2id[,c(1,2)]

id_note<-merge(ensg2pvalue,symbol2id,by.x="Gene.name",by.y="gene_symbol",all=F,sort=F)
head(id_note);dim(id_note)

#判断分组
regulate<-ifelse(id_note$log2FoldChange>0,'up-regulation','down-regulation')
updown<- data.frame(id_note, regulate);head(updown)

#重新排序
resOrdered<-as.data.frame(updown[order(updown$regulate),]);head(resOrdered)
write.table(resOrdered,'4-CC_diff_pvalue0.05_note.txt',row.names = F,quote = F,sep = '\t')

#筛选差异基因（FC=2^log2FC）
dif_filter<-subset(resOrdered,pvalue < 0.05 & abs(log2FoldChange) > 0.26) #2~1, 1.5~0.58 1.2~0.26
dim(dif_filter);head(dif_filter)

write.csv(dif_filter,"4-CC_diff_0.05_1.2.csv",row.names = F)

#筛选常见基因类型
set_pc<-dif_filter[dif_filter$Gene.type=="protein_coding",][,c(1,4,9:11)]
head(set_pc);dim(set_pc)
set_lnc<-dif_filter[dif_filter$Gene.type=="lincRNA",][,c(1,9:12)]
head(set_lnc);dim(set_lnc)
set_mir<-dif_filter[dif_filter$Gene.type=="miRNA",][,c(1,9:12)]
head(set_mir);dim(set_mir)

write.table(set_pc,'5-CC_pc_0.05_1.5_522.txt',row.names = F,sep = '\t',quote = F)
#write.table(set_lnc,'5-CC_lnc_0.05_2_109.txt',row.names = F,sep = '\t',quote = F)
#write.table(set_mir,'5-CC_mir_0.05_2_12.txt',row.names = F,sep = '\t',quote = F)

full<-rbind(set_pc,set_lnc,set_mir)
head(full);tail(full)
write.table(full,'5-PFC_pc-lnc-mir_0.05_1.2.txt',row.names = F,col.names = T,quote = F,sep = '\t')

第二种情况：多个探针对应一个基因

付费参考：https://www.jianshu.com/p/845c9048325a
以下是YY师姐的方法：

#注释文件
#id2ensembl<-read.table(file.choose(),header=T, sep="\t") 
ensembl2symbol<-read.table(file.choose(),header=T, sep="\t") #用矩阵，biomart自动有标题
#symbol2id<-read.table(file.choose(),header=T, sep="\t") 
head(ensembl2symbol)

#表达矩阵注释
MAT<-read.csv(file.choose(),header=T, sep=",") #file=12_normlization_matrix_re_lxd.csv 此时基因列表存在重复
head(MAT)
colnames(MAT)[1]<-"ensembl"

#library(tidyr)
#y<-separate(MAT, col=ensembl,into=c("ENSG","dot"),sep="\\.",remove = T);head(y)

#ensembl2symbol
ensg2id_MAT<-merge(MAT,ensembl2symbol,by.x="ensembl",by.y="Gene.stable.ID.version",all=F,sort=F)
head(ensg2id_MAT)
dim(ensg2id_MAT)

exprSet_new<-ensg2id_MAT[,c(14,2:13)];head(exprSet_new);dim(exprSet_new)
#write.table(exprSet_new,"matrix_note_re.txt",row.names = F,quote = F,sep = "\t")

#但是如果真有重复我认为这步应当提前，先ID转换，count取唯一值然后再标准化做差异分析--------
#标准化矩阵中重复基因取均值
#exprSet_new<-read.table(file.choose(),header=T,sep="\t")

meanfun <- function(x) {
  x1 <- data.frame(unique(x[,1]))
  colnames(x1) <- c("Symbol")
  for (i in 2:ncol(x)){
    #i=2
    x2 <- data.frame(tapply(x[,i],x[,1],mean))# 第一列基因名为因子
    x2[,2] <- rownames(x2)#均值已经是第一列了，所以待匹配的行名（symbol）放第二列往后
    colnames(x2) <- c(colnames(x)[i], "Symbol")
    x1 <- merge(x1,x2,by.x = "Symbol",by.y = "Symbol",all=F,sort=F)
  }
  return(x1)
}

matrix<-meanfun(exprSet_new);dim(matrix)
write.table(matrix,"2-CC_normlization_matrix_note_si.txt",row.names = F,quote = F,sep = "\t")

第三种情况：批次效应

setwd('C:/2-results')

library(readr) 
datExpr<-read.csv('2-rawcount_95_miRBase.csv',header = T,row.names= 1, sep = ",") 
head(names(datExpr))
datTraits<-read_csv("2-datTraits_95.csv", col_names = T)
head(names(datTraits))

library(DESeq2) 
condition<-factor(datTraits$Diagnosis,levels = c("ASD","CTL")); condition
batch<-factor(datTraits$Library_preparation_batch,levels = c(1:12)); batch #注意改
colData<-data.frame(row.names = colnames(datExpr), condition, batch); colData

dds<-DESeqDataSetFromMatrix(countData=datExpr, colData=colData, design= ~batch+condition)

DES<-DESeq(dds)
vst<-varianceStabilizingTransformation(DES) # >=50大样本
plotPCA(vst)

#查看差异表达结果
res<-results(DES, contrast=c("condition","ASD","CTL")) #病例在前
summary(res)
pvalue<-subset(res,pvalue < 0.05)
write.csv(pvalue,"3-normdatExpr_batch_diff_result.csv",row.names = T)

# 提取标准化矩阵
exprSet_new=assay(vst)
write.csv(exprSet_new,"3-normdatExpr_batch_matrix.csv")