一、DESeq2 正常情况
参考:
#原始数据为count表
source("https://bioconductor.org/biocLite.R")
#options(BioC_mirror="http://mirrors.ustc.edu.cn/bioc/")
#biocLite("DESeq2")
exprSet<-read.csv(file.choose(),header = T,row.names= 1, sep = ",") #file="12_gene_count_matrix.csv"
head(exprSet)
#header=T表示将文件中第一行设为列名字;row.names= 1表示第一列设为行名
#row.names(exprSet)<-exprSet[,1]
#exprSet<-exprSet[,-1]
#head(exprSet)
condition<-factor(c(rep("ASD",2),rep("Healthy",4),rep("ASD",1),rep("Healthy",2),rep("ASD",3)),levels = c("ASD","Healthy"))
#把第一列作为exprset的行名(基因),列名为样本号
colData<-data.frame(row.names = colnames(exprSet),condition)
library(DESeq2)
dds<-DESeqDataSetFromMatrix(countData=exprSet, colData=colData, design=~condition)
dds<-dds [rowSums(counts(dds)) > 1, ] #初步过滤
dds<-DESeq(dds)
# 标准化矩阵
rld<-rlogTransformation(dds)#<50小样本
exprSet_new=assay(rld)
write.csv(exprSet_new,"1_CC_normlization_matrix_re.csv")
vst<-varianceStabilizingTransformation(dds)#>=50大样本
exprSet_new=assay(vst)
write.csv(exprSet_new,"1_PFC_normlization_matrix_re.csv")
# 内置程序PCA
plotPCA(rld)
#查看差异表达结果 ----------------------------------------------------------------
res<-results(dds)#contrast=c("condition","ASD","Healthy"))
summary(res)
pvalue<-subset(res,pvalue < 0.05)
write.csv(pvalue,"3-CC_diff_result_pvalue0.05.csv",row.names = T)
#差异基因注释(excel改列名)-------------------------------------------------
pvalue<-read.csv(file = file.choose(),header =T,sep = ',')
head(pvalue)
colnames(pvalue)[1]<-"ENSG"
#library(tidyr)
#x<-separate(DEG, col=ENSG1,into=c("ENSG","dot"),sep="\\.",remove = T);head(x)
ensg2pvalue<-merge(pvalue,ensembl2symbol,by.x="ENSG",by.y="Gene.stable.ID.version",all=F,sort=F)
head(ensg2pvalue);dim(ensg2pvalue)
#symbol2id
symbol2id<-read.table(file = file.choose(),header = T,sep = '\t')
head(symbol2id);colnames(symbol2id)<-c('gene_symbol','gene_id','gene_symbol2')
symbol2id<-symbol2id[,c(1,2)]
id_note<-merge(ensg2pvalue,symbol2id,by.x="Gene.name",by.y="gene_symbol",all=F,sort=F)
head(id_note);dim(id_note)
#判断分组
regulate<-ifelse(id_note$log2FoldChange>0,'up-regulation','down-regulation')
updown<- data.frame(id_note, regulate);head(updown)
#重新排序
resOrdered<-as.data.frame(updown[order(updown$regulate),]);head(resOrdered)
write.table(resOrdered,'4-CC_diff_pvalue0.05_note.txt',row.names = F,quote = F,sep = '\t')
#筛选差异基因(FC=2^log2FC)
dif_filter<-subset(resOrdered,pvalue < 0.05 & abs(log2FoldChange) > 0.26) #2~1, 1.5~0.58 1.2~0.26
dim(dif_filter);head(dif_filter)
write.csv(dif_filter,"4-CC_diff_0.05_1.2.csv",row.names = F)
#筛选常见基因类型
set_pc<-dif_filter[dif_filter$Gene.type=="protein_coding",][,c(1,4,9:11)]
head(set_pc);dim(set_pc)
set_lnc<-dif_filter[dif_filter$Gene.type=="lincRNA",][,c(1,9:12)]
head(set_lnc);dim(set_lnc)
set_mir<-dif_filter[dif_filter$Gene.type=="miRNA",][,c(1,9:12)]
head(set_mir);dim(set_mir)
write.table(set_pc,'5-CC_pc_0.05_1.5_522.txt',row.names = F,sep = '\t',quote = F)
#write.table(set_lnc,'5-CC_lnc_0.05_2_109.txt',row.names = F,sep = '\t',quote = F)
#write.table(set_mir,'5-CC_mir_0.05_2_12.txt',row.names = F,sep = '\t',quote = F)
full<-rbind(set_pc,set_lnc,set_mir)
head(full);tail(full)
write.table(full,'5-PFC_pc-lnc-mir_0.05_1.2.txt',row.names = F,col.names = T,quote = F,sep = '\t')
第二种情况:多个探针对应一个基因
- 付费参考:https://www.jianshu.com/p/845c9048325a
- 以下是YY师姐的方法:
#注释文件
#id2ensembl<-read.table(file.choose(),header=T, sep="\t")
ensembl2symbol<-read.table(file.choose(),header=T, sep="\t") #用矩阵,biomart自动有标题
#symbol2id<-read.table(file.choose(),header=T, sep="\t")
head(ensembl2symbol)
#表达矩阵注释
MAT<-read.csv(file.choose(),header=T, sep=",") #file=12_normlization_matrix_re_lxd.csv 此时基因列表存在重复
head(MAT)
colnames(MAT)[1]<-"ensembl"
#library(tidyr)
#y<-separate(MAT, col=ensembl,into=c("ENSG","dot"),sep="\\.",remove = T);head(y)
#ensembl2symbol
ensg2id_MAT<-merge(MAT,ensembl2symbol,by.x="ensembl",by.y="Gene.stable.ID.version",all=F,sort=F)
head(ensg2id_MAT)
dim(ensg2id_MAT)
exprSet_new<-ensg2id_MAT[,c(14,2:13)];head(exprSet_new);dim(exprSet_new)
#write.table(exprSet_new,"matrix_note_re.txt",row.names = F,quote = F,sep = "\t")
#但是如果真有重复我认为这步应当提前,先ID转换,count取唯一值然后再标准化做差异分析--------
#标准化矩阵中重复基因取均值
#exprSet_new<-read.table(file.choose(),header=T,sep="\t")
meanfun <- function(x) {
x1 <- data.frame(unique(x[,1]))
colnames(x1) <- c("Symbol")
for (i in 2:ncol(x)){
#i=2
x2 <- data.frame(tapply(x[,i],x[,1],mean))# 第一列基因名为因子
x2[,2] <- rownames(x2)#均值已经是第一列了,所以待匹配的行名(symbol)放第二列往后
colnames(x2) <- c(colnames(x)[i], "Symbol")
x1 <- merge(x1,x2,by.x = "Symbol",by.y = "Symbol",all=F,sort=F)
}
return(x1)
}
matrix<-meanfun(exprSet_new);dim(matrix)
write.table(matrix,"2-CC_normlization_matrix_note_si.txt",row.names = F,quote = F,sep = "\t")
第三种情况:批次效应
setwd('C:/2-results')
library(readr)
datExpr<-read.csv('2-rawcount_95_miRBase.csv',header = T,row.names= 1, sep = ",")
head(names(datExpr))
datTraits<-read_csv("2-datTraits_95.csv", col_names = T)
head(names(datTraits))
library(DESeq2)
condition<-factor(datTraits$Diagnosis,levels = c("ASD","CTL")); condition
batch<-factor(datTraits$Library_preparation_batch,levels = c(1:12)); batch #注意改
colData<-data.frame(row.names = colnames(datExpr), condition, batch); colData
dds<-DESeqDataSetFromMatrix(countData=datExpr, colData=colData, design= ~batch+condition)
DES<-DESeq(dds)
vst<-varianceStabilizingTransformation(DES) # >=50大样本
plotPCA(vst)
#查看差异表达结果
res<-results(DES, contrast=c("condition","ASD","CTL")) #病例在前
summary(res)
pvalue<-subset(res,pvalue < 0.05)
write.csv(pvalue,"3-normdatExpr_batch_diff_result.csv",row.names = T)
# 提取标准化矩阵
exprSet_new=assay(vst)
write.csv(exprSet_new,"3-normdatExpr_batch_matrix.csv")
网友评论