2022年第一愿:愿天下非模式物种都有数据库~
参考
https://cloud.tencent.com/developer/article/1426130
https://www.jianshu.com/p/e9b59353cd3a
事实上,做GSEA需要三个文件:txt(表达矩阵),cls(表型分组),gmt(物种数据库)。
1.物种数据库
这个是我师兄提供给我的,具体怎么生成的之后学了再记录
2.表达矩阵
刚开始用全部的信息生成了表达矩阵,结果有6.6G,太大了,所以用了周运来老师说的Pseudocell这一个概念,什么是Pseudoucell参见https://www.jianshu.com/p/038b4cc32883
直接上代码
#制作Pseudocell
if(F){
datadf <- as.matrix(seuo@assays$RNA@data )
idd1 <- seuo@meta.data
Inter.id1<-data.frame(rownames(idd1),idd1$celltype)
rownames(Inter.id1)<-rownames(idd1)
colnames(Inter.id1)<-c("CellID","Celltype")
Inter.id1<-as.data.frame(Inter.id1)
head(Inter.id1)
Inter1<-datadf[,Inter.id1$CellID]
Inter2<-as.matrix(Inter1)
Inter2[1:4,1:4]
Inter.id1$Celltype<- factor(Inter.id1$Celltype)
pseudocell.size = 10 ## 10 test
new_ids_list1 = list()
length(levels(Inter.id1$Celltype))
for (i in 1:length(levels(Inter.id1$Celltype))) {
cluster_id = levels(Inter.id1$Celltype)[i]
cluster_cells <- rownames(Inter.id1[Inter.id1$Celltype == cluster_id,])
cluster_size <- length(cluster_cells)
pseudo_ids <- floor(seq_along(cluster_cells)/pseudocell.size)
pseudo_ids <- paste0(cluster_id, "_Cell", pseudo_ids)
names(pseudo_ids) <- sample(cluster_cells)
new_ids_list1[[i]] <- pseudo_ids
}
new_ids <- unlist(new_ids_list1)
new_ids <- as.data.frame(new_ids)
head(new_ids)
new_ids_length <- table(new_ids)
new_ids_length
new_colnames <- rownames(new_ids)
gc()
colnames(datadf)
all.data<-datadf[,as.character(new_colnames)] ###add
all.data <- t(all.data)###add
new.data<-aggregate(list(all.data[,1:length(all.data[1,])]),
list(name=new_ids[,1]),FUN=mean)
rownames(new.data)<-new.data$name
new.data<-new.data[,-1]
new_ids_length<-as.matrix(new_ids_length)##
short<-which(new_ids_length<10)##
new_good_ids<-as.matrix(new_ids_length[-short,])##
result<-t(new.data)[,rownames(new_good_ids)]
dim(result)
}
expr_data <- as.matrix(result)
# 整理GSEA需要的表达谱格式
write.table(rbind(c('symbols',colnames(expr_data)),
cbind(rownames(expr_data),expr_data)),
file='expr.txt',quote=F,sep='\t',col.names=F,row.names=F)
3. 整理表型分组
name <- c(colnames(expr_data))
s1 <- sapply(strsplit(name, split='_', fixed=TRUE), function(x) (x[1])) #需要处理一下数据,之前在制作Pseudocell时引入了“_cell”
pheno<-as.character(s1)
con<-file('pheno.cls',open='w')
write(paste(length(pheno),'8 1'),con) #8代表具有8中亚群,1是固定值
write(paste('# ',"A",' ',"B",' ',"C",' ',"D",' ',"E",' ',"F",' ',"G",' ',"H",sep=''),con) #输入表型的名称,顺序要和表达谱里的一致
classes<-''
for (i in 1:length(pheno)){
classes<-paste(classes,pheno[i])
}
write(classes,con)
close(con)
网友评论