#一、预处理,简单的数据过滤
library(Seurat)
library(tidyverse)
epi <- read.table("E:/ty_data/GSE136447_555sample_gene_count_matrix.txt", sep = "\t",header = TRUE, row.names = 1)
dim(epi)
epi.seu <- CreateSeuratObject(counts = epi, project ="epi")
epi.seu
epi.seu <- NormalizeData(epi.seu)#数据标准化
epi.seu <- FindVariableFeatures(epi.seu,selection.method ="vst", nfeatures = 2000)#提取富含信息最多的基因参与后续的降维,默认2000
epi.seu <- ScaleData(epi.seu)#对数据进行了行缩放,基因重要性一致,零上下浮动,基因标准一致
#默认是对2000做处理,ScaleData(epi.seu,features = rownames(epi.seu)) 对所有基因都进行了缩放
epi.seu #2000 variable features
epi.seu[["RNA"]]@counts#原始矩阵
epi.seu[["RNA"]]@data#Normalize以后的矩阵
epi.seu[["RNA"]]@scale.data#对data矩阵做缩放以后的矩阵
dim(epi.seu[["RNA"]]@scale.data)
dim(epi.seu[["RNA"]]@counts)
dim(epi.seu[["RNA"]]@data)
head(epi.seu@meta.data)#查看属性信息 orig.ident nCount_RNA:UMI count nFeature_RNA:细胞里有多少基因
epi.seu[["percent.mt"]] <- PercentageFeatureSet(epi.seu, pattern = "^MT-")#线粒体基因表达占比,PercentageFeatureSet:提取符合某种格式的基因,计算他们的占比
#epi.seu@meta.data$percent.mt=
head(epi.seu@meta.data)
VlnPlot(epi.seu,features = c("nCount_RNA","nFeature_RNA","percent.mt"),pt.size = 0)
#以上四个指标可以用来辅助判断,group.by = 可以用来分组 默认orig.ident,确定阈值之后就可以过滤
epi.seu = subset(epi.seu,subset = nFeature_RNA > 8000 & nFeature_RNA < 16000)#初步过滤,继续过滤可以结合其他软件
#二、利用PCA进行数据降维
epi.seu <- RunPCA(epi.seu, npcs = 50, verbose = FALSE)#仅保留了前50维
#进行聚类,50取了前30进行了聚类
epi.seu <- FindNeighbors(epi.seu, dims = 1:30)
epi.seu <- FindClusters(epi.seu, resolution = 0.5)#resolution对后续结果精细程度的调节,数值越高越精细
#降维,进一步降维到二维平面
epi.seu <- RunUMAP(epi.seu, dims = 1:30)
epi.seu <- RunTSNE(epi.seu, dims = 1:30)
#图形展示
DimPlot(epi.seu,reduction = "umap",pt.size = 1,label = T,repel = T)
head(epi.seu@meta.data)
#RNA_snn_res.0.5 与revolution相关 seurat_clusters与最近一次的revolution操作结果一致
epi.seu#3 dimensional reductions calculated:多了三个降维之后的结果
#三、数据保存
saveRDS(epi.seu,file = "test.epi.seu.rds")
网友评论