课后补充----关于单细胞空间基础分析的代码部分

作者: 单细胞空间交响乐 | 来源:发表于2024-08-01 10:52 被阅读0次

Seurat的单细胞免疫组库分析来了！
10X单细胞（10X空间转录组）轨迹分析（拟时分析）VECTOR
空间组数据和单细胞数据的相关性分析（Seurat）2022-05
单细胞与空间联合分析-CARD简介-20220721
java中引用传递、值传递的理解（数组，自定义类，基本数据类型
10X单细胞、VDJ、空间转录组助力研究转移性黑色素瘤的免疫图谱
PySCENIC（一）：python版单细胞转录组转录因子分析
空间转录组聚类分析与生态位
10X单细胞-10X空间转录组联合分析之七----CellDAR
10X单细胞空间联合分析揭示了受不同基质细胞群调节的follic

作者，Evil Genius

我们本次2024年度的系列课程基础分析是没有放进来的，但是不是说明这部分不重要，相反，这部分是一切个性化分析的基础，我们来分享一下基础分析的代码部分

首先是scRNA/snRNA的基础分析，包括常见的基础质控 + 排污 + 双细胞 + 降维聚类差异 + 生成h5ad文件,当然了，代码是官网 + 经验 + 文献搜集而来。封装类的代码一般是公司在用，大家又能力可以自己封装一下，下面是代码示例，注意，是示例，需要大家根据自己需求稍作修改，有些地方需要注意。

library (Seurat)
library (dplyr)
library (ggplot2)
library (harmony)
library (patchwork)
library (SoupX)
library (DoubletFinder)
library(reticulate)
library(sceasy)
library(anndata)
library(patchwork)
library(SeuratDisk)

#================================Ambient RNA Correction===============================#
Control1 = load10X('path/to/your/cellranger/outs/folder')
Control1 = autoEstCont(Control1)
Control1 = adjustCounts(Control1)
#Do it for all samples
#================================Doublet Removal===============================#
Control1.S <- CreateSeuratObject(counts = Control1, project = "Control1", min.cells = 3, min.features = 300) # create Seurat object
Control1.S <- NormalizeData(Control1.S)
Control1.S <- FindVariableFeatures(Control1.S, selection.method = "vst", nfeatures = 2000)
Control1.S <- ScaleData(Control1.S)
Control1.S <- RunPCA(Control1.S)
Control1.S <- RunUMAP(Control1.S, dims = 1:50)
sweep.res.list_kidney <- paramSweep_v3(Control1.S, PCs = 1:30, sct = FALSE)
sweep.stats_kidney <- summarizeSweep(sweep.res.list_kidney, GT = FALSE)
bcmvn_kidney <- find.pK(sweep.stats_kidney) #determine the pk
nExp_poi <- round(0.075*nrow(Control1.S@meta.data)) 
Control1.S <- doubletFinder_v3(seu_kidney, PCs = 1:30, pN = 0.25, pK = #depends on the previous sterp, nExp = nExp_poi, reuse.pANN = FALSE, sct = FALSE)
Control1.S.Filtered <- subset(Control1.S, subset = #Doublet_related_column == "singlet")                                 
#Do it for all samples
#============================Merge Datssets======================================================#
HK_SC_RNA = merge (Control1.S.Filtered, y = c (#All other sc filtered objects))

#============================Add percent.mt to dataset======================================================#
HK_SC_RNA[["percent.mt"]] <- PercentageFeatureSet(HK_SC_RNA, pattern = "^MT-")

#============================Pre Processing======================================================#
HK_SC_RNA <- subset(HK_SC_RNA, subset = nFeature_RNA > 200 & nFeature_RNA < 3000 & percent.mt < 50 & percent.rpl < 15 & percent.rps < 15)

HK_SC_RNA <- FindVariableFeatures(object = HK_SC_RNA,selection.method = "vst")
HK_SC_RNA <- NormalizeData(HK_SC_RNA)
HK_SC_RNA <- ScaleData(HK_SC_RNA, vars.to.regress = c ("nCount_RNA", "percent.mt", "percent.rpl", "percent.rps"))
HK_SC_RNA <- RunPCA(HK_SC_RNA, assay = 'RNA', npcs = 30, features = VariableFeatures(object = HK_SC_RNA), 
                              verbose = TRUE, ndims.print = 1:5, nfeatures.print = 10)

#============= Run Harmony =============
HK_SC_RNA <- RunHarmony(HK_SC_RNA, group.by.vars = "orig.ident")
HK_SC_RNA <- RunUMAP(HK_SC_RNA, reduction = "harmony", dims = 1:30)
HK_SC_RNA <- FindNeighbors(HK_SC_RNA, reduction = "harmony", dims = 1:30) %>% FindClusters()                   
DimPlot(HK_SC_RNA, reduction = "umap", label = TRUE, pt.size = 0.5) + NoLegend()

#================================== DEG List and cell type annotation==============================#
HK_SC_RNA.Markers <- FindAllMarkers(HK_SC_RNA, only.pos = TRUE, logfc.threshold = 0.25)

#================================== Annotation and store data as Idents==============================#
HK_SC_RNA <- RenameIdents(HK_SC_RNA, `0` = "PT")

#==================================Convert to adata using sceasy ==============================#
HK_SC_RNA_anndata <- convertFormat(HK_SC_RNA, from="seurat", to="anndata", main_layer="counts", drop_single_values=FALSE)
write_h5ad(HK_SC_RNA_anndata, filename = "HK_SC_RNA_anndata.h5ad")

ATAC部分的基础分析

library (Signac)
library(Seurat)
library (dplyr)
library(ggplot2)
library (harmony)
library (patchwork)
library (EnsDb.Hsapiens.v86)
library(rtracklayer)
library(reticulate)
library(sceasy)
library(anndata)
library(patchwork)
library(SeuratDisk)
#--------------------------Prepare Object---------------------------------#
annotation <- GetGRangesFromEnsDb(ensdb = EnsDb.Hsapiens.v86)
genome(annotation) <- "hg38"
seqlevelsStyle(annotation) <- "UCSC"
counts <- Read10X_h5(filename = "/Your_Path/filtered_peak_bc_matrix.h5")
metadata <- read.csv(file = "/Your_Path/singlecell.csv", header = TRUE,row.names = 1)
fragpath <- "/Your_Path/outs/fragments.tsv.gz"
chromatin.assay <- CreateChromatinAssay(counts = counts, sep = c(":", "-"),fragments = fragpath,annotation = annotation)
Control1.S <- CreateSeuratObject(counts = chromatin.assay,assay = "peaks",meta.data = metadata)

#Annotation
hg38 <- import("/Your_Path_To_GTF_File/genes.gtf.gz")
genome(hg38) <- "hg38"
seqlevelsStyle(hg38) <- "UCSC"
hg38$gene_biotype <- hg38$gene_type
Annotation(Control1.S) <- hg38
#QC Filtering
Control1.S <- NucleosomeSignal(object = Control1.S)
Control1.S <- TSSEnrichment(Control1.S, fast = FALSE)
Control1.S$pct_reads_in_peaks <- Control1.S$peak_region_fragments / Control1.S$passed_filters * 100
Control1.S$blacklist_ratio <- Control1.S$blacklist_region_fragments / Control1.S$peak_region_fragments
Control1.S$high.tss <- ifelse(Control1.S$TSS.enrichment > 2, 'High', 'Low')
pdf("Viloin.HK.Fetal.Mito.pdf", width=6, height=4)
TSSPlot(Control1.S, group.by = 'high.tss') + NoLegend()
Control1.S$nucleosome_group <- ifelse(Control1.S$nucleosome_signal > 4, 'NS > 4', 'NS < 4')
FragmentHistogram(object = Control1.S, group.by = 'nucleosome_group')
pdf("HK2481_1.ATAC.QC.pdf", width=18, height=4)
VlnPlot(object = Control1.S, features = c('pct_reads_in_peaks', 'peak_region_fragments', 'TSS.enrichment', 'blacklist_ratio', 'nucleosome_signal'), pt.size = 0.1, ncol = 5)
Control1.S <- subset(x = Control1.S, subset = peak_region_fragments > 3000 & peak_region_fragments < 20000 & pct_reads_in_peaks > 15 & blacklist_ratio < 0.05 & nucleosome_signal < 4 & TSS.enrichment > 2)
#Do this step for all 20 samples
#--------------------------Merge Datasets---------------------------------#
HK.SN.ATAC = merge (x = Control1.S, y = list(#list of all other objects))
#--------------------------Pre-processing---------------------------------#  
HK.SN.ATAC <- RunTFIDF(HK.SN.ATAC)
HK.SN.ATAC <- FindTopFeatures(HK.SN.ATAC, min.cutoff = 'q0')
HK.SN.ATAC <- RunSVD(HK.SN.ATAC)
HK.SN.ATAC <- RunHarmony(
  object = HK.SN.ATAC,
  group.by.vars = 'dataset',
  reduction = 'lsi',
  assay.use = 'peaks',
  project.dim = FALSE
)
HK.SN.ATAC <- RunUMAP(HK.SN.ATAC, dims = 2:50, reduction = 'harmony')
HK.SN.ATAC <- RunUMAP(HK.SN.ATAC, reduction = "harmony", dims = 2:50)
HK.SN.ATAC <- FindNeighbors(HK.SN.ATAC, reduction = "harmony", dims = 2:50) %>% FindClusters()
#--------------------------DAR---------------------------------#
DARs <- FindAllMarkers(object = HK.SN.ATAC, min.pct = 0.05, test.use = 'LR', latent.vars = 'peak_region_fragments')

HK_SN_ATAC_anndata <- convertFormat(HK_SN_ATAC_Filtered, from="seurat", to="anndata", main_layer="counts", drop_single_values=FALSE)
write_h5ad(HK_SN_ATAC_anndata, filename = "HK_SN_ATAC_anndata.h5ad")

空间基础分析部分，包括NMF寻找微环境的部分

library (dplyr)
library (ggplot2)
library(SeuratData)
library (harmony)
library (patchwork)
library (Seurat)
options(stringsAsFactors = F)
library(CellTrek)
library(Seurat)
library(viridis)
library(ConsensusClusterPlus)
library(SpotClean)
library(S4Vectors)
library (STUtility)
set.seed(123)
# Load 10x Visium data
Control1.ST_raw <- read10xRaw("/path/to/matrix/folder")
Control1.ST_slide_info <- read10xSlide("/path/to/tissue/csv", 
                                  "/path/to/tissue/image", 
                                  "/path/to/scale/factor")

# Visualize raw data
Control1.ST_obj <- createSlide(count_mat = Control1.ST_raw, 
                          slide_info = Control1.ST_slide_info)
visualizeSlide(slide_obj = Control1.ST_obj)
visualizeHeatmap(Control1.ST_obj,rownames(Control1.ST_raw)[1])

# Decontaminate raw data
Control1.ST.decont_obj <- spotclean(Control1.ST_obj)

# Repeat this step for all 14 ST samples

#-----------------------------Preprocessing------------------------------#
Control1.ST <- SCTransform(Control1.ST_obj, assay = "Spatial", verbose = FALSE)
#Do it for all samples
#-----------------------------Merge All SCT spatial samples------------------------------#
HK.ST = merge (Control1.ST, y =c (#All the ST objects))
  
#-----------------------------Merge All SCT spatial samples------------------------------# 
HK.ST <- RunPCA(HK.ST, assay = "SCT", verbose = FALSE)
HK.ST <- RunHarmony(HK.ST, group.by.vars = "orig.ident")
HK.ST <- RunUMAP(HK.ST, reduction = "harmony", dims = 1:20)
HK.ST <- FindNeighbors(HK.ST, reduction = "harmony", dims = 1:20) %>% FindClusters()

#-----------------------------Find DEGs between Clusters------------------------------# 
HK.ST <- FindAllMarkers(HK.ST, only.pos = TRUE, min.pct = 0.25, logfc.threshold = 0.25)

#-----------------------------Run NMF for microenvironments------------------------------# 
RunNMF(
 HK.ST,
 features = NULL,
  nfactors = 20,
  rescale = TRUE,
  reduction.name = "NMF")
HK.ST <- RunHarmony(HK.ST, group.by.vars = "orig.ident", reduction = "NMF")  
HK.ST <- RunUMAP(HK.ST, reduction = "harmony", dims = 1:20)
HK.ST <- FindNeighbors(HK.ST, reduction = "harmony", dims = 1:20) %>% FindClusters()
#-----------------------------Find DEGs between MEs------------------------------# 
HK.ST <- FindAllMarkers(HK.ST, only.pos = TRUE, min.pct = 0.25, logfc.threshold = 0.25)

#-----------------------------------Mapping Back snRNA-seq, scRNA-seq, and snATAC-seq to Spatial dataset----------------------#
HK.SC.SN.ATAC <- HK.SC.SN.ATAC[, sample(colnames(HK.SN), size =20000, replace=F)] #Downsample the annotated snRNA-seq matrix
Control1.ST = Load10X_Spatial("/Your_10X-Folder/outs", filename = "filtered_feature_bc_matrix.h5", assay = "Spatial", slice = "slice1")
Control1.ST <- RenameCells(Control1.ST, new.names=make.names(Cells(Control1.ST)))
HK.SC.SN.ATAC <- RenameCells(HK.SC.SN.ATAC, new.names=make.names(Cells(HK.SC.SN.ATAC)))
Coltrol1.ST.Samples_traint <- CellTrek::traint(st_data=Control1.ST, sc_data=HK.SC.SN.ATAC, sc_assay='RNA', cell_names='seurat_clusters')
Control1.ST.HK.SN_celltrek <- CellTrek::celltrek(st_sc_int=Coltrol1.ST.Samples_traint, int_assay='traint', sc_data=HK.SC.SN.ATAC, sc_assay = 'RNA', reduction='pca', intp=T, intp_pnt=5000, intp_lin=F, nPCs=30, ntree=1000, dist_thresh=0.55, top_spot=5, spot_n=5, repel_r=20, repel_iter=20, keep_model=T)$celltrek
Control1.ST.HK.SN_celltrek$seurat_clusters <- factor(Control1.ST.HK.SN_celltrek$seurat_clusters, levels=sort(Control1.ST.HK.SN_celltrek$seurat_clusters)))
Idents = Control1.ST.HK.SN_celltrek[["Idents2"]]
Idents (Control1.ST.HK.SN_celltrek) = Idents

基础的解卷积部分，cell2location + 共定位

import scanpy as sc
import anndata
import pandas as pd
import numpy as np
import os
import gc

import cell2location

import matplotlib as mpl
from matplotlib import rcParams
import matplotlib.pyplot as plt
import seaborn as sns

# silence scanpy that prints a lot of warnings
import warnings
warnings.filterwarnings('ignore')


from google.colab import drive
drive.mount('/content/drive')


import cell2location

results_folder = '/content/drive/MyDrive/Spatial/HK2529_ST'
ref_run_name = f'{results_folder}/reference_signatures'
run_name = f'{results_folder}/cell2location_map'


adata = sc.read_visium(path ='/content/drive/MyDrive/HK2529_ST/', count_file= "filtered_feature_bc_matrix.h5", load_images=True)

adata.var_names_make_unique()
adata.var["mt"] = adata.var_names.str.startswith("MT-")
sc.pp.calculate_qc_metrics(adata, qc_vars=["mt"], inplace=True)


sns.distplot(adata.obs["total_counts"], kde=False)


#sc.pp.filter_cells(adata, min_counts=5000)
#sc.pp.filter_cells(adata, max_counts=35000) 
#adata = adata[adata.obs["pct_counts_mt"] < 50]
print(f"#cells after MT filter: {adata.n_obs}")
sc.pp.filter_genes(adata, min_cells=10)


sc.pp.pca(adata)
sc.pp.neighbors(adata)
sc.tl.umap(adata)
sc.tl.leiden(adata, key_added="clusters")


sc.pp.pca(adata)
sc.pp.neighbors(adata)
sc.tl.umap(adata)
sc.tl.leiden(adata, key_added="clusters")


sc.pl.spatial(adata,color=["COL1A1", "NPHS1"], img_key=None, size=1,vmin=0, cmap='magma', vmax='p99.0', gene_symbols='SYMBOL')


adata_ref = sc.read ("/content/drive/MyDrive/adata_ref_withmodel_4.3.2023.h5ad")

# export estimated expression in each cluster
if 'means_per_cluster_mu_fg' in adata_ref.varm.keys():
    inf_aver = adata_ref.varm['means_per_cluster_mu_fg'][[f'means_per_cluster_mu_fg_{i}' 
                                    for i in adata_ref.uns['mod']['factor_names']]].copy()
else:
    inf_aver = adata_ref.var[[f'means_per_cluster_mu_fg_{i}' 
                                    for i in adata_ref.uns['mod']['factor_names']]].copy()
inf_aver.columns = adata_ref.uns['mod']['factor_names']
inf_aver.iloc[0:5, 0:5]


intersect = np.intersect1d(adata.var_names, inf_aver.index)
adata = adata[:, intersect].copy()
inf_aver = inf_aver.loc[intersect, :].copy()

# prepare anndata for cell2location model
cell2location.models.Cell2location.setup_anndata(adata=adata)
# can add additional slides to adata and add in a batch_key


mod = cell2location.models.Cell2location(
    adata, cell_state_df=inf_aver, 
    # the expected average cell abundance: tissue-dependent 
    # hyper-prior which can be estimated from paired histology:
    N_cells_per_location=30,
    # hyperparameter controlling normalisation of
    # within-experiment variation in RNA detection:
    detection_alpha=20
) 
mod.view_anndata_setup()


mod.train(max_epochs=30000, 
          # train using full data (batch_size=None)
          batch_size=None, 
          # use all data points in training because 
          # we need to estimate cell abundance at all locations
          train_size=1,
          use_gpu=True)

# plot ELBO loss history during training, removing first 100 epochs from the plot
mod.plot_history(1000)
plt.legend(labels=['full data training']);



# In this section, we export the estimated cell abundance (summary of the posterior distribution).
adata = mod.export_posterior(
    adata, sample_kwargs={'num_samples': 1000, 'batch_size': mod.adata.n_obs, 'use_gpu': True}
)

adata.obs[adata.uns['mod']['factor_names']] = adata.obsm['q05_cell_abundance_w_sf']


# Save model
mod.save(f"{run_name}", overwrite=True)

# mod = cell2location.models.Cell2location.load(f"{run_name}", adata)

# Save anndata object with results
adata_file = f"{run_name}/sp.h5ad"
adata.write(adata_file)
adata_file



# select one slide
from cell2location.utils import select_slide
sc.pl.spatial(adata, cmap='magma',
                  color=['Podo',"Endo_G","PEC", "Mes", "iPT", "GS_Stromal", "PT_S1", "PT_S2", "PT_S3", "Fibroblast_1", "Fibroblast_2", "DCT1" , "M_TAL", "DTL",
                         "CD4T", "CD8T", "Mac", "Dedifferentiated_Tubule"], 
                  ncols=4, size=1.3, 
                  img_key='hires',
                  # limit color scale at 99.2% quantile of cell abundance
                  vmin=0, vmax='p99.2' 
                 )
                 
                 
                 sc.pp.neighbors(adata, use_rep='q05_cell_abundance_w_sf',
                n_neighbors = 15)
sc.tl.leiden(adata, resolution=1.1)
adata.obs["region_cluster"] = adata.obs["leiden"].astype("category")
sc.tl.umap(adata, min_dist = 0.3, spread = 1)


sc.pl.spatial(adata, color=['region_cluster'],
                  size=1.3, img_key='hires', alpha=0.5)
                  
                  
                  # Set the value of the "sample" column for all rows to "HK3513_ST"
adata.obs['sample'] = "HK2529_ST"

# Print the updated "obs" attribute
print(adata.obs)


# Now we use cell2location plotter that allows showing multiple cell types in one panel
from cell2location.plt import plot_spatial

# select up to 6 clusters
clust_labels = [ 'Podo', 'PEC', "GS_Stromal", "Fibroblast_1"]
clust_col = ['' + str(i) for i in clust_labels] # in case column names differ from labels

slide = select_slide(adata, 'HK2529_ST')

with mpl.rc_context({'figure.figsize': (15, 15)}):
    fig = plot_spatial(
        adata=slide,
        # labels to show on a plot
        color=clust_col, labels=clust_labels,
        show_img=True,
        # 'fast' (white background) or 'dark_background'
        style='fast',
        # limit color scale at 99.2% quantile of cell abundance
        max_color_quantile=0.992,
        # size of locations (adjust depending on figure size)
        circle_diameter=12,
        colorbar_position='right'
    )
    
    from cell2location import run_colocation
res_dict, adata = run_colocation(
    adata,
    model_name='CoLocatedGroupsSklearnNMF',
    train_args={'n_fact': np.arange(5, 25), 'n_restarts': 3 })

生活很好，有你更好

Seurat的单细胞免疫组库分析来了！
使用Seurat进行单细胞VDJ免疫分析 NGS系列文章包括NGS基础、转录组分析（Nature重磅综述|关于RN...
10X单细胞（10X空间转录组）轨迹分析（拟时分析）VECTOR
hello，昨天我们分享了VECTOR的示例代码，文章在10X单细胞（10X空间转录组）轨迹分析（拟时分析）之VE...
空间组数据和单细胞数据的相关性分析（Seurat）2022-05
相似关键词单细胞数据集相关性分析空间组与单细胞数据集相关性分析空间组数据集相关性分析适用背景近年来，单细...
单细胞与空间联合分析-CARD简介-20220721
今天上午听了作者讲解CARD，基于去卷积方法进行空间与单细胞联合分析的软件，可以做：单细胞cluster与空间数...
java中引用传递、值传递的理解（数组，自定义类，基本数据类型
代码部分：方法补充，如下：输出结果补充，如下：结果分析：将数据赋值或当做参数传递的时候：1，对于基本数据类...
10X单细胞、VDJ、空间转录组助力研究转移性黑色素瘤的免疫图谱
hello，大家好，今天我们继续分享单细胞空间的分析内容，不过我们今天重点放在免疫部分，参考文献在Resident...
PySCENIC（一）：python版单细胞转录组转录因子分析
关于单细胞转录组转录因子的分析我们之前在单细胞系列讲过R语言版本的，参考：跟着Cell学单细胞转录组分析(十二):...
空间转录组聚类分析与生态位
作者，追风少年i 对于单细胞和空间转录组的数据分析而言，聚类分析都是绕不开的一个话题，从单细胞数据分析的角度看，聚...
10X单细胞-10X空间转录组联合分析之七----CellDAR
hello，大家好，今天再次给大家带来10X单细胞空间联合分析的一个新方法，CellDART，有关10X单细胞空间...
10X单细胞空间联合分析揭示了受不同基质细胞群调节的follic
hello，大家好，我这个loser又来了，又是单细胞空间联合，不知道大家的单细胞空间数据分析的怎么样了？？？发文...