美文网首页
课后补充----关于单细胞空间基础分析的代码部分

课后补充----关于单细胞空间基础分析的代码部分

作者: 单细胞空间交响乐 | 来源:发表于2024-08-01 10:52 被阅读0次

    作者,Evil Genius

    我们本次2024年度的系列课程基础分析是没有放进来的,但是不是说明这部分不重要,相反,这部分是一切个性化分析的基础,我们来分享一下基础分析的代码部分

    首先是scRNA/snRNA的基础分析,包括常见的基础质控 + 排污 + 双细胞 + 降维聚类差异 + 生成h5ad文件,当然了,代码是官网 + 经验 + 文献搜集而来。封装类的代码一般是公司在用,大家又能力可以自己封装一下,下面是代码示例,注意,是示例,需要大家根据自己需求稍作修改,有些地方需要注意。

    library (Seurat)
    library (dplyr)
    library (ggplot2)
    library (harmony)
    library (patchwork)
    library (SoupX)
    library (DoubletFinder)
    library(reticulate)
    library(sceasy)
    library(anndata)
    library(patchwork)
    library(SeuratDisk)
    
    #================================Ambient RNA Correction===============================#
    Control1 = load10X('path/to/your/cellranger/outs/folder')
    Control1 = autoEstCont(Control1)
    Control1 = adjustCounts(Control1)
    #Do it for all samples
    #================================Doublet Removal===============================#
    Control1.S <- CreateSeuratObject(counts = Control1, project = "Control1", min.cells = 3, min.features = 300) # create Seurat object
    Control1.S <- NormalizeData(Control1.S)
    Control1.S <- FindVariableFeatures(Control1.S, selection.method = "vst", nfeatures = 2000)
    Control1.S <- ScaleData(Control1.S)
    Control1.S <- RunPCA(Control1.S)
    Control1.S <- RunUMAP(Control1.S, dims = 1:50)
    sweep.res.list_kidney <- paramSweep_v3(Control1.S, PCs = 1:30, sct = FALSE)
    sweep.stats_kidney <- summarizeSweep(sweep.res.list_kidney, GT = FALSE)
    bcmvn_kidney <- find.pK(sweep.stats_kidney) #determine the pk
    nExp_poi <- round(0.075*nrow(Control1.S@meta.data)) 
    Control1.S <- doubletFinder_v3(seu_kidney, PCs = 1:30, pN = 0.25, pK = #depends on the previous sterp, nExp = nExp_poi, reuse.pANN = FALSE, sct = FALSE)
    Control1.S.Filtered <- subset(Control1.S, subset = #Doublet_related_column == "singlet")                                 
    #Do it for all samples
    #============================Merge Datssets======================================================#
    HK_SC_RNA = merge (Control1.S.Filtered, y = c (#All other sc filtered objects))
    
    #============================Add percent.mt to dataset======================================================#
    HK_SC_RNA[["percent.mt"]] <- PercentageFeatureSet(HK_SC_RNA, pattern = "^MT-")
    
    #============================Pre Processing======================================================#
    HK_SC_RNA <- subset(HK_SC_RNA, subset = nFeature_RNA > 200 & nFeature_RNA < 3000 & percent.mt < 50 & percent.rpl < 15 & percent.rps < 15)
    
    HK_SC_RNA <- FindVariableFeatures(object = HK_SC_RNA,selection.method = "vst")
    HK_SC_RNA <- NormalizeData(HK_SC_RNA)
    HK_SC_RNA <- ScaleData(HK_SC_RNA, vars.to.regress = c ("nCount_RNA", "percent.mt", "percent.rpl", "percent.rps"))
    HK_SC_RNA <- RunPCA(HK_SC_RNA, assay = 'RNA', npcs = 30, features = VariableFeatures(object = HK_SC_RNA), 
                                  verbose = TRUE, ndims.print = 1:5, nfeatures.print = 10)
    
    #============= Run Harmony =============
    HK_SC_RNA <- RunHarmony(HK_SC_RNA, group.by.vars = "orig.ident")
    HK_SC_RNA <- RunUMAP(HK_SC_RNA, reduction = "harmony", dims = 1:30)
    HK_SC_RNA <- FindNeighbors(HK_SC_RNA, reduction = "harmony", dims = 1:30) %>% FindClusters()                   
    DimPlot(HK_SC_RNA, reduction = "umap", label = TRUE, pt.size = 0.5) + NoLegend()
    
    #================================== DEG List and cell type annotation==============================#
    HK_SC_RNA.Markers <- FindAllMarkers(HK_SC_RNA, only.pos = TRUE, logfc.threshold = 0.25)
    
    #================================== Annotation and store data as Idents==============================#
    HK_SC_RNA <- RenameIdents(HK_SC_RNA, `0` = "PT")
    
    #==================================Convert to adata using sceasy ==============================#
    HK_SC_RNA_anndata <- convertFormat(HK_SC_RNA, from="seurat", to="anndata", main_layer="counts", drop_single_values=FALSE)
    write_h5ad(HK_SC_RNA_anndata, filename = "HK_SC_RNA_anndata.h5ad")
    

    ATAC部分的基础分析

    library (Signac)
    library(Seurat)
    library (dplyr)
    library(ggplot2)
    library (harmony)
    library (patchwork)
    library (EnsDb.Hsapiens.v86)
    library(rtracklayer)
    library(reticulate)
    library(sceasy)
    library(anndata)
    library(patchwork)
    library(SeuratDisk)
    #--------------------------Prepare Object---------------------------------#
    annotation <- GetGRangesFromEnsDb(ensdb = EnsDb.Hsapiens.v86)
    genome(annotation) <- "hg38"
    seqlevelsStyle(annotation) <- "UCSC"
    counts <- Read10X_h5(filename = "/Your_Path/filtered_peak_bc_matrix.h5")
    metadata <- read.csv(file = "/Your_Path/singlecell.csv", header = TRUE,row.names = 1)
    fragpath <- "/Your_Path/outs/fragments.tsv.gz"
    chromatin.assay <- CreateChromatinAssay(counts = counts, sep = c(":", "-"),fragments = fragpath,annotation = annotation)
    Control1.S <- CreateSeuratObject(counts = chromatin.assay,assay = "peaks",meta.data = metadata)
    
    #Annotation
    hg38 <- import("/Your_Path_To_GTF_File/genes.gtf.gz")
    genome(hg38) <- "hg38"
    seqlevelsStyle(hg38) <- "UCSC"
    hg38$gene_biotype <- hg38$gene_type
    Annotation(Control1.S) <- hg38
    #QC Filtering
    Control1.S <- NucleosomeSignal(object = Control1.S)
    Control1.S <- TSSEnrichment(Control1.S, fast = FALSE)
    Control1.S$pct_reads_in_peaks <- Control1.S$peak_region_fragments / Control1.S$passed_filters * 100
    Control1.S$blacklist_ratio <- Control1.S$blacklist_region_fragments / Control1.S$peak_region_fragments
    Control1.S$high.tss <- ifelse(Control1.S$TSS.enrichment > 2, 'High', 'Low')
    pdf("Viloin.HK.Fetal.Mito.pdf", width=6, height=4)
    TSSPlot(Control1.S, group.by = 'high.tss') + NoLegend()
    Control1.S$nucleosome_group <- ifelse(Control1.S$nucleosome_signal > 4, 'NS > 4', 'NS < 4')
    FragmentHistogram(object = Control1.S, group.by = 'nucleosome_group')
    pdf("HK2481_1.ATAC.QC.pdf", width=18, height=4)
    VlnPlot(object = Control1.S, features = c('pct_reads_in_peaks', 'peak_region_fragments', 'TSS.enrichment', 'blacklist_ratio', 'nucleosome_signal'), pt.size = 0.1, ncol = 5)
    Control1.S <- subset(x = Control1.S, subset = peak_region_fragments > 3000 & peak_region_fragments < 20000 & pct_reads_in_peaks > 15 & blacklist_ratio < 0.05 & nucleosome_signal < 4 & TSS.enrichment > 2)
    #Do this step for all 20 samples
    #--------------------------Merge Datasets---------------------------------#
    HK.SN.ATAC = merge (x = Control1.S, y = list(#list of all other objects))
    #--------------------------Pre-processing---------------------------------#  
    HK.SN.ATAC <- RunTFIDF(HK.SN.ATAC)
    HK.SN.ATAC <- FindTopFeatures(HK.SN.ATAC, min.cutoff = 'q0')
    HK.SN.ATAC <- RunSVD(HK.SN.ATAC)
    HK.SN.ATAC <- RunHarmony(
      object = HK.SN.ATAC,
      group.by.vars = 'dataset',
      reduction = 'lsi',
      assay.use = 'peaks',
      project.dim = FALSE
    )
    HK.SN.ATAC <- RunUMAP(HK.SN.ATAC, dims = 2:50, reduction = 'harmony')
    HK.SN.ATAC <- RunUMAP(HK.SN.ATAC, reduction = "harmony", dims = 2:50)
    HK.SN.ATAC <- FindNeighbors(HK.SN.ATAC, reduction = "harmony", dims = 2:50) %>% FindClusters()
    #--------------------------DAR---------------------------------#
    DARs <- FindAllMarkers(object = HK.SN.ATAC, min.pct = 0.05, test.use = 'LR', latent.vars = 'peak_region_fragments')
    
    HK_SN_ATAC_anndata <- convertFormat(HK_SN_ATAC_Filtered, from="seurat", to="anndata", main_layer="counts", drop_single_values=FALSE)
    write_h5ad(HK_SN_ATAC_anndata, filename = "HK_SN_ATAC_anndata.h5ad")
    

    空间基础分析部分 ,包括NMF寻找微环境的部分

    library (dplyr)
    library (ggplot2)
    library(SeuratData)
    library (harmony)
    library (patchwork)
    library (Seurat)
    options(stringsAsFactors = F)
    library(CellTrek)
    library(Seurat)
    library(viridis)
    library(ConsensusClusterPlus)
    library(SpotClean)
    library(S4Vectors)
    library (STUtility)
    set.seed(123)
    # Load 10x Visium data
    Control1.ST_raw <- read10xRaw("/path/to/matrix/folder")
    Control1.ST_slide_info <- read10xSlide("/path/to/tissue/csv", 
                                      "/path/to/tissue/image", 
                                      "/path/to/scale/factor")
    
    # Visualize raw data
    Control1.ST_obj <- createSlide(count_mat = Control1.ST_raw, 
                              slide_info = Control1.ST_slide_info)
    visualizeSlide(slide_obj = Control1.ST_obj)
    visualizeHeatmap(Control1.ST_obj,rownames(Control1.ST_raw)[1])
    
    # Decontaminate raw data
    Control1.ST.decont_obj <- spotclean(Control1.ST_obj)
    
    # Repeat this step for all 14 ST samples
    
    #-----------------------------Preprocessing------------------------------#
    Control1.ST <- SCTransform(Control1.ST_obj, assay = "Spatial", verbose = FALSE)
    #Do it for all samples
    #-----------------------------Merge All SCT spatial samples------------------------------#
    HK.ST = merge (Control1.ST, y =c (#All the ST objects))
      
    #-----------------------------Merge All SCT spatial samples------------------------------# 
    HK.ST <- RunPCA(HK.ST, assay = "SCT", verbose = FALSE)
    HK.ST <- RunHarmony(HK.ST, group.by.vars = "orig.ident")
    HK.ST <- RunUMAP(HK.ST, reduction = "harmony", dims = 1:20)
    HK.ST <- FindNeighbors(HK.ST, reduction = "harmony", dims = 1:20) %>% FindClusters()
    
    #-----------------------------Find DEGs between Clusters------------------------------# 
    HK.ST <- FindAllMarkers(HK.ST, only.pos = TRUE, min.pct = 0.25, logfc.threshold = 0.25)
    
    #-----------------------------Run NMF for microenvironments------------------------------# 
    RunNMF(
     HK.ST,
     features = NULL,
      nfactors = 20,
      rescale = TRUE,
      reduction.name = "NMF")
    HK.ST <- RunHarmony(HK.ST, group.by.vars = "orig.ident", reduction = "NMF")  
    HK.ST <- RunUMAP(HK.ST, reduction = "harmony", dims = 1:20)
    HK.ST <- FindNeighbors(HK.ST, reduction = "harmony", dims = 1:20) %>% FindClusters()
    #-----------------------------Find DEGs between MEs------------------------------# 
    HK.ST <- FindAllMarkers(HK.ST, only.pos = TRUE, min.pct = 0.25, logfc.threshold = 0.25)
    
    #-----------------------------------Mapping Back snRNA-seq, scRNA-seq, and snATAC-seq to Spatial dataset----------------------#
    HK.SC.SN.ATAC <- HK.SC.SN.ATAC[, sample(colnames(HK.SN), size =20000, replace=F)] #Downsample the annotated snRNA-seq matrix
    Control1.ST = Load10X_Spatial("/Your_10X-Folder/outs", filename = "filtered_feature_bc_matrix.h5", assay = "Spatial", slice = "slice1")
    Control1.ST <- RenameCells(Control1.ST, new.names=make.names(Cells(Control1.ST)))
    HK.SC.SN.ATAC <- RenameCells(HK.SC.SN.ATAC, new.names=make.names(Cells(HK.SC.SN.ATAC)))
    Coltrol1.ST.Samples_traint <- CellTrek::traint(st_data=Control1.ST, sc_data=HK.SC.SN.ATAC, sc_assay='RNA', cell_names='seurat_clusters')
    Control1.ST.HK.SN_celltrek <- CellTrek::celltrek(st_sc_int=Coltrol1.ST.Samples_traint, int_assay='traint', sc_data=HK.SC.SN.ATAC, sc_assay = 'RNA', reduction='pca', intp=T, intp_pnt=5000, intp_lin=F, nPCs=30, ntree=1000, dist_thresh=0.55, top_spot=5, spot_n=5, repel_r=20, repel_iter=20, keep_model=T)$celltrek
    Control1.ST.HK.SN_celltrek$seurat_clusters <- factor(Control1.ST.HK.SN_celltrek$seurat_clusters, levels=sort(Control1.ST.HK.SN_celltrek$seurat_clusters)))
    Idents = Control1.ST.HK.SN_celltrek[["Idents2"]]
    Idents (Control1.ST.HK.SN_celltrek) = Idents
    

    基础的解卷积部分,cell2location + 共定位

    import scanpy as sc
    import anndata
    import pandas as pd
    import numpy as np
    import os
    import gc
    
    import cell2location
    
    import matplotlib as mpl
    from matplotlib import rcParams
    import matplotlib.pyplot as plt
    import seaborn as sns
    
    # silence scanpy that prints a lot of warnings
    import warnings
    warnings.filterwarnings('ignore')
    
    
    from google.colab import drive
    drive.mount('/content/drive')
    
    
    import cell2location
    
    results_folder = '/content/drive/MyDrive/Spatial/HK2529_ST'
    ref_run_name = f'{results_folder}/reference_signatures'
    run_name = f'{results_folder}/cell2location_map'
    
    
    adata = sc.read_visium(path ='/content/drive/MyDrive/HK2529_ST/', count_file= "filtered_feature_bc_matrix.h5", load_images=True)
    
    adata.var_names_make_unique()
    adata.var["mt"] = adata.var_names.str.startswith("MT-")
    sc.pp.calculate_qc_metrics(adata, qc_vars=["mt"], inplace=True)
    
    
    sns.distplot(adata.obs["total_counts"], kde=False)
    
    
    #sc.pp.filter_cells(adata, min_counts=5000)
    #sc.pp.filter_cells(adata, max_counts=35000) 
    #adata = adata[adata.obs["pct_counts_mt"] < 50]
    print(f"#cells after MT filter: {adata.n_obs}")
    sc.pp.filter_genes(adata, min_cells=10)
    
    
    sc.pp.pca(adata)
    sc.pp.neighbors(adata)
    sc.tl.umap(adata)
    sc.tl.leiden(adata, key_added="clusters")
    
    
    sc.pp.pca(adata)
    sc.pp.neighbors(adata)
    sc.tl.umap(adata)
    sc.tl.leiden(adata, key_added="clusters")
    
    
    sc.pl.spatial(adata,color=["COL1A1", "NPHS1"], img_key=None, size=1,vmin=0, cmap='magma', vmax='p99.0', gene_symbols='SYMBOL')
    
    
    adata_ref = sc.read ("/content/drive/MyDrive/adata_ref_withmodel_4.3.2023.h5ad")
    
    # export estimated expression in each cluster
    if 'means_per_cluster_mu_fg' in adata_ref.varm.keys():
        inf_aver = adata_ref.varm['means_per_cluster_mu_fg'][[f'means_per_cluster_mu_fg_{i}' 
                                        for i in adata_ref.uns['mod']['factor_names']]].copy()
    else:
        inf_aver = adata_ref.var[[f'means_per_cluster_mu_fg_{i}' 
                                        for i in adata_ref.uns['mod']['factor_names']]].copy()
    inf_aver.columns = adata_ref.uns['mod']['factor_names']
    inf_aver.iloc[0:5, 0:5]
    
    
    intersect = np.intersect1d(adata.var_names, inf_aver.index)
    adata = adata[:, intersect].copy()
    inf_aver = inf_aver.loc[intersect, :].copy()
    
    # prepare anndata for cell2location model
    cell2location.models.Cell2location.setup_anndata(adata=adata)
    # can add additional slides to adata and add in a batch_key
    
    
    mod = cell2location.models.Cell2location(
        adata, cell_state_df=inf_aver, 
        # the expected average cell abundance: tissue-dependent 
        # hyper-prior which can be estimated from paired histology:
        N_cells_per_location=30,
        # hyperparameter controlling normalisation of
        # within-experiment variation in RNA detection:
        detection_alpha=20
    ) 
    mod.view_anndata_setup()
    
    
    mod.train(max_epochs=30000, 
              # train using full data (batch_size=None)
              batch_size=None, 
              # use all data points in training because 
              # we need to estimate cell abundance at all locations
              train_size=1,
              use_gpu=True)
    
    # plot ELBO loss history during training, removing first 100 epochs from the plot
    mod.plot_history(1000)
    plt.legend(labels=['full data training']);
    
    
    
    # In this section, we export the estimated cell abundance (summary of the posterior distribution).
    adata = mod.export_posterior(
        adata, sample_kwargs={'num_samples': 1000, 'batch_size': mod.adata.n_obs, 'use_gpu': True}
    )
    
    adata.obs[adata.uns['mod']['factor_names']] = adata.obsm['q05_cell_abundance_w_sf']
    
    
    # Save model
    mod.save(f"{run_name}", overwrite=True)
    
    # mod = cell2location.models.Cell2location.load(f"{run_name}", adata)
    
    # Save anndata object with results
    adata_file = f"{run_name}/sp.h5ad"
    adata.write(adata_file)
    adata_file
    
    
    
    # select one slide
    from cell2location.utils import select_slide
    sc.pl.spatial(adata, cmap='magma',
                      color=['Podo',"Endo_G","PEC", "Mes", "iPT", "GS_Stromal", "PT_S1", "PT_S2", "PT_S3", "Fibroblast_1", "Fibroblast_2", "DCT1" , "M_TAL", "DTL",
                             "CD4T", "CD8T", "Mac", "Dedifferentiated_Tubule"], 
                      ncols=4, size=1.3, 
                      img_key='hires',
                      # limit color scale at 99.2% quantile of cell abundance
                      vmin=0, vmax='p99.2' 
                     )
                     
                     
                     sc.pp.neighbors(adata, use_rep='q05_cell_abundance_w_sf',
                    n_neighbors = 15)
    sc.tl.leiden(adata, resolution=1.1)
    adata.obs["region_cluster"] = adata.obs["leiden"].astype("category")
    sc.tl.umap(adata, min_dist = 0.3, spread = 1)
    
    
    sc.pl.spatial(adata, color=['region_cluster'],
                      size=1.3, img_key='hires', alpha=0.5)
                      
                      
                      # Set the value of the "sample" column for all rows to "HK3513_ST"
    adata.obs['sample'] = "HK2529_ST"
    
    # Print the updated "obs" attribute
    print(adata.obs)
    
    
    # Now we use cell2location plotter that allows showing multiple cell types in one panel
    from cell2location.plt import plot_spatial
    
    # select up to 6 clusters
    clust_labels = [ 'Podo', 'PEC', "GS_Stromal", "Fibroblast_1"]
    clust_col = ['' + str(i) for i in clust_labels] # in case column names differ from labels
    
    slide = select_slide(adata, 'HK2529_ST')
    
    with mpl.rc_context({'figure.figsize': (15, 15)}):
        fig = plot_spatial(
            adata=slide,
            # labels to show on a plot
            color=clust_col, labels=clust_labels,
            show_img=True,
            # 'fast' (white background) or 'dark_background'
            style='fast',
            # limit color scale at 99.2% quantile of cell abundance
            max_color_quantile=0.992,
            # size of locations (adjust depending on figure size)
            circle_diameter=12,
            colorbar_position='right'
        )
        
        from cell2location import run_colocation
    res_dict, adata = run_colocation(
        adata,
        model_name='CoLocatedGroupsSklearnNMF',
        train_args={'n_fact': np.arange(5, 25), 'n_restarts': 3 })
    

    生活很好,有你更好

    相关文章

      网友评论

          本文标题:课后补充----关于单细胞空间基础分析的代码部分

          本文链接:https://www.haomeiwen.com/subject/fhdohjtx.html