美文网首页
课前准备-单细胞velocity分析(贝叶斯模型)

课前准备-单细胞velocity分析(贝叶斯模型)

作者: 单细胞空间交响乐 | 来源:发表于2024-06-15 11:08 被阅读0次

    作者,Evil Genius

    速率

    • Probabilistic modeling of RNA velocity
    • Direct modeling of raw spliced and unspliced read count
    • Multiple uncertainty diagnostics analysis and visualizations
    • Synchronized cell time estimation across genes
    • Multivariate denoised gene expression and velocity prediction
    image image image image

    实现目标

    image

    整体框架

    1、前处理
    import scvelo as scv
    adata = scv.read("local_file.h5ad")
    
    adata.layers['raw_spliced']   = adata.layers['spliced']
    adata.layers['raw_unspliced'] = adata.layers['unspliced']
    adata.obs['u_lib_size_raw'] = adata.layers['raw_unspliced'].toarray().sum(-1)
    adata.obs['s_lib_size_raw'] = adata.layers['raw_spliced'].toarray().sum(-1)
    scv.pp.filter_and_normalize(adata, min_shared_counts=30, n_top_genes=2000)
    scv.pp.moments(adata, n_pcs=30, n_neighbors=30)
    
    
    2、训练模型
    from pyrovelocity.api import train_model
    from pyrovelocity.plot import vector_field_uncertainty
    num_epochs = 1000 # large data
    # num_epochs = 4000 # small data
    # Model 1
    adata_model_pos = train_model(adata,
                                   max_epochs=num_epochs, svi_train=True, log_every=100,
                                   patient_init=45,
                                   batch_size=4000, use_gpu=0, cell_state='state_info',
                                   include_prior=True,
                                   offset=False,
                                   library_size=True,
                                   patient_improve=1e-3,
                                   model_type='auto',
                                   guide_type='auto_t0_constraint',
                                   train_size=1.0)
    # Or Model 2
    adata_model_pos = train_model(adata,
                                   max_epochs=num_epochs, svi_train=True, log_every=100,
                                   patient_init=45,
                                   batch_size=4000, use_gpu=0, cell_state='state_info',
                                   include_prior=True,
                                   offset=True,
                                   library_size=True,
                                   patient_improve=1e-3,
                                   model_type='auto',
                                   guide_type='auto',
                                   train_size=1.0)
    
    # adata_model_pos is a returned list in which 0th element is the trained model,
    # the 1st element is the posterior samples of all random variables
    
    trained_model = adata_model_pos[0]
    posterior_samples = adata_model_pos[1]
    v_map_all, embeds_radian, fdri = vector_field_uncertainty(
        adata,
        posterior_samples=posterior_samples,
        basis="umap"
    )
    
    save_res = True
    if save_res:
        trained_model.save('saved_model', overwrite=True)
        result_dict = {"adata_model_pos": posterior_samples,
                       "v_map_all": v_map_all,
                       "embeds_radian": embeds_radian, "fdri": fdri} #, "embed_mean": embed_mean}
        import pickle
        with open("posterior_samples.pkl", "wb") as f:
             pickle.dump(result_dict, f)
    
    
    3、速率估计
    from pyrovelocity.plot import plot_state_uncertainty
    from pyrovelocity.plot import plot_posterior_time, plot_gene_ranking,\
          vector_field_uncertainty, plot_vector_field_uncertain,\
          plot_mean_vector_field, project_grid_points,rainbowplot,denoised_umap,\
          us_rainbowplot, plot_arrow_examples, set_colorbar
    
    import numpy as np
    import matplotlib.pyplot as plt
    import seaborn as sns
    
    embedding = 'umap' # change to umap or tsne based on your embedding method
    
    # This generates the posterior samples of all vector fields
    # and statistical testing results from Rayleigh test
    v_map_all, embeds_radian, fdri = vector_field_uncertainty(adata, posterior_samples,
                                                              basis=embedding, denoised=False, n_jobs=30)
    fig, ax = plt.subplots()
    # This returns the posterior mean of the vector field
    embed_mean = plot_mean_vector_field(posterior_samples, adata, ax=ax, n_jobs=30, basis=embedding)
    # This plot single-cell level vector field uncertainty
    # and averaged cell vector field uncertainty on the grid points
    # based on angular standard deviation
    fig, ax = plt.subplots(1, 2)
    fig.set_size_inches(11.5, 5)
    plot_vector_field_uncertain(adata, embed_mean, embeds_radian,
                                ax=ax,
                                fig=fig, cbar=False, basis=embedding, scale=None)
    
    # This generates shared time uncertainty plot with contour lines
    fig, ax = plt.subplots(1, 3)
    fig.set_size_inches(12, 2.8)
    adata.obs['shared_time_uncertain'] = posterior_samples['cell_time'].std(0).flatten()
    ax_cb = scv.pl.scatter(adata, c='shared_time_uncertain', ax=ax[0], show=False, cmap='inferno', fontsize=7, s=20, colorbar=True, basis=embedding)
    select = adata.obs['shared_time_uncertain'] > np.quantile(adata.obs['shared_time_uncertain'], 0.9)
    sns.kdeplot(adata.obsm[f'X_{embedding}'][:, 0][select],
                adata.obsm[f'X_{embedding}'][:, 1][select],
                ax=ax[0], levels=3, fill=False)
    
    # This generates vector field uncertainty based on Rayleigh test.
    adata.obs.loc[:, 'vector_field_rayleigh_test'] = fdri
    im = ax[1].scatter(adata.obsm[f'X_{embedding}'][:, 0],
                       adata.obsm[f'X_{embedding}'][:, 1], s=3, alpha=0.9,
                       c=adata.obs['vector_field_rayleigh_test'], cmap='inferno_r',
                       linewidth=0)
    set_colorbar(im, ax[1], labelsize=5, fig=fig, position='right')
    select = adata.obs['vector_field_rayleigh_test'] > np.quantile(adata.obs['vector_field_rayleigh_test'], 0.95)
    sns.kdeplot(adata.obsm[f'X_{embedding}'][:, 0][select],
                adata.obsm[f'X_{embedding}'][:, 1][select], ax=ax[1], levels=3, fill=False)
    ax[1].axis('off')
    ax[1].set_title("vector field\nrayleigh test\nfdr<0.05: %s%%" % (round((fdri < 0.05).sum()/fdri.shape[0], 2)*100), fontsize=7)
    
    
    4、可视化
    fig = plt.figure(figsize=(7.07, 4.5))
    subfig = fig.subfigures(1, 2, wspace=0.0, hspace=0, width_ratios=[1.6, 4])
    ax = fig.subplots(1)
    # This generates the selected cell fate markers and output in DataFrame
    volcano_data, _ = plot_gene_ranking([posterior_samples], [adata], ax=ax,
                                         time_correlation_with='st', assemble=True)
    # This generates the rainbow plots for the selected markers.
    _ = rainbowplot(volcano_data, adata, posterior_samples,
                    subfig[1], data=['st', 'ut'], num_genes=4)
    
    
    方法示例
    %load_ext autoreload
    %autoreload 2
    import scvelo as scv
    import numpy as np
    import torch
    import matplotlib.pyplot as plt
    from pyrovelocity.data import load_data
    from scipy.stats import spearmanr, pearsonr
    from pyrovelocity.api import train_model
    import seaborn as sns
    import pandas as pd
    from pyrovelocity.plot import plot_posterior_time, plot_gene_ranking,\
          vector_field_uncertainty, plot_vector_field_uncertain,\
          plot_mean_vector_field, project_grid_points,rainbowplot,denoised_umap,\
          us_rainbowplot, plot_arrow_examples
    from pyrovelocity.utils import mae, mae_evaluate
    
    adata = load_data(top_n=2000, min_shared_counts=30)
    
    
    模型训练
    adata_model_pos_split = train_model(adata, max_epochs=4000, svi_train=False, log_every=100,
                                        patient_init=45, batch_size=-1, use_gpu=1,
                                        include_prior=True, offset=False, library_size=True,
                                        patient_improve=1e-4, guide_type='auto_t0_constraint', train_size=0.67)
    
    mae_evaluate(adata_model_pos_split, adata)
    
    adata_model_pos = train_model(adata, max_epochs=4000, svi_train=False, log_every=100,
                                  patient_init=45, batch_size=-1, use_gpu=0,
                                  include_prior=True, offset=False, library_size=True,
                                  patient_improve=1e-4, guide_type='auto_t0_constraint', train_size=1.0)
    
    mae_evaluate(adata_model_pos[1], adata)
    
    v_map_all, embeds_radian, fdri = vector_field_uncertainty(adata, adata_model_pos[1], basis='umap', n_jobs=10)
    
    
    可视化
    fig, ax = plt.subplots()
    embed_mean = plot_mean_vector_field(adata_model_pos[1], adata, ax=ax, n_jobs=10)
    
    

    图片排列

    fig = plt.figure(figsize=(7.07, 4.5))
    subfig_A = fig.subfigures(2, 1, wspace=0.0, hspace=0, height_ratios=[1.2, 3])
    dot_size = 3
    font_size = 7
    
    ress = pd.DataFrame({"cell_type": adata.obs['clusters'].values,
                         "X1": adata.obsm['X_umap'][:,0],
                         "X2": adata.obsm['X_umap'][:,1]})
    subfig_A0 = subfig_A[0].subfigures(1, 2, wspace=0.0, hspace=0, width_ratios=[4, 2])
    
    ax = subfig_A0[0].subplots(1, 4)
    sns.scatterplot(x='X1', y='X2', data=ress, alpha=0.9, s=dot_size,
                    linewidth=0, edgecolor="none", hue='cell_type',
                    ax=ax[0], legend='brief')
    ax[0].axis('off')
    ax[0].set_title("Cell types\n", fontsize=font_size)
    ax[0].legend(bbox_to_anchor=[2.9, -0.01], ncol=4, prop={'size': font_size}, fontsize=font_size, frameon=False)
    kwargs = dict(color='gray', density=.8, add_margin=.1, s=dot_size,
                  show=False, alpha=.2, min_mass=3.5, frameon=False)
    scv.pl.velocity_embedding_stream(adata, fontsize=font_size, ax=ax[1], title='', **kwargs)
    ax[1].set_title("Scvelo\n", fontsize=7)
    
    scv.pl.velocity_embedding_stream(adata, fontsize=font_size, basis='umap',
                                     title='', ax=ax[2], vkey='velocity_pyro', **kwargs)
    ax[2].set_title("Pyro-Velocity\n", fontsize=7)
    
    # plot_arrow_examples(adata_train, np.transpose(v_map_all, (1, 2, 0)), embeds_radian, ax=ax[3],
    #                     n_sample=30, fig=fig, basis='umap', scale=0.004, alpha=0.18)
    plot_arrow_examples(adata, np.transpose(v_map_all, (1, 2, 0)), embeds_radian, ax=ax[3],
                        n_sample=30, fig=fig, basis='umap', scale=0.008, alpha=0.2, index=5,
                        scale2=0.015, num_certain=6)
    ax[3].set_title("Single cell\nvector field", fontsize=7)
    
    plot_vector_field_uncertain(adata, embed_mean, embeds_radian,
                                fig=subfig_A0[1], cbar=True, basis='umap', scale=0.05)
    
    subfig_A0[0].subplots_adjust(hspace=0.2, wspace=0.1, left=0.01, right=0.99, top=0.99, bottom=0.35)
    subfig_A0[1].subplots_adjust(hspace=0.2, wspace=0.1, left=0.01, right=0.99, top=0.99, bottom=0.35)
    
    subfig_A[0].text(-0.06, 0.58, "Pancreas", size=7, rotation="vertical", va="center")
    
    subfig_B = subfig_A[1].subfigures(1, 2, wspace=0.0, hspace=0, width_ratios=[1.6, 4])
    ##subfig_B[0].subplots_adjust(hspace=0.2, wspace=0.1, left=0.01, right=0.99, top=0.99, bottom=0.1)
    
    ax = subfig_B[0].subplots(2, 1)
    plot_posterior_time(adata_model_pos[1], adata, ax=ax[0], fig=subfig_B[0], addition=False)
    subfig_B[0].subplots_adjust(hspace=0.3, wspace=0.1, left=0.01, right=0.8, top=0.92, bottom=0.17)
    
    volcano_data2, _ = plot_gene_ranking([adata_model_pos[1]], [adata], ax=ax[1],
                                        time_correlation_with='st', assemble=True)
    _ = rainbowplot(volcano_data2, adata, adata_model_pos[1], subfig_B[1], data=['st', 'ut'], num_genes=4)
    
    
    image
    fig.savefig("pancreas_output_figure1.tif", facecolor=fig.get_facecolor(), bbox_inches='tight', edgecolor='none', dpi=300)
    
    fig, ax = plt.subplots(1, 2)
    fig.set_size_inches(11.5, 5)
    plot_vector_field_uncertain(adata, embed_mean, embeds_radian,
                                ax=ax,
                                fig=fig, cbar=False, basis='umap', scale=0.05)
    
    
    image
    More uncertainty evaluation metrics
    from pyrovelocity.plot import plot_state_uncertainty
    fig, ax = plt.subplots(2, 3)
    fig.set_size_inches(7.07, 3.5)
    pos = adata_model_pos[1]
    bin = 30
    adata.obs['cell_time_uncertain'] = adata_model_pos[1]['cell_time'].std(0).flatten()
    scv.pl.scatter(adata, c='cell_time_uncertain', ax=ax[0][0], show=False, cmap='inferno', fontsize=7)
    
    _ = ax[1][0].hist(adata.obs.cell_time_uncertain, bins=bin, color='black', alpha=0.9)
    ax[1][0].set_xlabel("shared time\nstandard deviation", fontsize=7)
    select = adata.obs['cell_time_uncertain'] > np.quantile(adata.obs['cell_time_uncertain'], 0.9)
    sns.kdeplot(adata.obsm['X_umap'][:, 0][select],
                adata.obsm['X_umap'][:, 1][select], ax=ax[0][0], levels=3, fill=False)
    
    adata.obs['vector_field_rayleigh_test'] = fdri
    scv.pl.scatter(adata, c='vector_field_rayleigh_test', ax=ax[0][1], show=False, cmap='inferno_r', fontsize=7)
    select = adata.obs['vector_field_rayleigh_test'] > np.quantile(adata.obs['vector_field_rayleigh_test'], 0.9)
    sns.kdeplot(adata.obsm['X_umap'][:, 0][select],
                adata.obsm['X_umap'][:, 1][select], ax=ax[0][1], levels=3, fill=False)
    
    _ = ax[1][1].hist(adata.obs.vector_field_rayleigh_test, bins=bin, color='black', alpha=0.9)
    ax[1][1].set_xlabel("vector field\nrayleigh test fdr", fontsize=7)
    ax[1][1].text(0.08, 2000, "<1%% FDR:%s%%" % int((fdri < 0.01).sum()/fdri.shape[0]*100), fontsize=7)
    ax[1][1].text(0.08, 1500, "<5%% FDR:%s%%" % int((fdri < 0.05).sum()/fdri.shape[0]*100), fontsize=7)
    
    fig.subplots_adjust(hspace=0.3, wspace=0.7, left=0.01, right=0.8, top=0.92, bottom=0.17)
    plot_state_uncertainty(pos, adata, kde=False, data='denoised', top_percentile=0.9, ax=ax[0][2])
    
    _ = ax[1][2].hist(adata.obs['state_uncertain'], bins=bin, color='black', alpha=0.9)
    ax[1][2].set_xlabel("averaged state uncertainty", fontsize=7)
    select = adata.obs['state_uncertain'] > np.quantile(adata.obs['state_uncertain'], 0.9)
    sns.kdeplot(adata.obsm['X_umap'][:, 0][select],
                adata.obsm['X_umap'][:, 1][select], ax=ax[0][2], levels=3, fill=False)
    
    ax[1][0].set_ylabel("Frequency", fontsize=7)
    fig.tight_layout()
    
    

    生活很好,有你更好

    相关文章

      网友评论

          本文标题:课前准备-单细胞velocity分析(贝叶斯模型)

          本文链接:https://www.haomeiwen.com/subject/idqzqjtx.html