美文网首页
代码库5-Scanpy标准流程

代码库5-Scanpy标准流程

作者: 江湾青年 | 来源:发表于2023-04-01 22:06 被阅读0次

    载入包

    import numpy as np
    import pandas as pd
    import scanpy as sc
    
    sc.settings.verbosity = 3             # verbosity: errors (0), warnings (1), info (2), hints (3)
    sc.logging.print_header()
    sc.settings.set_figure_params(dpi=80, facecolor='white')
    
    results_file = 'write/pbmc3k.h5ad'  # the file that will store the analysis results
    

    两种方式读取数据

    直接读取.h5ad文件

    adata = sc.read('pancreas.h5ad')
    

    读取10X测序文件

    adata = sc.read_10x_mtx(
        'data/filtered_gene_bc_matrices/hg19/',  # the directory with the `.mtx` file
        var_names='gene_symbols',                # use gene symbols for the variable names (variables-axis index)
        cache=True)                              # write a cache file for faster subsequent reading
    

    预处理、质控

    adata.var_names_make_unique()  # this is unnecessary if using `var_names='gene_ids'` in `sc.read_10x_mtx`
    sc.pl.highest_expr_genes(adata, n_top=20, )
    # Basic filtering
    sc.pp.filter_cells(adata, min_genes=200)
    sc.pp.filter_genes(adata, min_cells=3)
    # Filter mitochondrial genes
    adata.var['mt'] = adata.var_names.str.startswith('MT-')  # annotate the group of mitochondrial genes as 'mt'
    sc.pp.calculate_qc_metrics(adata, qc_vars=['mt'], percent_top=None, log1p=False, inplace=True)
    # genes,counts分布
    sc.pl.violin(adata, ['n_genes_by_counts', 'total_counts', 'pct_counts_mt'],
                 jitter=0.4, multi_panel=True)
    # Scatter plot
    sc.pl.scatter(adata, x='total_counts', y='pct_counts_mt')
    sc.pl.scatter(adata, x='total_counts', y='n_genes_by_counts')
    # Filtering by 
    adata = adata[adata.obs.n_genes_by_counts < 2500, :]
    adata = adata[adata.obs.pct_counts_mt < 5, :]
    

    标准流程

    # Normalize
    sc.pp.normalize_total(adata, target_sum=1e4)
    sc.pp.log1p(adata)
    
    # Identify HVG
    sc.pp.highly_variable_genes(adata, min_mean=0.0125, max_mean=3, min_disp=0.5)
    sc.pl.highly_variable_genes(adata)
    
    # 储存counts到adata.raw
    adata.raw = adata
    adata = adata[:, adata.var.highly_variable]
    
    # regress_out total_counts & pct_counts_mt
    sc.pp.regress_out(adata, ['total_counts', 'pct_counts_mt'])
    
    # Scale
    sc.pp.scale(adata, max_value=10)
    
    # PCA
    sc.pl.pca_variance_ratio(adata, log=True)
    adata.write(results_file)
    
    # Computing the neighborhood graph
    sc.pp.neighbors(adata, n_neighbors=10, n_pcs=40)
    
    # UMAP
    sc.tl.umap(adata)
    
    # Unsupervised clustering
    sc.tl.leiden(adata)
    
    # Save the result
    adata.write(results_file)
    

    寻找差异基因

    # each one vs all
    sc.tl.rank_genes_groups(adata, 'leiden', method='wilcoxon')
    sc.pl.rank_genes_groups(adata, n_genes=25, sharey=False)
    
    # Show the 10 top ranked genes per cluster 0, 1, …, 7 in a dataframe
    pd.DataFrame(adata.uns['rank_genes_groups']['names']).head(5)
    
    # Get a table with the scores and groups
    result = adata.uns['rank_genes_groups']
    groups = result['names'].dtype.names
    pd.DataFrame(
        {group + '_' + key[:1]: result[key][group]
        for group in groups for key in ['names', 'pvals']}).head(5)
    
    # 0 vs 1
    sc.tl.rank_genes_groups(adata, 'leiden', groups=['0'], reference='1', method='wilcoxon')
    sc.pl.rank_genes_groups(adata, groups=['0'], n_genes=20)
    
    

    作图函数

    # FeaturePlot
    sc.pl.umap(adata, color=['CST3', 'NKG7', 'PPBP'])
    
    # DimPlot
    # 当adata.obsm的keys为'umap'时:
    sc.pl.umap(adata, color='leiden', legend_loc='on data')
    # 当adata.obsm的keys为其他时:(这里为'UMAP')
    sc.pl.embedding(adata_atac, basis='UMAP', color='Clusters')
    
    
    # VlnPlot
    sc.pl.violin(adata, ['CST3', 'NKG7', 'PPBP'], groupby='leiden')
    

    参考

    https://scanpy-tutorials.readthedocs.io/en/latest/pbmc3k.html

    相关文章

      网友评论

          本文标题:代码库5-Scanpy标准流程

          本文链接:https://www.haomeiwen.com/subject/splcddtx.html