美文网首页单细胞转录组
单细胞36计之26---过滤线粒体基因表达过高的细胞

单细胞36计之26---过滤线粒体基因表达过高的细胞

作者: Seurat_Satija | 来源:发表于2021-03-24 10:42 被阅读0次

    26、第二十六计 指桑骂槐
    指着桑树骂槐树。比喻借题发挥,指着这个骂那个。
    此计是指强大者要控制弱下者,要用警戒的办法去诱导他;治军时有时采取适当的强刚手段便会得到应和,行险则遇顺。

    因为损坏细胞和死细胞常表现出过大量的线粒体污染,因此需要过滤高线粒体基因表达的细胞。过滤原则为,移去线粒体基因表达比例过高的细胞,但是不能大量丢失样本细胞信息。

    library(Seurat)
    library(tidyverse)
    library(Matrix)
    #加载所需R包,如果没有自行安装下
    

    读入Rdata文件并查看Rdata文件保存的变量

    load("H:/Scripts_scRNA_seq_2020/seurat_object.small.Rdata")attach("H:/Scripts_scRNA_seq_2020/seurat_object.small.Rdata")# 注:路径改为自己文夹存放的路径
    
    ## The following object is masked _by_ .GlobalEnv:## ##     object
    
    ls("file:H:/Scripts_scRNA_seq_2020/seurat_object.small.Rdata") 
    
    ## [1] "object"
    

    查看保存变量名后,读入Rdata文件

    load('H:/Scripts_scRNA_seq_2020/seurat_object.small.Rdata')table(object@active.ident)
    
    ## ##   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16 ## 100 100 100 100 100 100 100 100 100 100 100 100 100 100  33  92
    
    head(object@meta.data)
    
    ##                    nCount_RNA nFeature_RNA percent.mito percent.ribo cluster## B-AAACGAAGTGAGTAGC       4000         1284    10.525000    18.700000      12## B-AAACGCTGTAGTCACT       7268         2502     9.506122    11.019397      10## B-AAACGCTGTATTTCCT      10096         3035    21.018225     5.645800       3## B-AAAGTGAAGTAACCTC       1741         1022     4.075775     6.142365       6## B-AAAGTGACAGCACAAG       3720         1276     2.607527    15.967742      12## B-AAAGTGAGTCGAGTGA      27745         5055     4.080014     9.511624      16##                    sample## B-AAACGAAGTGAGTAGC      B## B-AAACGCTGTAGTCACT      B## B-AAACGCTGTATTTCCT      B## B-AAAGTGAAGTAACCTC      B## B-AAAGTGACAGCACAAG      B## B-AAAGTGAGTCGAGTGA      B
    

    第一种方法:设置固定值过滤

    # Set threshold# mitochondrial gene expression ratio must least than 20%clean_data1 <- subset(object, percent.mito <= 20)print(ncol(object))
    
    ## [1] 1525
    
    print(ncol(clean_data1))
    
    ## [1] 1328
    
    # Decreasing cell ratio is:print(1 - ncol(clean_data1)/ncol(object))
    
    ## [1] 0.1291803
    

    第二种方法:设置固定值过滤并考虑线粒体基因表达分布情况

    Filter_high_mito_genes <-   function(mito_quantile, max_ratio){    for (i in 1:length(mito_quantile)){      while (mito_quantile[i] >= max_ratio){        i = i - 1        print(mito_quantile[i])        return(m_raw[i])        break      }    }  }# Set thresholdm_raw <- quantile(object$percent.mito, probs=seq(0,1,0.01)) m_raw_max <- Filter_high_mito_genes(m_raw, 20)
    
    ##      87% ## 19.87992
    
    clean_data2 <- subset(object, percent.mito <= m_raw_max)print(ncol(object))
    
    ## [1] 1525
    
    print(ncol(clean_data2))
    
    ## [1] 1326
    
    # Decreasing cell ratio is:print(1 - ncol(clean_data2)/ncol(object))
    
    ## [1] 0.1304918
    

    第三种方法:找寻离群值并删除

    • 使用绝对中位差(median absolute deviation, MAD)
    # MAD: https://eurekastatistics.com/using-the-median-absolute-deviation-to-find-outliers/# Median absolute deviationseurat_clean <- objectmedian(seurat_clean$percent.mito)
    
    ## [1] 4.252895
    
    median(abs(seurat_clean$percent.mito-median(seurat_clean$percent.mito)))
    
    ## [1] 3.02637
    
    mad(seurat_clean$percent.mito, constant=1)
    
    ## [1] 3.02637
    
    mad_value <- mad(seurat_clean$percent.mito, constant=1)
    

    第一种寻找离群值的方法

    Hampel filter: https://towardsdatascience.com/outliers-detection-in-r-6c835f14e554Hampel filter认为在区间[median - 3*MAD, median + 3*MAD]之外的值为离群值。考虑到能接受线粒体基因表达比例过低的细胞,因此,这里线粒体基因表达比例高于median + 3*MAD的值认为是离群值。

    # I = [median - 3*MAD; median + 3*MAD]lower_bound <- median(seurat_clean$percent.mito - 3*mad_value)upper_bound <- median(seurat_clean$percent.mito + 3*mad_value)# outlier_ind <- which(seurat_clean$percent.mito < lower_bound | seurat_clean$percent.mito > upper_bound)# Set thresholdclean_data3_1 <- subset(object, percent.mito <= upper_bound)print(ncol(object))
    
    ## [1] 1525
    
    print(ncol(clean_data3_1))
    
    ## [1] 1212
    
    # Decreasing cell ratio is:print(1 - ncol(clean_data3_1)/ncol(object))
    
    ## [1] 0.2052459
    

    另一种寻找离群值的方法

    • 1 对线粒体基因表达分布计算MAD值
    • 2 根据MAD值计算MAD 偏差
    • 3 计算MAD偏差的中位数
    • 4 以MAD偏差的中位数中心化数据,拟合正态分布,过滤对p值BH检验后,p<0.01的细胞。
    # using a median-centered median absolute deviation (MAD)-variance normal distribution, # and then removed the cells with significantly higher expression levels than expected (determined by Benjamini-Hochberg corrected p < 0.01,                                                                                                                                                                # step1: calculate mad_value# step2: calculate median mad variance# step3: calculate median-centered mad variancelibrary(MASS)
    
    ## ## Attaching package: 'MASS'## The following object is masked from 'package:dplyr':## ##     select
    
    mad_value <- mad(seurat_clean$percent.mito, constant=1)median_mad_value <- median(seurat_clean$percent.mito-mad_value)mad_data <- seurat_clean$percent.mito-median_mad_valuequantile(mad_data, probs=seq(0,1,0.01))
    
    ##          0%          1%          2%          3%          4%          5% ## -1.22652582 -1.12677189 -1.04505412 -0.96783664 -0.88353556 -0.81879921 ##          6%          7%          8%          9%         10%         11% ## -0.73403847 -0.66443843 -0.56898173 -0.50345527 -0.40002066 -0.31144517 ##         12%         13%         14%         15%         16%         17% ## -0.24071698 -0.16165378 -0.10054114 -0.02779893  0.04874940  0.10832668 ##         18%         19%         20%         21%         22%         23% ##  0.16474711  0.25546482  0.32277906  0.37170908  0.45207025  0.53549713 ##         24%         25%         26%         27%         28%         29% ##  0.65528358  0.70880294  0.77405797  0.84589355  0.92257368  0.97663224 ##         30%         31%         32%         33%         34%         35% ##  1.11049555  1.17586328  1.27531613  1.34020968  1.40574423  1.46427697 ##         36%         37%         38%         39%         40%         41% ##  1.52484548  1.60613395  1.68158378  1.77465127  1.86783711  1.94859642 ##         42%         43%         44%         45%         46%         47% ##  2.10190300  2.17907731  2.32396704  2.46017357  2.56372110  2.70605979 ##         48%         49%         50%         51%         52%         53% ##  2.77613932  2.87061865  3.02636956  3.17350690  3.33353077  3.45084425 ##         54%         55%         56%         57%         58%         59% ##  3.65666815  3.77864615  3.94223341  4.15836488  4.37500052  4.51630880 ##         60%         61%         62%         63%         64%         65% ##  4.76932730  4.99745931  5.29929727  5.60190488  5.79174774  6.01009750 ##         66%         67%         68%         69%         70%         71% ##  6.25622972  6.52175657  6.86668520  7.20998333  7.53006017  7.82706009 ##         72%         73%         74%         75%         76%         77% ##  8.16781243  8.45596040  8.93308566  9.59124010 10.08937106 10.79636937 ##         78%         79%         80%         81%         82%         83% ## 11.41592714 11.76810435 12.36376280 13.17405483 14.26994801 15.17521921 ##         84%         85%         86%         87%         88%         89% ## 15.90015886 16.84398507 17.66219670 18.65338941 19.79228322 21.11463974 ##         90%         91%         92%         93%         94%         95% ## 22.60018875 24.04406053 26.04609465 29.19720742 33.30219617 37.55305544 ##         96%         97%         98%         99%        100% ## 40.53573859 43.55046754 48.29564234 54.91861069 66.97004349
    
    fit <- fitdistr(mad_data, densfun='normal')hist(mad_data, prob=T, main='')curve(dnorm(x, fit$estimate[1], fit$estimate[2]), col='red', add=T)
    
    图片
    p_data <- pnorm(mad_data, mean=fit$estimate[1], sd=fit$estimate[2], lower.tail=F)# quantile(p_data, probs=seq(0,1,0.01))a_data <- p.adjust(p_data, method='BH')# quantile(a_data, probs=seq(0,1,0.01))# Set thresholdclean_data3_2 <- subset(object, cells=which(a_data > 0.01))print(ncol(object))
    
    ## [1] 1525
    
    print(ncol(clean_data3_2))
    
    ## [1] 1502
    
    # Decreasing cell ratio is:print(1 - ncol(clean_data3_2)/ncol(object))
    
    ## [1] 0.01508197
    

    总之,过滤细胞比例为1.5%到20.5%,具体线粒体基因表达比例的选择因样本而异,言之有理即可。

    比较以上几种方法过滤后细胞线粒体基因分布情况

    mito_distribution <- function(seurat_object){  metadata <- seurat_object@meta.data  metadata %>%     ggplot(aes(color=sample, x=percent.mito, fill=sample)) +     geom_density(alpha = 0.2) +     scale_x_continuous(breaks = seq(0, max(object$percent.mito), by=5)) +     theme_classic() }p1 <- mito_distribution(object)p2 <- mito_distribution(clean_data1)p3 <- mito_distribution(clean_data2)p4 <- mito_distribution(clean_data3_1)p5 <- mito_distribution(clean_data3_2)library(cowplot)plot_grid(p1, p2, p3, p4, p5, labels=c('original', '1', '2', '3-1', '3-2'))
    
    图片

    原文链接:过滤线粒体基因表达过高的细胞

    相关文章

      网友评论

        本文标题:单细胞36计之26---过滤线粒体基因表达过高的细胞

        本文链接:https://www.haomeiwen.com/subject/rysfhltx.html