26、第二十六计 指桑骂槐
指着桑树骂槐树。比喻借题发挥,指着这个骂那个。
此计是指强大者要控制弱下者,要用警戒的办法去诱导他;治军时有时采取适当的强刚手段便会得到应和,行险则遇顺。
因为损坏细胞和死细胞常表现出过大量的线粒体污染,因此需要过滤高线粒体基因表达的细胞。过滤原则为,移去线粒体基因表达比例过高的细胞,但是不能大量丢失样本细胞信息。
library(Seurat)
library(tidyverse)
library(Matrix)
#加载所需R包,如果没有自行安装下
读入Rdata文件并查看Rdata文件保存的变量
load("H:/Scripts_scRNA_seq_2020/seurat_object.small.Rdata")attach("H:/Scripts_scRNA_seq_2020/seurat_object.small.Rdata")# 注:路径改为自己文夹存放的路径
## The following object is masked _by_ .GlobalEnv:## ## object
ls("file:H:/Scripts_scRNA_seq_2020/seurat_object.small.Rdata")
## [1] "object"
查看保存变量名后,读入Rdata文件
load('H:/Scripts_scRNA_seq_2020/seurat_object.small.Rdata')table(object@active.ident)
## ## 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 ## 100 100 100 100 100 100 100 100 100 100 100 100 100 100 33 92
head(object@meta.data)
## nCount_RNA nFeature_RNA percent.mito percent.ribo cluster## B-AAACGAAGTGAGTAGC 4000 1284 10.525000 18.700000 12## B-AAACGCTGTAGTCACT 7268 2502 9.506122 11.019397 10## B-AAACGCTGTATTTCCT 10096 3035 21.018225 5.645800 3## B-AAAGTGAAGTAACCTC 1741 1022 4.075775 6.142365 6## B-AAAGTGACAGCACAAG 3720 1276 2.607527 15.967742 12## B-AAAGTGAGTCGAGTGA 27745 5055 4.080014 9.511624 16## sample## B-AAACGAAGTGAGTAGC B## B-AAACGCTGTAGTCACT B## B-AAACGCTGTATTTCCT B## B-AAAGTGAAGTAACCTC B## B-AAAGTGACAGCACAAG B## B-AAAGTGAGTCGAGTGA B
第一种方法:设置固定值过滤
# Set threshold# mitochondrial gene expression ratio must least than 20%clean_data1 <- subset(object, percent.mito <= 20)print(ncol(object))
## [1] 1525
print(ncol(clean_data1))
## [1] 1328
# Decreasing cell ratio is:print(1 - ncol(clean_data1)/ncol(object))
## [1] 0.1291803
第二种方法:设置固定值过滤并考虑线粒体基因表达分布情况
Filter_high_mito_genes <- function(mito_quantile, max_ratio){ for (i in 1:length(mito_quantile)){ while (mito_quantile[i] >= max_ratio){ i = i - 1 print(mito_quantile[i]) return(m_raw[i]) break } } }# Set thresholdm_raw <- quantile(object$percent.mito, probs=seq(0,1,0.01)) m_raw_max <- Filter_high_mito_genes(m_raw, 20)
## 87% ## 19.87992
clean_data2 <- subset(object, percent.mito <= m_raw_max)print(ncol(object))
## [1] 1525
print(ncol(clean_data2))
## [1] 1326
# Decreasing cell ratio is:print(1 - ncol(clean_data2)/ncol(object))
## [1] 0.1304918
第三种方法:找寻离群值并删除
- 使用绝对中位差(median absolute deviation, MAD)
# MAD: https://eurekastatistics.com/using-the-median-absolute-deviation-to-find-outliers/# Median absolute deviationseurat_clean <- objectmedian(seurat_clean$percent.mito)
## [1] 4.252895
median(abs(seurat_clean$percent.mito-median(seurat_clean$percent.mito)))
## [1] 3.02637
mad(seurat_clean$percent.mito, constant=1)
## [1] 3.02637
mad_value <- mad(seurat_clean$percent.mito, constant=1)
第一种寻找离群值的方法
Hampel filter: https://towardsdatascience.com/outliers-detection-in-r-6c835f14e554Hampel filter认为在区间[median - 3*MAD, median + 3*MAD]
之外的值为离群值。考虑到能接受线粒体基因表达比例过低的细胞,因此,这里线粒体基因表达比例高于median + 3*MAD
的值认为是离群值。
# I = [median - 3*MAD; median + 3*MAD]lower_bound <- median(seurat_clean$percent.mito - 3*mad_value)upper_bound <- median(seurat_clean$percent.mito + 3*mad_value)# outlier_ind <- which(seurat_clean$percent.mito < lower_bound | seurat_clean$percent.mito > upper_bound)# Set thresholdclean_data3_1 <- subset(object, percent.mito <= upper_bound)print(ncol(object))
## [1] 1525
print(ncol(clean_data3_1))
## [1] 1212
# Decreasing cell ratio is:print(1 - ncol(clean_data3_1)/ncol(object))
## [1] 0.2052459
另一种寻找离群值的方法
- 1 对线粒体基因表达分布计算MAD值
- 2 根据MAD值计算MAD 偏差
- 3 计算MAD偏差的中位数
- 4 以MAD偏差的中位数中心化数据,拟合正态分布,过滤对p值BH检验后,p<0.01的细胞。
# using a median-centered median absolute deviation (MAD)-variance normal distribution, # and then removed the cells with significantly higher expression levels than expected (determined by Benjamini-Hochberg corrected p < 0.01, # step1: calculate mad_value# step2: calculate median mad variance# step3: calculate median-centered mad variancelibrary(MASS)
## ## Attaching package: 'MASS'## The following object is masked from 'package:dplyr':## ## select
mad_value <- mad(seurat_clean$percent.mito, constant=1)median_mad_value <- median(seurat_clean$percent.mito-mad_value)mad_data <- seurat_clean$percent.mito-median_mad_valuequantile(mad_data, probs=seq(0,1,0.01))
## 0% 1% 2% 3% 4% 5% ## -1.22652582 -1.12677189 -1.04505412 -0.96783664 -0.88353556 -0.81879921 ## 6% 7% 8% 9% 10% 11% ## -0.73403847 -0.66443843 -0.56898173 -0.50345527 -0.40002066 -0.31144517 ## 12% 13% 14% 15% 16% 17% ## -0.24071698 -0.16165378 -0.10054114 -0.02779893 0.04874940 0.10832668 ## 18% 19% 20% 21% 22% 23% ## 0.16474711 0.25546482 0.32277906 0.37170908 0.45207025 0.53549713 ## 24% 25% 26% 27% 28% 29% ## 0.65528358 0.70880294 0.77405797 0.84589355 0.92257368 0.97663224 ## 30% 31% 32% 33% 34% 35% ## 1.11049555 1.17586328 1.27531613 1.34020968 1.40574423 1.46427697 ## 36% 37% 38% 39% 40% 41% ## 1.52484548 1.60613395 1.68158378 1.77465127 1.86783711 1.94859642 ## 42% 43% 44% 45% 46% 47% ## 2.10190300 2.17907731 2.32396704 2.46017357 2.56372110 2.70605979 ## 48% 49% 50% 51% 52% 53% ## 2.77613932 2.87061865 3.02636956 3.17350690 3.33353077 3.45084425 ## 54% 55% 56% 57% 58% 59% ## 3.65666815 3.77864615 3.94223341 4.15836488 4.37500052 4.51630880 ## 60% 61% 62% 63% 64% 65% ## 4.76932730 4.99745931 5.29929727 5.60190488 5.79174774 6.01009750 ## 66% 67% 68% 69% 70% 71% ## 6.25622972 6.52175657 6.86668520 7.20998333 7.53006017 7.82706009 ## 72% 73% 74% 75% 76% 77% ## 8.16781243 8.45596040 8.93308566 9.59124010 10.08937106 10.79636937 ## 78% 79% 80% 81% 82% 83% ## 11.41592714 11.76810435 12.36376280 13.17405483 14.26994801 15.17521921 ## 84% 85% 86% 87% 88% 89% ## 15.90015886 16.84398507 17.66219670 18.65338941 19.79228322 21.11463974 ## 90% 91% 92% 93% 94% 95% ## 22.60018875 24.04406053 26.04609465 29.19720742 33.30219617 37.55305544 ## 96% 97% 98% 99% 100% ## 40.53573859 43.55046754 48.29564234 54.91861069 66.97004349
fit <- fitdistr(mad_data, densfun='normal')hist(mad_data, prob=T, main='')curve(dnorm(x, fit$estimate[1], fit$estimate[2]), col='red', add=T)
图片
p_data <- pnorm(mad_data, mean=fit$estimate[1], sd=fit$estimate[2], lower.tail=F)# quantile(p_data, probs=seq(0,1,0.01))a_data <- p.adjust(p_data, method='BH')# quantile(a_data, probs=seq(0,1,0.01))# Set thresholdclean_data3_2 <- subset(object, cells=which(a_data > 0.01))print(ncol(object))
## [1] 1525
print(ncol(clean_data3_2))
## [1] 1502
# Decreasing cell ratio is:print(1 - ncol(clean_data3_2)/ncol(object))
## [1] 0.01508197
总之,过滤细胞比例为1.5%到20.5%,具体线粒体基因表达比例的选择因样本而异,言之有理即可。
比较以上几种方法过滤后细胞线粒体基因分布情况
mito_distribution <- function(seurat_object){ metadata <- seurat_object@meta.data metadata %>% ggplot(aes(color=sample, x=percent.mito, fill=sample)) + geom_density(alpha = 0.2) + scale_x_continuous(breaks = seq(0, max(object$percent.mito), by=5)) + theme_classic() }p1 <- mito_distribution(object)p2 <- mito_distribution(clean_data1)p3 <- mito_distribution(clean_data2)p4 <- mito_distribution(clean_data3_1)p5 <- mito_distribution(clean_data3_2)library(cowplot)plot_grid(p1, p2, p3, p4, p5, labels=c('original', '1', '2', '3-1', '3-2'))
图片
原文链接:过滤线粒体基因表达过高的细胞
网友评论