美文网首页单细胞测序专题集合
利用nnls进行反卷积运算

利用nnls进行反卷积运算

作者: 小潤澤 | 来源:发表于2022-07-25 21:33 被阅读0次

    相比较SVR而言,这里有另外一种解决单细胞反卷积的方法,nnls(非负最小二乘)
    文章链接:《Bulk tissue cell type deconvolution with multi-subject single-cell expression reference》

    核心思想

    1. Xjp 代表给定tissue的 sample j 中gene g的mRNA分子数
    2. Xjpc 代表给定tissue的 sample j ,某一cell c中gene g的mRNA分子数
    3. Ckj 代表cell type k
    4. mkj 代表cell type k的细胞数

    这个式子代表给定tissue的 sample j ,cell type k 中gene g 的平均相对表达量
    上式可以改编为
    1.

    代表cell type k中细胞总mRNA分子数的平均值(除以cell type k的总细胞数记作平均)

    1. 记作total number of cells in the bulk tissue

    2. 记作proportion of cells from cell type k

    3. 记作 bulk tissue 中 gene g 的相对表达量

    因此基于上述关系我们容易得知:


    这样的正比关系也暗示我们可以用线性模型来衡量这种关系,因此我们的目标就是估计 pkj 来表示细胞成分

    代码分析

    1. 读取数据并运算

    library(MuSiC)
    
    GSE50244.bulk.eset = readRDS('C:/Users/lenovo/Downloads/GSE50244bulkeset.rds')
    GSE50244.bulk.eset
    
    EMTAB.eset = readRDS('C:/Users/lenovo/Downloads/EMTABesethealthy.rds')
    EMTAB.eset
    
    
    XinT2D.eset = readRDS('C:/Users/lenovo/Downloads/XinT2Deset.rds')
    XinT2D.eset
    
    
    Est.prop.GSE50244 = music_prop(bulk.eset = GSE50244.bulk.eset, sc.eset = EMTAB.eset, clusters = 'cellType',
                                   samples = 'sampleID', select.ct = c('alpha', 'beta', 'delta', 'gamma',
                                                                       'acinar', 'ductal'), verbose = F)
    
    

    对应函数music_prop分步解析:

    # 读入数据
    bulk.eset = GSE50244.bulk.eset ## 读入bulk 的数据
    sc.eset = EMTAB.eset  ## 读入scRNA 的数据
    markers = NULL
    clusters = 'cellType'
    samples = 'sampleID'
    select.ct =  c('alpha', 'beta', 'delta',
               'gamma','acinar', 'ductal')   ## cell type 的名称
    cell_size = NULL
    ct.cov = FALSE
    verbose = TRUE
    iter.max = 1000
    nu = 0.0001
    eps = 0.01
    centered = FALSE
    normalize = FALSE
    
    # bulk 中选择基因表达不全为0的基因
    bulk.gene = rownames(bulk.eset)[rowMeans(exprs(bulk.eset)) != 0]
    bulk.eset = bulk.eset[bulk.gene, , drop = FALSE]
    
    # 把这部分的基因定义为scRNA的markers
    sc.markers = bulk.gene
    # 对scRNA 数据进行标准化
    sc.basis = music_basis(sc.eset, non.zero = TRUE, markers = sc.markers, clusters = clusters, samples = samples, select.ct = select.ct, cell_size = cell_size, ct.cov = ct.cov, verbose = verbose)
    
    # sc.basis$Disgn.mtx 代表着单细胞不同 cell type 的基因表达矩阵,取bulk和scRNA基因的交集
    cm.gene = intersect(rownames(sc.basis$Disgn.mtx), bulk.gene)
    m.sc = match(cm.gene, rownames(sc.basis$Disgn.mtx))
    m.bulk = match(cm.gene, bulk.gene)
    
    # 筛选scRNA矩阵的交集基因
    D1 = sc.basis$Disgn.mtx[m.sc, ]; 
    M.S = colMeans(sc.basis$S, na.rm = T);
    
    # 定义 bulk 表达矩阵的相对表达量
    Yjg = relative.ab(exprs(bulk.eset)[m.bulk, ]); N.bulk = ncol(bulk.eset);
    
    Sigma = sc.basis$Sigma[m.sc, ]
    
    # 对scRNA的cell type表达情况进行筛选
    valid.ct = (colSums(is.na(Sigma)) == 0)&(colSums(is.na(D1)) == 0)&(!is.na(M.S))
    D1 = D1[, valid.ct]
    M.S = M.S[valid.ct]
    Sigma = Sigma[, valid.ct]
      
    # 初始化
    Est.prop.allgene = NULL
    Est.prop.weighted = NULL
    Weight.gene = NULL
    r.squared.full = NULL
    Var.prop = NULL
    
    # 对bulk 表达矩阵的每个 sample 进行遍历
    for(i in 1:N.bulk){
      if(sum(Yjg[, i] == 0) > 0){
        # 筛选非零表达的基因
        D1.temp = D1[Yjg[, i]!=0, ];
        Yjg.temp = Yjg[Yjg[, i]!=0, i];
        Sigma.temp = Sigma[Yjg[,i]!=0, ];
      }else{
        D1.temp = D1;
        Yjg.temp = Yjg[, i];
        Sigma.temp = Sigma;
      }
    
      # 对每个 sample 的bulk表达矩阵 (向量)与 scRNA 表达矩阵做 nnls
      lm.D1.weighted = music.iter(Yjg.temp, D1.temp, M.S, Sigma.temp, iter.max = iter.max,
                                  nu = nu, eps = eps, centered = centered, normalize = normalize)
      Est.prop.allgene = rbind(Est.prop.allgene, lm.D1.weighted$p.nnls)
      Est.prop.weighted = rbind(Est.prop.weighted, lm.D1.weighted$p.weight)
      weight.gene.temp = rep(NA, nrow(Yjg)); weight.gene.temp[Yjg[,i]!=0] = lm.D1.weighted$weight.gene;
      Weight.gene = cbind(Weight.gene, weight.gene.temp)
      r.squared.full = c(r.squared.full, lm.D1.weighted$R.squared)
      Var.prop = rbind(Var.prop, lm.D1.weighted$var.p)
    }
    colnames(Est.prop.weighted) = colnames(D1)
    rownames(Est.prop.weighted) = colnames(Yjg)
    colnames(Est.prop.allgene) = colnames(D1)
    rownames(Est.prop.allgene) = colnames(Yjg)
    names(r.squared.full) = colnames(Yjg)
    colnames(Weight.gene) = colnames(Yjg)
    rownames(Weight.gene) = cm.gene
    colnames(Var.prop) = colnames(D1)
    rownames(Var.prop) = colnames(Yjg)
    
    ## 其中 Est.prop.weighted 表示细胞组分
    

    分析music.iter和music.basic(music.iter的内置函数)的源码:

    # 对于函数 music.iter
    ## 读入数据
    Y = Yjg.temp   # 某个 bulk 的表达矩阵
    D = D1.temp    # scRNA 的表达矩阵
    S = M.S
    Sigma = Sigma.temp
    iter.max = 1000
    nu = 0.0001
    eps = 0.01
    centered = FALSE
    normalize = FALSE
    
    
    k = ncol(D); # number of cell types
    # 计算 bulk 和 scRNA 基因的交集,得到相应的表达矩阵
    common.gene = intersect(names(Y), rownames(D))
    Y = Y[match(common.gene, names(Y))];
    D = D[match(common.gene, rownames(D)), ]
    Sigma = Sigma[match(common.gene, rownames(Sigma)), ]
    X = D
    Y = Y*100
    ## nnls计算
    lm.D = music.basic(Y, X, S, Sigma, iter.max = iter.max, nu = nu, eps = eps)
    
    
    # 对于函数music.basic
    k = ncol(X)
    ## nnls 计算某个 bulk (Y 矩阵) 的表达矩阵和 scRNA (X 矩阵) 的表达矩阵
    lm.D = nnls(X, Y)
    r = resid(lm.D)
    
    ## 定义基因的权重,lm.D$x 代表的是 nnls 回归系数
    weight.gene = 1/(nu + r^2 + colSums((lm.D$x*S)^2*t(Sigma)))
    
    ## 响应变量为 bulk 表达矩阵乘开根号后的weight.gene
    Y.weight = Y*sqrt(weight.gene)
    ## 决策变量为 scRNA 表达矩阵对应元素乘根号后的weight.gene
    D.weight = sweep(X, 1, sqrt(weight.gene), '*')
    
    ## 再次利用nnls计算
    lm.D.weight = nnls(D.weight, Y.weight)
    ## 细胞组分比例为 p.weight
    p.weight = lm.D.weight$x/sum(lm.D.weight$x)
    p.weight.iter = p.weight
    r = resid(lm.D.weight)
    
    ## 循环的目的是不停的迭代更新 weight.gene 使得 胞组分比例 p.weight 收敛
    for(iter in 1:iter.max){
      weight.gene = 1/(nu + r^2 + colSums( (lm.D.weight$x*S)^2*t(Sigma)  ))
      Y.weight = Y*sqrt(weight.gene)
      D.weight = X * as.matrix(sqrt(weight.gene))[,rep(1,k)]
      ## 不断更迭scRNA的weight和bulk的weight,使得最终细胞组分 p.weight 达到稳定
      lm.D.weight = nnls(D.weight, Y.weight )
      p.weight.new = lm.D.weight$x/sum(lm.D.weight$x)
      r.new = resid(lm.D.weight)
      if(sum(abs(p.weight.new - p.weight)) < eps){
        p.weight = p.weight.new;
        r = r.new
        R.squared = 1 - var(Y - X%*%as.matrix(lm.D.weight$x))/var(Y)
        fitted = X%*%as.matrix(lm.D.weight$x)
        var.p = diag(solve(t(D.weight)%*%D.weight)) * mean(r^2)/sum(lm.D.weight$x)^2
        return(list(p.nnls = (lm.D$x)/sum(lm.D$x), q.nnls = lm.D$x, fit.nnls = fitted(lm.D), resid.nnls = resid(lm.D),
                    p.weight = p.weight, q.weight = lm.D.weight$x, fit.weight = fitted, resid.weight =  Y - X%*%as.matrix(lm.D.weight$x),
                    weight.gene = weight.gene, converge = paste0('Converge at ', iter),
                    rsd = r, R.squared = R.squared, var.p = var.p));
      }
      p.weight = p.weight.new;
      r = r.new;
    }
    

    其中,lm.D$x 代表 nnls 模型的回归系数

    由此可见,作者并不是简单的直接运用scRNA表达矩阵和bulk矩阵,利用nnls直接计算细胞组分 p.weight,而是先定义一个 weight.genel来衡量基因的贡献程度,然后再分别得到矫正后的scRNA表达矩阵 D.weight 和校正后的 bulk 表达矩阵 Y.weight (进行表达矩阵数值的矫正),利用矫正后的矩阵进行nnls的计算(bulk 矩阵的 Y.weight 为Y.weight = Y*sqrt(weight.gene);scRNA 矩阵 D.weight 为D.weight = sweep(X, 1, sqrt(weight.gene), '*'))。通过不断迭代使得 细胞组分 p.weight 收敛

    相关文章

      网友评论

        本文标题:利用nnls进行反卷积运算

        本文链接:https://www.haomeiwen.com/subject/neqnirtx.html