美文网首页单细胞组学试读社会热点
Seurat中FindMarker寻找两个cell type差异

Seurat中FindMarker寻找两个cell type差异

作者: 小潤澤 | 来源:发表于2021-11-21 16:41 被阅读0次

    加载示例数据

    1.运行Seurat

    pbmc.data <- read.csv("GSE117988_raw.expMatrix_PBMC.csv",header=TRUE,row.names = 1)
    pbmc.data <- log2(1 + sweep(pbmc.data, 2, median(colSums(pbmc.data))/colSums(pbmc.data), '*'))
    pbmc<- CreateSeuratObject(counts = pbmc.data, project = "10X pbmc", min.cells = 1, min.features = 0)
    
    pbmc[["percent.mt"]] <- PercentageFeatureSet(pbmc, pattern = "^MT-")
    
    sce=pbmc
    sce <- NormalizeData(sce, normalization.method =  "LogNormalize", scale.factor = 10000)
    GetAssay(sce,assay = "RNA")
    sce <- FindVariableFeatures(sce, selection.method = "vst", nfeatures = 2000) 
    
    sce <- ScaleData(sce) 
    sce <- RunPCA(object = sce, pc.genes = VariableFeatures(sce)) 
    sce <- FindNeighbors(sce, dims = 1:15)
    sce <- FindClusters(sce, resolution = 0.2)
    table(sce@meta.data$RNA_snn_res.0.2) 
    
    set.seed(123)
    sce <- RunTSNE(object = sce, dims = 1:15, do.fast = TRUE)
    
    # 差异分析
    markers_df <- FindMarkers(object = sce, ident.1 = 0, ident.2 = 1, min.pct = 0.25)
    

    2.差异分析计算logFC值

    object = sce
    ident.1 = 0
    ident.2 = 1
    group.by = NULL
    subset.ident = NULL
    assay = NULL
    slot = 'data'
    reduction = NULL
    features = NULL
    logfc.threshold = 0.25
    test.use = "wilcox"
    min.pct = 0.25
    min.diff.pct = -Inf
    verbose = TRUE
    only.pos = FALSE
    max.cells.per.ident = Inf
    random.seed = 1
    latent.vars = NULL
    min.cells.feature = 3
    min.cells.group = 3
    pseudocount.use = 1
    mean.fxn = NULL
    fc.name = NULL
    base = 2
    densify = FALSE
      
    # select which data to use
    assay <- assay %||% DefaultAssay(object = object)
    data.use <- object[[assay]]
    cellnames.use <-  colnames(x = data.use)
    
    # IdentsToCells是提取细胞亚型的函数,可以提取某一亚型的细胞barcodes
    cells <- IdentsToCells(
      object = object,
      ident.1 = ident.1,
      ident.2 = ident.2,
      cellnames.use = cellnames.use
    )
    # check normalization method
    norm.command <- paste0("NormalizeData.", assay)
    norm.method <- Command(
      object = object,
      command = norm.command,
      value = "normalization.method"
    )
    
    # 读取相应的数据
    data.slot <- ifelse(
      test = test.use %in% DEmethods_counts(),
      yes = 'counts',
      no = slot
    )
    data.use <-  GetAssayData(object = object, slot = data.slot)
    counts <- switch(
      EXPR = data.slot,
      'scale.data' = GetAssayData(object = object, slot = "counts"),
      numeric()
    )
    
    # 计算两个cell type直接的logFC值
    fc.results <- FoldChange(
      object = object,
      slot = data.slot,
      ident.1 = 0,
      ident.2 = 1,
      features = features,
      pseudocount.use = pseudocount.use,
      mean.fxn = mean.fxn,
      fc.name = fc.name,
      base = base
    )
    

    其中:

    1. 对象 cells cells

      包括两种cell type的细胞barcodes(即每一个单独的细胞)

    2. 对象 fc.results fc.results

      计算了上述两种细胞类型每个基因的平均log2FC值

    那么软件是如何计算平均差异表达的logFC的呢?

    base = 2
    pseudocount.use = 1
    mean.fxn <- function(x) {
      return(log(x = rowMeans(x = x) + pseudocount.use, base = base))
    }
    
    # 加载数据
    ## data.use代表所有基因的表达矩阵
    object = data.use
    
    # Calculate percent expressed
    ## 定义阈值
    thresh.min <- 0
    # 过滤掉 cell type 1 基因在所有细胞表达都为 0 的基因
    pct.1 <- round(
      x = rowSums(x = object[, cells.1, drop = FALSE] > thresh.min) /
        length(x = cells.1),
      digits = 3
    )
    # 过滤掉 cell type 2 基因在所有细胞表达都为 0 的基因
    pct.2 <- round(
      x = rowSums(x = object[, cells.2, drop = FALSE] > thresh.min) /
        length(x = cells.2),
      digits = 3
    )
    # Calculate fold change
    ## mean.fxn 计算的是某基因在其中一种 cell type 中所有细胞表达量的对数平均值
    data.1 <- mean.fxn(object[, cells.1, drop = FALSE])
    data.2 <- mean.fxn(object[, cells.2, drop = FALSE])
    # 求logFC
    fc <- (data.1 - data.2)
    fc.results <- as.data.frame(x = cbind(fc, pct.1, pct.2))
    colnames(fc.results) <- c(fc.name, "pct.1", "pct.2")
    

    接下来就是计算差异logFC的p_val

    # 差异基因p_val计算
    ## 初始化对象
    object = data.use
    slot = data.slot
    counts = counts
    cells.1 = cells.1
    cells.2 = cells.2
    features = features
    logfc.threshold = logfc.threshold
    test.use = test.use
    min.pct = min.pct
    min.diff.pct = min.diff.pct
    verbose = verbose
    only.pos = only.pos
    max.cells.per.ident = max.cells.per.ident
    random.seed = random.seed
    latent.vars = latent.vars
    min.cells.feature = min.cells.feature
    min.cells.group = min.cells.group
    pseudocount.use = pseudocount.use
    fc.results = fc.results
    densify = densify
    
    ValidateCellGroups(
      object = object,
      cells.1 = cells.1,
      cells.2 = cells.2,
      min.cells.group = min.cells.group
    )
    features <- features %||% rownames(x = object)
    data <- switch(
      EXPR = slot,
      'scale.data' = counts,
      object
    )
    # feature selection (based on percentages)
    alpha.min <- pmax(fc.results$pct.1, fc.results$pct.2)
    names(x = alpha.min) <- rownames(x = fc.results)
    features <- names(x = which(x = alpha.min >= min.pct))
    
    alpha.diff <- alpha.min - pmin(fc.results$pct.1, fc.results$pct.2)
    features <- names(
      x = which(x = alpha.min >= min.pct & alpha.diff >= min.diff.pct)
    )
    
    # feature selection (based on logFC)
    ## 根据logFC的情况筛选基因,该例子的 logfc.threshold = 0.25
    total.diff <- fc.results[, 1] #first column is logFC
    names(total.diff) <- rownames(fc.results)
    features.diff <- names(x = which(x = abs(x = total.diff) >= logfc.threshold))
    
    features <- intersect(x = features, y = features.diff)
    
    # subsample cell groups if they are too large
    ## 计算差异显著性
    de.results <- PerformDE(
      object = object,
      cells.1 = cells.1,
      cells.2 = cells.2,
      features = features,
      test.use = test.use,
      verbose = verbose,
      min.cells.feature = min.cells.feature,
      latent.vars = latent.vars,
      densify = densify,
    )
    

    其中data.use代表单细胞的表达矩阵

    data.use
    行代表特征基因,列代表不同的细胞

    3.FindMarkers计算差异显著性的方法

    # 初始化
    ## 这里的data.use代表所有基因的表达矩阵
    object = data.use
    cells.1 = cells.1
    cells.2 = cells.2
    features = features
    test.use = test.use
    verbose = verbose
    min.cells.feature = min.cells.feature
    latent.vars = latent.vars
    densify = densify
    
    #提取特征基因的表达谱
    ## data.use代表筛选的特征基因的表达矩阵
    data.use <- object[features, c(cells.1, cells.2), drop = FALSE]
    
    #不同的检测差异基因的方法
    de.results <- switch(
      EXPR = test.use,
      'wilcox' = WilcoxDETest(
        data.use = data.use,
        cells.1 = cells.1,
        cells.2 = cells.2,
        verbose = verbose,
        ...
      ),
      'bimod' = DiffExpTest(
        data.use = data.use,
        cells.1 = cells.1,
        cells.2 = cells.2,
        verbose = verbose
      ),
      'roc' = MarkerTest(
        data.use = data.use,
        cells.1 = cells.1,
        cells.2 = cells.2,
        verbose = verbose
      ),
      't' = DiffTTest(
        data.use = data.use,
        cells.1 = cells.1,
        cells.2 = cells.2,
        verbose = verbose
      ),
      'negbinom' = GLMDETest(
        data.use = data.use,
        cells.1 = cells.1,
        cells.2 = cells.2,
        min.cells = min.cells.feature,
        latent.vars = latent.vars,
        test.use = test.use,
        verbose = verbose
      ),
      'poisson' = GLMDETest(
        data.use = data.use,
        cells.1 = cells.1,
        cells.2 = cells.2,
        min.cells = min.cells.feature,
        latent.vars = latent.vars,
        test.use = test.use,
        verbose = verbose
      ),
      'MAST' = MASTDETest(
        data.use = data.use,
        cells.1 = cells.1,
        cells.2 = cells.2,
        latent.vars = latent.vars,
        verbose = verbose,
        ...
      ),
      "DESeq2" = DESeq2DETest(
        data.use = data.use,
        cells.1 = cells.1,
        cells.2 = cells.2,
        verbose = verbose,
        ...
      ),
      "LR" = LRDETest(
        data.use = data.use,
        cells.1 = cells.1,
        cells.2 = cells.2,
        latent.vars = latent.vars,
        verbose = verbose
      ),
      stop("Unknown test: ", test.use)
    )
    

    由源代码我们可以知道,这里一共提供了9种检测差异基因的手段,那么接下来我们可以一种一种的讨论:

    [1].wilcox

    这一部分分为两种检测方式供用户选择,一种是用limma的wilcox检验

    # 初始化
    data.use = data.use
    cells.1 = cells.1
    cells.2 = cells.2
    verbose = verbose
    
    data.use <- data.use[, c(cells.1, cells.2), drop = FALSE]
    #以cell type 1 的长度作为索引,该例子中一共有两类细胞,共计6737个细胞
    j <- seq_len(length.out = length(x = cells.1))
    
    # limma wilcox test  计算 p_val
    ## 对第一个基因进行检验
    p_val = min(2 * min(limma::rankSumTestWithCorrelation(index = j, statistics = data.use[1, ])), 1)
    ### data.use[1, ]代表单细胞表达矩阵的第一个基因在两个种类型细胞中的表达量;其余基因依次遍历
    
    limma wilcox test p_val

    而另外一种则是用普通的wilcox test函数进行检验

    data.use = data.use
    cells.1 = cells.1
    cells.2 = cells.2
    verbose = verbose
    
    data.use <- data.use[, c(cells.1, cells.2), drop = FALSE]
    #以cell type 1 的长度作为索引,该例子中一共有两类细胞,共计6737个细胞
    j <- seq_len(length.out = length(x = cells.1))
    
    # 创立wilcox test的背景集
    group.info <- data.frame(row.names = c(cells.1, cells.2))
    group.info[cells.1, "group"] <- "Group1"
    group.info[cells.2, "group"] <- "Group2"
    group.info[, "group"] <- factor(x = group.info[, "group"])
    data.use <- data.use[, rownames(x = group.info), drop = FALSE]
    
    # wilcox test  计算 p_val
    ## 对第一个基因进行检验
    p_val = wilcox.test(data.use[1, ] ~ group.info[, "group"])$p.value
    ### data.use[1, ]代表单细胞表达矩阵的第一个基因在两个种类型细胞中的表达量;其余基因依次遍历
    
    wilcox test p_val

    可以看到两种wilcox test 计算的p值没有差别

    [2].bimod

    这种检验方式巧用了两个cell type中其中一个基因的表达量所构成的似然值
    根据文章:Data exploration, quality control and testing in single-cell qPCR-based gene expression experiments 的补充材料所述,对于单细胞表达数据,构造某基因在各个细胞中表达情况的似然函数如下:

    其中:

    1. Πk 表示某基因 A 在细胞中表达的比例
    2. 1-Πk 表示某基因 A 在细胞中未表达的比例
    3. nk 表示某基因 A 在细胞中表达的细胞个数
    4. 1-nk 表示某基因 A 在细胞中表达的细胞个数
    5. g() 为某基因 A 在细胞中表达了的pdf函数
    data.use = data.use
    cells.1 = cells.1
    cells.2 = cells.2
    verbose = verbose
    
    x = as.numeric(x = data.use[1, cells.1])
    y = as.numeric(x = data.use[1, cells.2])
    
    # 分别计算两种 cell type 某基因表达的对数似然值
    lrtX <- bimodLikData(x = x)
    lrtY <- bimodLikData(x = y)
    # 计算两种 cell type 混合时某基因表达的对似然值
    lrtZ <- bimodLikData(x = c(x, y))
    # 构造似然比,对数的加减相当于原式的乘除
    lrt_diff <- 2 * (lrtX + lrtY - lrtZ)
    # 进行对数似然比检验
    p_val = pchisq(q = lrt_diff, df = 3, lower.tail = F)
    
    
    bimodLikData <- function(x, xmin = 0) {
      # 提取某基因 A 未表达的细胞
      x1 <- x[x <= xmin]
      # 提取某基因 A 表达的细胞
      x2 <- x[x > xmin]
    
      xal <- MinMax(
        # 计算某基因 A 在细胞中表达的比例
        data = length(x = x2) / length(x = x),
        min = 1e-5,
        max = (1 - 1e-5)
      )
      # 计算 Πk^nk
      likA <- length(x = x1) * log(x = 1 - xal)
    
     # 计算分布的sd值
      if (length(x = x2) < 2) {
        mysd <- 1
      } else {
        mysd <- sd(x = x2)
      }
      # 其中  length(x = x2) * log(x = xal) 计算的是 (1-Πk)^(1-nk)
      # sum(dnorm(x = x2, mean = mean(x = x2), sd = mysd, log = TRUE) 计算的是某基因 A 在细胞中表达,对应于分布似然值的乘积(由于对数化,加和表示乘积)
      likB <- length(x = x2) *
        log(x = xal) +
        sum(dnorm(x = x2, mean = mean(x = x2), sd = mysd, log = TRUE))
      return(likA + likB)
    }
    

    其中:(默认取了对数)

    1. length(x = x1) * log(x = 1 - xal) 计算的是 Πknk
    2. length(x = x2) * log(x = xal) 计算的是 (1-Πk)(1-nk)
    3. sum(dnorm(x = x2, mean = mean(x = x2), sd = mysd, log = TRUE) 计算的是了 ∏ g()

    那么如何和利用分布间的对数似然和关系反应差异呢?
    假设红,蓝分别代表两类细胞中基因A(有表达的情况)的表达量分布;黑色代表两者合并的分布,我们分如下三种情况来讨论这个问题:

    情况 1:
    两类细胞中基因A(有表达的情况)的表达量分布很远; 的表达量比 低,则它们表达的均值关系就会是 mean(蓝) << mean(c(蓝,红)) << mean(红)

    X1 = seq(2, 11, by = .1)
    X2 = seq(11, 17, by = .1)
    
    Y1 = dnorm(X1, mean = mean(X1), sd = sd(X1),log = T)
    Y2 = dnorm(X2, mean = mean(X2), sd = sd(X2),log = T)
    
    v = c(X1,X2)
    g = dnorm(v, mean = mean(v), sd = sd(v),log = T)
    
    qq = data.frame(v,g)
    ww = data.frame(X1,Y1)
    ee = data.frame(X2,Y2)
    
    ggplot() + geom_point(qq,mapping = aes(x = v, y = g)) + 
      geom_point(ww,mapping = aes(x = X1, y = Y1,colour='red')) + 
      geom_point(ee,mapping = aes(x = X2, y = Y2,colour='blue'))
    
    sum(Y1)
    [1] -217.0104
    sum(Y2)
    [1] -121.0672
    sum(g)
    [1] -439.0152
    

    显然三个分布的对数似然值之和满足 Y1 + Y2 >> g,即 Y1 + Y2 的对数似然值之和与 g 的对数似然值之和相差很大

    情况 2:
    两类细胞中基因A(有表达的情况)的表达量分布重合; 的表达量和 的相同,则它们表达的均值关系就会是 mean(蓝) = mean(c(蓝,红)) = mean(红)

    X1 = seq(11, 15, by = .1)
    X2 = seq(11, 15, by = .1)
    
    Y1 = dnorm(X1, mean = mean(X1), sd = sd(X1),log = T)
    Y2 = dnorm(X2, mean = mean(X2), sd = sd(X2),log = T)
    
    v = c(X1,X2)
    g = dnorm(v, mean = mean(v), sd = sd(v),log = T)
    
    qq = data.frame(v,g)
    ww = data.frame(X1,Y1)
    ee = data.frame(X2,Y2)
    
    ggplot() + geom_point(qq,mapping = aes(x = v, y = g)) + 
      geom_point(ww,mapping = aes(x = X1, y = Y1,colour='red')) + 
      geom_point(ee,mapping = aes(x = X2, y = Y2,colour='blue'))
    
    sum(Y1)
    [1] -65.08036
    sum(Y2)
    [1] -65.08036
    sum(g)
    [1] -130.1514
    

    显然三个分布的对数似然值之和满足 Y1 + Y2 = g,即 Y1 + Y2的对数似然值之和与 g 的对数似然值之和一样大

    情况 3:
    两类细胞中基因A(有表达的情况)的表达量分布不是很远; 的表达量比 低,则它们表达的均值关系就会是 mean(蓝) < mean(c(蓝,红)) < mean(红)

    X1 = seq(9, 13, by = .1)
    X2 = seq(11, 15, by = .1)
    
    Y1 = dnorm(X1, mean = mean(X1), sd = sd(X1),log = T)
    Y2 = dnorm(X2, mean = mean(X2), sd = sd(X2),log = T)
    
    v = c(X1,X2)
    g = dnorm(v, mean = mean(v), sd = sd(v),log = T)
    
    qq = data.frame(v,g)
    ww = data.frame(X1,Y1)
    ee = data.frame(X2,Y2)
    
    ggplot() + geom_point(qq,mapping = aes(x = v, y = g)) + 
      geom_point(ww,mapping = aes(x = X1, y = Y1,colour='red')) + 
      geom_point(ee,mapping = aes(x = X2, y = Y2,colour='blue'))
    
    sum(Y1)
    [1] -65.08036
    sum(Y2)
    [1] -65.08036
    sum(g)
    [1] -152.2503
    

    显然三个分布的对数似然值之和满足 Y1 + Y2 > g,即 Y1 + Y2的对数似然值之和与 g 的对数似然值之和相差一般大

    所以综上所述,Y1 + Y2g 的关系可以反应 Y1Y2 两个分布差异的大小(也可以解释为Y1Y2 两个分布相离的距离)

    对于:

    x = as.numeric(x = data.use[1, cells.1])
    y = as.numeric(x = data.use[1, cells.2])
    
    lrtX <- bimodLikData(x = x)
    lrtY <- bimodLikData(x = y)
    lrtZ <- bimodLikData(x = c(x, y))
    lrt_diff <- 2 * (lrtX + lrtY - lrtZ)
    pchisq(q = lrt_diff, df = 3, lower.tail = F)
    

    我们就可以通过讨论lrtX + lrtY 与 lrtZ的关系,通过pchisq来计算p值,显著代表lrtX + lrtY >> lrtZ;不显著代表lrtX + lrtY ≈ lrtZ

    [3].roc

    roc计算p_val的方式巧用了二分类器来计算AUC值,通过AUC值定义出p_val

    data.use = data.use
    cells.1 = cells.1
    cells.2 = cells.2
    verbose = verbose
    
    data.use = data.use
    cells.1 = cells.1
    cells.2 = cells.2
    
    
    data1 = data.use[, cells.1, drop = FALSE]
    data2 = data.use[, cells.2, drop = FALSE]
    mygenes = rownames(x = data.use)
    print.bar = verbose
    
    # 以两类细胞某基因的表达量为特征值
    x = as.numeric(x = data1[1, ])
    y = as.numeric(x = data2[1, ])
    
    # 进行二分类的学习
    prediction.use <- prediction(
      ## 以两类细胞某基因的表达量为特征值
      predictions = c(x, y),
      ## 以两类细胞作为分类标签
      labels = c(rep(x = 1, length(x = x)), rep(x = 0, length(x = y))),
      label.ordering = 0:1
    )
    
    # 预测
    perf.use <- performance(prediction.obj = prediction.use, measure = "auc")
    # 计算AUC值
    auc.use <- round(x = perf.use@y.values[[1]], digits = 3)
    
    # AUC排序
    to.return = data.frame(myAUC = rev(x = order(toRet$myAUC)))
    # 计算p_val
    p_val = to.return$power <- abs(x = to.return$myAUC - 0.5) * 2
    
    [4].t-test

    还有一种计算p_val的方式是直接用t-test

    data.use = data.use
    cells.1 = cells$cells.1
    cells.2 = cells$cells.2
    
    # 两类细胞中某基因表达量的t-test
    t.test(x = data.use[1, cells.1], y = data.use[1, cells.2])$p.value
    
    [5].广义线性回归模型

    这里分为三种情况,一种是某基因A在两类细胞中的表达量满足负二项分布,利用负二项分布广义线性回归模型计算p_val;第二种是某基因A在两类细胞中的表达量满足泊松分布,利用泊松分布广义线性回归模型计算p_val;最后一种是零膨胀负二项分布的广义线性回归模型

    对于负二项分布和泊松分布的广义线性回归模型:

    #数据准备
    data.use = data.use
    cells.1 = cells$cells.1
    cells.2 = cells$cells.2
    verbose = verbose
    
    min.cells = 3
    latent.vars = NULL
    test.use = NULL
    
    # 定义分组信息
    group.info <- data.frame(
      group = rep(
        x = c('Group1', 'Group2'),
        times = c(length(x = cells.1), length(x = cells.2))
      )
    )
    # 分组信息因子化
    rownames(group.info) <- c(cells.1, cells.2)
    group.info[, "group"] <- factor(x = group.info[, "group"])
    
    # 定义两类细胞与分组之间的关系
    latent.vars <- if (is.null(x = latent.vars)) {
      group.info
    } else {
      cbind(x = group.info, latent.vars)
    }
    
    # 将某基因A的表达量加入到latent.vars中
    latent.var.names <- colnames(x = latent.vars)
    latent.vars[, "GENE"] <- as.numeric(x = data.use[1, ])
    
    # 定义广义线性回归模型方程,回归响应变量和决策变量为  "GENE ~ group"
    fmla <- as.formula(object = paste(
      "GENE ~",
      paste(latent.var.names, collapse = "+")
    ))
    p.estimate <- 2
    # 负二项分布
    if (test.use == "negbinom") {
        expr = p.estimate <- summary(
          object = glm.nb(formula = fmla, data = latent.vars)
        )$coef[2, 4]
    } 
    # 泊松分布
    else if (test.use == "poisson") {
      expr = summary(object = glm(
        formula = fmla,
        data = latent.vars,
        family = "poisson"
      ))$coef[2,4]
    }
    ## coef[2,4] 为模型的p_val
    

    其中:

    1. latent.vars 为:

      第一列代表两类细胞的barcodes;第二列为两类细胞的分组信息;第三列为某基因A在两类细胞中的表达量

    2. 广义线性模型回归的响应变量和决策变量为 "GENE ~ group"
    3. coef[2,4] 为模型的p_val

    对于零膨胀负二项分布的广义线性回归模型:
    首先单细胞的表达矩阵是一个稀疏矩阵,里面包含着大量的 0 值,因此零膨胀模型会区分零表达和有表达的情况,从而扣除零表达对数据拟合造成的影响,利用有表达的细胞进行数据拟合,再建立广义线性回归模型

    # 读出数据
    data.use = data.use
    cells.1 = cells$cells.1
    cells.2 = cells$cells.2
    verbose = verbose
    
    latent.vars = NULL
    verbose = TRUE
    
    
    group.info <- data.frame(row.names = c(cells.1, cells.2))
    latent.vars <- latent.vars %||% group.info
    # 对两类细胞分组,分为Group1和Group2
    group.info[cells.1, "group"] <- "Group1"
    group.info[cells.2, "group"] <- "Group2"
    group.info[, "group"] <- factor(x = group.info[, "group"])
    latent.vars.names <- c("condition", colnames(x = latent.vars))
    latent.vars <- cbind(latent.vars, group.info)
    latent.vars$wellKey <- rownames(x = latent.vars)
    fdat <- data.frame(rownames(x = data.use))
    colnames(x = fdat)[1] <- "primerid"
    rownames(x = fdat) <- fdat[, 1]
    
    # 创建数据矩阵,这里是一次性读入所有的基因
    sca <- MAST::FromMatrix(
      exprsArray = as.matrix(x = data.use),
      check_sanity = FALSE,
      cData = latent.vars,
      fData = fdat
    )
    
    # 定义回归因子
    cond <- factor(x = SummarizedExperiment::colData(sca)$group)
    cond <- relevel(x = cond, ref = "Group1")
    SummarizedExperiment::colData(sca)$condition <- cond
    
    # 建立响应变量和决策变量之间的关系
    fmla <- as.formula(
      object = paste0(" ~ ", paste(latent.vars.names, collapse = "+"))
    )
    # 建立模型
    zlmCond <- MAST::zlm(formula = fmla, sca = sca)
    summaryCond <- summary(object = zlmCond, doLRT = 'conditionGroup2')
    summaryDt <- summaryCond$datatable
    # 计算p_val
    p_val <- summaryDt[summaryDt[, "component"] == "H", 4]
    
    其中:对于summaryDt summaryDt

    第四列为p_val

    利用广义线性回归模型计算两组差异的p_val的原理可以参照:

    1. 因子变量回归模型
    2. 类别型决策变量的线性回归

    至于是哪一种分布类型,则根据数据的拟合分布情况来决定

    [6].DESeq2差异表达计算p_val

    还有一种方式是利用DESeq2两组比较的方式,计算每个基因的p_val

    #读入数据
    data.use = data.use
    cells.1 = cells$cells.1
    cells.2 = cells$cells.2
    
    # DESeq2的标准流程
    ## 细胞分组
    group.info <- data.frame(row.names = c(cells.1, cells.2))
    group.info[cells.1, "group"] <- "Group1"
    group.info[cells.2, "group"] <- "Group2"
    group.info[, "group"] <- factor(x = group.info[, "group"])
    group.info$wellKey <- rownames(x = group.info)
    ## 创建数据矩阵,这里是一次性读入所有的基因
    dds1 <- DESeq2::DESeqDataSetFromMatrix(
      countData = data.use,
      colData = group.info,
      design = ~ group
    )
    dds1 <- DESeq2::estimateSizeFactors(object = dds1)
    dds1 <- DESeq2::estimateDispersions(object = dds1, fitType = "local")
    dds1 <- DESeq2::nbinomWaldTest(object = dds1)
    res <- DESeq2::results(
      object = dds1,
      contrast = c("group", "Group1", "Group2"),
      alpha = 0.05
    )
    # 计算p_val
    to.return <- data.frame(p_val = res$pvalue, row.names = rownames(res))
    

    其实DESeq2的底层逻辑也是利用广义线性回归模型计算p_val的,故理解可参照 [5].广义线性回归模型

    [7].回归模型似然比检验

    这一部分的原理参见:DESeq2中的似然比检验(LRT)
    直接上代码:

    # 读取数据
    data.use = data.use
    cells.1 = cells$cells.1
    cells.2 = cells$cells.2
    verbose = verbose
    
    latent.vars = NULL
    verbose = TRUE
    
    # 设立分组信息
    group.info <- data.frame(row.names = c(cells.1, cells.2))
    group.info[cells.1, "group"] <- "Group1"
    group.info[cells.2, "group"] <- "Group2"
    group.info[, "group"] <- factor(x = group.info[, "group"])
    data.use <- data.use[, rownames(group.info), drop = FALSE]
    latent.vars <- latent.vars[rownames(group.info), , drop = FALSE]
    
    # 建立模型,以第一个基因在两类细胞中的表达量为例建模
    model.data <- cbind(GENE = data.use[1, ], group.info)
    # 以基因为决策变量,二元变量group为响应变量
    fmla <- as.formula(object = "group ~ GENE")
    # 对照模型
    fmla2 <- as.formula(object = "group ~ 1")
    
    # 以负二项分布建立glm
    model1 <- glm(formula = fmla, data = model.data, family = "binomial")
    model2 <- glm(formula = fmla2, data = model.data, family = "binomial")
    # model1和model2的似然比检验,计算p_val
    p_val <- lrtest <- lrtest(model1, model2)
    

    其中:

    1. 对于model.data来说 model.data

      第一列代表两类细胞的barcodes;第二列为某基因A在两类细胞中的表达量;第三列为两类细胞的分组信息

    2. 对于model1:
    # 以基因为决策变量,二元变量group为响应变量
    fmla <- as.formula(object = "group ~ GENE")
    model1 <- glm(formula = fmla, data = model.data, family = "binomial")
    
    model1
    横坐标为基因表达的情况(基因的表达数值),纵坐标为二元变量(Group1和Group2);如果两组细胞某基因A的表达相差很大,那么model1更 fit
    1. 对于model2:
    # 对照模型
    fmla2 <- as.formula(object = "group ~ 1")
    model2 <- glm(formula = fmla, data = model.data, family = "binomial")
    
    model2
    model2 不fit

    如果两组细胞某基因A的表达相差很大,那么model1更 fit,model1与model2的似然比远大于1;
    如果两组细胞某基因A的表达相差不大,那么model1和model2 都 不fit,model1与model2的似然比约等于1
    显然,根据实际的表达情况,model1比model2更 fit,所以以似然比检验的p_val为最终的p_val,更为显著

    相关文章

      网友评论

        本文标题:Seurat中FindMarker寻找两个cell type差异

        本文链接:https://www.haomeiwen.com/subject/fawytrtx.html