美文网首页单细胞测序
单细胞笔记14-Signac::GeneActivity()函数

单细胞笔记14-Signac::GeneActivity()函数

作者: 江湾青年 | 来源:发表于2022-07-07 22:18 被阅读0次

GeneActivity():计算Seurat对象中的基因活性

  • transcripts:GeneActivity最重要的作用是使用Seurat对象中的annotation生成transcripts,即拓展基因区域后的每个转录本,然后transcripts作为features作为FeatureMatrix()的输入。
  • frags:Seurat对象中存储的fragments对象
  • cells:所有细胞名
function (object, assay = NULL, features = NULL, extend.upstream = 2000, 
    extend.downstream = 0, biotypes = "protein_coding", max.width = 5e+05, 
    gene.id = FALSE, verbose = TRUE, ...) 
{
    if (!is.null(x = features)) {
        if (length(x = features) == 0) {
            stop("Empty list of features provided")
        }
    }
    assay <- SetIfNull(x = assay, y = DefaultAssay(object = object))
    if (!inherits(x = object[[assay]], what = "ChromatinAssay")) {
        stop("The requested assay is not a ChromatinAssay.")
    }
    annotation <- Annotation(object = object[[assay]])
    if (length(x = annotation) == 0) {
        stop("No gene annotations present in object")
    }
    if (verbose) {
        message("Extracting gene coordinates")
    }
    transcripts <- CollapseToLongestTranscript(ranges = annotation)
    if (gene.id) {
        transcripts$gene_name <- transcripts$gene_id
    }
    if (!is.null(x = biotypes)) {
        transcripts <- transcripts[transcripts$gene_biotype %in% 
            biotypes]
        if (length(x = transcripts) == 0) {
            stop("No genes remaining after filtering for requested biotypes")
        }
    }
    if (!is.null(x = features)) {
        transcripts <- transcripts[transcripts$gene_name %in% 
            features]
        if (length(x = transcripts) == 0) {
            stop("None of the requested genes were found in the gene annotation")
        }
    }
    if (!is.null(x = max.width)) {
        transcript.keep <- which(x = width(x = transcripts) < 
            max.width)
        transcripts <- transcripts[transcript.keep]
        if (length(x = transcripts) == 0) {
            stop("No genes remaining after filtering for max.width")
        }
    }
    transcripts <- Extend(x = transcripts, upstream = extend.upstream, 
        downstream = extend.downstream)
    frags <- Fragments(object = object[[assay]])
    if (length(x = frags) == 0) {
        stop("No fragment information found for requested assay")
    }
    cells <- colnames(x = object[[assay]])
###################################################################################################
    counts <- FeatureMatrix(fragments = frags, features = transcripts, 
        cells = cells, verbose = verbose, ...)
###################################################################################################
    transcripts$gene_name <- ifelse(test = is.na(x = transcripts$gene_name), 
        yes = transcripts$gene_id, no = transcripts$gene_name)
    gene.key <- transcripts$gene_name
    names(x = gene.key) <- GRangesToString(grange = transcripts)
    rownames(x = counts) <- as.vector(x = gene.key[rownames(x = counts)])
    counts <- counts[rownames(x = counts) != "", ]
    return(counts)
}

重要函数:FeatureMatrix()


FeatureMatrix(): 从多个fragments文件中提取特征矩阵

  • 这个函数的作用主要是对fragments中的每个frag并行化运行SingleFeatureMatrix()
function (fragments, features, cells = NULL, process_n = 2000, 
    sep = c("-", "-"), verbose = TRUE) 
{
    if (!inherits(x = features, what = "GRanges")) {
        if (inherits(x = features, what = "character")) {
            features <- StringToGRanges(regions = features, sep = sep)
        }
        else {
            stop("features should be a GRanges object")
        }
    }
    if (!inherits(x = fragments, what = "list")) {
        if (inherits(x = fragments, what = "Fragment")) {
            fragments <- list(fragments)
        }
        else {
            stop("fragments should be a list of Fragment objects")
        }
    }
    if (!is.null(x = cells)) {
        obj.use <- c()
        for (i in seq_along(along.with = fragments)) {
            if (is.null(x = Cells(fragments[[i]]))) {
                obj.use <- c(obj.use, i)
            }
            else if (any(cells %in% Cells(x = fragments[[i]]))) {
                obj.use <- c(obj.use, i)
            }
        }
    }
    else {
        obj.use <- seq_along(along.with = fragments)
    }
###################################################################################################
    mat.list <- sapply(X = obj.use, FUN = function(x) {
        SingleFeatureMatrix(fragment = fragments[[x]], features = features, 
            cells = cells, sep = sep, verbose = verbose, process_n = process_n)
    })
###################################################################################################
    if (length(x = mat.list) == 1) {
        return(mat.list[[1]])
    }
    else {
        featmat <- Reduce(f = RowMergeSparseMatrices, x = mat.list)
        return(featmat)
    }
}

重要函数:Signac:::SingleFeatureMatrix()


Signac:::SingleFeatureMatrix():获取单个fragment文件的特征矩阵

  • tbx:fragment文件的索引文件
  • feature.list:将feature(transcripts的GRanges对象)分成多个chunks的list对象,后续作为PartialMatrix()的regions参数进行并行化计算
function (fragment, features, cells = NULL, process_n = 2000, 
    sep = c("-", "-"), verbose = TRUE) 
{
    fragment.path <- GetFragmentData(object = fragment, slot = "path")
    if (!is.null(cells)) {
        frag.cells <- GetFragmentData(object = fragment, slot = "cells")
        if (is.null(x = frag.cells)) {
            names(x = cells) <- cells
        }
        else {
            cell.idx <- fmatch(x = names(x = frag.cells), table = cells, 
                nomatch = 0L) > 0
            cells <- frag.cells[cell.idx]
        }
    }
    tbx <- TabixFile(file = fragment.path)
    features <- keepSeqlevels(x = features, value = intersect(x = seqnames(x = features), 
        y = seqnamesTabix(file = tbx)), pruning.mode = "coarse")
    if (length(x = features) == 0) {
        stop("No matching chromosomes found in fragment file.")
    }
    feature.list <- ChunkGRanges(granges = features, nchunk = ceiling(x = length(x = features)/process_n))    
    # 将feature(transcripts的GRanges对象)分成多个chunks
    if (verbose) {
        message("Extracting reads overlapping genomic regions")
    }
    if (nbrOfWorkers() > 1) {
        matrix.parts <- future_lapply(X = feature.list, FUN = PartialMatrix, 
            tabix = tbx, cells = cells, sep = sep, future.globals = list(), 
            future.scheduling = FALSE)
    }
    else {
        mylapply <- ifelse(test = verbose, yes = pblapply, no = lapply)    # 有无progress bar的lapply,这种技巧可以学习
###################################################################################################
        matrix.parts <- mylapply(X = feature.list, FUN = PartialMatrix, 
            tabix = tbx, cells = cells, sep = sep)
###################################################################################################
    }
    null.parts <- sapply(X = matrix.parts, FUN = is.null)
    matrix.parts <- matrix.parts[!null.parts]
    if (is.null(x = cells)) {
        all.cells <- unique(x = unlist(x = lapply(X = matrix.parts, 
            FUN = colnames)))
        matrix.parts <- lapply(X = matrix.parts, FUN = AddMissingCells, 
            cells = all.cells)
    }
    featmat <- do.call(what = rbind, args = matrix.parts)    # do.call():todo
    if (!is.null(x = cells)) {
        cell.convert <- names(x = cells)
        names(x = cell.convert) <- cells
        colnames(x = featmat) <- unname(obj = cell.convert[colnames(x = featmat)])
    }
    feat.str <- GRangesToString(grange = features, sep = sep)
    featmat <- featmat[feat.str, , drop = FALSE]
    return(featmat)
}

重要函数:PartialMatrix()


Signac:::PartialMatrix():在fragment的tabix文件中搜索特征对应的细胞,构成counts矩阵

  • cells.in.regions:一个list,里面为每个feature(在这里为基因的坐标)所包含的细胞有哪些
    cells.in.regions
  • cells.in.regions后来又经过unlist()和uname()变为最终counts稀疏矩阵的
  • feature.vec:每个特征在多少个细胞中有,构成counts稀疏矩阵的
function (tabix, regions, sep = c("-", "-"), cells = NULL) 
{
    open(con = tabix)
###################################################################################################
    cells.in.regions <- GetCellsInRegion(tabix = tabix, region = regions, 
        cells = cells)
###################################################################################################
    close(con = tabix)
    gc(verbose = FALSE)
    nrep <- elementNROWS(x = cells.in.regions)    # 每个feature在多少个细胞中有
    if (all(nrep == 0) & !is.null(x = cells)) {
        featmat <- sparseMatrix(dims = c(length(x = regions), 
            length(x = cells)), i = NULL, j = NULL)
        rownames(x = featmat) <- GRangesToString(grange = regions)
        colnames(x = featmat) <- cells
        featmat <- as(object = featmat, Class = "dgCMatrix")
        return(featmat)
    }
    else if (all(nrep == 0)) {
        featmat <- sparseMatrix(dims = c(length(x = regions), 
            0), i = NULL, j = NULL)
        rownames(x = featmat) <- GRangesToString(grange = regions)
        featmat <- as(object = featmat, Class = "dgCMatrix")
        return(featmat)
    }
    else {
        if (is.null(x = cells)) {
            all.cells <- unique(x = unlist(x = cells.in.regions))
            cell.lookup <- seq_along(along.with = all.cells)
            names(x = cell.lookup) <- all.cells
        }
        else {
            cell.lookup <- seq_along(along.with = cells)
            names(cell.lookup) <- cells
        }
        cells.in.regions <- unlist(x = cells.in.regions)
        cells.in.regions <- unname(obj = cell.lookup[cells.in.regions])
        all.features <- GRangesToString(grange = regions, sep = sep)
        feature.vec <- rep(x = seq_along(along.with = all.features), 
            nrep)    # 稀疏矩阵的x
###################################################################################################
        featmat <- sparseMatrix(i = feature.vec, j = cells.in.regions, 
            x = rep(x = 1, length(x = cells.in.regions)))
###################################################################################################
        featmat <- as(Class = "dgCMatrix", object = featmat)
        rownames(x = featmat) <- all.features[1:max(feature.vec)]
        colnames(x = featmat) <- names(x = cell.lookup)[1:max(cells.in.regions)]
        if (!is.null(x = cells)) {
            featmat <- AddMissingCells(x = featmat, cells = cells)
        }
        missing.features <- all.features[!(all.features %in% 
            rownames(x = featmat))]
        if (length(x = missing.features) > 0) {
            null.mat <- sparseMatrix(i = c(), j = c(), dims = c(length(x = missing.features), 
                ncol(x = featmat)))
            rownames(x = null.mat) <- missing.features
            null.mat <- as(object = null.mat, Class = "dgCMatrix")
            featmat <- rbind(featmat, null.mat)
        }
        return(featmat)
    }
}

重要函数:GetCellsInRegion()


GetCellsInRegion():在fragment文件中搜索特定基因组区域对应的细胞

  • reads <- scanTabix(file = tabix, param = region)reads为列表,每个元素是一个基因,里面包含与这个基因有交集的所有细胞的fragments,格式如下:

    reads <- Rsamtools::scanTabix(file = tbx, param = feature.list[[1]])
  • reads <- lapply(X = reads, FUN = ExtractCell)reads为列表,每个元素是一个基因,里面包含与这个基因有交集的所有细胞,这些细胞有重复的

    reads <- lapply(X = reads, FUN = ExtractCell)
  • 最后reads再经过细胞的筛选,得到最终所提供region中所包含的细胞,这些细胞作为稀疏矩阵的列,重复的细胞会出现多次,在稀疏矩阵中的第三列counts均为1

function (tabix, region, cells = NULL) 
{
    if (!is(object = region, class2 = "GRanges")) {
        region <- StringToGRanges(regions = region)
    }
    reads <- scanTabix(file = tabix, param = region)    # region\t对应的细胞,字符串格式
    reads <- lapply(X = reads, FUN = ExtractCell)    # 将字符串用\t分割,提取细胞名
    if (!is.null(x = cells)) {
        reads <- sapply(X = reads, FUN = function(x) {
            x <- x[fmatch(x = x, table = cells, nomatch = 0L) > 
                0L]    # 只提取在所提供的细胞里面有的细胞(fragments文件里面存在很多最终peak counts矩阵里没有的细胞)
            if (length(x = x) == 0) {
                return(NULL)
            }
            else {
                return(x)
            }
        }, simplify = FALSE)
    }
    return(reads)
}


解读完毕,真不容易!!

最后附上每一步的测试代码:

###### GeneActivity() ######
transcripts <- Signac:::CollapseToLongestTranscript(ranges = Annotation(pbmc.ATAC))
transcripts[which(transcripts$gene_name == 'Pag1'),]
transcripts <- Extend(x = transcripts, upstream = 2000, downstream = 0)
transcripts[which(transcripts$gene_name == 'Pag1'),]
frags <- Fragments(pbmc.ATAC)
cells <- colnames(x = pbmc.ATAC)
# counts <- FeatureMatrix(fragments = frags, features = transcripts, cells = cells)


###### FeatureMatrix() ######
obj.use <- c()
for (i in seq_along(along.with = frags)) {
  if (is.null(x = Cells(frags[[i]]))) {
    obj.use <- c(obj.use, i)
  }
  else if (any(cells %in% Cells(x = frags[[i]]))) {
    obj.use <- c(obj.use, i)
  }
}
obj.use
mat.list <- sapply(X = 1, FUN = function(x) {
  Signac:::SingleFeatureMatrix(fragment = frags[[1]], features = transcripts, 
                      cells = cells, sep = c("-", "-"), process_n = 2000)
})


###### SingleFeatureMatrix() ######
tbx <- Rsamtools::TabixFile(file = frags[[1]]@path)
feature.list <- Signac:::ChunkGRanges(granges = transcripts, nchunk = ceiling(x = length(x = transcripts)/2000))


###### PartialMatrix() ######
open(con = tbx)
cells.in.regions <- GetCellsInRegion(tabix = tbx, region = feature.list[[1]], cells = cells)
close(con = tabix)
nrep <- S4Vectors::elementNROWS(x = cells.in.regions)
cell.lookup <- seq_along(along.with = cells)
names(cell.lookup) <- cells
cells.in.regions <- unlist(x = cells.in.regions)
cells.in.regions <- unname(obj = cell.lookup[cells.in.regions])
all.features <- GRangesToString(grange = feature.list[[1]], sep = c("-", "-"))
feature.vec <- rep(x = seq_along(along.with = all.features), nrep)
featmat <- Matrix::sparseMatrix(i = feature.vec, j = cells.in.regions, 
                                x = rep(x = 1, length(x = cells.in.regions)))


###### GetCellsInRegion() ######
reads <- Rsamtools::scanTabix(file = tbx, param = feature.list[[1]])
reads <- lapply(X = reads, FUN = Signac:::ExtractCell)
if (!is.null(x = cells)) {
  reads <- sapply(X = reads, FUN = function(x) {
    x <- x[fastmatch::fmatch(x = x, table = cells, nomatch = 0L) > 
             0L]
    if (length(x = x) == 0) {
      return(NULL)
    }
    else {
      return(x)
    }
  }, simplify = FALSE)
}

相关文章

网友评论

    本文标题:单细胞笔记14-Signac::GeneActivity()函数

    本文链接:https://www.haomeiwen.com/subject/iynpbrtx.html