美文网首页
pagoda1(scde包)+pagoda2

pagoda1(scde包)+pagoda2

作者: 一只烟酒僧 | 来源:发表于2020-11-03 16:24 被阅读0次

    最近在使用pagoda做单细胞的基因集分析,不得不说,算法是真的慢,,,

    pbmc3k数据集
    5个线程跑满
    用时为:
    knn.error.models
        user   system  elapsed
    6033.772   23.041 2105.214
    
    pagoda.varnorm
     user   system  elapsed
    3387.265   49.590  734.011
    
    pagoda.pathway.wPCA
     user    system   elapsed
     8713.815 12737.006  1091.431
    
    pagoda.gene.clusters
    user    system   elapsed
    11324.177  8925.026  1199.914
    
    大概用时:2105+734+1091+1199=5129  约为1.4h
    

    一个快速跑的示例文件

    library(scde)
    library(SeuratData)
    data("pbmc3k")
    #导入为count矩阵,同时要改为integer
    pbmc3k<-apply(as.matrix(pbmc3k@assays$RNA@counts),2,function(x){storage.mode(x)<-"integer";x})
    pbmc3k[1:6,1:66]
    #为了加快速度,我取其中的1/10的细胞数用于后续分析
    set.seed(2020)
    pbmc3k<-pbmc3k[,sample(1:ncol(pbmc3k),round(ncol(pbmc3k)/10),replace = F)]
    dim(pbmc3k)
    #去掉低质量的细胞和基因
    pbmc3k<-clean.counts(pbmc3k,min.lib.size = 200,min.reads = 10)
    dim(pbmc3k)
    #对每个细胞构建误差模型,
    knn <- knn.error.models(pbmc3k, 
                            #groups = ,可选
                            k = ncol(pbmc3k)/4, #猜测整个细胞群可以分为4个群
                            n.cores = 5, #用10个核
                            min.count.threshold = 2, #质控参数参数
                            min.nonfailed = 5, #质控参数
                            max.model.plots = 10)
    
    #Normalizing variance
    varinfo <- pagoda.varnorm(knn, #
                              counts = pbmc3k, 
                              trim = 3/ncol(pbmc3k), 
                              max.adj.var = 5, 
                              n.cores = 5, 
                              plot = TRUE)
    #Controlling for sequencing depth
    varinfo <- pagoda.subtract.aspect(varinfo, colSums(pbmc3k[, rownames(knn)]>0))
    
    #Evaluate overdispersion of pre-defined gene sets
    library(org.Hs.eg.db)
    library(msigdbr)
    C2<-msigdbr(category = "C2")
    C2_kegg<-C2[grep("KEGG",C2$gs_subcat),]
    C2_kegg<-split(C2_kegg$human_gene_symbol,C2_kegg$gs_name)
    C2_kegg
    C2_kegg<-list2env(C2_kegg)
    #对每个基因集计算加权的第一个主成分的量级
    pwpca <- pagoda.pathway.wPCA(varinfo, 
                                C2_kegg,
                                 n.components = 1, 
                                 n.cores = 5)
    #对上述结果做统计学检验
    df <- pagoda.top.aspects(pwpca, 
                             return.table = F,
                             plot = F, 
                             z.score = 2)
    dim(df)
    # #寻找denovo的基因集
    clpca <- pagoda.gene.clusters(varinfo, trim = 7.1/ncol(varinfo$mat), n.clusters = 50, n.cores = 10, plot = TRUE)
    df <- pagoda.top.aspects(pwpca, clpca, n.cells = NULL, z.score = qnorm(0.01/2, lower.tail = FALSE))
    #列先聚类,在做热图(很奇怪!!)
    hc <- pagoda.cluster.cells(df, varinfo)
    pagoda.view.aspects(df,cell.clustering = hc, box = TRUE, labCol = NA, margins = c(0.5, 20))
    
    pheatmap(pbmc3k[names(df$gw[order(df$gw,decreasing = T)][1:200]),hc$order],
             scale = "row",
             cluster_rows = F,breaks = seq(-5,5,10/100),cluster_cols = F)
    
    
    

    scde下的pagoda1实在是太慢了!!!!!
    无奈还是要使用pagoda2包进行pagoda分析
    但是pagoda2的对象结构有些特殊,属于对象中嵌套操作函数。而比较坑爹的是这种嵌套的操作函数却没有对应的帮助文档,,,,
    为此我去浏览了pagoda2工程文件的全部内容


    image.png

    当我们打开pagoda2/R路径时,进入存放所有脚本的地方


    image.png

    如果对一些说明文档中未提及的函数感兴趣,可以直接通过.R脚本查看函数的源代码!

        testPathwayOverdispersion=function(setenv, 
                                           type='counts', 
                                           max.pathway.size=1e3, 
                                           min.pathway.size=10, 
                                           n.randomizations=5, 
                                           verbose=FALSE, 
                                           score.alpha=0.05,
                                           plot=FALSE, 
                                           cells=NULL,
                                           adjusted.pvalues=TRUE,
                                           z.score = qnorm(0.05/2, lower.tail = FALSE), 
                                           use.oe.scale = FALSE, return.table=FALSE,
                                           name='pathwayPCA',
                                           correlation.distance.threshold=0.2,
                                           loading.distance.threshold=0.01,
                                           top.aspects=Inf,
                                           recalculate.pca=FALSE,
                                           save.pca=TRUE) {
          nPcs <- 1;
          
          if(type=='counts') {
            x <- counts;
            # apply scaling if using raw counts
            x@x <- x@x*rep(misc[['varinfo']][colnames(x),'gsf'],diff(x@p))
          } else {
            if(!type %in% names(reductions)) { stop("reduction ",type,' not found')}
            x <- reductions[[type]]
          }
          if(!is.null(cells)) {
            x <- x[cells,]
          }
          
          proper.gene.names <- colnames(x);
          
          if(is.null(misc[['pwpca']]) || recalculate.pca) {
            if(verbose) {
              message("determining valid pathways")
            }
            
            # determine valid pathways
            gsl <- ls(envir = setenv)
            gsl.ng <- unlist(mclapply(sn(gsl), function(go) sum(unique(get(go, envir = setenv)) %in% proper.gene.names),mc.cores=n.cores,mc.preschedule=TRUE))
            gsl <- gsl[gsl.ng >= min.pathway.size & gsl.ng<= max.pathway.size]
            names(gsl) <- gsl
            
            if(verbose) {
              message("processing ", length(gsl), " valid pathways")
            }
            
            cm <- Matrix::colMeans(x)
            
            pwpca <- papply(gsl, function(sn) {
              lab <- proper.gene.names %in% get(sn, envir = setenv)
              if(sum(lab)<1) { return(NULL) }
              pcs <- irlba(x[,lab], nv=nPcs, nu=0, center=cm[lab])
              pcs$d <- pcs$d/sqrt(nrow(x))
              pcs$rotation <- pcs$v;
              pcs$v <- NULL;
              
              # get standard deviations for the random samples
              ngenes <- sum(lab)
              z <- do.call(rbind,lapply(seq_len(n.randomizations), function(i) {
                si <- sample(ncol(x), ngenes)
                pcs <- irlba(x[,si], nv=nPcs, nu=0, center=cm[si])$d
              }))
              z <- z/sqrt(nrow(x));
              
              # local normalization of each component relative to sampled PC1 sd
              avar <- pmax(0, (pcs$d^2-mean(z[, 1]^2))/sd(z[, 1]^2))
              
              if(avar>0.5) {
                # flip orientations to roughly correspond with the means
                pcs$scores <- as.matrix(t(x[,lab] %*% pcs$rotation) - as.numeric((cm[lab] %*% pcs$rotation)))
                cs <- unlist(lapply(seq_len(nrow(pcs$scores)), function(i) sign(cor(pcs$scores[i,], colMeans(t(x[, lab, drop = FALSE])*abs(pcs$rotation[, i]))))))
                pcs$scores <- pcs$scores*cs
                pcs$rotation <- pcs$rotation*cs
                rownames(pcs$rotation) <- colnames(x)[lab];
              } # don't bother otherwise - it's not significant
              return(list(xp=pcs,z=z,n=ngenes))
            }, n.cores = n.cores,mc.preschedule=TRUE)
            if(save.pca) {
              misc[['pwpca']] <<- pwpca;
            }
          } else {
            if(verbose) {
              message("re-using previous overdispersion calculations")
              pwpca <- misc[['pwpca']];
            }
          }
          
          if(verbose) {
            message("scoring pathway overdispersion signifcance")
          }
          
          # score overdispersion
          true.n.cells <- nrow(x)
          
          pagoda.effective.cells <- function(pwpca, start = NULL) {
            n.genes <- unlist(lapply(pwpca, function(x) rep(x$n, nrow(x$z))))
            var <- unlist(lapply(pwpca, function(x) x$z[, 1]))
            if(is.null(start)) { start <- true.n.cells*2 } # start with a high value
            of <- function(p, v, sp) {
              sn <- p[1]
              vfit <- (sn+sp)^2/(sn*sn+1/2) -1.2065335745820*(sn+sp)*((1/sn + 1/sp)^(1/3))/(sn*sn+1/2)
              residuals <- (v-vfit)^2
              return(sum(residuals))
            }
            x <- nlminb(objective = of, start = c(start), v = var, sp = sqrt(n.genes-1/2), lower = c(1), upper = c(true.n.cells))
            return((x$par)^2+1/2)
          }
          n.cells <- pagoda.effective.cells(pwpca)
          
          vdf <- data.frame(do.call(rbind, lapply(seq_along(pwpca), function(i) {
            vars <- as.numeric((pwpca[[i]]$xp$d))
            cbind(i = i, var = vars, n = pwpca[[i]]$n, npc = seq(1:ncol(pwpca[[i]]$xp$rotation)))
          })))
          
          # fix p-to-q mistake in qWishartSpike
          qWishartSpikeFixed <- function (q, spike, ndf = NA, pdim = NA, var = 1, beta = 1, lower.tail = TRUE, log.p = FALSE)  {
            params <- RMTstat::WishartSpikePar(spike, ndf, pdim, var, beta)
            qnorm(q, mean = params$centering, sd = params$scaling, lower.tail, log.p)
          }
          
          # add right tail approximation to ptw, which gives up quite early
          pWishartMaxFixed <- function (q, ndf, pdim, var = 1, beta = 1, lower.tail = TRUE) {
            params <- RMTstat::WishartMaxPar(ndf, pdim, var, beta)
            q.tw <- (q - params$centering)/(params$scaling)
            p <- RMTstat::ptw(q.tw, beta, lower.tail, log.p = TRUE)
            p[p == -Inf] <- pgamma((2/3)*q.tw[p == -Inf]^(3/2), 2/3, lower.tail = FALSE, log.p = TRUE) + lgamma(2/3) + log((2/3)^(1/3))
            p
          }
          
          vshift <- 0
          ev <- 0
          
          vdf$var <- vdf$var-(vshift-ev)*vdf$n
          basevar <- 1
          vdf$exp <- RMTstat::qWishartMax(0.5, n.cells, vdf$n, var = basevar, lower.tail = FALSE)
          #vdf$z <- qnorm(pWishartMax(vdf$var, n.cells, vdf$n, log.p = TRUE, lower.tail = FALSE, var = basevar), lower.tail = FALSE, log.p = TRUE)
          vdf$z <- qnorm(pWishartMaxFixed(vdf$var, n.cells, vdf$n, lower.tail = FALSE, var = basevar), lower.tail = FALSE, log.p = TRUE)
          vdf$cz <- qnorm(bh.adjust(pnorm(as.numeric(vdf$z), lower.tail = FALSE, log.p = TRUE), log = TRUE), lower.tail = FALSE, log.p = TRUE)
          vdf$ub <- RMTstat::qWishartMax(score.alpha/2, n.cells, vdf$n, var = basevar, lower.tail = FALSE)
          vdf$ub.stringent <- RMTstat::qWishartMax(score.alpha/nrow(vdf)/2, n.cells, vdf$n, var = basevar, lower.tail = FALSE)
          
          if(plot) {
            par(mfrow = c(1, 1), mar = c(3.5, 3.5, 1.0, 1.0), mgp = c(2, 0.65, 0))
            un <- sort(unique(vdf$n))
            on <- order(vdf$n, decreasing = FALSE)
            pccol <- colorRampPalette(c("black", "grey70"), space = "Lab")(max(vdf$npc))
            plot(vdf$n, vdf$var/vdf$n, xlab = "gene set size", ylab = "PC1 var/n", ylim = c(0, max(vdf$var/vdf$n)), col = adjustcolor(pccol[vdf$npc],alpha=0.1),pch=19)
            lines(vdf$n[on], (vdf$exp/vdf$n)[on], col = 2, lty = 1)
            lines(vdf$n[on], (vdf$ub.stringent/vdf$n)[on], col = 2, lty = 2)
          }
          
          rs <- (vshift-ev)*vdf$n
          vdf$oe <- (vdf$var+rs)/(vdf$exp+rs)
          vdf$oec <- (vdf$var+rs)/(vdf$ub+rs)
          
          df <- data.frame(name = names(pwpca)[vdf$i], npc = vdf$npc, n = vdf$n, score = vdf$oe, z = vdf$z, adj.z = vdf$cz, stringsAsFactors = FALSE)
          if(adjusted.pvalues) {
            vdf$valid <- vdf$cz  >=  z.score
          } else {
            vdf$valid <- vdf$z  >=  z.score
          }
          
          if(!any(vdf$valid)) { stop("no significantly overdispersed pathways found at z.score threshold of ",z.score) };
          
          # apply additional filtering based on >0.5 sd above the local random estimate
          vdf$valid <- vdf$valid & unlist(lapply(pwpca,function(x) !is.null(x$xp$scores)))
          vdf$name <- names(pwpca)[vdf$i];
          
          if(return.table) {
            df <- df[vdf$valid, ]
            df <- df[order(df$score, decreasing = TRUE), ]
            return(df)
          }
          if(verbose) {
            message("compiling pathway reduction")
          }
          # calculate pathway reduction matrix
          
          # return scaled patterns
          xmv <- do.call(rbind, lapply(pwpca[vdf$valid], function(x) {
            xm <- x$xp$scores
          }))
          
          if(use.oe.scale) {
            xmv <- (xmv -rowMeans(xmv))* (as.numeric(vdf$oe[vdf$valid])/sqrt(apply(xmv, 1, var)))
            vdf$sd <- as.numeric(vdf$oe)
          } else {
            # chi-squared
            xmv <- (xmv-rowMeans(xmv)) * sqrt((qchisq(pnorm(vdf$z[vdf$valid], lower.tail = FALSE, log.p = TRUE), n.cells, lower.tail = FALSE, log.p = TRUE)/n.cells)/apply(xmv, 1, var))
            vdf$sd <- sqrt((qchisq(pnorm(vdf$z, lower.tail = FALSE, log.p = TRUE), n.cells, lower.tail = FALSE, log.p = TRUE)/n.cells))
            
          }
          rownames(xmv) <- paste("#PC", vdf$npc[vdf$valid], "# ", names(pwpca)[vdf$i[vdf$valid]], sep = "")
          rownames(vdf) <- paste("#PC", vdf$npc, "# ", vdf$name, sep = "")
          misc[['pathwayODInfo']] <<- vdf
          
          # collapse gene loading
          if(verbose) {
            message("clustering aspects based on gene loading ... ",appendLF=FALSE)
          }
          tam2 <- pagoda.reduce.loading.redundancy(list(xv=xmv,xvw=matrix(1,ncol=ncol(xmv),nrow=nrow(xmv))),pwpca,NULL,plot=FALSE,distance.threshold=loading.distance.threshold,n.cores=n.cores)
          if(verbose) {
            message(nrow(tam2$xv)," aspects remaining")
          }
          if(verbose) {
            message("clustering aspects based on pattern similarity ... ",appendLF=FALSE)
          }
          tam3 <- pagoda.reduce.redundancy(tam2,distance.threshold=correlation.distance.threshold,top=top.aspects)
          if(verbose) {
            message(nrow(tam3$xv)," aspects remaining\n")
          }
          tam2$xvw <- tam3$xvw <- NULL; # to save space
          tam3$env <- setenv;
          
          # clean up aspect names, as GO ids are meaningless
          names(tam3$cnam) <- rownames(tam3$xv) <- paste0('aspect',1:nrow(tam3$xv))
          
          misc[['pathwayOD']] <<- tam3;
          reductions[[name]] <<- tam3$xv;
          return(invisible(tam3))
        },
        
    
    
    #' @useDynLib pagoda2
    #' @import MASS
    #' @import Matrix
    #' @importFrom Rcpp evalCpp
    #' @import Rook
    #' @import igraph
    #' @import sccore
    #' @importFrom irlba irlba
    #' @importFrom parallel mclapply
    #' @importFrom Rcpp sourceCpp
    #' @importFrom magrittr %>%
    NULL
    
    #' A Reference Class, which holds and process single cell RNA-seq data.
    #'
    #' @field counts gene count matrix, normalized on total counts
    #' @field clusters results of clustering
    #' @field graphs graph representations of the dataset
    #' @field reductions results of reductions, i.e. PCA
    #' @field embeddings results of visualization algorithms, i.e. t-SNE or largeVis
    #' @field diffgenes lists of differentially expressed genes
    #' @field pathways pathway information
    #' @field n.cores number of cores, used for the analyses
    #' @field misc a list of miscelaneous structures
    #' @field batch batch factor for the dataset
    #' @field modelType plain (default) or raw (expression matrix taken as is without normalization, though log.scale still applies)
    #' @field verbose verbosity level
    #' @field depth cell size factor
    #' @field genegraphs a slot to store graphical representations in gene space (i.e. gene kNN graphs)
    #' @export Pagoda2
    #' @exportClass Pagoda2
    Pagoda2 <- setRefClass(
      "Pagoda2",
      
      fields=c('counts','clusters','graphs','reductions','embeddings','diffgenes',
               'pathways','n.cores','misc','batch','modelType','verbose','depth','batchNorm','mat','genegraphs'),
      
      methods = list(
        initialize=function(x, ..., modelType='plain', batchNorm='glm',
                            n.cores=parallel::detectCores(logical=FALSE), verbose=TRUE,
                            min.cells.per.gene=0, trim=round(min.cells.per.gene/2), min.transcripts.per.cell=10,
                            lib.sizes=NULL, log.scale=TRUE, keep.genes = NULL) {
          # # init all the output lists
          embeddings <<- list();
          graphs <<- list();
          diffgenes <<- list();
          reductions <<-list();
          clusters <<- list();
          pathways <<- list();
          genegraphs <<- list();
          misc <<-list(lib.sizes=lib.sizes, log.scale=log.scale, model.type=modelType, trim=trim);
          batch <<- NULL;
          counts <<- NULL;
          
          
          if(!missing(x) && ('Pagoda2' %in% class(x))) { # copy constructor
            callSuper(x, ..., modelType=modelType, batchNorm=batchNorm, n.cores=n.cores);
          } else {
            callSuper(..., modelType=modelType, batchNorm=batchNorm, n.cores=n.cores,verbose=verbose);
            if(!missing(x) && is.null(counts)) { # interpret x as a countMatrix
              if ('matrix' %in% class(x)) {
                x <- as(Matrix(x, sparse=TRUE), "dgCMatrix")
              }
              if(!('dgCMatrix' %in% class(x))) {
                stop("x is not of class dgCMatrix or matrix");
              }
              #if(any(x@x < 0)) {
              #  stop("x contains negative values");
              #}
              setCountMatrix(x, min.cells.per.gene=min.cells.per.gene, trim=trim, 
                             min.transcripts.per.cell=min.transcripts.per.cell, lib.sizes=lib.sizes,
                             log.scale=log.scale, keep.genes=keep.genes)
            }
          }
        },
        
        # provide the initial count matrix, and estimate deviance residual matrix (correcting for depth and batch)
        setCountMatrix=function(countMatrix, depthScale=1e3, min.cells.per.gene=30,
                                trim=round(min.cells.per.gene/2), min.transcripts.per.cell=10, 
                                lib.sizes=NULL, log.scale=FALSE, keep.genes = NULL) {
          # check names
          if(any(duplicated(rownames(countMatrix)))) {
            stop("duplicate gene names are not allowed - please reduce")
          }
          if(any(duplicated(colnames(countMatrix)))) {
            stop("duplicate cell names are not allowed - please reduce")
          }
          
          if(any(is.na(rownames(countMatrix)))) {
            stop("NA gene names are not allowed - please fix")
          }
          if(any(is.na(colnames(countMatrix)))) {
            stop("NA cell names are not allowed - please fix")
          }
          
          if(ncol(countMatrix)<3) { stop("too few cells remaining after min.count.per.cell filter applied - have you pre-filtered the count matrix to include only cells of a realistic size?") }
          
          counts <<- t(countMatrix)
          
          # Keep genes of sufficient coverage or genes that are in the keep.genes list
          counts <<- counts[,diff(counts@p) >= min.cells.per.gene | colnames(counts) %in% keep.genes]
          
          # Save the filtered count matrix in misc$rawCounts
          misc[['rawCounts']] <<- counts;
          misc$depthScale <<- depthScale;
          
          if (modelType == 'raw') {
            return()
          }
          
          if(!is.null(batch)) {
            if(!all(colnames(countMatrix) %in% names(batch))) { 
              stop("the supplied batch vector doesn't contain all the cells in its names attribute")
            }
            colBatch <- as.factor(batch[colnames(countMatrix)])
            batch <<- colBatch;
          }
          
          if(!is.null(lib.sizes)) {
            if(!all(colnames(countMatrix) %in% names(lib.sizes))) { 
              stop("the supplied lib.sizes vector doesn't contain all the cells in its names attribute")
            }
            lib.sizes <- lib.sizes[colnames(countMatrix)]
            depth <<- lib.sizes/mean(lib.sizes)*mean(Matrix::colSums(countMatrix))
          } else {
            depth <<- Matrix::colSums(countMatrix);
          }
          
          cell.filt.mask <- (depth >= min.transcripts.per.cell)
          counts <<- counts[cell.filt.mask,]
          depth <<- depth[cell.filt.mask]
          
          if (any(depth == 0)) {
            stop("Cells with zero expression over all genes are not allowed")
          }
          
          if(verbose) message(nrow(counts)," cells, ",ncol(counts)," genes; normalizing ... ")
          
          # get normalized matrix
          if(modelType=='linearObs') { # this shoudln't work well, since the depth dependency is not completely normalized out
            
            # winsorize in normalized space first in hopes of getting a more stable depth estimate
            if(trim>0) {
              counts <<- counts / as.numeric(depth);
              inplaceWinsorizeSparseCols(counts,trim,n.cores);
              counts <<- counts*as.numeric(depth);
              if(is.null(lib.sizes)) {
                depth <<- round(Matrix::rowSums(counts))
              }
            }
            
            ldepth <- log(depth);
            
            # rank cells, cut into n pieces
            n.depth.slices <- 20;
            #depth.fac <- as.factor(floor(rank(depth)/(length(depth)+1)*n.depth.slices)+1); names(depth.fac) <- rownames(counts);
            depth.fac <- cut(cumsum(sort(depth)),breaks=seq(0,sum(depth),length.out=n.depth.slices)); names(depth.fac) <- rownames(counts);
            depth.fac <- depth.fac[rank(depth)]
            # dataset-wide gene average
            gene.av <- (Matrix::colSums(counts)+n.depth.slices)/(sum(depth)+n.depth.slices)
            
            # pooled counts, df for all genes
            tc <- colSumByFac(counts,as.integer(depth.fac))[-1,,drop=FALSE]
            tc <- log(tc+1)- log(as.numeric(tapply(depth,depth.fac,sum))+1)
            md <- log(as.numeric(tapply(depth,depth.fac,mean)))
            # combined lm
            cm <- lm(tc ~ md)
            colnames(cm$coef) <- colnames(counts)
            # adjust counts
            # predict log(p) for each non-0 entry
            count.gene <- rep(1:counts@Dim[2],diff(counts@p))
            exp.x <- exp(log(gene.av)[count.gene] - cm$coef[1,count.gene] - ldepth[counts@i+1]*cm$coef[2,count.gene])
            counts@x <<- as.numeric(counts@x*exp.x/(depth[counts@i+1]/depthScale)); # normalize by depth as well
            # performa another round of trim
            if(trim>0) {
              inplaceWinsorizeSparseCols(counts,trim,n.cores);
            }
            
            
            # regress out on non-0 observations of ecah gene
            #non0LogColLmS(counts,mx,ldepth)
          } else if(modelType=='plain') {
            if(verbose) message("using plain model ")
            
            if(!is.null(batch)) {
              if(verbose) message("batch ... ")
              
              # dataset-wide gene average
              gene.av <- (Matrix::colSums(counts)+length(levels(batch)))/(sum(depth)+length(levels(batch)))
              
              # pooled counts, df for all genes
              tc <- colSumByFac(counts,as.integer(batch))[-1,,drop=FALSE]
              tc <- t(log(tc+1)- log(as.numeric(tapply(depth,batch,sum))+1))
              bc <- exp(tc-log(gene.av))
              
              # adjust every non-0 entry
              count.gene <- rep(1:counts@Dim[2],diff(counts@p))
              
              counts@x <<- as.numeric(counts@x/bc[cbind(count.gene,as.integer(batch)[counts@i+1])])
            }
            
            if(trim>0) {
              if(verbose) message("winsorizing ... ")
              counts <<- counts/as.numeric(depth);
              
              inplaceWinsorizeSparseCols(counts,trim,n.cores);
              counts <<- counts*as.numeric(depth);
              
              if(is.null(lib.sizes)) {
                depth <<- round(Matrix::rowSums(counts))
              }
            }
            
            counts <<- counts/as.numeric(depth/depthScale);
          } else {
            stop('modelType ',modelType,' is not implemented');
          }
          if(log.scale) {
            if(verbose) message("log scale ... ")
            counts@x <<- as.numeric(log(counts@x+1))
          }
          misc[['rescaled.mat']] <<- NULL;
          if(verbose) message("done.\n")
        },
        
        # adjust variance of the residual matrix, determine overdispersed sites
        adjustVariance=function(gam.k=5, alpha=5e-2, plot=FALSE, use.raw.variance=(.self$modelType=='raw'), use.unadjusted.pvals=FALSE, do.par=TRUE, max.adjusted.variance=1e3, min.adjusted.variance=1e-3, cells=NULL, verbose=TRUE, min.gene.cells=0, persist=is.null(cells), n.cores = .self$n.cores) {
          #persist <- is.null(cells) # persist results only if variance normalization is performed for all cells (not a subset)
          if(!is.null(cells)) { # translate cells into a rowSel boolean vector
            if(!(is.logical(cells) && length(cells)==nrow(counts))) {
              if(is.character(cells) || is.integer(cells)) {
                rowSel <- rep(FALSE,nrow(counts)); names(rowSel) <- rownames(counts); rowSel[cells] <- TRUE;
              } else {
                stop("cells argument must be either a logical vector over rows of the count matrix (cells), a vector of cell names or cell integer ids (row numbers)");
              }
            }
          } else {
            rowSel <- NULL;
          }
          
          if(verbose) message("calculating variance fit ...")
          df <- colMeanVarS(counts,rowSel,n.cores);
          
          if(use.raw.variance) { # use raw variance estimates without relative adjustments
            rownames(df) <- colnames(counts);
            vi <- which(is.finite(df$v) & df$nobs>=min.gene.cells);
            df$lp <- df$lpa <- log(df$v);
            df$qv <- df$v
            df$gsf <- 1; # no rescaling of variance
            ods <- order(df$v,decreasing=TRUE); if(length(ods)>1e3) { ods <- ods[1:1e3] }
            if(persist) misc[['odgenes']] <<- rownames(df)[ods];
          } else {
            # gene-relative normalizaton 
            df$m <- log(df$m); df$v <- log(df$v);
            rownames(df) <- colnames(counts);
            vi <- which(is.finite(df$v) & df$nobs>=min.gene.cells);
            if(length(vi)<gam.k*1.5) { gam.k=1 };# too few genes
            if(gam.k<2) {
              if(verbose) message(" using lm ")
              m <- lm(v ~ m, data = df[vi,])
            } else {
              if(verbose) message(" using gam ")
              m <- mgcv::gam(as.formula(paste0('v ~ s(m, k = ',gam.k,')')), data = df[vi,])
            }
            df$res <- -Inf;  df$res[vi] <- resid(m,type='response')
            n.obs <- df$nobs; #diff(counts@p)
            suppressWarnings(df$lp <- as.numeric(pf(exp(df$res),n.obs,n.obs,lower.tail=FALSE,log.p=TRUE)))
            df$lpa <- bh.adjust(df$lp,log=TRUE)
            n.cells <- nrow(counts)
            df$qv <- as.numeric(qchisq(df$lp, n.cells-1, lower.tail = FALSE, log.p=TRUE)/n.cells)
            
            if(use.unadjusted.pvals) {
              ods <- which(df$lp<log(alpha))
            } else {
              ods <- which(df$lpa<log(alpha))
            }
            
            if(persist) misc[['odgenes']] <<- rownames(df)[ods];
            if(verbose) message(length(ods),' overdispersed genes ... ',length(ods) )
            
            df$gsf <- geneScaleFactors <- sqrt(pmax(min.adjusted.variance,pmin(max.adjusted.variance,df$qv))/exp(df$v));
            df$gsf[!is.finite(df$gsf)] <- 0;
          }
          
          if(persist) {
            if(verbose) message('persisting ... ')
            misc[['varinfo']] <<- df;
          }
          
          # rescale mat variance
          ## if(rescale.mat) {
          ##   if(verbose) cat("rescaling signal matrix ... ")
          ##   #df$gsf <- geneScaleFactors <- sqrt(1/exp(df$v));
          ##   inplaceColMult(counts,geneScaleFactors,rowSel);  # normalize variance of each gene
          ##   #inplaceColMult(counts,rep(1/mean(Matrix::colSums(counts)),ncol(counts))); # normalize the column sums to be around 1
          ##   if(persist) misc[['rescaled.mat']] <<- geneScaleFactors;
          ## }
          if(plot) {
            if(do.par) {
              par(mfrow=c(1,2), mar = c(3.5,3.5,2.0,0.5), mgp = c(2,0.65,0), cex = 1.0);
            }
            ## convert to log10 from natural log() by factor log10(exp(1))
            smoothScatter(log10(exp(1))*df$m,log10(exp(1))*df$v,main='',xlab='log10[ magnitude ]',ylab='log10[ variance ]')
            grid <- seq(min(log10(exp(1))*df$m[vi]),max(log10(exp(1))*df$m[vi]),length.out=1000)
            lines(grid,predict(m,newdata=data.frame(m=grid)),col="blue")
            ## NOTE: overdispersed genes ods calcuated with log()-based corrections...
            if(length(ods)>0) {
              points(log10(exp(1))*df$m[ods],log10(exp(1))*df$v[ods],pch='.',col=2,cex=1)
            }
            smoothScatter(log10(exp(1))*df$m[vi],log10(exp(1))*df$qv[vi],xlab='log10[ magnitude ]',ylab='',main='adjusted')
            abline(h=1,lty=2,col=8)
            if(is.finite(max.adjusted.variance)) { abline(h=max.adjusted.variance,lty=2,col=1) }
            points(log10(exp(1))*df$m[ods],log10(exp(1))*df$qv[ods],col=2,pch='.')
          }
          if(verbose) message("done.\n")
          return(invisible(df));
        },
        # make a Knn graph
        # note: for reproducibility, set.seed() and set n.cores=1
        makeKnnGraph=function(k=30,nrand=1e3,type='counts',weight.type='1m',odgenes=NULL,n.cores=.self$n.cores,distance='cosine',center=TRUE,x=NULL,verbose=TRUE,p=NULL, var.scale=(type == "counts")) {
          if(is.null(x)) {
            x.was.given <- FALSE;
            if(type=='counts') {
              x <- counts;
              # Scale Raw counts
            } else {
              if(type %in% names(reductions)) {
                x <- reductions[[type]];
              } else {
                stop('Specified reduction does not exist');
              }
            }
            
            if (var.scale) {
              x@x <- x@x*rep(misc[['varinfo']][colnames(x),'gsf'],diff(x@p))
            }
            
            if(!is.null(odgenes)) {
              if(!all(odgenes %in% rownames(x))) { warning("not all of the provided odgenes are present in the selected matrix")}
              if(verbose) message("using provided odgenes ... ")
              x <- x[,odgenes]
            }
            
          } else { # is.null(x)
            x.was.given <- TRUE;
          }
          
          if(distance %in% c('cosine','angular')) {
            if(center) {
              x<- x - Matrix::rowMeans(x) # centering for consine distance
            }
            xn <- n2Knn(as.matrix(x), k, nThreads=n.cores, verbose=verbose, indexType='angular')
          } else if(distance %in% c('L2','euclidean')) {
            xn <- n2Knn(as.matrix(x), k, nThreads=n.cores, verbose=verbose, indexType='L2')
          } else {
            stop("unknown distance measure specified. Currently supported: angular, L2")
          }
          colnames(xn) <- rownames(xn) <- rownames(x);
          
          #if(weight.type=='rank') {
          #  xn$r <-  unlist(lapply(diff(c(0,which(diff(xn$s)>0),nrow(xn))),function(x) seq(x,1)))
          #}
          #xn <- xn[!xn$s==xn$e,]
          diag(xn) <- 0;
          xn <- Matrix::drop0(xn);
          
          #if(n.cores==1) { # for reproducibility, sort by node names
          #  if(verbose) cat("ordering neighbors for reproducibility ... ");
          #  xn <- xn[order(xn$s+xn$e),]
          #  if(verbose) cat("done\n");
          #}
          #df <- data.frame(from=rownames(x)[xn$s+1],to=rownames(x)[xn$e+1],weight=xn$d,stringsAsFactors=F)
          #if(weight.type=='rank') { df$rank <- xn$r }
          
          if(weight.type %in% c("cauchy","normal") && ncol(x)>sqrt(nrand)) {
            # generate some random pair data for scaling
            if(distance=='cosine') {
              #rd <- na.omit(apply(cbind(sample(colnames(x),nrand,replace=T),sample(colnames(x),nrand,replace=T)),1,function(z) if(z[1]==z[2]) {return(NA); } else {1-cor(x[,z[1]],x[,z[2]])}))
              rd <- na.omit(apply(cbind(sample(colnames(x),nrand,replace=TRUE),sample(colnames(x),nrand,replace=TRUE)),1,function(z) if(z[1]==z[2]) {return(NA); } else {1-sum(x[,z[1]]*x[,z[2]])/sqrt(sum(x[,z[1]]^2)*sum(x[,z[2]]^2))}))
            } else if(distance=='JS') {
              rd <- na.omit(apply(cbind(sample(colnames(x),nrand,replace=TRUE),sample(colnames(x),nrand,replace=TRUE)),1,function(z) if(z[1]==z[2]) {return(NA); } else {jw.disR(x[,z[1]],x[,z[2]])}))
            } else if(distance=='L2') {
              rd <- na.omit(apply(cbind(sample(colnames(x),nrand,replace=TRUE),sample(colnames(x),nrand,replace=TRUE)),1,function(z) if(z[1]==z[2]) {return(NA); } else {sqrt(sum((x[,z[1]]-x[,z[2]])^2))}))
            } else if(distance=='L1') {
              rd <- na.omit(apply(cbind(sample(colnames(x),nrand,replace=TRUE),sample(colnames(x),nrand,replace=TRUE)),1,function(z) if(z[1]==z[2]) {return(NA); } else {sum(abs(x[,z[1]]-x[,z[2]]))}))
            }
            suppressWarnings(rd.model <- MASS::fitdistr(rd,weight.type))
            if(weight.type=='cauchy') {
              xn@x <- 1/pcauchy(xn@x,location=rd.model$estimate['location'],scale=rd.model$estimate['scale'])-1
            } else {
              xn@x <- 1/pnorm(xn@x,mean=rd.model$estimate['mean'],sd=rd.model$estimate['sd'])-1
            }
          }
          xn@x <- pmax(0,xn@x);
          if(weight.type=='constant') { xn@x <- 1}
          if(weight.type=='1m') { xn@x <- pmax(0,1-xn@x) }
          #if(weight.type=='rank') { xn@x <- sqrt(df$rank) };
          # make a weighted edge matrix for the largeVis as well
          sxn <- (xn+t(xn))/2;
          g <- igraph::graph_from_adjacency_matrix(sxn,mode='undirected',weighted=TRUE)
          if(!x.was.given) {
            if(is.null(misc[['edgeMat']])) { misc[['edgeMat']] <<- list() }
            misc[['edgeMat']][[type]] <<- xn;
            graphs[[type]] <<- g;
          }
          return(invisible(g))
        },
        # calculate clusters based on the kNN graph
        getKnnClusters=function(type='counts',method=igraph::multilevel.community, name='community', subsampling.rate=0.8, n.subsamplings=10, cluster.stability.threshold=0.95, n.cores=.self$n.cores, g=NULL, min.cluster.size=1, persist=TRUE, return.details=FALSE, ...) {
          
          if(is.null(g)) {
            if(is.null(graphs[[type]])) { stop("call makeKnnGraph(type='",type,"', ...) first")}
            g <- graphs[[type]];
          }
          
          if(is.null(method)) {
            if(length(vcount(g))<2000) {
              method <- igraph::infomap.community;
            } else {
              method <- igraph::multilevel.community;
            }
          }
          
          # method <- igraph::multilevel.community; n.cores <- 20
          # n.subsamplings <- 10; cluster.stability.dilution <- 1.5; cluster.stability.fraction <- 0.9; subsampling.rate <- 0.8; metaclustering.method<- 'ward.D'
          # g <- r$graphs$PCA
          # x <- r$counts
          # cls <- method(g)
          
          #library(parallel)
          #x <- mclapply(1:5,function(z) method(g),mc.cores=20)
          
          cls <- method(g,...)
          cls.groups <- as.factor(membership(cls));
          
          # cleanup the clusters to remove very small ones
          if(min.cluster.size>1) {
            cn <- names(cls.groups);
            vg <- which(unlist(tapply(cls.groups,cls.groups,length))>=min.cluster.size);
            cls.groups <- as.integer(cls.groups); cls.groups[!cls.groups %in% vg] <- NA;
            cls.groups <- as.factor(cls.groups);
            names(cls.groups) <- cn;
          }
          
          if(persist) {
            clusters[[type]][[name]] <<- cls.groups;
            misc[['community']][[type]][[name]] <<- cls;
          }
          return(invisible(cls))
        },
        
        geneKnnbyPCA = function() {
          warning('geneKnnbyPCA is deprecated use makeGeneKnnGraph() instead');
          .self$makeGeneKnnGraph();
        },
        
        # take a given clustering and generate a hierarchical clustering
        # TODO: add support for innerOrder
        getHierarchicalDiffExpressionAspects = function(type='counts',groups=NULL,clusterName=NULL,method='ward.D', dist='pearson', persist=TRUE, z.threshold=2, n.cores=.self$n.cores, min.set.size=5 ) {
          if(type=='counts') {
            x <- counts;
          } else {
            x <- reductions[[type]];
          }
          if(is.null(groups)) {
            # retrieve clustering
            if(is.null(clusters)) stop("please generate clusters first")
            if(is.null(clusters[[type]])) stop(paste("please generate clusters for",type,"first"))
            if(is.null(clusterName)) { # use the first clustering
              if(length(clusters[[type]])<1) stop(paste("please generate clusters for",type,"first"))
              cl <- clusters[[type]][[1]];
            } else {
              cl <- clusters[[type]][[clusterName]]
              if(is.null(cl)) stop(paste("unable to find clustering",clusterName,'for',type))
              if(verbose) message("using ",clusterName," clustering for ",type," space\n")
            }
          } else {
            if(!all(rownames(x) %in% names(groups))) { warning("provided cluster vector doesn't list groups for all of the cells")}
            cl <- groups;
          }
          cl <- as.factor(cl[match(rownames(x),names(cl))]);
          
          if(dist %in% c('pearson','spearman')) {
            rx <- do.call(cbind,tapply(1:nrow(x),cl,function(ii) {
              if (length(ii) > 1) {
                Matrix::colMeans(x[ii,])
              } else {
                x[ii,]
              }
            }))
            d <- as.dist(1-cor(rx,method=dist))
          } else if(dist=='euclidean') {
            rx <- do.call(rbind,tapply(1:nrow(x),cl,function(ii) {
              if (legnth(ii) > 1) {
                Matrix::colMeans(x[ii,])
              } else {
                x[ii,]
              }
            }))
            d <- dist(rx)
          } else if(dist=='JS') {
            # this one has to be done on counts, even if we're working with a reduction
            cl <- as.factor(cl[match(rownames(misc[['rawCounts']]),names(cl))]);
            lvec <- colSumByFac(misc[['rawCounts']],as.integer(cl))[-1,] + 1
            d <- jsDist(t(lvec/pmax(1,Matrix::rowSums(lvec))));
            colnames(d) <- rownames(d) <- which(table(cl)>0)
            d <- as.dist(d)
          } else {
            stop("unknown distance",dist,"requested")
          }
          
          dd <- as.dendrogram(hclust(d,method=method))
          
          # walk down the dendrogram to generate diff. expression on every split
          diffcontrasts <- function(l,env) {
            v <- mget("contrasts",envir=env,ifnotfound=0)[[1]]
            if(!is.list(v)) v <- list();
            if(is.leaf(l)) return(NULL)
            lcl <- rep(NA,nrow(x));
            names(lcl) <- rownames(x)
            lcl[names(lcl) %in% names(cl)[cl %in% unlist(l[[1]])]] <- paste(unlist(l[[1]]),collapse='.');
            lcl[names(lcl) %in% names(cl)[cl %in% unlist(l[[2]])]] <- paste(unlist(l[[2]]),collapse='.');
            v <- c(v,list(as.factor(lcl)))
            assign("contrasts",v,envir=env)
            return(1);
          }
          
          de <- environment()
          assign('contrasts',NULL,envir=de)
          dc <- dendrapply(dd,diffcontrasts,env=de)
          dc <- get("contrasts",env=de)
          names(dc) <- unlist(lapply(dc,function(x) paste(levels(x),collapse=".vs.")))
          
          #dexp <- papply(dc,function(x) getDifferentialGenes(groups=x,z.threshold=z.threshold),n.cores=n.cores)
          
          x <- counts;
          x@x <- x@x*rep(misc[['varinfo']][colnames(x),'gsf'],diff(x@p)); # apply variance scaling
          x <- t(x)
          dexp <- papply(dc,function(g) {
            dg <- getDifferentialGenes(groups=g,z.threshold=z.threshold)
            dg <- lapply(dg,function(x) x[x$Z>=z.threshold,])
            # calculate average profiles
            x <- x[rownames(x) %in% unlist(lapply(dg,rownames)),]
            if(nrow(x)<1) return(NULL);
            x <- x-rowMeans(x[,!is.na(g)])
            sf <- rep(1,nrow(x)); names(sf) <- rownames(x);
            if(nrow(dg[[1]])>0) {
              ig <- which(names(sf) %in% rownames(dg[[1]]))
              sf[ig] <- 1/length(ig)
            }
            if(nrow(dg[[2]])>0) {
              ig <- which(names(sf) %in% rownames(dg[[2]]))
              sf[ig] <- -1/length(ig)
            }
            sf <- sf*sqrt(length(sf))
            pt <- colSums(x*sf)
            pt[is.na(g)] <- 0;
            return(list(dg=dg,pt=pt))
          },n.cores=n.cores)
          
          
          dexp <- dexp[!unlist(lapply(dexp,is.null))]; # remove cases where nothing was reported
          dexp <- dexp[!unlist(lapply(dexp,function(x){class(x) == 'try-error'}))] ## remove cases that failed
          
          # fake pathwayOD output
          tamr <- list(xv=do.call(rbind,lapply(dexp,function(x) x$pt)),
                       cnam=lapply(sn(names(dexp)),function(n) c(n)))
          
          dgl <- lapply(dexp,function(d) as.character(unlist(lapply(d$dg,function(x) rownames(x)[x$Z>=z.threshold]))))
          tamr$env <- list2env(dgl[unlist(lapply(dgl,length))>=min.set.size])
          
          misc[['pathwayOD']] <<- tamr;
          
          # fake pathwayODInfo
          zs <- unlist(lapply(dexp,function(x) max(unlist(lapply(x$dg,function(y) y$Z)))))
          mval <- unlist(lapply(dexp,function(x) max(unlist(lapply(x$dg,function(y) y$M)))))
          vdf <- data.frame(i=1:nrow(tamr$xv),npc=1,valid=TRUE,sd=apply(tamr$xv,1,sd),cz=zs,z=zs,oe=mval,n=unlist(lapply(dexp,function(x) sum(unlist(lapply(x$dg,nrow))))))
          vdf$name <- rownames(vdf) <- names(dexp);
          misc[['pathwayODInfo']] <<- vdf
          
          return(invisible(tamr))
        },
        
        # Calculates gene Knn network for gene similarity
        # Author: Simon Steiger
        makeGeneKnnGraph = function(nPcs = 100, scale =TRUE , center=TRUE, fastpath =TRUE, maxit =1000, k = 30, n.cores = .self$n.cores, verbose =TRUE) {
          # Transpose first
          x <- t(counts);
          
          # TODO: factor out gene PCA calculation
          # Do the PCA
          nPcs <- min(nrow(x)-1,ncol(x)-1,nPcs);
          if (center) {
            cm <- Matrix::colMeans(x);
            pcs <- irlba(x, nv=nPcs, nu =0, center=cm, right_only = FALSE, fastpath = fastpath, maxit= maxit, reorth = TRUE);
          } else {
            pcs <- irlba(x, nv=nPcs, nu =0, right_only = FALSE, fastpath = fastpath, maxit= maxit, reorth = TRUE);
          }
          rownames(pcs$v) <- colnames(x);
          
          # Optional centering
          if (center) {
            pcs$center <- cm;
            pcas <- as.matrix(t(t(x %*% pcs$v) - t(cm  %*% pcs$v)))
          } else {
            pcas <- as.matrix(x %*% pcs$v);
          }
          
          # Keep the names
          rownames(pcas) <- rownames(x);
          colnames(pcas) <- paste0('PC',seq(ncol(pcas)));
          
          # Save into genegraphs slot
          #genegraphs$genePCs <<- pcs;
          #genegraphs$geneRotated <<- pcas;
          
          # Using cosine distance only here
          if(center) {
            pcas <- pcas - Matrix::rowMeans(pcas)
          }
          xn <- n2Knn(pcas, k, nThreads= n.cores, verbose=verbose)
          diag(xn) <- 0; # Remove self edges
          xn <- as(xn,'dgTMatrix'); # will drop 0s
          # Turn into a dataframe, convert from correlation distance into weight
          df <- data.frame('from'=rownames(pcas)[xn@i+1],'to'=rownames(pcas)[xn@j+1],'w'=pmax(1-xn@x,0),stringsAsFactors=FALSE)
          
          genegraphs$graph <<- df;
        },
        
        # calculate density-based clusters
        getDensityClusters=function(type='counts', embeddingType=NULL, name='density', v=0.7, s=1, ...) {
          if (!requireNamespace("dbscan", quietly = TRUE)) {
            stop("Package \"dbscan\" needed for this function to work. Please install it.", call. = FALSE)
          }
          
          if(is.null(embeddings[[type]])) { stop("first, generate embeddings for type ",type)}
          if(is.null(embeddingType)) {
            # take the first one
            embeddingType <- names(embeddings[[type]])[1]
            if(verbose) message("using ",embeddingType," embedding\n")
            emb <- embeddings[[type]][[embeddingType]]
            
          } else {
            emb <- embeddings[[type]][[embeddingType]]
            if(is.null(emb)) { stop("embedding ",embeddingType," for type ", type," doesn't exist")}
          }
          
          cl <- dbscan::dbscan(emb, ...)$cluster;
          cols <- rainbow(length(unique(cl)),v=v,s=s)[cl+1];    cols[cl==0] <- "gray70"
          names(cols) <- rownames(emb);
          clusters[[type]][[name]] <<- cols;
          misc[['clusters']][[type]][[name]] <<- cols;
          return(invisible(cols))
        },
        # determine subpopulation-specific genes
        
        getDifferentialGenes=function(type='counts',clusterType=NULL,groups=NULL,name='customClustering', z.threshold=3,upregulated.only=FALSE,verbose=FALSE, append.specificity.metrics=TRUE, append.auc=FALSE) {
          "Determine differentially expressed genes, comparing each group against all others using Wilcoxon rank sum test\n
          - type data type (currently only default 'counts' is supported)\n
          - clusterType optional cluster type to use as a group-defining factor\n
          - groups explicit cell group specification - a named cell factor (use NA in the factor to exclude cells from the comparison)\n
          - name name slot to store the results in\n
          - z.threshold minimal absolute Z score (adjusted) to report\n
          - upregulated.only whether to report only genes that are expressed significantly higher in each group\n
          - verbose verbose flag\n
          return a list, with each element of the list corresponding to a cell group in the provided/used factor (i.e. factor levels); Each element of a list is a data frame listing the differentially epxressed genes (row names), with the following columns: Z - adjusted Z score, with positive values indicating higher expression in a given group compare to the rest; M - log2 fold change; highest- a boolean flag indicating whether the expression of a given gene in a given vcell group was on average higher than in every other cell group; fe - fraction of cells in a given group having non-zero expression level of a given gene;\n
          Examples:\n\n
          result <- r$getDifferentialGenes(groups=r$clusters$PCA$multilevel);\n
          str(r$diffgenes)"
          
          # restrict counts to the cells for which non-NA value has been specified in groups
          
          if(is.null(groups)) {
            # look up the clustering based on a specified type
            if(is.null(clusterType)) {
              # take the first one
              cols <- clusters[[type]][[1]]
            } else {
              cols <- clusters[[type]][[clusterType]]
              if(is.null(cols)) { stop("clustering ",clusterType," for type ", type," doesn't exist")}
            }
          } else {
            cols <- groups;
          }
          cm <- counts;
          if(!all(rownames(cm) %in% names(cols))) { warning("cluster vector doesn't specify groups for all of the cells, dropping missing cells from comparison")}
          # determine a subset of cells that's in the cols and cols[cell]!=NA
          valid.cells <- rownames(cm) %in% names(cols)[!is.na(cols)];
          if(!all(valid.cells)) {
            # take a subset of the count matrix
            cm <- cm[valid.cells,]
          }
          # reorder cols
          cols <- as.factor(cols[match(rownames(cm),names(cols))]);
          
          cols <- as.factor(cols);
          if(verbose) {
            message("running differential expression with ",length(levels(cols))," clusters ... ")
          }
          # use offsets based on the base model
          
          # run wilcoxon test comparing each group with the rest
          lower.lpv.limit <- -100;
          # calculate rank per-column (per-gene) average rank matrix
          xr <- sparse_matrix_column_ranks(cm);
          # calculate rank sums per group
          grs <- colSumByFac(xr,as.integer(cols))[-1,,drop=FALSE]
          # calculate number of non-zero entries per group
          xr@x <- numeric(length(xr@x))+1
          gnzz <- colSumByFac(xr,as.integer(cols))[-1,,drop=FALSE]
          #group.size <- as.numeric(tapply(cols,cols,length));
          group.size <- as.numeric(tapply(cols,cols,length))[1:nrow(gnzz)]; group.size[is.na(group.size)]<-0; # trailing empty levels are cut off by colSumByFac
          # add contribution of zero entries to the grs
          gnz <- (group.size-gnzz)
          # rank of a 0 entry for each gene
          zero.ranks <- (nrow(xr)-diff(xr@p)+1)/2 # number of total zero entries per gene
          ustat <- t((t(gnz)*zero.ranks)) + grs - group.size*(group.size+1)/2
          # standardize
          n1n2 <- group.size*(nrow(cm)-group.size);
          # usigma <- sqrt(n1n2*(nrow(cm)+1)/12) # without tie correction
          # correcting for 0 ties, of which there are plenty
          usigma <- sqrt(n1n2*(nrow(cm)+1)/12)
          usigma <- sqrt((nrow(cm) +1 - (gnz^3 - gnz)/(nrow(cm)*(nrow(cm)-1)))*n1n2/12)
          x <- t((ustat - n1n2/2)/usigma); # standardized U value- z score
          
          
          # correct for multiple hypothesis
          if(verbose) {
            message("adjusting p-values ... ")
          }
          x <- matrix(qnorm(bh.adjust(pnorm(as.numeric(abs(x)), lower.tail = FALSE, log.p = TRUE), log = TRUE), lower.tail = FALSE, log.p = TRUE),ncol=ncol(x))*sign(x)
          rownames(x) <- colnames(cm); colnames(x) <- levels(cols)[1:ncol(x)];
          if(verbose) {
            message("done.\n")
          }
          
          # add fold change information
          log.gene.av <- log2(Matrix::colMeans(cm));
          group.gene.av <- colSumByFac(cm,as.integer(cols))[-1,,drop=FALSE] / (group.size+1);
          log2.fold.change <- log2(t(group.gene.av)) - log.gene.av;
          # fraction of cells expressing
          f.expressing <- t(gnzz / group.size);
          max.group <- max.col(log2.fold.change)
          
          ds <- lapply(1:ncol(x),function(i) {
            z <- x[,i];
            vi <- which((if (upregulated.only) z else abs(z)) >= z.threshold);
            r <- data.frame(Z=z[vi],M=log2.fold.change[vi,i],highest=max.group[vi]==i,fe=f.expressing[vi,i], Gene=rownames(x)[vi])
            rownames(r) <- r$Gene;
            r <- r[order(r$Z,decreasing=TRUE),]
            r
          })
          names(ds) <- colnames(x);
          
          if (append.specificity.metrics) {
            ds <- names(ds) %>% setNames(., .) %>%
              papply(function(n) appendSpecificityMetricsToDE(ds[[n]], cols, n, p2.counts=cm, append.auc=append.auc), n.cores=n.cores)
          }
          
          if(is.null(groups)) {
            if(is.null(clusterType)) {
              diffgenes[[type]][[names(clusters[[type]])[1]]] <<- ds;
            } else {
              diffgenes[[type]][[clusterType]] <<- ds;
            }
          } else {
            diffgenes[[type]][[name]] <<- ds;
          }
          
          return(invisible(ds))
        },
        
        plotDiffGeneHeatmap=function(type='counts',clusterType=NULL, groups=NULL, n.genes=100, z.score=2, gradient.range.quantile=0.95, inner.clustering=FALSE, gradientPalette=NULL, v=0.8, s=1, box=TRUE, drawGroupNames=FALSE, ... ) {
          if(!is.null(clusterType)) {
            x <- diffgenes[[type]][[clusterType]];
            if(is.null(x)) { stop("differential genes for the specified cluster type haven't been calculated") }
          } else {
            x <- diffgenes[[type]][[1]];
            if(is.null(x)) { stop("no differential genes found for data type ",type) }
          }
          
          if(is.null(groups)) {
            # look up the clustering based on a specified type
            if(is.null(clusterType)) {
              # take the first one
              cols <- clusters[[type]][[1]]
            } else {
              cols <- clusters[[type]][[clusterType]]
              if(is.null(cols)) { stop("clustering ",clusterType," for type ", type," doesn't exist")}
            }
          } else {
            # use clusters information
            if(!all(rownames(counts) %in% names(groups))) { warning("provided cluster vector doesn't list groups for all of the cells")}
            cols <- as.factor(groups[match(rownames(counts),names(groups))]);
          }
          cols <- as.factor(cols);
          # select genes to show
          if(!is.null(z.score)) {
            x <- lapply(x,function(d) d[d$Z >= z.score & d$highest==TRUE,])
            if(!is.null(n.genes)) {
              x <- lapply(x,function(d) {if(nrow(d)>0) { d[1:min(nrow(d),n.genes),]}})
            }
          } else {
            if(!is.null(n.genes)) {
              x <- lapply(x,function(d) {if(nrow(d)>0) { d[1:min(nrow(d),n.genes),]}})
            }
          }
          x <- lapply(x,rownames);
          # make expression matrix
          #browser()
          #x <- x[!unlist(lapply(x,is.null))]
          #cols <- cols[cols %in% names(x)]
          #cols <- droplevels(cols)
          em <- counts[,unlist(x)];
          # renormalize rows
          if(all(sign(em)>=0)) {
            if(is.null(gradientPalette)) {
              gradientPalette <- colorRampPalette(c('gray90','red'), space = "Lab")(1024)
            }
            em <- apply(em,1,function(x) {
              zlim <- as.numeric(quantile(x,p=c(1-gradient.range.quantile,gradient.range.quantile)))
              if(diff(zlim)==0) {
                zlim <- as.numeric(range(x))
              }
              x[x<zlim[1]] <- zlim[1]; x[x>zlim[2]] <- zlim[2];
              x <- (x-zlim[1])/(zlim[2]-zlim[1])
            })
          } else {
            if(is.null(gradientPalette)) {
              gradientPalette <- colorRampPalette(c("blue", "grey90", "red"), space = "Lab")(1024)
            }
            em <- apply(em,1,function(x) {
              zlim <- c(-1,1)*as.numeric(quantile(abs(x),p=gradient.range.quantile))
              if(diff(zlim)==0) {
                zlim <- c(-1,1)*as.numeric(max(abs(x)))
              }
              x[x<zlim[1]] <- zlim[1]; x[x>zlim[2]] <- zlim[2];
              x <- (x-zlim[1])/(zlim[2]-zlim[1])
            })
          }
          
          # cluster cell types by averages
          rowfac <- factor(rep(names(x),unlist(lapply(x,length))),levels=names(x))
          if(inner.clustering) {
            clclo <- hclust(as.dist(1-cor(do.call(cbind,tapply(1:nrow(em),rowfac,function(ii) Matrix::colMeans(em[ii,,drop=FALSE]))))),method='complete')$order
          } else {
            clclo <- 1:length(levels(rowfac))
          }
          
          if(inner.clustering) {
            # cluster genes within each cluster
            clgo <- tapply(1:nrow(em),rowfac,function(ii) {
              ii[hclust(as.dist(1-cor(t(em[ii,]))),method='complete')$order]
            })
          } else {
            clgo <- tapply(1:nrow(em),rowfac,I)
          }
          if(inner.clustering) {
            # cluster cells within each cluster
            clco <- tapply(1:ncol(em),cols,function(ii) {
              if(length(ii)>3) {
                ii[hclust(as.dist(1-cor(em[,ii,drop=FALSE])),method='complete')$order]
              } else {
                ii
              }
            })
          } else {
            clco <- tapply(1:ncol(em),cols,I)
          }
          #clco <- clco[names(clgo)]
          # filter down to the clusters that are included
          #vic <- cols %in% clclo
          colors <- fac2col(cols,v=v,s=s,return.details=TRUE)
          cellcols <- colors$colors[unlist(clco[clclo])]
          genecols <- rev(rep(colors$palette,unlist(lapply(clgo,length)[clclo])))
          bottomMargin <- ifelse(drawGroupNames,4,0.5);
          my.heatmap2(em[rev(unlist(clgo[clclo])),unlist(clco[clclo])],col=gradientPalette,Colv=NA,Rowv=NA,labRow=NA,labCol=NA,RowSideColors=genecols,ColSideColors=cellcols,margins=c(bottomMargin,0.5),ColSideColors.unit.vsize=0.05,RowSideColors.hsize=0.05,useRaster=T, box=box, ...)
          abline(v=cumsum(unlist(lapply(clco[clclo],length))),col=1,lty=3)
          abline(h=cumsum(rev(unlist(lapply(clgo[clclo],length)))),col=1,lty=3)
        },
        
        # recalculate library sizes using robust regression within clusters
        getRefinedLibSizes=function(clusterType=NULL, groups=NULL,type='counts') {
          if (!requireNamespace("robustbase", quietly = TRUE)) {
            stop("Package \"robustbase\" needed for this function to work. Please install it.", call. = FALSE)
          }
          
          if(is.null(groups)) {
            # look up the clustering based on a specified type
            if(is.null(clusterType)) {
              # take the first one
              groups <- clusters[[type]][[1]]
            } else {
              groups <- clusters[[type]][[clusterType]]
              if(is.null(groups)) { stop("clustering ",clusterType," for type ", type," doesn't exist")}
            }
          }
          if(is.null(groups)) { stop("clustering must be determined first, or passed as a groups parameter") }
          
          # calculated pooled profiles per cluster
          lvec <- colSumByFac(misc[['rawCounts']],as.integer(groups))[-1,,drop=FALSE];
          lvec <- t(lvec/pmax(1,Matrix::rowSums(lvec)))*1e4
          
          # TODO: implement internal robust regression
          ## x <- misc[['rawCounts']]
          ## x <- x/as.numeric(depth);
          ## inplaceWinsorizeSparseCols(x,10);
          ## x <- x*as.numeric(depth);
          
          x <- mclapply(1:length(levels(groups)),function(j) {
            ii <- names(groups)[which(groups==j)]
            av <- lvec[,j]
            avi <- which(av>0)
            av <- av[avi]
            cvm <- as.matrix(misc[['rawCounts']][ii,avi])
            x <- unlist(lapply(ii,function(i) {
              cv <- cvm[i,]
              #as.numeric(coef(glm(cv~av+0,family=poisson(link='identity'),start=sum(cv)/1e4)))
              as.numeric(coef(robustbase::glmrob(cv~av+0,family=poisson(link='identity'),start=sum(cv)/1e4)))
            }))
            names(x) <- ii;
            x
          },mc.cores=30)
          
          lib.sizes <- unlist(x)[rownames(misc[['rawCounts']])]
          lib.sizes <- lib.sizes/mean(lib.sizes)*mean(Matrix::rowSums(misc[['rawCounts']]))
          
          depth <<- lib.sizes;
          return(invisible(lib.sizes))
        },
        
        # plot heatmap for a given set of genes
        plotGeneHeatmap=function(genes, type='counts', clusterType=NULL, groups=NULL, z.score=2, gradient.range.quantile=0.95, cluster.genes=FALSE, inner.clustering=FALSE, gradientPalette=NULL, v=0.8, s=1, box=TRUE, drawGroupNames=FALSE, useRaster=TRUE, smooth.span=max(1,round(nrow(counts)/1024)), ... ) {
          if(is.null(groups)) {
            # look up the clustering based on a specified type
            if(is.null(clusterType)) {
              # take the first one
              cols <- clusters[[type]][[1]]
            } else {
              cols <- clusters[[type]][[clusterType]]
              if(is.null(cols)) { stop("clustering ",clusterType," for type ", type," doesn't exist")}
            }
          } else {
            # use clusters information
            if(!all(rownames(counts) %in% names(groups))) { warning("provided cluster vector doesn't list groups for all of the cells")}
            cols <- as.factor(groups[match(rownames(counts),names(groups))]);
          }
          cols <- as.factor(cols);
          # make expression matrix
          if(!all(genes %in% colnames(counts))) { warning(paste("the following specified genes were not found in the data: [",paste(genes[!genes %in% colnames(counts)],collapse=" "),"], omitting",sep="")) }
          x <- intersect(genes,colnames(counts));
          if(length(x)<1) { stop("too few genes") }
          em <- as.matrix(t(counts[,x]));
          
          # renormalize rows
          if(all(sign(em)>=0)) {
            if(is.null(gradientPalette)) {
              gradientPalette <- colorRampPalette(c('gray90','red'), space = "Lab")(1024)
            }
            em <- t(apply(em,1,function(x) {
              zlim <- as.numeric(quantile(x,p=c(1-gradient.range.quantile,gradient.range.quantile)))
              if(diff(zlim)==0) {
                zlim <- as.numeric(range(x))
              }
              x[x<zlim[1]] <- zlim[1]; x[x>zlim[2]] <- zlim[2];
              x <- (x-zlim[1])/(zlim[2]-zlim[1])
            }))
          } else {
            if(is.null(gradientPalette)) {
              gradientPalette <- colorRampPalette(c("blue", "grey90", "red"), space = "Lab")(1024)
            }
            em <- t(apply(em,1,function(x) {
              zlim <- c(-1,1)*as.numeric(quantile(abs(x),p=gradient.range.quantile))
              if(diff(zlim)==0) {
                zlim <- c(-1,1)*as.numeric(max(abs(x)))
              }
              x[x<zlim[1]] <- zlim[1]; x[x>zlim[2]] <- zlim[2];
              x <- (x-zlim[1])/(zlim[2]-zlim[1])
            }))
          }
          
          # cluster cell types by averages
          clclo <- 1:length(levels(cols))
          
          if(cluster.genes) {
            # cluster genes within each cluster
            clgo <- hclust(as.dist(1-cor(t(em))),method='complete')$order
          } else {
            clgo <- 1:nrow(em)
          }
          
          if(inner.clustering) {
            # cluster cells within each cluster
            clco <- tapply(1:ncol(em),cols,function(ii) {
              if(length(ii)>3) {
                ii[hclust(as.dist(1-cor(em[,ii,drop=FALSE])),method='single')$order]
              } else {
                ii
              }
              # TODO: implement smoothing span support
            })
          } else {
            clco <- tapply(1:ncol(em),cols,I)
          }
          
          cellcols <- fac2col(cols,v=v,s=s)[unlist(clco[clclo])]
          #genecols <- rev(rep(fac2col(cols,v=v,s=s,return.level.colors=T),unlist(lapply(clgo,length)[clclo])))
          bottomMargin <- 0.5;
          # reorder and potentially smooth em
          em <- em[rev(clgo),unlist(clco[clclo])];
          
          my.heatmap2(em,col=gradientPalette,Colv=NA,Rowv=NA,labCol=NA,ColSideColors=cellcols,margins=c(bottomMargin,5),ColSideColors.unit.vsize=0.05,RowSideColors.hsize=0.05,useRaster=useRaster, box=box, ...)
          bp <- cumsum(unlist(lapply(clco[clclo],length))); # cluster border positions
          abline(v=bp,col=1,lty=3)
          #abline(h=cumsum(rev(unlist(lapply(clgo[clclo],length)))),col=1,lty=3)
          if(drawGroupNames) {
            clpos <- (c(0,bp[-length(bp)])+bp)/2;
            labpos <- rev(seq(0,length(bp)+1)/(length(bp)+1)*nrow(em)); labpos <- labpos[-1]; labpos <- labpos[-length(labpos)]
            text(x=clpos,y=labpos,labels = levels(cols),cex=1)
            # par(xpd=TRUE)
            # clpos <- (c(0,bp[-length(bp)])+bp)/2;
            # labpos <- seq(0,length(bp)+1)/(length(bp)+1)*max(bp); labpos <- labpos[-1]; labpos <- labpos[-length(labpos)]
            # text(x=labpos,y=-2,labels = levels(col))
            # segments(labpos,-1,clpos,0.5,lwd=0.5)
            # par(xpd=FALSE)
          }
        },
        
        # show embedding
        plotEmbedding=function(type=NULL, embeddingType=NULL, clusterType=NULL, groups=NULL, colors=NULL, gene=NULL, plot.theme=ggplot2::theme_bw(), ...) {
          if (is.null(type)) {
            if ('counts' %in% names(embeddings)) {
              type <- 'counts'
            } else if (length(embeddings) > 0) {
              type <- names(embeddings)[1]
            } else {
              stop("first, generate an embedding")
            }
          }
          
          if(is.null(embeddings[[type]]))
            stop("first, generate embeddings for type ",type)
          
          if(is.null(embeddingType)) {
            embeddingType <- 1 # take the first one
          }
          
          emb <- embeddings[[type]][[embeddingType]]
          
          if (!is.null(gene)) {
            if (!(gene %in% colnames(counts)))
              stop("Gene '", gene, "' isn't presented in the count matrix")
            
            colors <- .self$counts[,gene]
          }
          
          if(is.null(colors) && is.null(groups)) {
            # look up the clustering based on a specified type
            if(is.null(clusterType)) {
              # take the first one
              groups <- clusters[[type]][[1]]
            } else {
              groups <- clusters[[type]][[clusterType]]
              if(is.null(groups)) { stop("clustering ",clusterType," for type ", type," doesn't exist")}
            }
          }
          
          sccore::embeddingPlot(emb, groups=groups, colors=colors, plot.theme=plot.theme, ...)
        },
        # get overdispersed genes
        getOdGenes=function(n.odgenes=NULL,alpha=5e-2,use.unadjusted.pvals=FALSE) {
          if(is.null(misc[['varinfo']])) { stop("please run adjustVariance first")}
          if(is.null(n.odgenes)) { #return according to alpha
            if(use.unadjusted.pvals) {
              rownames(misc[['varinfo']])[misc[['varinfo']]$lp <= log(alpha)]
            } else {
              rownames(misc[['varinfo']])[misc[['varinfo']]$lpa <= log(alpha)]
            }
          } else { # return top n.odgenes sites
            rownames(misc[['varinfo']])[(order(misc[['varinfo']]$lp,decreasing=FALSE)[1:min(ncol(counts),n.odgenes)])]
          }
        },
        
        getNormalizedExpressionMatrix=function(n.odgenes=NULL,genes=NULL) {
          "Return variance-normalized matrix for specified genes or a number of OD genes\n
          - genes explicit vector of genes to return
          - n.odgenes whether a certain number of top overdispersed genes should be used (default NULL - all significant ones); ignored if genes are passed
          return:  cell x gene matrix\n"
          if(is.null(genes)) {
            genes <- .self$getOdGenes(n.odgenes)
          }
          x <- .self$counts[,genes];
          x@x <- x@x*rep(misc[['varinfo']][colnames(x),'gsf'],diff(x@p))
          x
        },
        
        calculatePcaReduction=function(nPcs=20, type='counts', name='PCA', use.odgenes=TRUE, n.odgenes=NULL, odgenes=NULL, center=TRUE, cells=NULL, fastpath=TRUE, maxit=100, verbose=TRUE, var.scale=(type == "counts"), ...) {
          "Calculate PCA reduction of the data\n
          - nPcs number of PCs\n
          - type dataset view to reduce (counts by default, but can specify a name of an existing reduction)\n
          - name name for the PCA reduction to be created\n
          - use.odgenes whether pre-calculated set of overdispersed genes should be used (default=TRUE)\n
          - n.odgenes whether a certain number of top overdispersed genes should be used\n
          - odgenes explicitly specify a set of genes to use for the reduction\n
          - center should data be centerred prior to PCA (default=TRUE)\n
          - cells optional subset of cells on which PCA should be ran\n
          - fastpath use C implementation for speedup\n
          - maxit maximum number of irlba iterations to use\n
          - verbose verbose\n
          - ... additional arguments forwarded to irlba::irlba\n
          return invisible PCA result (the reduction itself is saved in $reductions[[name]])"
          
          if(type=='counts') {
            x <- counts;
          } else {
            if(!type %in% names(reductions)) { stop("reduction ",type,' not found')}
            x <- reductions[[type]]
          }
          if((use.odgenes || !is.null(n.odgenes)) && is.null(odgenes)) {
            if(is.null(misc[['odgenes']] )) { stop("please run adjustVariance() first")}
            odgenes <- misc[['odgenes']];
            if(!is.null(n.odgenes)) {
              if(n.odgenes>length(odgenes)) {
                #warning("number of specified odgenes is higher than the number of the statistically significant sites, will take top ",n.odgenes,' sites')
                odgenes <- rownames(misc[['varinfo']])[(order(misc[['varinfo']]$lp,decreasing=FALSE)[1:min(ncol(counts),n.odgenes)])]
              } else {
                odgenes <- odgenes[1:n.odgenes]
              }
            }
          }
          if(!is.null(odgenes)) {
            x <- x[,odgenes]
            if(verbose) message('running PCA using ',length(odgenes),' OD genes .')
          } else { #all genes?
            if(verbose) message('running PCA all ',ncol(x),' genes .')
          }
          # apply scaling if using raw counts
          if(var.scale) {
            #x <- t(t(x)*misc[['varinfo']][colnames(x),'gsf'])
            x@x <- x@x*rep(misc[['varinfo']][colnames(x),'gsf'],diff(x@p))
          }
          if(verbose) message('.')
          
          if(!is.null(cells)) {
            # cell subset is just for PC determination
            nPcs <- min(min(length(cells),ncol(x))-1,nPcs)
            cm <- Matrix::colMeans(x[cells,])
            pcs <- irlba(x[cells,], nv=nPcs, nu=0, center=cm, right_only=FALSE,fastpath=fastpath,maxit=maxit,reorth=TRUE, ...)
          } else {
            nPcs <- min(min(nrow(x),ncol(x))-1,nPcs)
            if(center) {
              cm <- Matrix::colMeans(x)
              pcs <- irlba(x, nv=nPcs, nu=0, center=cm, right_only=FALSE,fastpath=fastpath,maxit=maxit,reorth=TRUE, ...)
            } else {
              pcs <- irlba(x, nv=nPcs, nu=0, right_only=FALSE,fastpath=fastpath,maxit=maxit,reorth=TRUE, ...)
            }
          }
          rownames(pcs$v) <- colnames(x);
          
          if(verbose) message('.')
          
          # adjust for centering!
          if(center) {
            pcs$center <- cm;
            pcas <- as.matrix(t(as(t(x %*% pcs$v), "dgeMatrix") - t(cm %*% pcs$v)))
          } else {
            pcas <- as.matrix(x %*% pcs$v);
          }
          misc$PCA <<- pcs;
          if(verbose) message('.')
          #pcas <- scde::winsorize.matrix(pcas,0.05)
          # # control for sequencing depth
          # if(is.null(batch)) {
          #   mx <- model.matrix(x ~ d,data=data.frame(x=1,d=depth))
          # } else {
          #   mx <- model.matrix(x ~ d*b,data=data.frame(x=1,d=depth,b=batch))
          # }
          # # TODO: how to get rid of residual depth effects in the PCA-based clustering?
          # #pcas <- t(t(colLm(pcas,mx,returnResid=TRUE))+Matrix::colMeans(pcas))
          # pcas <- colLm(pcas,mx,returnResid=TRUE)
          rownames(pcas) <- rownames(x)
          colnames(pcas) <- paste('PC',seq(ncol(pcas)),sep='')
          #pcas <- pcas[,-1]
          #pcas <- scde::winsorize.matrix(pcas,0.1)
          if(verbose) message(' done\n')
          reductions[[name]] <<- pcas;
          ## nIcs <- nPcs;
          ## a <- ica.R.def(t(pcas),nIcs,tol=1e-3,fun='logcosh',maxit=200,verbose=T,alpha=1,w.init=matrix(rnorm(nIcs*nPcs),nIcs,nPcs))
          ## reductions[['ICA']] <<- as.matrix( x %*% pcs$v %*% a);
          ## colnames(reductions[['ICA']]) <<- paste('IC',seq(ncol(reductions[['ICA']])),sep='');
          
          return(invisible(pcas))
        },
        
        # will reset odgenes to be a superset of the standard odgene selection (guided by n.odgenes or alpha), and
        # a set of recursively determined odgenes based on a given group (or a cluster info)
        expandOdGenes=function(type='counts', clusterType=NULL, groups=NULL , min.group.size=30, od.alpha=1e-1, use.odgenes=FALSE, n.odgenes=NULL, odgenes=NULL, n.odgene.multiplier=1, gam.k=10,verbose=FALSE,n.cores=.self$n.cores,min.odgenes=10,max.odgenes=Inf,take.top.odgenes=TRUE, recursive=TRUE) {
          # determine groups
          if(is.null(groups)) {
            # look up the clustering based on a specified type
            if(is.null(clusterType)) {
              # take the first one
              groups <- clusters[[type]][[1]]
            } else {
              groups <- clusters[[type]][[clusterType]]
              if(is.null(groups)) { stop("clustering ",clusterType," for type ", type," doesn't exist")}
            }
          } else {
            groups <- as.factor(groups[names(groups) %in% rownames(counts)]);
            groups <- droplevels(groups);
          }
          
          
          # determine initial set of odgenes
          if((use.odgenes || !is.null(n.odgenes)) && is.null(odgenes)) {
            if(is.null(misc[['varinfo']] )) { stop("please run adjustVariance() first")}
            df <- misc$varinfo
            odgenes <- rownames(df)[!is.na(df$lpa) & df$lpa<log(od.alpha)]
            #odgenes <- misc[['odgenes']];
            if(!is.null(n.odgenes)) {
              if(n.odgenes>length(odgenes)) {
                #warning("number of specified odgenes is higher than the number of the statistically significant sites, will take top ",n.odgenes,' sites')
                odgenes <- rownames(misc[['varinfo']])[(order(misc[['varinfo']]$lp,decreasing=FALSE)[1:min(ncol(counts),n.odgenes)])]
              } else {
                odgenes <- odgenes[1:n.odgenes]
              }
            }
          }
          
          # filter out small groups
          if(min.group.size>1) { groups[groups %in% levels(groups)[unlist(tapply(groups,groups,length))<min.group.size]] <- NA; groups <- droplevels(groups); }
          if(sum(!is.na(groups))<min.group.size) {
            warning("clustering specifies fewer cells than min.group.size")
            return(odgenes)
          }
          
          
          if(length(levels(groups))<2) {
            warning("cannot expand od genes based on a single group")
            return(odgenes)
          }
          
          # determine groups for which variance normalization will be reran
          if(recursive) {
            if(verbose) message("recursive group enumeration ...")
            # derive cluster hierarchy
            
            # use raw counts to derive clustering
            z <- misc$rawCounts;
            rowFac <- rep(-1,nrow(z)); names(rowFac) <- rownames(z);
            rowFac[match(names(groups),rownames(z))] <- as.integer(groups);
            tc <- colSumByFac(z,as.integer(rowFac))[-1,,drop=FALSE];
            rownames(tc) <- levels(groups)
            d <- 1-cor(t(log10(tc/pmax(1,Matrix::rowSums(tc))*1e3+1)))
            hc <- hclust(as.dist(d),method='average',members=unlist(tapply(groups,groups,length)))
            
            dlab <- function(l) {
              if(is.leaf(l)) {
                return(list(labels(l)))
              } else {
                return(c(list(labels(l)),dlab(l[[1]]),dlab(l[[2]])))
              }
            }
            
            # for each level in the cluster hierarchy, except for the top
            rgroups <- dlab(as.dendrogram(hc))[-1]
            rgroups <- c(list(levels(groups)),rgroups)
            if(verbose) message("done.\n");
          } else {
            rgroups <- lapply(levels(groups),I)
          }
          names(rgroups) <- unlist(lapply(rgroups,paste,collapse="+"))
          
          # run local variance normalization
          if(verbose) message("running local variance normalization ");
          # run variance normalization, determine PCs
          gpcs <- papply(rgroups,function(group) {
            cells <- names(groups)[groups %in% group]
            
            # variance normalization
            df <- .self$adjustVariance(persist=FALSE,gam.k=gam.k,verbose=FALSE,cells=cells,n.cores=1)
            #if(!is.null(n.odgenes)) {
            #  odgenes <- rownames(df)[order(df$lp,decreasing=F)[1:n.odgenes]]
            #} else {
            df <- df[!is.na(df$lp),,drop=FALSE]
            df <- df[order(df$lp,decreasing=FALSE),,drop=FALSE]
            n.od <- min(max(sum(df$lpa<log(od.alpha)),min.odgenes),max.odgenes);
            if(n.od>0) {
              odgenes <- rownames(df)[1:min(n.od*n.odgene.multiplier,nrow(df))]
            } else {
              return(NULL)
            }
            sf <- df$gsf[match(odgenes,rownames(df))];
            return(list(sf=sf,cells=cells,odgenes=odgenes))
          },n.cores=n.cores,mc.preschedule=TRUE)
          if(verbose) message(" done\n");
          odg <- unique(unlist(lapply(gpcs,function(z) z$odgenes)))
          # TODO: consider gsf?
          odgenes <- unique(c(odgenes,odg));
          misc[['odgenes']] <<- odgenes;
          return(invisible(odgenes));
        },
        
        localPcaKnn=function(nPcs=5, type='counts', clusterType=NULL, groups=NULL , k=30, b=1, a=1, min.group.size=30, name='localPCA', baseReduction='PCA', od.alpha=1e-1, n.odgenes=NULL,gam.k=10,verbose=FALSE,n.cores=.self$n.cores,min.odgenes=5,take.top.odgenes=FALSE, recursive=TRUE,euclidean=FALSE,perplexity=k,debug=FALSE,return.pca=FALSE,skip.pca=FALSE) {
          if(type=='counts') {
            x <- counts;
          } else {
            if(!type %in% names(reductions)) { stop("reduction ",type,' not found')}
            x <- reductions[[type]]
          }
          
          
          if(is.null(groups)) {
            # look up the clustering based on a specified type
            if(is.null(clusterType)) {
              # take the first one
              groups <- clusters[[type]][[1]]
            } else {
              groups <- clusters[[type]][[clusterType]]
              if(is.null(groups)) { stop("clustering ",clusterType," for type ", type," doesn't exist")}
            }
          } else {
            groups <- as.factor(groups[names(groups) %in% rownames(x)]);
            groups <- droplevels(groups);
            
          }
          if(min.group.size>1) { groups[groups %in% levels(groups)[unlist(tapply(groups,groups,length))<min.group.size]] <- NA; groups <- droplevels(groups); }
          if(sum(!is.na(groups))<min.group.size) { stop("clustering specifies fewer cells than min.group.size") }
          
          
          if(recursive) {
            if(verbose) message("recursive group enumeration ...")
            ## # derive cluster hierarchy
            ## rowFac <- rep(-1,nrow(x)); names(rowFac) <- rownames(x);
            ## rowFac[names(groups)] <- as.integer(groups);
            ## tc <- colSumByFac(x,as.integer(rowFac))[-1,]
            ## rownames(tc) <- levels(groups)
            ## #tc <- rbind("total"=Matrix::colSums(tc),tc)
            ## #d <- jsDist(t(((tc/pmax(1,Matrix::rowSums(tc)))))); rownames(d) <- colnames(d) <- rownames(tc)
            ## d <- 1-cor(t(tc))
            ## hc <- hclust(as.dist(d),method='ward.D')
            
            # use raw counts to derive clustering
            z <- misc$rawCounts;
            rowFac <- rep(-1,nrow(z)); names(rowFac) <- rownames(z);
            rowFac[match(names(groups),rownames(z))] <- as.integer(groups);
            tc <- colSumByFac(z,as.integer(rowFac))[-1,,drop=FALSE];
            rownames(tc) <- levels(groups)
            d <- 1-cor(t(log10(tc/pmax(1,Matrix::rowSums(tc))*1e3+1)))
            hc <- hclust(as.dist(d),method='average',members=unlist(tapply(groups,groups,length)))
            
            
            dlab <- function(l) {
              if(is.leaf(l)) {
                return(list(labels(l)))
              } else {
                return(c(list(labels(l)),dlab(l[[1]]),dlab(l[[2]])))
              }
            }
            
            # for each level in the cluster hierarchy, except for the top
            rgroups <- dlab(as.dendrogram(hc))[-1]
            rgroups <- c(list(levels(groups)),rgroups)
            if(verbose) message("done.\n");
          } else {
            rgroups <- lapply(levels(groups),I)
          }
          names(rgroups) <- unlist(lapply(rgroups,paste,collapse="+"))
          
          
          if(verbose) message("determining local PCs ");
          # run variance normalization, determine PCs
          gpcs <- papply(rgroups,function(group) {
            cells <- names(groups)[groups %in% group]
            
            # variance normalization
            df <- .self$adjustVariance(persist=FALSE,gam.k=gam.k,verbose=FALSE,cells=cells,n.cores=1)
            if(!is.null(n.odgenes)) {
              odgenes <- rownames(df)[order(df$lp,decreasing=FALSE)[1:n.odgenes]]
            } else {
              odgenes <- rownames(df)[!is.na(df$lpa) & df$lpa<log(od.alpha)]
            }
            if(length(odgenes)<min.odgenes) {
              if(take.top.odgenes) {
                odgenes <- rownames(df)[order(df$lp,decreasing=FALSE)[1:min.odgenes]]
              } else {
                return(NULL);
              }
            }
            sf <- df$gsf[match(odgenes,rownames(df))];
            
            if(return.pca && skip.pca) {
              return(list(sf=sf,cells=cells,odgenes=odgenes))
            }
            
            
            y <- t(t(x[cells,odgenes])*sf)
            cm <- Matrix::colMeans(y)
            # PCA
            pcs <- irlba(y, nv=nPcs, nu=0, center=cm, right_only=FALSE,fastpath=TRUE,reorth=TRUE)
            rownames(pcs$v) <- colnames(y);
            pcs$center <- cm;
            # row-randomize x to get a sense for the pcs
            m1 <- y;
            if(euclidean) {
              #for (i in 1:nrow(m1)) m1[i,] <- m1[i,order(runif(length(m1[i,])))]
              m1@i <- sample(m1@i)
              rpcas <- t(t(m1 %*% pcs$v) - t(pcs$center %*% pcs$v))
              pcs$rsd <- apply(rpcas,2,sd)
              pcs$trsd <- sd(dist(rpcas))
            }
            
            # sample within-cluster distances (based on main PCA)
            
            pcas <- as.matrix(t(t(t(t(x[,odgenes])*sf) %*% pcs$v) - t(pcs$center %*% pcs$v)))
            if(verbose) message(".")
            return(list(pcs=pcs,sf=sf,df=df,cells=cells,pcas=pcas,odgenes=odgenes))
          },n.cores=n.cores)
          if(verbose) message(" done\n");
          if(return.pca) return(gpcs)
          
          ivi <- unlist(lapply(gpcs,is.null))
          if(any(ivi)) { gpcs <- gpcs[!ivi]; rgroups <- rgroups[!ivi]; }
          
          if(debug) { browser() }
          
          
          # calculate cell relevance to each cluster (p_k,i matrix)
          # use global PCA distances
          if(verbose) message("calculating global distances ...");
          gcdist <- as.matrix(dist(gpcs[[1]]$pcas))
          if(verbose) message(" done.\n")
          
          # for each PCA
          # for each cell, determine p_k_i
          # for each PC
          # determine cell projections
          # subset genes
          # subtract center
          # scale, multiply
          # for each pair, determine cell distances
          # use cell distances to complete weight matrix
          # add to the w*d^2 and w matrices
          # normalize by the sqrt(sum(w))
          
          
          if(euclidean) {
            if(verbose) message("calculating local Euclidean distances .");
            dcs <- papply(gpcs,function(p) {
              pk <- rep(1,nrow(p$pcas)); names(pk) <- rownames(p$pcas);
              nci <- setdiff(rownames(gcdist),p$cells)
              if(length(nci)>0) {
                # determine within cluster sd
                scells <- sample(p$cells,min(1e3,length(p$cells)))
                cldsd <- sd(as.numeric(gcdist[scells,scells]))
                ncid <- rowMeans(gcdist[nci,scells])
                pk[nci] <- exp(-b*(ncid/cldsd)^2)
              }
              dsq <- as.matrix(dist(p$pcas)^2)
              w <- (1-exp(-a*(dsq)/(p$pcs$trsd^2))) * (pk %o% pk)
              if(verbose) message(".")
              list(dsq=dsq,w=w)
            },n.cores=n.cores)
            if(verbose) message(".")
            d <- Reduce('+',lapply(dcs,function(x) x$dsq*x$w))
            if(verbose) message(".")
            d <- sqrt(d/Reduce('+',lapply(dcs,function(x) x$w)));
            diag(d) <- 0
            if(verbose) message(" done.\n")
          } else {
            # weighted correlation
            if(verbose) message("calculating local correlation distances .");
            dcs <- papply(gpcs,function(p) {
              pk <- rep(1,nrow(p$pcas)); names(pk) <- rownames(p$pcas);
              nci <- setdiff(rownames(gcdist),p$cells)
              if(length(nci)>0) {
                # determine within cluster sd
                scells <- sample(p$cells,min(1e3,length(p$cells)))
                cldsd <- sd(as.numeric(gcdist[scells,scells]))
                ncid <- rowMeans(gcdist[nci,scells])
                pk[nci] <- exp(-b*(ncid/cldsd)^2)
              }
              x <- cov(t(p$pcas))*(ncol(p$pcas)-1)
              xc <- x / sqrt(diag(x) %o% diag(x)) # correlation
              w <- (1-exp(-a*(1-xc))) * (pk %o% pk)
              
              if(verbose) message(".")
              list(x=x,w=w)
            },n.cores=n.cores)
            if(verbose) message(".")
            # calculate sum_{k}_{w_k*v} matrix
            wm <- Reduce('+',lapply(dcs,function(z) z$w*diag(z$x)))
            d <- Reduce('+',lapply(dcs,function(z) z$x*z$w))
            d <- 1-d/sqrt(wm*t(wm))
            diag(d) <- 0
            if(verbose) message(" done.\n")
          }
          
          
          
          ## d <- dcs[[1]]$dsq
          ## d <- as.matrix(dist(r$reductions$PCA))
          ## knn <- apply(d,2,function(x) order(x,decreasing=F)[1:(k+1)])
          ## cat(".")
          ## m <- sparseMatrix(i=as.numeric(knn),p=c(0,(1:ncol(knn))*nrow(knn)),dims=rep(ncol(knn),2),x=rep(1,nrow(knn)*ncol(knn)))
          ## m <- m+t(m); # symmetrize
          ## diag(m) <- 0;
          ## rownames(m) <- colnames(m) <- rownames(d)
          ## x <- list(m=m)
          ## i <- 5; cl <- rep(NA,nrow(x$m)); names(cl) <- rownames(x$m); cl[rownames(x$m)[i]] <- 1; cl[which(x$m[,i]>0)] <- 2; cl <- as.factor(cl);
          ## r$plotEmbedding(type='PCA',embeddingType='tSNE',groups=cl,alpha=0.2,min.group.size=00,mark.clusters = TRUE, mark.cluster.cex=0.8,unclassified.cell.color=adjustcolor(1,alpha=0.1))
          
          ## i <- 5; cl <- 1/(dcs[[1]]$dsq[,i]+1e-6); names(cl) <- rownames(x$m);
          ## i <- 5; cl <- 1/(d[,i]+1e-6); names(cl) <- rownames(x$m);
          ## r$plotEmbedding(type='PCA',embeddingType='tSNE',colors=cl,alpha=0.2,min.group.size=00,mark.clusters = TRUE, mark.cluster.cex=0.8,unclassified.cell.color=adjustcolor(1,alpha=0.1))
          
          # kNN
          if(verbose) message("creating kNN graph .");
          knn <- apply(d,2,function(x) order(x,decreasing=FALSE)[1:(k+1)])
          if(verbose) message(".")
          #m <- sparseMatrix(i=as.numeric(knn),p=c(0,(1:ncol(knn))*nrow(knn)),dims=rep(ncol(knn),2),x=rep(1,nrow(knn)*ncol(knn)))
          m <- sparseMatrix(i=as.numeric(knn),p=c(0,(1:ncol(knn))*nrow(knn)),dims=rep(ncol(knn),2),x=d[as.integer(t(t(knn)+((1:ncol(knn))-1)*nrow(d)))])
          m <- m+t(m); # symmetrize
          diag(m) <- 0;
          rownames(m) <- colnames(m) <- rownames(d)
          if(verbose) message(".")
          g <- graph_from_adjacency_matrix(m,mode='undirected',weighted=TRUE);
          if(verbose) message(".")
          graphs[[name]] <<- g;
          if(verbose) message(" done.\n")
          
          emb <- Rtsne::Rtsne(d,is_distance=TRUE, perplexity=perplexity, num_threads=n.cores)$Y;
          rownames(emb) <- colnames(d)
          embeddings[[type]][[name]] <<- emb;
          
          # calculate cell-cell distance, considering weighting
          
          # getting total cell-cell distance
          
          return(invisible(list(d=d,m=m,gpcs=gpcs)))
        },
        
        
        # test pathway overdispersion
        # this is a compressed version of the PAGODA1 approach
        # env - pathway to gene environment
        testPathwayOverdispersion=function(setenv, 
                                           type='counts', 
                                           max.pathway.size=1e3, 
                                           min.pathway.size=10, 
                                           n.randomizations=5, 
                                           verbose=FALSE, 
                                           score.alpha=0.05,
                                           plot=FALSE, 
                                           cells=NULL,
                                           adjusted.pvalues=TRUE,
                                           z.score = qnorm(0.05/2, lower.tail = FALSE), 
                                           use.oe.scale = FALSE, return.table=FALSE,
                                           name='pathwayPCA',
                                           correlation.distance.threshold=0.2,
                                           loading.distance.threshold=0.01,
                                           top.aspects=Inf,
                                           recalculate.pca=FALSE,
                                           save.pca=TRUE) {
          nPcs <- 1;
          
          if(type=='counts') {
            x <- counts;
            # apply scaling if using raw counts
            x@x <- x@x*rep(misc[['varinfo']][colnames(x),'gsf'],diff(x@p))
          } else {
            if(!type %in% names(reductions)) { stop("reduction ",type,' not found')}
            x <- reductions[[type]]
          }
          if(!is.null(cells)) {
            x <- x[cells,]
          }
          
          proper.gene.names <- colnames(x);
          
          if(is.null(misc[['pwpca']]) || recalculate.pca) {
            if(verbose) {
              message("determining valid pathways")
            }
            
            # determine valid pathways
            gsl <- ls(envir = setenv)
            gsl.ng <- unlist(mclapply(sn(gsl), function(go) sum(unique(get(go, envir = setenv)) %in% proper.gene.names),mc.cores=n.cores,mc.preschedule=TRUE))
            gsl <- gsl[gsl.ng >= min.pathway.size & gsl.ng<= max.pathway.size]
            names(gsl) <- gsl
            
            if(verbose) {
              message("processing ", length(gsl), " valid pathways")
            }
            
            cm <- Matrix::colMeans(x)
            
            pwpca <- papply(gsl, function(sn) {
              lab <- proper.gene.names %in% get(sn, envir = setenv)
              if(sum(lab)<1) { return(NULL) }
              pcs <- irlba(x[,lab], nv=nPcs, nu=0, center=cm[lab])
              pcs$d <- pcs$d/sqrt(nrow(x))
              pcs$rotation <- pcs$v;
              pcs$v <- NULL;
              
              # get standard deviations for the random samples
              ngenes <- sum(lab)
              z <- do.call(rbind,lapply(seq_len(n.randomizations), function(i) {
                si <- sample(ncol(x), ngenes)
                pcs <- irlba(x[,si], nv=nPcs, nu=0, center=cm[si])$d
              }))
              z <- z/sqrt(nrow(x));
              
              # local normalization of each component relative to sampled PC1 sd
              avar <- pmax(0, (pcs$d^2-mean(z[, 1]^2))/sd(z[, 1]^2))
              
              if(avar>0.5) {
                # flip orientations to roughly correspond with the means
                pcs$scores <- as.matrix(t(x[,lab] %*% pcs$rotation) - as.numeric((cm[lab] %*% pcs$rotation)))
                cs <- unlist(lapply(seq_len(nrow(pcs$scores)), function(i) sign(cor(pcs$scores[i,], colMeans(t(x[, lab, drop = FALSE])*abs(pcs$rotation[, i]))))))
                pcs$scores <- pcs$scores*cs
                pcs$rotation <- pcs$rotation*cs
                rownames(pcs$rotation) <- colnames(x)[lab];
              } # don't bother otherwise - it's not significant
              return(list(xp=pcs,z=z,n=ngenes))
            }, n.cores = n.cores,mc.preschedule=TRUE)
            if(save.pca) {
              misc[['pwpca']] <<- pwpca;
            }
          } else {
            if(verbose) {
              message("re-using previous overdispersion calculations")
              pwpca <- misc[['pwpca']];
            }
          }
          
          if(verbose) {
            message("scoring pathway overdispersion signifcance")
          }
          
          # score overdispersion
          true.n.cells <- nrow(x)
          
          pagoda.effective.cells <- function(pwpca, start = NULL) {
            n.genes <- unlist(lapply(pwpca, function(x) rep(x$n, nrow(x$z))))
            var <- unlist(lapply(pwpca, function(x) x$z[, 1]))
            if(is.null(start)) { start <- true.n.cells*2 } # start with a high value
            of <- function(p, v, sp) {
              sn <- p[1]
              vfit <- (sn+sp)^2/(sn*sn+1/2) -1.2065335745820*(sn+sp)*((1/sn + 1/sp)^(1/3))/(sn*sn+1/2)
              residuals <- (v-vfit)^2
              return(sum(residuals))
            }
            x <- nlminb(objective = of, start = c(start), v = var, sp = sqrt(n.genes-1/2), lower = c(1), upper = c(true.n.cells))
            return((x$par)^2+1/2)
          }
          n.cells <- pagoda.effective.cells(pwpca)
          
          vdf <- data.frame(do.call(rbind, lapply(seq_along(pwpca), function(i) {
            vars <- as.numeric((pwpca[[i]]$xp$d))
            cbind(i = i, var = vars, n = pwpca[[i]]$n, npc = seq(1:ncol(pwpca[[i]]$xp$rotation)))
          })))
          
          # fix p-to-q mistake in qWishartSpike
          qWishartSpikeFixed <- function (q, spike, ndf = NA, pdim = NA, var = 1, beta = 1, lower.tail = TRUE, log.p = FALSE)  {
            params <- RMTstat::WishartSpikePar(spike, ndf, pdim, var, beta)
            qnorm(q, mean = params$centering, sd = params$scaling, lower.tail, log.p)
          }
          
          # add right tail approximation to ptw, which gives up quite early
          pWishartMaxFixed <- function (q, ndf, pdim, var = 1, beta = 1, lower.tail = TRUE) {
            params <- RMTstat::WishartMaxPar(ndf, pdim, var, beta)
            q.tw <- (q - params$centering)/(params$scaling)
            p <- RMTstat::ptw(q.tw, beta, lower.tail, log.p = TRUE)
            p[p == -Inf] <- pgamma((2/3)*q.tw[p == -Inf]^(3/2), 2/3, lower.tail = FALSE, log.p = TRUE) + lgamma(2/3) + log((2/3)^(1/3))
            p
          }
          
          vshift <- 0
          ev <- 0
          
          vdf$var <- vdf$var-(vshift-ev)*vdf$n
          basevar <- 1
          vdf$exp <- RMTstat::qWishartMax(0.5, n.cells, vdf$n, var = basevar, lower.tail = FALSE)
          #vdf$z <- qnorm(pWishartMax(vdf$var, n.cells, vdf$n, log.p = TRUE, lower.tail = FALSE, var = basevar), lower.tail = FALSE, log.p = TRUE)
          vdf$z <- qnorm(pWishartMaxFixed(vdf$var, n.cells, vdf$n, lower.tail = FALSE, var = basevar), lower.tail = FALSE, log.p = TRUE)
          vdf$cz <- qnorm(bh.adjust(pnorm(as.numeric(vdf$z), lower.tail = FALSE, log.p = TRUE), log = TRUE), lower.tail = FALSE, log.p = TRUE)
          vdf$ub <- RMTstat::qWishartMax(score.alpha/2, n.cells, vdf$n, var = basevar, lower.tail = FALSE)
          vdf$ub.stringent <- RMTstat::qWishartMax(score.alpha/nrow(vdf)/2, n.cells, vdf$n, var = basevar, lower.tail = FALSE)
          
          if(plot) {
            par(mfrow = c(1, 1), mar = c(3.5, 3.5, 1.0, 1.0), mgp = c(2, 0.65, 0))
            un <- sort(unique(vdf$n))
            on <- order(vdf$n, decreasing = FALSE)
            pccol <- colorRampPalette(c("black", "grey70"), space = "Lab")(max(vdf$npc))
            plot(vdf$n, vdf$var/vdf$n, xlab = "gene set size", ylab = "PC1 var/n", ylim = c(0, max(vdf$var/vdf$n)), col = adjustcolor(pccol[vdf$npc],alpha=0.1),pch=19)
            lines(vdf$n[on], (vdf$exp/vdf$n)[on], col = 2, lty = 1)
            lines(vdf$n[on], (vdf$ub.stringent/vdf$n)[on], col = 2, lty = 2)
          }
          
          rs <- (vshift-ev)*vdf$n
          vdf$oe <- (vdf$var+rs)/(vdf$exp+rs)
          vdf$oec <- (vdf$var+rs)/(vdf$ub+rs)
          
          df <- data.frame(name = names(pwpca)[vdf$i], npc = vdf$npc, n = vdf$n, score = vdf$oe, z = vdf$z, adj.z = vdf$cz, stringsAsFactors = FALSE)
          if(adjusted.pvalues) {
            vdf$valid <- vdf$cz  >=  z.score
          } else {
            vdf$valid <- vdf$z  >=  z.score
          }
          
          if(!any(vdf$valid)) { stop("no significantly overdispersed pathways found at z.score threshold of ",z.score) };
          
          # apply additional filtering based on >0.5 sd above the local random estimate
          vdf$valid <- vdf$valid & unlist(lapply(pwpca,function(x) !is.null(x$xp$scores)))
          vdf$name <- names(pwpca)[vdf$i];
          
          if(return.table) {
            df <- df[vdf$valid, ]
            df <- df[order(df$score, decreasing = TRUE), ]
            return(df)
          }
          if(verbose) {
            message("compiling pathway reduction")
          }
          # calculate pathway reduction matrix
          
          # return scaled patterns
          xmv <- do.call(rbind, lapply(pwpca[vdf$valid], function(x) {
            xm <- x$xp$scores
          }))
          
          if(use.oe.scale) {
            xmv <- (xmv -rowMeans(xmv))* (as.numeric(vdf$oe[vdf$valid])/sqrt(apply(xmv, 1, var)))
            vdf$sd <- as.numeric(vdf$oe)
          } else {
            # chi-squared
            xmv <- (xmv-rowMeans(xmv)) * sqrt((qchisq(pnorm(vdf$z[vdf$valid], lower.tail = FALSE, log.p = TRUE), n.cells, lower.tail = FALSE, log.p = TRUE)/n.cells)/apply(xmv, 1, var))
            vdf$sd <- sqrt((qchisq(pnorm(vdf$z, lower.tail = FALSE, log.p = TRUE), n.cells, lower.tail = FALSE, log.p = TRUE)/n.cells))
            
          }
          rownames(xmv) <- paste("#PC", vdf$npc[vdf$valid], "# ", names(pwpca)[vdf$i[vdf$valid]], sep = "")
          rownames(vdf) <- paste("#PC", vdf$npc, "# ", vdf$name, sep = "")
          misc[['pathwayODInfo']] <<- vdf
          
          # collapse gene loading
          if(verbose) {
            message("clustering aspects based on gene loading ... ",appendLF=FALSE)
          }
          tam2 <- pagoda.reduce.loading.redundancy(list(xv=xmv,xvw=matrix(1,ncol=ncol(xmv),nrow=nrow(xmv))),pwpca,NULL,plot=FALSE,distance.threshold=loading.distance.threshold,n.cores=n.cores)
          if(verbose) {
            message(nrow(tam2$xv)," aspects remaining")
          }
          if(verbose) {
            message("clustering aspects based on pattern similarity ... ",appendLF=FALSE)
          }
          tam3 <- pagoda.reduce.redundancy(tam2,distance.threshold=correlation.distance.threshold,top=top.aspects)
          if(verbose) {
            message(nrow(tam3$xv)," aspects remaining\n")
          }
          tam2$xvw <- tam3$xvw <- NULL; # to save space
          tam3$env <- setenv;
          
          # clean up aspect names, as GO ids are meaningless
          names(tam3$cnam) <- rownames(tam3$xv) <- paste0('aspect',1:nrow(tam3$xv))
          
          misc[['pathwayOD']] <<- tam3;
          reductions[[name]] <<- tam3$xv;
          return(invisible(tam3))
        },
        
        getEmbedding=function(type='counts', embeddingType='largeVis', name=NULL, dims=2, M=1, gamma=1/M, perplexity=50, sgd_batches=NULL, diffusion.steps=0, diffusion.power=0.5, 
                              distance='pearson', n.cores = .self$n.cores, n.sgd.cores=n.cores, ... ) {
          
          if(dims<1) stop("dimensions must be >=1")
          if(type=='counts') {
            x <- counts;
          } else {
            if(!type %in% names(reductions)) { stop("reduction ",type,' not found')}
            x <- reductions[[type]]
          }
          if(is.null(name)) { 
            name <- embeddingType 
          }
          
          if(embeddingType=='largeVis') {
            edgeMat <- misc[['edgeMat']][[type]];
            if(is.null(edgeMat)) { stop(paste('KNN graph for type ',type,' not found. Please run makeKnnGraph with type=',type,sep='')) }
            if(is.null(sgd_batches)) { sgd_batches <- nrow(edgeMat)*1e3 }
            #edgeMat <- sparseMatrix(i=xn$s+1,j=xn$e+1,x=xn$rd,dims=c(nrow(x),nrow(x)))
            edgeMat <- (edgeMat + t(edgeMat))/2; # symmetrize
            #edgeMat <- sparseMatrix(i=c(xn$s,xn$e)+1,j=c(xn$e,xn$s)+1,x=c(xn$rd,xn$rd),dims=c(nrow(x),nrow(x)))
            # if(diffusion.steps>0) {
            #   Dinv <- Diagonal(nrow(edgeMat),1/colSums(edgeMat))
            #   Im <- Diagonal(nrow(edgeMat))
            #   W <- (Diagonal(nrow(edgeMat)) + edgeMat %*% Dinv)/2
            #   for(i in 1:diffusion.steps) {
            #     edgeMat <- edgeMat %*% W
            #   }
            # }
            #require(largeVis)
            #if(!is.null(seed)) { set.seed(seed) }
            if(!is.na(perplexity)) {
              wij <- buildWijMatrix(edgeMat,perplexity=perplexity,threads=n.cores)
            } else {
              wij <- edgeMat;
            }
            
            if(diffusion.steps>0) {
              Dinv <- Diagonal(nrow(wij),1/colSums(wij))
              W <- Dinv %*% wij ;
              W <- (1-diffusion.power)*Diagonal(nrow(wij)) + diffusion.power*W
              #W <- (Diagonal(nrow(wij)) + W)/2
              #W <- (Diagonal(nrow(wij)) + sign(W)*(abs(W)^(diffusion.power)))/2
              
              #W <- sign(W)*(abs(W)^diffusion.power)
              #W <- (Diagonal(nrow(wij)) + W)/2
              for(i in 1:diffusion.steps) {
                wij <- wij %*% W
              }
              #browser()
              if(!is.na(perplexity)) {
                wij <- buildWijMatrix(wij,perplexity=perplexity,threads=n.cores)
              }
              
            }
            coords <- projectKNNs(wij = wij, M = M, dim=dims, verbose = TRUE,sgd_batches = sgd_batches,gamma=gamma, seed=1, threads=n.cores, ...)
            colnames(coords) <- rownames(x);
            emb <- embeddings[[type]][[name]] <<- t(coords);
          } else if(embeddingType=='tSNE') {
            if(nrow(x)>4e4) {
              warning('Too many cells to pre-calcualte correlation distances, switching to L2. Please, consider using UMAP.');
              distance <- 'L2';
            }
            
            dup.ids <- which(duplicated(x))
            if (length(dup.ids) > 0) {
              max.vals <- abs(x[dup.ids,] * 0.01)
              x[dup.ids,] <- runif(length(x[dup.ids,]), -max.vals, max.vals)
            }
            
            if (distance=='L2') {
              if(verbose) message("running tSNE using ",n.cores," cores:\n")
              emb <- Rtsne::Rtsne(x, perplexity=perplexity, dims=dims, num_threads=n.cores, ... )$Y;
            } else {
              if(verbose) message('calculating distance ... ');
              if(verbose) message('pearson ...')
              d <- 1-cor(t(x))
              if(verbose) message("running tSNE using ",n.cores," cores:\n")
              emb <- Rtsne::Rtsne(d,is_distance=TRUE, perplexity=perplexity, dims=dims, num_threads=n.cores, ... )$Y;
            }
            rownames(emb) <- rownames(x)
            embeddings[[type]][[name]] <<- emb;
          } else if(embeddingType=='FR') {
            g <- graphs[[type]];
            if(is.null(g)){ stop(paste("generate KNN graph first (type=",type,")",sep=''))}
            emb <- layout.fruchterman.reingold(g, weights=E(g)$weight)
            rownames(emb) <- rownames(x); colnames(emb) <- c("D1","D2")
            embeddings[[type]][[name]] <<- emb;
          } else if (embeddingType == "UMAP") {
            if (!requireNamespace("uwot", quietly=TRUE))
              stop("You need to install package 'uwot' to be able to use UMAP embedding.")
            
            distance <- switch (distance, pearson = "cosine", L2 = "euclidean", distance)
            
            emb <- uwot::umap(as.matrix(x), metric=distance, verbose=verbose, n_threads=n.cores, n_sgd_threads=n.sgd.cores, n_components=dims, ...)
            rownames(emb) <- rownames(x)
            embeddings[[type]][[name]] <<- emb;
          } else if (embeddingType == "UMAP_graph") {
            g <- graphs[[type]];
            if(is.null(g)){ stop(paste("generate KNN graph first (type=",type,")",sep=''))}
            emb <- embedKnnGraphUmap(g, verbose=verbose, n_threads=n.cores, n_sgd_threads=n.sgd.cores, n_components=dims, ...)
            embeddings[[type]][[name]] <<- emb;
          } else {
            stop('unknown embeddingType ',embeddingType,' specified');
          }
          
          return(invisible(emb));
        }
      )
    );
    
    

    相关文章

      网友评论

          本文标题:pagoda1(scde包)+pagoda2

          本文链接:https://www.haomeiwen.com/subject/zmfivktx.html