pagoda1(scde包)+pagoda2

作者: 一只烟酒僧 | 来源:发表于2020-11-03 16:24 被阅读0次

pagoda1(scde包)+pagoda2
单细胞转录组差异表达分析——DEsingle
单细胞转录组数据分析课件||8. Differential ex
单细胞转录组差异表达分析——SCDE
闭包，闭包，闭包
Golang-基础包
swift-闭包
npm使用手册
生信小组DAY6-JANE
“包治百病”却拯救不了你的“烂人生”

最近在使用pagoda做单细胞的基因集分析，不得不说，算法是真的慢，，，

pbmc3k数据集
5个线程跑满
用时为：
knn.error.models
    user   system  elapsed
6033.772   23.041 2105.214

pagoda.varnorm
 user   system  elapsed
3387.265   49.590  734.011

pagoda.pathway.wPCA
 user    system   elapsed
 8713.815 12737.006  1091.431

pagoda.gene.clusters
user    system   elapsed
11324.177  8925.026  1199.914

大概用时：2105+734+1091+1199=5129  约为1.4h

一个快速跑的示例文件

library(scde)
library(SeuratData)
data("pbmc3k")
#导入为count矩阵，同时要改为integer
pbmc3k<-apply(as.matrix(pbmc3k@assays$RNA@counts),2,function(x){storage.mode(x)<-"integer";x})
pbmc3k[1:6,1:66]
#为了加快速度，我取其中的1/10的细胞数用于后续分析
set.seed(2020)
pbmc3k<-pbmc3k[,sample(1:ncol(pbmc3k),round(ncol(pbmc3k)/10),replace = F)]
dim(pbmc3k)
#去掉低质量的细胞和基因
pbmc3k<-clean.counts(pbmc3k,min.lib.size = 200,min.reads = 10)
dim(pbmc3k)
#对每个细胞构建误差模型，
knn <- knn.error.models(pbmc3k, 
                        #groups = ,可选
                        k = ncol(pbmc3k)/4, #猜测整个细胞群可以分为4个群
                        n.cores = 5, #用10个核
                        min.count.threshold = 2, #质控参数参数
                        min.nonfailed = 5, #质控参数
                        max.model.plots = 10)

#Normalizing variance
varinfo <- pagoda.varnorm(knn, #
                          counts = pbmc3k, 
                          trim = 3/ncol(pbmc3k), 
                          max.adj.var = 5, 
                          n.cores = 5, 
                          plot = TRUE)
#Controlling for sequencing depth
varinfo <- pagoda.subtract.aspect(varinfo, colSums(pbmc3k[, rownames(knn)]>0))

#Evaluate overdispersion of pre-defined gene sets
library(org.Hs.eg.db)
library(msigdbr)
C2<-msigdbr(category = "C2")
C2_kegg<-C2[grep("KEGG",C2$gs_subcat),]
C2_kegg<-split(C2_kegg$human_gene_symbol,C2_kegg$gs_name)
C2_kegg
C2_kegg<-list2env(C2_kegg)
#对每个基因集计算加权的第一个主成分的量级
pwpca <- pagoda.pathway.wPCA(varinfo, 
                            C2_kegg,
                             n.components = 1, 
                             n.cores = 5)
#对上述结果做统计学检验
df <- pagoda.top.aspects(pwpca, 
                         return.table = F,
                         plot = F, 
                         z.score = 2)
dim(df)
# #寻找denovo的基因集
clpca <- pagoda.gene.clusters(varinfo, trim = 7.1/ncol(varinfo$mat), n.clusters = 50, n.cores = 10, plot = TRUE)
df <- pagoda.top.aspects(pwpca, clpca, n.cells = NULL, z.score = qnorm(0.01/2, lower.tail = FALSE))
#列先聚类，在做热图（很奇怪！！）
hc <- pagoda.cluster.cells(df, varinfo)
pagoda.view.aspects(df,cell.clustering = hc, box = TRUE, labCol = NA, margins = c(0.5, 20))

pheatmap(pbmc3k[names(df$gw[order(df$gw,decreasing = T)][1:200]),hc$order],
         scale = "row",
         cluster_rows = F,breaks = seq(-5,5,10/100),cluster_cols = F)

scde下的pagoda1实在是太慢了！！！！！
无奈还是要使用pagoda2包进行pagoda分析
但是pagoda2的对象结构有些特殊，属于对象中嵌套操作函数。而比较坑爹的是这种嵌套的操作函数却没有对应的帮助文档，，，，
为此我去浏览了pagoda2工程文件的全部内容

image.png

当我们打开pagoda2/R路径时，进入存放所有脚本的地方

image.png

如果对一些说明文档中未提及的函数感兴趣，可以直接通过.R脚本查看函数的源代码！

    testPathwayOverdispersion=function(setenv, 
                                       type='counts', 
                                       max.pathway.size=1e3, 
                                       min.pathway.size=10, 
                                       n.randomizations=5, 
                                       verbose=FALSE, 
                                       score.alpha=0.05,
                                       plot=FALSE, 
                                       cells=NULL,
                                       adjusted.pvalues=TRUE,
                                       z.score = qnorm(0.05/2, lower.tail = FALSE), 
                                       use.oe.scale = FALSE, return.table=FALSE,
                                       name='pathwayPCA',
                                       correlation.distance.threshold=0.2,
                                       loading.distance.threshold=0.01,
                                       top.aspects=Inf,
                                       recalculate.pca=FALSE,
                                       save.pca=TRUE) {
      nPcs <- 1;
      
      if(type=='counts') {
        x <- counts;
        # apply scaling if using raw counts
        x@x <- x@x*rep(misc[['varinfo']][colnames(x),'gsf'],diff(x@p))
      } else {
        if(!type %in% names(reductions)) { stop("reduction ",type,' not found')}
        x <- reductions[[type]]
      }
      if(!is.null(cells)) {
        x <- x[cells,]
      }
      
      proper.gene.names <- colnames(x);
      
      if(is.null(misc[['pwpca']]) || recalculate.pca) {
        if(verbose) {
          message("determining valid pathways")
        }
        
        # determine valid pathways
        gsl <- ls(envir = setenv)
        gsl.ng <- unlist(mclapply(sn(gsl), function(go) sum(unique(get(go, envir = setenv)) %in% proper.gene.names),mc.cores=n.cores,mc.preschedule=TRUE))
        gsl <- gsl[gsl.ng >= min.pathway.size & gsl.ng<= max.pathway.size]
        names(gsl) <- gsl
        
        if(verbose) {
          message("processing ", length(gsl), " valid pathways")
        }
        
        cm <- Matrix::colMeans(x)
        
        pwpca <- papply(gsl, function(sn) {
          lab <- proper.gene.names %in% get(sn, envir = setenv)
          if(sum(lab)<1) { return(NULL) }
          pcs <- irlba(x[,lab], nv=nPcs, nu=0, center=cm[lab])
          pcs$d <- pcs$d/sqrt(nrow(x))
          pcs$rotation <- pcs$v;
          pcs$v <- NULL;
          
          # get standard deviations for the random samples
          ngenes <- sum(lab)
          z <- do.call(rbind,lapply(seq_len(n.randomizations), function(i) {
            si <- sample(ncol(x), ngenes)
            pcs <- irlba(x[,si], nv=nPcs, nu=0, center=cm[si])$d
          }))
          z <- z/sqrt(nrow(x));
          
          # local normalization of each component relative to sampled PC1 sd
          avar <- pmax(0, (pcs$d^2-mean(z[, 1]^2))/sd(z[, 1]^2))
          
          if(avar>0.5) {
            # flip orientations to roughly correspond with the means
            pcs$scores <- as.matrix(t(x[,lab] %*% pcs$rotation) - as.numeric((cm[lab] %*% pcs$rotation)))
            cs <- unlist(lapply(seq_len(nrow(pcs$scores)), function(i) sign(cor(pcs$scores[i,], colMeans(t(x[, lab, drop = FALSE])*abs(pcs$rotation[, i]))))))
            pcs$scores <- pcs$scores*cs
            pcs$rotation <- pcs$rotation*cs
            rownames(pcs$rotation) <- colnames(x)[lab];
          } # don't bother otherwise - it's not significant
          return(list(xp=pcs,z=z,n=ngenes))
        }, n.cores = n.cores,mc.preschedule=TRUE)
        if(save.pca) {
          misc[['pwpca']] <<- pwpca;
        }
      } else {
        if(verbose) {
          message("re-using previous overdispersion calculations")
          pwpca <- misc[['pwpca']];
        }
      }
      
      if(verbose) {
        message("scoring pathway overdispersion signifcance")
      }
      
      # score overdispersion
      true.n.cells <- nrow(x)
      
      pagoda.effective.cells <- function(pwpca, start = NULL) {
        n.genes <- unlist(lapply(pwpca, function(x) rep(x$n, nrow(x$z))))
        var <- unlist(lapply(pwpca, function(x) x$z[, 1]))
        if(is.null(start)) { start <- true.n.cells*2 } # start with a high value
        of <- function(p, v, sp) {
          sn <- p[1]
          vfit <- (sn+sp)^2/(sn*sn+1/2) -1.2065335745820*(sn+sp)*((1/sn + 1/sp)^(1/3))/(sn*sn+1/2)
          residuals <- (v-vfit)^2
          return(sum(residuals))
        }
        x <- nlminb(objective = of, start = c(start), v = var, sp = sqrt(n.genes-1/2), lower = c(1), upper = c(true.n.cells))
        return((x$par)^2+1/2)
      }
      n.cells <- pagoda.effective.cells(pwpca)
      
      vdf <- data.frame(do.call(rbind, lapply(seq_along(pwpca), function(i) {
        vars <- as.numeric((pwpca[[i]]$xp$d))
        cbind(i = i, var = vars, n = pwpca[[i]]$n, npc = seq(1:ncol(pwpca[[i]]$xp$rotation)))
      })))
      
      # fix p-to-q mistake in qWishartSpike
      qWishartSpikeFixed <- function (q, spike, ndf = NA, pdim = NA, var = 1, beta = 1, lower.tail = TRUE, log.p = FALSE)  {
        params <- RMTstat::WishartSpikePar(spike, ndf, pdim, var, beta)
        qnorm(q, mean = params$centering, sd = params$scaling, lower.tail, log.p)
      }
      
      # add right tail approximation to ptw, which gives up quite early
      pWishartMaxFixed <- function (q, ndf, pdim, var = 1, beta = 1, lower.tail = TRUE) {
        params <- RMTstat::WishartMaxPar(ndf, pdim, var, beta)
        q.tw <- (q - params$centering)/(params$scaling)
        p <- RMTstat::ptw(q.tw, beta, lower.tail, log.p = TRUE)
        p[p == -Inf] <- pgamma((2/3)*q.tw[p == -Inf]^(3/2), 2/3, lower.tail = FALSE, log.p = TRUE) + lgamma(2/3) + log((2/3)^(1/3))
        p
      }
      
      vshift <- 0
      ev <- 0
      
      vdf$var <- vdf$var-(vshift-ev)*vdf$n
      basevar <- 1
      vdf$exp <- RMTstat::qWishartMax(0.5, n.cells, vdf$n, var = basevar, lower.tail = FALSE)
      #vdf$z <- qnorm(pWishartMax(vdf$var, n.cells, vdf$n, log.p = TRUE, lower.tail = FALSE, var = basevar), lower.tail = FALSE, log.p = TRUE)
      vdf$z <- qnorm(pWishartMaxFixed(vdf$var, n.cells, vdf$n, lower.tail = FALSE, var = basevar), lower.tail = FALSE, log.p = TRUE)
      vdf$cz <- qnorm(bh.adjust(pnorm(as.numeric(vdf$z), lower.tail = FALSE, log.p = TRUE), log = TRUE), lower.tail = FALSE, log.p = TRUE)
      vdf$ub <- RMTstat::qWishartMax(score.alpha/2, n.cells, vdf$n, var = basevar, lower.tail = FALSE)
      vdf$ub.stringent <- RMTstat::qWishartMax(score.alpha/nrow(vdf)/2, n.cells, vdf$n, var = basevar, lower.tail = FALSE)
      
      if(plot) {
        par(mfrow = c(1, 1), mar = c(3.5, 3.5, 1.0, 1.0), mgp = c(2, 0.65, 0))
        un <- sort(unique(vdf$n))
        on <- order(vdf$n, decreasing = FALSE)
        pccol <- colorRampPalette(c("black", "grey70"), space = "Lab")(max(vdf$npc))
        plot(vdf$n, vdf$var/vdf$n, xlab = "gene set size", ylab = "PC1 var/n", ylim = c(0, max(vdf$var/vdf$n)), col = adjustcolor(pccol[vdf$npc],alpha=0.1),pch=19)
        lines(vdf$n[on], (vdf$exp/vdf$n)[on], col = 2, lty = 1)
        lines(vdf$n[on], (vdf$ub.stringent/vdf$n)[on], col = 2, lty = 2)
      }
      
      rs <- (vshift-ev)*vdf$n
      vdf$oe <- (vdf$var+rs)/(vdf$exp+rs)
      vdf$oec <- (vdf$var+rs)/(vdf$ub+rs)
      
      df <- data.frame(name = names(pwpca)[vdf$i], npc = vdf$npc, n = vdf$n, score = vdf$oe, z = vdf$z, adj.z = vdf$cz, stringsAsFactors = FALSE)
      if(adjusted.pvalues) {
        vdf$valid <- vdf$cz  >=  z.score
      } else {
        vdf$valid <- vdf$z  >=  z.score
      }
      
      if(!any(vdf$valid)) { stop("no significantly overdispersed pathways found at z.score threshold of ",z.score) };
      
      # apply additional filtering based on >0.5 sd above the local random estimate
      vdf$valid <- vdf$valid & unlist(lapply(pwpca,function(x) !is.null(x$xp$scores)))
      vdf$name <- names(pwpca)[vdf$i];
      
      if(return.table) {
        df <- df[vdf$valid, ]
        df <- df[order(df$score, decreasing = TRUE), ]
        return(df)
      }
      if(verbose) {
        message("compiling pathway reduction")
      }
      # calculate pathway reduction matrix
      
      # return scaled patterns
      xmv <- do.call(rbind, lapply(pwpca[vdf$valid], function(x) {
        xm <- x$xp$scores
      }))
      
      if(use.oe.scale) {
        xmv <- (xmv -rowMeans(xmv))* (as.numeric(vdf$oe[vdf$valid])/sqrt(apply(xmv, 1, var)))
        vdf$sd <- as.numeric(vdf$oe)
      } else {
        # chi-squared
        xmv <- (xmv-rowMeans(xmv)) * sqrt((qchisq(pnorm(vdf$z[vdf$valid], lower.tail = FALSE, log.p = TRUE), n.cells, lower.tail = FALSE, log.p = TRUE)/n.cells)/apply(xmv, 1, var))
        vdf$sd <- sqrt((qchisq(pnorm(vdf$z, lower.tail = FALSE, log.p = TRUE), n.cells, lower.tail = FALSE, log.p = TRUE)/n.cells))
        
      }
      rownames(xmv) <- paste("#PC", vdf$npc[vdf$valid], "# ", names(pwpca)[vdf$i[vdf$valid]], sep = "")
      rownames(vdf) <- paste("#PC", vdf$npc, "# ", vdf$name, sep = "")
      misc[['pathwayODInfo']] <<- vdf
      
      # collapse gene loading
      if(verbose) {
        message("clustering aspects based on gene loading ... ",appendLF=FALSE)
      }
      tam2 <- pagoda.reduce.loading.redundancy(list(xv=xmv,xvw=matrix(1,ncol=ncol(xmv),nrow=nrow(xmv))),pwpca,NULL,plot=FALSE,distance.threshold=loading.distance.threshold,n.cores=n.cores)
      if(verbose) {
        message(nrow(tam2$xv)," aspects remaining")
      }
      if(verbose) {
        message("clustering aspects based on pattern similarity ... ",appendLF=FALSE)
      }
      tam3 <- pagoda.reduce.redundancy(tam2,distance.threshold=correlation.distance.threshold,top=top.aspects)
      if(verbose) {
        message(nrow(tam3$xv)," aspects remaining\n")
      }
      tam2$xvw <- tam3$xvw <- NULL; # to save space
      tam3$env <- setenv;
      
      # clean up aspect names, as GO ids are meaningless
      names(tam3$cnam) <- rownames(tam3$xv) <- paste0('aspect',1:nrow(tam3$xv))
      
      misc[['pathwayOD']] <<- tam3;
      reductions[[name]] <<- tam3$xv;
      return(invisible(tam3))
    },

#' @useDynLib pagoda2
#' @import MASS
#' @import Matrix
#' @importFrom Rcpp evalCpp
#' @import Rook
#' @import igraph
#' @import sccore
#' @importFrom irlba irlba
#' @importFrom parallel mclapply
#' @importFrom Rcpp sourceCpp
#' @importFrom magrittr %>%
NULL

#' A Reference Class, which holds and process single cell RNA-seq data.
#'
#' @field counts gene count matrix, normalized on total counts
#' @field clusters results of clustering
#' @field graphs graph representations of the dataset
#' @field reductions results of reductions, i.e. PCA
#' @field embeddings results of visualization algorithms, i.e. t-SNE or largeVis
#' @field diffgenes lists of differentially expressed genes
#' @field pathways pathway information
#' @field n.cores number of cores, used for the analyses
#' @field misc a list of miscelaneous structures
#' @field batch batch factor for the dataset
#' @field modelType plain (default) or raw (expression matrix taken as is without normalization, though log.scale still applies)
#' @field verbose verbosity level
#' @field depth cell size factor
#' @field genegraphs a slot to store graphical representations in gene space (i.e. gene kNN graphs)
#' @export Pagoda2
#' @exportClass Pagoda2
Pagoda2 <- setRefClass(
  "Pagoda2",
  
  fields=c('counts','clusters','graphs','reductions','embeddings','diffgenes',
           'pathways','n.cores','misc','batch','modelType','verbose','depth','batchNorm','mat','genegraphs'),
  
  methods = list(
    initialize=function(x, ..., modelType='plain', batchNorm='glm',
                        n.cores=parallel::detectCores(logical=FALSE), verbose=TRUE,
                        min.cells.per.gene=0, trim=round(min.cells.per.gene/2), min.transcripts.per.cell=10,
                        lib.sizes=NULL, log.scale=TRUE, keep.genes = NULL) {
      # # init all the output lists
      embeddings <<- list();
      graphs <<- list();
      diffgenes <<- list();
      reductions <<-list();
      clusters <<- list();
      pathways <<- list();
      genegraphs <<- list();
      misc <<-list(lib.sizes=lib.sizes, log.scale=log.scale, model.type=modelType, trim=trim);
      batch <<- NULL;
      counts <<- NULL;
      
      
      if(!missing(x) && ('Pagoda2' %in% class(x))) { # copy constructor
        callSuper(x, ..., modelType=modelType, batchNorm=batchNorm, n.cores=n.cores);
      } else {
        callSuper(..., modelType=modelType, batchNorm=batchNorm, n.cores=n.cores,verbose=verbose);
        if(!missing(x) && is.null(counts)) { # interpret x as a countMatrix
          if ('matrix' %in% class(x)) {
            x <- as(Matrix(x, sparse=TRUE), "dgCMatrix")
          }
          if(!('dgCMatrix' %in% class(x))) {
            stop("x is not of class dgCMatrix or matrix");
          }
          #if(any(x@x < 0)) {
          #  stop("x contains negative values");
          #}
          setCountMatrix(x, min.cells.per.gene=min.cells.per.gene, trim=trim, 
                         min.transcripts.per.cell=min.transcripts.per.cell, lib.sizes=lib.sizes,
                         log.scale=log.scale, keep.genes=keep.genes)
        }
      }
    },
    
    # provide the initial count matrix, and estimate deviance residual matrix (correcting for depth and batch)
    setCountMatrix=function(countMatrix, depthScale=1e3, min.cells.per.gene=30,
                            trim=round(min.cells.per.gene/2), min.transcripts.per.cell=10, 
                            lib.sizes=NULL, log.scale=FALSE, keep.genes = NULL) {
      # check names
      if(any(duplicated(rownames(countMatrix)))) {
        stop("duplicate gene names are not allowed - please reduce")
      }
      if(any(duplicated(colnames(countMatrix)))) {
        stop("duplicate cell names are not allowed - please reduce")
      }
      
      if(any(is.na(rownames(countMatrix)))) {
        stop("NA gene names are not allowed - please fix")
      }
      if(any(is.na(colnames(countMatrix)))) {
        stop("NA cell names are not allowed - please fix")
      }
      
      if(ncol(countMatrix)<3) { stop("too few cells remaining after min.count.per.cell filter applied - have you pre-filtered the count matrix to include only cells of a realistic size?") }
      
      counts <<- t(countMatrix)
      
      # Keep genes of sufficient coverage or genes that are in the keep.genes list
      counts <<- counts[,diff(counts@p) >= min.cells.per.gene | colnames(counts) %in% keep.genes]
      
      # Save the filtered count matrix in misc$rawCounts
      misc[['rawCounts']] <<- counts;
      misc$depthScale <<- depthScale;
      
      if (modelType == 'raw') {
        return()
      }
      
      if(!is.null(batch)) {
        if(!all(colnames(countMatrix) %in% names(batch))) { 
          stop("the supplied batch vector doesn't contain all the cells in its names attribute")
        }
        colBatch <- as.factor(batch[colnames(countMatrix)])
        batch <<- colBatch;
      }
      
      if(!is.null(lib.sizes)) {
        if(!all(colnames(countMatrix) %in% names(lib.sizes))) { 
          stop("the supplied lib.sizes vector doesn't contain all the cells in its names attribute")
        }
        lib.sizes <- lib.sizes[colnames(countMatrix)]
        depth <<- lib.sizes/mean(lib.sizes)*mean(Matrix::colSums(countMatrix))
      } else {
        depth <<- Matrix::colSums(countMatrix);
      }
      
      cell.filt.mask <- (depth >= min.transcripts.per.cell)
      counts <<- counts[cell.filt.mask,]
      depth <<- depth[cell.filt.mask]
      
      if (any(depth == 0)) {
        stop("Cells with zero expression over all genes are not allowed")
      }
      
      if(verbose) message(nrow(counts)," cells, ",ncol(counts)," genes; normalizing ... ")
      
      # get normalized matrix
      if(modelType=='linearObs') { # this shoudln't work well, since the depth dependency is not completely normalized out
        
        # winsorize in normalized space first in hopes of getting a more stable depth estimate
        if(trim>0) {
          counts <<- counts / as.numeric(depth);
          inplaceWinsorizeSparseCols(counts,trim,n.cores);
          counts <<- counts*as.numeric(depth);
          if(is.null(lib.sizes)) {
            depth <<- round(Matrix::rowSums(counts))
          }
        }
        
        ldepth <- log(depth);
        
        # rank cells, cut into n pieces
        n.depth.slices <- 20;
        #depth.fac <- as.factor(floor(rank(depth)/(length(depth)+1)*n.depth.slices)+1); names(depth.fac) <- rownames(counts);
        depth.fac <- cut(cumsum(sort(depth)),breaks=seq(0,sum(depth),length.out=n.depth.slices)); names(depth.fac) <- rownames(counts);
        depth.fac <- depth.fac[rank(depth)]
        # dataset-wide gene average
        gene.av <- (Matrix::colSums(counts)+n.depth.slices)/(sum(depth)+n.depth.slices)
        
        # pooled counts, df for all genes
        tc <- colSumByFac(counts,as.integer(depth.fac))[-1,,drop=FALSE]
        tc <- log(tc+1)- log(as.numeric(tapply(depth,depth.fac,sum))+1)
        md <- log(as.numeric(tapply(depth,depth.fac,mean)))
        # combined lm
        cm <- lm(tc ~ md)
        colnames(cm$coef) <- colnames(counts)
        # adjust counts
        # predict log(p) for each non-0 entry
        count.gene <- rep(1:counts@Dim[2],diff(counts@p))
        exp.x <- exp(log(gene.av)[count.gene] - cm$coef[1,count.gene] - ldepth[counts@i+1]*cm$coef[2,count.gene])
        counts@x <<- as.numeric(counts@x*exp.x/(depth[counts@i+1]/depthScale)); # normalize by depth as well
        # performa another round of trim
        if(trim>0) {
          inplaceWinsorizeSparseCols(counts,trim,n.cores);
        }
        
        
        # regress out on non-0 observations of ecah gene
        #non0LogColLmS(counts,mx,ldepth)
      } else if(modelType=='plain') {
        if(verbose) message("using plain model ")
        
        if(!is.null(batch)) {
          if(verbose) message("batch ... ")
          
          # dataset-wide gene average
          gene.av <- (Matrix::colSums(counts)+length(levels(batch)))/(sum(depth)+length(levels(batch)))
          
          # pooled counts, df for all genes
          tc <- colSumByFac(counts,as.integer(batch))[-1,,drop=FALSE]
          tc <- t(log(tc+1)- log(as.numeric(tapply(depth,batch,sum))+1))
          bc <- exp(tc-log(gene.av))
          
          # adjust every non-0 entry
          count.gene <- rep(1:counts@Dim[2],diff(counts@p))
          
          counts@x <<- as.numeric(counts@x/bc[cbind(count.gene,as.integer(batch)[counts@i+1])])
        }
        
        if(trim>0) {
          if(verbose) message("winsorizing ... ")
          counts <<- counts/as.numeric(depth);
          
          inplaceWinsorizeSparseCols(counts,trim,n.cores);
          counts <<- counts*as.numeric(depth);
          
          if(is.null(lib.sizes)) {
            depth <<- round(Matrix::rowSums(counts))
          }
        }
        
        counts <<- counts/as.numeric(depth/depthScale);
      } else {
        stop('modelType ',modelType,' is not implemented');
      }
      if(log.scale) {
        if(verbose) message("log scale ... ")
        counts@x <<- as.numeric(log(counts@x+1))
      }
      misc[['rescaled.mat']] <<- NULL;
      if(verbose) message("done.\n")
    },
    
    # adjust variance of the residual matrix, determine overdispersed sites
    adjustVariance=function(gam.k=5, alpha=5e-2, plot=FALSE, use.raw.variance=(.self$modelType=='raw'), use.unadjusted.pvals=FALSE, do.par=TRUE, max.adjusted.variance=1e3, min.adjusted.variance=1e-3, cells=NULL, verbose=TRUE, min.gene.cells=0, persist=is.null(cells), n.cores = .self$n.cores) {
      #persist <- is.null(cells) # persist results only if variance normalization is performed for all cells (not a subset)
      if(!is.null(cells)) { # translate cells into a rowSel boolean vector
        if(!(is.logical(cells) && length(cells)==nrow(counts))) {
          if(is.character(cells) || is.integer(cells)) {
            rowSel <- rep(FALSE,nrow(counts)); names(rowSel) <- rownames(counts); rowSel[cells] <- TRUE;
          } else {
            stop("cells argument must be either a logical vector over rows of the count matrix (cells), a vector of cell names or cell integer ids (row numbers)");
          }
        }
      } else {
        rowSel <- NULL;
      }
      
      if(verbose) message("calculating variance fit ...")
      df <- colMeanVarS(counts,rowSel,n.cores);
      
      if(use.raw.variance) { # use raw variance estimates without relative adjustments
        rownames(df) <- colnames(counts);
        vi <- which(is.finite(df$v) & df$nobs>=min.gene.cells);
        df$lp <- df$lpa <- log(df$v);
        df$qv <- df$v
        df$gsf <- 1; # no rescaling of variance
        ods <- order(df$v,decreasing=TRUE); if(length(ods)>1e3) { ods <- ods[1:1e3] }
        if(persist) misc[['odgenes']] <<- rownames(df)[ods];
      } else {
        # gene-relative normalizaton 
        df$m <- log(df$m); df$v <- log(df$v);
        rownames(df) <- colnames(counts);
        vi <- which(is.finite(df$v) & df$nobs>=min.gene.cells);
        if(length(vi)<gam.k*1.5) { gam.k=1 };# too few genes
        if(gam.k<2) {
          if(verbose) message(" using lm ")
          m <- lm(v ~ m, data = df[vi,])
        } else {
          if(verbose) message(" using gam ")
          m <- mgcv::gam(as.formula(paste0('v ~ s(m, k = ',gam.k,')')), data = df[vi,])
        }
        df$res <- -Inf;  df$res[vi] <- resid(m,type='response')
        n.obs <- df$nobs; #diff(counts@p)
        suppressWarnings(df$lp <- as.numeric(pf(exp(df$res),n.obs,n.obs,lower.tail=FALSE,log.p=TRUE)))
        df$lpa <- bh.adjust(df$lp,log=TRUE)
        n.cells <- nrow(counts)
        df$qv <- as.numeric(qchisq(df$lp, n.cells-1, lower.tail = FALSE, log.p=TRUE)/n.cells)
        
        if(use.unadjusted.pvals) {
          ods <- which(df$lp<log(alpha))
        } else {
          ods <- which(df$lpa<log(alpha))
        }
        
        if(persist) misc[['odgenes']] <<- rownames(df)[ods];
        if(verbose) message(length(ods),' overdispersed genes ... ',length(ods) )
        
        df$gsf <- geneScaleFactors <- sqrt(pmax(min.adjusted.variance,pmin(max.adjusted.variance,df$qv))/exp(df$v));
        df$gsf[!is.finite(df$gsf)] <- 0;
      }
      
      if(persist) {
        if(verbose) message('persisting ... ')
        misc[['varinfo']] <<- df;
      }
      
      # rescale mat variance
      ## if(rescale.mat) {
      ##   if(verbose) cat("rescaling signal matrix ... ")
      ##   #df$gsf <- geneScaleFactors <- sqrt(1/exp(df$v));
      ##   inplaceColMult(counts,geneScaleFactors,rowSel);  # normalize variance of each gene
      ##   #inplaceColMult(counts,rep(1/mean(Matrix::colSums(counts)),ncol(counts))); # normalize the column sums to be around 1
      ##   if(persist) misc[['rescaled.mat']] <<- geneScaleFactors;
      ## }
      if(plot) {
        if(do.par) {
          par(mfrow=c(1,2), mar = c(3.5,3.5,2.0,0.5), mgp = c(2,0.65,0), cex = 1.0);
        }
        ## convert to log10 from natural log() by factor log10(exp(1))
        smoothScatter(log10(exp(1))*df$m,log10(exp(1))*df$v,main='',xlab='log10[ magnitude ]',ylab='log10[ variance ]')
        grid <- seq(min(log10(exp(1))*df$m[vi]),max(log10(exp(1))*df$m[vi]),length.out=1000)
        lines(grid,predict(m,newdata=data.frame(m=grid)),col="blue")
        ## NOTE: overdispersed genes ods calcuated with log()-based corrections...
        if(length(ods)>0) {
          points(log10(exp(1))*df$m[ods],log10(exp(1))*df$v[ods],pch='.',col=2,cex=1)
        }
        smoothScatter(log10(exp(1))*df$m[vi],log10(exp(1))*df$qv[vi],xlab='log10[ magnitude ]',ylab='',main='adjusted')
        abline(h=1,lty=2,col=8)
        if(is.finite(max.adjusted.variance)) { abline(h=max.adjusted.variance,lty=2,col=1) }
        points(log10(exp(1))*df$m[ods],log10(exp(1))*df$qv[ods],col=2,pch='.')
      }
      if(verbose) message("done.\n")
      return(invisible(df));
    },
    # make a Knn graph
    # note: for reproducibility, set.seed() and set n.cores=1
    makeKnnGraph=function(k=30,nrand=1e3,type='counts',weight.type='1m',odgenes=NULL,n.cores=.self$n.cores,distance='cosine',center=TRUE,x=NULL,verbose=TRUE,p=NULL, var.scale=(type == "counts")) {
      if(is.null(x)) {
        x.was.given <- FALSE;
        if(type=='counts') {
          x <- counts;
          # Scale Raw counts
        } else {
          if(type %in% names(reductions)) {
            x <- reductions[[type]];
          } else {
            stop('Specified reduction does not exist');
          }
        }
        
        if (var.scale) {
          x@x <- x@x*rep(misc[['varinfo']][colnames(x),'gsf'],diff(x@p))
        }
        
        if(!is.null(odgenes)) {
          if(!all(odgenes %in% rownames(x))) { warning("not all of the provided odgenes are present in the selected matrix")}
          if(verbose) message("using provided odgenes ... ")
          x <- x[,odgenes]
        }
        
      } else { # is.null(x)
        x.was.given <- TRUE;
      }
      
      if(distance %in% c('cosine','angular')) {
        if(center) {
          x<- x - Matrix::rowMeans(x) # centering for consine distance
        }
        xn <- n2Knn(as.matrix(x), k, nThreads=n.cores, verbose=verbose, indexType='angular')
      } else if(distance %in% c('L2','euclidean')) {
        xn <- n2Knn(as.matrix(x), k, nThreads=n.cores, verbose=verbose, indexType='L2')
      } else {
        stop("unknown distance measure specified. Currently supported: angular, L2")
      }
      colnames(xn) <- rownames(xn) <- rownames(x);
      
      #if(weight.type=='rank') {
      #  xn$r <-  unlist(lapply(diff(c(0,which(diff(xn$s)>0),nrow(xn))),function(x) seq(x,1)))
      #}
      #xn <- xn[!xn$s==xn$e,]
      diag(xn) <- 0;
      xn <- Matrix::drop0(xn);
      
      #if(n.cores==1) { # for reproducibility, sort by node names
      #  if(verbose) cat("ordering neighbors for reproducibility ... ");
      #  xn <- xn[order(xn$s+xn$e),]
      #  if(verbose) cat("done\n");
      #}
      #df <- data.frame(from=rownames(x)[xn$s+1],to=rownames(x)[xn$e+1],weight=xn$d,stringsAsFactors=F)
      #if(weight.type=='rank') { df$rank <- xn$r }
      
      if(weight.type %in% c("cauchy","normal") && ncol(x)>sqrt(nrand)) {
        # generate some random pair data for scaling
        if(distance=='cosine') {
          #rd <- na.omit(apply(cbind(sample(colnames(x),nrand,replace=T),sample(colnames(x),nrand,replace=T)),1,function(z) if(z[1]==z[2]) {return(NA); } else {1-cor(x[,z[1]],x[,z[2]])}))
          rd <- na.omit(apply(cbind(sample(colnames(x),nrand,replace=TRUE),sample(colnames(x),nrand,replace=TRUE)),1,function(z) if(z[1]==z[2]) {return(NA); } else {1-sum(x[,z[1]]*x[,z[2]])/sqrt(sum(x[,z[1]]^2)*sum(x[,z[2]]^2))}))
        } else if(distance=='JS') {
          rd <- na.omit(apply(cbind(sample(colnames(x),nrand,replace=TRUE),sample(colnames(x),nrand,replace=TRUE)),1,function(z) if(z[1]==z[2]) {return(NA); } else {jw.disR(x[,z[1]],x[,z[2]])}))
        } else if(distance=='L2') {
          rd <- na.omit(apply(cbind(sample(colnames(x),nrand,replace=TRUE),sample(colnames(x),nrand,replace=TRUE)),1,function(z) if(z[1]==z[2]) {return(NA); } else {sqrt(sum((x[,z[1]]-x[,z[2]])^2))}))
        } else if(distance=='L1') {
          rd <- na.omit(apply(cbind(sample(colnames(x),nrand,replace=TRUE),sample(colnames(x),nrand,replace=TRUE)),1,function(z) if(z[1]==z[2]) {return(NA); } else {sum(abs(x[,z[1]]-x[,z[2]]))}))
        }
        suppressWarnings(rd.model <- MASS::fitdistr(rd,weight.type))
        if(weight.type=='cauchy') {
          xn@x <- 1/pcauchy(xn@x,location=rd.model$estimate['location'],scale=rd.model$estimate['scale'])-1
        } else {
          xn@x <- 1/pnorm(xn@x,mean=rd.model$estimate['mean'],sd=rd.model$estimate['sd'])-1
        }
      }
      xn@x <- pmax(0,xn@x);
      if(weight.type=='constant') { xn@x <- 1}
      if(weight.type=='1m') { xn@x <- pmax(0,1-xn@x) }
      #if(weight.type=='rank') { xn@x <- sqrt(df$rank) };
      # make a weighted edge matrix for the largeVis as well
      sxn <- (xn+t(xn))/2;
      g <- igraph::graph_from_adjacency_matrix(sxn,mode='undirected',weighted=TRUE)
      if(!x.was.given) {
        if(is.null(misc[['edgeMat']])) { misc[['edgeMat']] <<- list() }
        misc[['edgeMat']][[type]] <<- xn;
        graphs[[type]] <<- g;
      }
      return(invisible(g))
    },
    # calculate clusters based on the kNN graph
    getKnnClusters=function(type='counts',method=igraph::multilevel.community, name='community', subsampling.rate=0.8, n.subsamplings=10, cluster.stability.threshold=0.95, n.cores=.self$n.cores, g=NULL, min.cluster.size=1, persist=TRUE, return.details=FALSE, ...) {
      
      if(is.null(g)) {
        if(is.null(graphs[[type]])) { stop("call makeKnnGraph(type='",type,"', ...) first")}
        g <- graphs[[type]];
      }
      
      if(is.null(method)) {
        if(length(vcount(g))<2000) {
          method <- igraph::infomap.community;
        } else {
          method <- igraph::multilevel.community;
        }
      }
      
      # method <- igraph::multilevel.community; n.cores <- 20
      # n.subsamplings <- 10; cluster.stability.dilution <- 1.5; cluster.stability.fraction <- 0.9; subsampling.rate <- 0.8; metaclustering.method<- 'ward.D'
      # g <- r$graphs$PCA
      # x <- r$counts
      # cls <- method(g)
      
      #library(parallel)
      #x <- mclapply(1:5,function(z) method(g),mc.cores=20)
      
      cls <- method(g,...)
      cls.groups <- as.factor(membership(cls));
      
      # cleanup the clusters to remove very small ones
      if(min.cluster.size>1) {
        cn <- names(cls.groups);
        vg <- which(unlist(tapply(cls.groups,cls.groups,length))>=min.cluster.size);
        cls.groups <- as.integer(cls.groups); cls.groups[!cls.groups %in% vg] <- NA;
        cls.groups <- as.factor(cls.groups);
        names(cls.groups) <- cn;
      }
      
      if(persist) {
        clusters[[type]][[name]] <<- cls.groups;
        misc[['community']][[type]][[name]] <<- cls;
      }
      return(invisible(cls))
    },
    
    geneKnnbyPCA = function() {
      warning('geneKnnbyPCA is deprecated use makeGeneKnnGraph() instead');
      .self$makeGeneKnnGraph();
    },
    
    # take a given clustering and generate a hierarchical clustering
    # TODO: add support for innerOrder
    getHierarchicalDiffExpressionAspects = function(type='counts',groups=NULL,clusterName=NULL,method='ward.D', dist='pearson', persist=TRUE, z.threshold=2, n.cores=.self$n.cores, min.set.size=5 ) {
      if(type=='counts') {
        x <- counts;
      } else {
        x <- reductions[[type]];
      }
      if(is.null(groups)) {
        # retrieve clustering
        if(is.null(clusters)) stop("please generate clusters first")
        if(is.null(clusters[[type]])) stop(paste("please generate clusters for",type,"first"))
        if(is.null(clusterName)) { # use the first clustering
          if(length(clusters[[type]])<1) stop(paste("please generate clusters for",type,"first"))
          cl <- clusters[[type]][[1]];
        } else {
          cl <- clusters[[type]][[clusterName]]
          if(is.null(cl)) stop(paste("unable to find clustering",clusterName,'for',type))
          if(verbose) message("using ",clusterName," clustering for ",type," space\n")
        }
      } else {
        if(!all(rownames(x) %in% names(groups))) { warning("provided cluster vector doesn't list groups for all of the cells")}
        cl <- groups;
      }
      cl <- as.factor(cl[match(rownames(x),names(cl))]);
      
      if(dist %in% c('pearson','spearman')) {
        rx <- do.call(cbind,tapply(1:nrow(x),cl,function(ii) {
          if (length(ii) > 1) {
            Matrix::colMeans(x[ii,])
          } else {
            x[ii,]
          }
        }))
        d <- as.dist(1-cor(rx,method=dist))
      } else if(dist=='euclidean') {
        rx <- do.call(rbind,tapply(1:nrow(x),cl,function(ii) {
          if (legnth(ii) > 1) {
            Matrix::colMeans(x[ii,])
          } else {
            x[ii,]
          }
        }))
        d <- dist(rx)
      } else if(dist=='JS') {
        # this one has to be done on counts, even if we're working with a reduction
        cl <- as.factor(cl[match(rownames(misc[['rawCounts']]),names(cl))]);
        lvec <- colSumByFac(misc[['rawCounts']],as.integer(cl))[-1,] + 1
        d <- jsDist(t(lvec/pmax(1,Matrix::rowSums(lvec))));
        colnames(d) <- rownames(d) <- which(table(cl)>0)
        d <- as.dist(d)
      } else {
        stop("unknown distance",dist,"requested")
      }
      
      dd <- as.dendrogram(hclust(d,method=method))
      
      # walk down the dendrogram to generate diff. expression on every split
      diffcontrasts <- function(l,env) {
        v <- mget("contrasts",envir=env,ifnotfound=0)[[1]]
        if(!is.list(v)) v <- list();
        if(is.leaf(l)) return(NULL)
        lcl <- rep(NA,nrow(x));
        names(lcl) <- rownames(x)
        lcl[names(lcl) %in% names(cl)[cl %in% unlist(l[[1]])]] <- paste(unlist(l[[1]]),collapse='.');
        lcl[names(lcl) %in% names(cl)[cl %in% unlist(l[[2]])]] <- paste(unlist(l[[2]]),collapse='.');
        v <- c(v,list(as.factor(lcl)))
        assign("contrasts",v,envir=env)
        return(1);
      }
      
      de <- environment()
      assign('contrasts',NULL,envir=de)
      dc <- dendrapply(dd,diffcontrasts,env=de)
      dc <- get("contrasts",env=de)
      names(dc) <- unlist(lapply(dc,function(x) paste(levels(x),collapse=".vs.")))
      
      #dexp <- papply(dc,function(x) getDifferentialGenes(groups=x,z.threshold=z.threshold),n.cores=n.cores)
      
      x <- counts;
      x@x <- x@x*rep(misc[['varinfo']][colnames(x),'gsf'],diff(x@p)); # apply variance scaling
      x <- t(x)
      dexp <- papply(dc,function(g) {
        dg <- getDifferentialGenes(groups=g,z.threshold=z.threshold)
        dg <- lapply(dg,function(x) x[x$Z>=z.threshold,])
        # calculate average profiles
        x <- x[rownames(x) %in% unlist(lapply(dg,rownames)),]
        if(nrow(x)<1) return(NULL);
        x <- x-rowMeans(x[,!is.na(g)])
        sf <- rep(1,nrow(x)); names(sf) <- rownames(x);
        if(nrow(dg[[1]])>0) {
          ig <- which(names(sf) %in% rownames(dg[[1]]))
          sf[ig] <- 1/length(ig)
        }
        if(nrow(dg[[2]])>0) {
          ig <- which(names(sf) %in% rownames(dg[[2]]))
          sf[ig] <- -1/length(ig)
        }
        sf <- sf*sqrt(length(sf))
        pt <- colSums(x*sf)
        pt[is.na(g)] <- 0;
        return(list(dg=dg,pt=pt))
      },n.cores=n.cores)
      
      
      dexp <- dexp[!unlist(lapply(dexp,is.null))]; # remove cases where nothing was reported
      dexp <- dexp[!unlist(lapply(dexp,function(x){class(x) == 'try-error'}))] ## remove cases that failed
      
      # fake pathwayOD output
      tamr <- list(xv=do.call(rbind,lapply(dexp,function(x) x$pt)),
                   cnam=lapply(sn(names(dexp)),function(n) c(n)))
      
      dgl <- lapply(dexp,function(d) as.character(unlist(lapply(d$dg,function(x) rownames(x)[x$Z>=z.threshold]))))
      tamr$env <- list2env(dgl[unlist(lapply(dgl,length))>=min.set.size])
      
      misc[['pathwayOD']] <<- tamr;
      
      # fake pathwayODInfo
      zs <- unlist(lapply(dexp,function(x) max(unlist(lapply(x$dg,function(y) y$Z)))))
      mval <- unlist(lapply(dexp,function(x) max(unlist(lapply(x$dg,function(y) y$M)))))
      vdf <- data.frame(i=1:nrow(tamr$xv),npc=1,valid=TRUE,sd=apply(tamr$xv,1,sd),cz=zs,z=zs,oe=mval,n=unlist(lapply(dexp,function(x) sum(unlist(lapply(x$dg,nrow))))))
      vdf$name <- rownames(vdf) <- names(dexp);
      misc[['pathwayODInfo']] <<- vdf
      
      return(invisible(tamr))
    },
    
    # Calculates gene Knn network for gene similarity
    # Author: Simon Steiger
    makeGeneKnnGraph = function(nPcs = 100, scale =TRUE , center=TRUE, fastpath =TRUE, maxit =1000, k = 30, n.cores = .self$n.cores, verbose =TRUE) {
      # Transpose first
      x <- t(counts);
      
      # TODO: factor out gene PCA calculation
      # Do the PCA
      nPcs <- min(nrow(x)-1,ncol(x)-1,nPcs);
      if (center) {
        cm <- Matrix::colMeans(x);
        pcs <- irlba(x, nv=nPcs, nu =0, center=cm, right_only = FALSE, fastpath = fastpath, maxit= maxit, reorth = TRUE);
      } else {
        pcs <- irlba(x, nv=nPcs, nu =0, right_only = FALSE, fastpath = fastpath, maxit= maxit, reorth = TRUE);
      }
      rownames(pcs$v) <- colnames(x);
      
      # Optional centering
      if (center) {
        pcs$center <- cm;
        pcas <- as.matrix(t(t(x %*% pcs$v) - t(cm  %*% pcs$v)))
      } else {
        pcas <- as.matrix(x %*% pcs$v);
      }
      
      # Keep the names
      rownames(pcas) <- rownames(x);
      colnames(pcas) <- paste0('PC',seq(ncol(pcas)));
      
      # Save into genegraphs slot
      #genegraphs$genePCs <<- pcs;
      #genegraphs$geneRotated <<- pcas;
      
      # Using cosine distance only here
      if(center) {
        pcas <- pcas - Matrix::rowMeans(pcas)
      }
      xn <- n2Knn(pcas, k, nThreads= n.cores, verbose=verbose)
      diag(xn) <- 0; # Remove self edges
      xn <- as(xn,'dgTMatrix'); # will drop 0s
      # Turn into a dataframe, convert from correlation distance into weight
      df <- data.frame('from'=rownames(pcas)[xn@i+1],'to'=rownames(pcas)[xn@j+1],'w'=pmax(1-xn@x,0),stringsAsFactors=FALSE)
      
      genegraphs$graph <<- df;
    },
    
    # calculate density-based clusters
    getDensityClusters=function(type='counts', embeddingType=NULL, name='density', v=0.7, s=1, ...) {
      if (!requireNamespace("dbscan", quietly = TRUE)) {
        stop("Package \"dbscan\" needed for this function to work. Please install it.", call. = FALSE)
      }
      
      if(is.null(embeddings[[type]])) { stop("first, generate embeddings for type ",type)}
      if(is.null(embeddingType)) {
        # take the first one
        embeddingType <- names(embeddings[[type]])[1]
        if(verbose) message("using ",embeddingType," embedding\n")
        emb <- embeddings[[type]][[embeddingType]]
        
      } else {
        emb <- embeddings[[type]][[embeddingType]]
        if(is.null(emb)) { stop("embedding ",embeddingType," for type ", type," doesn't exist")}
      }
      
      cl <- dbscan::dbscan(emb, ...)$cluster;
      cols <- rainbow(length(unique(cl)),v=v,s=s)[cl+1];    cols[cl==0] <- "gray70"
      names(cols) <- rownames(emb);
      clusters[[type]][[name]] <<- cols;
      misc[['clusters']][[type]][[name]] <<- cols;
      return(invisible(cols))
    },
    # determine subpopulation-specific genes
    
    getDifferentialGenes=function(type='counts',clusterType=NULL,groups=NULL,name='customClustering', z.threshold=3,upregulated.only=FALSE,verbose=FALSE, append.specificity.metrics=TRUE, append.auc=FALSE) {
      "Determine differentially expressed genes, comparing each group against all others using Wilcoxon rank sum test\n
      - type data type (currently only default 'counts' is supported)\n
      - clusterType optional cluster type to use as a group-defining factor\n
      - groups explicit cell group specification - a named cell factor (use NA in the factor to exclude cells from the comparison)\n
      - name name slot to store the results in\n
      - z.threshold minimal absolute Z score (adjusted) to report\n
      - upregulated.only whether to report only genes that are expressed significantly higher in each group\n
      - verbose verbose flag\n
      return a list, with each element of the list corresponding to a cell group in the provided/used factor (i.e. factor levels); Each element of a list is a data frame listing the differentially epxressed genes (row names), with the following columns: Z - adjusted Z score, with positive values indicating higher expression in a given group compare to the rest; M - log2 fold change; highest- a boolean flag indicating whether the expression of a given gene in a given vcell group was on average higher than in every other cell group; fe - fraction of cells in a given group having non-zero expression level of a given gene;\n
      Examples:\n\n
      result <- r$getDifferentialGenes(groups=r$clusters$PCA$multilevel);\n
      str(r$diffgenes)"
      
      # restrict counts to the cells for which non-NA value has been specified in groups
      
      if(is.null(groups)) {
        # look up the clustering based on a specified type
        if(is.null(clusterType)) {
          # take the first one
          cols <- clusters[[type]][[1]]
        } else {
          cols <- clusters[[type]][[clusterType]]
          if(is.null(cols)) { stop("clustering ",clusterType," for type ", type," doesn't exist")}
        }
      } else {
        cols <- groups;
      }
      cm <- counts;
      if(!all(rownames(cm) %in% names(cols))) { warning("cluster vector doesn't specify groups for all of the cells, dropping missing cells from comparison")}
      # determine a subset of cells that's in the cols and cols[cell]!=NA
      valid.cells <- rownames(cm) %in% names(cols)[!is.na(cols)];
      if(!all(valid.cells)) {
        # take a subset of the count matrix
        cm <- cm[valid.cells,]
      }
      # reorder cols
      cols <- as.factor(cols[match(rownames(cm),names(cols))]);
      
      cols <- as.factor(cols);
      if(verbose) {
        message("running differential expression with ",length(levels(cols))," clusters ... ")
      }
      # use offsets based on the base model
      
      # run wilcoxon test comparing each group with the rest
      lower.lpv.limit <- -100;
      # calculate rank per-column (per-gene) average rank matrix
      xr <- sparse_matrix_column_ranks(cm);
      # calculate rank sums per group
      grs <- colSumByFac(xr,as.integer(cols))[-1,,drop=FALSE]
      # calculate number of non-zero entries per group
      xr@x <- numeric(length(xr@x))+1
      gnzz <- colSumByFac(xr,as.integer(cols))[-1,,drop=FALSE]
      #group.size <- as.numeric(tapply(cols,cols,length));
      group.size <- as.numeric(tapply(cols,cols,length))[1:nrow(gnzz)]; group.size[is.na(group.size)]<-0; # trailing empty levels are cut off by colSumByFac
      # add contribution of zero entries to the grs
      gnz <- (group.size-gnzz)
      # rank of a 0 entry for each gene
      zero.ranks <- (nrow(xr)-diff(xr@p)+1)/2 # number of total zero entries per gene
      ustat <- t((t(gnz)*zero.ranks)) + grs - group.size*(group.size+1)/2
      # standardize
      n1n2 <- group.size*(nrow(cm)-group.size);
      # usigma <- sqrt(n1n2*(nrow(cm)+1)/12) # without tie correction
      # correcting for 0 ties, of which there are plenty
      usigma <- sqrt(n1n2*(nrow(cm)+1)/12)
      usigma <- sqrt((nrow(cm) +1 - (gnz^3 - gnz)/(nrow(cm)*(nrow(cm)-1)))*n1n2/12)
      x <- t((ustat - n1n2/2)/usigma); # standardized U value- z score
      
      
      # correct for multiple hypothesis
      if(verbose) {
        message("adjusting p-values ... ")
      }
      x <- matrix(qnorm(bh.adjust(pnorm(as.numeric(abs(x)), lower.tail = FALSE, log.p = TRUE), log = TRUE), lower.tail = FALSE, log.p = TRUE),ncol=ncol(x))*sign(x)
      rownames(x) <- colnames(cm); colnames(x) <- levels(cols)[1:ncol(x)];
      if(verbose) {
        message("done.\n")
      }
      
      # add fold change information
      log.gene.av <- log2(Matrix::colMeans(cm));
      group.gene.av <- colSumByFac(cm,as.integer(cols))[-1,,drop=FALSE] / (group.size+1);
      log2.fold.change <- log2(t(group.gene.av)) - log.gene.av;
      # fraction of cells expressing
      f.expressing <- t(gnzz / group.size);
      max.group <- max.col(log2.fold.change)
      
      ds <- lapply(1:ncol(x),function(i) {
        z <- x[,i];
        vi <- which((if (upregulated.only) z else abs(z)) >= z.threshold);
        r <- data.frame(Z=z[vi],M=log2.fold.change[vi,i],highest=max.group[vi]==i,fe=f.expressing[vi,i], Gene=rownames(x)[vi])
        rownames(r) <- r$Gene;
        r <- r[order(r$Z,decreasing=TRUE),]
        r
      })
      names(ds) <- colnames(x);
      
      if (append.specificity.metrics) {
        ds <- names(ds) %>% setNames(., .) %>%
          papply(function(n) appendSpecificityMetricsToDE(ds[[n]], cols, n, p2.counts=cm, append.auc=append.auc), n.cores=n.cores)
      }
      
      if(is.null(groups)) {
        if(is.null(clusterType)) {
          diffgenes[[type]][[names(clusters[[type]])[1]]] <<- ds;
        } else {
          diffgenes[[type]][[clusterType]] <<- ds;
        }
      } else {
        diffgenes[[type]][[name]] <<- ds;
      }
      
      return(invisible(ds))
    },
    
    plotDiffGeneHeatmap=function(type='counts',clusterType=NULL, groups=NULL, n.genes=100, z.score=2, gradient.range.quantile=0.95, inner.clustering=FALSE, gradientPalette=NULL, v=0.8, s=1, box=TRUE, drawGroupNames=FALSE, ... ) {
      if(!is.null(clusterType)) {
        x <- diffgenes[[type]][[clusterType]];
        if(is.null(x)) { stop("differential genes for the specified cluster type haven't been calculated") }
      } else {
        x <- diffgenes[[type]][[1]];
        if(is.null(x)) { stop("no differential genes found for data type ",type) }
      }
      
      if(is.null(groups)) {
        # look up the clustering based on a specified type
        if(is.null(clusterType)) {
          # take the first one
          cols <- clusters[[type]][[1]]
        } else {
          cols <- clusters[[type]][[clusterType]]
          if(is.null(cols)) { stop("clustering ",clusterType," for type ", type," doesn't exist")}
        }
      } else {
        # use clusters information
        if(!all(rownames(counts) %in% names(groups))) { warning("provided cluster vector doesn't list groups for all of the cells")}
        cols <- as.factor(groups[match(rownames(counts),names(groups))]);
      }
      cols <- as.factor(cols);
      # select genes to show
      if(!is.null(z.score)) {
        x <- lapply(x,function(d) d[d$Z >= z.score & d$highest==TRUE,])
        if(!is.null(n.genes)) {
          x <- lapply(x,function(d) {if(nrow(d)>0) { d[1:min(nrow(d),n.genes),]}})
        }
      } else {
        if(!is.null(n.genes)) {
          x <- lapply(x,function(d) {if(nrow(d)>0) { d[1:min(nrow(d),n.genes),]}})
        }
      }
      x <- lapply(x,rownames);
      # make expression matrix
      #browser()
      #x <- x[!unlist(lapply(x,is.null))]
      #cols <- cols[cols %in% names(x)]
      #cols <- droplevels(cols)
      em <- counts[,unlist(x)];
      # renormalize rows
      if(all(sign(em)>=0)) {
        if(is.null(gradientPalette)) {
          gradientPalette <- colorRampPalette(c('gray90','red'), space = "Lab")(1024)
        }
        em <- apply(em,1,function(x) {
          zlim <- as.numeric(quantile(x,p=c(1-gradient.range.quantile,gradient.range.quantile)))
          if(diff(zlim)==0) {
            zlim <- as.numeric(range(x))
          }
          x[x<zlim[1]] <- zlim[1]; x[x>zlim[2]] <- zlim[2];
          x <- (x-zlim[1])/(zlim[2]-zlim[1])
        })
      } else {
        if(is.null(gradientPalette)) {
          gradientPalette <- colorRampPalette(c("blue", "grey90", "red"), space = "Lab")(1024)
        }
        em <- apply(em,1,function(x) {
          zlim <- c(-1,1)*as.numeric(quantile(abs(x),p=gradient.range.quantile))
          if(diff(zlim)==0) {
            zlim <- c(-1,1)*as.numeric(max(abs(x)))
          }
          x[x<zlim[1]] <- zlim[1]; x[x>zlim[2]] <- zlim[2];
          x <- (x-zlim[1])/(zlim[2]-zlim[1])
        })
      }
      
      # cluster cell types by averages
      rowfac <- factor(rep(names(x),unlist(lapply(x,length))),levels=names(x))
      if(inner.clustering) {
        clclo <- hclust(as.dist(1-cor(do.call(cbind,tapply(1:nrow(em),rowfac,function(ii) Matrix::colMeans(em[ii,,drop=FALSE]))))),method='complete')$order
      } else {
        clclo <- 1:length(levels(rowfac))
      }
      
      if(inner.clustering) {
        # cluster genes within each cluster
        clgo <- tapply(1:nrow(em),rowfac,function(ii) {
          ii[hclust(as.dist(1-cor(t(em[ii,]))),method='complete')$order]
        })
      } else {
        clgo <- tapply(1:nrow(em),rowfac,I)
      }
      if(inner.clustering) {
        # cluster cells within each cluster
        clco <- tapply(1:ncol(em),cols,function(ii) {
          if(length(ii)>3) {
            ii[hclust(as.dist(1-cor(em[,ii,drop=FALSE])),method='complete')$order]
          } else {
            ii
          }
        })
      } else {
        clco <- tapply(1:ncol(em),cols,I)
      }
      #clco <- clco[names(clgo)]
      # filter down to the clusters that are included
      #vic <- cols %in% clclo
      colors <- fac2col(cols,v=v,s=s,return.details=TRUE)
      cellcols <- colors$colors[unlist(clco[clclo])]
      genecols <- rev(rep(colors$palette,unlist(lapply(clgo,length)[clclo])))
      bottomMargin <- ifelse(drawGroupNames,4,0.5);
      my.heatmap2(em[rev(unlist(clgo[clclo])),unlist(clco[clclo])],col=gradientPalette,Colv=NA,Rowv=NA,labRow=NA,labCol=NA,RowSideColors=genecols,ColSideColors=cellcols,margins=c(bottomMargin,0.5),ColSideColors.unit.vsize=0.05,RowSideColors.hsize=0.05,useRaster=T, box=box, ...)
      abline(v=cumsum(unlist(lapply(clco[clclo],length))),col=1,lty=3)
      abline(h=cumsum(rev(unlist(lapply(clgo[clclo],length)))),col=1,lty=3)
    },
    
    # recalculate library sizes using robust regression within clusters
    getRefinedLibSizes=function(clusterType=NULL, groups=NULL,type='counts') {
      if (!requireNamespace("robustbase", quietly = TRUE)) {
        stop("Package \"robustbase\" needed for this function to work. Please install it.", call. = FALSE)
      }
      
      if(is.null(groups)) {
        # look up the clustering based on a specified type
        if(is.null(clusterType)) {
          # take the first one
          groups <- clusters[[type]][[1]]
        } else {
          groups <- clusters[[type]][[clusterType]]
          if(is.null(groups)) { stop("clustering ",clusterType," for type ", type," doesn't exist")}
        }
      }
      if(is.null(groups)) { stop("clustering must be determined first, or passed as a groups parameter") }
      
      # calculated pooled profiles per cluster
      lvec <- colSumByFac(misc[['rawCounts']],as.integer(groups))[-1,,drop=FALSE];
      lvec <- t(lvec/pmax(1,Matrix::rowSums(lvec)))*1e4
      
      # TODO: implement internal robust regression
      ## x <- misc[['rawCounts']]
      ## x <- x/as.numeric(depth);
      ## inplaceWinsorizeSparseCols(x,10);
      ## x <- x*as.numeric(depth);
      
      x <- mclapply(1:length(levels(groups)),function(j) {
        ii <- names(groups)[which(groups==j)]
        av <- lvec[,j]
        avi <- which(av>0)
        av <- av[avi]
        cvm <- as.matrix(misc[['rawCounts']][ii,avi])
        x <- unlist(lapply(ii,function(i) {
          cv <- cvm[i,]
          #as.numeric(coef(glm(cv~av+0,family=poisson(link='identity'),start=sum(cv)/1e4)))
          as.numeric(coef(robustbase::glmrob(cv~av+0,family=poisson(link='identity'),start=sum(cv)/1e4)))
        }))
        names(x) <- ii;
        x
      },mc.cores=30)
      
      lib.sizes <- unlist(x)[rownames(misc[['rawCounts']])]
      lib.sizes <- lib.sizes/mean(lib.sizes)*mean(Matrix::rowSums(misc[['rawCounts']]))
      
      depth <<- lib.sizes;
      return(invisible(lib.sizes))
    },
    
    # plot heatmap for a given set of genes
    plotGeneHeatmap=function(genes, type='counts', clusterType=NULL, groups=NULL, z.score=2, gradient.range.quantile=0.95, cluster.genes=FALSE, inner.clustering=FALSE, gradientPalette=NULL, v=0.8, s=1, box=TRUE, drawGroupNames=FALSE, useRaster=TRUE, smooth.span=max(1,round(nrow(counts)/1024)), ... ) {
      if(is.null(groups)) {
        # look up the clustering based on a specified type
        if(is.null(clusterType)) {
          # take the first one
          cols <- clusters[[type]][[1]]
        } else {
          cols <- clusters[[type]][[clusterType]]
          if(is.null(cols)) { stop("clustering ",clusterType," for type ", type," doesn't exist")}
        }
      } else {
        # use clusters information
        if(!all(rownames(counts) %in% names(groups))) { warning("provided cluster vector doesn't list groups for all of the cells")}
        cols <- as.factor(groups[match(rownames(counts),names(groups))]);
      }
      cols <- as.factor(cols);
      # make expression matrix
      if(!all(genes %in% colnames(counts))) { warning(paste("the following specified genes were not found in the data: [",paste(genes[!genes %in% colnames(counts)],collapse=" "),"], omitting",sep="")) }
      x <- intersect(genes,colnames(counts));
      if(length(x)<1) { stop("too few genes") }
      em <- as.matrix(t(counts[,x]));
      
      # renormalize rows
      if(all(sign(em)>=0)) {
        if(is.null(gradientPalette)) {
          gradientPalette <- colorRampPalette(c('gray90','red'), space = "Lab")(1024)
        }
        em <- t(apply(em,1,function(x) {
          zlim <- as.numeric(quantile(x,p=c(1-gradient.range.quantile,gradient.range.quantile)))
          if(diff(zlim)==0) {
            zlim <- as.numeric(range(x))
          }
          x[x<zlim[1]] <- zlim[1]; x[x>zlim[2]] <- zlim[2];
          x <- (x-zlim[1])/(zlim[2]-zlim[1])
        }))
      } else {
        if(is.null(gradientPalette)) {
          gradientPalette <- colorRampPalette(c("blue", "grey90", "red"), space = "Lab")(1024)
        }
        em <- t(apply(em,1,function(x) {
          zlim <- c(-1,1)*as.numeric(quantile(abs(x),p=gradient.range.quantile))
          if(diff(zlim)==0) {
            zlim <- c(-1,1)*as.numeric(max(abs(x)))
          }
          x[x<zlim[1]] <- zlim[1]; x[x>zlim[2]] <- zlim[2];
          x <- (x-zlim[1])/(zlim[2]-zlim[1])
        }))
      }
      
      # cluster cell types by averages
      clclo <- 1:length(levels(cols))
      
      if(cluster.genes) {
        # cluster genes within each cluster
        clgo <- hclust(as.dist(1-cor(t(em))),method='complete')$order
      } else {
        clgo <- 1:nrow(em)
      }
      
      if(inner.clustering) {
        # cluster cells within each cluster
        clco <- tapply(1:ncol(em),cols,function(ii) {
          if(length(ii)>3) {
            ii[hclust(as.dist(1-cor(em[,ii,drop=FALSE])),method='single')$order]
          } else {
            ii
          }
          # TODO: implement smoothing span support
        })
      } else {
        clco <- tapply(1:ncol(em),cols,I)
      }
      
      cellcols <- fac2col(cols,v=v,s=s)[unlist(clco[clclo])]
      #genecols <- rev(rep(fac2col(cols,v=v,s=s,return.level.colors=T),unlist(lapply(clgo,length)[clclo])))
      bottomMargin <- 0.5;
      # reorder and potentially smooth em
      em <- em[rev(clgo),unlist(clco[clclo])];
      
      my.heatmap2(em,col=gradientPalette,Colv=NA,Rowv=NA,labCol=NA,ColSideColors=cellcols,margins=c(bottomMargin,5),ColSideColors.unit.vsize=0.05,RowSideColors.hsize=0.05,useRaster=useRaster, box=box, ...)
      bp <- cumsum(unlist(lapply(clco[clclo],length))); # cluster border positions
      abline(v=bp,col=1,lty=3)
      #abline(h=cumsum(rev(unlist(lapply(clgo[clclo],length)))),col=1,lty=3)
      if(drawGroupNames) {
        clpos <- (c(0,bp[-length(bp)])+bp)/2;
        labpos <- rev(seq(0,length(bp)+1)/(length(bp)+1)*nrow(em)); labpos <- labpos[-1]; labpos <- labpos[-length(labpos)]
        text(x=clpos,y=labpos,labels = levels(cols),cex=1)
        # par(xpd=TRUE)
        # clpos <- (c(0,bp[-length(bp)])+bp)/2;
        # labpos <- seq(0,length(bp)+1)/(length(bp)+1)*max(bp); labpos <- labpos[-1]; labpos <- labpos[-length(labpos)]
        # text(x=labpos,y=-2,labels = levels(col))
        # segments(labpos,-1,clpos,0.5,lwd=0.5)
        # par(xpd=FALSE)
      }
    },
    
    # show embedding
    plotEmbedding=function(type=NULL, embeddingType=NULL, clusterType=NULL, groups=NULL, colors=NULL, gene=NULL, plot.theme=ggplot2::theme_bw(), ...) {
      if (is.null(type)) {
        if ('counts' %in% names(embeddings)) {
          type <- 'counts'
        } else if (length(embeddings) > 0) {
          type <- names(embeddings)[1]
        } else {
          stop("first, generate an embedding")
        }
      }
      
      if(is.null(embeddings[[type]]))
        stop("first, generate embeddings for type ",type)
      
      if(is.null(embeddingType)) {
        embeddingType <- 1 # take the first one
      }
      
      emb <- embeddings[[type]][[embeddingType]]
      
      if (!is.null(gene)) {
        if (!(gene %in% colnames(counts)))
          stop("Gene '", gene, "' isn't presented in the count matrix")
        
        colors <- .self$counts[,gene]
      }
      
      if(is.null(colors) && is.null(groups)) {
        # look up the clustering based on a specified type
        if(is.null(clusterType)) {
          # take the first one
          groups <- clusters[[type]][[1]]
        } else {
          groups <- clusters[[type]][[clusterType]]
          if(is.null(groups)) { stop("clustering ",clusterType," for type ", type," doesn't exist")}
        }
      }
      
      sccore::embeddingPlot(emb, groups=groups, colors=colors, plot.theme=plot.theme, ...)
    },
    # get overdispersed genes
    getOdGenes=function(n.odgenes=NULL,alpha=5e-2,use.unadjusted.pvals=FALSE) {
      if(is.null(misc[['varinfo']])) { stop("please run adjustVariance first")}
      if(is.null(n.odgenes)) { #return according to alpha
        if(use.unadjusted.pvals) {
          rownames(misc[['varinfo']])[misc[['varinfo']]$lp <= log(alpha)]
        } else {
          rownames(misc[['varinfo']])[misc[['varinfo']]$lpa <= log(alpha)]
        }
      } else { # return top n.odgenes sites
        rownames(misc[['varinfo']])[(order(misc[['varinfo']]$lp,decreasing=FALSE)[1:min(ncol(counts),n.odgenes)])]
      }
    },
    
    getNormalizedExpressionMatrix=function(n.odgenes=NULL,genes=NULL) {
      "Return variance-normalized matrix for specified genes or a number of OD genes\n
      - genes explicit vector of genes to return
      - n.odgenes whether a certain number of top overdispersed genes should be used (default NULL - all significant ones); ignored if genes are passed
      return:  cell x gene matrix\n"
      if(is.null(genes)) {
        genes <- .self$getOdGenes(n.odgenes)
      }
      x <- .self$counts[,genes];
      x@x <- x@x*rep(misc[['varinfo']][colnames(x),'gsf'],diff(x@p))
      x
    },
    
    calculatePcaReduction=function(nPcs=20, type='counts', name='PCA', use.odgenes=TRUE, n.odgenes=NULL, odgenes=NULL, center=TRUE, cells=NULL, fastpath=TRUE, maxit=100, verbose=TRUE, var.scale=(type == "counts"), ...) {
      "Calculate PCA reduction of the data\n
      - nPcs number of PCs\n
      - type dataset view to reduce (counts by default, but can specify a name of an existing reduction)\n
      - name name for the PCA reduction to be created\n
      - use.odgenes whether pre-calculated set of overdispersed genes should be used (default=TRUE)\n
      - n.odgenes whether a certain number of top overdispersed genes should be used\n
      - odgenes explicitly specify a set of genes to use for the reduction\n
      - center should data be centerred prior to PCA (default=TRUE)\n
      - cells optional subset of cells on which PCA should be ran\n
      - fastpath use C implementation for speedup\n
      - maxit maximum number of irlba iterations to use\n
      - verbose verbose\n
      - ... additional arguments forwarded to irlba::irlba\n
      return invisible PCA result (the reduction itself is saved in $reductions[[name]])"
      
      if(type=='counts') {
        x <- counts;
      } else {
        if(!type %in% names(reductions)) { stop("reduction ",type,' not found')}
        x <- reductions[[type]]
      }
      if((use.odgenes || !is.null(n.odgenes)) && is.null(odgenes)) {
        if(is.null(misc[['odgenes']] )) { stop("please run adjustVariance() first")}
        odgenes <- misc[['odgenes']];
        if(!is.null(n.odgenes)) {
          if(n.odgenes>length(odgenes)) {
            #warning("number of specified odgenes is higher than the number of the statistically significant sites, will take top ",n.odgenes,' sites')
            odgenes <- rownames(misc[['varinfo']])[(order(misc[['varinfo']]$lp,decreasing=FALSE)[1:min(ncol(counts),n.odgenes)])]
          } else {
            odgenes <- odgenes[1:n.odgenes]
          }
        }
      }
      if(!is.null(odgenes)) {
        x <- x[,odgenes]
        if(verbose) message('running PCA using ',length(odgenes),' OD genes .')
      } else { #all genes?
        if(verbose) message('running PCA all ',ncol(x),' genes .')
      }
      # apply scaling if using raw counts
      if(var.scale) {
        #x <- t(t(x)*misc[['varinfo']][colnames(x),'gsf'])
        x@x <- x@x*rep(misc[['varinfo']][colnames(x),'gsf'],diff(x@p))
      }
      if(verbose) message('.')
      
      if(!is.null(cells)) {
        # cell subset is just for PC determination
        nPcs <- min(min(length(cells),ncol(x))-1,nPcs)
        cm <- Matrix::colMeans(x[cells,])
        pcs <- irlba(x[cells,], nv=nPcs, nu=0, center=cm, right_only=FALSE,fastpath=fastpath,maxit=maxit,reorth=TRUE, ...)
      } else {
        nPcs <- min(min(nrow(x),ncol(x))-1,nPcs)
        if(center) {
          cm <- Matrix::colMeans(x)
          pcs <- irlba(x, nv=nPcs, nu=0, center=cm, right_only=FALSE,fastpath=fastpath,maxit=maxit,reorth=TRUE, ...)
        } else {
          pcs <- irlba(x, nv=nPcs, nu=0, right_only=FALSE,fastpath=fastpath,maxit=maxit,reorth=TRUE, ...)
        }
      }
      rownames(pcs$v) <- colnames(x);
      
      if(verbose) message('.')
      
      # adjust for centering!
      if(center) {
        pcs$center <- cm;
        pcas <- as.matrix(t(as(t(x %*% pcs$v), "dgeMatrix") - t(cm %*% pcs$v)))
      } else {
        pcas <- as.matrix(x %*% pcs$v);
      }
      misc$PCA <<- pcs;
      if(verbose) message('.')
      #pcas <- scde::winsorize.matrix(pcas,0.05)
      # # control for sequencing depth
      # if(is.null(batch)) {
      #   mx <- model.matrix(x ~ d,data=data.frame(x=1,d=depth))
      # } else {
      #   mx <- model.matrix(x ~ d*b,data=data.frame(x=1,d=depth,b=batch))
      # }
      # # TODO: how to get rid of residual depth effects in the PCA-based clustering?
      # #pcas <- t(t(colLm(pcas,mx,returnResid=TRUE))+Matrix::colMeans(pcas))
      # pcas <- colLm(pcas,mx,returnResid=TRUE)
      rownames(pcas) <- rownames(x)
      colnames(pcas) <- paste('PC',seq(ncol(pcas)),sep='')
      #pcas <- pcas[,-1]
      #pcas <- scde::winsorize.matrix(pcas,0.1)
      if(verbose) message(' done\n')
      reductions[[name]] <<- pcas;
      ## nIcs <- nPcs;
      ## a <- ica.R.def(t(pcas),nIcs,tol=1e-3,fun='logcosh',maxit=200,verbose=T,alpha=1,w.init=matrix(rnorm(nIcs*nPcs),nIcs,nPcs))
      ## reductions[['ICA']] <<- as.matrix( x %*% pcs$v %*% a);
      ## colnames(reductions[['ICA']]) <<- paste('IC',seq(ncol(reductions[['ICA']])),sep='');
      
      return(invisible(pcas))
    },
    
    # will reset odgenes to be a superset of the standard odgene selection (guided by n.odgenes or alpha), and
    # a set of recursively determined odgenes based on a given group (or a cluster info)
    expandOdGenes=function(type='counts', clusterType=NULL, groups=NULL , min.group.size=30, od.alpha=1e-1, use.odgenes=FALSE, n.odgenes=NULL, odgenes=NULL, n.odgene.multiplier=1, gam.k=10,verbose=FALSE,n.cores=.self$n.cores,min.odgenes=10,max.odgenes=Inf,take.top.odgenes=TRUE, recursive=TRUE) {
      # determine groups
      if(is.null(groups)) {
        # look up the clustering based on a specified type
        if(is.null(clusterType)) {
          # take the first one
          groups <- clusters[[type]][[1]]
        } else {
          groups <- clusters[[type]][[clusterType]]
          if(is.null(groups)) { stop("clustering ",clusterType," for type ", type," doesn't exist")}
        }
      } else {
        groups <- as.factor(groups[names(groups) %in% rownames(counts)]);
        groups <- droplevels(groups);
      }
      
      
      # determine initial set of odgenes
      if((use.odgenes || !is.null(n.odgenes)) && is.null(odgenes)) {
        if(is.null(misc[['varinfo']] )) { stop("please run adjustVariance() first")}
        df <- misc$varinfo
        odgenes <- rownames(df)[!is.na(df$lpa) & df$lpa<log(od.alpha)]
        #odgenes <- misc[['odgenes']];
        if(!is.null(n.odgenes)) {
          if(n.odgenes>length(odgenes)) {
            #warning("number of specified odgenes is higher than the number of the statistically significant sites, will take top ",n.odgenes,' sites')
            odgenes <- rownames(misc[['varinfo']])[(order(misc[['varinfo']]$lp,decreasing=FALSE)[1:min(ncol(counts),n.odgenes)])]
          } else {
            odgenes <- odgenes[1:n.odgenes]
          }
        }
      }
      
      # filter out small groups
      if(min.group.size>1) { groups[groups %in% levels(groups)[unlist(tapply(groups,groups,length))<min.group.size]] <- NA; groups <- droplevels(groups); }
      if(sum(!is.na(groups))<min.group.size) {
        warning("clustering specifies fewer cells than min.group.size")
        return(odgenes)
      }
      
      
      if(length(levels(groups))<2) {
        warning("cannot expand od genes based on a single group")
        return(odgenes)
      }
      
      # determine groups for which variance normalization will be reran
      if(recursive) {
        if(verbose) message("recursive group enumeration ...")
        # derive cluster hierarchy
        
        # use raw counts to derive clustering
        z <- misc$rawCounts;
        rowFac <- rep(-1,nrow(z)); names(rowFac) <- rownames(z);
        rowFac[match(names(groups),rownames(z))] <- as.integer(groups);
        tc <- colSumByFac(z,as.integer(rowFac))[-1,,drop=FALSE];
        rownames(tc) <- levels(groups)
        d <- 1-cor(t(log10(tc/pmax(1,Matrix::rowSums(tc))*1e3+1)))
        hc <- hclust(as.dist(d),method='average',members=unlist(tapply(groups,groups,length)))
        
        dlab <- function(l) {
          if(is.leaf(l)) {
            return(list(labels(l)))
          } else {
            return(c(list(labels(l)),dlab(l[[1]]),dlab(l[[2]])))
          }
        }
        
        # for each level in the cluster hierarchy, except for the top
        rgroups <- dlab(as.dendrogram(hc))[-1]
        rgroups <- c(list(levels(groups)),rgroups)
        if(verbose) message("done.\n");
      } else {
        rgroups <- lapply(levels(groups),I)
      }
      names(rgroups) <- unlist(lapply(rgroups,paste,collapse="+"))
      
      # run local variance normalization
      if(verbose) message("running local variance normalization ");
      # run variance normalization, determine PCs
      gpcs <- papply(rgroups,function(group) {
        cells <- names(groups)[groups %in% group]
        
        # variance normalization
        df <- .self$adjustVariance(persist=FALSE,gam.k=gam.k,verbose=FALSE,cells=cells,n.cores=1)
        #if(!is.null(n.odgenes)) {
        #  odgenes <- rownames(df)[order(df$lp,decreasing=F)[1:n.odgenes]]
        #} else {
        df <- df[!is.na(df$lp),,drop=FALSE]
        df <- df[order(df$lp,decreasing=FALSE),,drop=FALSE]
        n.od <- min(max(sum(df$lpa<log(od.alpha)),min.odgenes),max.odgenes);
        if(n.od>0) {
          odgenes <- rownames(df)[1:min(n.od*n.odgene.multiplier,nrow(df))]
        } else {
          return(NULL)
        }
        sf <- df$gsf[match(odgenes,rownames(df))];
        return(list(sf=sf,cells=cells,odgenes=odgenes))
      },n.cores=n.cores,mc.preschedule=TRUE)
      if(verbose) message(" done\n");
      odg <- unique(unlist(lapply(gpcs,function(z) z$odgenes)))
      # TODO: consider gsf?
      odgenes <- unique(c(odgenes,odg));
      misc[['odgenes']] <<- odgenes;
      return(invisible(odgenes));
    },
    
    localPcaKnn=function(nPcs=5, type='counts', clusterType=NULL, groups=NULL , k=30, b=1, a=1, min.group.size=30, name='localPCA', baseReduction='PCA', od.alpha=1e-1, n.odgenes=NULL,gam.k=10,verbose=FALSE,n.cores=.self$n.cores,min.odgenes=5,take.top.odgenes=FALSE, recursive=TRUE,euclidean=FALSE,perplexity=k,debug=FALSE,return.pca=FALSE,skip.pca=FALSE) {
      if(type=='counts') {
        x <- counts;
      } else {
        if(!type %in% names(reductions)) { stop("reduction ",type,' not found')}
        x <- reductions[[type]]
      }
      
      
      if(is.null(groups)) {
        # look up the clustering based on a specified type
        if(is.null(clusterType)) {
          # take the first one
          groups <- clusters[[type]][[1]]
        } else {
          groups <- clusters[[type]][[clusterType]]
          if(is.null(groups)) { stop("clustering ",clusterType," for type ", type," doesn't exist")}
        }
      } else {
        groups <- as.factor(groups[names(groups) %in% rownames(x)]);
        groups <- droplevels(groups);
        
      }
      if(min.group.size>1) { groups[groups %in% levels(groups)[unlist(tapply(groups,groups,length))<min.group.size]] <- NA; groups <- droplevels(groups); }
      if(sum(!is.na(groups))<min.group.size) { stop("clustering specifies fewer cells than min.group.size") }
      
      
      if(recursive) {
        if(verbose) message("recursive group enumeration ...")
        ## # derive cluster hierarchy
        ## rowFac <- rep(-1,nrow(x)); names(rowFac) <- rownames(x);
        ## rowFac[names(groups)] <- as.integer(groups);
        ## tc <- colSumByFac(x,as.integer(rowFac))[-1,]
        ## rownames(tc) <- levels(groups)
        ## #tc <- rbind("total"=Matrix::colSums(tc),tc)
        ## #d <- jsDist(t(((tc/pmax(1,Matrix::rowSums(tc)))))); rownames(d) <- colnames(d) <- rownames(tc)
        ## d <- 1-cor(t(tc))
        ## hc <- hclust(as.dist(d),method='ward.D')
        
        # use raw counts to derive clustering
        z <- misc$rawCounts;
        rowFac <- rep(-1,nrow(z)); names(rowFac) <- rownames(z);
        rowFac[match(names(groups),rownames(z))] <- as.integer(groups);
        tc <- colSumByFac(z,as.integer(rowFac))[-1,,drop=FALSE];
        rownames(tc) <- levels(groups)
        d <- 1-cor(t(log10(tc/pmax(1,Matrix::rowSums(tc))*1e3+1)))
        hc <- hclust(as.dist(d),method='average',members=unlist(tapply(groups,groups,length)))
        
        
        dlab <- function(l) {
          if(is.leaf(l)) {
            return(list(labels(l)))
          } else {
            return(c(list(labels(l)),dlab(l[[1]]),dlab(l[[2]])))
          }
        }
        
        # for each level in the cluster hierarchy, except for the top
        rgroups <- dlab(as.dendrogram(hc))[-1]
        rgroups <- c(list(levels(groups)),rgroups)
        if(verbose) message("done.\n");
      } else {
        rgroups <- lapply(levels(groups),I)
      }
      names(rgroups) <- unlist(lapply(rgroups,paste,collapse="+"))
      
      
      if(verbose) message("determining local PCs ");
      # run variance normalization, determine PCs
      gpcs <- papply(rgroups,function(group) {
        cells <- names(groups)[groups %in% group]
        
        # variance normalization
        df <- .self$adjustVariance(persist=FALSE,gam.k=gam.k,verbose=FALSE,cells=cells,n.cores=1)
        if(!is.null(n.odgenes)) {
          odgenes <- rownames(df)[order(df$lp,decreasing=FALSE)[1:n.odgenes]]
        } else {
          odgenes <- rownames(df)[!is.na(df$lpa) & df$lpa<log(od.alpha)]
        }
        if(length(odgenes)<min.odgenes) {
          if(take.top.odgenes) {
            odgenes <- rownames(df)[order(df$lp,decreasing=FALSE)[1:min.odgenes]]
          } else {
            return(NULL);
          }
        }
        sf <- df$gsf[match(odgenes,rownames(df))];
        
        if(return.pca && skip.pca) {
          return(list(sf=sf,cells=cells,odgenes=odgenes))
        }
        
        
        y <- t(t(x[cells,odgenes])*sf)
        cm <- Matrix::colMeans(y)
        # PCA
        pcs <- irlba(y, nv=nPcs, nu=0, center=cm, right_only=FALSE,fastpath=TRUE,reorth=TRUE)
        rownames(pcs$v) <- colnames(y);
        pcs$center <- cm;
        # row-randomize x to get a sense for the pcs
        m1 <- y;
        if(euclidean) {
          #for (i in 1:nrow(m1)) m1[i,] <- m1[i,order(runif(length(m1[i,])))]
          m1@i <- sample(m1@i)
          rpcas <- t(t(m1 %*% pcs$v) - t(pcs$center %*% pcs$v))
          pcs$rsd <- apply(rpcas,2,sd)
          pcs$trsd <- sd(dist(rpcas))
        }
        
        # sample within-cluster distances (based on main PCA)
        
        pcas <- as.matrix(t(t(t(t(x[,odgenes])*sf) %*% pcs$v) - t(pcs$center %*% pcs$v)))
        if(verbose) message(".")
        return(list(pcs=pcs,sf=sf,df=df,cells=cells,pcas=pcas,odgenes=odgenes))
      },n.cores=n.cores)
      if(verbose) message(" done\n");
      if(return.pca) return(gpcs)
      
      ivi <- unlist(lapply(gpcs,is.null))
      if(any(ivi)) { gpcs <- gpcs[!ivi]; rgroups <- rgroups[!ivi]; }
      
      if(debug) { browser() }
      
      
      # calculate cell relevance to each cluster (p_k,i matrix)
      # use global PCA distances
      if(verbose) message("calculating global distances ...");
      gcdist <- as.matrix(dist(gpcs[[1]]$pcas))
      if(verbose) message(" done.\n")
      
      # for each PCA
      # for each cell, determine p_k_i
      # for each PC
      # determine cell projections
      # subset genes
      # subtract center
      # scale, multiply
      # for each pair, determine cell distances
      # use cell distances to complete weight matrix
      # add to the w*d^2 and w matrices
      # normalize by the sqrt(sum(w))
      
      
      if(euclidean) {
        if(verbose) message("calculating local Euclidean distances .");
        dcs <- papply(gpcs,function(p) {
          pk <- rep(1,nrow(p$pcas)); names(pk) <- rownames(p$pcas);
          nci <- setdiff(rownames(gcdist),p$cells)
          if(length(nci)>0) {
            # determine within cluster sd
            scells <- sample(p$cells,min(1e3,length(p$cells)))
            cldsd <- sd(as.numeric(gcdist[scells,scells]))
            ncid <- rowMeans(gcdist[nci,scells])
            pk[nci] <- exp(-b*(ncid/cldsd)^2)
          }
          dsq <- as.matrix(dist(p$pcas)^2)
          w <- (1-exp(-a*(dsq)/(p$pcs$trsd^2))) * (pk %o% pk)
          if(verbose) message(".")
          list(dsq=dsq,w=w)
        },n.cores=n.cores)
        if(verbose) message(".")
        d <- Reduce('+',lapply(dcs,function(x) x$dsq*x$w))
        if(verbose) message(".")
        d <- sqrt(d/Reduce('+',lapply(dcs,function(x) x$w)));
        diag(d) <- 0
        if(verbose) message(" done.\n")
      } else {
        # weighted correlation
        if(verbose) message("calculating local correlation distances .");
        dcs <- papply(gpcs,function(p) {
          pk <- rep(1,nrow(p$pcas)); names(pk) <- rownames(p$pcas);
          nci <- setdiff(rownames(gcdist),p$cells)
          if(length(nci)>0) {
            # determine within cluster sd
            scells <- sample(p$cells,min(1e3,length(p$cells)))
            cldsd <- sd(as.numeric(gcdist[scells,scells]))
            ncid <- rowMeans(gcdist[nci,scells])
            pk[nci] <- exp(-b*(ncid/cldsd)^2)
          }
          x <- cov(t(p$pcas))*(ncol(p$pcas)-1)
          xc <- x / sqrt(diag(x) %o% diag(x)) # correlation
          w <- (1-exp(-a*(1-xc))) * (pk %o% pk)
          
          if(verbose) message(".")
          list(x=x,w=w)
        },n.cores=n.cores)
        if(verbose) message(".")
        # calculate sum_{k}_{w_k*v} matrix
        wm <- Reduce('+',lapply(dcs,function(z) z$w*diag(z$x)))
        d <- Reduce('+',lapply(dcs,function(z) z$x*z$w))
        d <- 1-d/sqrt(wm*t(wm))
        diag(d) <- 0
        if(verbose) message(" done.\n")
      }
      
      
      
      ## d <- dcs[[1]]$dsq
      ## d <- as.matrix(dist(r$reductions$PCA))
      ## knn <- apply(d,2,function(x) order(x,decreasing=F)[1:(k+1)])
      ## cat(".")
      ## m <- sparseMatrix(i=as.numeric(knn),p=c(0,(1:ncol(knn))*nrow(knn)),dims=rep(ncol(knn),2),x=rep(1,nrow(knn)*ncol(knn)))
      ## m <- m+t(m); # symmetrize
      ## diag(m) <- 0;
      ## rownames(m) <- colnames(m) <- rownames(d)
      ## x <- list(m=m)
      ## i <- 5; cl <- rep(NA,nrow(x$m)); names(cl) <- rownames(x$m); cl[rownames(x$m)[i]] <- 1; cl[which(x$m[,i]>0)] <- 2; cl <- as.factor(cl);
      ## r$plotEmbedding(type='PCA',embeddingType='tSNE',groups=cl,alpha=0.2,min.group.size=00,mark.clusters = TRUE, mark.cluster.cex=0.8,unclassified.cell.color=adjustcolor(1,alpha=0.1))
      
      ## i <- 5; cl <- 1/(dcs[[1]]$dsq[,i]+1e-6); names(cl) <- rownames(x$m);
      ## i <- 5; cl <- 1/(d[,i]+1e-6); names(cl) <- rownames(x$m);
      ## r$plotEmbedding(type='PCA',embeddingType='tSNE',colors=cl,alpha=0.2,min.group.size=00,mark.clusters = TRUE, mark.cluster.cex=0.8,unclassified.cell.color=adjustcolor(1,alpha=0.1))
      
      # kNN
      if(verbose) message("creating kNN graph .");
      knn <- apply(d,2,function(x) order(x,decreasing=FALSE)[1:(k+1)])
      if(verbose) message(".")
      #m <- sparseMatrix(i=as.numeric(knn),p=c(0,(1:ncol(knn))*nrow(knn)),dims=rep(ncol(knn),2),x=rep(1,nrow(knn)*ncol(knn)))
      m <- sparseMatrix(i=as.numeric(knn),p=c(0,(1:ncol(knn))*nrow(knn)),dims=rep(ncol(knn),2),x=d[as.integer(t(t(knn)+((1:ncol(knn))-1)*nrow(d)))])
      m <- m+t(m); # symmetrize
      diag(m) <- 0;
      rownames(m) <- colnames(m) <- rownames(d)
      if(verbose) message(".")
      g <- graph_from_adjacency_matrix(m,mode='undirected',weighted=TRUE);
      if(verbose) message(".")
      graphs[[name]] <<- g;
      if(verbose) message(" done.\n")
      
      emb <- Rtsne::Rtsne(d,is_distance=TRUE, perplexity=perplexity, num_threads=n.cores)$Y;
      rownames(emb) <- colnames(d)
      embeddings[[type]][[name]] <<- emb;
      
      # calculate cell-cell distance, considering weighting
      
      # getting total cell-cell distance
      
      return(invisible(list(d=d,m=m,gpcs=gpcs)))
    },
    
    
    # test pathway overdispersion
    # this is a compressed version of the PAGODA1 approach
    # env - pathway to gene environment
    testPathwayOverdispersion=function(setenv, 
                                       type='counts', 
                                       max.pathway.size=1e3, 
                                       min.pathway.size=10, 
                                       n.randomizations=5, 
                                       verbose=FALSE, 
                                       score.alpha=0.05,
                                       plot=FALSE, 
                                       cells=NULL,
                                       adjusted.pvalues=TRUE,
                                       z.score = qnorm(0.05/2, lower.tail = FALSE), 
                                       use.oe.scale = FALSE, return.table=FALSE,
                                       name='pathwayPCA',
                                       correlation.distance.threshold=0.2,
                                       loading.distance.threshold=0.01,
                                       top.aspects=Inf,
                                       recalculate.pca=FALSE,
                                       save.pca=TRUE) {
      nPcs <- 1;
      
      if(type=='counts') {
        x <- counts;
        # apply scaling if using raw counts
        x@x <- x@x*rep(misc[['varinfo']][colnames(x),'gsf'],diff(x@p))
      } else {
        if(!type %in% names(reductions)) { stop("reduction ",type,' not found')}
        x <- reductions[[type]]
      }
      if(!is.null(cells)) {
        x <- x[cells,]
      }
      
      proper.gene.names <- colnames(x);
      
      if(is.null(misc[['pwpca']]) || recalculate.pca) {
        if(verbose) {
          message("determining valid pathways")
        }
        
        # determine valid pathways
        gsl <- ls(envir = setenv)
        gsl.ng <- unlist(mclapply(sn(gsl), function(go) sum(unique(get(go, envir = setenv)) %in% proper.gene.names),mc.cores=n.cores,mc.preschedule=TRUE))
        gsl <- gsl[gsl.ng >= min.pathway.size & gsl.ng<= max.pathway.size]
        names(gsl) <- gsl
        
        if(verbose) {
          message("processing ", length(gsl), " valid pathways")
        }
        
        cm <- Matrix::colMeans(x)
        
        pwpca <- papply(gsl, function(sn) {
          lab <- proper.gene.names %in% get(sn, envir = setenv)
          if(sum(lab)<1) { return(NULL) }
          pcs <- irlba(x[,lab], nv=nPcs, nu=0, center=cm[lab])
          pcs$d <- pcs$d/sqrt(nrow(x))
          pcs$rotation <- pcs$v;
          pcs$v <- NULL;
          
          # get standard deviations for the random samples
          ngenes <- sum(lab)
          z <- do.call(rbind,lapply(seq_len(n.randomizations), function(i) {
            si <- sample(ncol(x), ngenes)
            pcs <- irlba(x[,si], nv=nPcs, nu=0, center=cm[si])$d
          }))
          z <- z/sqrt(nrow(x));
          
          # local normalization of each component relative to sampled PC1 sd
          avar <- pmax(0, (pcs$d^2-mean(z[, 1]^2))/sd(z[, 1]^2))
          
          if(avar>0.5) {
            # flip orientations to roughly correspond with the means
            pcs$scores <- as.matrix(t(x[,lab] %*% pcs$rotation) - as.numeric((cm[lab] %*% pcs$rotation)))
            cs <- unlist(lapply(seq_len(nrow(pcs$scores)), function(i) sign(cor(pcs$scores[i,], colMeans(t(x[, lab, drop = FALSE])*abs(pcs$rotation[, i]))))))
            pcs$scores <- pcs$scores*cs
            pcs$rotation <- pcs$rotation*cs
            rownames(pcs$rotation) <- colnames(x)[lab];
          } # don't bother otherwise - it's not significant
          return(list(xp=pcs,z=z,n=ngenes))
        }, n.cores = n.cores,mc.preschedule=TRUE)
        if(save.pca) {
          misc[['pwpca']] <<- pwpca;
        }
      } else {
        if(verbose) {
          message("re-using previous overdispersion calculations")
          pwpca <- misc[['pwpca']];
        }
      }
      
      if(verbose) {
        message("scoring pathway overdispersion signifcance")
      }
      
      # score overdispersion
      true.n.cells <- nrow(x)
      
      pagoda.effective.cells <- function(pwpca, start = NULL) {
        n.genes <- unlist(lapply(pwpca, function(x) rep(x$n, nrow(x$z))))
        var <- unlist(lapply(pwpca, function(x) x$z[, 1]))
        if(is.null(start)) { start <- true.n.cells*2 } # start with a high value
        of <- function(p, v, sp) {
          sn <- p[1]
          vfit <- (sn+sp)^2/(sn*sn+1/2) -1.2065335745820*(sn+sp)*((1/sn + 1/sp)^(1/3))/(sn*sn+1/2)
          residuals <- (v-vfit)^2
          return(sum(residuals))
        }
        x <- nlminb(objective = of, start = c(start), v = var, sp = sqrt(n.genes-1/2), lower = c(1), upper = c(true.n.cells))
        return((x$par)^2+1/2)
      }
      n.cells <- pagoda.effective.cells(pwpca)
      
      vdf <- data.frame(do.call(rbind, lapply(seq_along(pwpca), function(i) {
        vars <- as.numeric((pwpca[[i]]$xp$d))
        cbind(i = i, var = vars, n = pwpca[[i]]$n, npc = seq(1:ncol(pwpca[[i]]$xp$rotation)))
      })))
      
      # fix p-to-q mistake in qWishartSpike
      qWishartSpikeFixed <- function (q, spike, ndf = NA, pdim = NA, var = 1, beta = 1, lower.tail = TRUE, log.p = FALSE)  {
        params <- RMTstat::WishartSpikePar(spike, ndf, pdim, var, beta)
        qnorm(q, mean = params$centering, sd = params$scaling, lower.tail, log.p)
      }
      
      # add right tail approximation to ptw, which gives up quite early
      pWishartMaxFixed <- function (q, ndf, pdim, var = 1, beta = 1, lower.tail = TRUE) {
        params <- RMTstat::WishartMaxPar(ndf, pdim, var, beta)
        q.tw <- (q - params$centering)/(params$scaling)
        p <- RMTstat::ptw(q.tw, beta, lower.tail, log.p = TRUE)
        p[p == -Inf] <- pgamma((2/3)*q.tw[p == -Inf]^(3/2), 2/3, lower.tail = FALSE, log.p = TRUE) + lgamma(2/3) + log((2/3)^(1/3))
        p
      }
      
      vshift <- 0
      ev <- 0
      
      vdf$var <- vdf$var-(vshift-ev)*vdf$n
      basevar <- 1
      vdf$exp <- RMTstat::qWishartMax(0.5, n.cells, vdf$n, var = basevar, lower.tail = FALSE)
      #vdf$z <- qnorm(pWishartMax(vdf$var, n.cells, vdf$n, log.p = TRUE, lower.tail = FALSE, var = basevar), lower.tail = FALSE, log.p = TRUE)
      vdf$z <- qnorm(pWishartMaxFixed(vdf$var, n.cells, vdf$n, lower.tail = FALSE, var = basevar), lower.tail = FALSE, log.p = TRUE)
      vdf$cz <- qnorm(bh.adjust(pnorm(as.numeric(vdf$z), lower.tail = FALSE, log.p = TRUE), log = TRUE), lower.tail = FALSE, log.p = TRUE)
      vdf$ub <- RMTstat::qWishartMax(score.alpha/2, n.cells, vdf$n, var = basevar, lower.tail = FALSE)
      vdf$ub.stringent <- RMTstat::qWishartMax(score.alpha/nrow(vdf)/2, n.cells, vdf$n, var = basevar, lower.tail = FALSE)
      
      if(plot) {
        par(mfrow = c(1, 1), mar = c(3.5, 3.5, 1.0, 1.0), mgp = c(2, 0.65, 0))
        un <- sort(unique(vdf$n))
        on <- order(vdf$n, decreasing = FALSE)
        pccol <- colorRampPalette(c("black", "grey70"), space = "Lab")(max(vdf$npc))
        plot(vdf$n, vdf$var/vdf$n, xlab = "gene set size", ylab = "PC1 var/n", ylim = c(0, max(vdf$var/vdf$n)), col = adjustcolor(pccol[vdf$npc],alpha=0.1),pch=19)
        lines(vdf$n[on], (vdf$exp/vdf$n)[on], col = 2, lty = 1)
        lines(vdf$n[on], (vdf$ub.stringent/vdf$n)[on], col = 2, lty = 2)
      }
      
      rs <- (vshift-ev)*vdf$n
      vdf$oe <- (vdf$var+rs)/(vdf$exp+rs)
      vdf$oec <- (vdf$var+rs)/(vdf$ub+rs)
      
      df <- data.frame(name = names(pwpca)[vdf$i], npc = vdf$npc, n = vdf$n, score = vdf$oe, z = vdf$z, adj.z = vdf$cz, stringsAsFactors = FALSE)
      if(adjusted.pvalues) {
        vdf$valid <- vdf$cz  >=  z.score
      } else {
        vdf$valid <- vdf$z  >=  z.score
      }
      
      if(!any(vdf$valid)) { stop("no significantly overdispersed pathways found at z.score threshold of ",z.score) };
      
      # apply additional filtering based on >0.5 sd above the local random estimate
      vdf$valid <- vdf$valid & unlist(lapply(pwpca,function(x) !is.null(x$xp$scores)))
      vdf$name <- names(pwpca)[vdf$i];
      
      if(return.table) {
        df <- df[vdf$valid, ]
        df <- df[order(df$score, decreasing = TRUE), ]
        return(df)
      }
      if(verbose) {
        message("compiling pathway reduction")
      }
      # calculate pathway reduction matrix
      
      # return scaled patterns
      xmv <- do.call(rbind, lapply(pwpca[vdf$valid], function(x) {
        xm <- x$xp$scores
      }))
      
      if(use.oe.scale) {
        xmv <- (xmv -rowMeans(xmv))* (as.numeric(vdf$oe[vdf$valid])/sqrt(apply(xmv, 1, var)))
        vdf$sd <- as.numeric(vdf$oe)
      } else {
        # chi-squared
        xmv <- (xmv-rowMeans(xmv)) * sqrt((qchisq(pnorm(vdf$z[vdf$valid], lower.tail = FALSE, log.p = TRUE), n.cells, lower.tail = FALSE, log.p = TRUE)/n.cells)/apply(xmv, 1, var))
        vdf$sd <- sqrt((qchisq(pnorm(vdf$z, lower.tail = FALSE, log.p = TRUE), n.cells, lower.tail = FALSE, log.p = TRUE)/n.cells))
        
      }
      rownames(xmv) <- paste("#PC", vdf$npc[vdf$valid], "# ", names(pwpca)[vdf$i[vdf$valid]], sep = "")
      rownames(vdf) <- paste("#PC", vdf$npc, "# ", vdf$name, sep = "")
      misc[['pathwayODInfo']] <<- vdf
      
      # collapse gene loading
      if(verbose) {
        message("clustering aspects based on gene loading ... ",appendLF=FALSE)
      }
      tam2 <- pagoda.reduce.loading.redundancy(list(xv=xmv,xvw=matrix(1,ncol=ncol(xmv),nrow=nrow(xmv))),pwpca,NULL,plot=FALSE,distance.threshold=loading.distance.threshold,n.cores=n.cores)
      if(verbose) {
        message(nrow(tam2$xv)," aspects remaining")
      }
      if(verbose) {
        message("clustering aspects based on pattern similarity ... ",appendLF=FALSE)
      }
      tam3 <- pagoda.reduce.redundancy(tam2,distance.threshold=correlation.distance.threshold,top=top.aspects)
      if(verbose) {
        message(nrow(tam3$xv)," aspects remaining\n")
      }
      tam2$xvw <- tam3$xvw <- NULL; # to save space
      tam3$env <- setenv;
      
      # clean up aspect names, as GO ids are meaningless
      names(tam3$cnam) <- rownames(tam3$xv) <- paste0('aspect',1:nrow(tam3$xv))
      
      misc[['pathwayOD']] <<- tam3;
      reductions[[name]] <<- tam3$xv;
      return(invisible(tam3))
    },
    
    getEmbedding=function(type='counts', embeddingType='largeVis', name=NULL, dims=2, M=1, gamma=1/M, perplexity=50, sgd_batches=NULL, diffusion.steps=0, diffusion.power=0.5, 
                          distance='pearson', n.cores = .self$n.cores, n.sgd.cores=n.cores, ... ) {
      
      if(dims<1) stop("dimensions must be >=1")
      if(type=='counts') {
        x <- counts;
      } else {
        if(!type %in% names(reductions)) { stop("reduction ",type,' not found')}
        x <- reductions[[type]]
      }
      if(is.null(name)) { 
        name <- embeddingType 
      }
      
      if(embeddingType=='largeVis') {
        edgeMat <- misc[['edgeMat']][[type]];
        if(is.null(edgeMat)) { stop(paste('KNN graph for type ',type,' not found. Please run makeKnnGraph with type=',type,sep='')) }
        if(is.null(sgd_batches)) { sgd_batches <- nrow(edgeMat)*1e3 }
        #edgeMat <- sparseMatrix(i=xn$s+1,j=xn$e+1,x=xn$rd,dims=c(nrow(x),nrow(x)))
        edgeMat <- (edgeMat + t(edgeMat))/2; # symmetrize
        #edgeMat <- sparseMatrix(i=c(xn$s,xn$e)+1,j=c(xn$e,xn$s)+1,x=c(xn$rd,xn$rd),dims=c(nrow(x),nrow(x)))
        # if(diffusion.steps>0) {
        #   Dinv <- Diagonal(nrow(edgeMat),1/colSums(edgeMat))
        #   Im <- Diagonal(nrow(edgeMat))
        #   W <- (Diagonal(nrow(edgeMat)) + edgeMat %*% Dinv)/2
        #   for(i in 1:diffusion.steps) {
        #     edgeMat <- edgeMat %*% W
        #   }
        # }
        #require(largeVis)
        #if(!is.null(seed)) { set.seed(seed) }
        if(!is.na(perplexity)) {
          wij <- buildWijMatrix(edgeMat,perplexity=perplexity,threads=n.cores)
        } else {
          wij <- edgeMat;
        }
        
        if(diffusion.steps>0) {
          Dinv <- Diagonal(nrow(wij),1/colSums(wij))
          W <- Dinv %*% wij ;
          W <- (1-diffusion.power)*Diagonal(nrow(wij)) + diffusion.power*W
          #W <- (Diagonal(nrow(wij)) + W)/2
          #W <- (Diagonal(nrow(wij)) + sign(W)*(abs(W)^(diffusion.power)))/2
          
          #W <- sign(W)*(abs(W)^diffusion.power)
          #W <- (Diagonal(nrow(wij)) + W)/2
          for(i in 1:diffusion.steps) {
            wij <- wij %*% W
          }
          #browser()
          if(!is.na(perplexity)) {
            wij <- buildWijMatrix(wij,perplexity=perplexity,threads=n.cores)
          }
          
        }
        coords <- projectKNNs(wij = wij, M = M, dim=dims, verbose = TRUE,sgd_batches = sgd_batches,gamma=gamma, seed=1, threads=n.cores, ...)
        colnames(coords) <- rownames(x);
        emb <- embeddings[[type]][[name]] <<- t(coords);
      } else if(embeddingType=='tSNE') {
        if(nrow(x)>4e4) {
          warning('Too many cells to pre-calcualte correlation distances, switching to L2. Please, consider using UMAP.');
          distance <- 'L2';
        }
        
        dup.ids <- which(duplicated(x))
        if (length(dup.ids) > 0) {
          max.vals <- abs(x[dup.ids,] * 0.01)
          x[dup.ids,] <- runif(length(x[dup.ids,]), -max.vals, max.vals)
        }
        
        if (distance=='L2') {
          if(verbose) message("running tSNE using ",n.cores," cores:\n")
          emb <- Rtsne::Rtsne(x, perplexity=perplexity, dims=dims, num_threads=n.cores, ... )$Y;
        } else {
          if(verbose) message('calculating distance ... ');
          if(verbose) message('pearson ...')
          d <- 1-cor(t(x))
          if(verbose) message("running tSNE using ",n.cores," cores:\n")
          emb <- Rtsne::Rtsne(d,is_distance=TRUE, perplexity=perplexity, dims=dims, num_threads=n.cores, ... )$Y;
        }
        rownames(emb) <- rownames(x)
        embeddings[[type]][[name]] <<- emb;
      } else if(embeddingType=='FR') {
        g <- graphs[[type]];
        if(is.null(g)){ stop(paste("generate KNN graph first (type=",type,")",sep=''))}
        emb <- layout.fruchterman.reingold(g, weights=E(g)$weight)
        rownames(emb) <- rownames(x); colnames(emb) <- c("D1","D2")
        embeddings[[type]][[name]] <<- emb;
      } else if (embeddingType == "UMAP") {
        if (!requireNamespace("uwot", quietly=TRUE))
          stop("You need to install package 'uwot' to be able to use UMAP embedding.")
        
        distance <- switch (distance, pearson = "cosine", L2 = "euclidean", distance)
        
        emb <- uwot::umap(as.matrix(x), metric=distance, verbose=verbose, n_threads=n.cores, n_sgd_threads=n.sgd.cores, n_components=dims, ...)
        rownames(emb) <- rownames(x)
        embeddings[[type]][[name]] <<- emb;
      } else if (embeddingType == "UMAP_graph") {
        g <- graphs[[type]];
        if(is.null(g)){ stop(paste("generate KNN graph first (type=",type,")",sep=''))}
        emb <- embedKnnGraphUmap(g, verbose=verbose, n_threads=n.cores, n_sgd_threads=n.sgd.cores, n_components=dims, ...)
        embeddings[[type]][[name]] <<- emb;
      } else {
        stop('unknown embeddingType ',embeddingType,' specified');
      }
      
      return(invisible(emb));
    }
  )
);

pagoda1(scde包)+pagoda2
最近在使用pagoda做单细胞的基因集分析，不得不说，算法是真的慢，，，一个快速跑的示例文件 scde下的pag...
单细胞转录组差异表达分析——DEsingle
上一篇说道SCDE包进行单细胞测序的差异表达分析。这一篇就来说说DEsingle这个包。1、安装先上包的官网链接h...
单细胞转录组数据分析课件||8. Differential ex
单细胞数据分析中的差异分析主要是找出每个亚群的marker基因，可以作为亚群的某个特性。此处省略SCDE的标准流...
单细胞转录组差异表达分析——SCDE
哎呀，终究还是绕不过差异表达分析。单细胞测序，测得还是转录组数据，和bluk data的差别就是单细胞测序存在许多...
闭包，闭包，闭包
1、这家伙到底是什么？网上关于这个的讨论的太多了太多了，有各种的举例子，但是大部分还在寻找这个答案的小伙伴对于变...
Golang-基础包
math包 strconv包 json包 time包
swift-闭包
闭包闭包定义闭包简化 - 尾随闭包闭包参数闭包返回值闭包的循环引用
npm使用手册
目录 # 包的管理# ----》创建包# ----》安装包# ----》删除包# ----》发布包# ----》更...
生信小组DAY6-JANE
R包R语言的包，常常听到各种名字，扩展包、程序包、软件包、函数包、算法包……不如单名一个“包（Package）”字...
“包治百病”却拯救不了你的“烂人生”
1. 包个包个包包那是什么包包个包个包包这个害人包一包一包一包包包催人傻妙龄青春女孩已经犯错了！！！包是什...