最近在使用pagoda做单细胞的基因集分析,不得不说,算法是真的慢,,,
pbmc3k数据集
5个线程跑满
用时为:
knn.error.models
user system elapsed
6033.772 23.041 2105.214
pagoda.varnorm
user system elapsed
3387.265 49.590 734.011
pagoda.pathway.wPCA
user system elapsed
8713.815 12737.006 1091.431
pagoda.gene.clusters
user system elapsed
11324.177 8925.026 1199.914
大概用时:2105+734+1091+1199=5129 约为1.4h
一个快速跑的示例文件
library(scde)
library(SeuratData)
data("pbmc3k")
#导入为count矩阵,同时要改为integer
pbmc3k<-apply(as.matrix(pbmc3k@assays$RNA@counts),2,function(x){storage.mode(x)<-"integer";x})
pbmc3k[1:6,1:66]
#为了加快速度,我取其中的1/10的细胞数用于后续分析
set.seed(2020)
pbmc3k<-pbmc3k[,sample(1:ncol(pbmc3k),round(ncol(pbmc3k)/10),replace = F)]
dim(pbmc3k)
#去掉低质量的细胞和基因
pbmc3k<-clean.counts(pbmc3k,min.lib.size = 200,min.reads = 10)
dim(pbmc3k)
#对每个细胞构建误差模型,
knn <- knn.error.models(pbmc3k,
#groups = ,可选
k = ncol(pbmc3k)/4, #猜测整个细胞群可以分为4个群
n.cores = 5, #用10个核
min.count.threshold = 2, #质控参数参数
min.nonfailed = 5, #质控参数
max.model.plots = 10)
#Normalizing variance
varinfo <- pagoda.varnorm(knn, #
counts = pbmc3k,
trim = 3/ncol(pbmc3k),
max.adj.var = 5,
n.cores = 5,
plot = TRUE)
#Controlling for sequencing depth
varinfo <- pagoda.subtract.aspect(varinfo, colSums(pbmc3k[, rownames(knn)]>0))
#Evaluate overdispersion of pre-defined gene sets
library(org.Hs.eg.db)
library(msigdbr)
C2<-msigdbr(category = "C2")
C2_kegg<-C2[grep("KEGG",C2$gs_subcat),]
C2_kegg<-split(C2_kegg$human_gene_symbol,C2_kegg$gs_name)
C2_kegg
C2_kegg<-list2env(C2_kegg)
#对每个基因集计算加权的第一个主成分的量级
pwpca <- pagoda.pathway.wPCA(varinfo,
C2_kegg,
n.components = 1,
n.cores = 5)
#对上述结果做统计学检验
df <- pagoda.top.aspects(pwpca,
return.table = F,
plot = F,
z.score = 2)
dim(df)
# #寻找denovo的基因集
clpca <- pagoda.gene.clusters(varinfo, trim = 7.1/ncol(varinfo$mat), n.clusters = 50, n.cores = 10, plot = TRUE)
df <- pagoda.top.aspects(pwpca, clpca, n.cells = NULL, z.score = qnorm(0.01/2, lower.tail = FALSE))
#列先聚类,在做热图(很奇怪!!)
hc <- pagoda.cluster.cells(df, varinfo)
pagoda.view.aspects(df,cell.clustering = hc, box = TRUE, labCol = NA, margins = c(0.5, 20))
pheatmap(pbmc3k[names(df$gw[order(df$gw,decreasing = T)][1:200]),hc$order],
scale = "row",
cluster_rows = F,breaks = seq(-5,5,10/100),cluster_cols = F)
scde下的pagoda1实在是太慢了!!!!!
无奈还是要使用pagoda2包进行pagoda分析
但是pagoda2的对象结构有些特殊,属于对象中嵌套操作函数。而比较坑爹的是这种嵌套的操作函数却没有对应的帮助文档,,,,
为此我去浏览了pagoda2工程文件的全部内容
image.png
当我们打开pagoda2/R路径时,进入存放所有脚本的地方
image.png
如果对一些说明文档中未提及的函数感兴趣,可以直接通过.R脚本查看函数的源代码!
testPathwayOverdispersion=function(setenv,
type='counts',
max.pathway.size=1e3,
min.pathway.size=10,
n.randomizations=5,
verbose=FALSE,
score.alpha=0.05,
plot=FALSE,
cells=NULL,
adjusted.pvalues=TRUE,
z.score = qnorm(0.05/2, lower.tail = FALSE),
use.oe.scale = FALSE, return.table=FALSE,
name='pathwayPCA',
correlation.distance.threshold=0.2,
loading.distance.threshold=0.01,
top.aspects=Inf,
recalculate.pca=FALSE,
save.pca=TRUE) {
nPcs <- 1;
if(type=='counts') {
x <- counts;
# apply scaling if using raw counts
x@x <- x@x*rep(misc[['varinfo']][colnames(x),'gsf'],diff(x@p))
} else {
if(!type %in% names(reductions)) { stop("reduction ",type,' not found')}
x <- reductions[[type]]
}
if(!is.null(cells)) {
x <- x[cells,]
}
proper.gene.names <- colnames(x);
if(is.null(misc[['pwpca']]) || recalculate.pca) {
if(verbose) {
message("determining valid pathways")
}
# determine valid pathways
gsl <- ls(envir = setenv)
gsl.ng <- unlist(mclapply(sn(gsl), function(go) sum(unique(get(go, envir = setenv)) %in% proper.gene.names),mc.cores=n.cores,mc.preschedule=TRUE))
gsl <- gsl[gsl.ng >= min.pathway.size & gsl.ng<= max.pathway.size]
names(gsl) <- gsl
if(verbose) {
message("processing ", length(gsl), " valid pathways")
}
cm <- Matrix::colMeans(x)
pwpca <- papply(gsl, function(sn) {
lab <- proper.gene.names %in% get(sn, envir = setenv)
if(sum(lab)<1) { return(NULL) }
pcs <- irlba(x[,lab], nv=nPcs, nu=0, center=cm[lab])
pcs$d <- pcs$d/sqrt(nrow(x))
pcs$rotation <- pcs$v;
pcs$v <- NULL;
# get standard deviations for the random samples
ngenes <- sum(lab)
z <- do.call(rbind,lapply(seq_len(n.randomizations), function(i) {
si <- sample(ncol(x), ngenes)
pcs <- irlba(x[,si], nv=nPcs, nu=0, center=cm[si])$d
}))
z <- z/sqrt(nrow(x));
# local normalization of each component relative to sampled PC1 sd
avar <- pmax(0, (pcs$d^2-mean(z[, 1]^2))/sd(z[, 1]^2))
if(avar>0.5) {
# flip orientations to roughly correspond with the means
pcs$scores <- as.matrix(t(x[,lab] %*% pcs$rotation) - as.numeric((cm[lab] %*% pcs$rotation)))
cs <- unlist(lapply(seq_len(nrow(pcs$scores)), function(i) sign(cor(pcs$scores[i,], colMeans(t(x[, lab, drop = FALSE])*abs(pcs$rotation[, i]))))))
pcs$scores <- pcs$scores*cs
pcs$rotation <- pcs$rotation*cs
rownames(pcs$rotation) <- colnames(x)[lab];
} # don't bother otherwise - it's not significant
return(list(xp=pcs,z=z,n=ngenes))
}, n.cores = n.cores,mc.preschedule=TRUE)
if(save.pca) {
misc[['pwpca']] <<- pwpca;
}
} else {
if(verbose) {
message("re-using previous overdispersion calculations")
pwpca <- misc[['pwpca']];
}
}
if(verbose) {
message("scoring pathway overdispersion signifcance")
}
# score overdispersion
true.n.cells <- nrow(x)
pagoda.effective.cells <- function(pwpca, start = NULL) {
n.genes <- unlist(lapply(pwpca, function(x) rep(x$n, nrow(x$z))))
var <- unlist(lapply(pwpca, function(x) x$z[, 1]))
if(is.null(start)) { start <- true.n.cells*2 } # start with a high value
of <- function(p, v, sp) {
sn <- p[1]
vfit <- (sn+sp)^2/(sn*sn+1/2) -1.2065335745820*(sn+sp)*((1/sn + 1/sp)^(1/3))/(sn*sn+1/2)
residuals <- (v-vfit)^2
return(sum(residuals))
}
x <- nlminb(objective = of, start = c(start), v = var, sp = sqrt(n.genes-1/2), lower = c(1), upper = c(true.n.cells))
return((x$par)^2+1/2)
}
n.cells <- pagoda.effective.cells(pwpca)
vdf <- data.frame(do.call(rbind, lapply(seq_along(pwpca), function(i) {
vars <- as.numeric((pwpca[[i]]$xp$d))
cbind(i = i, var = vars, n = pwpca[[i]]$n, npc = seq(1:ncol(pwpca[[i]]$xp$rotation)))
})))
# fix p-to-q mistake in qWishartSpike
qWishartSpikeFixed <- function (q, spike, ndf = NA, pdim = NA, var = 1, beta = 1, lower.tail = TRUE, log.p = FALSE) {
params <- RMTstat::WishartSpikePar(spike, ndf, pdim, var, beta)
qnorm(q, mean = params$centering, sd = params$scaling, lower.tail, log.p)
}
# add right tail approximation to ptw, which gives up quite early
pWishartMaxFixed <- function (q, ndf, pdim, var = 1, beta = 1, lower.tail = TRUE) {
params <- RMTstat::WishartMaxPar(ndf, pdim, var, beta)
q.tw <- (q - params$centering)/(params$scaling)
p <- RMTstat::ptw(q.tw, beta, lower.tail, log.p = TRUE)
p[p == -Inf] <- pgamma((2/3)*q.tw[p == -Inf]^(3/2), 2/3, lower.tail = FALSE, log.p = TRUE) + lgamma(2/3) + log((2/3)^(1/3))
p
}
vshift <- 0
ev <- 0
vdf$var <- vdf$var-(vshift-ev)*vdf$n
basevar <- 1
vdf$exp <- RMTstat::qWishartMax(0.5, n.cells, vdf$n, var = basevar, lower.tail = FALSE)
#vdf$z <- qnorm(pWishartMax(vdf$var, n.cells, vdf$n, log.p = TRUE, lower.tail = FALSE, var = basevar), lower.tail = FALSE, log.p = TRUE)
vdf$z <- qnorm(pWishartMaxFixed(vdf$var, n.cells, vdf$n, lower.tail = FALSE, var = basevar), lower.tail = FALSE, log.p = TRUE)
vdf$cz <- qnorm(bh.adjust(pnorm(as.numeric(vdf$z), lower.tail = FALSE, log.p = TRUE), log = TRUE), lower.tail = FALSE, log.p = TRUE)
vdf$ub <- RMTstat::qWishartMax(score.alpha/2, n.cells, vdf$n, var = basevar, lower.tail = FALSE)
vdf$ub.stringent <- RMTstat::qWishartMax(score.alpha/nrow(vdf)/2, n.cells, vdf$n, var = basevar, lower.tail = FALSE)
if(plot) {
par(mfrow = c(1, 1), mar = c(3.5, 3.5, 1.0, 1.0), mgp = c(2, 0.65, 0))
un <- sort(unique(vdf$n))
on <- order(vdf$n, decreasing = FALSE)
pccol <- colorRampPalette(c("black", "grey70"), space = "Lab")(max(vdf$npc))
plot(vdf$n, vdf$var/vdf$n, xlab = "gene set size", ylab = "PC1 var/n", ylim = c(0, max(vdf$var/vdf$n)), col = adjustcolor(pccol[vdf$npc],alpha=0.1),pch=19)
lines(vdf$n[on], (vdf$exp/vdf$n)[on], col = 2, lty = 1)
lines(vdf$n[on], (vdf$ub.stringent/vdf$n)[on], col = 2, lty = 2)
}
rs <- (vshift-ev)*vdf$n
vdf$oe <- (vdf$var+rs)/(vdf$exp+rs)
vdf$oec <- (vdf$var+rs)/(vdf$ub+rs)
df <- data.frame(name = names(pwpca)[vdf$i], npc = vdf$npc, n = vdf$n, score = vdf$oe, z = vdf$z, adj.z = vdf$cz, stringsAsFactors = FALSE)
if(adjusted.pvalues) {
vdf$valid <- vdf$cz >= z.score
} else {
vdf$valid <- vdf$z >= z.score
}
if(!any(vdf$valid)) { stop("no significantly overdispersed pathways found at z.score threshold of ",z.score) };
# apply additional filtering based on >0.5 sd above the local random estimate
vdf$valid <- vdf$valid & unlist(lapply(pwpca,function(x) !is.null(x$xp$scores)))
vdf$name <- names(pwpca)[vdf$i];
if(return.table) {
df <- df[vdf$valid, ]
df <- df[order(df$score, decreasing = TRUE), ]
return(df)
}
if(verbose) {
message("compiling pathway reduction")
}
# calculate pathway reduction matrix
# return scaled patterns
xmv <- do.call(rbind, lapply(pwpca[vdf$valid], function(x) {
xm <- x$xp$scores
}))
if(use.oe.scale) {
xmv <- (xmv -rowMeans(xmv))* (as.numeric(vdf$oe[vdf$valid])/sqrt(apply(xmv, 1, var)))
vdf$sd <- as.numeric(vdf$oe)
} else {
# chi-squared
xmv <- (xmv-rowMeans(xmv)) * sqrt((qchisq(pnorm(vdf$z[vdf$valid], lower.tail = FALSE, log.p = TRUE), n.cells, lower.tail = FALSE, log.p = TRUE)/n.cells)/apply(xmv, 1, var))
vdf$sd <- sqrt((qchisq(pnorm(vdf$z, lower.tail = FALSE, log.p = TRUE), n.cells, lower.tail = FALSE, log.p = TRUE)/n.cells))
}
rownames(xmv) <- paste("#PC", vdf$npc[vdf$valid], "# ", names(pwpca)[vdf$i[vdf$valid]], sep = "")
rownames(vdf) <- paste("#PC", vdf$npc, "# ", vdf$name, sep = "")
misc[['pathwayODInfo']] <<- vdf
# collapse gene loading
if(verbose) {
message("clustering aspects based on gene loading ... ",appendLF=FALSE)
}
tam2 <- pagoda.reduce.loading.redundancy(list(xv=xmv,xvw=matrix(1,ncol=ncol(xmv),nrow=nrow(xmv))),pwpca,NULL,plot=FALSE,distance.threshold=loading.distance.threshold,n.cores=n.cores)
if(verbose) {
message(nrow(tam2$xv)," aspects remaining")
}
if(verbose) {
message("clustering aspects based on pattern similarity ... ",appendLF=FALSE)
}
tam3 <- pagoda.reduce.redundancy(tam2,distance.threshold=correlation.distance.threshold,top=top.aspects)
if(verbose) {
message(nrow(tam3$xv)," aspects remaining\n")
}
tam2$xvw <- tam3$xvw <- NULL; # to save space
tam3$env <- setenv;
# clean up aspect names, as GO ids are meaningless
names(tam3$cnam) <- rownames(tam3$xv) <- paste0('aspect',1:nrow(tam3$xv))
misc[['pathwayOD']] <<- tam3;
reductions[[name]] <<- tam3$xv;
return(invisible(tam3))
},
#' @useDynLib pagoda2
#' @import MASS
#' @import Matrix
#' @importFrom Rcpp evalCpp
#' @import Rook
#' @import igraph
#' @import sccore
#' @importFrom irlba irlba
#' @importFrom parallel mclapply
#' @importFrom Rcpp sourceCpp
#' @importFrom magrittr %>%
NULL
#' A Reference Class, which holds and process single cell RNA-seq data.
#'
#' @field counts gene count matrix, normalized on total counts
#' @field clusters results of clustering
#' @field graphs graph representations of the dataset
#' @field reductions results of reductions, i.e. PCA
#' @field embeddings results of visualization algorithms, i.e. t-SNE or largeVis
#' @field diffgenes lists of differentially expressed genes
#' @field pathways pathway information
#' @field n.cores number of cores, used for the analyses
#' @field misc a list of miscelaneous structures
#' @field batch batch factor for the dataset
#' @field modelType plain (default) or raw (expression matrix taken as is without normalization, though log.scale still applies)
#' @field verbose verbosity level
#' @field depth cell size factor
#' @field genegraphs a slot to store graphical representations in gene space (i.e. gene kNN graphs)
#' @export Pagoda2
#' @exportClass Pagoda2
Pagoda2 <- setRefClass(
"Pagoda2",
fields=c('counts','clusters','graphs','reductions','embeddings','diffgenes',
'pathways','n.cores','misc','batch','modelType','verbose','depth','batchNorm','mat','genegraphs'),
methods = list(
initialize=function(x, ..., modelType='plain', batchNorm='glm',
n.cores=parallel::detectCores(logical=FALSE), verbose=TRUE,
min.cells.per.gene=0, trim=round(min.cells.per.gene/2), min.transcripts.per.cell=10,
lib.sizes=NULL, log.scale=TRUE, keep.genes = NULL) {
# # init all the output lists
embeddings <<- list();
graphs <<- list();
diffgenes <<- list();
reductions <<-list();
clusters <<- list();
pathways <<- list();
genegraphs <<- list();
misc <<-list(lib.sizes=lib.sizes, log.scale=log.scale, model.type=modelType, trim=trim);
batch <<- NULL;
counts <<- NULL;
if(!missing(x) && ('Pagoda2' %in% class(x))) { # copy constructor
callSuper(x, ..., modelType=modelType, batchNorm=batchNorm, n.cores=n.cores);
} else {
callSuper(..., modelType=modelType, batchNorm=batchNorm, n.cores=n.cores,verbose=verbose);
if(!missing(x) && is.null(counts)) { # interpret x as a countMatrix
if ('matrix' %in% class(x)) {
x <- as(Matrix(x, sparse=TRUE), "dgCMatrix")
}
if(!('dgCMatrix' %in% class(x))) {
stop("x is not of class dgCMatrix or matrix");
}
#if(any(x@x < 0)) {
# stop("x contains negative values");
#}
setCountMatrix(x, min.cells.per.gene=min.cells.per.gene, trim=trim,
min.transcripts.per.cell=min.transcripts.per.cell, lib.sizes=lib.sizes,
log.scale=log.scale, keep.genes=keep.genes)
}
}
},
# provide the initial count matrix, and estimate deviance residual matrix (correcting for depth and batch)
setCountMatrix=function(countMatrix, depthScale=1e3, min.cells.per.gene=30,
trim=round(min.cells.per.gene/2), min.transcripts.per.cell=10,
lib.sizes=NULL, log.scale=FALSE, keep.genes = NULL) {
# check names
if(any(duplicated(rownames(countMatrix)))) {
stop("duplicate gene names are not allowed - please reduce")
}
if(any(duplicated(colnames(countMatrix)))) {
stop("duplicate cell names are not allowed - please reduce")
}
if(any(is.na(rownames(countMatrix)))) {
stop("NA gene names are not allowed - please fix")
}
if(any(is.na(colnames(countMatrix)))) {
stop("NA cell names are not allowed - please fix")
}
if(ncol(countMatrix)<3) { stop("too few cells remaining after min.count.per.cell filter applied - have you pre-filtered the count matrix to include only cells of a realistic size?") }
counts <<- t(countMatrix)
# Keep genes of sufficient coverage or genes that are in the keep.genes list
counts <<- counts[,diff(counts@p) >= min.cells.per.gene | colnames(counts) %in% keep.genes]
# Save the filtered count matrix in misc$rawCounts
misc[['rawCounts']] <<- counts;
misc$depthScale <<- depthScale;
if (modelType == 'raw') {
return()
}
if(!is.null(batch)) {
if(!all(colnames(countMatrix) %in% names(batch))) {
stop("the supplied batch vector doesn't contain all the cells in its names attribute")
}
colBatch <- as.factor(batch[colnames(countMatrix)])
batch <<- colBatch;
}
if(!is.null(lib.sizes)) {
if(!all(colnames(countMatrix) %in% names(lib.sizes))) {
stop("the supplied lib.sizes vector doesn't contain all the cells in its names attribute")
}
lib.sizes <- lib.sizes[colnames(countMatrix)]
depth <<- lib.sizes/mean(lib.sizes)*mean(Matrix::colSums(countMatrix))
} else {
depth <<- Matrix::colSums(countMatrix);
}
cell.filt.mask <- (depth >= min.transcripts.per.cell)
counts <<- counts[cell.filt.mask,]
depth <<- depth[cell.filt.mask]
if (any(depth == 0)) {
stop("Cells with zero expression over all genes are not allowed")
}
if(verbose) message(nrow(counts)," cells, ",ncol(counts)," genes; normalizing ... ")
# get normalized matrix
if(modelType=='linearObs') { # this shoudln't work well, since the depth dependency is not completely normalized out
# winsorize in normalized space first in hopes of getting a more stable depth estimate
if(trim>0) {
counts <<- counts / as.numeric(depth);
inplaceWinsorizeSparseCols(counts,trim,n.cores);
counts <<- counts*as.numeric(depth);
if(is.null(lib.sizes)) {
depth <<- round(Matrix::rowSums(counts))
}
}
ldepth <- log(depth);
# rank cells, cut into n pieces
n.depth.slices <- 20;
#depth.fac <- as.factor(floor(rank(depth)/(length(depth)+1)*n.depth.slices)+1); names(depth.fac) <- rownames(counts);
depth.fac <- cut(cumsum(sort(depth)),breaks=seq(0,sum(depth),length.out=n.depth.slices)); names(depth.fac) <- rownames(counts);
depth.fac <- depth.fac[rank(depth)]
# dataset-wide gene average
gene.av <- (Matrix::colSums(counts)+n.depth.slices)/(sum(depth)+n.depth.slices)
# pooled counts, df for all genes
tc <- colSumByFac(counts,as.integer(depth.fac))[-1,,drop=FALSE]
tc <- log(tc+1)- log(as.numeric(tapply(depth,depth.fac,sum))+1)
md <- log(as.numeric(tapply(depth,depth.fac,mean)))
# combined lm
cm <- lm(tc ~ md)
colnames(cm$coef) <- colnames(counts)
# adjust counts
# predict log(p) for each non-0 entry
count.gene <- rep(1:counts@Dim[2],diff(counts@p))
exp.x <- exp(log(gene.av)[count.gene] - cm$coef[1,count.gene] - ldepth[counts@i+1]*cm$coef[2,count.gene])
counts@x <<- as.numeric(counts@x*exp.x/(depth[counts@i+1]/depthScale)); # normalize by depth as well
# performa another round of trim
if(trim>0) {
inplaceWinsorizeSparseCols(counts,trim,n.cores);
}
# regress out on non-0 observations of ecah gene
#non0LogColLmS(counts,mx,ldepth)
} else if(modelType=='plain') {
if(verbose) message("using plain model ")
if(!is.null(batch)) {
if(verbose) message("batch ... ")
# dataset-wide gene average
gene.av <- (Matrix::colSums(counts)+length(levels(batch)))/(sum(depth)+length(levels(batch)))
# pooled counts, df for all genes
tc <- colSumByFac(counts,as.integer(batch))[-1,,drop=FALSE]
tc <- t(log(tc+1)- log(as.numeric(tapply(depth,batch,sum))+1))
bc <- exp(tc-log(gene.av))
# adjust every non-0 entry
count.gene <- rep(1:counts@Dim[2],diff(counts@p))
counts@x <<- as.numeric(counts@x/bc[cbind(count.gene,as.integer(batch)[counts@i+1])])
}
if(trim>0) {
if(verbose) message("winsorizing ... ")
counts <<- counts/as.numeric(depth);
inplaceWinsorizeSparseCols(counts,trim,n.cores);
counts <<- counts*as.numeric(depth);
if(is.null(lib.sizes)) {
depth <<- round(Matrix::rowSums(counts))
}
}
counts <<- counts/as.numeric(depth/depthScale);
} else {
stop('modelType ',modelType,' is not implemented');
}
if(log.scale) {
if(verbose) message("log scale ... ")
counts@x <<- as.numeric(log(counts@x+1))
}
misc[['rescaled.mat']] <<- NULL;
if(verbose) message("done.\n")
},
# adjust variance of the residual matrix, determine overdispersed sites
adjustVariance=function(gam.k=5, alpha=5e-2, plot=FALSE, use.raw.variance=(.self$modelType=='raw'), use.unadjusted.pvals=FALSE, do.par=TRUE, max.adjusted.variance=1e3, min.adjusted.variance=1e-3, cells=NULL, verbose=TRUE, min.gene.cells=0, persist=is.null(cells), n.cores = .self$n.cores) {
#persist <- is.null(cells) # persist results only if variance normalization is performed for all cells (not a subset)
if(!is.null(cells)) { # translate cells into a rowSel boolean vector
if(!(is.logical(cells) && length(cells)==nrow(counts))) {
if(is.character(cells) || is.integer(cells)) {
rowSel <- rep(FALSE,nrow(counts)); names(rowSel) <- rownames(counts); rowSel[cells] <- TRUE;
} else {
stop("cells argument must be either a logical vector over rows of the count matrix (cells), a vector of cell names or cell integer ids (row numbers)");
}
}
} else {
rowSel <- NULL;
}
if(verbose) message("calculating variance fit ...")
df <- colMeanVarS(counts,rowSel,n.cores);
if(use.raw.variance) { # use raw variance estimates without relative adjustments
rownames(df) <- colnames(counts);
vi <- which(is.finite(df$v) & df$nobs>=min.gene.cells);
df$lp <- df$lpa <- log(df$v);
df$qv <- df$v
df$gsf <- 1; # no rescaling of variance
ods <- order(df$v,decreasing=TRUE); if(length(ods)>1e3) { ods <- ods[1:1e3] }
if(persist) misc[['odgenes']] <<- rownames(df)[ods];
} else {
# gene-relative normalizaton
df$m <- log(df$m); df$v <- log(df$v);
rownames(df) <- colnames(counts);
vi <- which(is.finite(df$v) & df$nobs>=min.gene.cells);
if(length(vi)<gam.k*1.5) { gam.k=1 };# too few genes
if(gam.k<2) {
if(verbose) message(" using lm ")
m <- lm(v ~ m, data = df[vi,])
} else {
if(verbose) message(" using gam ")
m <- mgcv::gam(as.formula(paste0('v ~ s(m, k = ',gam.k,')')), data = df[vi,])
}
df$res <- -Inf; df$res[vi] <- resid(m,type='response')
n.obs <- df$nobs; #diff(counts@p)
suppressWarnings(df$lp <- as.numeric(pf(exp(df$res),n.obs,n.obs,lower.tail=FALSE,log.p=TRUE)))
df$lpa <- bh.adjust(df$lp,log=TRUE)
n.cells <- nrow(counts)
df$qv <- as.numeric(qchisq(df$lp, n.cells-1, lower.tail = FALSE, log.p=TRUE)/n.cells)
if(use.unadjusted.pvals) {
ods <- which(df$lp<log(alpha))
} else {
ods <- which(df$lpa<log(alpha))
}
if(persist) misc[['odgenes']] <<- rownames(df)[ods];
if(verbose) message(length(ods),' overdispersed genes ... ',length(ods) )
df$gsf <- geneScaleFactors <- sqrt(pmax(min.adjusted.variance,pmin(max.adjusted.variance,df$qv))/exp(df$v));
df$gsf[!is.finite(df$gsf)] <- 0;
}
if(persist) {
if(verbose) message('persisting ... ')
misc[['varinfo']] <<- df;
}
# rescale mat variance
## if(rescale.mat) {
## if(verbose) cat("rescaling signal matrix ... ")
## #df$gsf <- geneScaleFactors <- sqrt(1/exp(df$v));
## inplaceColMult(counts,geneScaleFactors,rowSel); # normalize variance of each gene
## #inplaceColMult(counts,rep(1/mean(Matrix::colSums(counts)),ncol(counts))); # normalize the column sums to be around 1
## if(persist) misc[['rescaled.mat']] <<- geneScaleFactors;
## }
if(plot) {
if(do.par) {
par(mfrow=c(1,2), mar = c(3.5,3.5,2.0,0.5), mgp = c(2,0.65,0), cex = 1.0);
}
## convert to log10 from natural log() by factor log10(exp(1))
smoothScatter(log10(exp(1))*df$m,log10(exp(1))*df$v,main='',xlab='log10[ magnitude ]',ylab='log10[ variance ]')
grid <- seq(min(log10(exp(1))*df$m[vi]),max(log10(exp(1))*df$m[vi]),length.out=1000)
lines(grid,predict(m,newdata=data.frame(m=grid)),col="blue")
## NOTE: overdispersed genes ods calcuated with log()-based corrections...
if(length(ods)>0) {
points(log10(exp(1))*df$m[ods],log10(exp(1))*df$v[ods],pch='.',col=2,cex=1)
}
smoothScatter(log10(exp(1))*df$m[vi],log10(exp(1))*df$qv[vi],xlab='log10[ magnitude ]',ylab='',main='adjusted')
abline(h=1,lty=2,col=8)
if(is.finite(max.adjusted.variance)) { abline(h=max.adjusted.variance,lty=2,col=1) }
points(log10(exp(1))*df$m[ods],log10(exp(1))*df$qv[ods],col=2,pch='.')
}
if(verbose) message("done.\n")
return(invisible(df));
},
# make a Knn graph
# note: for reproducibility, set.seed() and set n.cores=1
makeKnnGraph=function(k=30,nrand=1e3,type='counts',weight.type='1m',odgenes=NULL,n.cores=.self$n.cores,distance='cosine',center=TRUE,x=NULL,verbose=TRUE,p=NULL, var.scale=(type == "counts")) {
if(is.null(x)) {
x.was.given <- FALSE;
if(type=='counts') {
x <- counts;
# Scale Raw counts
} else {
if(type %in% names(reductions)) {
x <- reductions[[type]];
} else {
stop('Specified reduction does not exist');
}
}
if (var.scale) {
x@x <- x@x*rep(misc[['varinfo']][colnames(x),'gsf'],diff(x@p))
}
if(!is.null(odgenes)) {
if(!all(odgenes %in% rownames(x))) { warning("not all of the provided odgenes are present in the selected matrix")}
if(verbose) message("using provided odgenes ... ")
x <- x[,odgenes]
}
} else { # is.null(x)
x.was.given <- TRUE;
}
if(distance %in% c('cosine','angular')) {
if(center) {
x<- x - Matrix::rowMeans(x) # centering for consine distance
}
xn <- n2Knn(as.matrix(x), k, nThreads=n.cores, verbose=verbose, indexType='angular')
} else if(distance %in% c('L2','euclidean')) {
xn <- n2Knn(as.matrix(x), k, nThreads=n.cores, verbose=verbose, indexType='L2')
} else {
stop("unknown distance measure specified. Currently supported: angular, L2")
}
colnames(xn) <- rownames(xn) <- rownames(x);
#if(weight.type=='rank') {
# xn$r <- unlist(lapply(diff(c(0,which(diff(xn$s)>0),nrow(xn))),function(x) seq(x,1)))
#}
#xn <- xn[!xn$s==xn$e,]
diag(xn) <- 0;
xn <- Matrix::drop0(xn);
#if(n.cores==1) { # for reproducibility, sort by node names
# if(verbose) cat("ordering neighbors for reproducibility ... ");
# xn <- xn[order(xn$s+xn$e),]
# if(verbose) cat("done\n");
#}
#df <- data.frame(from=rownames(x)[xn$s+1],to=rownames(x)[xn$e+1],weight=xn$d,stringsAsFactors=F)
#if(weight.type=='rank') { df$rank <- xn$r }
if(weight.type %in% c("cauchy","normal") && ncol(x)>sqrt(nrand)) {
# generate some random pair data for scaling
if(distance=='cosine') {
#rd <- na.omit(apply(cbind(sample(colnames(x),nrand,replace=T),sample(colnames(x),nrand,replace=T)),1,function(z) if(z[1]==z[2]) {return(NA); } else {1-cor(x[,z[1]],x[,z[2]])}))
rd <- na.omit(apply(cbind(sample(colnames(x),nrand,replace=TRUE),sample(colnames(x),nrand,replace=TRUE)),1,function(z) if(z[1]==z[2]) {return(NA); } else {1-sum(x[,z[1]]*x[,z[2]])/sqrt(sum(x[,z[1]]^2)*sum(x[,z[2]]^2))}))
} else if(distance=='JS') {
rd <- na.omit(apply(cbind(sample(colnames(x),nrand,replace=TRUE),sample(colnames(x),nrand,replace=TRUE)),1,function(z) if(z[1]==z[2]) {return(NA); } else {jw.disR(x[,z[1]],x[,z[2]])}))
} else if(distance=='L2') {
rd <- na.omit(apply(cbind(sample(colnames(x),nrand,replace=TRUE),sample(colnames(x),nrand,replace=TRUE)),1,function(z) if(z[1]==z[2]) {return(NA); } else {sqrt(sum((x[,z[1]]-x[,z[2]])^2))}))
} else if(distance=='L1') {
rd <- na.omit(apply(cbind(sample(colnames(x),nrand,replace=TRUE),sample(colnames(x),nrand,replace=TRUE)),1,function(z) if(z[1]==z[2]) {return(NA); } else {sum(abs(x[,z[1]]-x[,z[2]]))}))
}
suppressWarnings(rd.model <- MASS::fitdistr(rd,weight.type))
if(weight.type=='cauchy') {
xn@x <- 1/pcauchy(xn@x,location=rd.model$estimate['location'],scale=rd.model$estimate['scale'])-1
} else {
xn@x <- 1/pnorm(xn@x,mean=rd.model$estimate['mean'],sd=rd.model$estimate['sd'])-1
}
}
xn@x <- pmax(0,xn@x);
if(weight.type=='constant') { xn@x <- 1}
if(weight.type=='1m') { xn@x <- pmax(0,1-xn@x) }
#if(weight.type=='rank') { xn@x <- sqrt(df$rank) };
# make a weighted edge matrix for the largeVis as well
sxn <- (xn+t(xn))/2;
g <- igraph::graph_from_adjacency_matrix(sxn,mode='undirected',weighted=TRUE)
if(!x.was.given) {
if(is.null(misc[['edgeMat']])) { misc[['edgeMat']] <<- list() }
misc[['edgeMat']][[type]] <<- xn;
graphs[[type]] <<- g;
}
return(invisible(g))
},
# calculate clusters based on the kNN graph
getKnnClusters=function(type='counts',method=igraph::multilevel.community, name='community', subsampling.rate=0.8, n.subsamplings=10, cluster.stability.threshold=0.95, n.cores=.self$n.cores, g=NULL, min.cluster.size=1, persist=TRUE, return.details=FALSE, ...) {
if(is.null(g)) {
if(is.null(graphs[[type]])) { stop("call makeKnnGraph(type='",type,"', ...) first")}
g <- graphs[[type]];
}
if(is.null(method)) {
if(length(vcount(g))<2000) {
method <- igraph::infomap.community;
} else {
method <- igraph::multilevel.community;
}
}
# method <- igraph::multilevel.community; n.cores <- 20
# n.subsamplings <- 10; cluster.stability.dilution <- 1.5; cluster.stability.fraction <- 0.9; subsampling.rate <- 0.8; metaclustering.method<- 'ward.D'
# g <- r$graphs$PCA
# x <- r$counts
# cls <- method(g)
#library(parallel)
#x <- mclapply(1:5,function(z) method(g),mc.cores=20)
cls <- method(g,...)
cls.groups <- as.factor(membership(cls));
# cleanup the clusters to remove very small ones
if(min.cluster.size>1) {
cn <- names(cls.groups);
vg <- which(unlist(tapply(cls.groups,cls.groups,length))>=min.cluster.size);
cls.groups <- as.integer(cls.groups); cls.groups[!cls.groups %in% vg] <- NA;
cls.groups <- as.factor(cls.groups);
names(cls.groups) <- cn;
}
if(persist) {
clusters[[type]][[name]] <<- cls.groups;
misc[['community']][[type]][[name]] <<- cls;
}
return(invisible(cls))
},
geneKnnbyPCA = function() {
warning('geneKnnbyPCA is deprecated use makeGeneKnnGraph() instead');
.self$makeGeneKnnGraph();
},
# take a given clustering and generate a hierarchical clustering
# TODO: add support for innerOrder
getHierarchicalDiffExpressionAspects = function(type='counts',groups=NULL,clusterName=NULL,method='ward.D', dist='pearson', persist=TRUE, z.threshold=2, n.cores=.self$n.cores, min.set.size=5 ) {
if(type=='counts') {
x <- counts;
} else {
x <- reductions[[type]];
}
if(is.null(groups)) {
# retrieve clustering
if(is.null(clusters)) stop("please generate clusters first")
if(is.null(clusters[[type]])) stop(paste("please generate clusters for",type,"first"))
if(is.null(clusterName)) { # use the first clustering
if(length(clusters[[type]])<1) stop(paste("please generate clusters for",type,"first"))
cl <- clusters[[type]][[1]];
} else {
cl <- clusters[[type]][[clusterName]]
if(is.null(cl)) stop(paste("unable to find clustering",clusterName,'for',type))
if(verbose) message("using ",clusterName," clustering for ",type," space\n")
}
} else {
if(!all(rownames(x) %in% names(groups))) { warning("provided cluster vector doesn't list groups for all of the cells")}
cl <- groups;
}
cl <- as.factor(cl[match(rownames(x),names(cl))]);
if(dist %in% c('pearson','spearman')) {
rx <- do.call(cbind,tapply(1:nrow(x),cl,function(ii) {
if (length(ii) > 1) {
Matrix::colMeans(x[ii,])
} else {
x[ii,]
}
}))
d <- as.dist(1-cor(rx,method=dist))
} else if(dist=='euclidean') {
rx <- do.call(rbind,tapply(1:nrow(x),cl,function(ii) {
if (legnth(ii) > 1) {
Matrix::colMeans(x[ii,])
} else {
x[ii,]
}
}))
d <- dist(rx)
} else if(dist=='JS') {
# this one has to be done on counts, even if we're working with a reduction
cl <- as.factor(cl[match(rownames(misc[['rawCounts']]),names(cl))]);
lvec <- colSumByFac(misc[['rawCounts']],as.integer(cl))[-1,] + 1
d <- jsDist(t(lvec/pmax(1,Matrix::rowSums(lvec))));
colnames(d) <- rownames(d) <- which(table(cl)>0)
d <- as.dist(d)
} else {
stop("unknown distance",dist,"requested")
}
dd <- as.dendrogram(hclust(d,method=method))
# walk down the dendrogram to generate diff. expression on every split
diffcontrasts <- function(l,env) {
v <- mget("contrasts",envir=env,ifnotfound=0)[[1]]
if(!is.list(v)) v <- list();
if(is.leaf(l)) return(NULL)
lcl <- rep(NA,nrow(x));
names(lcl) <- rownames(x)
lcl[names(lcl) %in% names(cl)[cl %in% unlist(l[[1]])]] <- paste(unlist(l[[1]]),collapse='.');
lcl[names(lcl) %in% names(cl)[cl %in% unlist(l[[2]])]] <- paste(unlist(l[[2]]),collapse='.');
v <- c(v,list(as.factor(lcl)))
assign("contrasts",v,envir=env)
return(1);
}
de <- environment()
assign('contrasts',NULL,envir=de)
dc <- dendrapply(dd,diffcontrasts,env=de)
dc <- get("contrasts",env=de)
names(dc) <- unlist(lapply(dc,function(x) paste(levels(x),collapse=".vs.")))
#dexp <- papply(dc,function(x) getDifferentialGenes(groups=x,z.threshold=z.threshold),n.cores=n.cores)
x <- counts;
x@x <- x@x*rep(misc[['varinfo']][colnames(x),'gsf'],diff(x@p)); # apply variance scaling
x <- t(x)
dexp <- papply(dc,function(g) {
dg <- getDifferentialGenes(groups=g,z.threshold=z.threshold)
dg <- lapply(dg,function(x) x[x$Z>=z.threshold,])
# calculate average profiles
x <- x[rownames(x) %in% unlist(lapply(dg,rownames)),]
if(nrow(x)<1) return(NULL);
x <- x-rowMeans(x[,!is.na(g)])
sf <- rep(1,nrow(x)); names(sf) <- rownames(x);
if(nrow(dg[[1]])>0) {
ig <- which(names(sf) %in% rownames(dg[[1]]))
sf[ig] <- 1/length(ig)
}
if(nrow(dg[[2]])>0) {
ig <- which(names(sf) %in% rownames(dg[[2]]))
sf[ig] <- -1/length(ig)
}
sf <- sf*sqrt(length(sf))
pt <- colSums(x*sf)
pt[is.na(g)] <- 0;
return(list(dg=dg,pt=pt))
},n.cores=n.cores)
dexp <- dexp[!unlist(lapply(dexp,is.null))]; # remove cases where nothing was reported
dexp <- dexp[!unlist(lapply(dexp,function(x){class(x) == 'try-error'}))] ## remove cases that failed
# fake pathwayOD output
tamr <- list(xv=do.call(rbind,lapply(dexp,function(x) x$pt)),
cnam=lapply(sn(names(dexp)),function(n) c(n)))
dgl <- lapply(dexp,function(d) as.character(unlist(lapply(d$dg,function(x) rownames(x)[x$Z>=z.threshold]))))
tamr$env <- list2env(dgl[unlist(lapply(dgl,length))>=min.set.size])
misc[['pathwayOD']] <<- tamr;
# fake pathwayODInfo
zs <- unlist(lapply(dexp,function(x) max(unlist(lapply(x$dg,function(y) y$Z)))))
mval <- unlist(lapply(dexp,function(x) max(unlist(lapply(x$dg,function(y) y$M)))))
vdf <- data.frame(i=1:nrow(tamr$xv),npc=1,valid=TRUE,sd=apply(tamr$xv,1,sd),cz=zs,z=zs,oe=mval,n=unlist(lapply(dexp,function(x) sum(unlist(lapply(x$dg,nrow))))))
vdf$name <- rownames(vdf) <- names(dexp);
misc[['pathwayODInfo']] <<- vdf
return(invisible(tamr))
},
# Calculates gene Knn network for gene similarity
# Author: Simon Steiger
makeGeneKnnGraph = function(nPcs = 100, scale =TRUE , center=TRUE, fastpath =TRUE, maxit =1000, k = 30, n.cores = .self$n.cores, verbose =TRUE) {
# Transpose first
x <- t(counts);
# TODO: factor out gene PCA calculation
# Do the PCA
nPcs <- min(nrow(x)-1,ncol(x)-1,nPcs);
if (center) {
cm <- Matrix::colMeans(x);
pcs <- irlba(x, nv=nPcs, nu =0, center=cm, right_only = FALSE, fastpath = fastpath, maxit= maxit, reorth = TRUE);
} else {
pcs <- irlba(x, nv=nPcs, nu =0, right_only = FALSE, fastpath = fastpath, maxit= maxit, reorth = TRUE);
}
rownames(pcs$v) <- colnames(x);
# Optional centering
if (center) {
pcs$center <- cm;
pcas <- as.matrix(t(t(x %*% pcs$v) - t(cm %*% pcs$v)))
} else {
pcas <- as.matrix(x %*% pcs$v);
}
# Keep the names
rownames(pcas) <- rownames(x);
colnames(pcas) <- paste0('PC',seq(ncol(pcas)));
# Save into genegraphs slot
#genegraphs$genePCs <<- pcs;
#genegraphs$geneRotated <<- pcas;
# Using cosine distance only here
if(center) {
pcas <- pcas - Matrix::rowMeans(pcas)
}
xn <- n2Knn(pcas, k, nThreads= n.cores, verbose=verbose)
diag(xn) <- 0; # Remove self edges
xn <- as(xn,'dgTMatrix'); # will drop 0s
# Turn into a dataframe, convert from correlation distance into weight
df <- data.frame('from'=rownames(pcas)[xn@i+1],'to'=rownames(pcas)[xn@j+1],'w'=pmax(1-xn@x,0),stringsAsFactors=FALSE)
genegraphs$graph <<- df;
},
# calculate density-based clusters
getDensityClusters=function(type='counts', embeddingType=NULL, name='density', v=0.7, s=1, ...) {
if (!requireNamespace("dbscan", quietly = TRUE)) {
stop("Package \"dbscan\" needed for this function to work. Please install it.", call. = FALSE)
}
if(is.null(embeddings[[type]])) { stop("first, generate embeddings for type ",type)}
if(is.null(embeddingType)) {
# take the first one
embeddingType <- names(embeddings[[type]])[1]
if(verbose) message("using ",embeddingType," embedding\n")
emb <- embeddings[[type]][[embeddingType]]
} else {
emb <- embeddings[[type]][[embeddingType]]
if(is.null(emb)) { stop("embedding ",embeddingType," for type ", type," doesn't exist")}
}
cl <- dbscan::dbscan(emb, ...)$cluster;
cols <- rainbow(length(unique(cl)),v=v,s=s)[cl+1]; cols[cl==0] <- "gray70"
names(cols) <- rownames(emb);
clusters[[type]][[name]] <<- cols;
misc[['clusters']][[type]][[name]] <<- cols;
return(invisible(cols))
},
# determine subpopulation-specific genes
getDifferentialGenes=function(type='counts',clusterType=NULL,groups=NULL,name='customClustering', z.threshold=3,upregulated.only=FALSE,verbose=FALSE, append.specificity.metrics=TRUE, append.auc=FALSE) {
"Determine differentially expressed genes, comparing each group against all others using Wilcoxon rank sum test\n
- type data type (currently only default 'counts' is supported)\n
- clusterType optional cluster type to use as a group-defining factor\n
- groups explicit cell group specification - a named cell factor (use NA in the factor to exclude cells from the comparison)\n
- name name slot to store the results in\n
- z.threshold minimal absolute Z score (adjusted) to report\n
- upregulated.only whether to report only genes that are expressed significantly higher in each group\n
- verbose verbose flag\n
return a list, with each element of the list corresponding to a cell group in the provided/used factor (i.e. factor levels); Each element of a list is a data frame listing the differentially epxressed genes (row names), with the following columns: Z - adjusted Z score, with positive values indicating higher expression in a given group compare to the rest; M - log2 fold change; highest- a boolean flag indicating whether the expression of a given gene in a given vcell group was on average higher than in every other cell group; fe - fraction of cells in a given group having non-zero expression level of a given gene;\n
Examples:\n\n
result <- r$getDifferentialGenes(groups=r$clusters$PCA$multilevel);\n
str(r$diffgenes)"
# restrict counts to the cells for which non-NA value has been specified in groups
if(is.null(groups)) {
# look up the clustering based on a specified type
if(is.null(clusterType)) {
# take the first one
cols <- clusters[[type]][[1]]
} else {
cols <- clusters[[type]][[clusterType]]
if(is.null(cols)) { stop("clustering ",clusterType," for type ", type," doesn't exist")}
}
} else {
cols <- groups;
}
cm <- counts;
if(!all(rownames(cm) %in% names(cols))) { warning("cluster vector doesn't specify groups for all of the cells, dropping missing cells from comparison")}
# determine a subset of cells that's in the cols and cols[cell]!=NA
valid.cells <- rownames(cm) %in% names(cols)[!is.na(cols)];
if(!all(valid.cells)) {
# take a subset of the count matrix
cm <- cm[valid.cells,]
}
# reorder cols
cols <- as.factor(cols[match(rownames(cm),names(cols))]);
cols <- as.factor(cols);
if(verbose) {
message("running differential expression with ",length(levels(cols))," clusters ... ")
}
# use offsets based on the base model
# run wilcoxon test comparing each group with the rest
lower.lpv.limit <- -100;
# calculate rank per-column (per-gene) average rank matrix
xr <- sparse_matrix_column_ranks(cm);
# calculate rank sums per group
grs <- colSumByFac(xr,as.integer(cols))[-1,,drop=FALSE]
# calculate number of non-zero entries per group
xr@x <- numeric(length(xr@x))+1
gnzz <- colSumByFac(xr,as.integer(cols))[-1,,drop=FALSE]
#group.size <- as.numeric(tapply(cols,cols,length));
group.size <- as.numeric(tapply(cols,cols,length))[1:nrow(gnzz)]; group.size[is.na(group.size)]<-0; # trailing empty levels are cut off by colSumByFac
# add contribution of zero entries to the grs
gnz <- (group.size-gnzz)
# rank of a 0 entry for each gene
zero.ranks <- (nrow(xr)-diff(xr@p)+1)/2 # number of total zero entries per gene
ustat <- t((t(gnz)*zero.ranks)) + grs - group.size*(group.size+1)/2
# standardize
n1n2 <- group.size*(nrow(cm)-group.size);
# usigma <- sqrt(n1n2*(nrow(cm)+1)/12) # without tie correction
# correcting for 0 ties, of which there are plenty
usigma <- sqrt(n1n2*(nrow(cm)+1)/12)
usigma <- sqrt((nrow(cm) +1 - (gnz^3 - gnz)/(nrow(cm)*(nrow(cm)-1)))*n1n2/12)
x <- t((ustat - n1n2/2)/usigma); # standardized U value- z score
# correct for multiple hypothesis
if(verbose) {
message("adjusting p-values ... ")
}
x <- matrix(qnorm(bh.adjust(pnorm(as.numeric(abs(x)), lower.tail = FALSE, log.p = TRUE), log = TRUE), lower.tail = FALSE, log.p = TRUE),ncol=ncol(x))*sign(x)
rownames(x) <- colnames(cm); colnames(x) <- levels(cols)[1:ncol(x)];
if(verbose) {
message("done.\n")
}
# add fold change information
log.gene.av <- log2(Matrix::colMeans(cm));
group.gene.av <- colSumByFac(cm,as.integer(cols))[-1,,drop=FALSE] / (group.size+1);
log2.fold.change <- log2(t(group.gene.av)) - log.gene.av;
# fraction of cells expressing
f.expressing <- t(gnzz / group.size);
max.group <- max.col(log2.fold.change)
ds <- lapply(1:ncol(x),function(i) {
z <- x[,i];
vi <- which((if (upregulated.only) z else abs(z)) >= z.threshold);
r <- data.frame(Z=z[vi],M=log2.fold.change[vi,i],highest=max.group[vi]==i,fe=f.expressing[vi,i], Gene=rownames(x)[vi])
rownames(r) <- r$Gene;
r <- r[order(r$Z,decreasing=TRUE),]
r
})
names(ds) <- colnames(x);
if (append.specificity.metrics) {
ds <- names(ds) %>% setNames(., .) %>%
papply(function(n) appendSpecificityMetricsToDE(ds[[n]], cols, n, p2.counts=cm, append.auc=append.auc), n.cores=n.cores)
}
if(is.null(groups)) {
if(is.null(clusterType)) {
diffgenes[[type]][[names(clusters[[type]])[1]]] <<- ds;
} else {
diffgenes[[type]][[clusterType]] <<- ds;
}
} else {
diffgenes[[type]][[name]] <<- ds;
}
return(invisible(ds))
},
plotDiffGeneHeatmap=function(type='counts',clusterType=NULL, groups=NULL, n.genes=100, z.score=2, gradient.range.quantile=0.95, inner.clustering=FALSE, gradientPalette=NULL, v=0.8, s=1, box=TRUE, drawGroupNames=FALSE, ... ) {
if(!is.null(clusterType)) {
x <- diffgenes[[type]][[clusterType]];
if(is.null(x)) { stop("differential genes for the specified cluster type haven't been calculated") }
} else {
x <- diffgenes[[type]][[1]];
if(is.null(x)) { stop("no differential genes found for data type ",type) }
}
if(is.null(groups)) {
# look up the clustering based on a specified type
if(is.null(clusterType)) {
# take the first one
cols <- clusters[[type]][[1]]
} else {
cols <- clusters[[type]][[clusterType]]
if(is.null(cols)) { stop("clustering ",clusterType," for type ", type," doesn't exist")}
}
} else {
# use clusters information
if(!all(rownames(counts) %in% names(groups))) { warning("provided cluster vector doesn't list groups for all of the cells")}
cols <- as.factor(groups[match(rownames(counts),names(groups))]);
}
cols <- as.factor(cols);
# select genes to show
if(!is.null(z.score)) {
x <- lapply(x,function(d) d[d$Z >= z.score & d$highest==TRUE,])
if(!is.null(n.genes)) {
x <- lapply(x,function(d) {if(nrow(d)>0) { d[1:min(nrow(d),n.genes),]}})
}
} else {
if(!is.null(n.genes)) {
x <- lapply(x,function(d) {if(nrow(d)>0) { d[1:min(nrow(d),n.genes),]}})
}
}
x <- lapply(x,rownames);
# make expression matrix
#browser()
#x <- x[!unlist(lapply(x,is.null))]
#cols <- cols[cols %in% names(x)]
#cols <- droplevels(cols)
em <- counts[,unlist(x)];
# renormalize rows
if(all(sign(em)>=0)) {
if(is.null(gradientPalette)) {
gradientPalette <- colorRampPalette(c('gray90','red'), space = "Lab")(1024)
}
em <- apply(em,1,function(x) {
zlim <- as.numeric(quantile(x,p=c(1-gradient.range.quantile,gradient.range.quantile)))
if(diff(zlim)==0) {
zlim <- as.numeric(range(x))
}
x[x<zlim[1]] <- zlim[1]; x[x>zlim[2]] <- zlim[2];
x <- (x-zlim[1])/(zlim[2]-zlim[1])
})
} else {
if(is.null(gradientPalette)) {
gradientPalette <- colorRampPalette(c("blue", "grey90", "red"), space = "Lab")(1024)
}
em <- apply(em,1,function(x) {
zlim <- c(-1,1)*as.numeric(quantile(abs(x),p=gradient.range.quantile))
if(diff(zlim)==0) {
zlim <- c(-1,1)*as.numeric(max(abs(x)))
}
x[x<zlim[1]] <- zlim[1]; x[x>zlim[2]] <- zlim[2];
x <- (x-zlim[1])/(zlim[2]-zlim[1])
})
}
# cluster cell types by averages
rowfac <- factor(rep(names(x),unlist(lapply(x,length))),levels=names(x))
if(inner.clustering) {
clclo <- hclust(as.dist(1-cor(do.call(cbind,tapply(1:nrow(em),rowfac,function(ii) Matrix::colMeans(em[ii,,drop=FALSE]))))),method='complete')$order
} else {
clclo <- 1:length(levels(rowfac))
}
if(inner.clustering) {
# cluster genes within each cluster
clgo <- tapply(1:nrow(em),rowfac,function(ii) {
ii[hclust(as.dist(1-cor(t(em[ii,]))),method='complete')$order]
})
} else {
clgo <- tapply(1:nrow(em),rowfac,I)
}
if(inner.clustering) {
# cluster cells within each cluster
clco <- tapply(1:ncol(em),cols,function(ii) {
if(length(ii)>3) {
ii[hclust(as.dist(1-cor(em[,ii,drop=FALSE])),method='complete')$order]
} else {
ii
}
})
} else {
clco <- tapply(1:ncol(em),cols,I)
}
#clco <- clco[names(clgo)]
# filter down to the clusters that are included
#vic <- cols %in% clclo
colors <- fac2col(cols,v=v,s=s,return.details=TRUE)
cellcols <- colors$colors[unlist(clco[clclo])]
genecols <- rev(rep(colors$palette,unlist(lapply(clgo,length)[clclo])))
bottomMargin <- ifelse(drawGroupNames,4,0.5);
my.heatmap2(em[rev(unlist(clgo[clclo])),unlist(clco[clclo])],col=gradientPalette,Colv=NA,Rowv=NA,labRow=NA,labCol=NA,RowSideColors=genecols,ColSideColors=cellcols,margins=c(bottomMargin,0.5),ColSideColors.unit.vsize=0.05,RowSideColors.hsize=0.05,useRaster=T, box=box, ...)
abline(v=cumsum(unlist(lapply(clco[clclo],length))),col=1,lty=3)
abline(h=cumsum(rev(unlist(lapply(clgo[clclo],length)))),col=1,lty=3)
},
# recalculate library sizes using robust regression within clusters
getRefinedLibSizes=function(clusterType=NULL, groups=NULL,type='counts') {
if (!requireNamespace("robustbase", quietly = TRUE)) {
stop("Package \"robustbase\" needed for this function to work. Please install it.", call. = FALSE)
}
if(is.null(groups)) {
# look up the clustering based on a specified type
if(is.null(clusterType)) {
# take the first one
groups <- clusters[[type]][[1]]
} else {
groups <- clusters[[type]][[clusterType]]
if(is.null(groups)) { stop("clustering ",clusterType," for type ", type," doesn't exist")}
}
}
if(is.null(groups)) { stop("clustering must be determined first, or passed as a groups parameter") }
# calculated pooled profiles per cluster
lvec <- colSumByFac(misc[['rawCounts']],as.integer(groups))[-1,,drop=FALSE];
lvec <- t(lvec/pmax(1,Matrix::rowSums(lvec)))*1e4
# TODO: implement internal robust regression
## x <- misc[['rawCounts']]
## x <- x/as.numeric(depth);
## inplaceWinsorizeSparseCols(x,10);
## x <- x*as.numeric(depth);
x <- mclapply(1:length(levels(groups)),function(j) {
ii <- names(groups)[which(groups==j)]
av <- lvec[,j]
avi <- which(av>0)
av <- av[avi]
cvm <- as.matrix(misc[['rawCounts']][ii,avi])
x <- unlist(lapply(ii,function(i) {
cv <- cvm[i,]
#as.numeric(coef(glm(cv~av+0,family=poisson(link='identity'),start=sum(cv)/1e4)))
as.numeric(coef(robustbase::glmrob(cv~av+0,family=poisson(link='identity'),start=sum(cv)/1e4)))
}))
names(x) <- ii;
x
},mc.cores=30)
lib.sizes <- unlist(x)[rownames(misc[['rawCounts']])]
lib.sizes <- lib.sizes/mean(lib.sizes)*mean(Matrix::rowSums(misc[['rawCounts']]))
depth <<- lib.sizes;
return(invisible(lib.sizes))
},
# plot heatmap for a given set of genes
plotGeneHeatmap=function(genes, type='counts', clusterType=NULL, groups=NULL, z.score=2, gradient.range.quantile=0.95, cluster.genes=FALSE, inner.clustering=FALSE, gradientPalette=NULL, v=0.8, s=1, box=TRUE, drawGroupNames=FALSE, useRaster=TRUE, smooth.span=max(1,round(nrow(counts)/1024)), ... ) {
if(is.null(groups)) {
# look up the clustering based on a specified type
if(is.null(clusterType)) {
# take the first one
cols <- clusters[[type]][[1]]
} else {
cols <- clusters[[type]][[clusterType]]
if(is.null(cols)) { stop("clustering ",clusterType," for type ", type," doesn't exist")}
}
} else {
# use clusters information
if(!all(rownames(counts) %in% names(groups))) { warning("provided cluster vector doesn't list groups for all of the cells")}
cols <- as.factor(groups[match(rownames(counts),names(groups))]);
}
cols <- as.factor(cols);
# make expression matrix
if(!all(genes %in% colnames(counts))) { warning(paste("the following specified genes were not found in the data: [",paste(genes[!genes %in% colnames(counts)],collapse=" "),"], omitting",sep="")) }
x <- intersect(genes,colnames(counts));
if(length(x)<1) { stop("too few genes") }
em <- as.matrix(t(counts[,x]));
# renormalize rows
if(all(sign(em)>=0)) {
if(is.null(gradientPalette)) {
gradientPalette <- colorRampPalette(c('gray90','red'), space = "Lab")(1024)
}
em <- t(apply(em,1,function(x) {
zlim <- as.numeric(quantile(x,p=c(1-gradient.range.quantile,gradient.range.quantile)))
if(diff(zlim)==0) {
zlim <- as.numeric(range(x))
}
x[x<zlim[1]] <- zlim[1]; x[x>zlim[2]] <- zlim[2];
x <- (x-zlim[1])/(zlim[2]-zlim[1])
}))
} else {
if(is.null(gradientPalette)) {
gradientPalette <- colorRampPalette(c("blue", "grey90", "red"), space = "Lab")(1024)
}
em <- t(apply(em,1,function(x) {
zlim <- c(-1,1)*as.numeric(quantile(abs(x),p=gradient.range.quantile))
if(diff(zlim)==0) {
zlim <- c(-1,1)*as.numeric(max(abs(x)))
}
x[x<zlim[1]] <- zlim[1]; x[x>zlim[2]] <- zlim[2];
x <- (x-zlim[1])/(zlim[2]-zlim[1])
}))
}
# cluster cell types by averages
clclo <- 1:length(levels(cols))
if(cluster.genes) {
# cluster genes within each cluster
clgo <- hclust(as.dist(1-cor(t(em))),method='complete')$order
} else {
clgo <- 1:nrow(em)
}
if(inner.clustering) {
# cluster cells within each cluster
clco <- tapply(1:ncol(em),cols,function(ii) {
if(length(ii)>3) {
ii[hclust(as.dist(1-cor(em[,ii,drop=FALSE])),method='single')$order]
} else {
ii
}
# TODO: implement smoothing span support
})
} else {
clco <- tapply(1:ncol(em),cols,I)
}
cellcols <- fac2col(cols,v=v,s=s)[unlist(clco[clclo])]
#genecols <- rev(rep(fac2col(cols,v=v,s=s,return.level.colors=T),unlist(lapply(clgo,length)[clclo])))
bottomMargin <- 0.5;
# reorder and potentially smooth em
em <- em[rev(clgo),unlist(clco[clclo])];
my.heatmap2(em,col=gradientPalette,Colv=NA,Rowv=NA,labCol=NA,ColSideColors=cellcols,margins=c(bottomMargin,5),ColSideColors.unit.vsize=0.05,RowSideColors.hsize=0.05,useRaster=useRaster, box=box, ...)
bp <- cumsum(unlist(lapply(clco[clclo],length))); # cluster border positions
abline(v=bp,col=1,lty=3)
#abline(h=cumsum(rev(unlist(lapply(clgo[clclo],length)))),col=1,lty=3)
if(drawGroupNames) {
clpos <- (c(0,bp[-length(bp)])+bp)/2;
labpos <- rev(seq(0,length(bp)+1)/(length(bp)+1)*nrow(em)); labpos <- labpos[-1]; labpos <- labpos[-length(labpos)]
text(x=clpos,y=labpos,labels = levels(cols),cex=1)
# par(xpd=TRUE)
# clpos <- (c(0,bp[-length(bp)])+bp)/2;
# labpos <- seq(0,length(bp)+1)/(length(bp)+1)*max(bp); labpos <- labpos[-1]; labpos <- labpos[-length(labpos)]
# text(x=labpos,y=-2,labels = levels(col))
# segments(labpos,-1,clpos,0.5,lwd=0.5)
# par(xpd=FALSE)
}
},
# show embedding
plotEmbedding=function(type=NULL, embeddingType=NULL, clusterType=NULL, groups=NULL, colors=NULL, gene=NULL, plot.theme=ggplot2::theme_bw(), ...) {
if (is.null(type)) {
if ('counts' %in% names(embeddings)) {
type <- 'counts'
} else if (length(embeddings) > 0) {
type <- names(embeddings)[1]
} else {
stop("first, generate an embedding")
}
}
if(is.null(embeddings[[type]]))
stop("first, generate embeddings for type ",type)
if(is.null(embeddingType)) {
embeddingType <- 1 # take the first one
}
emb <- embeddings[[type]][[embeddingType]]
if (!is.null(gene)) {
if (!(gene %in% colnames(counts)))
stop("Gene '", gene, "' isn't presented in the count matrix")
colors <- .self$counts[,gene]
}
if(is.null(colors) && is.null(groups)) {
# look up the clustering based on a specified type
if(is.null(clusterType)) {
# take the first one
groups <- clusters[[type]][[1]]
} else {
groups <- clusters[[type]][[clusterType]]
if(is.null(groups)) { stop("clustering ",clusterType," for type ", type," doesn't exist")}
}
}
sccore::embeddingPlot(emb, groups=groups, colors=colors, plot.theme=plot.theme, ...)
},
# get overdispersed genes
getOdGenes=function(n.odgenes=NULL,alpha=5e-2,use.unadjusted.pvals=FALSE) {
if(is.null(misc[['varinfo']])) { stop("please run adjustVariance first")}
if(is.null(n.odgenes)) { #return according to alpha
if(use.unadjusted.pvals) {
rownames(misc[['varinfo']])[misc[['varinfo']]$lp <= log(alpha)]
} else {
rownames(misc[['varinfo']])[misc[['varinfo']]$lpa <= log(alpha)]
}
} else { # return top n.odgenes sites
rownames(misc[['varinfo']])[(order(misc[['varinfo']]$lp,decreasing=FALSE)[1:min(ncol(counts),n.odgenes)])]
}
},
getNormalizedExpressionMatrix=function(n.odgenes=NULL,genes=NULL) {
"Return variance-normalized matrix for specified genes or a number of OD genes\n
- genes explicit vector of genes to return
- n.odgenes whether a certain number of top overdispersed genes should be used (default NULL - all significant ones); ignored if genes are passed
return: cell x gene matrix\n"
if(is.null(genes)) {
genes <- .self$getOdGenes(n.odgenes)
}
x <- .self$counts[,genes];
x@x <- x@x*rep(misc[['varinfo']][colnames(x),'gsf'],diff(x@p))
x
},
calculatePcaReduction=function(nPcs=20, type='counts', name='PCA', use.odgenes=TRUE, n.odgenes=NULL, odgenes=NULL, center=TRUE, cells=NULL, fastpath=TRUE, maxit=100, verbose=TRUE, var.scale=(type == "counts"), ...) {
"Calculate PCA reduction of the data\n
- nPcs number of PCs\n
- type dataset view to reduce (counts by default, but can specify a name of an existing reduction)\n
- name name for the PCA reduction to be created\n
- use.odgenes whether pre-calculated set of overdispersed genes should be used (default=TRUE)\n
- n.odgenes whether a certain number of top overdispersed genes should be used\n
- odgenes explicitly specify a set of genes to use for the reduction\n
- center should data be centerred prior to PCA (default=TRUE)\n
- cells optional subset of cells on which PCA should be ran\n
- fastpath use C implementation for speedup\n
- maxit maximum number of irlba iterations to use\n
- verbose verbose\n
- ... additional arguments forwarded to irlba::irlba\n
return invisible PCA result (the reduction itself is saved in $reductions[[name]])"
if(type=='counts') {
x <- counts;
} else {
if(!type %in% names(reductions)) { stop("reduction ",type,' not found')}
x <- reductions[[type]]
}
if((use.odgenes || !is.null(n.odgenes)) && is.null(odgenes)) {
if(is.null(misc[['odgenes']] )) { stop("please run adjustVariance() first")}
odgenes <- misc[['odgenes']];
if(!is.null(n.odgenes)) {
if(n.odgenes>length(odgenes)) {
#warning("number of specified odgenes is higher than the number of the statistically significant sites, will take top ",n.odgenes,' sites')
odgenes <- rownames(misc[['varinfo']])[(order(misc[['varinfo']]$lp,decreasing=FALSE)[1:min(ncol(counts),n.odgenes)])]
} else {
odgenes <- odgenes[1:n.odgenes]
}
}
}
if(!is.null(odgenes)) {
x <- x[,odgenes]
if(verbose) message('running PCA using ',length(odgenes),' OD genes .')
} else { #all genes?
if(verbose) message('running PCA all ',ncol(x),' genes .')
}
# apply scaling if using raw counts
if(var.scale) {
#x <- t(t(x)*misc[['varinfo']][colnames(x),'gsf'])
x@x <- x@x*rep(misc[['varinfo']][colnames(x),'gsf'],diff(x@p))
}
if(verbose) message('.')
if(!is.null(cells)) {
# cell subset is just for PC determination
nPcs <- min(min(length(cells),ncol(x))-1,nPcs)
cm <- Matrix::colMeans(x[cells,])
pcs <- irlba(x[cells,], nv=nPcs, nu=0, center=cm, right_only=FALSE,fastpath=fastpath,maxit=maxit,reorth=TRUE, ...)
} else {
nPcs <- min(min(nrow(x),ncol(x))-1,nPcs)
if(center) {
cm <- Matrix::colMeans(x)
pcs <- irlba(x, nv=nPcs, nu=0, center=cm, right_only=FALSE,fastpath=fastpath,maxit=maxit,reorth=TRUE, ...)
} else {
pcs <- irlba(x, nv=nPcs, nu=0, right_only=FALSE,fastpath=fastpath,maxit=maxit,reorth=TRUE, ...)
}
}
rownames(pcs$v) <- colnames(x);
if(verbose) message('.')
# adjust for centering!
if(center) {
pcs$center <- cm;
pcas <- as.matrix(t(as(t(x %*% pcs$v), "dgeMatrix") - t(cm %*% pcs$v)))
} else {
pcas <- as.matrix(x %*% pcs$v);
}
misc$PCA <<- pcs;
if(verbose) message('.')
#pcas <- scde::winsorize.matrix(pcas,0.05)
# # control for sequencing depth
# if(is.null(batch)) {
# mx <- model.matrix(x ~ d,data=data.frame(x=1,d=depth))
# } else {
# mx <- model.matrix(x ~ d*b,data=data.frame(x=1,d=depth,b=batch))
# }
# # TODO: how to get rid of residual depth effects in the PCA-based clustering?
# #pcas <- t(t(colLm(pcas,mx,returnResid=TRUE))+Matrix::colMeans(pcas))
# pcas <- colLm(pcas,mx,returnResid=TRUE)
rownames(pcas) <- rownames(x)
colnames(pcas) <- paste('PC',seq(ncol(pcas)),sep='')
#pcas <- pcas[,-1]
#pcas <- scde::winsorize.matrix(pcas,0.1)
if(verbose) message(' done\n')
reductions[[name]] <<- pcas;
## nIcs <- nPcs;
## a <- ica.R.def(t(pcas),nIcs,tol=1e-3,fun='logcosh',maxit=200,verbose=T,alpha=1,w.init=matrix(rnorm(nIcs*nPcs),nIcs,nPcs))
## reductions[['ICA']] <<- as.matrix( x %*% pcs$v %*% a);
## colnames(reductions[['ICA']]) <<- paste('IC',seq(ncol(reductions[['ICA']])),sep='');
return(invisible(pcas))
},
# will reset odgenes to be a superset of the standard odgene selection (guided by n.odgenes or alpha), and
# a set of recursively determined odgenes based on a given group (or a cluster info)
expandOdGenes=function(type='counts', clusterType=NULL, groups=NULL , min.group.size=30, od.alpha=1e-1, use.odgenes=FALSE, n.odgenes=NULL, odgenes=NULL, n.odgene.multiplier=1, gam.k=10,verbose=FALSE,n.cores=.self$n.cores,min.odgenes=10,max.odgenes=Inf,take.top.odgenes=TRUE, recursive=TRUE) {
# determine groups
if(is.null(groups)) {
# look up the clustering based on a specified type
if(is.null(clusterType)) {
# take the first one
groups <- clusters[[type]][[1]]
} else {
groups <- clusters[[type]][[clusterType]]
if(is.null(groups)) { stop("clustering ",clusterType," for type ", type," doesn't exist")}
}
} else {
groups <- as.factor(groups[names(groups) %in% rownames(counts)]);
groups <- droplevels(groups);
}
# determine initial set of odgenes
if((use.odgenes || !is.null(n.odgenes)) && is.null(odgenes)) {
if(is.null(misc[['varinfo']] )) { stop("please run adjustVariance() first")}
df <- misc$varinfo
odgenes <- rownames(df)[!is.na(df$lpa) & df$lpa<log(od.alpha)]
#odgenes <- misc[['odgenes']];
if(!is.null(n.odgenes)) {
if(n.odgenes>length(odgenes)) {
#warning("number of specified odgenes is higher than the number of the statistically significant sites, will take top ",n.odgenes,' sites')
odgenes <- rownames(misc[['varinfo']])[(order(misc[['varinfo']]$lp,decreasing=FALSE)[1:min(ncol(counts),n.odgenes)])]
} else {
odgenes <- odgenes[1:n.odgenes]
}
}
}
# filter out small groups
if(min.group.size>1) { groups[groups %in% levels(groups)[unlist(tapply(groups,groups,length))<min.group.size]] <- NA; groups <- droplevels(groups); }
if(sum(!is.na(groups))<min.group.size) {
warning("clustering specifies fewer cells than min.group.size")
return(odgenes)
}
if(length(levels(groups))<2) {
warning("cannot expand od genes based on a single group")
return(odgenes)
}
# determine groups for which variance normalization will be reran
if(recursive) {
if(verbose) message("recursive group enumeration ...")
# derive cluster hierarchy
# use raw counts to derive clustering
z <- misc$rawCounts;
rowFac <- rep(-1,nrow(z)); names(rowFac) <- rownames(z);
rowFac[match(names(groups),rownames(z))] <- as.integer(groups);
tc <- colSumByFac(z,as.integer(rowFac))[-1,,drop=FALSE];
rownames(tc) <- levels(groups)
d <- 1-cor(t(log10(tc/pmax(1,Matrix::rowSums(tc))*1e3+1)))
hc <- hclust(as.dist(d),method='average',members=unlist(tapply(groups,groups,length)))
dlab <- function(l) {
if(is.leaf(l)) {
return(list(labels(l)))
} else {
return(c(list(labels(l)),dlab(l[[1]]),dlab(l[[2]])))
}
}
# for each level in the cluster hierarchy, except for the top
rgroups <- dlab(as.dendrogram(hc))[-1]
rgroups <- c(list(levels(groups)),rgroups)
if(verbose) message("done.\n");
} else {
rgroups <- lapply(levels(groups),I)
}
names(rgroups) <- unlist(lapply(rgroups,paste,collapse="+"))
# run local variance normalization
if(verbose) message("running local variance normalization ");
# run variance normalization, determine PCs
gpcs <- papply(rgroups,function(group) {
cells <- names(groups)[groups %in% group]
# variance normalization
df <- .self$adjustVariance(persist=FALSE,gam.k=gam.k,verbose=FALSE,cells=cells,n.cores=1)
#if(!is.null(n.odgenes)) {
# odgenes <- rownames(df)[order(df$lp,decreasing=F)[1:n.odgenes]]
#} else {
df <- df[!is.na(df$lp),,drop=FALSE]
df <- df[order(df$lp,decreasing=FALSE),,drop=FALSE]
n.od <- min(max(sum(df$lpa<log(od.alpha)),min.odgenes),max.odgenes);
if(n.od>0) {
odgenes <- rownames(df)[1:min(n.od*n.odgene.multiplier,nrow(df))]
} else {
return(NULL)
}
sf <- df$gsf[match(odgenes,rownames(df))];
return(list(sf=sf,cells=cells,odgenes=odgenes))
},n.cores=n.cores,mc.preschedule=TRUE)
if(verbose) message(" done\n");
odg <- unique(unlist(lapply(gpcs,function(z) z$odgenes)))
# TODO: consider gsf?
odgenes <- unique(c(odgenes,odg));
misc[['odgenes']] <<- odgenes;
return(invisible(odgenes));
},
localPcaKnn=function(nPcs=5, type='counts', clusterType=NULL, groups=NULL , k=30, b=1, a=1, min.group.size=30, name='localPCA', baseReduction='PCA', od.alpha=1e-1, n.odgenes=NULL,gam.k=10,verbose=FALSE,n.cores=.self$n.cores,min.odgenes=5,take.top.odgenes=FALSE, recursive=TRUE,euclidean=FALSE,perplexity=k,debug=FALSE,return.pca=FALSE,skip.pca=FALSE) {
if(type=='counts') {
x <- counts;
} else {
if(!type %in% names(reductions)) { stop("reduction ",type,' not found')}
x <- reductions[[type]]
}
if(is.null(groups)) {
# look up the clustering based on a specified type
if(is.null(clusterType)) {
# take the first one
groups <- clusters[[type]][[1]]
} else {
groups <- clusters[[type]][[clusterType]]
if(is.null(groups)) { stop("clustering ",clusterType," for type ", type," doesn't exist")}
}
} else {
groups <- as.factor(groups[names(groups) %in% rownames(x)]);
groups <- droplevels(groups);
}
if(min.group.size>1) { groups[groups %in% levels(groups)[unlist(tapply(groups,groups,length))<min.group.size]] <- NA; groups <- droplevels(groups); }
if(sum(!is.na(groups))<min.group.size) { stop("clustering specifies fewer cells than min.group.size") }
if(recursive) {
if(verbose) message("recursive group enumeration ...")
## # derive cluster hierarchy
## rowFac <- rep(-1,nrow(x)); names(rowFac) <- rownames(x);
## rowFac[names(groups)] <- as.integer(groups);
## tc <- colSumByFac(x,as.integer(rowFac))[-1,]
## rownames(tc) <- levels(groups)
## #tc <- rbind("total"=Matrix::colSums(tc),tc)
## #d <- jsDist(t(((tc/pmax(1,Matrix::rowSums(tc)))))); rownames(d) <- colnames(d) <- rownames(tc)
## d <- 1-cor(t(tc))
## hc <- hclust(as.dist(d),method='ward.D')
# use raw counts to derive clustering
z <- misc$rawCounts;
rowFac <- rep(-1,nrow(z)); names(rowFac) <- rownames(z);
rowFac[match(names(groups),rownames(z))] <- as.integer(groups);
tc <- colSumByFac(z,as.integer(rowFac))[-1,,drop=FALSE];
rownames(tc) <- levels(groups)
d <- 1-cor(t(log10(tc/pmax(1,Matrix::rowSums(tc))*1e3+1)))
hc <- hclust(as.dist(d),method='average',members=unlist(tapply(groups,groups,length)))
dlab <- function(l) {
if(is.leaf(l)) {
return(list(labels(l)))
} else {
return(c(list(labels(l)),dlab(l[[1]]),dlab(l[[2]])))
}
}
# for each level in the cluster hierarchy, except for the top
rgroups <- dlab(as.dendrogram(hc))[-1]
rgroups <- c(list(levels(groups)),rgroups)
if(verbose) message("done.\n");
} else {
rgroups <- lapply(levels(groups),I)
}
names(rgroups) <- unlist(lapply(rgroups,paste,collapse="+"))
if(verbose) message("determining local PCs ");
# run variance normalization, determine PCs
gpcs <- papply(rgroups,function(group) {
cells <- names(groups)[groups %in% group]
# variance normalization
df <- .self$adjustVariance(persist=FALSE,gam.k=gam.k,verbose=FALSE,cells=cells,n.cores=1)
if(!is.null(n.odgenes)) {
odgenes <- rownames(df)[order(df$lp,decreasing=FALSE)[1:n.odgenes]]
} else {
odgenes <- rownames(df)[!is.na(df$lpa) & df$lpa<log(od.alpha)]
}
if(length(odgenes)<min.odgenes) {
if(take.top.odgenes) {
odgenes <- rownames(df)[order(df$lp,decreasing=FALSE)[1:min.odgenes]]
} else {
return(NULL);
}
}
sf <- df$gsf[match(odgenes,rownames(df))];
if(return.pca && skip.pca) {
return(list(sf=sf,cells=cells,odgenes=odgenes))
}
y <- t(t(x[cells,odgenes])*sf)
cm <- Matrix::colMeans(y)
# PCA
pcs <- irlba(y, nv=nPcs, nu=0, center=cm, right_only=FALSE,fastpath=TRUE,reorth=TRUE)
rownames(pcs$v) <- colnames(y);
pcs$center <- cm;
# row-randomize x to get a sense for the pcs
m1 <- y;
if(euclidean) {
#for (i in 1:nrow(m1)) m1[i,] <- m1[i,order(runif(length(m1[i,])))]
m1@i <- sample(m1@i)
rpcas <- t(t(m1 %*% pcs$v) - t(pcs$center %*% pcs$v))
pcs$rsd <- apply(rpcas,2,sd)
pcs$trsd <- sd(dist(rpcas))
}
# sample within-cluster distances (based on main PCA)
pcas <- as.matrix(t(t(t(t(x[,odgenes])*sf) %*% pcs$v) - t(pcs$center %*% pcs$v)))
if(verbose) message(".")
return(list(pcs=pcs,sf=sf,df=df,cells=cells,pcas=pcas,odgenes=odgenes))
},n.cores=n.cores)
if(verbose) message(" done\n");
if(return.pca) return(gpcs)
ivi <- unlist(lapply(gpcs,is.null))
if(any(ivi)) { gpcs <- gpcs[!ivi]; rgroups <- rgroups[!ivi]; }
if(debug) { browser() }
# calculate cell relevance to each cluster (p_k,i matrix)
# use global PCA distances
if(verbose) message("calculating global distances ...");
gcdist <- as.matrix(dist(gpcs[[1]]$pcas))
if(verbose) message(" done.\n")
# for each PCA
# for each cell, determine p_k_i
# for each PC
# determine cell projections
# subset genes
# subtract center
# scale, multiply
# for each pair, determine cell distances
# use cell distances to complete weight matrix
# add to the w*d^2 and w matrices
# normalize by the sqrt(sum(w))
if(euclidean) {
if(verbose) message("calculating local Euclidean distances .");
dcs <- papply(gpcs,function(p) {
pk <- rep(1,nrow(p$pcas)); names(pk) <- rownames(p$pcas);
nci <- setdiff(rownames(gcdist),p$cells)
if(length(nci)>0) {
# determine within cluster sd
scells <- sample(p$cells,min(1e3,length(p$cells)))
cldsd <- sd(as.numeric(gcdist[scells,scells]))
ncid <- rowMeans(gcdist[nci,scells])
pk[nci] <- exp(-b*(ncid/cldsd)^2)
}
dsq <- as.matrix(dist(p$pcas)^2)
w <- (1-exp(-a*(dsq)/(p$pcs$trsd^2))) * (pk %o% pk)
if(verbose) message(".")
list(dsq=dsq,w=w)
},n.cores=n.cores)
if(verbose) message(".")
d <- Reduce('+',lapply(dcs,function(x) x$dsq*x$w))
if(verbose) message(".")
d <- sqrt(d/Reduce('+',lapply(dcs,function(x) x$w)));
diag(d) <- 0
if(verbose) message(" done.\n")
} else {
# weighted correlation
if(verbose) message("calculating local correlation distances .");
dcs <- papply(gpcs,function(p) {
pk <- rep(1,nrow(p$pcas)); names(pk) <- rownames(p$pcas);
nci <- setdiff(rownames(gcdist),p$cells)
if(length(nci)>0) {
# determine within cluster sd
scells <- sample(p$cells,min(1e3,length(p$cells)))
cldsd <- sd(as.numeric(gcdist[scells,scells]))
ncid <- rowMeans(gcdist[nci,scells])
pk[nci] <- exp(-b*(ncid/cldsd)^2)
}
x <- cov(t(p$pcas))*(ncol(p$pcas)-1)
xc <- x / sqrt(diag(x) %o% diag(x)) # correlation
w <- (1-exp(-a*(1-xc))) * (pk %o% pk)
if(verbose) message(".")
list(x=x,w=w)
},n.cores=n.cores)
if(verbose) message(".")
# calculate sum_{k}_{w_k*v} matrix
wm <- Reduce('+',lapply(dcs,function(z) z$w*diag(z$x)))
d <- Reduce('+',lapply(dcs,function(z) z$x*z$w))
d <- 1-d/sqrt(wm*t(wm))
diag(d) <- 0
if(verbose) message(" done.\n")
}
## d <- dcs[[1]]$dsq
## d <- as.matrix(dist(r$reductions$PCA))
## knn <- apply(d,2,function(x) order(x,decreasing=F)[1:(k+1)])
## cat(".")
## m <- sparseMatrix(i=as.numeric(knn),p=c(0,(1:ncol(knn))*nrow(knn)),dims=rep(ncol(knn),2),x=rep(1,nrow(knn)*ncol(knn)))
## m <- m+t(m); # symmetrize
## diag(m) <- 0;
## rownames(m) <- colnames(m) <- rownames(d)
## x <- list(m=m)
## i <- 5; cl <- rep(NA,nrow(x$m)); names(cl) <- rownames(x$m); cl[rownames(x$m)[i]] <- 1; cl[which(x$m[,i]>0)] <- 2; cl <- as.factor(cl);
## r$plotEmbedding(type='PCA',embeddingType='tSNE',groups=cl,alpha=0.2,min.group.size=00,mark.clusters = TRUE, mark.cluster.cex=0.8,unclassified.cell.color=adjustcolor(1,alpha=0.1))
## i <- 5; cl <- 1/(dcs[[1]]$dsq[,i]+1e-6); names(cl) <- rownames(x$m);
## i <- 5; cl <- 1/(d[,i]+1e-6); names(cl) <- rownames(x$m);
## r$plotEmbedding(type='PCA',embeddingType='tSNE',colors=cl,alpha=0.2,min.group.size=00,mark.clusters = TRUE, mark.cluster.cex=0.8,unclassified.cell.color=adjustcolor(1,alpha=0.1))
# kNN
if(verbose) message("creating kNN graph .");
knn <- apply(d,2,function(x) order(x,decreasing=FALSE)[1:(k+1)])
if(verbose) message(".")
#m <- sparseMatrix(i=as.numeric(knn),p=c(0,(1:ncol(knn))*nrow(knn)),dims=rep(ncol(knn),2),x=rep(1,nrow(knn)*ncol(knn)))
m <- sparseMatrix(i=as.numeric(knn),p=c(0,(1:ncol(knn))*nrow(knn)),dims=rep(ncol(knn),2),x=d[as.integer(t(t(knn)+((1:ncol(knn))-1)*nrow(d)))])
m <- m+t(m); # symmetrize
diag(m) <- 0;
rownames(m) <- colnames(m) <- rownames(d)
if(verbose) message(".")
g <- graph_from_adjacency_matrix(m,mode='undirected',weighted=TRUE);
if(verbose) message(".")
graphs[[name]] <<- g;
if(verbose) message(" done.\n")
emb <- Rtsne::Rtsne(d,is_distance=TRUE, perplexity=perplexity, num_threads=n.cores)$Y;
rownames(emb) <- colnames(d)
embeddings[[type]][[name]] <<- emb;
# calculate cell-cell distance, considering weighting
# getting total cell-cell distance
return(invisible(list(d=d,m=m,gpcs=gpcs)))
},
# test pathway overdispersion
# this is a compressed version of the PAGODA1 approach
# env - pathway to gene environment
testPathwayOverdispersion=function(setenv,
type='counts',
max.pathway.size=1e3,
min.pathway.size=10,
n.randomizations=5,
verbose=FALSE,
score.alpha=0.05,
plot=FALSE,
cells=NULL,
adjusted.pvalues=TRUE,
z.score = qnorm(0.05/2, lower.tail = FALSE),
use.oe.scale = FALSE, return.table=FALSE,
name='pathwayPCA',
correlation.distance.threshold=0.2,
loading.distance.threshold=0.01,
top.aspects=Inf,
recalculate.pca=FALSE,
save.pca=TRUE) {
nPcs <- 1;
if(type=='counts') {
x <- counts;
# apply scaling if using raw counts
x@x <- x@x*rep(misc[['varinfo']][colnames(x),'gsf'],diff(x@p))
} else {
if(!type %in% names(reductions)) { stop("reduction ",type,' not found')}
x <- reductions[[type]]
}
if(!is.null(cells)) {
x <- x[cells,]
}
proper.gene.names <- colnames(x);
if(is.null(misc[['pwpca']]) || recalculate.pca) {
if(verbose) {
message("determining valid pathways")
}
# determine valid pathways
gsl <- ls(envir = setenv)
gsl.ng <- unlist(mclapply(sn(gsl), function(go) sum(unique(get(go, envir = setenv)) %in% proper.gene.names),mc.cores=n.cores,mc.preschedule=TRUE))
gsl <- gsl[gsl.ng >= min.pathway.size & gsl.ng<= max.pathway.size]
names(gsl) <- gsl
if(verbose) {
message("processing ", length(gsl), " valid pathways")
}
cm <- Matrix::colMeans(x)
pwpca <- papply(gsl, function(sn) {
lab <- proper.gene.names %in% get(sn, envir = setenv)
if(sum(lab)<1) { return(NULL) }
pcs <- irlba(x[,lab], nv=nPcs, nu=0, center=cm[lab])
pcs$d <- pcs$d/sqrt(nrow(x))
pcs$rotation <- pcs$v;
pcs$v <- NULL;
# get standard deviations for the random samples
ngenes <- sum(lab)
z <- do.call(rbind,lapply(seq_len(n.randomizations), function(i) {
si <- sample(ncol(x), ngenes)
pcs <- irlba(x[,si], nv=nPcs, nu=0, center=cm[si])$d
}))
z <- z/sqrt(nrow(x));
# local normalization of each component relative to sampled PC1 sd
avar <- pmax(0, (pcs$d^2-mean(z[, 1]^2))/sd(z[, 1]^2))
if(avar>0.5) {
# flip orientations to roughly correspond with the means
pcs$scores <- as.matrix(t(x[,lab] %*% pcs$rotation) - as.numeric((cm[lab] %*% pcs$rotation)))
cs <- unlist(lapply(seq_len(nrow(pcs$scores)), function(i) sign(cor(pcs$scores[i,], colMeans(t(x[, lab, drop = FALSE])*abs(pcs$rotation[, i]))))))
pcs$scores <- pcs$scores*cs
pcs$rotation <- pcs$rotation*cs
rownames(pcs$rotation) <- colnames(x)[lab];
} # don't bother otherwise - it's not significant
return(list(xp=pcs,z=z,n=ngenes))
}, n.cores = n.cores,mc.preschedule=TRUE)
if(save.pca) {
misc[['pwpca']] <<- pwpca;
}
} else {
if(verbose) {
message("re-using previous overdispersion calculations")
pwpca <- misc[['pwpca']];
}
}
if(verbose) {
message("scoring pathway overdispersion signifcance")
}
# score overdispersion
true.n.cells <- nrow(x)
pagoda.effective.cells <- function(pwpca, start = NULL) {
n.genes <- unlist(lapply(pwpca, function(x) rep(x$n, nrow(x$z))))
var <- unlist(lapply(pwpca, function(x) x$z[, 1]))
if(is.null(start)) { start <- true.n.cells*2 } # start with a high value
of <- function(p, v, sp) {
sn <- p[1]
vfit <- (sn+sp)^2/(sn*sn+1/2) -1.2065335745820*(sn+sp)*((1/sn + 1/sp)^(1/3))/(sn*sn+1/2)
residuals <- (v-vfit)^2
return(sum(residuals))
}
x <- nlminb(objective = of, start = c(start), v = var, sp = sqrt(n.genes-1/2), lower = c(1), upper = c(true.n.cells))
return((x$par)^2+1/2)
}
n.cells <- pagoda.effective.cells(pwpca)
vdf <- data.frame(do.call(rbind, lapply(seq_along(pwpca), function(i) {
vars <- as.numeric((pwpca[[i]]$xp$d))
cbind(i = i, var = vars, n = pwpca[[i]]$n, npc = seq(1:ncol(pwpca[[i]]$xp$rotation)))
})))
# fix p-to-q mistake in qWishartSpike
qWishartSpikeFixed <- function (q, spike, ndf = NA, pdim = NA, var = 1, beta = 1, lower.tail = TRUE, log.p = FALSE) {
params <- RMTstat::WishartSpikePar(spike, ndf, pdim, var, beta)
qnorm(q, mean = params$centering, sd = params$scaling, lower.tail, log.p)
}
# add right tail approximation to ptw, which gives up quite early
pWishartMaxFixed <- function (q, ndf, pdim, var = 1, beta = 1, lower.tail = TRUE) {
params <- RMTstat::WishartMaxPar(ndf, pdim, var, beta)
q.tw <- (q - params$centering)/(params$scaling)
p <- RMTstat::ptw(q.tw, beta, lower.tail, log.p = TRUE)
p[p == -Inf] <- pgamma((2/3)*q.tw[p == -Inf]^(3/2), 2/3, lower.tail = FALSE, log.p = TRUE) + lgamma(2/3) + log((2/3)^(1/3))
p
}
vshift <- 0
ev <- 0
vdf$var <- vdf$var-(vshift-ev)*vdf$n
basevar <- 1
vdf$exp <- RMTstat::qWishartMax(0.5, n.cells, vdf$n, var = basevar, lower.tail = FALSE)
#vdf$z <- qnorm(pWishartMax(vdf$var, n.cells, vdf$n, log.p = TRUE, lower.tail = FALSE, var = basevar), lower.tail = FALSE, log.p = TRUE)
vdf$z <- qnorm(pWishartMaxFixed(vdf$var, n.cells, vdf$n, lower.tail = FALSE, var = basevar), lower.tail = FALSE, log.p = TRUE)
vdf$cz <- qnorm(bh.adjust(pnorm(as.numeric(vdf$z), lower.tail = FALSE, log.p = TRUE), log = TRUE), lower.tail = FALSE, log.p = TRUE)
vdf$ub <- RMTstat::qWishartMax(score.alpha/2, n.cells, vdf$n, var = basevar, lower.tail = FALSE)
vdf$ub.stringent <- RMTstat::qWishartMax(score.alpha/nrow(vdf)/2, n.cells, vdf$n, var = basevar, lower.tail = FALSE)
if(plot) {
par(mfrow = c(1, 1), mar = c(3.5, 3.5, 1.0, 1.0), mgp = c(2, 0.65, 0))
un <- sort(unique(vdf$n))
on <- order(vdf$n, decreasing = FALSE)
pccol <- colorRampPalette(c("black", "grey70"), space = "Lab")(max(vdf$npc))
plot(vdf$n, vdf$var/vdf$n, xlab = "gene set size", ylab = "PC1 var/n", ylim = c(0, max(vdf$var/vdf$n)), col = adjustcolor(pccol[vdf$npc],alpha=0.1),pch=19)
lines(vdf$n[on], (vdf$exp/vdf$n)[on], col = 2, lty = 1)
lines(vdf$n[on], (vdf$ub.stringent/vdf$n)[on], col = 2, lty = 2)
}
rs <- (vshift-ev)*vdf$n
vdf$oe <- (vdf$var+rs)/(vdf$exp+rs)
vdf$oec <- (vdf$var+rs)/(vdf$ub+rs)
df <- data.frame(name = names(pwpca)[vdf$i], npc = vdf$npc, n = vdf$n, score = vdf$oe, z = vdf$z, adj.z = vdf$cz, stringsAsFactors = FALSE)
if(adjusted.pvalues) {
vdf$valid <- vdf$cz >= z.score
} else {
vdf$valid <- vdf$z >= z.score
}
if(!any(vdf$valid)) { stop("no significantly overdispersed pathways found at z.score threshold of ",z.score) };
# apply additional filtering based on >0.5 sd above the local random estimate
vdf$valid <- vdf$valid & unlist(lapply(pwpca,function(x) !is.null(x$xp$scores)))
vdf$name <- names(pwpca)[vdf$i];
if(return.table) {
df <- df[vdf$valid, ]
df <- df[order(df$score, decreasing = TRUE), ]
return(df)
}
if(verbose) {
message("compiling pathway reduction")
}
# calculate pathway reduction matrix
# return scaled patterns
xmv <- do.call(rbind, lapply(pwpca[vdf$valid], function(x) {
xm <- x$xp$scores
}))
if(use.oe.scale) {
xmv <- (xmv -rowMeans(xmv))* (as.numeric(vdf$oe[vdf$valid])/sqrt(apply(xmv, 1, var)))
vdf$sd <- as.numeric(vdf$oe)
} else {
# chi-squared
xmv <- (xmv-rowMeans(xmv)) * sqrt((qchisq(pnorm(vdf$z[vdf$valid], lower.tail = FALSE, log.p = TRUE), n.cells, lower.tail = FALSE, log.p = TRUE)/n.cells)/apply(xmv, 1, var))
vdf$sd <- sqrt((qchisq(pnorm(vdf$z, lower.tail = FALSE, log.p = TRUE), n.cells, lower.tail = FALSE, log.p = TRUE)/n.cells))
}
rownames(xmv) <- paste("#PC", vdf$npc[vdf$valid], "# ", names(pwpca)[vdf$i[vdf$valid]], sep = "")
rownames(vdf) <- paste("#PC", vdf$npc, "# ", vdf$name, sep = "")
misc[['pathwayODInfo']] <<- vdf
# collapse gene loading
if(verbose) {
message("clustering aspects based on gene loading ... ",appendLF=FALSE)
}
tam2 <- pagoda.reduce.loading.redundancy(list(xv=xmv,xvw=matrix(1,ncol=ncol(xmv),nrow=nrow(xmv))),pwpca,NULL,plot=FALSE,distance.threshold=loading.distance.threshold,n.cores=n.cores)
if(verbose) {
message(nrow(tam2$xv)," aspects remaining")
}
if(verbose) {
message("clustering aspects based on pattern similarity ... ",appendLF=FALSE)
}
tam3 <- pagoda.reduce.redundancy(tam2,distance.threshold=correlation.distance.threshold,top=top.aspects)
if(verbose) {
message(nrow(tam3$xv)," aspects remaining\n")
}
tam2$xvw <- tam3$xvw <- NULL; # to save space
tam3$env <- setenv;
# clean up aspect names, as GO ids are meaningless
names(tam3$cnam) <- rownames(tam3$xv) <- paste0('aspect',1:nrow(tam3$xv))
misc[['pathwayOD']] <<- tam3;
reductions[[name]] <<- tam3$xv;
return(invisible(tam3))
},
getEmbedding=function(type='counts', embeddingType='largeVis', name=NULL, dims=2, M=1, gamma=1/M, perplexity=50, sgd_batches=NULL, diffusion.steps=0, diffusion.power=0.5,
distance='pearson', n.cores = .self$n.cores, n.sgd.cores=n.cores, ... ) {
if(dims<1) stop("dimensions must be >=1")
if(type=='counts') {
x <- counts;
} else {
if(!type %in% names(reductions)) { stop("reduction ",type,' not found')}
x <- reductions[[type]]
}
if(is.null(name)) {
name <- embeddingType
}
if(embeddingType=='largeVis') {
edgeMat <- misc[['edgeMat']][[type]];
if(is.null(edgeMat)) { stop(paste('KNN graph for type ',type,' not found. Please run makeKnnGraph with type=',type,sep='')) }
if(is.null(sgd_batches)) { sgd_batches <- nrow(edgeMat)*1e3 }
#edgeMat <- sparseMatrix(i=xn$s+1,j=xn$e+1,x=xn$rd,dims=c(nrow(x),nrow(x)))
edgeMat <- (edgeMat + t(edgeMat))/2; # symmetrize
#edgeMat <- sparseMatrix(i=c(xn$s,xn$e)+1,j=c(xn$e,xn$s)+1,x=c(xn$rd,xn$rd),dims=c(nrow(x),nrow(x)))
# if(diffusion.steps>0) {
# Dinv <- Diagonal(nrow(edgeMat),1/colSums(edgeMat))
# Im <- Diagonal(nrow(edgeMat))
# W <- (Diagonal(nrow(edgeMat)) + edgeMat %*% Dinv)/2
# for(i in 1:diffusion.steps) {
# edgeMat <- edgeMat %*% W
# }
# }
#require(largeVis)
#if(!is.null(seed)) { set.seed(seed) }
if(!is.na(perplexity)) {
wij <- buildWijMatrix(edgeMat,perplexity=perplexity,threads=n.cores)
} else {
wij <- edgeMat;
}
if(diffusion.steps>0) {
Dinv <- Diagonal(nrow(wij),1/colSums(wij))
W <- Dinv %*% wij ;
W <- (1-diffusion.power)*Diagonal(nrow(wij)) + diffusion.power*W
#W <- (Diagonal(nrow(wij)) + W)/2
#W <- (Diagonal(nrow(wij)) + sign(W)*(abs(W)^(diffusion.power)))/2
#W <- sign(W)*(abs(W)^diffusion.power)
#W <- (Diagonal(nrow(wij)) + W)/2
for(i in 1:diffusion.steps) {
wij <- wij %*% W
}
#browser()
if(!is.na(perplexity)) {
wij <- buildWijMatrix(wij,perplexity=perplexity,threads=n.cores)
}
}
coords <- projectKNNs(wij = wij, M = M, dim=dims, verbose = TRUE,sgd_batches = sgd_batches,gamma=gamma, seed=1, threads=n.cores, ...)
colnames(coords) <- rownames(x);
emb <- embeddings[[type]][[name]] <<- t(coords);
} else if(embeddingType=='tSNE') {
if(nrow(x)>4e4) {
warning('Too many cells to pre-calcualte correlation distances, switching to L2. Please, consider using UMAP.');
distance <- 'L2';
}
dup.ids <- which(duplicated(x))
if (length(dup.ids) > 0) {
max.vals <- abs(x[dup.ids,] * 0.01)
x[dup.ids,] <- runif(length(x[dup.ids,]), -max.vals, max.vals)
}
if (distance=='L2') {
if(verbose) message("running tSNE using ",n.cores," cores:\n")
emb <- Rtsne::Rtsne(x, perplexity=perplexity, dims=dims, num_threads=n.cores, ... )$Y;
} else {
if(verbose) message('calculating distance ... ');
if(verbose) message('pearson ...')
d <- 1-cor(t(x))
if(verbose) message("running tSNE using ",n.cores," cores:\n")
emb <- Rtsne::Rtsne(d,is_distance=TRUE, perplexity=perplexity, dims=dims, num_threads=n.cores, ... )$Y;
}
rownames(emb) <- rownames(x)
embeddings[[type]][[name]] <<- emb;
} else if(embeddingType=='FR') {
g <- graphs[[type]];
if(is.null(g)){ stop(paste("generate KNN graph first (type=",type,")",sep=''))}
emb <- layout.fruchterman.reingold(g, weights=E(g)$weight)
rownames(emb) <- rownames(x); colnames(emb) <- c("D1","D2")
embeddings[[type]][[name]] <<- emb;
} else if (embeddingType == "UMAP") {
if (!requireNamespace("uwot", quietly=TRUE))
stop("You need to install package 'uwot' to be able to use UMAP embedding.")
distance <- switch (distance, pearson = "cosine", L2 = "euclidean", distance)
emb <- uwot::umap(as.matrix(x), metric=distance, verbose=verbose, n_threads=n.cores, n_sgd_threads=n.sgd.cores, n_components=dims, ...)
rownames(emb) <- rownames(x)
embeddings[[type]][[name]] <<- emb;
} else if (embeddingType == "UMAP_graph") {
g <- graphs[[type]];
if(is.null(g)){ stop(paste("generate KNN graph first (type=",type,")",sep=''))}
emb <- embedKnnGraphUmap(g, verbose=verbose, n_threads=n.cores, n_sgd_threads=n.sgd.cores, n_components=dims, ...)
embeddings[[type]][[name]] <<- emb;
} else {
stop('unknown embeddingType ',embeddingType,' specified');
}
return(invisible(emb));
}
)
);
网友评论