The current implementation of aggregateGeneSet
performs a cbind
operation over each step of the main loop. As n.points
increases this becomes extremely slow. Optimizing this step is simple and would greatly improve computational cost, see below.
Original
PDFs = pathMeans = Sizes = NULL
for(i in 1:length(geneSets)){
if(!silent & i%%5==0){cat(".")}
Indexes = geneSets[[i]]
if(length(Indexes)!=0){
Norm<-(2*MaxDiff[i]/{n.points-1}) #normalize
PDF<-sapply(Indexes,function(j){
x = SD[j]
MaxDiffInt<-MaxDiff[i]/x
#y<-dt(seq(-MaxDiffInt,MaxDiffInt,length.out=n.points),DOF)
x1<-seq(-MaxDiffInt,MaxDiffInt,length.out=n.points)
y<-dt(x1[1:(n.points/2)],DOF[j])
y<-c(y,rev(y))
y/sum(y)/Norm
})
Tmp<-multi_conv(PDF)
Tmp = Tmp*(n.points-1)/MaxDiff[i]/2
}else{
warning(paste("Gene set: (index ",i, ") has 0 overlap with eset.",sep=""))
Tmp = rep(NA, n.points)
}
PDFs = cbind(PDFs,Tmp)
pathMeans = c(pathMeans, mean(Means[Indexes]))
Sizes = c(Sizes, length(Indexes))
}
Optimized
# pre-allocate matrix with NA_real
PDFs <- matrix(NA_real_, nrow = n.points, ncol = length(geneSets))
pathMeans = Sizes = NULL
for(i in 1:length(geneSets)){
if(!silent & i%%5==0){cat(".")}
Indexes = geneSets[[i]]
if(length(Indexes)!=0){
Norm<-(2*MaxDiff[i]/{n.points-1}) #normalize
PDF<-sapply(Indexes,function(j){
x = SD[j]
MaxDiffInt<-MaxDiff[i]/x
#y<-dt(seq(-MaxDiffInt,MaxDiffInt,length.out=n.points),DOF)
x1<-seq(-MaxDiffInt,MaxDiffInt,length.out=n.points)
y<-dt(x1[1:(n.points/2)],DOF[j])
y<-c(y,rev(y))
y/sum(y)/Norm
})
Tmp<-multi_conv(PDF)
Tmp = Tmp*(n.points-1)/MaxDiff[i]/2
}else{
warning(paste("Gene set: (index ",i, ") has 0 overlap with eset.",sep=""))
# no longer needed
# Tmp = rep(NA, n.points)
}
PDFs[,i] <- Tmp
pathMeans = c(pathMeans, mean(Means[Indexes]))
Sizes = c(Sizes, length(Indexes))
}
Parallelized
As an additional note, this code is the main time consuming step of the qusage workflow and could be parallelized for significant speed-up at the cost of having to do some minor data manipulation afterwards
res <- parallel::mclapply(1:length(geneSets), \(i){
Indexes = geneSets[[i]]
if(length(Indexes)!=0){
Norm <- (2*MaxDiff[i]/(n.points-1)) #normalize
PDF <- sapply(Indexes, function(j){
x <- SD[j]
MaxDiffInt <- MaxDiff[i]/x
x1 <- seq(-MaxDiffInt, MaxDiffInt, length.out = n.points)
y <- dt(x1[1:(n.points/2)], DOF[j])
y <- c(y, rev(y))
y/sum(y)/Norm
})
Tmp <- multi_conv(PDF)
Tmp <- Tmp*(n.points-1)/MaxDiff[i]/2
} else{
warning(paste("Gene set: (index ",i, ") has 0 overlap with eset.",sep=""))
Tmp = rep(NA, n.points)
}
list(PDFs = Tmp, pathMeans = mean(Means[Indexes]), Sizes = length(Indexes))
}, mc.cores = ncores)