Original

Question

Improve performance of qusage aggregateGeneSet

0

Entering edit mode

oliverd • 0

@949ecd58

Last seen 9 hours ago

United States

The current implementation of aggregateGeneSet performs a cbind operation over each step of the main loop. As n.points increases this becomes extremely slow. Optimizing this step is simple and would greatly improve computational cost, see below.

Original


PDFs = pathMeans = Sizes = NULL
  for(i in 1:length(geneSets)){
    if(!silent & i%%5==0){cat(".")}
    Indexes = geneSets[[i]]
    if(length(Indexes)!=0){
      Norm<-(2*MaxDiff[i]/{n.points-1}) #normalize 
      PDF<-sapply(Indexes,function(j){
          x = SD[j]
          MaxDiffInt<-MaxDiff[i]/x
          #y<-dt(seq(-MaxDiffInt,MaxDiffInt,length.out=n.points),DOF)  
          x1<-seq(-MaxDiffInt,MaxDiffInt,length.out=n.points)
          y<-dt(x1[1:(n.points/2)],DOF[j])
          y<-c(y,rev(y))
          y/sum(y)/Norm
      })
      Tmp<-multi_conv(PDF)
      Tmp = Tmp*(n.points-1)/MaxDiff[i]/2
    }else{
      warning(paste("Gene set: (index ",i, ") has 0 overlap with eset.",sep=""))
      Tmp = rep(NA, n.points)
    }
    PDFs = cbind(PDFs,Tmp)
    pathMeans = c(pathMeans, mean(Means[Indexes]))
    Sizes = c(Sizes, length(Indexes))
  }

Optimized

# pre-allocate matrix with NA_real
PDFs <- matrix(NA_real_, nrow = n.points, ncol = length(geneSets))
pathMeans = Sizes = NULL
  for(i in 1:length(geneSets)){
    if(!silent & i%%5==0){cat(".")}
    Indexes = geneSets[[i]]
    if(length(Indexes)!=0){
      Norm<-(2*MaxDiff[i]/{n.points-1}) #normalize 
      PDF<-sapply(Indexes,function(j){
          x = SD[j]
          MaxDiffInt<-MaxDiff[i]/x
          #y<-dt(seq(-MaxDiffInt,MaxDiffInt,length.out=n.points),DOF)  
          x1<-seq(-MaxDiffInt,MaxDiffInt,length.out=n.points)
          y<-dt(x1[1:(n.points/2)],DOF[j])
          y<-c(y,rev(y))
          y/sum(y)/Norm
      })
      Tmp<-multi_conv(PDF)
      Tmp = Tmp*(n.points-1)/MaxDiff[i]/2
    }else{
      warning(paste("Gene set: (index ",i, ") has 0 overlap with eset.",sep=""))
      # no longer needed
      # Tmp = rep(NA, n.points)
    }
    PDFs[,i] <- Tmp
    pathMeans = c(pathMeans, mean(Means[Indexes]))
    Sizes = c(Sizes, length(Indexes))
  }

Parallelized

As an additional note, this code is the main time consuming step of the qusage workflow and could be parallelized for significant speed-up at the cost of having to do some minor data manipulation afterwards

res <- parallel::mclapply(1:length(geneSets), \(i){
    Indexes = geneSets[[i]]
    if(length(Indexes)!=0){
        Norm <- (2*MaxDiff[i]/(n.points-1)) #normalize 
        PDF <- sapply(Indexes, function(j){
            x <- SD[j]
            MaxDiffInt <- MaxDiff[i]/x
            x1 <- seq(-MaxDiffInt, MaxDiffInt, length.out = n.points)
            y <- dt(x1[1:(n.points/2)], DOF[j])
            y <- c(y, rev(y))
            y/sum(y)/Norm
        })
        Tmp <- multi_conv(PDF)
        Tmp <- Tmp*(n.points-1)/MaxDiff[i]/2
    } else{
      warning(paste("Gene set: (index ",i, ") has 0 overlap with eset.",sep=""))
      Tmp = rep(NA, n.points)
    }
    list(PDFs = Tmp, pathMeans = mean(Means[Indexes]), Sizes = length(Indexes))
}, mc.cores = ncores)

qusage • 40 views

ADD COMMENT • link updated 8 hours ago by James W. MacDonald 67k • written 10 hours ago by oliverd • 0

score 0 · Answer 1 · 2025-01-14

This support site is meant mostly to help people with questions about using Bioconductor packages rather than posting code improvements. I don't see a public facing GitHub page for this package, but you can certainly clone the repo, (git clone https://git.bioconductor.org/packages/qusage), make the changes, generate a patch file and then send the patch to the maintainer.