segfault when using data.table package in conjunction with foreach
0
0
Entering edit mode
@matthew-keller-2483
Last seen 10.2 years ago
Hi all, I'm trying to use the package data.table within a foreach loop. I'm grabbing 500M rows of data at a time from two different files and then doing an aggregate/tapply-like function in data.table after that. I had planned on doing a foreach loop 39 times at once for the 39 files I have, but obviously that won't work until I figure out why the segfault is occurring. The sessionInfo, code, and error are pasted below. If you have any ideas, would love to hear them. (I have no control over the version of R - 2.13.0 - being used). Best Matt SESSION INFO: > sessionInfo() R version 2.13.0 (2011-04-13) Platform: x86_64-unknown-linux-gnu (64-bit) locale: [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8 LC_MONETARY=C [6] LC_MESSAGES=en_US.UTF-8 LC_PAPER=en_US.UTF-8 LC_NAME=C LC_ADDRESS=C LC_TELEPHONE=C [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C attached base packages: [1] stats graphics grDevices utils datasets methods base other attached packages: [1] data.table_1.7.10 doMC_1.2.2 multicore_0.1-5 foreach_1.3.2 codetools_0.2-8 iterators_1.0.3 MY CODE: computeAllPairSums <- function(filename, nbindiv,nrows.to.read) { con <- file(filename, open="r") on.exit(close(con)) ans <- matrix(numeric(nbindiv * nbindiv), nrow=nbindiv) chunk <- 0L while (TRUE) { #read.table faster than scan df0 <- read.table(con,col.names=c("ID1", "ID2", "ignored", "sharing"), colClasses=c("integer", "integer", "NULL", "numeric"),nrows=nrows.to.read,comment.char="") DT <- data.table(df0) setkey(DT,ID1,ID2) ss <- DT[,sum(sharing),by="ID1,ID2"] if (nrow(df0) == 0L) break chunk <- chunk + 1L cat("Processing chunk", chunk, "... 
") idd <- as.matrix(subset(ss,select=1:2)) newvec <- as.vector(as.matrix(subset(ss,select=3))) ans[idd] <- ans[idd] + newvec cat("OK\n") } ans } require(foreach) require(doMC) registerDoMC(cores=2) num <- 8891 nr <- 500000000L #500 million rows at a time MMM <- foreach(IT = 1:2) %dopar% { require(data.table) if (IT==1){ x <- system.time({computeAllPairSums( paste(GERMLINE,"bc.chr22.q.20.file",sep=''),num,nr)}) } #Run it on regular file PID 6489, 24 gb if (IT==2){ z <- system.time({computeAllPairSums.gz( paste(GERMLINE,"bc.chr22.q.20.gz",sep=''),num,nr)}) } #Run it on gz file PID 6490, 24 gb } MY R OUTPUT/ERROR: MMM <- foreach(IT = 1:2) %dopar% { + require(data.table) + if (IT==1){ x <- system.time({computeAllPairSums( paste(GERMLINE,"bc.chr22.q.20.file",sep=''),num,nr)}) } #Run it on regular file PID 6053, 5.9 gb + if (IT==2){ z <- system.time({computeAllPairSums.gz( paste(GERMLINE,"bc.chr22.q.20.gz",sep=''),num,nr)}) } #Run it on gz file PID 6054, 4 gb + } Loading required package: data.table Loading required package: data.table data.table 1.7.10 For help type: help("data.table") data.table 1.7.10 For help type: help("data.table") *** caught segfault *** address 0x2ae93df90000, cause 'memory not mapped' Traceback: 1: .Call("dogroups", x, xcols, o__, f__, len__, jsub, SDenv, testj, byretn, byval, i, as.integer(icols), i[1, ivars, with = FALSE], if (length(ivars)) paste("i.", ivars, sep = ""), is.na(nomatch), verbose, PACKAGE = "data.table") 2: `[.data.table`(DT, , sum(sharing), by = "ID1,ID2") 3: DT[, sum(sharing), by = "ID1,ID2"] 4: computeAllPairSums(paste(GERMLINE, "bc.chr22.q.20.file", sep = ""), num, nr) 5: system.time({ computeAllPairSums(paste(GERMLINE, "bc.chr22.q.20.file", sep = ""), num, nr)}) 6: eval(expr, envir, enclos) 7: eval(c.expr, envir = args, enclos = envir) 8: doTryCatch(return(expr), name, parentenv, handler) 9: tryCatchOne(expr, names, parentenv, handlers[[1L]]) 10: tryCatchList(expr, classes, parentenv, handlers) 11: tryCatch(eval(c.expr, 
envir = args, enclos = envir), error = function(e) e) 12: FUN(X[[1L]], ...) 13: lapply(S, FUN, ...) 14: doTryCatch(return(expr), name, parentenv, handler) 15: tryCatchOne(expr, names, parentenv, handlers[[1L]]) 16: tryCatchList(expr, classes, parentenv, handlers) 17: tryCatch(expr, error = function(e) { call <- conditionCall(e) if (!is.null(call)) { if (identical(call[[1L]], quote(doTryCatch))) call <- sys.call(-4L) dcall <- deparse(call)[1L] prefix <- paste("Error in", dcall, ": ") LONG <- 75L msg <- conditionMessage(e) sm <- strsplit(msg, "\n")[[1L]] w <- 14L + nchar(dcall, type = "w") + nchar(sm[1L], type = "w") if (is.na(w)) w <- 14L + nchar(dcall, type = "b") + nchar(sm[1L], type = "b") if (w > LONG) prefix <- paste(prefix, "\n ", sep = "") } else prefix <- "Error : " msg <- paste(prefix, conditionMessage(e), "\n", sep = "") .Internal(seterrmessage(msg[1L])) if (!silent && identical(getOption("show.error.messages"), TRUE)) { cat(msg, file = stderr()) .Internal(printDeferredWarnings()) } invisible(structure(msg, class = "try-error"))}) 18: try(lapply(S, FUN, ...), silent = TRUE) 19: sendMaster(try(lapply(S, FUN, ...), silent = TRUE)) 20: FUN(1:2[[1L]], ...) 21: lapply(1:cores, inner.do) 22: mclapply(argsList, FUN, mc.preschedule = preschedule, mc.set.seed = set.seed, mc.silent = silent, mc.cores = cores) 23: e$fun(obj, substitute(ex), parent.frame(), e$data) 24: foreach(IT = 1:2) %dopar% { require(data.table) if (IT == 1) { x <- system.time({ computeAllPairSums(paste(GERMLINE, "bc.chr22.q.20.file", sep = ""), num, nr) }) } if (IT == 2) { z <- system.time({ computeAllPairSums.gz(paste(GERMLINE, "bc.chr22.q.20.gz", sep = ""), num, nr) }) }} Possible actions: 1: abort (with core dump, if enabled) 2: normal R exit 3: exit R without saving workspace 4: exit R saving workspace -- Matthew C Keller Asst. Professor of Psychology University of Colorado at Boulder www.matthewckeller.com
• 1.4k views
ADD COMMENT

Login before adding your answer.

Traffic: 543 users visited in the last hour
Help About
FAQ
Access RSS
API
Stats

Use of this site constitutes acceptance of our User Agreement and Privacy Policy.

Powered by the version 2.3.6