edgeR with two different groups (unpaired) at two time points (paired): Need some input
3
0
Entering edit mode
Sindre ▴ 110
@sindre-6193
Last seen 4.3 years ago
Hi! The edgeR manual is quite nice, but I'm not quite sure if I'm on the right track.. The question to answer is: "Is there any significantly changed genes in diabetics at time point 2, compared to healthy?" Heres my codes: #Making the design matrix status <- factor(c(rep("Healthy",26), rep("Diabetic",22)), levels=c("Healthy","Diabetic")) patients <- factor(c(88,87,86,83,82,81,75,72,71,70,13,08,01,88,87,86,83,82,81,75,7 2,71,70,13,08,01,79,77,76,73,67,62,61,55,53,21,04,79,77,76,73,67,62,61 ,55,53,21,04)) timepoints = as.factor(c(1,1,1,1,1,1,1,1,1,1,1,1,1,2,2,2,2,2,2,2,2,2,2,2,2,2,1,1,1, 1,1,1,1,1,1,1,1,2,2,2,2,2,2,2,2,2,2,2)) design <- model.matrix(~patients + timepoints*status) > colnames(design) [1] "(Intercept)" "patients4" [3] "patients8" "patients13" [5] "patients21" "patients53" [7] "patients55" "patients61" [9] "patients62" "patients67" [11] "patients70" "patients71" [13] "patients72" "patients73" [15] "patients75" "patients76" [17] "patients77" "patients79" [19] "patients81" "patients82" [21] "patients83" "patients86" [23] "patients87" "patients88" [25] "timepoints2" "statusDiabetic" [27] "timepoints2:statusDiabetic" #Reading in HTSeq data description <- c("test"); fileNames <- list.files(path="/Volumes/timemachine/HTseq_DEseq2",pattern="*.txt"); files <- sort(fileNames,decreasing=TRUE); targets <- data.frame(files=files, group=status, description=description); library(edgeR); d <- readDGE(targets,path="/Volumes/timemachine/HTseq_DEseq2",skip=5,commen t.char="!"); colnames(d) <-(c("N288","N287","N286","N283","N282","N281","N275","N272","N271","N 270","N213","N208","N201","N188","187","N186","N183","N182","N181","N1 75","N172","N171","N170","N113","N108","N101","D279","D277","D276","D2 73","D267","D262","D261","D255","D253","D221","D204","D179","D177","D1 76","D173","D167","D162","D161","D155","D153","D121","D104")); d$samples; dim(d) #Normalization d <- (calcNormFactors(d,method="RLE")); #Estimating the dispersion: d <- estimateCommonDisp(d,verbose=TRUE) Disp = 0.05823 , BCV = 0.2413 d <- estimateTagwiseDisp(d,trend="none") #diffeksp > fit <- glmFit(d,design) Error in glmFit.default(y = y$counts, design = design, dispersion = dispersion, : Design matrix not of full rank. The following coefficients not estimable: statusDiabetic Can someone tell me why I get this error? Thanks alot!
edgeR edgeR • 1.6k views
ADD COMMENT
0
Entering edit mode
Sindre ▴ 110
@sindre-6193
Last seen 4.3 years ago
On 2013-10-22 20:03, Sindre Lee wrote: > Hi! > The edgeR manual is quite nice, but I'm not quite sure if I'm on the > right track.. > > The question to answer is: "Is there any significantly changed genes > in diabetics at time point 2, compared to healthy?" > > Heres my codes: > > #Making the design matrix > > status <- factor(c(rep("Healthy",26), rep("Diabetic",22)), > levels=c("Healthy","Diabetic")) > patients <- > factor(c(88,87,86,83,82,81,75,72,71,70,13,08,01,88,87,86,83,82,81,75 ,72,71,70,13,08,01,79,77,76,73,67,62,61,55,53,21,04,79,77,76,73,67,62, 61,55,53,21,04)) > timepoints = > as.factor(c(1,1,1,1,1,1,1,1,1,1,1,1,1,2,2,2,2,2,2,2,2,2,2,2,2,2,1,1, 1,1,1,1,1,1,1,1,1,2,2,2,2,2,2,2,2,2,2,2)) > design <- model.matrix(~patients + timepoints*status) >> colnames(design) > [1] "(Intercept)" "patients4" > [3] "patients8" "patients13" > [5] "patients21" "patients53" > [7] "patients55" "patients61" > [9] "patients62" "patients67" > [11] "patients70" "patients71" > [13] "patients72" "patients73" > [15] "patients75" "patients76" > [17] "patients77" "patients79" > [19] "patients81" "patients82" > [21] "patients83" "patients86" > [23] "patients87" "patients88" > [25] "timepoints2" "statusDiabetic" > [27] "timepoints2:statusDiabetic" > > #Reading in HTSeq data > > description <- c("test"); > fileNames <- > list.files(path="/Volumes/timemachine/HTseq_DEseq2",pattern="*.txt"); > files <- sort(fileNames,decreasing=TRUE); > targets <- data.frame(files=files, group=status, > description=description); > library(edgeR); > d <- > readDGE(targets,path="/Volumes/timemachine/HTseq_DEseq2",skip=5,comm ent.char="!"); > colnames(d) > <-(c("N288","N287","N286","N283","N282","N281","N275","N272","N271", "N270","N213","N208","N201","N188","187","N186","N183","N182","N181"," N175","N172","N171","N170","N113","N108","N101","D279","D277","D276"," D273","D267","D262","D261","D255","D253","D221","D204","D179","D177"," D176","D173","D167","D162","D161","D155","D153","D121","D104")); > d$samples; > dim(d) > > #Normalization > > d <- (calcNormFactors(d,method="RLE")); > > #Estimating the dispersion: > > d <- estimateCommonDisp(d,verbose=TRUE) > Disp = 0.05823 , BCV = 0.2413 > d <- estimateTagwiseDisp(d,trend="none") > > #diffeksp > >> fit <- glmFit(d,design) > Error in glmFit.default(y = y$counts, design = design, dispersion = > dispersion, : > Design matrix not of full rank. The following coefficients not > estimable: > statusDiabetic > > Can someone tell me why I get this error? > > Thanks alot! MARK! In group 1: 13 samples In group 2: 11 samples I guess this is the problem? How to handle this?
ADD COMMENT
0
Entering edit mode
The problem is that the columns of your design matrix are linearly dependent: the statusDiabetic column can be written as a sum of all the columns corresponding to diabetic patients. In other words, the patient and status variables are confounded. I think your only option with edgeR is to drop the patients term from your model and just use "~ status * timepoints", but someone else might have another idea. -Ryan On Tue Oct 22 13:56:46 2013, Sindre Lee wrote: > On 2013-10-22 20:03, Sindre Lee wrote: >> Hi! >> The edgeR manual is quite nice, but I'm not quite sure if I'm on the >> right track.. >> >> The question to answer is: "Is there any significantly changed genes >> in diabetics at time point 2, compared to healthy?" >> >> Heres my codes: >> >> #Making the design matrix >> >> status <- factor(c(rep("Healthy",26), rep("Diabetic",22)), >> levels=c("Healthy","Diabetic")) >> patients <- >> factor(c(88,87,86,83,82,81,75,72,71,70,13,08,01,88,87,86,83,82,81,7 5,72,71,70,13,08,01,79,77,76,73,67,62,61,55,53,21,04,79,77,76,73,67,62 ,61,55,53,21,04)) >> >> timepoints = >> as.factor(c(1,1,1,1,1,1,1,1,1,1,1,1,1,2,2,2,2,2,2,2,2,2,2,2,2,2,1,1 ,1,1,1,1,1,1,1,1,1,2,2,2,2,2,2,2,2,2,2,2)) >> >> design <- model.matrix(~patients + timepoints*status) >>> colnames(design) >> [1] "(Intercept)" "patients4" >> [3] "patients8" "patients13" >> [5] "patients21" "patients53" >> [7] "patients55" "patients61" >> [9] "patients62" "patients67" >> [11] "patients70" "patients71" >> [13] "patients72" "patients73" >> [15] "patients75" "patients76" >> [17] "patients77" "patients79" >> [19] "patients81" "patients82" >> [21] "patients83" "patients86" >> [23] "patients87" "patients88" >> [25] "timepoints2" "statusDiabetic" >> [27] "timepoints2:statusDiabetic" >> >> #Reading in HTSeq data >> >> description <- c("test"); >> fileNames <- >> list.files(path="/Volumes/timemachine/HTseq_DEseq2",pattern="*.txt"); >> files <- sort(fileNames,decreasing=TRUE); >> targets <- data.frame(files=files, group=status, >> description=description); >> library(edgeR); >> d <- >> readDGE(targets,path="/Volumes/timemachine/HTseq_DEseq2",skip=5,com ment.char="!"); >> >> colnames(d) >> <-(c("N288","N287","N286","N283","N282","N281","N275","N272","N271" ,"N270","N213","N208","N201","N188","187","N186","N183","N182","N181", "N175","N172","N171","N170","N113","N108","N101","D279","D277","D276", "D273","D267","D262","D261","D255","D253","D221","D204","D179","D177", "D176","D173","D167","D162","D161","D155","D153","D121","D104")); >> >> d$samples; >> dim(d) >> >> #Normalization >> >> d <- (calcNormFactors(d,method="RLE")); >> >> #Estimating the dispersion: >> >> d <- estimateCommonDisp(d,verbose=TRUE) >> Disp = 0.05823 , BCV = 0.2413 >> d <- estimateTagwiseDisp(d,trend="none") >> >> #diffeksp >> >>> fit <- glmFit(d,design) >> Error in glmFit.default(y = y$counts, design = design, dispersion = >> dispersion, : >> Design matrix not of full rank. The following coefficients not >> estimable: >> statusDiabetic >> >> Can someone tell me why I get this error? >> >> Thanks alot! > > MARK! > In group 1: 13 samples > In group 2: 11 samples > > I guess this is the problem? How to handle this? > > _______________________________________________ > Bioconductor mailing list > Bioconductor at r-project.org > https://stat.ethz.ch/mailman/listinfo/bioconductor > Search the archives: > http://news.gmane.org/gmane.science.biology.informatics.conductor
ADD REPLY
0
Entering edit mode
@gordon-smyth
Last seen 20 minutes ago
WEHI, Melbourne, Australia
Dear Sindre, I think that you want to find genes for which the change from time1 to time2 is different in the diabetic patients vs the healthy patients. In your experiment, you have two types of factors. The time points are within patients whereas the desease status is between patients. See section 3.5 of the edgeR User's Guide. Setting up the design matrix for such an experiment is a little tricky, not because of the edgeR but because of the limitations of model.matrix() in R. I suggest you do it like this. Set up a variable for timepoint 2 for healthy patients only: time2.healthy <- as.numeric(status=="Healthy" & timepoints==2) Same for diabetics: time2.diabetic <- as.numeric(status=="Diabetic" & timepoints==2) Then design <- model.matrix(~patients+time2.healthy+time2.diabetic) The patients term will pick up the baseline expression level for each patient at time1. time2.healthy will pick up the time2 (treatment) effect for healthy patients and time2.diabetic will do the same for diabetics. Finally, you will want to test the time2.diabetic-time2.healthy contrast. Best wishes Gordon > Date: Tue, 22 Oct 2013 20:03:32 +0200 > From: Sindre Lee <sindre.lee at="" studmed.uio.no=""> > To: <bioconductor at="" r-project.org=""> > Subject: [BioC] edgeR with two different groups (unpaired) at two time > points (paired): Need some input > > Hi! > The edgeR manual is quite nice, but I'm not quite sure if I'm on the > right track.. > > The question to answer is: "Is there any significantly changed genes in > diabetics at time point 2, compared to healthy?" > > Heres my codes: > > #Making the design matrix > > status <- factor(c(rep("Healthy",26), rep("Diabetic",22)), > levels=c("Healthy","Diabetic")) > patients <- > factor(c(88,87,86,83,82,81,75,72,71,70,13,08,01,88,87,86,83,82,81,75 ,72,71,70,13,08,01,79,77,76,73,67,62,61,55,53,21,04,79,77,76,73,67,62, 61,55,53,21,04)) > timepoints = > as.factor(c(1,1,1,1,1,1,1,1,1,1,1,1,1,2,2,2,2,2,2,2,2,2,2,2,2,2,1,1, 1,1,1,1,1,1,1,1,1,2,2,2,2,2,2,2,2,2,2,2)) > design <- model.matrix(~patients + timepoints*status) >> colnames(design) > [1] "(Intercept)" "patients4" > [3] "patients8" "patients13" > [5] "patients21" "patients53" > [7] "patients55" "patients61" > [9] "patients62" "patients67" > [11] "patients70" "patients71" > [13] "patients72" "patients73" > [15] "patients75" "patients76" > [17] "patients77" "patients79" > [19] "patients81" "patients82" > [21] "patients83" "patients86" > [23] "patients87" "patients88" > [25] "timepoints2" "statusDiabetic" > [27] "timepoints2:statusDiabetic" > > #Reading in HTSeq data > > description <- c("test"); > fileNames <- > list.files(path="/Volumes/timemachine/HTseq_DEseq2",pattern="*.txt"); > files <- sort(fileNames,decreasing=TRUE); > targets <- data.frame(files=files, group=status, > description=description); > library(edgeR); > d <- > readDGE(targets,path="/Volumes/timemachine/HTseq_DEseq2",skip=5,comm ent.char="!"); > colnames(d) > <-(c("N288","N287","N286","N283","N282","N281","N275","N272","N271", "N270","N213","N208","N201","N188","187","N186","N183","N182","N181"," N175","N172","N171","N170","N113","N108","N101","D279","D277","D276"," D273","D267","D262","D261","D255","D253","D221","D204","D179","D177"," D176","D173","D167","D162","D161","D155","D153","D121","D104")); > d$samples; > dim(d) > > #Normalization > > d <- (calcNormFactors(d,method="RLE")); > > #Estimating the dispersion: > > d <- estimateCommonDisp(d,verbose=TRUE) > Disp = 0.05823 , BCV = 0.2413 > d <- estimateTagwiseDisp(d,trend="none") > > #diffeksp > >> fit <- glmFit(d,design) > Error in glmFit.default(y = y$counts, design = design, dispersion = > dispersion, : > Design matrix not of full rank. The following coefficients not > estimable: > statusDiabetic > > Can someone tell me why I get this error? > > Thanks alot! > ______________________________________________________________________ The information in this email is confidential and intend...{{dropped:4}}
ADD COMMENT
0
Entering edit mode
@gordon-smyth
Last seen 20 minutes ago
WEHI, Melbourne, Australia
Please keep the discussion on the Bioconductor mailing list. On Thu, 24 Oct 2013, Sindre Lee wrote: > On 2013-10-24 00:40, Gordon K Smyth wrote: >> Dear Sindre, >> >> I think that you want to find genes for which the change from time1 >> to time2 is different in the diabetic patients vs the healthy >> patients. >> >> In your experiment, you have two types of factors. The time points >> are within patients whereas the desease status is between patients. >> See section 3.5 of the edgeR User's Guide. >> >> Setting up the design matrix for such an experiment is a little >> tricky, not because of the edgeR but because of the limitations of >> model.matrix() in R. >> >> I suggest you do it like this. Set up a variable for timepoint 2 for >> healthy patients only: >> >> time2.healthy <- as.numeric(status=="Healthy" & timepoints==2) >> >> Same for diabetics: >> >> time2.diabetic <- as.numeric(status=="Diabetic" & timepoints==2) >> >> Then >> >> design <- model.matrix(~patients+time2.healthy+time2.diabetic) >> >> The patients term will pick up the baseline expression level for each >> patient at time1. time2.healthy will pick up the time2 (treatment) >> effect for healthy patients and time2.diabetic will do the same for >> diabetics. >> >> Finally, you will want to test the time2.diabetic-time2.healthy contrast. >> >> Best wishes >> Gordon >> >>> Date: Tue, 22 Oct 2013 20:03:32 +0200 >>> From: Sindre Lee <sindre.lee at="" studmed.uio.no=""> >>> To: <bioconductor at="" r-project.org=""> >>> Subject: [BioC] edgeR with two different groups (unpaired) at two time >>> points (paired): Need some input >>> >>> Hi! >>> The edgeR manual is quite nice, but I'm not quite sure if I'm on the >>> right track.. >>> >>> The question to answer is: "Is there any significantly changed genes in >>> diabetics at time point 2, compared to healthy?" >>> >>> Heres my codes: >>> >>> #Making the design matrix >>> >>> status <- factor(c(rep("Healthy",26), rep("Diabetic",22)), >>> levels=c("Healthy","Diabetic")) >>> patients <- >>> factor(c(88,87,86,83,82,81,75,72,71,70,13,08,01,88,87,86,83,82,81, 75,72,71,70,13,08,01,79,77,76,73,67,62,61,55,53,21,04,79,77,76,73,67,6 2,61,55,53,21,04)) >>> timepoints = >>> as.factor(c(1,1,1,1,1,1,1,1,1,1,1,1,1,2,2,2,2,2,2,2,2,2,2,2,2,2,1, 1,1,1,1,1,1,1,1,1,1,2,2,2,2,2,2,2,2,2,2,2)) >>> design <- model.matrix(~patients + timepoints*status) >>>> colnames(design) >>> [1] "(Intercept)" "patients4" >>> [3] "patients8" "patients13" >>> [5] "patients21" "patients53" >>> [7] "patients55" "patients61" >>> [9] "patients62" "patients67" >>> [11] "patients70" "patients71" >>> [13] "patients72" "patients73" >>> [15] "patients75" "patients76" >>> [17] "patients77" "patients79" >>> [19] "patients81" "patients82" >>> [21] "patients83" "patients86" >>> [23] "patients87" "patients88" >>> [25] "timepoints2" "statusDiabetic" >>> [27] "timepoints2:statusDiabetic" >>> >>> #Reading in HTSeq data >>> >>> description <- c("test"); >>> fileNames <- >>> list.files(path="/Volumes/timemachine/HTseq_DEseq2",pattern="*.txt"); >>> files <- sort(fileNames,decreasing=TRUE); >>> targets <- data.frame(files=files, group=status, >>> description=description); >>> library(edgeR); >>> d <- >>> readDGE(targets,path="/Volumes/timemachine/HTseq_DEseq2",skip=5,co mment.char="!"); >>> colnames(d) >>> <-(c("N288","N287","N286","N283","N282","N281","N275","N272","N271 ","N270","N213","N208","N201","N188","187","N186","N183","N182","N181" ,"N175","N172","N171","N170","N113","N108","N101","D279","D277","D276" ,"D273","D267","D262","D261","D255","D253","D221","D204","D179","D177" ,"D176","D173","D167","D162","D161","D155","D153","D121","D104")); >>> d$samples; >>> dim(d) >>> >>> #Normalization >>> >>> d <- (calcNormFactors(d,method="RLE")); >>> >>> #Estimating the dispersion: >>> >>> d <- estimateCommonDisp(d,verbose=TRUE) >>> Disp = 0.05823 , BCV = 0.2413 >>> d <- estimateTagwiseDisp(d,trend="none") >>> >>> #diffeksp >>> >>>> fit <- glmFit(d,design) >>> Error in glmFit.default(y = y$counts, design = design, dispersion = >>> dispersion, : >>> Design matrix not of full rank. The following coefficients not >>> estimable: >>> statusDiabetic >>> >>> Can someone tell me why I get this error? >>> >>> Thanks alot! >>> >> > > Will this do? > > lrt <- glmLRT(fit, > contrast=c(0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,-1,1)) > > Coefficient: -1*time2.healthy 1*time2.diabetic > Yes, that is correct. To find genes DE at time2 in healthy patients you can use lrt <- glmLRT(fit, coef=25) To find genes DE at time2 in diabetic patients you can use lrt <- glmLRT(fit, coef=26) To find genes whose response differs in diabetic patients vs healthy, the call that you give is correct. You could construct this by: con <- makeContrasts(time2.diabetic-time2.healthy, levels=design) lrt <- glmLRT(fit, contrast=con) Or alternatively con <- rep(0,26) con[26] <- 1 con[25] <- -1 Best wishes Gordon > design <- model.matrix(~patients+time2.healthy+time2.diabetic) > colnames(design) [1] "(Intercept)" "patients4" "patients8" "patients13" [5] "patients21" "patients53" "patients55" "patients61" [9] "patients62" "patients67" "patients70" "patients71" [13] "patients72" "patients73" "patients75" "patients76" [17] "patients77" "patients79" "patients81" "patients82" [21] "patients83" "patients86" "patients87" "patients88" [25] "time2.healthy" "time2.diabetic" > ______________________________________________________________________ The information in this email is confidential and intend...{{dropped:4}}
ADD COMMENT

Login before adding your answer.

Traffic: 679 users visited in the last hour
Help About
FAQ
Access RSS
API
Stats

Use of this site constitutes acceptance of our User Agreement and Privacy Policy.

Powered by the version 2.3.6