Entering edit mode
Helen Smith
▴
100
@helen-smith-6087
Last seen 10.3 years ago
Hi All,
I have a general bioconductor question. I've been struggling with how
to do this all week, so any help would be much appreciated,
I have used correlation analysis in the past to define a range of
genes which correlate with a certain prognosis score (all prognosis
scores known). Please see the script at the bottom of this page that I
have previously applied.
This new data set is a bit different as I know the prognosis of 2/3's
of the samples but the third set is unknown and I want to figure out
individual samples within this third set are more like sample set A or
B (different samples within this 'unknown set C' may be split, they
are not all necessarily the all the same (good or bad)):
* Sample set A
o poor prognosis
* Sample set B
o good prognosis
* Sample set C
o Unknown prognosis
Can the script be amended in some way to account for this and cluster
samples within group C near set A or B within the heatmaps?
Thanks again,
Helen
Previous script:
setwd("mnlkn")
dat<-read.csv("correlation analysis- gene names (SIBLING
REMOVAL).csv",header=T)
dat[1:10,]
Gene<-dat[,1]
dat<-dat[,-1]
blastocyst<-dat[,1:10]
######################################################################
##############################
expression<-matrix(NA,nrow(sample1),ncol(sample1))
####Take Average logfc of multiples of the same gene####
gid<-unique(Gene)
mGene<-matrix(NA,length(gid),ncol(sample1))
for(i in 1:length(gid)){
rid<-which(Gene==gid[i])
for(j in 1:ncol(blastocyst)){
mGene[i,j]<-mean(blastocyst[rid,j])
}
}
mGene[1:10,1:10]
dim(mGene)
t.mGene<-t(mGene)
t.mGene[1:10,1:10]
######################################################################
##############################
######prog scores and ranks for All 15 samples, with patient
duplicates)
###pearson###
age<-c(33,28,37,41,38,37,33,33,28,36)
###spearman###
age.rank<-c(2,1,4,6,5,4,2,2,1,3)
######################################################################
##############################
COR_3<-rep(NA,19992)
COR_4<-rep(NA,19992)
for(i in 1:19992){
COR_3[i]<-cor(age.rank,rank(abs(t.mGene[,i])),method="spearman")
COR_4[i]<-cor(age,(abs(t.mGene[,i])))
}
par(mfrow=c(2,1))
plot(COR_3,ylim=c(-1,1));abline(h=c(0.75,-0.75),lwd=2,col=2,lty=2)
plot(COR_4,ylim=c(-1,1));abline(h=c(0.75,-0.75),lwd=2,col=2,lty=2)
######################################################################
###
######################################################################
###
##########################SPEARMANS
RANK#################################
####Setting a series of different thresholds
THRESH<-seq(0.7,0.8,by=0.01)
par(ask=TRUE)
THRESH
#ranked prognosis scores, aka COR_3####
for(k in 1:10){
thresh<-THRESH[k]
rank.abs.2<-which(abs(COR_3)>thresh)
rank.pos.2<-which(COR_3>thresh)
rank.neg.2<-which(COR_3<(-thresh))
### Creating labels for plot
## Gene[rank.neg.2]
Prog.id<-c("good","bad","good","good","good","bad","good","bad","bad",
"bad")
######
id<-rank.neg.2
### heat map
ht.dat<-as.data.frame(mGene[id,])
names(ht.dat)<-Prog.id
row.names(ht.dat)<-gid[id]
ht.dat<-as.matrix(ht.dat)
heatmap(ht.dat)}
######################################################################
####
##plot specfic thresholds for powerpoint before and after breaks down
### Setting threshold for boundary of interest
############absolute
thresh<-0.5
rank.abs.2<-which(abs(COR_3)>thresh)
id<-rank.abs.2
### heat map
ht.dat<-as.data.frame(mGene[id,])
names(ht.dat)<-Prog.id
row.names(ht.dat)<-gid[id]
ht.dat<-as.matrix(ht.dat)
heatmap(ht.dat)
ht.dat
####identify genes with greater expression than 50 (log 5.6438)
new.mat<-ht.dat
new.mat[(which(abs(new.mat)<5.6438))]<-""
as.numeric(new.mat)
t(new.mat)
## Getting the Genes of interest
GENES<-gid[id];GENES
## boxplot
par(mfrow=c(1,1))
par(las=2)
boxplot(ht.dat)
genes<-gid[id]###lists names of genes on heatmap
genes
length(genes)
##########positive
thresh<-0.76
rank.pos.2<-which(COR_3>thresh)
id<-rank.pos.2
### heat map
ht.dat<-as.data.frame(mGene[id,])
names(ht.dat)<-Prog.id
row.names(ht.dat)<-gid[id]
ht.dat<-as.matrix(ht.dat)
heatmap(ht.dat)
ht.dat
####identify genes with less greater than 2fc (lof1)
new.mat<-ht.dat
new.mat[(which(abs(new.mat)<1))]<-""
as.numeric(new.mat)
t(new.mat)
## Getting the Genes of interest
GENES<-gid[id];GENES
[[alternative HTML version deleted]]