Regarding the third point, the data you have downloaded is preprocessed for hg38.
In R, you need to import the tabe (which is a tab seperated text file), and optionally parse the ids. You can use the following sniplet (which I adapted from lung cancer, may require changes) to do so:
#infer the classes for faster import
temp=read.table("TCGA_PRAD_meth450k_level3_wd250.table", nrows = 5, header=T, sep="\t", stringsAsFactors=F)
classes <- sapply(temp, class)
classes[1]="character" #the chromosomes
#import the file
tcga_beta_450=read.table("TCGA_PRAD_meth450k_level3_wd250.table", nrows=nr,colClasses = classes, header=T, sep="\t", stringsAsFactors=F)
names(tcga_beta_450)[1:3]=c("chr", "start", "end")
#import the metadata and match sample ids
tcga_samples=read.table("nationwidechildrens.org_biospecimen_sample_prad.txt", header=T, sep="\t", stringsAsFactors=F)
tcga_samplenames_450=sub("^.+TCGA","TCGA",names(tcga_beta_450)[-(1:3)])
tcga_samplenames_450=sub("\\.0\\dD.+_Beta_value","",tcga_samplenames_450)
tcga_samplenames_450=gsub("\\.","-",tcga_samplenames_450)
m=match(tcga_samplenames_450,tcga_samples$bcr_sample_barcode)
tcga_beta_450=tcga_beta_450[,c(1:3,which(!is.na(m))+3)]
tcga_samplenames_450=tcga_samplenames_450[!is.na(m)]
m=m[!is.na(m)]
tissue=sub("^.+ ","",tcga_samples$sample_type[m])
type=sub("^jhu.usc.edu_","",names(tcga_beta_450)[-(1:3)])
type=sub(".HumanMethylation.+$","",type)
#make sample names like
#TCGA_PRAD_Tumor-66-2794-01A
names(tcga_beta_450)[-(1:3)]=paste0("TCGA_",type,"_",tissue, "_",sub("TCGA-","",tcga_samplenames_450))
names(tcga_beta_450)=gsub("-","_",names(tcga_beta_450))
#find matched pairs (tumor/normal) and sort table accordingly
tumor=grep("Tumor", names(tcga_beta_450))
normal=grep("Normal", names(tcga_beta_450))
m=match(gsub("_?\\d?\\d?[A-Za-z]+(_|$)","",names(tcga_beta_450[tumor])),gsub("_?\\d?\\d?[A-Za-z]+(_|$)","",names(tcga_beta_450[normal])) )
mTumor=tumor[!is.na(m)]
mNormal=normal[m[!is.na(m)]]
paste(names(tcga_beta_450)[mTumor], names(tcga_beta_450)[mNormal] )
save(tcga_beta_450,file=paste0("TCGA_PRAD_450k_samples_250wd_",date,".RData"))
This can now be used for calibration similar as in the tutorial. However, I encorage you to have a good look at the data and choose the parameters accordingly as it seems reasonable to you. For example, what regions you want to consider as fully methylated:
tcga_reg=GRanges(tcga_beta_450$chr, IRanges(tcga_beta_450$start,tcga_beta_450$end))
# suggestion 1: fully methylated is at least 90% methylation in at least 95% of the samples
fullMeth=which(rowSums(tcga_beta_450[,c(tumor, normal)]<.9)<length(c(tumor, normal))*.05)
# suggestion 2: define fully methylated as average >90% and variance<0.05
tcga_means=rowMeans(tcga_beta_450[,c(tumor, normal)])
tcga_var=apply(tcga_beta_450[,c(tumor, normal)],1,var)
#hist(tcga_var)
fullMeth=which(tcga_means>.9 & tcga_var<0.05)
#match the regions
ol=findOverlaps( tcga_reg[fullMeth],getRegions(prad_QSEAset), select="first")
nna=!is.na(ol)
ol=ol[nna]
signal=matrix(tcga_means[fullMeth[nna]],nrow=length(ol), ncol=nrow(getSampleTable(prad_QSEAset)), byrow=F )
prad_QSEAset=addEnrichmentParameters(prad_QSEAset, pattern_name="CpG",signal=signal,windowIdx=ol,bins=seq(.5,40.5,2))
Dear Matthias,
Thank you for the detailed reply to my queries. It is very helpful. As per you suggestion I have prepared my sample table and it looks like this
I will perform both IC and MedIP samples analysis with window size adjustments if required, and post the results here.
Thank you again
Best,
Rakesh
Looks good, this should work.