Hi, I am trying to subset 19 genes from the TCGA raw count table and convert the raw counts to TPM and get the median value of the genes across samples within the same cancer type and plot in a heatmap.
The results do not look right to me. e.g., MLSN should be high in MESO, FOLH1 (prostate-specific tumor antigen) should be high in PRAD
The code.
TAAs<- c("MSLN", "EGFR", "ERBB2", "CEACAM5", "NECTIN4", "EPCAM", "MUC16", "MUC1", "CD276",
"FOLH1", "DLL3", "VTCN1", "PROM1", "PVR", "CLDN6", "MET", "FOLR1", "TNFRSF10B", "TACSTD2")
TCGAExprsAndPheno2 <- function(gene_vector, cancer_types = "all", expression_type = "TPM"){
library(recount3)
## Get all possible projects
all_proj <- available_projects()
tcga_proj <- all_proj %>% filter(file_source == "tcga")
if (cancer_types != "all") {
tcga_proj <- tcga_proj[tcga_proj$project %in% cancer_types,]
}
final_tcga <- NULL
for (tcga_ind in seq(nrow(tcga_proj))) {
proj_info <- tcga_proj[tcga_ind, c("project", "organism", "project_home")]
rse_gene_temp<- create_rse(proj_info)
count_matrix <- rse_gene_temp@assays@data$raw_counts
## Select gene ind
if (!identical("all", gene_vector)) {
gene_ind <- rse_gene_temp@rowRanges$gene_name %in% gene_vector
} else {
gene_ind <- seq(rse_gene_temp@rowRanges$gene_name)
}
if (expression_type == "TPM") {
#### Creating TPM
kilobase_length <- rse_gene_temp@rowRanges$bp_length
reads_per_rpk <- count_matrix/kilobase_length
per_mil_scale <- colSums(reads_per_rpk)/1000000
tpm_matrix <- t(t(reads_per_rpk)/per_mil_scale)
## Make sure they match the ENSG and gene order
exprs_vector <- tpm_matrix[gene_ind,]
} else{
#log2 normalize the count matrix
total_reads<- colSums(count_matrix)
normalized_counts<- log2(t(t(count_matrix)*10000000/total_reads) + 1)
exprs_vector <- normalized_counts[gene_ind,]
}
tcga_data_frame <- rse_gene_temp@colData@listData |>
as.data.frame()
if (nrow(exprs_vector) > length(gene_vector)) {
ind_vector <- rownames(exprs_vector)
} else {
ind_vector <- rse_gene_temp@rowRanges[gene_ind, ]$gene_name
}
tcga_data_frame[,ind_vector] <- exprs_vector
tcga_data_frame <- select(tcga_data_frame, !!ind_vector, everything())
final_tcga <- rbind(final_tcga, tcga_data_frame)
}
## Group cancer data
final_tcga_filt <- final_tcga %>%
mutate(sample_type = case_when(
tcga.cgc_sample_sample_type == "Additional - New Primary" ~ "cancer",
tcga.cgc_sample_sample_type == "Additional Metastatic" ~ "metastatic",
tcga.cgc_sample_sample_type == "Metastatic" ~ "metastatic",
tcga.cgc_sample_sample_type == "Primary Blood Derived Cancer - Peripheral Blood " ~ "cancer",
tcga.cgc_sample_sample_type == "Primary Tumor" ~ "cancer",
tcga.cgc_sample_sample_type == "Recurrent Tumor" ~ "cancer",
tcga.cgc_sample_sample_type == "Solid Tissue Normal" ~ "normal"
))
if (startsWith(colnames(final_tcga_filt)[1], "ENSG")) {
message("Multiple columns share gene names, returning ENSG")
}
return(final_tcga_filt)
}
tcga_TAA <- TCGAExprsAndPheno2(gene_vector = c(TAAs, "CD24"),cancer_type="all")
tcga_df<- tcga_TAA %>%
select(any_of(TAAs), CD24, tcga.tcga_barcode, sample_type, study) %>%
group_by(sample_type, study) %>%
summarise(across(1:20, ~log2(.x+1))) %>%
summarise(across(1:20, median)) %>%
arrange(study) %>%
filter(!is.na(sample_type))
tcga_df<- tcga_df %>%
filter(sample_type == "cancer")
tcga_mat<- tcga_df[, -c(1,2)] %>% as.matrix()
rownames(tcga_mat)<- tcga_df %>% pull(study)
cell_fun = function(j, i, x, y, w, h, fill) {
grid.rect(x = x, y = y, width = w, height = h,
gp = gpar(col = "black", fill = NA))
}
Heatmap(tcga_mat, cluster_columns = TRUE, cell_fun = cell_fun,
name = "log2TPM")
I used a different count table from ExperimentHub and calculated the TPM, someone else uses the RSEM data from cbioportal and the results look right to me. I am wondering if there is any bug in my script for recount3 or if there is a data problem there.
The RSEM results
The gist can be found at https://gist.github.com/crazyhottommy/042aa268bbc8c6a4067545600da10d24
Thanks a lot for your time.
Best, Tommy
something was wrong with the original code. I rewrite the code below and now the results make sense..
```{r}
BiocManager::install("recount3")
library(recount3) library(purrr) human_projects <- available_projects()
tcga_info = subset( human_projects, file_source == "tcga" & project_type == "data_sources" )
seq(nrow(tcga_info))
proj_info <- map(seq(nrow(tcga_info)), ~tcga_info[.x, ])
rse_tcga <- map(proj_info, ~create_rse(.x))
You can also scale the expression for each gene across the cancer types