Permalink
Cannot retrieve contributors at this time
Name already in use
A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
xgb_survival_network/makeRplots.R
Go to fileThis commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
398 lines (294 sloc)
24.1 KB
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
library(reshape2) | |
library(rjson) | |
# helper methods for boxplots in Figures 1 and 2 | |
load_results <- function(result_path, result_file, pathways, cohorts, n_replication) { | |
results <- list() | |
for(pathway in pathways) { | |
results[[pathway]] <- list() | |
for(cohort in cohorts) { | |
results[[pathway]][[cohort]] <- rep(0,n_replication) | |
for(replication in 1:n_replication) { | |
load(sprintf(result_file, result_path, cohort, pathway, replication)) | |
results[[pathway]][[cohort]][replication] <- result$CI | |
print(paste(result$ntree,result$max_depth,result$eta, result$colsample_bytree, result$gamma, result$min_child_weight, result$lambda)) | |
} | |
} | |
} | |
return(results) | |
} | |
load_json_results <- function(result_path, cohorts, n_replication, result_file="%s/%s/xgb_measure_CI_replication_%d_result.json") { | |
results <- list() | |
for(cohort in cohorts) { | |
results[[cohort]] <- rep(0,n_replication) | |
for(replication in 1:n_replication) { | |
result <- fromJSON(file=sprintf(result_file, result_path, cohort, replication)) | |
results[[cohort]][replication] <- result$CI[[cohort]] | |
} | |
} | |
return(results) | |
} | |
load_pancancer_json_results <- function(result_path, cohorts, n_replication, result_file="%s/xgb_measure_CI_replication_%d_result.json") { | |
results <- list() | |
num_reps = 0 | |
for (replication in 1:n_replication) { | |
if (! file.exists(sprintf(result_file, result_path, replication))) { | |
print(sprintf("WARNING: Results for replication %d are missing.", replication)) | |
next | |
} | |
num_reps = num_reps + 1 | |
result <- fromJSON(file=sprintf(result_file, result_path, replication)) | |
print(paste(result$n_round,result$max_depth,result$learning_rate, result$colsample_bytree, result$gamma, result$min_child_weight, result$lambda)) | |
for (cohort in cohorts) { | |
results[[cohort]][replication] <- result$CI[[cohort]] | |
} | |
} | |
print(sprintf("Read %d replications.", num_reps)) | |
return(results) | |
} | |
# Figure 1A: plot single cohort results against baseline methods | |
library(ggplot2) | |
library(ggpubr) | |
cohorts <- c("TCGA-ACC", "TCGA-BLCA", "TCGA-BRCA", "TCGA-CESC", "TCGA-COAD", | |
"TCGA-ESCA", "TCGA-GBM", "TCGA-HNSC", "TCGA-KIRC", "TCGA-KIRP", | |
"TCGA-LAML", "TCGA-LGG", "TCGA-LIHC", "TCGA-LUAD", "TCGA-LUSC", | |
"TCGA-MESO", "TCGA-OV", "TCGA-PAAD", "TCGA-READ", "TCGA-SARC", | |
"TCGA-SKCM", "TCGA-STAD", "TCGA-UCEC", "TCGA-UCS", "TCGA-UVM") | |
n_replications <- 100 | |
result_path <- "./results_all_cohorts" | |
pathways <- c("none", "hallmark", "pid") | |
random_forest_result_file <- "%s/%s/random_forest_pathway_%s_measure_CI_replication_%d_result.RData" | |
svm_result_file <- "%s/%s/svm_pathway_%s_measure_CI_replication_%d_result.RData" | |
glmkl_result_file <- "%s/%s/glmkl_pathway_%s_measure_CI_replication_%d_result.RData" | |
random_forest_results <- load_results(result_path, random_forest_result_file, pathways[1], cohorts, n_replications) | |
svm_results <- load_results(result_path, svm_result_file, pathways[1], cohorts, n_replications) | |
glmkl_results <- load_results(result_path, glmkl_result_file, pathways[2:length(pathways)], cohorts, n_replications) | |
single_cohort_results <- load_json_results("results_xgb_pancancer_xgb_correct_feature_selection_single_cohort_no_age_gender_results", cohorts, n_replications) | |
result_list <- list("RF"=random_forest_results[["none"]], "SVM"=svm_results[["none"]], "MKL[H]"=glmkl_results[["hallmark"]], "MKL[P]"=glmkl_results[["pid"]], "XGB[SINGLE]"=single_cohort_results) | |
result_df <- melt(result_list) | |
colnames(result_df) <- c("CI", "cohort", "method") | |
pdf("model_rf_svm_glmk_xgb_single_cohort_correct_feature_selection_100_replications_all_25_cohorts_mad_important_features_500_no_age_gender_missing_age_gender_removed_sig_level_high_wider.pdf", width=10, height=10) | |
method_comparisons <- list( c("SVM", "XGB[SINGLE]"), c("RF", "XGB[SINGLE]"), c("MKL[P]", "XGB[SINGLE]"), c("MKL[H]", "XGB[SINGLE]") ) | |
p <- ggplot(data = result_df, aes(x=method, y=CI)) + geom_boxplot(aes(fill=method)) + labs(y = "C-Index", fill = "Method") + scale_fill_manual(values=c("#F8766D", "#B79F00", "#00BA38", "#00BFC4", "#F564E3")) | |
p + facet_wrap( ~ cohort, ncol=5, scales="free",'strip.position' = 'bottom') + geom_hline(yintercept=0.5, linetype="dashed", color = "green") + theme(axis.text.x=element_blank(), axis.ticks.x=element_blank(), axis.title.x=element_blank()) + stat_compare_means(comparisons = method_comparisons, method = 'wilcox.test', level="p.signif", symnum.args=list(cutpoints = c(0, 0.0001, 0.001, 0.01, 0.05, 1), symbols = c("****", "***", "**", "*", "ns")), step.increase=0.2) + scale_y_continuous(expand = expansion(mult = c(0.05, 0.1))) | |
dev.off() | |
### Figure 1B: plot correlations between the different survival prediction methods | |
library(rjson) | |
cohort_a <- "TCGA-UVM" | |
replication_a <- 3 | |
cohort_b <- "TCGA-BLCA" | |
replication_b <- 3 | |
methods <- c("glmkl_pathway_hallmark", "glmkl_pathway_pid", "random_forest_pathway_none", "svm_pathway_none", "xgb_single_cohort") | |
method_survivals_a <- list() | |
load(sprintf("results_all_cohorts/%s/%s_measure_CI_replication_%d_result.RData", cohort_a, methods[1], replication_a)) | |
method_survivals_a[["MKL[H]"]] <- cbind("y_true"=result$y_test, "y_pred"=result$y_predicted[,1]) | |
load(sprintf("results_all_cohorts/%s/%s_measure_CI_replication_%d_result.RData", cohort_a, methods[2], replication_a)) | |
method_survivals_a[["MKL[P]"]] <- cbind("y_true"=result$y_test, "y_pred"=result$y_predicted[,1]) | |
load(sprintf("results_all_cohorts/%s/%s_measure_CI_replication_%d_result.RData", cohort_a, methods[3], replication_a)) | |
method_survivals_a[["RF"]] <- cbind("y_true"=result$y_test, "y_pred"=result$y_predicted) | |
load(sprintf("results_all_cohorts/%s/%s_measure_CI_replication_%d_result.RData", cohort_a, methods[4], replication_a)) | |
method_survivals_a[["SVM"]] <- cbind("y_true"=result$y_test, "y_pred"=as.vector(result$y_predicted[,1])) | |
result <- fromJSON(file = sprintf("xgb_single_cohort_fixed_train_test_split_results/%s/xgb_measure_CI_result_replication_%d.json", cohort_a, replication_a)) | |
method_survivals_a[["XGB[SINGLE]"]] <- cbind("y_true"=result$y_test[[cohort_a]], "y_pred"= result$y_predicted[[cohort_a]]) | |
result <- fromJSON(file = sprintf("xgb_pancancer_fixed_train_test_split_results/xgb_measure_CI_result_replication_%d.json", replication_a)) | |
method_survivals_a[["XGB[PAN]"]] <- cbind("y_true"=result$y_test[[cohort_a]], "y_pred"= result$y_predicted[[cohort_a]]) | |
method_survivals_b <- list() | |
load(sprintf("results_all_cohorts/%s/%s_measure_CI_replication_%d_result.RData", cohort_b, methods[1], replication_b)) | |
method_survivals_b[["MKL[H]"]] <- cbind("y_true"=result$y_test, "y_pred"=result$y_predicted[,1]) | |
load(sprintf("results_all_cohorts/%s/%s_measure_CI_replication_%d_result.RData", cohort_b, methods[2], replication_b)) | |
method_survivals_b[["MKL[P]"]] <- cbind("y_true"=result$y_test, "y_pred"=result$y_predicted[,1]) | |
load(sprintf("results_all_cohorts/%s/%s_measure_CI_replication_%d_result.RData", cohort_b, methods[3], replication_b)) | |
method_survivals_b[["RF"]] <- cbind("y_true"=result$y_test, "y_pred"=result$y_predicted) | |
load(sprintf("results_all_cohorts/%s/%s_measure_CI_replication_%d_result.RData", cohort_b, methods[4], replication_b)) | |
method_survivals_b[["SVM"]] <- cbind("y_true"=result$y_test, "y_pred"=as.vector(result$y_predicted[,1])) | |
result <- fromJSON(file = sprintf("xgb_single_cohort_fixed_train_test_split_results/%s/xgb_measure_CI_result_replication_%d.json", cohort_b, replication_b)) | |
method_survivals_b[["XGB[SINGLE]"]] <- cbind("y_true"=result$y_test[[cohort_b]], "y_pred"= result$y_predicted[[cohort_b]]) | |
result <- fromJSON(file = sprintf("xgb_pancancer_fixed_train_test_split_results/xgb_measure_CI_result_replication_%d.json", replication_b)) | |
method_survivals_b[["XGB[PAN]"]] <- cbind("y_true"=result$y_test[[cohort_b]], "y_pred"= result$y_predicted[[cohort_b]]) | |
# check whether all methods where tested on the same patients | |
y_true_matrix_a <- do.call(cbind, lapply(method_survivals_a, function(x) x[,"y_true"])) | |
all_y_equal_a <- all(apply(y_true_matrix_a, 2, identical, y_true_matrix_a[,1])) | |
if (! all_y_equal_a) { | |
print("WARNING: Not all methods used the same test patients for survival prediction") | |
} | |
y_true_matrix_b <- do.call(cbind, lapply(method_survivals_b, function(x) x[,"y_true"])) | |
all_y_equal_b <- all(apply(y_true_matrix_b, 2, identical, y_true_matrix_b[,1])) | |
if (! all_y_equal_b) { | |
print("WARNING: Not all methods used the same test patients for survival prediction") | |
} | |
# compute correlations between predictions of the different methods | |
# ATTENTION: RF and XGB compute risk scores, while GLMKL and SVM compute survival times, so correlations may be negative | |
y_true_matrix_a <- do.call(cbind, lapply(method_survivals_a, function(x) x[,"y_pred"])) | |
method_correlations_a.spearman <- cor(y_true_matrix_a, method='spearman') | |
y_true_matrix_b <- do.call(cbind, lapply(method_survivals_b, function(x) x[,"y_pred"])) | |
method_correlations_b.spearman <- cor(y_true_matrix_b, method='spearman') | |
# visualize correlation matrix | |
mm = 1/25.4 | |
library(corrplot) | |
pdf(sprintf("./survival_prediction/method_prediction_correlations/method_prediction_spearman_correlations_%s_replication_%d_narrow_with_pancancer.pdf", cohort_a, replication_a), width=44*mm, height=44*mm, pointsize=6) | |
corrplot(method_correlations_a.spearman, method="circle", tl.col="black", mar=c(0,0,0,2), cl.ratio = 0.5) | |
#mtext(bquote(bold(.(cohort_a))), at=2.5, line=-19.0, cex=1.2) | |
mtext(bquote(bold(.(cohort_a))), at=75*mm, line=-280.0*mm, cex=1) | |
dev.off() | |
pdf(sprintf("./survival_prediction/method_prediction_correlations/method_prediction_spearman_correlations_%s_replication_%d_narrow_with_pancancer.pdf", cohort_b, replication_b), width=44*mm, height=44*mm, pointsize=6) | |
corrplot(method_correlations_b.spearman, method="circle", tl.col="black", mar=c(0,0,0,2), cl.ratio = 0.5) | |
#mtext(bquote(bold(.(cohort_b))), at=2.5, line=-19.0, cex=1.2) | |
mtext(bquote(bold(.(cohort_b))), at=75*mm, line=-280.0*mm, cex=1) | |
dev.off() | |
### Figure 1C: plot cohort median ages against median CI of XGBoost single-cohort method | |
library(rjson) | |
load_json_results <- function(result_path, cohorts, n_replication, result_file="%s/%s/xgb_measure_CI_replication_%d_result.json") { | |
results <- list() | |
for(cohort in cohorts) { | |
results[[cohort]] <- rep(0,n_replication) | |
for(replication in 1:n_replication) { | |
result <- fromJSON(file=sprintf(result_file, result_path, cohort, replication)) | |
results[[cohort]][replication] <- result$CI[[cohort]] | |
} | |
} | |
return(results) | |
} | |
cohorts <- c("TCGA-ACC", "TCGA-BLCA", "TCGA-BRCA", "TCGA-CESC", "TCGA-COAD", | |
"TCGA-ESCA", "TCGA-GBM", "TCGA-HNSC", "TCGA-KIRC", "TCGA-KIRP", | |
"TCGA-LAML", "TCGA-LGG", "TCGA-LIHC", "TCGA-LUAD", "TCGA-LUSC", | |
"TCGA-MESO", "TCGA-OV", "TCGA-PAAD", "TCGA-READ", "TCGA-SARC", | |
"TCGA-SKCM", "TCGA-STAD", "TCGA-UCEC", "TCGA-UCS", "TCGA-UVM") | |
n_replications <- 100 | |
single_cohort_results <- load_json_results("results_xgb_pancancer_xgb_correct_feature_selection_single_cohort_no_age_gender_results", cohorts, n_replications) | |
single_cohort_median_ci <- unlist(lapply(single_cohort_results, median)) | |
ages_json <- fromJSON(file="./survival_prediction/TCGA_ages_at_index.txt") | |
ages <- lapply(ages_json, unlist) | |
median_age <- unlist(lapply(ages, median)) | |
# make correlation plot | |
library("ggpubr") | |
mm = 1/25.4 | |
single_cohort_ci_age_df <- data.frame(cohort=cohorts, ci=single_cohort_median_ci, age=median_age[cohorts]) | |
pdf("./survival_prediction/TCGA_single_cohort_CI_age_spearman_correlation_cohort_labels.pdf", width=87*mm, height=60*mm) | |
ggscatter(single_cohort_ci_age_df, x = "ci", y = "age", | |
add = "reg.line", conf.int = TRUE, | |
add.params = list(color = "blue", fill = "lightgray"), # Customize reg. line | |
cor.coef = TRUE, cor.method = "spearman", | |
cor.coeff.args = list(method = "spearman", label.x = 0.48, label.y = 45, label.sep = "\n"), cor.coef.size=2.8, | |
label = "cohort", repel = TRUE, font.label = c(6, "plain"), | |
xlab = "Median C-Index", ylab = "Median age") + font("xlab", size = 8) + font("ylab", size = 8) + font("axis.text", size = 6) | |
dev.off() | |
### Figure 2A: plot frequency of genes selected in the XGBoost training of different numbers of cohorts | |
library(ggplot2) | |
# single-cohort | |
feature_importance_table <- read.delim("./survival_prediction/result_mean_absolute_deviation_feature_importance_single_cohort_100_replications_sums.csv", sep=',', header=TRUE, row.names=1, stringsAsFactors=FALSE) | |
feature_importance_counts <- rowSums(!is.na(feature_importance_table[,2:ncol(feature_importance_table)])) | |
feature_importance_count_df <- data.frame("gene"=names(feature_importance_counts), "num_cohorts"=feature_importance_counts) | |
mm = 1/25.4 | |
pdf("./survival_prediction/feature_importance_single_cohort_mean_absolute_deviation_genes_fraction_per_cohort_histogram_wide.pdf", width=174*mm, height=50*mm) | |
ggplot(feature_importance_count_df, aes(x=num_cohorts)) + geom_histogram(aes(y=..count../sum(..count..)), binwidth=1.0, color="black", fill="#E76BF3") + ylab("Fraction of genes") + xlab("Number of cohorts") + scale_x_continuous(breaks=1:25, limits = c(0.5,25.5)) + scale_y_continuous(labels = scales::percent) + theme(axis.title.x = element_text(size=8), axis.text.x = element_text(size=6), axis.title.y=element_text(size=8), axis.text.y = element_text(size=6)) | |
dev.off() | |
### Figure 2B: compare pan-cancer results with single-cohort results | |
library(ggplot2) | |
library(ggpubr) | |
cohorts <- c("TCGA-ACC", "TCGA-BLCA", "TCGA-BRCA", "TCGA-CESC", "TCGA-COAD", | |
"TCGA-ESCA", "TCGA-GBM", "TCGA-HNSC", "TCGA-KIRC", "TCGA-KIRP", | |
"TCGA-LAML", "TCGA-LGG", "TCGA-LIHC", "TCGA-LUAD", "TCGA-LUSC", | |
"TCGA-MESO", "TCGA-OV", "TCGA-PAAD", "TCGA-READ", "TCGA-SARC", | |
"TCGA-SKCM", "TCGA-STAD", "TCGA-UCEC", "TCGA-UCS", "TCGA-UVM") | |
n_replications <- 100 | |
single_cohort_results <- load_json_results("results_xgb_pancancer_xgb_correct_feature_selection_single_cohort_no_age_gender_results", cohorts, n_replications) | |
pancancer_results <- load_pancancer_json_results("results_xgb_pancancer_xgb_correct_feature_selection_pancancer_no_age_gender_2_results", cohorts, n_replications) | |
result_list <- list("XGB[SINGLE]"=single_cohort_results, "XGB[PAN]"=pancancer_results) | |
result_df <- melt(result_list) | |
colnames(result_df) <- c("CI", "cohort", "method") | |
pdf("model_xgb_pancancer_single_cohort_correct_feature_selection_no_age_gender_comparison_100_replications_high_wide.pdf", width=10, height=9) | |
method_comparisons <- list( c("XGB[SINGLE]", "XGB[PAN]")) | |
p <- ggplot(data = result_df, aes(x=method, y=CI)) + geom_boxplot(aes(fill=method)) + labs(y = "C-Index", fill = "Method") + scale_fill_manual(values=c("#619CFF", "#E76BF3")) | |
p + facet_wrap( ~ cohort, ncol=5, scales="free",'strip.position' = 'bottom') + geom_hline(yintercept=0.5, linetype="dashed", color = "green") + theme(axis.text.x=element_blank(), axis.ticks.x=element_blank(), axis.title.x=element_blank()) + stat_compare_means(comparisons = method_comparisons, method = 'wilcox.test', level="p.signif", symnum.args=list(cutpoints = c(0, 0.0001, 0.001, 0.01, 0.05, 1), symbols = c("****", "***", "**", "*", "ns")), step.increase=0.2) + scale_y_continuous(expand = expansion(mult = c(0.05, 0.1))) | |
dev.off() | |
### Figure 2D: plot C-Indices for the 8 new cohorts | |
library(rjson) | |
library(reshape2) | |
library(ggplot2) | |
cohorts <- c("TCGA-ACC", "TCGA-BLCA", "TCGA-BRCA", "TCGA-CESC", "TCGA-COAD", | |
"TCGA-ESCA", "TCGA-GBM", "TCGA-HNSC", "TCGA-KIRC", "TCGA-KIRP", | |
"TCGA-LAML", "TCGA-LGG", "TCGA-LIHC", "TCGA-LUAD", "TCGA-LUSC", | |
"TCGA-MESO", "TCGA-OV", "TCGA-PAAD", "TCGA-READ", "TCGA-SARC", | |
"TCGA-SKCM", "TCGA-STAD", "TCGA-UCEC", "TCGA-UCS", "TCGA-UVM") | |
pancancer_json <- fromJSON(file = "xgb_pancancer_new_cohorts_all_cohort_training_results/xgb_measure_CI_new_cohorts_result.json") | |
pancancer_cis <- unlist(pancancer_json$CI) | |
single_cohort_ci_df <- c() | |
for (cohort in cohorts) { | |
cohort_json <- fromJSON(file = sprintf("xgb_single_cohort_new_cohorts_all_cohort_training_results/%s/xgb_measure_CI_result.json", cohort)) | |
cohort_cis <- unlist(cohort_json$CI) | |
single_cohort_ci_df <- cbind(single_cohort_ci_df, cohort_cis) | |
} | |
single_cohort_ci_means <- apply(single_cohort_ci_df, 1, mean) | |
cis <- data.frame("Cohorts"=names(pancancer_cis), "Pan-cancer"=pancancer_cis, "Single-cohort (mean)"=single_cohort_ci_means, check.names=FALSE) | |
cis$Cohorts <- factor(cis$Cohorts, levels = cis$Cohorts) | |
cis.melted <- melt(cis, id.vars='Cohorts') | |
cohort_sizes <- c("36 (18)","47 (9)","64 (9)","178 (6)","493 (10)","134 (4)","501 (16)","118 (9)") | |
mm = 1/25.4 | |
pdf("./survival_prediction/new_cohorts_CIs_pancancer_vs_single_cohort_means_with_cohort_sizes_narrow.pdf", width=108*mm, height=66*mm) | |
ggplot(cis.melted, aes(Cohorts, value)) + geom_bar(aes(fill = variable), position = "dodge", stat="identity") + ylab("C-Index") + theme(legend.position="top", legend.title = element_blank(), axis.text.x = element_text(angle = 45, hjust=1, size=6), legend.text = element_text(size=8), axis.title.x = element_text(size=8), axis.title.y=element_text(size=8), axis.text.y = element_text(size=6)) + geom_text(aes(label = ifelse(variable=="Pan-cancer", cohort_sizes, " ")), nudge_y = 0.05 + pmax(0, cis[,"Single-cohort (mean)"] - cis[,"Pan-cancer"]), size=6/(14/5)) + guides(fill = guide_legend(keywidth = unit(0.3, "cm"), keyheight = unit(0.3, "cm"))) + scale_fill_manual(values=c("#619CFF", "#E76BF3")) | |
dev.off() | |
### Figure 3A: plot feature importance sums of top 100 genes in pan-cancer XGBoost approach | |
library(ggplot2) | |
pancancer_feature_csv <- read.table("./survival_prediction/result_feature_importance_pancancer_100_replications_sums_with_gene_types.tab", sep='\t', header=FALSE, na.strings='', stringsAsFactors=FALSE) | |
colnames(pancancer_feature_csv) <- c("Gene", "Feature_importance", "Gene_type") | |
pancancer_feature_csv$Gene_type[is.na(pancancer_feature_csv$Gene_type)] <- 'NAN' | |
pancancer_feature_table <- pancancer_feature_csv[1:100,] | |
color_map <- data.frame('protein_coding'='#1f77b4', 'lncRNA'='#ff7f0e', 'processed_pseudogene'='#2ca02c', 'transcribed_unprocessed_pseudogene'='#9467bd', 'NAN'='#d62728') | |
gene_type_colors <- as.vector(unlist(sapply(pancancer_feature_table$Gene_type, function(x) color_map[x]))) | |
pancancer_feature_table$Gene <- factor(pancancer_feature_table$Gene, levels = pancancer_feature_table$Gene) | |
pancancer_feature_table$Gene_type <- factor(pancancer_feature_table$Gene_type, levels = names(color_map)) | |
pdf("./survival_prediction/xgb_pancancer_feature_importance_scores_annotated_gene_types_wide.pdf", width=6.85, height=3.19) | |
ggplot(pancancer_feature_table, aes(Gene, Feature_importance)) + | |
geom_bar(aes(fill=Gene_type), color="black", size=0.4, position = "dodge", stat="identity") + | |
ylab("Sum of feature importances") + | |
theme(legend.position="top", legend.title = element_blank(), legend.text = element_text(size=8), axis.text.x = element_text(angle = 90, hjust=1, size = 5), axis.title.x = element_text(size=8), axis.title.y=element_text(size=8), axis.text.y = element_text(size=6)) + | |
scale_fill_manual(label = c('Protein coding', 'lncRNA', 'Processed pseudogene', 'Transcribed unprocessed pseudogene', 'Unknown'), values = c('#1f77b4', '#ff7f0e', '#2ca02c', '#9467bd', '#d62728'), guide = guide_legend(keywidth = 0.5, keyheight = 0.5)) | |
dev.off() | |
### Figure 3B: plot feature entropies of single-cohort and pan-cancer features | |
library(ggplot2) | |
library(reshape2) | |
pancancer_entropies <- read.table("./survival_prediction/pancancer_feature_entropies.tab", sep='\t', header=FALSE, check.names=FALSE, stringsAsFactors=FALSE, row.names=1) | |
single_cohort_entropies <- read.table("./survival_prediction/single_cohort_feature_entropies.tab", sep='\t', header=FALSE, check.names=FALSE, stringsAsFactors=FALSE, row.names=1) | |
num_features <- 100 | |
library(plyr) | |
# perform one-sided Wilcoxon Rank Sum test | |
wt <- wilcox.test(single_cohort_entropies[1:num_features,1], pancancer_entropies[1:num_features,1], paired=FALSE, alternative="less") | |
print(wt$p.value) | |
entropy_df.original_order <- data.frame("entropy"=c(pancancer_entropies[1:num_features,1], single_cohort_entropies[1:num_features,1]), "method"=c(rep("XGB[PAN]", num_features), rep("XGB[SINGLE]", num_features))) | |
# calculate means | |
mu <- ddply(entropy_df.original_order, "method", summarise, grp.mean=mean(entropy, na.rm=TRUE)) | |
pdf("./survival_prediction/densities_pancancer_single_cohort_entropies_original_feature_importance_order_top_100_wide.pdf", width=6.85, height=2.28) | |
ggplot(entropy_df.original_order, aes(x=entropy)) + geom_density(aes(fill=method), alpha=0.8) + theme(legend.title = element_blank(), legend.text = element_text(size=8), axis.title.x = element_text(size=8), axis.text.x = element_text(size=6), axis.title.y=element_text(size=8), axis.text.y = element_text(size=6)) + xlab("Entropy in XGBoost single-cohort approach") + ylab("Density") + geom_vline(data=mu, aes(xintercept=grp.mean, color=method), linetype="dashed") + scale_fill_manual(values=c("#619CFF", "#E76BF3")) + scale_color_manual(values=c("#0041B0", "#940DA1")) | |
dev.off() | |
### Figure SI3: plot feature importance sums of all important genes in pan-cancer XGBoost approach | |
library(ggplot2) | |
pancancer_feature_table <- read.table("./survival_prediction/result_feature_importance_pancancer_100_replications_sums.csv", sep='\t', header=FALSE, na.strings='', stringsAsFactors=FALSE) | |
colnames(pancancer_feature_table) <- c("Gene", "Feature_importance") | |
pancancer_feature_table$Gene <- factor(pancancer_feature_table$Gene, levels = pancancer_feature_table$Gene) | |
pdf("./survival_prediction/xgb_pancancer_feature_importance_scores_all_genes.pdf", width=6.85, height=3.19) | |
ggplot(pancancer_feature_table, aes(Gene, Feature_importance)) + | |
geom_bar(fill="#619CFF", color="#619CFF", size=0.4, position = "dodge", stat="identity") + | |
ylab("Sum of feature importances") + | |
theme(axis.text.x = element_blank(), axis.ticks.x = element_blank(), axis.title.x = element_text(size=8), axis.title.y=element_text(size=8), axis.text.y = element_text(size=6)) | |
dev.off() | |
### Figure SI5: plot pan-cancer XGBoost results for patient subsets of different sizes | |
library(ggplot2) | |
library(ggpubr) | |
cohorts <- c("TCGA-ACC", "TCGA-BLCA", "TCGA-BRCA", "TCGA-CESC", "TCGA-COAD", | |
"TCGA-ESCA", "TCGA-GBM", "TCGA-HNSC", "TCGA-KIRC", "TCGA-KIRP", | |
"TCGA-LAML", "TCGA-LGG", "TCGA-LIHC", "TCGA-LUAD", "TCGA-LUSC", | |
"TCGA-MESO", "TCGA-OV", "TCGA-PAAD", "TCGA-READ", "TCGA-SARC", | |
"TCGA-SKCM", "TCGA-STAD", "TCGA-UCEC", "TCGA-UCS", "TCGA-UVM") | |
n_replications <- 100 | |
pancancer_results_all <- load_pancancer_json_results("results_xgb_pancancer_xgb_correct_feature_selection_pancancer_no_age_gender_2_results", cohorts, n_replications) | |
pancancer_results_1000 <- load_pancancer_json_results("xgb_pancancer_subsampling/xgb_pancancer_subsampling_1000_samples_results", cohorts, n_replications) | |
pancancer_results_500 <- load_pancancer_json_results("xgb_pancancer_subsampling/xgb_pancancer_subsampling_500_samples_results", cohorts, n_replications) | |
pancancer_results_50 <- load_pancancer_json_results("xgb_pancancer_subsampling/xgb_pancancer_subsampling_50_samples_results", cohorts, n_replications) | |
result_list <- list("all"=pancancer_results_all, "1000"=pancancer_results_1000, "500"=pancancer_results_500, "50"=pancancer_results_50) | |
result_df <- melt(result_list) | |
colnames(result_df) <- c("CI", "cohort", "num_patients") | |
result_df$num_patients <- factor(result_df$num_patients, levels = unique(result_df$num_patients)) | |
pdf("model_xgb_pancancer_all_patients_patient_subsets_comparison_100_replications_high_wide.pdf", width=10, height=9) | |
num_patients_comparisons <- list( c("all", "1000"), c("all", "500"), c("all", "50")) | |
p <- ggplot(data = result_df, aes(x=num_patients, y=CI)) + geom_boxplot(aes(fill=num_patients)) + labs(y = "C-Index", fill = "Patients") + scale_fill_manual(values=c("#619CFF", "#F8766D", "#7CAE00", "#C77CFF")) | |
p + facet_wrap( ~ cohort, ncol=5, scales="free",'strip.position' = 'bottom') + geom_hline(yintercept=0.5, linetype="dashed", color = "green") + theme(axis.text.x=element_blank(), axis.ticks.x=element_blank(), axis.title.x=element_blank()) + stat_compare_means(comparisons = num_patients_comparisons, method = 'wilcox.test', level="p.signif", symnum.args=list(cutpoints = c(0, 0.0001, 0.001, 0.01, 0.05, 1), symbols = c("****", "***", "**", "*", "ns")), step.increase=0.2) + scale_y_continuous(expand = expansion(mult = c(0.05, 0.1))) | |
dev.off() |