makeRplots.R

library(reshape2)
library(rjson)

# helper methods for boxplots in Figures 1 and 2
load_results <- function(result_path, result_file, pathways, cohorts, n_replication) {
	results <- list()
	for(pathway in pathways) {
		results[[pathway]] <- list()
		for(cohort in cohorts) {
			results[[pathway]][[cohort]] <- rep(0,n_replication)
			for(replication in 1:n_replication) {
				load(sprintf(result_file, result_path, cohort, pathway, replication))
				results[[pathway]][[cohort]][replication] <- result$CI
				print(paste(result$ntree,result$max_depth,result$eta, result$colsample_bytree, result$gamma, result$min_child_weight, result$lambda))
			}
		}
	}
	return(results)
}

load_json_results <- function(result_path, cohorts, n_replication, result_file="%s/%s/xgb_measure_CI_replication_%d_result.json") {
	results <- list()
	for(cohort in cohorts) {
		results[[cohort]] <- rep(0,n_replication)
		for(replication in 1:n_replication) {
			result <- fromJSON(file=sprintf(result_file, result_path, cohort, replication))
			results[[cohort]][replication] <- result$CI[[cohort]]
		}
	}
	return(results)
}

load_pancancer_json_results <- function(result_path, cohorts, n_replication, result_file="%s/xgb_measure_CI_replication_%d_result.json") {
	results <- list()
    num_reps = 0
	for (replication in 1:n_replication) {
        if (! file.exists(sprintf(result_file, result_path, replication))) {
            print(sprintf("WARNING: Results for replication %d are missing.", replication))
            next
        }
        num_reps = num_reps + 1
		result <- fromJSON(file=sprintf(result_file, result_path, replication))
		print(paste(result$n_round,result$max_depth,result$learning_rate, result$colsample_bytree, result$gamma, result$min_child_weight, result$lambda))
		for (cohort in cohorts) {
			results[[cohort]][replication] <- result$CI[[cohort]]
		}
	}
    print(sprintf("Read %d replications.", num_reps))
	return(results)
}

# Figure 1A: plot single cohort results against baseline methods 
library(ggplot2)
library(ggpubr)

cohorts <- c("TCGA-ACC", "TCGA-BLCA", "TCGA-BRCA", "TCGA-CESC", "TCGA-COAD", 
            "TCGA-ESCA", "TCGA-GBM", "TCGA-HNSC", "TCGA-KIRC", "TCGA-KIRP", 
            "TCGA-LAML", "TCGA-LGG", "TCGA-LIHC", "TCGA-LUAD", "TCGA-LUSC", 
            "TCGA-MESO", "TCGA-OV", "TCGA-PAAD", "TCGA-READ", "TCGA-SARC", 
            "TCGA-SKCM", "TCGA-STAD", "TCGA-UCEC", "TCGA-UCS", "TCGA-UVM")

n_replications <- 100
result_path <- "./results_all_cohorts"
pathways <- c("none", "hallmark", "pid")

random_forest_result_file <- "%s/%s/random_forest_pathway_%s_measure_CI_replication_%d_result.RData"
svm_result_file <- "%s/%s/svm_pathway_%s_measure_CI_replication_%d_result.RData"
glmkl_result_file <- "%s/%s/glmkl_pathway_%s_measure_CI_replication_%d_result.RData" 

random_forest_results <- load_results(result_path, random_forest_result_file, pathways[1], cohorts, n_replications)
svm_results <- load_results(result_path, svm_result_file, pathways[1], cohorts, n_replications)
glmkl_results <- load_results(result_path, glmkl_result_file, pathways[2:length(pathways)], cohorts, n_replications)

single_cohort_results <- load_json_results("results_xgb_pancancer_xgb_correct_feature_selection_single_cohort_no_age_gender_results", cohorts, n_replications)

result_list <- list("RF"=random_forest_results[["none"]], "SVM"=svm_results[["none"]], "MKL[H]"=glmkl_results[["hallmark"]], "MKL[P]"=glmkl_results[["pid"]], "XGB[SINGLE]"=single_cohort_results)
result_df <- melt(result_list)
colnames(result_df) <- c("CI", "cohort", "method")

pdf("model_rf_svm_glmk_xgb_single_cohort_correct_feature_selection_100_replications_all_25_cohorts_mad_important_features_500_no_age_gender_missing_age_gender_removed_sig_level_high_wider.pdf", width=10, height=10)
method_comparisons <- list( c("SVM", "XGB[SINGLE]"), c("RF", "XGB[SINGLE]"), c("MKL[P]", "XGB[SINGLE]"), c("MKL[H]", "XGB[SINGLE]") )
p <- ggplot(data = result_df, aes(x=method, y=CI)) + geom_boxplot(aes(fill=method)) + labs(y = "C-Index", fill = "Method") + scale_fill_manual(values=c("#F8766D", "#B79F00", "#00BA38", "#00BFC4", "#F564E3"))
p + facet_wrap( ~ cohort, ncol=5, scales="free",'strip.position' = 'bottom') + geom_hline(yintercept=0.5, linetype="dashed", color = "green") + theme(axis.text.x=element_blank(), axis.ticks.x=element_blank(), axis.title.x=element_blank()) + stat_compare_means(comparisons = method_comparisons, method = 'wilcox.test', level="p.signif", symnum.args=list(cutpoints = c(0, 0.0001, 0.001, 0.01, 0.05, 1), symbols = c("****", "***", "**", "*", "ns")),  step.increase=0.2) + scale_y_continuous(expand = expansion(mult = c(0.05, 0.1)))
dev.off()


### Figure 1B: plot correlations between the different survival prediction methods
library(rjson)

cohort_a <- "TCGA-UVM"
replication_a <- 3

cohort_b <- "TCGA-BLCA"
replication_b <- 3

methods <- c("glmkl_pathway_hallmark", "glmkl_pathway_pid", "random_forest_pathway_none", "svm_pathway_none", "xgb_single_cohort")

method_survivals_a <- list()
load(sprintf("results_all_cohorts/%s/%s_measure_CI_replication_%d_result.RData", cohort_a, methods[1], replication_a))
method_survivals_a[["MKL[H]"]] <- cbind("y_true"=result$y_test, "y_pred"=result$y_predicted[,1])

load(sprintf("results_all_cohorts/%s/%s_measure_CI_replication_%d_result.RData", cohort_a, methods[2], replication_a))
method_survivals_a[["MKL[P]"]] <- cbind("y_true"=result$y_test, "y_pred"=result$y_predicted[,1])

load(sprintf("results_all_cohorts/%s/%s_measure_CI_replication_%d_result.RData", cohort_a, methods[3], replication_a))
method_survivals_a[["RF"]] <- cbind("y_true"=result$y_test, "y_pred"=result$y_predicted)

load(sprintf("results_all_cohorts/%s/%s_measure_CI_replication_%d_result.RData", cohort_a, methods[4], replication_a))
method_survivals_a[["SVM"]] <- cbind("y_true"=result$y_test, "y_pred"=as.vector(result$y_predicted[,1]))

result <- fromJSON(file = sprintf("xgb_single_cohort_fixed_train_test_split_results/%s/xgb_measure_CI_result_replication_%d.json", cohort_a, replication_a))
method_survivals_a[["XGB[SINGLE]"]] <- cbind("y_true"=result$y_test[[cohort_a]], "y_pred"= result$y_predicted[[cohort_a]])

result <- fromJSON(file = sprintf("xgb_pancancer_fixed_train_test_split_results/xgb_measure_CI_result_replication_%d.json", replication_a))
method_survivals_a[["XGB[PAN]"]] <- cbind("y_true"=result$y_test[[cohort_a]], "y_pred"= result$y_predicted[[cohort_a]])


method_survivals_b <- list()
load(sprintf("results_all_cohorts/%s/%s_measure_CI_replication_%d_result.RData", cohort_b, methods[1], replication_b))
method_survivals_b[["MKL[H]"]] <- cbind("y_true"=result$y_test, "y_pred"=result$y_predicted[,1])

load(sprintf("results_all_cohorts/%s/%s_measure_CI_replication_%d_result.RData", cohort_b, methods[2], replication_b))
method_survivals_b[["MKL[P]"]] <- cbind("y_true"=result$y_test, "y_pred"=result$y_predicted[,1])

load(sprintf("results_all_cohorts/%s/%s_measure_CI_replication_%d_result.RData", cohort_b, methods[3], replication_b))
method_survivals_b[["RF"]] <- cbind("y_true"=result$y_test, "y_pred"=result$y_predicted)

load(sprintf("results_all_cohorts/%s/%s_measure_CI_replication_%d_result.RData", cohort_b, methods[4], replication_b))
method_survivals_b[["SVM"]] <- cbind("y_true"=result$y_test, "y_pred"=as.vector(result$y_predicted[,1]))

result <- fromJSON(file = sprintf("xgb_single_cohort_fixed_train_test_split_results/%s/xgb_measure_CI_result_replication_%d.json", cohort_b, replication_b))
method_survivals_b[["XGB[SINGLE]"]] <- cbind("y_true"=result$y_test[[cohort_b]], "y_pred"= result$y_predicted[[cohort_b]])

result <- fromJSON(file = sprintf("xgb_pancancer_fixed_train_test_split_results/xgb_measure_CI_result_replication_%d.json", replication_b))
method_survivals_b[["XGB[PAN]"]] <- cbind("y_true"=result$y_test[[cohort_b]], "y_pred"= result$y_predicted[[cohort_b]])


# check whether all methods where tested on the same patients
y_true_matrix_a <- do.call(cbind, lapply(method_survivals_a, function(x) x[,"y_true"]))
all_y_equal_a <- all(apply(y_true_matrix_a, 2, identical, y_true_matrix_a[,1]))
if (! all_y_equal_a) {
    print("WARNING: Not all methods used the same test patients for survival prediction")
}
y_true_matrix_b <- do.call(cbind, lapply(method_survivals_b, function(x) x[,"y_true"]))
all_y_equal_b <- all(apply(y_true_matrix_b, 2, identical, y_true_matrix_b[,1]))
if (! all_y_equal_b) {
    print("WARNING: Not all methods used the same test patients for survival prediction")
}


# compute correlations between predictions of the different methods
# ATTENTION: RF and XGB compute risk scores, while GLMKL and SVM compute survival times, so correlations may be negative
y_true_matrix_a <- do.call(cbind, lapply(method_survivals_a, function(x) x[,"y_pred"]))
method_correlations_a.spearman <- cor(y_true_matrix_a, method='spearman')
y_true_matrix_b <- do.call(cbind, lapply(method_survivals_b, function(x) x[,"y_pred"]))
method_correlations_b.spearman <- cor(y_true_matrix_b, method='spearman')

# visualize correlation matrix
mm = 1/25.4
library(corrplot)
pdf(sprintf("./survival_prediction/method_prediction_correlations/method_prediction_spearman_correlations_%s_replication_%d_narrow_with_pancancer.pdf", cohort_a, replication_a), width=44*mm, height=44*mm, pointsize=6)
corrplot(method_correlations_a.spearman, method="circle", tl.col="black", mar=c(0,0,0,2), cl.ratio = 0.5)
#mtext(bquote(bold(.(cohort_a))), at=2.5, line=-19.0, cex=1.2)
mtext(bquote(bold(.(cohort_a))), at=75*mm, line=-280.0*mm, cex=1)
dev.off()

pdf(sprintf("./survival_prediction/method_prediction_correlations/method_prediction_spearman_correlations_%s_replication_%d_narrow_with_pancancer.pdf", cohort_b, replication_b), width=44*mm, height=44*mm, pointsize=6)
corrplot(method_correlations_b.spearman, method="circle", tl.col="black", mar=c(0,0,0,2), cl.ratio = 0.5)
#mtext(bquote(bold(.(cohort_b))), at=2.5, line=-19.0, cex=1.2)
mtext(bquote(bold(.(cohort_b))), at=75*mm, line=-280.0*mm, cex=1)
dev.off()


### Figure 1C: plot cohort median ages against median CI of XGBoost single-cohort method
library(rjson)

load_json_results <- function(result_path, cohorts, n_replication, result_file="%s/%s/xgb_measure_CI_replication_%d_result.json") {
	results <- list()
	for(cohort in cohorts) {
		results[[cohort]] <- rep(0,n_replication)
		for(replication in 1:n_replication) {
			result <- fromJSON(file=sprintf(result_file, result_path, cohort, replication))
			results[[cohort]][replication] <- result$CI[[cohort]]
		}
	}
	return(results)
}


cohorts <- c("TCGA-ACC", "TCGA-BLCA", "TCGA-BRCA", "TCGA-CESC", "TCGA-COAD", 
            "TCGA-ESCA", "TCGA-GBM", "TCGA-HNSC", "TCGA-KIRC", "TCGA-KIRP", 
            "TCGA-LAML", "TCGA-LGG", "TCGA-LIHC", "TCGA-LUAD", "TCGA-LUSC", 
            "TCGA-MESO", "TCGA-OV", "TCGA-PAAD", "TCGA-READ", "TCGA-SARC", 
            "TCGA-SKCM", "TCGA-STAD", "TCGA-UCEC", "TCGA-UCS", "TCGA-UVM")

n_replications <- 100
single_cohort_results <- load_json_results("results_xgb_pancancer_xgb_correct_feature_selection_single_cohort_no_age_gender_results", cohorts, n_replications)

single_cohort_median_ci <- unlist(lapply(single_cohort_results, median))

ages_json <- fromJSON(file="./survival_prediction/TCGA_ages_at_index.txt")
ages <- lapply(ages_json, unlist)

median_age <- unlist(lapply(ages, median))

# make correlation plot
library("ggpubr")
mm = 1/25.4
single_cohort_ci_age_df <- data.frame(cohort=cohorts, ci=single_cohort_median_ci, age=median_age[cohorts])
pdf("./survival_prediction/TCGA_single­_cohort_CI_age_spearman_correlation_cohort_labels.pdf", width=87*mm, height=60*mm)
ggscatter(single_cohort_ci_age_df, x = "ci", y = "age", 
          add = "reg.line", conf.int = TRUE, 
          add.params = list(color = "blue", fill = "lightgray"), # Customize reg. line
          cor.coef = TRUE, cor.method = "spearman",
          cor.coeff.args = list(method = "spearman", label.x = 0.48, label.y = 45, label.sep = "\n"), cor.coef.size=2.8,
          label = "cohort", repel = TRUE, font.label = c(6, "plain"),
          xlab = "Median C-Index", ylab = "Median age") + font("xlab", size = 8) + font("ylab", size = 8) + font("axis.text", size = 6)
dev.off()


### Figure 2A: plot frequency of genes selected in the XGBoost training of different numbers of cohorts
library(ggplot2)

# single-cohort
feature_importance_table <- read.delim("./survival_prediction/result_mean_absolute_deviation_feature_importance_single_cohort_100_replications_sums.csv", sep=',', header=TRUE, row.names=1, stringsAsFactors=FALSE)
feature_importance_counts <- rowSums(!is.na(feature_importance_table[,2:ncol(feature_importance_table)]))

feature_importance_count_df <- data.frame("gene"=names(feature_importance_counts), "num_cohorts"=feature_importance_counts)

mm = 1/25.4
pdf("./survival_prediction/feature_importance_single_cohort_mean_absolute_deviation_genes_fraction_per_cohort_histogram_wide.pdf", width=174*mm, height=50*mm)
ggplot(feature_importance_count_df, aes(x=num_cohorts)) + geom_histogram(aes(y=..count../sum(..count..)), binwidth=1.0, color="black", fill="#E76BF3") + ylab("Fraction of genes") + xlab("Number of cohorts") + scale_x_continuous(breaks=1:25, limits = c(0.5,25.5)) + scale_y_continuous(labels = scales::percent) + theme(axis.title.x = element_text(size=8), axis.text.x = element_text(size=6), axis.title.y=element_text(size=8), axis.text.y = element_text(size=6))
dev.off()


### Figure 2B: compare pan-cancer results with single-cohort results
library(ggplot2)
library(ggpubr)

cohorts <- c("TCGA-ACC", "TCGA-BLCA", "TCGA-BRCA", "TCGA-CESC", "TCGA-COAD", 
            "TCGA-ESCA", "TCGA-GBM", "TCGA-HNSC", "TCGA-KIRC", "TCGA-KIRP", 
            "TCGA-LAML", "TCGA-LGG", "TCGA-LIHC", "TCGA-LUAD", "TCGA-LUSC", 
            "TCGA-MESO", "TCGA-OV", "TCGA-PAAD", "TCGA-READ", "TCGA-SARC", 
            "TCGA-SKCM", "TCGA-STAD", "TCGA-UCEC", "TCGA-UCS", "TCGA-UVM")
            
n_replications <- 100
single_cohort_results <- load_json_results("results_xgb_pancancer_xgb_correct_feature_selection_single_cohort_no_age_gender_results", cohorts, n_replications)
pancancer_results <- load_pancancer_json_results("results_xgb_pancancer_xgb_correct_feature_selection_pancancer_no_age_gender_2_results", cohorts, n_replications)

result_list <- list("XGB[SINGLE]"=single_cohort_results, "XGB[PAN]"=pancancer_results)
result_df <- melt(result_list)
colnames(result_df) <- c("CI", "cohort", "method")

pdf("model_xgb_pancancer_single_cohort_correct_feature_selection_no_age_gender_comparison_100_replications_high_wide.pdf", width=10, height=9)
method_comparisons <- list( c("XGB[SINGLE]", "XGB[PAN]"))
p <- ggplot(data = result_df, aes(x=method, y=CI)) + geom_boxplot(aes(fill=method)) + labs(y = "C-Index", fill = "Method") + scale_fill_manual(values=c("#619CFF", "#E76BF3"))
p + facet_wrap( ~ cohort, ncol=5, scales="free",'strip.position' = 'bottom') + geom_hline(yintercept=0.5, linetype="dashed", color = "green") + theme(axis.text.x=element_blank(), axis.ticks.x=element_blank(), axis.title.x=element_blank()) + stat_compare_means(comparisons = method_comparisons, method = 'wilcox.test', level="p.signif", symnum.args=list(cutpoints = c(0, 0.0001, 0.001, 0.01, 0.05, 1), symbols = c("****", "***", "**", "*", "ns")),  step.increase=0.2) + scale_y_continuous(expand = expansion(mult = c(0.05, 0.1)))
dev.off()


### Figure 2D: plot C-Indices for the 8 new cohorts
library(rjson)
library(reshape2)
library(ggplot2)

cohorts <- c("TCGA-ACC", "TCGA-BLCA", "TCGA-BRCA", "TCGA-CESC", "TCGA-COAD", 
            "TCGA-ESCA", "TCGA-GBM", "TCGA-HNSC", "TCGA-KIRC", "TCGA-KIRP", 
            "TCGA-LAML", "TCGA-LGG", "TCGA-LIHC", "TCGA-LUAD", "TCGA-LUSC", 
            "TCGA-MESO", "TCGA-OV", "TCGA-PAAD", "TCGA-READ", "TCGA-SARC", 
            "TCGA-SKCM", "TCGA-STAD", "TCGA-UCEC", "TCGA-UCS", "TCGA-UVM")


pancancer_json <- fromJSON(file = "xgb_pancancer_new_cohorts_all_cohort_training_results/xgb_measure_CI_new_cohorts_result.json")
pancancer_cis <- unlist(pancancer_json$CI)

single_cohort_ci_df <- c()

for (cohort in cohorts) {
    cohort_json <- fromJSON(file = sprintf("xgb_single_cohort_new_cohorts_all_cohort_training_results/%s/xgb_measure_CI_result.json", cohort))
    cohort_cis <- unlist(cohort_json$CI)
    single_cohort_ci_df <- cbind(single_cohort_ci_df, cohort_cis)
}

single_cohort_ci_means <- apply(single_cohort_ci_df, 1, mean)

cis <- data.frame("Cohorts"=names(pancancer_cis), "Pan-cancer"=pancancer_cis, "Single-cohort (mean)"=single_cohort_ci_means, check.names=FALSE)
cis$Cohorts <- factor(cis$Cohorts, levels = cis$Cohorts)
cis.melted <- melt(cis, id.vars='Cohorts')

cohort_sizes <- c("36 (18)","47 (9)","64 (9)","178 (6)","493 (10)","134 (4)","501 (16)","118 (9)")

mm = 1/25.4
pdf("./survival_prediction/new_cohorts_CIs_pancancer_vs_single_cohort_means_with_cohort_sizes_narrow.pdf", width=108*mm, height=66*mm)
ggplot(cis.melted, aes(Cohorts, value)) + geom_bar(aes(fill = variable), position = "dodge", stat="identity") + ylab("C-Index") + theme(legend.position="top", legend.title = element_blank(), axis.text.x = element_text(angle = 45, hjust=1, size=6), legend.text = element_text(size=8), axis.title.x = element_text(size=8), axis.title.y=element_text(size=8), axis.text.y = element_text(size=6)) + geom_text(aes(label = ifelse(variable=="Pan-cancer", cohort_sizes, " ")), nudge_y = 0.05 + pmax(0, cis[,"Single-cohort (mean)"] - cis[,"Pan-cancer"]), size=6/(14/5)) + guides(fill = guide_legend(keywidth = unit(0.3, "cm"), keyheight = unit(0.3, "cm"))) + scale_fill_manual(values=c("#619CFF", "#E76BF3"))
dev.off()


### Figure 3A: plot feature importance sums of top 100 genes in pan-cancer XGBoost approach
library(ggplot2)

pancancer_feature_csv <- read.table("./survival_prediction/result_feature_importance_pancancer_100_replications_sums_with_gene_types.tab", sep='\t', header=FALSE, na.strings='', stringsAsFactors=FALSE)
colnames(pancancer_feature_csv) <- c("Gene", "Feature_importance", "Gene_type")
pancancer_feature_csv$Gene_type[is.na(pancancer_feature_csv$Gene_type)] <- 'NAN'

pancancer_feature_table <- pancancer_feature_csv[1:100,]

color_map <- data.frame('protein_coding'='#1f77b4', 'lncRNA'='#ff7f0e', 'processed_pseudogene'='#2ca02c', 'transcribed_unprocessed_pseudogene'='#9467bd', 'NAN'='#d62728')
gene_type_colors <- as.vector(unlist(sapply(pancancer_feature_table$Gene_type, function(x) color_map[x])))

pancancer_feature_table$Gene <- factor(pancancer_feature_table$Gene, levels = pancancer_feature_table$Gene)
pancancer_feature_table$Gene_type <- factor(pancancer_feature_table$Gene_type, levels = names(color_map))

pdf("./survival_prediction/xgb_pancancer_feature_importance_scores_annotated_gene_types_wide.pdf", width=6.85, height=3.19)
ggplot(pancancer_feature_table, aes(Gene, Feature_importance)) +
    geom_bar(aes(fill=Gene_type), color="black", size=0.4, position = "dodge", stat="identity") +
    ylab("Sum of feature importances") +
    theme(legend.position="top", legend.title = element_blank(), legend.text = element_text(size=8), axis.text.x = element_text(angle = 90, hjust=1, size = 5), axis.title.x = element_text(size=8), axis.title.y=element_text(size=8), axis.text.y = element_text(size=6)) +
    scale_fill_manual(label = c('Protein coding', 'lncRNA', 'Processed pseudogene', 'Transcribed unprocessed pseudogene', 'Unknown'), values = c('#1f77b4', '#ff7f0e', '#2ca02c', '#9467bd', '#d62728'), guide = guide_legend(keywidth = 0.5, keyheight = 0.5))
dev.off()


### Figure 3B: plot feature entropies of single-cohort and pan-cancer features
library(ggplot2)
library(reshape2)

pancancer_entropies <- read.table("./survival_prediction/pancancer_feature_entropies.tab", sep='\t', header=FALSE, check.names=FALSE, stringsAsFactors=FALSE, row.names=1)
single_cohort_entropies <- read.table("./survival_prediction/single_cohort_feature_entropies.tab", sep='\t', header=FALSE, check.names=FALSE, stringsAsFactors=FALSE, row.names=1)

num_features <- 100
library(plyr)

# perform one-sided Wilcoxon Rank Sum test
wt <- wilcox.test(single_cohort_entropies[1:num_features,1], pancancer_entropies[1:num_features,1], paired=FALSE, alternative="less")
print(wt$p.value)

entropy_df.original_order <- data.frame("entropy"=c(pancancer_entropies[1:num_features,1], single_cohort_entropies[1:num_features,1]), "method"=c(rep("XGB[PAN]", num_features), rep("XGB[SINGLE]", num_features)))

# calculate means
mu <- ddply(entropy_df.original_order, "method", summarise, grp.mean=mean(entropy, na.rm=TRUE))

pdf("./survival_prediction/densities_pancancer_single_cohort_entropies_original_feature_importance_order_top_100_wide.pdf", width=6.85, height=2.28)
ggplot(entropy_df.original_order, aes(x=entropy)) + geom_density(aes(fill=method), alpha=0.8) + theme(legend.title = element_blank(), legend.text = element_text(size=8), axis.title.x = element_text(size=8), axis.text.x = element_text(size=6), axis.title.y=element_text(size=8), axis.text.y = element_text(size=6)) + xlab("Entropy in XGBoost single-cohort approach") + ylab("Density") + geom_vline(data=mu, aes(xintercept=grp.mean, color=method), linetype="dashed") + scale_fill_manual(values=c("#619CFF", "#E76BF3")) + scale_color_manual(values=c("#0041B0", "#940DA1"))
dev.off()


### Figure SI3: plot feature importance sums of all important genes in pan-cancer XGBoost approach
library(ggplot2)

pancancer_feature_table <- read.table("./survival_prediction/result_feature_importance_pancancer_100_replications_sums.csv", sep='\t', header=FALSE, na.strings='', stringsAsFactors=FALSE)
colnames(pancancer_feature_table) <- c("Gene", "Feature_importance")

pancancer_feature_table$Gene <- factor(pancancer_feature_table$Gene, levels = pancancer_feature_table$Gene)

pdf("./survival_prediction/xgb_pancancer_feature_importance_scores_all_genes.pdf", width=6.85, height=3.19)
ggplot(pancancer_feature_table, aes(Gene, Feature_importance)) +
    geom_bar(fill="#619CFF", color="#619CFF", size=0.4, position = "dodge", stat="identity") +
    ylab("Sum of feature importances") +
    theme(axis.text.x = element_blank(), axis.ticks.x = element_blank(), axis.title.x = element_text(size=8), axis.title.y=element_text(size=8), axis.text.y = element_text(size=6))
dev.off()


### Figure SI5: plot pan-cancer XGBoost results for patient subsets of different sizes
library(ggplot2)
library(ggpubr)

cohorts <- c("TCGA-ACC", "TCGA-BLCA", "TCGA-BRCA", "TCGA-CESC", "TCGA-COAD", 
            "TCGA-ESCA", "TCGA-GBM", "TCGA-HNSC", "TCGA-KIRC", "TCGA-KIRP", 
            "TCGA-LAML", "TCGA-LGG", "TCGA-LIHC", "TCGA-LUAD", "TCGA-LUSC", 
            "TCGA-MESO", "TCGA-OV", "TCGA-PAAD", "TCGA-READ", "TCGA-SARC", 
            "TCGA-SKCM", "TCGA-STAD", "TCGA-UCEC", "TCGA-UCS", "TCGA-UVM")
            
n_replications <- 100
pancancer_results_all <- load_pancancer_json_results("results_xgb_pancancer_xgb_correct_feature_selection_pancancer_no_age_gender_2_results", cohorts, n_replications)
pancancer_results_1000 <- load_pancancer_json_results("xgb_pancancer_subsampling/xgb_pancancer_subsampling_1000_samples_results", cohorts, n_replications)
pancancer_results_500 <- load_pancancer_json_results("xgb_pancancer_subsampling/xgb_pancancer_subsampling_500_samples_results", cohorts, n_replications)
pancancer_results_50 <- load_pancancer_json_results("xgb_pancancer_subsampling/xgb_pancancer_subsampling_50_samples_results", cohorts, n_replications)

result_list <- list("all"=pancancer_results_all, "1000"=pancancer_results_1000, "500"=pancancer_results_500, "50"=pancancer_results_50)
result_df <- melt(result_list)
colnames(result_df) <- c("CI", "cohort", "num_patients")
result_df$num_patients <- factor(result_df$num_patients, levels = unique(result_df$num_patients))

pdf("model_xgb_pancancer_all_patients_patient_subsets_comparison_100_replications_high_wide.pdf", width=10, height=9)
num_patients_comparisons <- list( c("all", "1000"), c("all", "500"), c("all", "50"))
p <- ggplot(data = result_df, aes(x=num_patients, y=CI)) + geom_boxplot(aes(fill=num_patients)) + labs(y = "C-Index", fill = "Patients") + scale_fill_manual(values=c("#619CFF", "#F8766D", "#7CAE00", "#C77CFF"))
p + facet_wrap( ~ cohort, ncol=5, scales="free",'strip.position' = 'bottom') + geom_hline(yintercept=0.5, linetype="dashed", color = "green") + theme(axis.text.x=element_blank(), axis.ticks.x=element_blank(), axis.title.x=element_blank()) + stat_compare_means(comparisons = num_patients_comparisons, method = 'wilcox.test', level="p.signif", symnum.args=list(cutpoints = c(0, 0.0001, 0.001, 0.01, 0.05, 1), symbols = c("****", "***", "**", "*", "ns")),  step.increase=0.2) + scale_y_continuous(expand = expansion(mult = c(0.05, 0.1)))
dev.off()