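"""Train and evaluate XGBoost Cox survival models on TCGA gene expression data.

For every replication the script (1) splits each cohort into training and test
patients, stratified by vital status, (2) subsamples the training set to
num_subsample patients spread as evenly as possible over the cohorts,
(3) selects the most important genes by cross-validated XGBoost gain
importance, (4) tunes the remaining hyperparameters by random search scored
with the concordance index, and (5) retrains on the training subsample and
writes per-cohort test results to JSON files.

Example invocation (hypothetical paths):
    python run_xgb_survival_replications.py -r results -f features -n 1000
"""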
import argparse
import json
import os
import random
import sys

import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.utils import resample
from tqdm import tqdm

def path2surv_concordance_index(y_predicted, y, delta):
    # concordance index as used in path2surv: the fraction of comparable patient
    # pairs whose predicted and observed survival times are ordered consistently
    # (delta == 1 marks censored patients)
    if np.any(np.isnan(y_predicted)) or np.any(np.isinf(y_predicted)):
        return 0
    y = np.asarray(y)
    y_predicted = np.asarray(y_predicted)
    event = (1 - np.asarray(delta)).astype(bool)
    # a pair (i, j) is comparable if both events were observed, or if i's event
    # occurred before j was censored
    comparable = (event[:, None] & event[None, :]) | (event[:, None] & ~event[None, :] & (y[:, None] - y[None, :] < 0))
    np.fill_diagonal(comparable, False)
    # a comparable pair is concordant if the predictions follow the observed order
    concordant = (y_predicted[:, None] - y_predicted[None, :]) * (y[:, None] - y[None, :]) > 0
    CI = np.sum(comparable & concordant) / np.sum(comparable)
    if np.isnan(CI):
        CI = 0
    return CI
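
# example (hypothetical values): y = [2, 5, 9], delta = [0, 0, 1] (patient 3
# censored) and y_predicted = [1, 4, 8] order every comparable pair consistently,
# so the concordance index is 1.0; callers below pass -prediction because
# survival:cox predicts hazard (higher hazard means shorter expected survival)
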
cohorts = ["TCGA-ACC", "TCGA-BLCA", "TCGA-BRCA", "TCGA-CESC", "TCGA-COAD",
"TCGA-ESCA", "TCGA-GBM", "TCGA-HNSC", "TCGA-KIRC", "TCGA-KIRP",
"TCGA-LAML", "TCGA-LGG", "TCGA-LIHC", "TCGA-LUAD", "TCGA-LUSC",
"TCGA-MESO", "TCGA-OV", "TCGA-PAAD", "TCGA-READ", "TCGA-SARC",
"TCGA-SKCM", "TCGA-STAD", "TCGA-UCEC", "TCGA-UCS", "TCGA-UVM"]
parser = argparse.ArgumentParser(prog='run_xgb_survival_replications.py')
parser.add_argument('-r', '--result', type=str, help='path to the result directory', required=True)
parser.add_argument('-f', '--features', type=str, help='path to the directory where the selected features should be written to', required=True)
parser.add_argument('-n', '--num_subsample', type=int, help='number of samples that should be used in training set', required=True)
parser.add_argument('-s', '--replication_start', type=int, default=1, help='first replication to run (default: 1)')
parser.add_argument('-e', '--replication_end', type=int, default=10, help='last replication to run (default: 10)')
parser.add_argument('-c', '--cohort', type=str, default='', help='single cohort to train the model on')
parser.add_argument('-t', '--threads', type=int, default=64, help='number of threads used by XGBoost (default: 64)')
parser.add_argument('-d', '--seed', type=int, default=135, help='seed used to compute the random seed for splitting the data into training and test sets (default: 135)')
args = parser.parse_args()
result_path = args.result
feature_path = args.features
num_subsample = args.num_subsample
replication_start = args.replication_start
replication_end = args.replication_end
single_cohort = args.cohort
num_threads = args.threads
seed = args.seed
if replication_start < 1:
    print("The first replication must be at least 1")
    sys.exit(3)
if replication_end < replication_start:
    print("The last replication must be greater than or equal to the first replication")
    sys.exit(4)
if num_subsample < 1 or num_subsample > 6419:
    print("Invalid number of subsamples")
    sys.exit(5)
if not os.path.exists(result_path):
    os.makedirs(result_path)
if not os.path.exists(feature_path):
    os.makedirs(feature_path)
if single_cohort != '':
    cohorts = [single_cohort]
    if not os.path.exists("./TCGA_data/%s/clinical.tsv" % single_cohort):
        print("The selected cancer cohort is not in the available TCGA datasets")
        sys.exit(5)
use_xgb_features = True
num_select_features = 500
num_round_set = [50, 100, 250, 500, 1000, 1500, 2000, 2500]
max_depth_set = list(range(2, 15 + 1))
learning_rate_set = [0.01, 0.025, 0.05, 0.075, 0.1]
colsample_set = [0.5, 0.6, 0.7, 0.8, 0.9, 1.0]
alpha_set = [0, 0.1, 0.2, 0.5, 1, 2]
gamma_set = [0, 0.1, 0.2, 0.5, 1, 2]
min_child_weight_set = [1, 2, 3, 5, 7, 10]
subsample_set = [0.5, 0.65, 0.8, 0.9, 1]
lambda_set = [0.1, 0.2, 0.5, 0.8, 1, 2, 3, 5, 10]
nrounds_feature_selection = 20
nrounds_parameter_tuning = 500
fold_count = 4
train_ratio = 0.8
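# each parameter tuning fold evaluates nrounds_parameter_tuning random draws from
# the grids above, i.e. 500 of the roughly 33 million possible combinations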
# read data
rnaseq = pd.DataFrame()
clinical = pd.DataFrame()
cohort_dict = dict()
for cohort in cohorts:
    print("Reading data for cohort %s" % cohort)
    mrna_path = "./TCGA_data/%s/fpkm_table.csv" % cohort
    mrna = pd.read_csv(mrna_path, index_col=0).T
    mrna = mrna.dropna(axis=0, how="all")
    clinical_file_path = "./TCGA_data/%s/clinical.tsv" % cohort
    clinical_file = pd.read_csv(clinical_file_path, na_values=["NA", "--", "'--"], sep='\t')
    index_name = "submitter_id" if "submitter_id" in clinical_file.columns else "case_submitter_id"
    clinical_file = clinical_file.set_index(index_name)
    clinical_file = clinical_file[["age_at_index", "days_to_death", "gender", "vital_status", "days_to_last_follow_up"]]
    clinical_file = clinical_file.dropna(axis=0, subset=["vital_status"])
    clinical_file = clinical_file.dropna(axis=0, how="all", subset=["days_to_death", "days_to_last_follow_up"])
    clinical_file = clinical_file.dropna(axis=0, how="any", subset=["age_at_index", "gender"])
    clinical_file = clinical_file[((clinical_file.vital_status == "Dead") & (clinical_file.days_to_death > 0)) | ((clinical_file.vital_status == "Alive") & (clinical_file.days_to_last_follow_up > 0))]
    clinical_unique = clinical_file.loc[~clinical_file.index.duplicated(keep='first')]
    # only use patients that have all features
    common_patients = mrna.index.intersection(clinical_unique.index)
    clinical_unique = clinical_unique.loc[common_patients]
    mrna = mrna.loc[common_patients]
    print("Found %d patients for cohort %s" % (len(common_patients), cohort))
    # save cohort with clinical information
    clinical_unique = clinical_unique.assign(cohort=cohort)
    # DataFrame.append was removed in pandas 2.0, use concat instead
    rnaseq = pd.concat([rnaseq, mrna])
    clinical = pd.concat([clinical, clinical_unique])
    cohort_dict[cohort] = common_patients
num_rna_features = rnaseq.shape[1]
# only use specified cohorts for training and testing
cohort_patients = [p for c in cohorts for p in cohort_dict[c]]
rnaseq = rnaseq.loc[cohort_patients]
delta = (clinical["vital_status"] == "Alive").astype(int)
y = clinical["days_to_death"].combine_first(clinical["days_to_last_follow_up"]).astype(int)
# encode censored patients with negative survival time (as expected by xgboost)
y = y.multiply([1 if d == 0 else -1 for d in delta])
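# e.g. a patient who died after 400 days gets label 400, while a patient still
# alive at a 400-day follow-up gets label -400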
dead_patients = delta.index[delta == 0]
alive_patients = delta.index[delta == 1]
for replication in range(replication_start, replication_end + 1):
    random_state = seed * replication
    train_dead_patients = []
    train_alive_patients = []
    test_dead_patients = []
    test_alive_patients = []
    for cohort in cohorts:
        dead_patients_cohort = np.intersect1d(dead_patients, cohort_dict[cohort])
        alive_patients_cohort = np.intersect1d(alive_patients, cohort_dict[cohort])
        train_dead_patients_cohort, test_dead_patients_cohort = train_test_split(dead_patients_cohort, test_size=1-train_ratio, random_state=random_state)
        train_alive_patients_cohort, test_alive_patients_cohort = train_test_split(alive_patients_cohort, test_size=1-train_ratio, random_state=random_state)
        train_dead_patients.extend(train_dead_patients_cohort)
        train_alive_patients.extend(train_alive_patients_cohort)
        test_dead_patients.extend(test_dead_patients_cohort)
        test_alive_patients.extend(test_alive_patients_cohort)
    train_patients = np.concatenate([train_dead_patients, train_alive_patients])
    test_patients = np.concatenate([test_dead_patients, test_alive_patients])
    # for the purpose of estimating the effect of training set size, select a random subset of patients from the training data (stratified by censoring status and cohort to ensure every group is represented in the same proportion as in the original data)
    # censoring_criteria = pd.Series(delta[clinical.index].astype(str) + clinical['cohort'], index=clinical.index)
    # train_patients = resample(train_patients, replace=False, n_samples=num_subsample, random_state=random_state, stratify=censoring_criteria[train_patients])
    # for the purpose of estimating the effect of training set size, select a random subset of patients from the training data (stratified by censoring status, with the same number of patients from every cohort to ensure every cohort is represented equally)
    print("Subsampling %d patients" % num_subsample)
    subsampled_train_patients = []
    # np.array_split distributes num_subsample as evenly as possible over the cohorts
    # (e.g. 110 patients over 25 cohorts gives ten cohorts 5 patients and fifteen cohorts 4)
    subsamples_per_cohort = list(map(len, np.array_split(range(num_subsample), len(cohorts))))
    random.seed(random_state)
    random.shuffle(subsamples_per_cohort)
    for i, cohort in enumerate(cohorts):
        train_cohort_patients = np.intersect1d(train_patients, cohort_dict[cohort])
        if len(train_cohort_patients) < subsamples_per_cohort[i]:
            print("Can only subsample %d patients for cohort %s" % (len(train_cohort_patients), cohort))
            subsampled_train_patients.extend(resample(train_cohort_patients, replace=False, n_samples=len(train_cohort_patients), random_state=random_state, stratify=delta[train_cohort_patients]))
        else:
            subsampled_train_patients.extend(resample(train_cohort_patients, replace=False, n_samples=subsamples_per_cohort[i], random_state=random_state, stratify=delta[train_cohort_patients]))
    train_patients = np.array(subsampled_train_patients)
    print("Training on %d patients" % len(train_patients))
    important_features = []
    if use_xgb_features:
        feature_importance_vector = pd.Series(np.zeros(rnaseq.shape[1]), index=rnaseq.columns)
        kfold_generator = StratifiedKFold(n_splits=fold_count, shuffle=True, random_state=456)
        kfold_samples = kfold_generator.split(train_patients, delta[train_patients])
        print("Selecting features")
        fold = 1
        for train_indices, test_indices in kfold_samples:
            print("Feature selection fold " + str(fold))
            train_indices_samples = train_patients[train_indices]
            x_train = rnaseq.loc[train_indices_samples]
            # remove genes with 0 mean absolute deviation (DataFrame.mad was removed in pandas 2.0)
            mean_abs_deviation = (x_train - x_train.mean()).abs().mean()
            x_train = x_train.loc[:, mean_abs_deviation != 0]
            y_train = y.loc[train_indices_samples]
            delta_train = delta.loc[train_indices_samples]
            random.seed(352)
            max_depth_grid = random.choices(range(1, 4), k=nrounds_feature_selection)
            learning_rate_grid = random.choices(learning_rate_set, k=nrounds_feature_selection)
            num_round_grid = random.choices([5, 10, 20], k=nrounds_feature_selection)
            colsample_bytree_grid = random.choices([1.0, 0.8, 0.65, 0.5], k=nrounds_feature_selection)
            subsample_grid = random.choices([1.0, 0.8, 0.65, 0.5], k=nrounds_feature_selection)
            for i in tqdm(range(nrounds_feature_selection)):
                max_depth = max_depth_grid[i]
                learning_rate = learning_rate_grid[i]
                num_round = num_round_grid[i]
                colsample_bytree = colsample_bytree_grid[i]
                subsample = subsample_grid[i]
                dtrain = xgb.DMatrix(x_train, label=y_train)
                param = {'max_depth': max_depth, 'learning_rate': learning_rate, 'colsample_bytree': colsample_bytree, 'subsample': subsample, 'objective': "survival:cox", 'eval_metric': "cox-nloglik", 'nthread': num_threads, 'verbosity': 0}
                state = xgb.train(param, dtrain, num_round)
                feature_importances = pd.Series(state.get_score(importance_type='gain'))
                feature_importance_vector[feature_importances.index] += feature_importances
            fold += 1
        feature_importance_vector /= nrounds_feature_selection * fold_count
        feature_importance_vector = feature_importance_vector[feature_importance_vector > 0]
        num_important_features = min(num_select_features, len(feature_importance_vector))
        important_features = feature_importance_vector.sort_values(ascending=False).index[0:num_important_features]
        important_features_with_values = feature_importance_vector.sort_values(ascending=False)[0:num_important_features]
        feature_file = "%s/important_genes_replication_%d.json" % (feature_path, replication)
        if single_cohort != '':
            feature_file = "%s/%s/important_genes_replication_%d.json" % (feature_path, single_cohort, replication)
            if not os.path.exists("%s/%s" % (feature_path, single_cohort)):
                os.makedirs("%s/%s" % (feature_path, single_cohort))
        with open(feature_file, 'w') as output:
            json.dump({"important_genes": important_features_with_values.to_dict()}, output, indent=4)
    else:
        important_features = rnaseq.columns
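    # stage 2: tune the remaining XGBoost hyperparameters by random search,
    # scored by the mean concordance index over the CV folds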
    x_continuous = rnaseq[important_features]
    concordance_index_dict = dict()
    kfold_generator = StratifiedKFold(n_splits=fold_count, shuffle=True, random_state=123)
    kfold_samples = kfold_generator.split(train_patients, delta[train_patients])
    print("Tuning XGBoost parameters")
    fold = 1
    for train_indices, test_indices in kfold_samples:
        print("Parameter tuning fold " + str(fold))
        train_samples = train_patients[train_indices]
        test_samples = train_patients[test_indices]
        x_train = x_continuous.loc[train_samples]
        x_test = x_continuous.loc[test_samples]
        y_train = y[train_samples]
        delta_train = delta[train_samples]
        y_test = y[test_samples]
        delta_test = delta[test_samples]
        random.seed(123)
        max_depth_grid = random.choices(max_depth_set, k=nrounds_parameter_tuning)
        learning_rate_grid = random.choices(learning_rate_set, k=nrounds_parameter_tuning)
        num_round_grid = random.choices(num_round_set, k=nrounds_parameter_tuning)
        colsample_bytree_grid = random.choices(colsample_set, k=nrounds_parameter_tuning)
        alpha_grid = random.choices(alpha_set, k=nrounds_parameter_tuning)
        gamma_grid = random.choices(gamma_set, k=nrounds_parameter_tuning)
        min_child_weight_grid = random.choices(min_child_weight_set, k=nrounds_parameter_tuning)
        subsample_grid = random.choices(subsample_set, k=nrounds_parameter_tuning)
        lambda_grid = random.choices(lambda_set, k=nrounds_parameter_tuning)
        for i in tqdm(range(nrounds_parameter_tuning)):
            max_depth = max_depth_grid[i]
            learning_rate = learning_rate_grid[i]
            num_round = num_round_grid[i]
            colsample_bytree = colsample_bytree_grid[i]
            alpha = alpha_grid[i]
            gamma = gamma_grid[i]
            min_child_weight = min_child_weight_grid[i]
            subsample = subsample_grid[i]
            lambda_val = lambda_grid[i]
            dtrain = xgb.DMatrix(x_train, label=y_train)
            dtest = xgb.DMatrix(x_test)
            param = {'max_depth': max_depth, 'learning_rate': learning_rate, 'colsample_bytree': colsample_bytree, 'alpha': alpha, 'gamma': gamma, 'min_child_weight': min_child_weight, 'subsample': subsample, 'lambda': lambda_val, 'objective': "survival:cox", 'eval_metric': "cox-nloglik", 'nthread': num_threads, 'verbosity': 0}
            state = xgb.train(param, dtrain, num_round)
            prediction = state.predict(dtest)
            # survival:cox predicts hazard, so negate before scoring against survival times
            concordance_index_dict.setdefault((num_round, max_depth, learning_rate, colsample_bytree, alpha, gamma, min_child_weight, subsample, lambda_val), []).append(path2surv_concordance_index(-prediction, abs(y_test), delta_test))
        fold += 1
    concordance_index_dict_mean = {k: np.mean(v) for k, v in concordance_index_dict.items()}
    params_max_CI = max(concordance_index_dict_mean, key=concordance_index_dict_mean.get)
    num_round_max_CI, max_depth_max_CI, learning_rate_max_CI, colsample_bytree_max_CI, alpha_max_CI, gamma_max_CI, min_child_weight_max_CI, subsample_max_CI, lambda_max_CI = params_max_CI
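    # retrain on the full training subsample with the best configuration found
    # by the random search, then evaluate on the held-out test patients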
    x_train = x_continuous.loc[train_patients]
    x_test = x_continuous.loc[test_patients]
    y_train = y.loc[train_patients]
    y_test = y.loc[test_patients]
    delta_train = delta.loc[train_patients]
    delta_test = delta.loc[test_patients]
    dtrain = xgb.DMatrix(x_train, label=y_train)
    dtest = xgb.DMatrix(x_test, label=y_test)
    param = {'max_depth': max_depth_max_CI, 'learning_rate': learning_rate_max_CI, 'colsample_bytree': colsample_bytree_max_CI, 'alpha': alpha_max_CI, 'gamma': gamma_max_CI, 'min_child_weight': min_child_weight_max_CI, 'subsample': subsample_max_CI, 'lambda': lambda_max_CI, 'objective': "survival:cox", 'eval_metric': "cox-nloglik", 'nthread': num_threads, 'verbosity': 0}
    state = xgb.train(param, dtrain, num_round_max_CI)
    test_prediction = state.predict(dtest)
    result = dict()
    result["num_round"] = num_round_max_CI
    result["max_depth"] = max_depth_max_CI
    result["learning_rate"] = learning_rate_max_CI
    result["colsample_bytree"] = colsample_bytree_max_CI
    result["alpha"] = alpha_max_CI
    result["gamma"] = gamma_max_CI
    result["min_child_weight"] = min_child_weight_max_CI
    result["subsample"] = subsample_max_CI
    result["lambda"] = lambda_max_CI
    result["importance"] = state.get_score(importance_type='gain')
    result["y_test"] = dict()
    result["delta_test"] = dict()
    result["y_predicted"] = dict()
    result["test_patients"] = dict()
    result["CI"] = dict()
    result["CI_test"] = float(path2surv_concordance_index(-test_prediction, abs(y_test), delta_test))
    for cohort in cohorts:
        test_cohort_patients = x_test.index[x_test.index.isin(cohort_dict[cohort])]
        dcohort = xgb.DMatrix(x_test.loc[test_cohort_patients])
        prediction = state.predict(dcohort)
        result["y_test"][cohort] = abs(y_test[test_cohort_patients]).tolist()
        result["delta_test"][cohort] = delta_test[test_cohort_patients].tolist()
        result["y_predicted"][cohort] = prediction.tolist()
        result["test_patients"][cohort] = y_test.loc[test_cohort_patients].index.tolist()
        result["CI"][cohort] = float(path2surv_concordance_index(-prediction, abs(y_test[test_cohort_patients]), delta_test[test_cohort_patients]))
        print(cohort + " CI: " + str(result["CI"][cohort]))
    result_file = "%s/xgb_measure_CI_replication_%d_result.json" % (result_path, replication)
    if single_cohort != '':
        result_file = "%s/%s/xgb_measure_CI_replication_%d_result.json" % (result_path, single_cohort, replication)
        if not os.path.exists("%s/%s" % (result_path, single_cohort)):
            os.makedirs("%s/%s" % (result_path, single_cohort))
    with open(result_file, 'w') as output:
        json.dump(result, output, indent=4)