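# run_xgb_survial_test_new_cohorts.py
#
# Trains an XGBoost Cox survival model (objective "survival:cox") on RNA-seq
# FPKM features from 25 TCGA training cohorts, selects the most important
# genes via cross-validated gain-based importance, tunes hyperparameters by
# random search, and evaluates the concordance index on eight held-out TCGA
# test cohorts.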
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split
from tqdm import tqdm
import json
import os
import sys
import argparse
import itertools
import random
def path2surv_concordance_index(y_predicted, y, delta):
    # Concordance index over comparable pairs: both samples have an observed
    # event (delta == 0), or the censored sample outlives the observed one.
    if np.any(np.isnan(y_predicted)) or np.any(np.isinf(y_predicted)):
        return 0
    N = len(y)
    event = (np.ones((N, N)) * (1 - np.array(delta))).astype(bool)  # [i, j]: event observed for sample j
    censored = (np.ones((N, N)) * np.array(delta)).astype(bool)     # [i, j]: sample j is censored
    y_diff = (np.ones((N, N)) * np.array(y)).transpose() - (np.ones((N, N)) * np.array(y))
    pred_diff = (np.ones((N, N)) * np.array(y_predicted)).transpose() - (np.ones((N, N)) * np.array(y_predicted))
    Delta = (event.transpose() & event) | (event.transpose() & censored & (y_diff < 0))
    # concordant pairs: predicted and observed survival differences share a sign
    Diff = pred_diff * y_diff > 0
    CI = (np.sum(Delta * Diff) - np.sum(np.diag(Delta * Diff))) / (np.sum(Delta) - np.sum(np.diag(Delta)))
    if np.isnan(CI):
        CI = 0
    return CI
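# Sanity check on toy data: a prediction that ranks the survival times
# perfectly yields CI = 1.0 when every event is observed (delta all 0), e.g.
#   path2surv_concordance_index(np.array([1.0, 2.0, 3.0]),
#                               np.array([10, 20, 30]),
#                               np.array([0, 0, 0]))   # -> 1.0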
train_cohorts = ["TCGA-ACC", "TCGA-BLCA", "TCGA-BRCA", "TCGA-CESC", "TCGA-COAD",
"TCGA-ESCA", "TCGA-GBM", "TCGA-HNSC", "TCGA-KIRC", "TCGA-KIRP",
"TCGA-LAML", "TCGA-LGG", "TCGA-LIHC", "TCGA-LUAD", "TCGA-LUSC",
"TCGA-MESO", "TCGA-OV", "TCGA-PAAD", "TCGA-READ", "TCGA-SARC",
"TCGA-SKCM", "TCGA-STAD", "TCGA-UCEC", "TCGA-UCS", "TCGA-UVM"]
test_cohorts = ["TCGA-CHOL", "TCGA-DLBC", "TCGA-KICH", "TCGA-PCPG", "TCGA-PRAD",
"TCGA-TGCT", "TCGA-THCA", "TCGA-THYM"]
cohorts = train_cohorts + test_cohorts
parser = argparse.ArgumentParser(prog='run_xgb_survial_test_new_cohorts.py')
parser.add_argument('-r', '--result', type=str, help='path to the result directory', required=True)
parser.add_argument('-f', '--features', type=str, help='path to the directory where the selected features should be written to', required=True)
parser.add_argument('-c', '--cohort', type=str, default='', help='single cohort to train the model on')
parser.add_argument('-t', '--threads', type=int, default=64, help='number of threads used by XGBoost (default: 64)')
args = parser.parse_args()
result_path = args.result
feature_path = args.features
single_cohort = args.cohort
num_threads = args.threads
if not os.path.exists(result_path):
    os.makedirs(result_path)
if not os.path.exists(feature_path):
    os.makedirs(feature_path)
if single_cohort != '':
    cohorts = [single_cohort]
    if not os.path.exists("./TCGA_data/%s/clinical.tsv" % single_cohort):
        print("The selected cancer cohort is not in the available TCGA datasets")
        sys.exit(5)
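# Configuration for feature selection and hyperparameter tuning: both phases
# run a random search (random.choices draws from the sets below), scored with
# stratified k-fold cross-validation over the training cohorts.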
use_xgb_features = True
num_select_features = 500
num_round_set = [50, 100, 250, 500, 1000, 1500, 2000, 2500]
max_depth_set = list(range(2,15+1))
learning_rate_set = [0.01, 0.025, 0.05, 0.075, 0.1]
colsample_set = [0.5,0.6,0.7,0.8,0.9,1.0]
alpha_set = [0,0.1,0.2,0.5,1,2]
gamma_set = [0,0.1,0.2,0.5,1,2]
min_child_weight_set = [1,2,3,5,7,10]
subsample_set = [0.5,0.65,0.8,0.9,1]
lambda_set = [0.1,0.2,0.5,0.8,1,2,3,5,10]
nrounds_feature_selection = 20
nrounds_parameter_tuning = 500
fold_count = 4
train_ratio = 0.8
rnaseq = pd.DataFrame()
clinical = pd.DataFrame()
cohort_dict = dict()
for cohort in cohorts:
    print("Reading data for cohort %s" % cohort)
    mrna_path = "./TCGA_data/%s/fpkm_table.csv" % cohort
    mrna = pd.read_csv(mrna_path, index_col=0).T
    mrna = mrna.dropna(axis=0, how="all")
    clinical_file_path = "./TCGA_data/%s/clinical.tsv" % cohort
    clinical_file = pd.read_csv(clinical_file_path, na_values=["NA", "--", "'--"], sep='\t')
    index_name = "submitter_id" if "submitter_id" in clinical_file.columns else "case_submitter_id"
    clinical_file = clinical_file.set_index(index_name)
    clinical_file = clinical_file[["age_at_index", "days_to_death", "gender", "vital_status", "days_to_last_follow_up"]]
    clinical_file = clinical_file.dropna(axis=0, subset=["vital_status"])
    clinical_file = clinical_file.dropna(axis=0, how="all", subset=["days_to_death", "days_to_last_follow_up"])
    clinical_file = clinical_file.dropna(axis=0, how="any", subset=["age_at_index", "gender"])
    clinical_file = clinical_file[((clinical_file.vital_status == "Dead") & (clinical_file.days_to_death > 0)) | ((clinical_file.vital_status == "Alive") & (clinical_file.days_to_last_follow_up > 0))]
    clinical_unique = clinical_file.loc[~clinical_file.index.duplicated(keep='first')]
    # only use patients that have all features
    common_patients = mrna.index.intersection(clinical_unique.index)
    clinical_unique = clinical_unique.loc[common_patients]
    mrna = mrna.loc[common_patients]
    print("Found %d patients for cohort %s" % (len(common_patients), cohort))
    # DataFrame.append was removed in pandas 2.0; pd.concat is the equivalent
    rnaseq = pd.concat([rnaseq, mrna])
    clinical = pd.concat([clinical, clinical_unique])
    cohort_dict[cohort] = common_patients
num_rna_features = rnaseq.shape[1]
delta = (clinical["vital_status"] == "Alive").astype(int)
y = clinical["days_to_death"].combine_first(clinical["days_to_last_follow_up"]).astype(int)
# encode censored patients with negative survival time (as expected by
# XGBoost's survival:cox objective); delta == 1 marks a censored (alive) patient
y = y.multiply([1 if d == 0 else -1 for d in delta])
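# Example of the encoding with hypothetical patients: a deceased patient with
# days_to_death = 100 keeps the label +100, while a censored patient with
# days_to_last_follow_up = 100 is encoded as -100.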
dead_patients = delta.index[delta == 0]
alive_patients = delta.index[delta == 1]
train_patients = []
test_patients = []
for cohort in cohorts:
    if cohort in train_cohorts:
        train_patients.extend(cohort_dict[cohort])
    elif cohort in test_cohorts:
        test_patients.extend(cohort_dict[cohort])
    else:
        print("WARNING: Encountered cohort %s, which is neither in the train cohorts nor in the test cohorts." % cohort)
train_patients = np.array(train_patients)
test_patients = np.array(test_patients)
important_features = []
if use_xgb_features:
    feature_importance_vector = pd.Series(np.zeros(rnaseq.shape[1]), index=rnaseq.columns)
    kfold_generator = StratifiedKFold(n_splits=fold_count, shuffle=True, random_state=456)
    kfold_samples = kfold_generator.split(train_patients, delta[train_patients])
    print("Selecting features")
    fold = 1
    for train_indices, test_indices in kfold_samples:
        print("Feature selection fold " + str(fold))
        train_indices_samples = train_patients[train_indices]
        x_train = rnaseq.loc[train_indices_samples]
        # remove genes with 0 mean absolute deviation
        # (DataFrame.mad was removed in pandas 2.0, so compute it explicitly)
        mean_abs_dev = (x_train - x_train.mean()).abs().mean()
        x_train = x_train.loc[:, mean_abs_dev != 0]
        y_train = y.loc[train_indices_samples]
        delta_train = delta.loc[train_indices_samples]
        random.seed(352)
        max_depth_grid = random.choices(range(1, 4), k=nrounds_feature_selection)
        learning_rate_grid = random.choices(learning_rate_set, k=nrounds_feature_selection)
        num_round_grid = random.choices([5, 10, 20], k=nrounds_feature_selection)
        colsample_bytree_grid = random.choices([1.0, 0.8, 0.65, 0.5], k=nrounds_feature_selection)
        subsample_grid = random.choices([1.0, 0.8, 0.65, 0.5], k=nrounds_feature_selection)
        for i in tqdm(range(nrounds_feature_selection)):
            max_depth = max_depth_grid[i]
            learning_rate = learning_rate_grid[i]
            num_round = num_round_grid[i]
            colsample_bytree = colsample_bytree_grid[i]
            subsample = subsample_grid[i]
            dtrain = xgb.DMatrix(x_train, label=y_train)
            param = {'max_depth': max_depth, 'learning_rate': learning_rate, 'colsample_bytree': colsample_bytree, 'subsample': subsample, 'objective': "survival:cox", 'eval_metric': "cox-nloglik", 'nthread': num_threads, 'verbosity': 0}
            state = xgb.train(param, dtrain, num_round)
            # accumulate gain-based importance across folds and random configurations
            feature_importances = pd.Series(state.get_score(importance_type='gain'))
            feature_importance_vector[feature_importances.index] += feature_importances
        fold += 1
    feature_importance_vector /= nrounds_feature_selection * fold_count
    feature_importance_vector = feature_importance_vector[feature_importance_vector > 0]
    num_important_features = min(num_select_features, len(feature_importance_vector))
    important_features = feature_importance_vector.sort_values(ascending=False).index[0:num_important_features]
    important_features_with_values = feature_importance_vector.sort_values(ascending=False)[0:num_important_features]
    feature_file = "%s/important_genes.json" % feature_path
    if single_cohort != '':
        feature_file = "%s/%s/important_genes.json" % (feature_path, single_cohort)
        if not os.path.exists("%s/%s" % (feature_path, single_cohort)):
            os.makedirs("%s/%s" % (feature_path, single_cohort))
    with open(feature_file, 'w') as output:
        json.dump({"important_genes": important_features_with_values.to_dict()}, output, indent=4)
else:
    important_features = rnaseq.columns
x_continuous = rnaseq[important_features]
concordance_index_dict = dict()
kfold_generator = StratifiedKFold(n_splits=fold_count, shuffle=True, random_state=123)
kfold_samples = kfold_generator.split(train_patients, delta[train_patients])
print("Tuning XGBoost parameters")
fold = 1
for train_indices, test_indices in kfold_samples:
    print("Parameter tuning fold " + str(fold))
    train_samples = train_patients[train_indices]
    test_samples = train_patients[test_indices]
    x_train = x_continuous.loc[train_samples]
    x_test = x_continuous.loc[test_samples]
    y_train = y[train_samples]
    delta_train = delta[train_samples]
    y_test = y[test_samples]
    delta_test = delta[test_samples]
    random.seed(123)
    max_depth_grid = random.choices(max_depth_set, k=nrounds_parameter_tuning)
    learning_rate_grid = random.choices(learning_rate_set, k=nrounds_parameter_tuning)
    num_round_grid = random.choices(num_round_set, k=nrounds_parameter_tuning)
    colsample_bytree_grid = random.choices(colsample_set, k=nrounds_parameter_tuning)
    alpha_grid = random.choices(alpha_set, k=nrounds_parameter_tuning)
    gamma_grid = random.choices(gamma_set, k=nrounds_parameter_tuning)
    min_child_weight_grid = random.choices(min_child_weight_set, k=nrounds_parameter_tuning)
    subsample_grid = random.choices(subsample_set, k=nrounds_parameter_tuning)
    lambda_grid = random.choices(lambda_set, k=nrounds_parameter_tuning)
    for i in tqdm(range(nrounds_parameter_tuning)):
        max_depth = max_depth_grid[i]
        learning_rate = learning_rate_grid[i]
        num_round = num_round_grid[i]
        colsample_bytree = colsample_bytree_grid[i]
        alpha = alpha_grid[i]
        gamma = gamma_grid[i]
        min_child_weight = min_child_weight_grid[i]
        subsample = subsample_grid[i]
        lambda_val = lambda_grid[i]
        dtrain = xgb.DMatrix(x_train, label=y_train)
        dtest = xgb.DMatrix(x_test)
        param = {'max_depth': max_depth, 'learning_rate': learning_rate, 'colsample_bytree': colsample_bytree, 'alpha': alpha, 'gamma': gamma, 'min_child_weight': min_child_weight, 'subsample': subsample, 'lambda': lambda_val, 'objective': "survival:cox", 'eval_metric': "cox-nloglik", 'nthread': num_threads, 'verbosity': 0}
        state = xgb.train(param, dtrain, num_round)
        prediction = state.predict(dtest)
        train_prediction = state.predict(dtrain)
        # negate the predicted hazard so that larger values mean longer survival
        concordance_index_dict.setdefault((num_round, max_depth, learning_rate, colsample_bytree, alpha, gamma, min_child_weight, subsample, lambda_val), []).append(path2surv_concordance_index(-prediction, abs(y_test), delta_test))
    fold += 1
concordance_index_dict_mean = {k: np.mean(v) for k, v in concordance_index_dict.items()}
params_max_CI = max(concordance_index_dict_mean, key=concordance_index_dict_mean.get)
num_round_max_CI = params_max_CI[0]
max_depth_max_CI = params_max_CI[1]
learning_rate_max_CI = params_max_CI[2]
colsample_bytree_max_CI = params_max_CI[3]
alpha_max_CI = params_max_CI[4]
gamma_max_CI = params_max_CI[5]
min_child_weight_max_CI = params_max_CI[6]
subsample_max_CI = params_max_CI[7]
lambda_max_CI = params_max_CI[8]
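# Retrain on all training cohorts with the best-scoring configuration and
# evaluate once on the held-out test cohorts.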
x_train = x_continuous.loc[train_patients]
x_test = x_continuous.loc[test_patients]
y_train = y.loc[train_patients]
y_test = y.loc[test_patients]
delta_train = delta.loc[train_patients]
delta_test = delta.loc[test_patients]
dtrain = xgb.DMatrix(x_train, label = y_train)
dtest = xgb.DMatrix(x_test, label = y_test)
param = {'max_depth': max_depth_max_CI, 'learning_rate': learning_rate_max_CI, 'colsample_bytree': colsample_bytree_max_CI, 'alpha': alpha_max_CI, 'gamma': gamma_max_CI, 'min_child_weight': min_child_weight_max_CI, 'subsample': subsample_max_CI, 'lambda': lambda_max_CI, 'objective': "survival:cox", 'eval_metric': "cox-nloglik", 'nthread': num_threads, 'verbosity': 0}
state = xgb.train(param, dtrain, num_round_max_CI)
test_prediction = state.predict(dtest)
result = dict()
result["num_round"] = num_round_max_CI
result["max_depth"] = max_depth_max_CI
result["learning_rate"] = learning_rate_max_CI
result["colsample_bytree"] = colsample_bytree_max_CI
result["alpha"] = alpha_max_CI
result["gamma"] = gamma_max_CI
result["min_child_weight"] = min_child_weight_max_CI
result["subsample"] = subsample_max_CI
result["lambda"] = lambda_max_CI
result["importance"] = state.get_score(importance_type='gain')
result["y_test"] = dict()
result["delta_test"] = dict()
result["y_predicted"] = dict()
result["test_patients"] = dict()
result["CI"] = dict()
result["CI_test"] = float(path2surv_concordance_index(-test_prediction, abs(y_test), delta_test))
for cohort in test_cohorts:
    test_cohort_patients = x_test.index[x_test.index.isin(cohort_dict[cohort])]
    dcohort = xgb.DMatrix(x_test.loc[test_cohort_patients])
    prediction = state.predict(dcohort)
    result["y_test"][cohort] = abs(y_test[test_cohort_patients]).tolist()
    result["delta_test"][cohort] = delta_test[test_cohort_patients].tolist()
    result["y_predicted"][cohort] = prediction.tolist()
    result["test_patients"][cohort] = y_test.loc[test_cohort_patients].index.tolist()
    result["CI"][cohort] = float(path2surv_concordance_index(-prediction, abs(y_test[test_cohort_patients]), delta_test[test_cohort_patients]))
    print(cohort + " CI: " + str(result["CI"][cohort]))
result_file = "%s/xgb_measure_CI_new_cohorts_result.json" % result_path
if single_cohort != '':
    result_file = "%s/%s/xgb_measure_CI_new_cohorts_result.json" % (result_path, single_cohort)
    if not os.path.exists("%s/%s" % (result_path, single_cohort)):
        os.makedirs("%s/%s" % (result_path, single_cohort))
with open(result_file, 'w') as output:
    json.dump(result, output, indent=4)