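"""Train and evaluate XGBoost Cox survival models on TCGA gene expression data.

For every replication the script (1) splits each cohort into training and test
patients, stratified by vital status, (2) subsamples the training set to
num_subsample patients spread as evenly as possible over the cohorts,
(3) selects the most important genes by cross-validated XGBoost gain
importance, (4) tunes the remaining hyperparameters by random search scored
with the concordance index, and (5) retrains on the training subsample and
writes per-cohort test results to JSON files.

Example invocation (hypothetical paths):
    python run_xgb_survival_replications.py -r results -f features -n 1000
"""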
import argparse
import json
import os
import random
import sys

import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.utils import resample
from tqdm import tqdm

def path2surv_concordance_index(y_predicted, y, delta):
    # concordance index as used in path2surv: the fraction of comparable patient
    # pairs whose predicted and observed survival times are ordered consistently
    # (delta == 1 marks censored patients)
    if np.any(np.isnan(y_predicted)) or np.any(np.isinf(y_predicted)):
        return 0
    y = np.asarray(y)
    y_predicted = np.asarray(y_predicted)
    event = (1 - np.asarray(delta)).astype(bool)
    # a pair (i, j) is comparable if both events were observed, or if i's event
    # occurred before j was censored
    comparable = (event[:, None] & event[None, :]) | (event[:, None] & ~event[None, :] & (y[:, None] - y[None, :] < 0))
    np.fill_diagonal(comparable, False)
    # a comparable pair is concordant if the predictions follow the observed order
    concordant = (y_predicted[:, None] - y_predicted[None, :]) * (y[:, None] - y[None, :]) > 0
    CI = np.sum(comparable & concordant) / np.sum(comparable)
    if np.isnan(CI):
        CI = 0
    return CI
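
# example (hypothetical values): y = [2, 5, 9], delta = [0, 0, 1] (patient 3
# censored) and y_predicted = [1, 4, 8] order every comparable pair consistently,
# so the concordance index is 1.0; callers below pass -prediction because
# survival:cox predicts hazard (higher hazard means shorter expected survival)
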
cohorts = ["TCGA-ACC", "TCGA-BLCA", "TCGA-BRCA", "TCGA-CESC", "TCGA-COAD",
"TCGA-ESCA", "TCGA-GBM", "TCGA-HNSC", "TCGA-KIRC", "TCGA-KIRP",
"TCGA-LAML", "TCGA-LGG", "TCGA-LIHC", "TCGA-LUAD", "TCGA-LUSC",
"TCGA-MESO", "TCGA-OV", "TCGA-PAAD", "TCGA-READ", "TCGA-SARC",
"TCGA-SKCM", "TCGA-STAD", "TCGA-UCEC", "TCGA-UCS", "TCGA-UVM"]
parser = argparse.ArgumentParser(prog='run_xgb_survival_replications.py')
parser.add_argument('-r', '--result', type=str, help='path to the result directory', required=True)
parser.add_argument('-f', '--features', type=str, help='path to the directory where the selected features should be written to', required=True)
parser.add_argument('-n', '--num_subsample', type=int, help='number of samples that should be used in training set', required=True)
parser.add_argument('-s', '--replication_start', type=int, default=1, help='first replication to run (default: 1)')
parser.add_argument('-e', '--replication_end', type=int, default=10, help='last replication to run (default: 10)')
parser.add_argument('-c', '--cohort', type=str, default='', help='single cohort to train the model on')
parser.add_argument('-t', '--threads', type=int, default=64, help='number of threads used by XGBoost (default: 64)')
parser.add_argument('-d', '--seed', type=int, default=135, help='seed used to compute the random seed for splitting the data into training and test sets (default: 135)')
args = parser.parse_args()
result_path = args.result
feature_path = args.features
num_subsample = args.num_subsample
replication_start = args.replication_start
replication_end = args.replication_end
single_cohort = args.cohort
num_threads = args.threads
seed = args.seed
if replication_start < 1:
    print("The first replication must be at least 1")
    sys.exit(3)
if replication_end < replication_start:
    print("The last replication must be greater than or equal to the first replication")
    sys.exit(4)
if num_subsample < 1 or num_subsample > 6419:
    print("Invalid number of subsamples")
    sys.exit(5)
if not os.path.exists(result_path):
    os.makedirs(result_path)
if not os.path.exists(feature_path):
    os.makedirs(feature_path)
if single_cohort != '':
    cohorts = [single_cohort]
    if not os.path.exists("./TCGA_data/%s/clinical.tsv" % single_cohort):
        print("The selected cancer cohort is not in the available TCGA datasets")
        sys.exit(5)
use_xgb_features = True
num_select_features = 500
num_round_set = [50, 100, 250, 500, 1000, 1500, 2000, 2500]
max_depth_set = list(range(2, 15 + 1))
learning_rate_set = [0.01, 0.025, 0.05, 0.075, 0.1]
colsample_set = [0.5, 0.6, 0.7, 0.8, 0.9, 1.0]
alpha_set = [0, 0.1, 0.2, 0.5, 1, 2]
gamma_set = [0, 0.1, 0.2, 0.5, 1, 2]
min_child_weight_set = [1, 2, 3, 5, 7, 10]
subsample_set = [0.5, 0.65, 0.8, 0.9, 1]
lambda_set = [0.1, 0.2, 0.5, 0.8, 1, 2, 3, 5, 10]
nrounds_feature_selection = 20
nrounds_parameter_tuning = 500
fold_count = 4
train_ratio = 0.8
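# each parameter tuning fold evaluates nrounds_parameter_tuning random draws from
# the grids above, i.e. 500 of the roughly 33 million possible combinations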
# read data
rnaseq = pd.DataFrame()
clinical = pd.DataFrame()
cohort_dict = dict()
for cohort in cohorts:
    print("Reading data for cohort %s" % cohort)
    mrna_path = "./TCGA_data/%s/fpkm_table.csv" % cohort
    mrna = pd.read_csv(mrna_path, index_col=0).T
    mrna = mrna.dropna(axis=0, how="all")
    clinical_file_path = "./TCGA_data/%s/clinical.tsv" % cohort
    clinical_file = pd.read_csv(clinical_file_path, na_values=["NA", "--", "'--"], sep='\t')
    index_name = "submitter_id" if "submitter_id" in clinical_file.columns else "case_submitter_id"
    clinical_file = clinical_file.set_index(index_name)
    clinical_file = clinical_file[["age_at_index", "days_to_death", "gender", "vital_status", "days_to_last_follow_up"]]
    clinical_file = clinical_file.dropna(axis=0, subset=["vital_status"])
    clinical_file = clinical_file.dropna(axis=0, how="all", subset=["days_to_death", "days_to_last_follow_up"])
    clinical_file = clinical_file.dropna(axis=0, how="any", subset=["age_at_index", "gender"])
    clinical_file = clinical_file[((clinical_file.vital_status == "Dead") & (clinical_file.days_to_death > 0)) | ((clinical_file.vital_status == "Alive") & (clinical_file.days_to_last_follow_up > 0))]
    clinical_unique = clinical_file.loc[~clinical_file.index.duplicated(keep='first')]
    # only use patients that have all features
    common_patients = mrna.index.intersection(clinical_unique.index)
    clinical_unique = clinical_unique.loc[common_patients]
    mrna = mrna.loc[common_patients]
    print("Found %d patients for cohort %s" % (len(common_patients), cohort))
    # save cohort with clinical information
    clinical_unique = clinical_unique.assign(cohort=cohort)
    # DataFrame.append was removed in pandas 2.0, use concat instead
    rnaseq = pd.concat([rnaseq, mrna])
    clinical = pd.concat([clinical, clinical_unique])
    cohort_dict[cohort] = common_patients
num_rna_features = rnaseq.shape[1]
# only use specified cohorts for training and testing
cohort_patients = [p for c in cohorts for p in cohort_dict[c]]
rnaseq = rnaseq.loc[cohort_patients]
delta = (clinical["vital_status"] == "Alive").astype(int)
y = clinical["days_to_death"].combine_first(clinical["days_to_last_follow_up"]).astype(int)
# encode censored patients with negative survival time (as expected by xgboost)
y = y.multiply([1 if d == 0 else -1 for d in delta])
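# e.g. a patient who died after 400 days gets label 400, while a patient still
# alive at a 400-day follow-up gets label -400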
dead_patients = delta.index[delta == 0]
alive_patients = delta.index[delta == 1]
for replication in range(replication_start, replication_end + 1):
    random_state = seed * replication
    train_dead_patients = []
    train_alive_patients = []
    test_dead_patients = []
    test_alive_patients = []
    for cohort in cohorts:
        dead_patients_cohort = np.intersect1d(dead_patients, cohort_dict[cohort])
        alive_patients_cohort = np.intersect1d(alive_patients, cohort_dict[cohort])
        train_dead_patients_cohort, test_dead_patients_cohort = train_test_split(dead_patients_cohort, test_size=1-train_ratio, random_state=random_state)
        train_alive_patients_cohort, test_alive_patients_cohort = train_test_split(alive_patients_cohort, test_size=1-train_ratio, random_state=random_state)
        train_dead_patients.extend(train_dead_patients_cohort)
        train_alive_patients.extend(train_alive_patients_cohort)
        test_dead_patients.extend(test_dead_patients_cohort)
        test_alive_patients.extend(test_alive_patients_cohort)
    train_patients = np.concatenate([train_dead_patients, train_alive_patients])
    test_patients = np.concatenate([test_dead_patients, test_alive_patients])
    # for the purpose of estimating the effect of training set size, select a random subset of patients from the training data (stratified by censoring status and cohort to ensure every group is represented in the same proportion as in the original data)
    # censoring_criteria = pd.Series(delta[clinical.index].astype(str) + clinical['cohort'], index=clinical.index)
    # train_patients = resample(train_patients, replace=False, n_samples=num_subsample, random_state=random_state, stratify=censoring_criteria[train_patients])
    # for the purpose of estimating the effect of training set size, select a random subset of patients from the training data (stratified by censoring status, with the same number of patients from every cohort to ensure every cohort is represented equally)
    print("Subsampling %d patients" % num_subsample)
    subsampled_train_patients = []
    # np.array_split distributes num_subsample as evenly as possible over the cohorts
    # (e.g. 110 patients over 25 cohorts gives ten cohorts 5 patients and fifteen cohorts 4)
    subsamples_per_cohort = list(map(len, np.array_split(range(num_subsample), len(cohorts))))
    random.seed(random_state)
    random.shuffle(subsamples_per_cohort)
    for i, cohort in enumerate(cohorts):
        train_cohort_patients = np.intersect1d(train_patients, cohort_dict[cohort])
        if len(train_cohort_patients) < subsamples_per_cohort[i]:
            print("Can only subsample %d patients for cohort %s" % (len(train_cohort_patients), cohort))
            subsampled_train_patients.extend(resample(train_cohort_patients, replace=False, n_samples=len(train_cohort_patients), random_state=random_state, stratify=delta[train_cohort_patients]))
        else:
            subsampled_train_patients.extend(resample(train_cohort_patients, replace=False, n_samples=subsamples_per_cohort[i], random_state=random_state, stratify=delta[train_cohort_patients]))
    train_patients = np.array(subsampled_train_patients)
    print("Training on %d patients" % len(train_patients))
    important_features = []
    if use_xgb_features:
        feature_importance_vector = pd.Series(np.zeros(rnaseq.shape[1]), index=rnaseq.columns)
        kfold_generator = StratifiedKFold(n_splits=fold_count, shuffle=True, random_state=456)
        kfold_samples = kfold_generator.split(train_patients, delta[train_patients])
        print("Selecting features")
        fold = 1
        for train_indices, test_indices in kfold_samples:
            print("Feature selection fold " + str(fold))
            train_indices_samples = train_patients[train_indices]
            x_train = rnaseq.loc[train_indices_samples]
            # remove genes with 0 mean absolute deviation (DataFrame.mad was removed in pandas 2.0)
            mean_abs_deviation = (x_train - x_train.mean()).abs().mean()
            x_train = x_train.loc[:, mean_abs_deviation != 0]
            y_train = y.loc[train_indices_samples]
            delta_train = delta.loc[train_indices_samples]
            random.seed(352)
            max_depth_grid = random.choices(range(1, 4), k=nrounds_feature_selection)
            learning_rate_grid = random.choices(learning_rate_set, k=nrounds_feature_selection)
            num_round_grid = random.choices([5, 10, 20], k=nrounds_feature_selection)
            colsample_bytree_grid = random.choices([1.0, 0.8, 0.65, 0.5], k=nrounds_feature_selection)
            subsample_grid = random.choices([1.0, 0.8, 0.65, 0.5], k=nrounds_feature_selection)
            for i in tqdm(range(nrounds_feature_selection)):
                max_depth = max_depth_grid[i]
                learning_rate = learning_rate_grid[i]
                num_round = num_round_grid[i]
                colsample_bytree = colsample_bytree_grid[i]
                subsample = subsample_grid[i]
                dtrain = xgb.DMatrix(x_train, label=y_train)
                param = {'max_depth': max_depth, 'learning_rate': learning_rate, 'colsample_bytree': colsample_bytree, 'subsample': subsample, 'objective': "survival:cox", 'eval_metric': "cox-nloglik", 'nthread': num_threads, 'verbosity': 0}
                state = xgb.train(param, dtrain, num_round)
                feature_importances = pd.Series(state.get_score(importance_type='gain'))
                feature_importance_vector[feature_importances.index] += feature_importances
            fold += 1
        feature_importance_vector /= nrounds_feature_selection * fold_count
        feature_importance_vector = feature_importance_vector[feature_importance_vector > 0]
        num_important_features = min(num_select_features, len(feature_importance_vector))
        important_features = feature_importance_vector.sort_values(ascending=False).index[0:num_important_features]
        important_features_with_values = feature_importance_vector.sort_values(ascending=False)[0:num_important_features]
        feature_file = "%s/important_genes_replication_%d.json" % (feature_path, replication)
        if single_cohort != '':
            feature_file = "%s/%s/important_genes_replication_%d.json" % (feature_path, single_cohort, replication)
            if not os.path.exists("%s/%s" % (feature_path, single_cohort)):
                os.makedirs("%s/%s" % (feature_path, single_cohort))
        with open(feature_file, 'w') as output:
            json.dump({"important_genes": important_features_with_values.to_dict()}, output, indent=4)
    else:
        important_features = rnaseq.columns
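    # stage 2: tune the remaining XGBoost hyperparameters by random search,
    # scored by the mean concordance index over the CV folds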
    x_continuous = rnaseq[important_features]
    concordance_index_dict = dict()
    kfold_generator = StratifiedKFold(n_splits=fold_count, shuffle=True, random_state=123)
    kfold_samples = kfold_generator.split(train_patients, delta[train_patients])
    print("Tuning XGBoost parameters")
    fold = 1
    for train_indices, test_indices in kfold_samples:
        print("Parameter tuning fold " + str(fold))
        train_samples = train_patients[train_indices]
        test_samples = train_patients[test_indices]
        x_train = x_continuous.loc[train_samples]
        x_test = x_continuous.loc[test_samples]
        y_train = y[train_samples]
        delta_train = delta[train_samples]
        y_test = y[test_samples]
        delta_test = delta[test_samples]
        random.seed(123)
        max_depth_grid = random.choices(max_depth_set, k=nrounds_parameter_tuning)
        learning_rate_grid = random.choices(learning_rate_set, k=nrounds_parameter_tuning)
        num_round_grid = random.choices(num_round_set, k=nrounds_parameter_tuning)
        colsample_bytree_grid = random.choices(colsample_set, k=nrounds_parameter_tuning)
        alpha_grid = random.choices(alpha_set, k=nrounds_parameter_tuning)
        gamma_grid = random.choices(gamma_set, k=nrounds_parameter_tuning)
        min_child_weight_grid = random.choices(min_child_weight_set, k=nrounds_parameter_tuning)
        subsample_grid = random.choices(subsample_set, k=nrounds_parameter_tuning)
        lambda_grid = random.choices(lambda_set, k=nrounds_parameter_tuning)
        for i in tqdm(range(nrounds_parameter_tuning)):
            max_depth = max_depth_grid[i]
            learning_rate = learning_rate_grid[i]
            num_round = num_round_grid[i]
            colsample_bytree = colsample_bytree_grid[i]
            alpha = alpha_grid[i]
            gamma = gamma_grid[i]
            min_child_weight = min_child_weight_grid[i]
            subsample = subsample_grid[i]
            lambda_val = lambda_grid[i]
            dtrain = xgb.DMatrix(x_train, label=y_train)
            dtest = xgb.DMatrix(x_test)
            param = {'max_depth': max_depth, 'learning_rate': learning_rate, 'colsample_bytree': colsample_bytree, 'alpha': alpha, 'gamma': gamma, 'min_child_weight': min_child_weight, 'subsample': subsample, 'lambda': lambda_val, 'objective': "survival:cox", 'eval_metric': "cox-nloglik", 'nthread': num_threads, 'verbosity': 0}
            state = xgb.train(param, dtrain, num_round)
            prediction = state.predict(dtest)
            # survival:cox predicts hazard, so negate before scoring against survival times
            concordance_index_dict.setdefault((num_round, max_depth, learning_rate, colsample_bytree, alpha, gamma, min_child_weight, subsample, lambda_val), []).append(path2surv_concordance_index(-prediction, abs(y_test), delta_test))
        fold += 1
    concordance_index_dict_mean = {k: np.mean(v) for k, v in concordance_index_dict.items()}
    params_max_CI = max(concordance_index_dict_mean, key=concordance_index_dict_mean.get)
    num_round_max_CI, max_depth_max_CI, learning_rate_max_CI, colsample_bytree_max_CI, alpha_max_CI, gamma_max_CI, min_child_weight_max_CI, subsample_max_CI, lambda_max_CI = params_max_CI
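    # retrain on the full training subsample with the best configuration found
    # by the random search, then evaluate on the held-out test patients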
    x_train = x_continuous.loc[train_patients]
    x_test = x_continuous.loc[test_patients]
    y_train = y.loc[train_patients]
    y_test = y.loc[test_patients]
    delta_train = delta.loc[train_patients]
    delta_test = delta.loc[test_patients]
    dtrain = xgb.DMatrix(x_train, label=y_train)
    dtest = xgb.DMatrix(x_test, label=y_test)
    param = {'max_depth': max_depth_max_CI, 'learning_rate': learning_rate_max_CI, 'colsample_bytree': colsample_bytree_max_CI, 'alpha': alpha_max_CI, 'gamma': gamma_max_CI, 'min_child_weight': min_child_weight_max_CI, 'subsample': subsample_max_CI, 'lambda': lambda_max_CI, 'objective': "survival:cox", 'eval_metric': "cox-nloglik", 'nthread': num_threads, 'verbosity': 0}
    state = xgb.train(param, dtrain, num_round_max_CI)
    test_prediction = state.predict(dtest)
    result = dict()
    result["num_round"] = num_round_max_CI
    result["max_depth"] = max_depth_max_CI
    result["learning_rate"] = learning_rate_max_CI
    result["colsample_bytree"] = colsample_bytree_max_CI
    result["alpha"] = alpha_max_CI
    result["gamma"] = gamma_max_CI
    result["min_child_weight"] = min_child_weight_max_CI
    result["subsample"] = subsample_max_CI
    result["lambda"] = lambda_max_CI
    result["importance"] = state.get_score(importance_type='gain')
    result["y_test"] = dict()
    result["delta_test"] = dict()
    result["y_predicted"] = dict()
    result["test_patients"] = dict()
    result["CI"] = dict()
    result["CI_test"] = float(path2surv_concordance_index(-test_prediction, abs(y_test), delta_test))
    for cohort in cohorts:
        test_cohort_patients = x_test.index[x_test.index.isin(cohort_dict[cohort])]
        dcohort = xgb.DMatrix(x_test.loc[test_cohort_patients])
        prediction = state.predict(dcohort)
        result["y_test"][cohort] = abs(y_test[test_cohort_patients]).tolist()
        result["delta_test"][cohort] = delta_test[test_cohort_patients].tolist()
        result["y_predicted"][cohort] = prediction.tolist()
        result["test_patients"][cohort] = y_test.loc[test_cohort_patients].index.tolist()
        result["CI"][cohort] = float(path2surv_concordance_index(-prediction, abs(y_test[test_cohort_patients]), delta_test[test_cohort_patients]))
        print(cohort + " CI: " + str(result["CI"][cohort]))
    result_file = "%s/xgb_measure_CI_replication_%d_result.json" % (result_path, replication)
    if single_cohort != '':
        result_file = "%s/%s/xgb_measure_CI_replication_%d_result.json" % (result_path, single_cohort, replication)
        if not os.path.exists("%s/%s" % (result_path, single_cohort)):
            os.makedirs("%s/%s" % (result_path, single_cohort))
    with open(result_file, 'w') as output:
        json.dump(result, output, indent=4)