import argparse
import json
import os
import random
import sys

import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import StratifiedKFold, train_test_split
from tqdm import tqdm
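
# Concordance index (CI) in the path2surv formulation used throughout this
# script: a pair of patients (i, j) is comparable when both deaths were
# observed, or when i's observed death precedes j's censoring time. A
# comparable pair is concordant when the predicted scores are ordered the
# same way as the observed survival times. CI is the fraction of comparable
# pairs (excluding self-pairs) that are concordant.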
def path2surv_concordance_index(y_predicted, y, delta):
    # Invalid predictions (NaN or infinity) get the worst possible score.
    if np.any(np.isnan(y_predicted)) or np.any(np.isinf(y_predicted)):
        return 0
    event = (1 - np.array(delta)).astype(bool)  # True where death was observed
    censored = np.array(delta).astype(bool)     # True where follow-up was censored
    y_diff = np.subtract.outer(np.array(y), np.array(y))
    prediction_diff = np.subtract.outer(np.array(y_predicted), np.array(y_predicted))
    # Comparable pairs: both deaths observed, or i's death observed before j's censoring time.
    Delta = (event[:, None] & event[None, :]) | (event[:, None] & censored[None, :] & (y_diff < 0))
    # Concordant pairs: predictions and survival times ordered the same way.
    Diff = prediction_diff * y_diff > 0
    # Exclude the diagonal (self-pairs) from both counts.
    CI = (np.sum(Delta & Diff) - np.sum(np.diag(Delta & Diff))) / (np.sum(Delta) - np.sum(np.diag(Delta)))
    if np.isnan(CI):
        CI = 0
    return CI
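
# Toy illustration (hypothetical values): with y = [5, 3, 8] and
# delta = [0, 0, 1] (two observed deaths, one patient censored at time 8),
# the comparable pairs are (0, 1), (1, 0), (0, 2), and (1, 2); a prediction
# vector ordered like the survival times themselves, e.g.
# path2surv_concordance_index([5, 3, 8], [5, 3, 8], [0, 0, 1]), gives CI = 1.0.
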
cohorts = ["TCGA-ACC", "TCGA-BLCA", "TCGA-BRCA", "TCGA-CESC", "TCGA-COAD",
"TCGA-ESCA", "TCGA-GBM", "TCGA-HNSC", "TCGA-KIRC", "TCGA-KIRP",
"TCGA-LAML", "TCGA-LGG", "TCGA-LIHC", "TCGA-LUAD", "TCGA-LUSC",
"TCGA-MESO", "TCGA-OV", "TCGA-PAAD", "TCGA-READ", "TCGA-SARC",
"TCGA-SKCM", "TCGA-STAD", "TCGA-UCEC", "TCGA-UCS", "TCGA-UVM"]
parser = argparse.ArgumentParser(prog='run_xgb_survial_replications.py')
parser.add_argument('-r', '--result', type=str, help='path to the result directory', required=True)
parser.add_argument('-f', '--features', type=str, help='path to the directory where the selected features should be written to', required=True)
parser.add_argument('-s', '--replication_start', type=int, default=1, help='first replication to run (default: 1)')
parser.add_argument('-e', '--replication_end', type=int, default=10, help='last replication to run (default: 10)')
parser.add_argument('-c', '--cohort', type=str, default='', help='single cohort to train the model on')
parser.add_argument('-t', '--threads', type=int, default=64, help='number of threads used by XGBoost (default: 64)')
parser.add_argument('-d', '--seed', type=int, default=135, help='seed used to compute the random seed for splitting the data into training and test sets (default: 135)')
args = parser.parse_args()
result_path = args.result
feature_path = args.features
replication_start = args.replication_start
replication_end = args.replication_end
single_cohort = args.cohort
num_threads = args.threads
seed = args.seed
if replication_start < 1:
    print("The first replication must be at least 1")
    sys.exit(3)
if replication_end < replication_start:
    print("The last replication must be greater than or equal to the first replication")
    sys.exit(4)
if not os.path.exists(result_path):
    os.makedirs(result_path)
if not os.path.exists(feature_path):
    os.makedirs(feature_path)
if single_cohort != '':
    cohorts = [single_cohort]
    if not os.path.exists("./TCGA_data/%s/clinical.tsv" % single_cohort):
        print("The selected cancer cohort is not in the available TCGA datasets")
        sys.exit(5)
use_xgb_features = True
num_select_features = 500
num_round_set = [50, 100, 250, 500, 1000, 1500, 2000, 2500]
max_depth_set = list(range(2, 15 + 1))
learning_rate_set = [0.01, 0.025, 0.05, 0.075, 0.1]
colsample_set = [0.5, 0.6, 0.7, 0.8, 0.9, 1.0]
alpha_set = [0, 0.1, 0.2, 0.5, 1, 2]
gamma_set = [0, 0.1, 0.2, 0.5, 1, 2]
min_child_weight_set = [1, 2, 3, 5, 7, 10]
subsample_set = [0.5, 0.65, 0.8, 0.9, 1]
lambda_set = [0.1, 0.2, 0.5, 0.8, 1, 2, 3, 5, 10]
nrounds_feature_selection = 20
nrounds_parameter_tuning = 500
fold_count = 4
train_ratio = 0.8
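
# Pipeline per replication: (1) stratified 80/20 train/test split within each
# cohort, (2) gain-based feature selection with small XGBoost models across
# fold_count cross-validation folds, (3) random search over the hyperparameter
# sets above (nrounds_parameter_tuning draws, scored by mean cross-validated
# concordance index), and (4) a final survival:cox model trained on the full
# training set with the best configuration.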
# read data
rnaseq = pd.DataFrame()
clinical = pd.DataFrame()
cohort_dict = dict()
for cohort in cohorts:
    print("Reading data for cohort %s" % cohort)
    mrna_path = "./TCGA_data/%s/fpkm_table.csv" % cohort
    mrna = pd.read_csv(mrna_path, index_col=0).T
    mrna = mrna.dropna(axis=0, how="all")
    clinical_file_path = "./TCGA_data/%s/clinical.tsv" % cohort
    clinical_file = pd.read_csv(clinical_file_path, na_values=["NA", "--", "'--"], sep='\t')
    index_name = "submitter_id" if "submitter_id" in clinical_file.columns else "case_submitter_id"
    clinical_file = clinical_file.set_index(index_name)
    clinical_file = clinical_file[["age_at_index", "days_to_death", "gender", "vital_status", "days_to_last_follow_up"]]
    clinical_file = clinical_file.dropna(axis=0, subset=["vital_status"])
    clinical_file = clinical_file.dropna(axis=0, how="all", subset=["days_to_death", "days_to_last_follow_up"])
    clinical_file = clinical_file.dropna(axis=0, how="any", subset=["age_at_index", "gender"])
    clinical_file = clinical_file[((clinical_file.vital_status == "Dead") & (clinical_file.days_to_death > 0)) | ((clinical_file.vital_status == "Alive") & (clinical_file.days_to_last_follow_up > 0))]
    clinical_unique = clinical_file.loc[~clinical_file.index.duplicated(keep='first')]
    # only use patients that have all features
    common_patients = mrna.index.intersection(clinical_unique.index)
    clinical_unique = clinical_unique.loc[common_patients]
    mrna = mrna.loc[common_patients]
    print("Found %d patients for cohort %s" % (len(common_patients), cohort))
    # DataFrame.append was removed in pandas 2.0; pd.concat is the equivalent
    rnaseq = pd.concat([rnaseq, mrna])
    clinical = pd.concat([clinical, clinical_unique])
    cohort_dict[cohort] = common_patients
num_rna_features = rnaseq.shape[1]
# only use specified cohorts for training and testing
cohort_patients = [p for c in cohorts for p in cohort_dict[c]]
rnaseq = rnaseq.loc[cohort_patients]
delta = (clinical["vital_status"] == "Alive").astype(int)
y = clinical["days_to_death"].combine_first(clinical["days_to_last_follow_up"]).astype(int)
# encode censored patients with negative survival time (as expected by xgboost)
y = y.multiply([1 if d == 0 else -1 for d in delta])
dead_patients = delta.index[delta == 0]
alive_patients = delta.index[delta == 1]
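
# Example of the encoding above (hypothetical values): a patient who died on
# day 100 keeps the label 100, while a patient whose last follow-up was on
# day 100 (censored, delta = 1) becomes -100.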
for replication in range(replication_start, replication_end + 1):
    random_state = seed * replication
    train_dead_patients = []
    train_alive_patients = []
    test_dead_patients = []
    test_alive_patients = []
    for cohort in cohorts:
        dead_patients_cohort = np.intersect1d(dead_patients, cohort_dict[cohort])
        alive_patients_cohort = np.intersect1d(alive_patients, cohort_dict[cohort])
        train_dead_patients_cohort, test_dead_patients_cohort = train_test_split(dead_patients_cohort, test_size=1 - train_ratio, random_state=random_state)
        train_alive_patients_cohort, test_alive_patients_cohort = train_test_split(alive_patients_cohort, test_size=1 - train_ratio, random_state=random_state)
        train_dead_patients.extend(train_dead_patients_cohort)
        train_alive_patients.extend(train_alive_patients_cohort)
        test_dead_patients.extend(test_dead_patients_cohort)
        test_alive_patients.extend(test_alive_patients_cohort)
    train_patients = np.concatenate([train_dead_patients, train_alive_patients])
    test_patients = np.concatenate([test_dead_patients, test_alive_patients])
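    # Splitting dead and alive patients separately within each cohort keeps the
    # event rate and cohort composition of the test set close to the training
    # set (a stratified 80/20 split per cohort and vital status).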
    important_features = []
    if use_xgb_features:
        feature_importance_vector = pd.Series(np.zeros(rnaseq.shape[1]), index=rnaseq.columns)
        kfold_generator = StratifiedKFold(n_splits=fold_count, shuffle=True, random_state=456)
        kfold_samples = kfold_generator.split(train_patients, delta[train_patients])
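        # Feature selection accumulates gain-based importance over many small
        # models (depth 1-3, 5-20 boosting rounds) trained on each fold; the
        # genes with the highest accumulated gain are kept below.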
print("Selecting features")
fold = 1
for train_indices, test_indices in kfold_samples:
print("Feature selection fold " + str(fold))
train_indices_samples = train_patients[train_indices]
x_train = rnaseq.loc[train_indices_samples]
# remove genes with 0 mean absolute deviation
x_train = x_train.loc[:,x_train.mad() != 0]
y_train = y.loc[train_indices_samples]
delta_train = delta.loc[train_indices_samples]
random.seed(352)
max_depth_grid = random.choices(range(1,4), k=nrounds_feature_selection)
learning_rate_grid = random.choices(learning_rate_set, k=nrounds_feature_selection)
num_round_grid = random.choices([5,10,20], k=nrounds_feature_selection)
colsample_bytree_grid = random.choices([1.0,0.8,0.65,0.5], k=nrounds_feature_selection)
subsample_grid = random.choices([1.0,0.8,0.65,0.5], k=nrounds_feature_selection)
            for i in tqdm(range(nrounds_feature_selection)):
                max_depth = max_depth_grid[i]
                learning_rate = learning_rate_grid[i]
                num_round = num_round_grid[i]
                colsample_bytree = colsample_bytree_grid[i]
                subsample = subsample_grid[i]
                dtrain = xgb.DMatrix(x_train, label=y_train)
                param = {'max_depth': max_depth, 'learning_rate': learning_rate, 'colsample_bytree': colsample_bytree, 'subsample': subsample, 'objective': "survival:cox", 'eval_metric': "cox-nloglik", 'nthread': num_threads, 'verbosity': 0}
                state = xgb.train(param, dtrain, num_round)
                feature_importances = pd.Series(state.get_score(importance_type='gain'))
                feature_importance_vector[feature_importances.index] += feature_importances
            fold += 1
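        # Average the accumulated gain over all fold/configuration pairs and
        # keep the (up to) num_select_features genes with the highest scores.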
        feature_importance_vector /= nrounds_feature_selection * fold_count
        feature_importance_vector = feature_importance_vector[feature_importance_vector > 0]
        num_important_features = min(num_select_features, len(feature_importance_vector))
        important_features = feature_importance_vector.sort_values(ascending=False).index[0:num_important_features]
        important_features_with_values = feature_importance_vector.sort_values(ascending=False)[0:num_important_features]
        feature_file = "%s/important_genes_replication_%d.json" % (feature_path, replication)
        if single_cohort != '':
            feature_file = "%s/%s/important_genes_replication_%d.json" % (feature_path, single_cohort, replication)
            if not os.path.exists("%s/%s" % (feature_path, single_cohort)):
                os.makedirs("%s/%s" % (feature_path, single_cohort))
        with open(feature_file, 'w') as output:
            json.dump({"important_genes": important_features_with_values.to_dict()}, output, indent=4)
    else:
        important_features = rnaseq.columns
    x_continuous = rnaseq[important_features]
    concordance_index_dict = dict()
    kfold_generator = StratifiedKFold(n_splits=fold_count, shuffle=True, random_state=123)
    kfold_samples = kfold_generator.split(train_patients, delta[train_patients])
    print("Tuning XGBoost parameters")
    fold = 1
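    # Random search: the same nrounds_parameter_tuning hyperparameter draws
    # (random.seed(123) below makes them identical across folds) are evaluated
    # on every fold; each configuration is scored by the mean concordance
    # index of its out-of-fold predictions.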
    for train_indices, test_indices in kfold_samples:
        print("Parameter tuning fold " + str(fold))
        train_samples = train_patients[train_indices]
        test_samples = train_patients[test_indices]
        x_train = x_continuous.loc[train_samples]
        x_test = x_continuous.loc[test_samples]
        y_train = y[train_samples]
        delta_train = delta[train_samples]
        y_test = y[test_samples]
        delta_test = delta[test_samples]
        random.seed(123)
        max_depth_grid = random.choices(max_depth_set, k=nrounds_parameter_tuning)
        learning_rate_grid = random.choices(learning_rate_set, k=nrounds_parameter_tuning)
        num_round_grid = random.choices(num_round_set, k=nrounds_parameter_tuning)
        colsample_bytree_grid = random.choices(colsample_set, k=nrounds_parameter_tuning)
        alpha_grid = random.choices(alpha_set, k=nrounds_parameter_tuning)
        gamma_grid = random.choices(gamma_set, k=nrounds_parameter_tuning)
        min_child_weight_grid = random.choices(min_child_weight_set, k=nrounds_parameter_tuning)
        subsample_grid = random.choices(subsample_set, k=nrounds_parameter_tuning)
        lambda_grid = random.choices(lambda_set, k=nrounds_parameter_tuning)
        for i in tqdm(range(nrounds_parameter_tuning)):
            max_depth = max_depth_grid[i]
            learning_rate = learning_rate_grid[i]
            num_round = num_round_grid[i]
            colsample_bytree = colsample_bytree_grid[i]
            alpha = alpha_grid[i]
            gamma = gamma_grid[i]
            min_child_weight = min_child_weight_grid[i]
            subsample = subsample_grid[i]
            lambda_val = lambda_grid[i]
            dtrain = xgb.DMatrix(x_train, label=y_train)
            dtest = xgb.DMatrix(x_test)
            param = {'max_depth': max_depth, 'learning_rate': learning_rate, 'colsample_bytree': colsample_bytree, 'alpha': alpha, 'gamma': gamma, 'min_child_weight': min_child_weight, 'subsample': subsample, 'lambda': lambda_val, 'objective': "survival:cox", 'eval_metric': "cox-nloglik", 'nthread': num_threads, 'verbosity': 0}
            state = xgb.train(param, dtrain, num_round)
            # survival:cox predicts hazard ratios, so higher predictions mean
            # shorter survival; negate before computing the concordance index
            prediction = state.predict(dtest)
            concordance_index_dict.setdefault((num_round, max_depth, learning_rate, colsample_bytree, alpha, gamma, min_child_weight, subsample, lambda_val), []).append(path2surv_concordance_index(-prediction, abs(y_test), delta_test))
        fold += 1
    concordance_index_dict_mean = {k: np.mean(v) for k, v in concordance_index_dict.items()}
    params_max_CI = max(concordance_index_dict_mean, key=concordance_index_dict_mean.get)
    (num_round_max_CI, max_depth_max_CI, learning_rate_max_CI, colsample_bytree_max_CI,
     alpha_max_CI, gamma_max_CI, min_child_weight_max_CI, subsample_max_CI,
     lambda_max_CI) = params_max_CI
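    # Retrain on the full training set with the configuration that achieved the
    # highest mean cross-validated concordance index, then evaluate on the
    # held-out test patients.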
    x_train = x_continuous.loc[train_patients]
    x_test = x_continuous.loc[test_patients]
    y_train = y.loc[train_patients]
    y_test = y.loc[test_patients]
    delta_train = delta.loc[train_patients]
    delta_test = delta.loc[test_patients]
    dtrain = xgb.DMatrix(x_train, label=y_train)
    dtest = xgb.DMatrix(x_test, label=y_test)
    param = {'max_depth': max_depth_max_CI, 'learning_rate': learning_rate_max_CI, 'colsample_bytree': colsample_bytree_max_CI, 'alpha': alpha_max_CI, 'gamma': gamma_max_CI, 'min_child_weight': min_child_weight_max_CI, 'subsample': subsample_max_CI, 'lambda': lambda_max_CI, 'objective': "survival:cox", 'eval_metric': "cox-nloglik", 'nthread': num_threads, 'verbosity': 0}
    state = xgb.train(param, dtrain, num_round_max_CI)
    test_prediction = state.predict(dtest)
    result = dict()
    result["num_round"] = num_round_max_CI
    result["max_depth"] = max_depth_max_CI
    result["learning_rate"] = learning_rate_max_CI
    result["colsample_bytree"] = colsample_bytree_max_CI
    result["alpha"] = alpha_max_CI
    result["gamma"] = gamma_max_CI
    result["min_child_weight"] = min_child_weight_max_CI
    result["subsample"] = subsample_max_CI
    result["lambda"] = lambda_max_CI
    result["importance"] = state.get_score(importance_type='gain')
    result["y_test"] = dict()
    result["delta_test"] = dict()
    result["y_predicted"] = dict()
    result["test_patients"] = dict()
    result["CI"] = dict()
    result["CI_test"] = float(path2surv_concordance_index(-test_prediction, abs(y_test), delta_test))
    for cohort in cohorts:
        test_cohort_patients = x_test.index[x_test.index.isin(cohort_dict[cohort])]
        dcohort = xgb.DMatrix(x_test.loc[test_cohort_patients])
        prediction = state.predict(dcohort)
        result["y_test"][cohort] = abs(y_test[test_cohort_patients]).tolist()
        result["delta_test"][cohort] = delta_test[test_cohort_patients].tolist()
        result["y_predicted"][cohort] = prediction.tolist()
        result["test_patients"][cohort] = y_test.loc[test_cohort_patients].index.tolist()
        result["CI"][cohort] = float(path2surv_concordance_index(-prediction, abs(y_test[test_cohort_patients]), delta_test[test_cohort_patients]))
        print(cohort + " CI: " + str(result["CI"][cohort]))
    result_file = "%s/xgb_measure_CI_replication_%d_result.json" % (result_path, replication)
    if single_cohort != '':
        result_file = "%s/%s/xgb_measure_CI_replication_%d_result.json" % (result_path, single_cohort, replication)
        if not os.path.exists("%s/%s" % (result_path, single_cohort)):
            os.makedirs("%s/%s" % (result_path, single_cohort))
    with open(result_file, 'w') as output:
        json.dump(result, output, indent=4)