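# run_xgb_survial_test_new_cohorts.py
#
# Trains an XGBoost Cox survival model (objective "survival:cox") on RNA-seq
# FPKM features from 25 TCGA training cohorts, selects the most important
# genes via cross-validated gain-based importance, tunes hyperparameters by
# random search, and evaluates the concordance index on eight held-out TCGA
# test cohorts.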
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split
from tqdm import tqdm
import json
import os
import sys
import argparse
import itertools
import random
def path2surv_concordance_index(y_predicted, y, delta):
    # Concordance index over comparable pairs: both samples have an observed
    # event (delta == 0), or the censored sample outlives the observed one.
    if np.any(np.isnan(y_predicted)) or np.any(np.isinf(y_predicted)):
        return 0
    N = len(y)
    event = (np.ones((N, N)) * (1 - np.array(delta))).astype(bool)  # [i, j]: event observed for sample j
    censored = (np.ones((N, N)) * np.array(delta)).astype(bool)     # [i, j]: sample j is censored
    y_diff = (np.ones((N, N)) * np.array(y)).transpose() - (np.ones((N, N)) * np.array(y))
    pred_diff = (np.ones((N, N)) * np.array(y_predicted)).transpose() - (np.ones((N, N)) * np.array(y_predicted))
    Delta = (event.transpose() & event) | (event.transpose() & censored & (y_diff < 0))
    # concordant pairs: predicted and observed survival differences share a sign
    Diff = pred_diff * y_diff > 0
    CI = (np.sum(Delta * Diff) - np.sum(np.diag(Delta * Diff))) / (np.sum(Delta) - np.sum(np.diag(Delta)))
    if np.isnan(CI):
        CI = 0
    return CI
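# Sanity check on toy data: a prediction that ranks the survival times
# perfectly yields CI = 1.0 when every event is observed (delta all 0), e.g.
#   path2surv_concordance_index(np.array([1.0, 2.0, 3.0]),
#                               np.array([10, 20, 30]),
#                               np.array([0, 0, 0]))   # -> 1.0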
train_cohorts = ["TCGA-ACC", "TCGA-BLCA", "TCGA-BRCA", "TCGA-CESC", "TCGA-COAD",
"TCGA-ESCA", "TCGA-GBM", "TCGA-HNSC", "TCGA-KIRC", "TCGA-KIRP",
"TCGA-LAML", "TCGA-LGG", "TCGA-LIHC", "TCGA-LUAD", "TCGA-LUSC",
"TCGA-MESO", "TCGA-OV", "TCGA-PAAD", "TCGA-READ", "TCGA-SARC",
"TCGA-SKCM", "TCGA-STAD", "TCGA-UCEC", "TCGA-UCS", "TCGA-UVM"]
test_cohorts = ["TCGA-CHOL", "TCGA-DLBC", "TCGA-KICH", "TCGA-PCPG", "TCGA-PRAD",
"TCGA-TGCT", "TCGA-THCA", "TCGA-THYM"]
cohorts = train_cohorts + test_cohorts
parser = argparse.ArgumentParser(prog='run_xgb_survial_test_new_cohorts.py')
parser.add_argument('-r', '--result', type=str, help='path to the result directory', required=True)
parser.add_argument('-f', '--features', type=str, help='path to the directory where the selected features should be written to', required=True)
parser.add_argument('-c', '--cohort', type=str, default='', help='single cohort to train the model on')
parser.add_argument('-t', '--threads', type=int, default=64, help='number of threads used by XGBoost (default: 64)')
args = parser.parse_args()
result_path = args.result
feature_path = args.features
single_cohort = args.cohort
num_threads = args.threads
if not os.path.exists(result_path):
    os.makedirs(result_path)
if not os.path.exists(feature_path):
    os.makedirs(feature_path)
if single_cohort != '':
    cohorts = [single_cohort]
    if not os.path.exists("./TCGA_data/%s/clinical.tsv" % single_cohort):
        print("The selected cancer cohort is not in the available TCGA datasets")
        sys.exit(5)
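# Configuration for feature selection and hyperparameter tuning: both phases
# run a random search (random.choices draws from the sets below), scored with
# stratified k-fold cross-validation over the training cohorts.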
use_xgb_features = True
num_select_features = 500
num_round_set = [50, 100, 250, 500, 1000, 1500, 2000, 2500]
max_depth_set = list(range(2,15+1))
learning_rate_set = [0.01, 0.025, 0.05, 0.075, 0.1]
colsample_set = [0.5,0.6,0.7,0.8,0.9,1.0]
alpha_set = [0,0.1,0.2,0.5,1,2]
gamma_set = [0,0.1,0.2,0.5,1,2]
min_child_weight_set = [1,2,3,5,7,10]
subsample_set = [0.5,0.65,0.8,0.9,1]
lambda_set = [0.1,0.2,0.5,0.8,1,2,3,5,10]
nrounds_feature_selection = 20
nrounds_parameter_tuning = 500
fold_count = 4
train_ratio = 0.8
rnaseq = pd.DataFrame()
clinical = pd.DataFrame()
cohort_dict = dict()
for cohort in cohorts:
    print("Reading data for cohort %s" % cohort)
    mrna_path = "./TCGA_data/%s/fpkm_table.csv" % cohort
    mrna = pd.read_csv(mrna_path, index_col=0).T
    mrna = mrna.dropna(axis=0, how="all")
    clinical_file_path = "./TCGA_data/%s/clinical.tsv" % cohort
    clinical_file = pd.read_csv(clinical_file_path, na_values=["NA", "--", "'--"], sep='\t')
    index_name = "submitter_id" if "submitter_id" in clinical_file.columns else "case_submitter_id"
    clinical_file = clinical_file.set_index(index_name)
    clinical_file = clinical_file[["age_at_index", "days_to_death", "gender", "vital_status", "days_to_last_follow_up"]]
    clinical_file = clinical_file.dropna(axis=0, subset=["vital_status"])
    clinical_file = clinical_file.dropna(axis=0, how="all", subset=["days_to_death", "days_to_last_follow_up"])
    clinical_file = clinical_file.dropna(axis=0, how="any", subset=["age_at_index", "gender"])
    clinical_file = clinical_file[((clinical_file.vital_status == "Dead") & (clinical_file.days_to_death > 0)) | ((clinical_file.vital_status == "Alive") & (clinical_file.days_to_last_follow_up > 0))]
    clinical_unique = clinical_file.loc[~clinical_file.index.duplicated(keep='first')]
    # only use patients that have all features
    common_patients = mrna.index.intersection(clinical_unique.index)
    clinical_unique = clinical_unique.loc[common_patients]
    mrna = mrna.loc[common_patients]
    print("Found %d patients for cohort %s" % (len(common_patients), cohort))
    # DataFrame.append was removed in pandas 2.0; pd.concat is the equivalent
    rnaseq = pd.concat([rnaseq, mrna])
    clinical = pd.concat([clinical, clinical_unique])
    cohort_dict[cohort] = common_patients
num_rna_features = rnaseq.shape[1]
delta = (clinical["vital_status"] == "Alive").astype(int)
y = clinical["days_to_death"].combine_first(clinical["days_to_last_follow_up"]).astype(int)
# encode censored patients with negative survival time (as expected by
# XGBoost's survival:cox objective); delta == 1 marks a censored (alive) patient
y = y.multiply([1 if d == 0 else -1 for d in delta])
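# Example of the encoding with hypothetical patients: a deceased patient with
# days_to_death = 100 keeps the label +100, while a censored patient with
# days_to_last_follow_up = 100 is encoded as -100.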
dead_patients = delta.index[delta == 0]
alive_patients = delta.index[delta == 1]
train_patients = []
test_patients = []
for cohort in cohorts:
    if cohort in train_cohorts:
        train_patients.extend(cohort_dict[cohort])
    elif cohort in test_cohorts:
        test_patients.extend(cohort_dict[cohort])
    else:
        print("WARNING: Encountered cohort %s, which is neither in the train cohorts nor in the test cohorts." % cohort)
train_patients = np.array(train_patients)
test_patients = np.array(test_patients)
important_features = []
if use_xgb_features:
    feature_importance_vector = pd.Series(np.zeros(rnaseq.shape[1]), index=rnaseq.columns)
    kfold_generator = StratifiedKFold(n_splits=fold_count, shuffle=True, random_state=456)
    kfold_samples = kfold_generator.split(train_patients, delta[train_patients])
    print("Selecting features")
    fold = 1
    for train_indices, test_indices in kfold_samples:
        print("Feature selection fold " + str(fold))
        train_indices_samples = train_patients[train_indices]
        x_train = rnaseq.loc[train_indices_samples]
        # remove genes with 0 mean absolute deviation
        # (DataFrame.mad was removed in pandas 2.0, so compute it explicitly)
        mean_abs_dev = (x_train - x_train.mean()).abs().mean()
        x_train = x_train.loc[:, mean_abs_dev != 0]
        y_train = y.loc[train_indices_samples]
        delta_train = delta.loc[train_indices_samples]
        random.seed(352)
        max_depth_grid = random.choices(range(1, 4), k=nrounds_feature_selection)
        learning_rate_grid = random.choices(learning_rate_set, k=nrounds_feature_selection)
        num_round_grid = random.choices([5, 10, 20], k=nrounds_feature_selection)
        colsample_bytree_grid = random.choices([1.0, 0.8, 0.65, 0.5], k=nrounds_feature_selection)
        subsample_grid = random.choices([1.0, 0.8, 0.65, 0.5], k=nrounds_feature_selection)
        for i in tqdm(range(nrounds_feature_selection)):
            max_depth = max_depth_grid[i]
            learning_rate = learning_rate_grid[i]
            num_round = num_round_grid[i]
            colsample_bytree = colsample_bytree_grid[i]
            subsample = subsample_grid[i]
            dtrain = xgb.DMatrix(x_train, label=y_train)
            param = {'max_depth': max_depth, 'learning_rate': learning_rate, 'colsample_bytree': colsample_bytree, 'subsample': subsample, 'objective': "survival:cox", 'eval_metric': "cox-nloglik", 'nthread': num_threads, 'verbosity': 0}
            state = xgb.train(param, dtrain, num_round)
            # accumulate gain-based importance across folds and random configurations
            feature_importances = pd.Series(state.get_score(importance_type='gain'))
            feature_importance_vector[feature_importances.index] += feature_importances
        fold += 1
    feature_importance_vector /= nrounds_feature_selection * fold_count
    feature_importance_vector = feature_importance_vector[feature_importance_vector > 0]
    num_important_features = min(num_select_features, len(feature_importance_vector))
    important_features = feature_importance_vector.sort_values(ascending=False).index[0:num_important_features]
    important_features_with_values = feature_importance_vector.sort_values(ascending=False)[0:num_important_features]
    feature_file = "%s/important_genes.json" % feature_path
    if single_cohort != '':
        feature_file = "%s/%s/important_genes.json" % (feature_path, single_cohort)
        if not os.path.exists("%s/%s" % (feature_path, single_cohort)):
            os.makedirs("%s/%s" % (feature_path, single_cohort))
    with open(feature_file, 'w') as output:
        json.dump({"important_genes": important_features_with_values.to_dict()}, output, indent=4)
else:
    important_features = rnaseq.columns
x_continuous = rnaseq[important_features]
concordance_index_dict = dict()
kfold_generator = StratifiedKFold(n_splits=fold_count, shuffle=True, random_state=123)
kfold_samples = kfold_generator.split(train_patients, delta[train_patients])
print("Tuning XGBoost parameters")
fold = 1
for train_indices, test_indices in kfold_samples:
    print("Parameter tuning fold " + str(fold))
    train_samples = train_patients[train_indices]
    test_samples = train_patients[test_indices]
    x_train = x_continuous.loc[train_samples]
    x_test = x_continuous.loc[test_samples]
    y_train = y[train_samples]
    delta_train = delta[train_samples]
    y_test = y[test_samples]
    delta_test = delta[test_samples]
    random.seed(123)
    max_depth_grid = random.choices(max_depth_set, k=nrounds_parameter_tuning)
    learning_rate_grid = random.choices(learning_rate_set, k=nrounds_parameter_tuning)
    num_round_grid = random.choices(num_round_set, k=nrounds_parameter_tuning)
    colsample_bytree_grid = random.choices(colsample_set, k=nrounds_parameter_tuning)
    alpha_grid = random.choices(alpha_set, k=nrounds_parameter_tuning)
    gamma_grid = random.choices(gamma_set, k=nrounds_parameter_tuning)
    min_child_weight_grid = random.choices(min_child_weight_set, k=nrounds_parameter_tuning)
    subsample_grid = random.choices(subsample_set, k=nrounds_parameter_tuning)
    lambda_grid = random.choices(lambda_set, k=nrounds_parameter_tuning)
    for i in tqdm(range(nrounds_parameter_tuning)):
        max_depth = max_depth_grid[i]
        learning_rate = learning_rate_grid[i]
        num_round = num_round_grid[i]
        colsample_bytree = colsample_bytree_grid[i]
        alpha = alpha_grid[i]
        gamma = gamma_grid[i]
        min_child_weight = min_child_weight_grid[i]
        subsample = subsample_grid[i]
        lambda_val = lambda_grid[i]
        dtrain = xgb.DMatrix(x_train, label=y_train)
        dtest = xgb.DMatrix(x_test)
        param = {'max_depth': max_depth, 'learning_rate': learning_rate, 'colsample_bytree': colsample_bytree, 'alpha': alpha, 'gamma': gamma, 'min_child_weight': min_child_weight, 'subsample': subsample, 'lambda': lambda_val, 'objective': "survival:cox", 'eval_metric': "cox-nloglik", 'nthread': num_threads, 'verbosity': 0}
        state = xgb.train(param, dtrain, num_round)
        prediction = state.predict(dtest)
        train_prediction = state.predict(dtrain)
        # negate the predicted hazard so that larger values mean longer survival
        concordance_index_dict.setdefault((num_round, max_depth, learning_rate, colsample_bytree, alpha, gamma, min_child_weight, subsample, lambda_val), []).append(path2surv_concordance_index(-prediction, abs(y_test), delta_test))
    fold += 1
concordance_index_dict_mean = {k: np.mean(v) for k, v in concordance_index_dict.items()}
params_max_CI = max(concordance_index_dict_mean, key=concordance_index_dict_mean.get)
num_round_max_CI = params_max_CI[0]
max_depth_max_CI = params_max_CI[1]
learning_rate_max_CI = params_max_CI[2]
colsample_bytree_max_CI = params_max_CI[3]
alpha_max_CI = params_max_CI[4]
gamma_max_CI = params_max_CI[5]
min_child_weight_max_CI = params_max_CI[6]
subsample_max_CI = params_max_CI[7]
lambda_max_CI = params_max_CI[8]
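# Retrain on all training cohorts with the best-scoring configuration and
# evaluate once on the held-out test cohorts.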
x_train = x_continuous.loc[train_patients]
x_test = x_continuous.loc[test_patients]
y_train = y.loc[train_patients]
y_test = y.loc[test_patients]
delta_train = delta.loc[train_patients]
delta_test = delta.loc[test_patients]
dtrain = xgb.DMatrix(x_train, label = y_train)
dtest = xgb.DMatrix(x_test, label = y_test)
param = {'max_depth': max_depth_max_CI, 'learning_rate': learning_rate_max_CI, 'colsample_bytree': colsample_bytree_max_CI, 'alpha': alpha_max_CI, 'gamma': gamma_max_CI, 'min_child_weight': min_child_weight_max_CI, 'subsample': subsample_max_CI, 'lambda': lambda_max_CI, 'objective': "survival:cox", 'eval_metric': "cox-nloglik", 'nthread': num_threads, 'verbosity': 0}
state = xgb.train(param, dtrain, num_round_max_CI)
test_prediction = state.predict(dtest)
result = dict()
result["num_round"] = num_round_max_CI
result["max_depth"] = max_depth_max_CI
result["learning_rate"] = learning_rate_max_CI
result["colsample_bytree"] = colsample_bytree_max_CI
result["alpha"] = alpha_max_CI
result["gamma"] = gamma_max_CI
result["min_child_weight"] = min_child_weight_max_CI
result["subsample"] = subsample_max_CI
result["lambda"] = lambda_max_CI
result["importance"] = state.get_score(importance_type='gain')
result["y_test"] = dict()
result["delta_test"] = dict()
result["y_predicted"] = dict()
result["test_patients"] = dict()
result["CI"] = dict()
result["CI_test"] = float(path2surv_concordance_index(-test_prediction, abs(y_test), delta_test))
for cohort in test_cohorts:
    test_cohort_patients = x_test.index[x_test.index.isin(cohort_dict[cohort])]
    dcohort = xgb.DMatrix(x_test.loc[test_cohort_patients])
    prediction = state.predict(dcohort)
    result["y_test"][cohort] = abs(y_test[test_cohort_patients]).tolist()
    result["delta_test"][cohort] = delta_test[test_cohort_patients].tolist()
    result["y_predicted"][cohort] = prediction.tolist()
    result["test_patients"][cohort] = y_test.loc[test_cohort_patients].index.tolist()
    result["CI"][cohort] = float(path2surv_concordance_index(-prediction, abs(y_test[test_cohort_patients]), delta_test[test_cohort_patients]))
    print(cohort + " CI: " + str(result["CI"][cohort]))
result_file = "%s/xgb_measure_CI_new_cohorts_result.json" % result_path
if single_cohort != '':
    result_file = "%s/%s/xgb_measure_CI_new_cohorts_result.json" % (result_path, single_cohort)
    if not os.path.exists("%s/%s" % (result_path, single_cohort)):
        os.makedirs("%s/%s" % (result_path, single_cohort))
with open(result_file, 'w') as output:
    json.dump(result, output, indent=4)