Permalink
Cannot retrieve contributors at this time
Name already in use
A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
xgb_survival_network/run_xgb_survival_test_new_cohorts.py
Go to fileThis commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
321 lines (255 sloc)
14.5 KB
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import numpy as np | |
import pandas as pd | |
import xgboost as xgb | |
from sklearn.model_selection import StratifiedKFold | |
from sklearn.model_selection import train_test_split | |
from tqdm import tqdm | |
import json | |
import os | |
import sys | |
import argparse | |
import itertools | |
import random | |
def path2surv_concordance_index(y_predicted, y, delta): | |
if np.any(np.isnan(y_predicted) | np.any(np.isinf(y_predicted))): | |
return 0 | |
N = len(y) | |
Delta = ((np.ones((N,N)) * (1-np.array(delta))).astype(bool).transpose() & (np.ones((N,N)) * (1-np.array(delta))).astype(bool)) | ((np.ones((N,N)) * (1-np.array(delta))).astype(bool).transpose() & (np.ones((N,N)) * np.array(delta)).astype(bool) & ((np.ones((N,N)) * np.array(y)).transpose() - (np.ones((N,N)) * np.array(y)) < 0)) | |
Diff = ((np.ones((N,N)) * np.array(y_predicted)).transpose() - (np.ones((N,N)) * np.array(y_predicted))) * ((np.ones((N,N)) * np.array(y)).transpose() - (np.ones((N,N)) * np.array(y))) > 0 | |
CI = (np.sum(Delta * Diff) - np.sum(np.diag(Delta * Diff))) / (np.sum(Delta) - np.sum(np.diag(Delta))) | |
if np.isnan(CI): | |
CI = 0 | |
return CI | |
train_cohorts = ["TCGA-ACC", "TCGA-BLCA", "TCGA-BRCA", "TCGA-CESC", "TCGA-COAD", | |
"TCGA-ESCA", "TCGA-GBM", "TCGA-HNSC", "TCGA-KIRC", "TCGA-KIRP", | |
"TCGA-LAML", "TCGA-LGG", "TCGA-LIHC", "TCGA-LUAD", "TCGA-LUSC", | |
"TCGA-MESO", "TCGA-OV", "TCGA-PAAD", "TCGA-READ", "TCGA-SARC", | |
"TCGA-SKCM", "TCGA-STAD", "TCGA-UCEC", "TCGA-UCS", "TCGA-UVM"] | |
test_cohorts = ["TCGA-CHOL", "TCGA-DLBC", "TCGA-KICH", "TCGA-PCPG", "TCGA-PRAD", | |
"TCGA-TGCT", "TCGA-THCA", "TCGA-THYM"] | |
cohorts = train_cohorts + test_cohorts | |
parser = argparse.ArgumentParser(prog='run_xgb_survial_test_new_cohorts.py') | |
parser.add_argument('-r', '--result', type=str, help='path to the result directory', required=True) | |
parser.add_argument('-f', '--features', type=str, help='path to the directory where the selected features should be written to', required=True) | |
parser.add_argument('-c', '--cohort', type=str, default='', help='single cohort to train the model on') | |
parser.add_argument('-t', '--threads', type=int, default=64, help='number of threads used by XGBoost (default: 64)') | |
args = parser.parse_args() | |
result_path = args.result | |
feature_path = args.features | |
single_cohort = args.cohort | |
num_threads = args.threads | |
if not os.path.exists(result_path): | |
os.makedirs(result_path) | |
if not os.path.exists(feature_path): | |
os.makedirs(feature_path) | |
if single_cohort != '': | |
cohorts = [single_cohort] | |
if not os.path.exists("./TCGA_data/%s/clinical.tsv" % single_cohort): | |
print("The selected cancer cohort is not in the available TCGA datasets") | |
sys.exit(5) | |
use_xgb_features = True | |
num_select_features = 500 | |
num_round_set = [50, 100, 250, 500, 1000, 1500, 2000, 2500] | |
max_depth_set = list(range(2,15+1)) | |
learning_rate_set = [0.01, 0.025, 0.05, 0.075, 0.1] | |
colsample_set = [0.5,0.6,0.7,0.8,0.9,1.0] | |
alpha_set = [0,0.1,0.2,0.5,1,2] | |
gamma_set = [0,0.1,0.2,0.5,1,2] | |
min_child_weight_set = [1,2,3,5,7,10] | |
subsample_set = [0.5,0.65,0.8,0.9,1] | |
lambda_set = [0.1,0.2,0.5,0.8,1,2,3,5,10] | |
nrounds_feature_selection = 20 | |
nrounds_parameter_tuning = 500 | |
fold_count = 4 | |
train_ratio = 0.8 | |
rnaseq = pd.DataFrame() | |
clinical = pd.DataFrame() | |
cohort_dict = dict() | |
for cohort in cohorts: | |
print("Reading data for cohort %s" % cohort) | |
mrna_path = "./TCGA_data/%s/fpkm_table.csv" % cohort | |
mrna = pd.read_csv(mrna_path, index_col=0).T | |
mrna = mrna.dropna(axis=0, how="all") | |
clinical_file_path = "./TCGA_data/%s/clinical.tsv" % cohort | |
clinical_file = pd.read_csv(clinical_file_path, na_values=["NA", "--", "'--"], sep='\t') | |
index_name = "submitter_id" if "submitter_id" in clinical_file.columns else "case_submitter_id" | |
clinical_file = clinical_file.set_index(index_name) | |
clinical_file = clinical_file[["age_at_index", "days_to_death", "gender", "vital_status", "days_to_last_follow_up"]] | |
clinical_file = clinical_file.dropna(axis=0, subset=["vital_status"]) | |
clinical_file = clinical_file.dropna(axis=0, how="all", subset=["days_to_death", "days_to_last_follow_up"]) | |
clinical_file = clinical_file.dropna(axis=0, how="any", subset=["age_at_index", "gender"]) | |
clinical_file = clinical_file[((clinical_file.vital_status == "Dead") & (clinical_file.days_to_death > 0)) | ((clinical_file.vital_status == "Alive") & (clinical_file.days_to_last_follow_up > 0))] | |
clinical_unique = clinical_file.loc[~clinical_file.index.duplicated(keep='first')] | |
# only use patients that have all features | |
common_patients = mrna.index.intersection(clinical_unique.index) | |
clinical_unique = clinical_unique.loc[common_patients] | |
mrna = mrna.loc[common_patients] | |
print("Found %d patients for cohort %s" % (len(common_patients), cohort)) | |
rnaseq = rnaseq.append(mrna) | |
clinical = clinical.append(clinical_unique) | |
cohort_dict[cohort] = common_patients | |
num_rna_features = rnaseq.shape[1] | |
delta = (clinical["vital_status"] == "Alive").astype(int) | |
y = clinical["days_to_death"].combine_first(clinical["days_to_last_follow_up"]).astype(int) | |
# encode censored patients with negative survival time (as expexted by xgboost) | |
y = y.multiply([1 if d == 0 else -1 for d in delta]) | |
dead_patients = delta.index[delta == 0] | |
alive_patients = delta.index[delta == 1] | |
train_patients = [] | |
test_patients = [] | |
for cohort in cohorts: | |
if cohort in train_cohorts: | |
train_patients.extend(cohort_dict[cohort]) | |
elif cohort in test_cohorts: | |
test_patients.extend(cohort_dict[cohort]) | |
else: | |
print("WARNING: Encountered cohort %s, which is neither in the train cohorts nor in the test cohorts.") | |
train_patients = np.array(train_patients) | |
test_patients = np.array(test_patients) | |
important_features = [] | |
if use_xgb_features: | |
feature_importance_vector = pd.Series(np.zeros(rnaseq.shape[1]), index=rnaseq.columns) | |
kfold_generator = StratifiedKFold(n_splits=fold_count, shuffle=True, random_state=456) | |
kfold_samples = kfold_generator.split(train_patients, delta[train_patients]) | |
print("Selecting features") | |
fold = 1 | |
for train_indices, test_indices in kfold_samples: | |
print("Feature selection fold " + str(fold)) | |
train_indices_samples = train_patients[train_indices] | |
x_train = rnaseq.loc[train_indices_samples] | |
# remove genes with 0 mean absolute deviation | |
x_train = x_train.loc[:,x_train.mad() != 0] | |
y_train = y.loc[train_indices_samples] | |
delta_train = delta.loc[train_indices_samples] | |
random.seed(352) | |
max_depth_grid = random.choices(range(1,4), k=nrounds_feature_selection) | |
learning_rate_grid = random.choices(learning_rate_set, k=nrounds_feature_selection) | |
num_round_grid = random.choices([5,10,20], k=nrounds_feature_selection) | |
colsample_bytree_grid = random.choices([1.0,0.8,0.65,0.5], k=nrounds_feature_selection) | |
subsample_grid = random.choices([1.0,0.8,0.65,0.5], k=nrounds_feature_selection) | |
for i in tqdm(range(nrounds_feature_selection)): | |
max_depth = max_depth_grid[i] | |
learning_rate = learning_rate_grid[i] | |
num_round = num_round_grid[i] | |
colsample_bytree = colsample_bytree_grid[i] | |
subsample = subsample_grid[i] | |
dtrain = xgb.DMatrix(x_train, label = y_train) | |
param = {'max_depth': max_depth, 'learning_rate': learning_rate, 'colsample_bytree': colsample_bytree, 'subsample': subsample, 'objective': "survival:cox", 'eval_metric': "cox-nloglik", 'nthread': num_threads, 'verbose': 0} | |
state = xgb.train(param, dtrain, num_round) | |
feature_importances = pd.Series(state.get_score(importance_type='gain')) | |
feature_importance_vector[feature_importances.index] += feature_importances | |
fold += 1 | |
feature_importance_vector /= nrounds_feature_selection * fold_count | |
feature_importance_vector = feature_importance_vector[feature_importance_vector > 0] | |
num_important_features = min(num_select_features, len(feature_importance_vector)) | |
important_features = feature_importance_vector.sort_values(ascending=False).index[0:num_important_features] | |
important_features_with_values = feature_importance_vector.sort_values(ascending=False)[0:num_important_features] | |
feature_file = "%s/important_genes.json" % feature_path | |
if single_cohort != '': | |
feature_file = "%s/%s/important_genes.json" % (feature_path, single_cohort) | |
if not os.path.exists("%s/%s" % (feature_path, single_cohort)): | |
os.makedirs("%s/%s" % (feature_path, single_cohort)) | |
with open(feature_file, 'w') as output: | |
json.dump({"important_genes": important_features_with_values.to_dict()}, output, indent=4) | |
else: | |
important_features = rnaseq.columns | |
x_continuous = rnaseq[important_features] | |
concordance_index_dict = dict() | |
kfold_generator = StratifiedKFold(n_splits=fold_count, shuffle=True, random_state=123) | |
kfold_samples = kfold_generator.split(train_patients, delta[train_patients]) | |
print("Tuning XGBoost parameters") | |
fold = 1 | |
for train_indices, test_indices in kfold_samples: | |
print("Parameter tuning fold " + str(fold)) | |
train_samples = train_patients[train_indices] | |
test_samples = train_patients[test_indices] | |
x_train = x_continuous.loc[train_samples] | |
x_test = x_continuous.loc[test_samples] | |
y_train = y[train_samples] | |
delta_train = delta[train_samples] | |
y_test = y[test_samples] | |
delta_test = delta[test_samples] | |
random.seed(123) | |
max_depth_grid = random.choices(max_depth_set, k=nrounds_parameter_tuning) | |
learning_rate_grid = random.choices(learning_rate_set, k=nrounds_parameter_tuning) | |
num_round_grid = random.choices(num_round_set, k=nrounds_parameter_tuning) | |
colsample_bytree_grid = random.choices(colsample_set, k=nrounds_parameter_tuning) | |
alpha_grid = random.choices(alpha_set, k=nrounds_parameter_tuning) | |
gamma_grid = random.choices(gamma_set, k=nrounds_parameter_tuning) | |
min_child_weight_grid = random.choices(min_child_weight_set, k=nrounds_parameter_tuning) | |
subsample_grid = random.choices(subsample_set, k=nrounds_parameter_tuning) | |
lambda_grid = random.choices(lambda_set, k=nrounds_parameter_tuning) | |
for i in tqdm(range(nrounds_parameter_tuning)): | |
max_depth = max_depth_grid[i] | |
learning_rate = learning_rate_grid[i] | |
num_round = num_round_grid[i] | |
colsample_bytree = colsample_bytree_grid[i] | |
alpha = alpha_grid[i] | |
gamma = gamma_grid[i] | |
min_child_weight = min_child_weight_grid[i] | |
subsample = subsample_grid[i] | |
lambda_val = lambda_grid[i] | |
dtrain = xgb.DMatrix(x_train, label = y_train) | |
dtest = xgb.DMatrix(x_test) | |
param = {'max_depth': max_depth, 'learning_rate': learning_rate, 'colsample_bytree': colsample_bytree, 'alpha': alpha, 'gamma': gamma, 'min_child_weight': min_child_weight, 'subsample': subsample, 'lambda': lambda_val, 'objective': "survival:cox", 'eval_metric': "cox-nloglik", 'nthread': num_threads, 'verbose': 0} | |
state = xgb.train(param, dtrain, num_round) | |
prediction = state.predict(dtest) | |
train_prediction = state.predict(dtrain) | |
concordance_index_dict.setdefault((num_round, max_depth, learning_rate, colsample_bytree, alpha, gamma, min_child_weight, subsample, lambda_val), []).append(path2surv_concordance_index(-prediction, abs(y_test), delta_test)) | |
fold += 1 | |
concordance_index_dict_mean = {k: np.mean(v) for k, v in concordance_index_dict.items()} | |
params_max_CI = max(concordance_index_dict_mean, key=concordance_index_dict_mean.get) | |
num_round_max_CI = params_max_CI[0] | |
max_depth_max_CI = params_max_CI[1] | |
learning_rate_max_CI = params_max_CI[2] | |
colsample_bytree_max_CI = params_max_CI[3] | |
alpha_max_CI = params_max_CI[4] | |
gamma_max_CI = params_max_CI[5] | |
min_child_weight_max_CI = params_max_CI[6] | |
subsample_max_CI = params_max_CI[7] | |
lambda_max_CI = params_max_CI[8] | |
x_train = x_continuous.loc[train_patients] | |
x_test = x_continuous.loc[test_patients] | |
y_train = y.loc[train_patients] | |
y_test = y.loc[test_patients] | |
delta_train = delta.loc[train_patients] | |
delta_test = delta.loc[test_patients] | |
dtrain = xgb.DMatrix(x_train, label = y_train) | |
dtest = xgb.DMatrix(x_test, label = y_test) | |
param = {'max_depth': max_depth_max_CI, 'learning_rate': learning_rate_max_CI, 'colsample_bytree': colsample_bytree_max_CI, 'alpha': alpha_max_CI, 'gamma': gamma_max_CI, 'min_child_weight': min_child_weight_max_CI, 'subsample': subsample_max_CI, 'lambda': lambda_max_CI, 'objective': "survival:cox", 'eval_metric': "cox-nloglik", 'nthread': num_threads, 'verbose': 0} | |
state = xgb.train(param, dtrain, num_round_max_CI) | |
test_prediction = state.predict(dtest) | |
result = dict() | |
result["num_round"] = num_round_max_CI | |
result["max_depth"] = max_depth_max_CI | |
result["learning_rate"] = learning_rate_max_CI | |
result["colsample_bytree"] = colsample_bytree_max_CI | |
result["alpha"] = alpha_max_CI | |
result["gamma"] = gamma_max_CI | |
result["min_child_weight"] = min_child_weight_max_CI | |
result["subsample"] = subsample_max_CI | |
result["lambda"] = lambda_max_CI | |
result["importance"] = state.get_score(importance_type='gain') | |
result["y_test"] = dict() | |
result["delta_test"] = dict() | |
result["y_predicted"] = dict() | |
result["test_patients"] = dict() | |
result["CI"] = dict() | |
result["CI_test"] = float(path2surv_concordance_index(-test_prediction, abs(y_test), delta_test)) | |
for cohort in test_cohorts: | |
test_cohort_patients = x_test.index[x_test.index.isin(cohort_dict[cohort])] | |
dcohort = xgb.DMatrix(x_test.loc[test_cohort_patients]) | |
prediction = state.predict(dcohort) | |
result["y_test"][cohort] = abs(y_test[test_cohort_patients]).tolist() | |
result["delta_test"][cohort] = delta_test[test_cohort_patients].tolist() | |
result["y_predicted"][cohort] = prediction.tolist() | |
result["test_patients"][cohort] = y_test.loc[test_cohort_patients].index.tolist() | |
result["CI"][cohort] = float(path2surv_concordance_index(-prediction, abs(y_test[test_cohort_patients]), delta_test[test_cohort_patients])) | |
print(cohort + " CI: " + str(result["CI"][cohort])) | |
result_file = "%s/xgb_measure_CI_new_cohorts_result.json" % result_path | |
if single_cohort != '': | |
result_file = "%s/%s/xgb_measure_CI_new_cohorts_result.json" % (result_path, single_cohort) | |
if not os.path.exists("%s/%s" % (result_path, single_cohort)): | |
os.makedirs("%s/%s" % (result_path, single_cohort)) | |
with open(result_file, 'w') as output: | |
json.dump(result, output, indent=4) |