Permalink
Cannot retrieve contributors at this time
Name already in use
A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
xgb_survival_network/run_xgb_survival_random_subsets.py
Go to fileThis commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
368 lines (299 sloc)
18.8 KB
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import numpy as np | |
import pandas as pd | |
import xgboost as xgb | |
from sklearn.model_selection import StratifiedKFold, train_test_split | |
from sklearn.utils import resample | |
from tqdm import tqdm | |
import json | |
import os | |
import sys | |
import argparse | |
import itertools | |
import random | |
def path2surv_concordance_index(y_predicted, y, delta): | |
if np.any(np.isnan(y_predicted) | np.any(np.isinf(y_predicted))): | |
return 0 | |
N = len(y) | |
Delta = ((np.ones((N,N)) * (1-np.array(delta))).astype(bool).transpose() & (np.ones((N,N)) * (1-np.array(delta))).astype(bool)) | ((np.ones((N,N)) * (1-np.array(delta))).astype(bool).transpose() & (np.ones((N,N)) * np.array(delta)).astype(bool) & ((np.ones((N,N)) * np.array(y)).transpose() - (np.ones((N,N)) * np.array(y)) < 0)) | |
Diff = ((np.ones((N,N)) * np.array(y_predicted)).transpose() - (np.ones((N,N)) * np.array(y_predicted))) * ((np.ones((N,N)) * np.array(y)).transpose() - (np.ones((N,N)) * np.array(y))) > 0 | |
CI = (np.sum(Delta * Diff) - np.sum(np.diag(Delta * Diff))) / (np.sum(Delta) - np.sum(np.diag(Delta))) | |
if np.isnan(CI): | |
CI = 0 | |
return CI | |
cohorts = ["TCGA-ACC", "TCGA-BLCA", "TCGA-BRCA", "TCGA-CESC", "TCGA-COAD", | |
"TCGA-ESCA", "TCGA-GBM", "TCGA-HNSC", "TCGA-KIRC", "TCGA-KIRP", | |
"TCGA-LAML", "TCGA-LGG", "TCGA-LIHC", "TCGA-LUAD", "TCGA-LUSC", | |
"TCGA-MESO", "TCGA-OV", "TCGA-PAAD", "TCGA-READ", "TCGA-SARC", | |
"TCGA-SKCM", "TCGA-STAD", "TCGA-UCEC", "TCGA-UCS", "TCGA-UVM"] | |
parser = argparse.ArgumentParser(prog='run_xgb_survial_replications.py') | |
parser.add_argument('-r', '--result', type=str, help='path to the result directory', required=True) | |
parser.add_argument('-f', '--features', type=str, help='path to the directory where the selected features should be written to', required=True) | |
parser.add_argument('-n', '--num_subsample', type=int, help='number of samples that should be used in training set', required=True) | |
parser.add_argument('-s', '--replication_start', type=int, default=1, help='first replication to run (default: 1)') | |
parser.add_argument('-e', '--replication_end', type=int, default=10, help='last replication to run (default: 10)') | |
parser.add_argument('-c', '--cohort', type=str, default='', help='single cohort to train the model on') | |
parser.add_argument('-t', '--threads', type=int, default=64, help='number of threads used by XGBoost (default: 64)') | |
parser.add_argument('-d', '--seed', type=int, default=135, help='seed used to compute the random seed for splitting the data into training and test sets (default: 135)') | |
args = parser.parse_args() | |
result_path = args.result | |
feature_path = args.features | |
num_subsample = args.num_subsample | |
replication_start = args.replication_start | |
replication_end = args.replication_end | |
single_cohort = args.cohort | |
num_threads = args.threads | |
seed = args.seed | |
if replication_start < 1: | |
print("The first replication must be at least 1") | |
sys.exit(3) | |
if replication_end < replication_start: | |
print("The last replication must be greater or equal than the first replication") | |
sys.exit(4) | |
if num_subsample < 1 or num_subsample > 6419: | |
print("Invalid number of subsamples") | |
sys.exit(5) | |
if not os.path.exists(result_path): | |
os.makedirs(result_path) | |
if not os.path.exists(feature_path): | |
os.makedirs(feature_path) | |
if single_cohort != '': | |
cohorts = [single_cohort] | |
if not os.path.exists("./TCGA_data/%s/clinical.tsv" % single_cohort): | |
print("The selected cancer cohort is not in the available TCGA datasets") | |
sys.exit(5) | |
use_xgb_features = True | |
num_select_features = 500 | |
num_round_set = [50, 100, 250, 500, 1000, 1500, 2000, 2500] | |
max_depth_set = list(range(2,15+1)) | |
learning_rate_set = [0.01, 0.025, 0.05, 0.075, 0.1] | |
colsample_set = [0.5,0.6,0.7,0.8,0.9,1.0] | |
alpha_set = [0,0.1,0.2,0.5,1,2] | |
gamma_set = [0,0.1,0.2,0.5,1,2] | |
min_child_weight_set = [1,2,3,5,7,10] | |
subsample_set = [0.5,0.65,0.8,0.9,1] | |
lambda_set = [0.1,0.2,0.5,0.8,1,2,3,5,10] | |
nrounds_feature_selection = 20 | |
nrounds_parameter_tuning = 500 | |
fold_count = 4 | |
train_ratio = 0.8 | |
# read data | |
rnaseq = pd.DataFrame() | |
clinical = pd.DataFrame() | |
cohort_dict = dict() | |
for cohort in cohorts: | |
print("Reading data for cohort %s" % cohort) | |
mrna_path = "./TCGA_data/%s/fpkm_table.csv" % cohort | |
mrna = pd.read_csv(mrna_path, index_col=0).T | |
mrna = mrna.dropna(axis=0, how="all") | |
clinical_file_path = "./TCGA_data/%s/clinical.tsv" % cohort | |
clinical_file = pd.read_csv(clinical_file_path, na_values=["NA", "--", "'--"], sep='\t') | |
index_name = "submitter_id" if "submitter_id" in clinical_file.columns else "case_submitter_id" | |
clinical_file = clinical_file.set_index(index_name) | |
clinical_file = clinical_file[["age_at_index", "days_to_death", "gender", "vital_status", "days_to_last_follow_up"]] | |
clinical_file = clinical_file.dropna(axis=0, subset=["vital_status"]) | |
clinical_file = clinical_file.dropna(axis=0, how="all", subset=["days_to_death", "days_to_last_follow_up"]) | |
clinical_file = clinical_file.dropna(axis=0, how="any", subset=["age_at_index", "gender"]) | |
clinical_file = clinical_file[((clinical_file.vital_status == "Dead") & (clinical_file.days_to_death > 0)) | ((clinical_file.vital_status == "Alive") & (clinical_file.days_to_last_follow_up > 0))] | |
clinical_unique = clinical_file.loc[~clinical_file.index.duplicated(keep='first')] | |
# only use patients that have all features | |
common_patients = mrna.index.intersection(clinical_unique.index) | |
clinical_unique = clinical_unique.loc[common_patients] | |
mrna = mrna.loc[common_patients] | |
print("Found %d patients for cohort %s" % (len(common_patients), cohort)) | |
# save cohort with clinical information | |
clinical_unique = clinical_unique.assign(cohort=cohort) | |
rnaseq = rnaseq.append(mrna) | |
clinical = clinical.append(clinical_unique) | |
cohort_dict[cohort] = common_patients | |
num_rna_features = rnaseq.shape[1] | |
# only use specified cohorts for training and testing | |
cohort_patients = [p for c in cohorts for p in cohort_dict[c]] | |
rnaseq = rnaseq.loc[cohort_patients] | |
delta = (clinical["vital_status"] == "Alive").astype(int) | |
y = clinical["days_to_death"].combine_first(clinical["days_to_last_follow_up"]).astype(int) | |
# encode censored patients with negative survival time (as expected by xgboost) | |
y = y.multiply([1 if d == 0 else -1 for d in delta]) | |
dead_patients = delta.index[delta == 0] | |
alive_patients = delta.index[delta == 1] | |
for replication in range(replication_start,replication_end+1): | |
random_state = seed*replication | |
train_dead_patients = [] | |
train_alive_patients = [] | |
test_dead_patients = [] | |
test_alive_patients = [] | |
for cohort in cohorts: | |
dead_patients_cohort = np.intersect1d(dead_patients, cohort_dict[cohort]) | |
alive_patients_cohort = np.intersect1d(alive_patients, cohort_dict[cohort]) | |
train_dead_patients_cohort, test_dead_patients_cohort = train_test_split(dead_patients_cohort, test_size=1-train_ratio, random_state=random_state) | |
train_alive_patients_cohort, test_alive_patients_cohort = train_test_split(alive_patients_cohort, test_size=1-train_ratio, random_state=random_state) | |
train_dead_patients.extend(train_dead_patients_cohort) | |
train_alive_patients.extend(train_alive_patients_cohort) | |
test_dead_patients.extend(test_dead_patients_cohort) | |
test_alive_patients.extend(test_alive_patients_cohort) | |
train_patients = np.concatenate([train_dead_patients, train_alive_patients]) | |
test_patients = np.concatenate([test_dead_patients, test_alive_patients]) | |
# for the purpose of estimating the effect of training set size, select a random subset of patients from the training data (stratified by censoring status and cohort to ensure every group is represented in the same proportion as in the original data) | |
#censoring_criteria = pd.Series(delta[clinical.index].astype(str) + clinical['cohort'], index=clinical.index) | |
#train_patients = resample(train_patients, replace=False, n_samples=num_subsample, random_state=random_state, stratify=censoring_criteria[train_patients]) | |
# for the purpose of estimating the effect of training set size, select a random subset of patients from the training data (stratified by censoring status and same number of patient from every cohort to ensure every cohort is represented equally) | |
print("Subsampling %d patients" % num_subsample) | |
subsampled_train_patients = [] | |
subsamples_per_cohort = list(map(len, (np.array_split(range(num_subsample), len(cohorts))))) | |
random.seed(random_state) | |
random.shuffle(subsamples_per_cohort) | |
for i, cohort in enumerate(cohorts): | |
train_cohort_patients = np.intersect1d(train_patients, cohort_dict[cohort]) | |
if len(train_cohort_patients) < subsamples_per_cohort[i]: | |
print("Can only subsample %d patients for cohort %s" % (len(train_cohort_patients), cohort)) | |
subsampled_train_patients.extend(resample(train_cohort_patients, replace=False, n_samples=len(train_cohort_patients), random_state=random_state, stratify=delta[train_cohort_patients])) | |
else: | |
subsampled_train_patients.extend(resample(train_cohort_patients, replace=False, n_samples=subsamples_per_cohort[i], random_state=random_state, stratify=delta[train_cohort_patients])) | |
train_patients = np.array(subsampled_train_patients) | |
print("Training on %d patients" % len(train_patients)) | |
important_features = [] | |
if use_xgb_features: | |
feature_importance_vector = pd.Series(np.zeros(rnaseq.shape[1]), index=rnaseq.columns) | |
kfold_generator = StratifiedKFold(n_splits=fold_count, shuffle=True, random_state=456) | |
kfold_samples = kfold_generator.split(train_patients, delta[train_patients]) | |
print("Selecting features") | |
fold = 1 | |
for train_indices, test_indices in kfold_samples: | |
print("Feature selection fold " + str(fold)) | |
train_indices_samples = train_patients[train_indices] | |
x_train = rnaseq.loc[train_indices_samples] | |
# remove genes with 0 mean absolute deviation | |
x_train = x_train.loc[:,x_train.mad() != 0] | |
y_train = y.loc[train_indices_samples] | |
delta_train = delta.loc[train_indices_samples] | |
random.seed(352) | |
max_depth_grid = random.choices(range(1,4), k=nrounds_feature_selection) | |
learning_rate_grid = random.choices(learning_rate_set, k=nrounds_feature_selection) | |
num_round_grid = random.choices([5,10,20], k=nrounds_feature_selection) | |
colsample_bytree_grid = random.choices([1.0,0.8,0.65,0.5], k=nrounds_feature_selection) | |
subsample_grid = random.choices([1.0,0.8,0.65,0.5], k=nrounds_feature_selection) | |
for i in tqdm(range(nrounds_feature_selection)): | |
max_depth = max_depth_grid[i] | |
learning_rate = learning_rate_grid[i] | |
num_round = num_round_grid[i] | |
colsample_bytree = colsample_bytree_grid[i] | |
subsample = subsample_grid[i] | |
dtrain = xgb.DMatrix(x_train, label = y_train) | |
param = {'max_depth': max_depth, 'learning_rate': learning_rate, 'colsample_bytree': colsample_bytree, 'subsample': subsample, 'objective': "survival:cox", 'eval_metric': "cox-nloglik", 'nthread': num_threads, 'verbose': 0} | |
state = xgb.train(param, dtrain, num_round) | |
feature_importances = pd.Series(state.get_score(importance_type='gain')) | |
feature_importance_vector[feature_importances.index] += feature_importances | |
fold += 1 | |
feature_importance_vector /= nrounds_feature_selection * fold_count | |
feature_importance_vector = feature_importance_vector[feature_importance_vector > 0] | |
num_important_features = min(num_select_features, len(feature_importance_vector)) | |
important_features = feature_importance_vector.sort_values(ascending=False).index[0:num_important_features] | |
important_features_with_values = feature_importance_vector.sort_values(ascending=False)[0:num_important_features] | |
feature_file = "%s/important_genes_replication_%d.json" % (feature_path, replication) | |
if single_cohort != '': | |
feature_file = "%s/%s/important_genes_replication_%d.json" % (feature_path, single_cohort, replication) | |
if not os.path.exists("%s/%s" % (feature_path, single_cohort)): | |
os.makedirs("%s/%s" % (feature_path, single_cohort)) | |
with open(feature_file, 'w') as output: | |
json.dump({"important_genes": important_features_with_values.to_dict()}, output, indent=4) | |
else: | |
important_features = rnaseq.columns | |
x_continuous = rnaseq[important_features] | |
concordance_index_dict = dict() | |
kfold_generator = StratifiedKFold(n_splits=fold_count, shuffle=True, random_state=123) | |
kfold_samples = kfold_generator.split(train_patients, delta[train_patients]) | |
print("Tuning XGBoost parameters") | |
fold = 1 | |
for train_indices, test_indices in kfold_samples: | |
print("Parameter tuning fold " + str(fold)) | |
train_samples = train_patients[train_indices] | |
test_samples = train_patients[test_indices] | |
x_train = x_continuous.loc[train_samples] | |
x_test = x_continuous.loc[test_samples] | |
y_train = y[train_samples] | |
delta_train = delta[train_samples] | |
y_test = y[test_samples] | |
delta_test = delta[test_samples] | |
random.seed(123) | |
max_depth_grid = random.choices(max_depth_set, k=nrounds_parameter_tuning) | |
learning_rate_grid = random.choices(learning_rate_set, k=nrounds_parameter_tuning) | |
num_round_grid = random.choices(num_round_set, k=nrounds_parameter_tuning) | |
colsample_bytree_grid = random.choices(colsample_set, k=nrounds_parameter_tuning) | |
alpha_grid = random.choices(alpha_set, k=nrounds_parameter_tuning) | |
gamma_grid = random.choices(gamma_set, k=nrounds_parameter_tuning) | |
min_child_weight_grid = random.choices(min_child_weight_set, k=nrounds_parameter_tuning) | |
subsample_grid = random.choices(subsample_set, k=nrounds_parameter_tuning) | |
lambda_grid = random.choices(lambda_set, k=nrounds_parameter_tuning) | |
for i in tqdm(range(nrounds_parameter_tuning)): | |
max_depth = max_depth_grid[i] | |
learning_rate = learning_rate_grid[i] | |
num_round = num_round_grid[i] | |
colsample_bytree = colsample_bytree_grid[i] | |
alpha = alpha_grid[i] | |
gamma = gamma_grid[i] | |
min_child_weight = min_child_weight_grid[i] | |
subsample = subsample_grid[i] | |
lambda_val = lambda_grid[i] | |
dtrain = xgb.DMatrix(x_train, label = y_train) | |
dtest = xgb.DMatrix(x_test) | |
param = {'max_depth': max_depth, 'learning_rate': learning_rate, 'colsample_bytree': colsample_bytree, 'alpha': alpha, 'gamma': gamma, 'min_child_weight': min_child_weight, 'subsample': subsample, 'lambda': lambda_val, 'objective': "survival:cox", 'eval_metric': "cox-nloglik", 'nthread': num_threads, 'verbose': 0} | |
state = xgb.train(param, dtrain, num_round) | |
prediction = state.predict(dtest) | |
train_prediction = state.predict(dtrain) | |
concordance_index_dict.setdefault((num_round, max_depth, learning_rate, colsample_bytree, alpha, gamma, min_child_weight, subsample, lambda_val), []).append(path2surv_concordance_index(-prediction, abs(y_test), delta_test)) | |
fold += 1 | |
concordance_index_dict_mean = {k: np.mean(v) for k, v in concordance_index_dict.items()} | |
params_max_CI = max(concordance_index_dict_mean, key=concordance_index_dict_mean.get) | |
num_round_max_CI = params_max_CI[0] | |
max_depth_max_CI = params_max_CI[1] | |
learning_rate_max_CI = params_max_CI[2] | |
colsample_bytree_max_CI = params_max_CI[3] | |
alpha_max_CI = params_max_CI[4] | |
gamma_max_CI = params_max_CI[5] | |
min_child_weight_max_CI = params_max_CI[6] | |
subsample_max_CI = params_max_CI[7] | |
lambda_max_CI = params_max_CI[8] | |
x_train = x_continuous.loc[train_patients] | |
x_test = x_continuous.loc[test_patients] | |
y_train = y.loc[train_patients] | |
y_test = y.loc[test_patients] | |
delta_train = delta.loc[train_patients] | |
delta_test = delta.loc[test_patients] | |
dtrain = xgb.DMatrix(x_train, label = y_train) | |
dtest = xgb.DMatrix(x_test, label = y_test) | |
param = {'max_depth': max_depth_max_CI, 'learning_rate': learning_rate_max_CI, 'colsample_bytree': colsample_bytree_max_CI, 'alpha': alpha_max_CI, 'gamma': gamma_max_CI, 'min_child_weight': min_child_weight_max_CI, 'subsample': subsample_max_CI, 'lambda': lambda_max_CI, 'objective': "survival:cox", 'eval_metric': "cox-nloglik", 'nthread': num_threads, 'verbose': 0} | |
state = xgb.train(param, dtrain, num_round_max_CI) | |
test_prediction = state.predict(dtest) | |
result = dict() | |
result["num_round"] = num_round_max_CI | |
result["max_depth"] = max_depth_max_CI | |
result["learning_rate"] = learning_rate_max_CI | |
result["colsample_bytree"] = colsample_bytree_max_CI | |
result["alpha"] = alpha_max_CI | |
result["gamma"] = gamma_max_CI | |
result["min_child_weight"] = min_child_weight_max_CI | |
result["subsample"] = subsample_max_CI | |
result["lambda"] = lambda_max_CI | |
result["importance"] = state.get_score(importance_type='gain') | |
result["y_test"] = dict() | |
result["delta_test"] = dict() | |
result["y_predicted"] = dict() | |
result["test_patients"] = dict() | |
result["CI"] = dict() | |
result["CI_test"] = float(path2surv_concordance_index(-test_prediction, abs(y_test), delta_test)) | |
for cohort in cohorts: | |
test_cohort_patients = x_test.index[x_test.index.isin(cohort_dict[cohort])] | |
dcohort = xgb.DMatrix(x_test.loc[test_cohort_patients]) | |
prediction = state.predict(dcohort) | |
result["y_test"][cohort] = abs(y_test[test_cohort_patients]).tolist() | |
result["delta_test"][cohort] = delta_test[test_cohort_patients].tolist() | |
result["y_predicted"][cohort] = prediction.tolist() | |
result["test_patients"][cohort] = y_test.loc[test_cohort_patients].index.tolist() | |
result["CI"][cohort] = float(path2surv_concordance_index(-prediction, abs(y_test[test_cohort_patients]), delta_test[test_cohort_patients])) | |
print(cohort + " CI: " + str(result["CI"][cohort])) | |
result_file = "%s/xgb_measure_CI_replication_%d_result.json" % (result_path, replication) | |
if single_cohort != '': | |
result_file = "%s/%s/xgb_measure_CI_replication_%d_result.json" % (result_path, single_cohort, replication) | |
if not os.path.exists("%s/%s" % (result_path, single_cohort)): | |
os.makedirs("%s/%s" % (result_path, single_cohort)) | |
with open(result_file, 'w') as output: | |
json.dump(result, output, indent=4) |