Permalink
Cannot retrieve contributors at this time
Name already in use
A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
xgb_survival_network/pythonProcessingCode.py
Go to fileThis commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
249 lines (183 sloc)
14.5 KB
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import numpy as np | |
import pandas as pd | |
import matplotlib.pyplot as plt | |
from matplotlib_venn import venn3, venn2 | |
import json | |
import mygene | |
# code for generating feature importance files from the results of the 100 XGBoost replications | |
xgb_result_path = "../results_xgb_pancancer_xgb_correct_feature_selection_pancancer_no_age_gender_results" # modify this to the output path of the XGBoost survival prediction results | |
output_path = "../survival_prediction" # modify this to the desired output path for the feature importance files | |
num_reps = 100 | |
result_feature_df = pd.DataFrame() | |
for i in range(1, num_reps+1): | |
with open("%s/xgb_measure_CI_replication_%d_result.json" % (xgb_result_path, i), 'r') as json_file: | |
result = json.load(json_file) | |
features = pd.Series(result['importance']) | |
features.name = "replication_%d" % i | |
result_feature_df = pd.concat([result_feature_df, features], axis=1) | |
mg = mygene.MyGeneInfo() | |
result_feature_df = result_feature_df.assign(sum=result_feature_df.sum(axis=1)).sort_values(by='sum', ascending=False).iloc[:, :-1] | |
result_feature_sums = result_feature_df.sum(axis=1) | |
ginfo_sums = mg.querymany(result_feature_sums.index, scopes='ensembl.gene') | |
# genes without defined Hugo Symbol are removed before network propagation because they can't be used | |
hugo_genes_sums = [g['symbol'] if 'symbol' in g.keys() else '' for g in ginfo_sums] | |
result_feature_sums_hugo = result_feature_sums.copy() | |
result_feature_sums_hugo.index = hugo_genes_sums | |
result_feature_sums_hugo = result_feature_sums_hugo.drop('') | |
result_feature_df.to_csv("%s/result_feature_importance_pancancer_%d_replications.csv" % (output_path, num_reps)) | |
result_feature_sums.to_csv("%s/result_feature_importance_pancancer_%d_replications_sums.csv" % (output_path, num_reps), header=False, sep='\t') | |
# the following file is used as an input for the NetCore network propagation and module identification step | |
result_feature_sums_hugo.to_csv("%s/result_feature_importance_pancancer_%d_replications_sums_hugo_symbols.csv" % (output_path, num_reps), header=False, sep='\t') | |
# code for generating gene types of important features (including pie charts in SI1) | |
all_measured_genes_matrix = pd.read_csv("TCGA_data/TCGA-ACC/fpkm_table.csv", index_col=0) | |
pancancer_feature_importances = pd.read_csv("./survival_prediction/result_feature_importance_pancancer_100_replications_sums.csv", sep='\t', header=None, index_col=0) | |
single_cohort_feature_importances = pd.read_csv("./survival_prediction/result_mean_absolute_deviation_feature_importance_single_cohort_100_replications_sums.csv", sep=',', header=0, index_col=0) | |
all_measured_genes = all_measured_genes_matrix.index | |
pancancer_features = pancancer_feature_importances.index | |
single_cohort_features = single_cohort_feature_importances.index | |
mg = mygene.MyGeneInfo() | |
gene_type_df_pancancer = mg.querymany(pancancer_features, fields=['ensembl.type_of_gene', 'symbol'], as_dataframe=True) | |
gene_type_df_single_cohort = mg.querymany(single_cohort_features, fields=['ensembl.type_of_gene', 'symbol'], as_dataframe=True) | |
# for single cohort, remove second (duplicated) entry for ENSG00000229425 | |
gene_type_df_single_cohort = gene_type_df_single_cohort[~gene_type_df_single_cohort.index.duplicated()] | |
# correct entries where a gene has multiple (identical) gene type entries | |
gene_type_df_all_measured_genes['ensembl.type_of_gene'] = [g[1]['ensembl.type_of_gene'] if g[1]['ensembl.type_of_gene']!='' else g[1]['ensembl'][0]['type_of_gene'] if g[1]['ensembl']!='' else '' for g in gene_type_df_all_measured_genes.fillna('').iterrows()] | |
gene_type_df_pancancer['ensembl.type_of_gene'] = [g[1]['ensembl.type_of_gene'] if g[1]['ensembl.type_of_gene']!='' else g[1]['ensembl'][0]['type_of_gene'] if g[1]['ensembl']!='' else '' for g in gene_type_df_pancancer.fillna('').iterrows()] | |
gene_type_df_single_cohort['ensembl.type_of_gene'] = [g[1]['ensembl.type_of_gene'] if g[1]['ensembl.type_of_gene']!='' else g[1]['ensembl'][0]['type_of_gene'] if g[1]['ensembl']!='' else '' for g in gene_type_df_single_cohort.fillna('').iterrows()] | |
# write feature importance scores including gene types to files | |
pancancer_feature_importances_gene_types = pd.concat([pancancer_feature_importances, gene_type_df_pancancer['ensembl.type_of_gene']], axis=1) | |
pancancer_feature_importances_gene_types.index = [g[1]['symbol'] if g[1]['symbol']!='' else g[0] for g in gene_type_df_pancancer.fillna('').iterrows()] | |
single_cohort_feature_importances_gene_types = pd.concat([single_cohort_feature_importances, gene_type_df_single_cohort['ensembl.type_of_gene']], axis=1) | |
single_cohort_feature_importances_gene_types.index = [g[1]['symbol'] if g[1]['symbol']!='' else g[0] for g in gene_type_df_single_cohort.fillna('').iterrows()] | |
pancancer_feature_importances_gene_types.to_csv("./survival_prediction/result_feature_importance_pancancer_100_replications_sums_with_gene_types.tab", sep='\t', header=False) | |
single_cohort_feature_importances_gene_types.to_csv("./survival_prediction/result_mean_absolute_deviation_feature_importance_single_cohort_100_replications_sums_with_gene_types.tab", sep='\t', header=False) | |
# compute gene type frequencies | |
pancancer_gene_type_frequencies_all = gene_type_df_pancancer['ensembl.type_of_gene'].value_counts(normalize=True, dropna=False) | |
single_cohort_gene_type_frequencies_all = gene_type_df_single_cohort['ensembl.type_of_gene'].value_counts(normalize=True, dropna=False) | |
pancancer_gene_type_frequencies_top_100 = gene_type_df_pancancer['ensembl.type_of_gene'].iloc[:100].value_counts(normalize=True, dropna=False) | |
single_cohort_gene_type_frequencies_top_100 = gene_type_df_single_cohort['ensembl.type_of_gene'].iloc[:100].value_counts(normalize=True, dropna=False) | |
labels = ['Protein coding', 'lncRNA', 'Processed pseudogene', 'Other'] | |
# plot pie chart for different gene types (single-cohort genes) | |
frequencies = pd.Series(single_cohort_gene_type_frequencies_all[['protein_coding', 'lncRNA', 'processed_pseudogene']]) | |
frequencies['other'] = single_cohort_gene_type_frequencies_all[~single_cohort_gene_type_frequencies_all.index.isin(['protein_coding', 'lncRNA', 'processed_pseudogene'])].sum() | |
mm = 1/25.4 | |
fig, ax = plt.subplots(figsize=(120*mm, 66*mm), dpi=500) | |
ax.pie(frequencies*100, labels=labels, autopct='%1.1f%%', shadow=False) | |
ax.axis('equal') | |
plt.savefig("./survival_prediction/gene_types_single_cohort_all.pdf") | |
plt.close() | |
# plot pie chart for different gene types (pan-cancer genes) | |
frequencies = pd.Series(pancancer_gene_type_frequencies_all[['protein_coding', 'lncRNA', 'processed_pseudogene']]) | |
frequencies['other'] = pancancer_gene_type_frequencies_all[~pancancer_gene_type_frequencies_all.index.isin(['protein_coding', 'lncRNA', 'processed_pseudogene'])].sum() | |
mm = 1/25.4 | |
fig, ax = plt.subplots(figsize=(120*mm, 66*mm), dpi=500) | |
ax.pie(frequencies*100, labels=['Protein coding', 'lncRNA', 'Processed pseudogene', 'Other'], autopct='%1.1f%%', shadow=False) | |
ax.axis('equal') | |
plt.savefig("./survival_prediction/gene_types_pancancer_all.pdf") | |
plt.close() | |
# Figure 2C: code for plotting pan-cancer and single-cohort genes Venn-diagram | |
pancancer_feature_importances_ensembl = pd.read_csv("./survival_prediction/result_feature_importance_pancancer_100_replications_sums.csv", sep='\t', header=None) | |
single_cohort_feature_importances = pd.read_csv("./survival_prediction/result_mean_absolute_deviation_feature_importance_single_cohort_100_replications_sums.csv", sep=',', header=0, index_col=0) | |
mm = 1/25.4 | |
plt.figure(figsize=(66*mm, 66*mm), dpi=500) | |
v = venn2(subsets=[set(single_cohort_feature_importances.index), set(pancancer_feature_importances_ensembl.iloc[:,0])], set_labels = ('Single-cohort', 'Pan-cancer'), set_colors=("#E76BF3", "#619CFF"), alpha = 0.8) | |
v.get_patch_by_id('11').set_color('#A483F9') | |
v.get_patch_by_id('11').set_alpha(0.8) | |
v.get_label_by_id("A").set_position(v.get_circle_center(0) + [0,0.2]) | |
v.get_label_by_id("B").set_position(v.get_circle_center(1) + [-0.15,0.2]) | |
for text in v.set_labels: | |
text.set_fontsize(6) | |
for text in v.subset_labels: | |
text.set_fontsize(6) | |
plt.savefig("./survival_prediction/single_cohort_vs_pancancer_venn.pdf") | |
plt.close() | |
# Code for computing feature entropies in Figure 3B | |
import numpy as np | |
import pandas as pd | |
def entropy(feature_scores): | |
feature_scores = np.array(feature_scores) | |
feature_scores = feature_scores[feature_scores!=0] | |
entropy = - np.sum(feature_scores*np.log2(feature_scores)) | |
return entropy | |
feature_csv = pd.read_csv("./survival_prediction/result_mean_absolute_deviation_feature_importance_single_cohort_100_replications_sums.csv", index_col=0) | |
feature_matrix = feature_csv.drop(columns="Symbol").fillna(0) | |
# convert feature matrix to probability matrix by dividing by row sums | |
feature_probability_matrix = feature_matrix.divide(feature_matrix.sum(axis=1), axis=0) | |
# compute entropy for each gene | |
feature_entropies = feature_probability_matrix.apply(entropy, axis=1).sort_values(ascending=False) | |
pancancer_feature_csv = pd.read_csv("./survival_prediction/result_feature_importance_pancancer_100_replications_sums.csv", sep='\t', index_col=0, squeeze=True, header=None) | |
feature_entropies[pancancer_feature_csv.index[pancancer_feature_csv.index.isin(feature_entropies.index)]].to_csv("./survival_prediction/pancancer_feature_entropies.tab", header=False, sep='\t', na_rep="NA") | |
feature_entropies[feature_matrix.index].to_csv("./survival_prediction/single_cohort_feature_entropies.tab", header=False, sep='\t', na_rep="NA") | |
# Count how many of the pan-cancer features are also important in single-cohort training per cohort | |
import numpy as np | |
import pandas as pd | |
feature_csv = pd.read_csv("./survival_prediction/result_mean_absolute_deviation_feature_importance_single_cohort_100_replications_sums.csv", index_col=0) | |
feature_matrix = feature_csv.drop(columns="Symbol").fillna(0) | |
pancancer_feature_csv = pd.read_csv("./survival_prediction/result_feature_importance_pancancer_100_replications_sums.csv", sep='\t', index_col=0, squeeze=True, header=None) | |
per_cohort_feature_count = feature_matrix.loc[feature_matrix.index.isin(pancancer_feature_csv.index)].astype(bool).sum(axis=0) | |
per_cohort_feature_count.to_csv("./survival_prediction/pan_cancer_feature_counts_per_cohort.txt", sep='\t', header=False) | |
# Figure 4B: plot single-cohort feature importances of module genes only | |
import numpy as np | |
import pandas as pd | |
import matplotlib.pyplot as plt | |
module_genes = pd.read_csv("./survival_prediction/pancancer_correct_features_sums_netcore_p08_module_genes.tab", header=None, sep='\t', squeeze=True) | |
single_cohort_feature_importance_table = pd.read_csv("./survival_prediction/result_mean_absolute_deviation_feature_importance_single_cohort_100_replications_sums.csv", index_col=1) | |
single_cohort_feature_importance_table_module_genes = single_cohort_feature_importance_table[single_cohort_feature_importance_table.index.isin(module_genes)] | |
# create barplot for feature importances of module genes in the different cohorts | |
mm = 1/25.4 | |
plt.rc('font', size=6) | |
plt.rc('xtick', labelsize=8) | |
plt.rc('ytick', labelsize=8) | |
plt.rc('legend', fontsize=8) | |
feature_importance_sums = single_cohort_feature_importance_table_module_genes.iloc[:,1:].sum(axis=0).sort_values(ascending=False) | |
feature_importance_counts = single_cohort_feature_importance_table_module_genes.iloc[:,1:].count(axis=0) | |
plot_df = pd.concat([feature_importance_sums, feature_importance_counts], axis=1) | |
plot_df.columns = ['Sum of feature importances', 'Number of important genes'] | |
axes = plot_df.plot.bar(subplots=True, title=['', ''], figsize=(174*mm,80*mm)) | |
axes[1].set_ylim(0,70) | |
plt.tight_layout() | |
plt.savefig("./survival_prediction/single_cohorts_module_gene_feature_importances.pdf") | |
plt.close() | |
# code for plotting Venn-diagram of important features from pan-cancer model training 100 replications vs. 10 replications | |
import numpy as np | |
import pandas as pd | |
import matplotlib.pyplot as plt | |
from matplotlib_venn import venn3, venn2 | |
pancancer_feature_importances_100_reps = pd.read_csv("../survival_prediction/result_feature_importance_pancancer_100_replications_sums.csv", sep='\t', header=None, index_col=0) | |
pancancer_feature_importances_10_reps = pd.read_csv("../survival_prediction/result_feature_importance_pancancer_10_replications_sums.csv", sep='\t', header=None, index_col=0) | |
mm = 1/25.4 | |
plt.figure(figsize=(66*mm, 66*mm), dpi=500) | |
v = venn2(subsets=[set(pancancer_feature_importances_100_reps.index), set(pancancer_feature_importances_10_reps.index)], set_labels = ('100 Replications', '10 Replications'), set_colors=("#E76BF3", "#619CFF"), alpha = 0.8) | |
v.get_patch_by_id('11').set_color('#A483F9') | |
v.get_patch_by_id('11').set_alpha(0.8) | |
v.get_label_by_id("A").set_position(v.get_circle_center(0) + [0,0.2]) | |
v.get_label_by_id("B").set_position(v.get_circle_center(1) + [-0.15,0.2]) | |
for text in v.set_labels: | |
text.set_fontsize(6) | |
for text in v.subset_labels: | |
text.set_fontsize(6) | |
plt.savefig("../XGBoostSTARprotocols/survival_network/pancancer_100_vs_10_replications_venn.pdf") | |
plt.close() | |
# code for plotting Venn-diagram of important features from pan-cancer model training 100 replications vs. two different sets of 10 replications | |
import numpy as np | |
import pandas as pd | |
import matplotlib.pyplot as plt | |
from matplotlib_venn import venn3, venn2 | |
pancancer_feature_importances_100_reps = pd.read_csv("../survival_prediction/result_feature_importance_pancancer_100_replications_sums.csv", sep='\t', header=None, index_col=0) | |
pancancer_feature_importances_10_reps_1 = pd.read_csv("../survival_prediction/result_feature_importance_pancancer_10_replications_sums.csv", sep='\t', header=None, index_col=0) | |
pancancer_feature_importances_10_reps_2 = pd.read_csv("../survival_prediction/result_feature_importance_pancancer_10_replications_sums_reps_11-20.csv", sep='\t', header=None, index_col=0) | |
mm = 1/25.4 | |
plt.figure(figsize=(66*mm, 66*mm), dpi=500) | |
v = venn3(subsets=[set(pancancer_feature_importances_100_reps.index), set(pancancer_feature_importances_10_reps_1.index), set(pancancer_feature_importances_10_reps_2.index)], set_labels = ('100 Replications', '10 Replications', '10 Replications'), set_colors=("#E76BF3", "#619CFF", "#E67E22"), alpha = 0.8) | |
v.get_patch_by_id('11').set_color('#A483F9') | |
v.get_patch_by_id('11').set_alpha(0.8) | |
v.get_label_by_id("A").set_position(v.get_circle_center(0) + [0,0.2]) | |
v.get_label_by_id("B").set_position(v.get_circle_center(1) + [-0.15,0.2]) | |
v.get_label_by_id("C").set_position(v.get_circle_center(2) + [0,-0.2]) | |
for text in v.set_labels: | |
text.set_fontsize(6) | |
for text in v.subset_labels: | |
if text is not None: | |
text.set_fontsize(6) | |
plt.savefig("../XGBoostSTARprotocols/survival_network/pancancer_100_vs_10_vs_10_replications_venn.pdf") | |
plt.close() |