Skip to content
Permalink
master
Switch branches/tags

Name already in use

A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
Go to file
 
 
Cannot retrieve contributors at this time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib_venn import venn3, venn2
import json
import mygene
# code for generating feature importance files from the results of the 100 XGBoost replications
xgb_result_path = "../results_xgb_pancancer_xgb_correct_feature_selection_pancancer_no_age_gender_results" # modify this to the output path of the XGBoost survival prediction results
output_path = "../survival_prediction" # modify this to the desired output path for the feature importance files
num_reps = 100
result_feature_df = pd.DataFrame()
for i in range(1, num_reps+1):
with open("%s/xgb_measure_CI_replication_%d_result.json" % (xgb_result_path, i), 'r') as json_file:
result = json.load(json_file)
features = pd.Series(result['importance'])
features.name = "replication_%d" % i
result_feature_df = pd.concat([result_feature_df, features], axis=1)
mg = mygene.MyGeneInfo()
result_feature_df = result_feature_df.assign(sum=result_feature_df.sum(axis=1)).sort_values(by='sum', ascending=False).iloc[:, :-1]
result_feature_sums = result_feature_df.sum(axis=1)
ginfo_sums = mg.querymany(result_feature_sums.index, scopes='ensembl.gene')
# genes without defined Hugo Symbol are removed before network propagation because they can't be used
hugo_genes_sums = [g['symbol'] if 'symbol' in g.keys() else '' for g in ginfo_sums]
result_feature_sums_hugo = result_feature_sums.copy()
result_feature_sums_hugo.index = hugo_genes_sums
result_feature_sums_hugo = result_feature_sums_hugo.drop('')
result_feature_df.to_csv("%s/result_feature_importance_pancancer_%d_replications.csv" % (output_path, num_reps))
result_feature_sums.to_csv("%s/result_feature_importance_pancancer_%d_replications_sums.csv" % (output_path, num_reps), header=False, sep='\t')
# the following file is used as an input for the NetCore network propagation and module identification step
result_feature_sums_hugo.to_csv("%s/result_feature_importance_pancancer_%d_replications_sums_hugo_symbols.csv" % (output_path, num_reps), header=False, sep='\t')
# code for generating gene types of important features (including pie charts in SI1)
all_measured_genes_matrix = pd.read_csv("TCGA_data/TCGA-ACC/fpkm_table.csv", index_col=0)
pancancer_feature_importances = pd.read_csv("./survival_prediction/result_feature_importance_pancancer_100_replications_sums.csv", sep='\t', header=None, index_col=0)
single_cohort_feature_importances = pd.read_csv("./survival_prediction/result_mean_absolute_deviation_feature_importance_single_cohort_100_replications_sums.csv", sep=',', header=0, index_col=0)
all_measured_genes = all_measured_genes_matrix.index
pancancer_features = pancancer_feature_importances.index
single_cohort_features = single_cohort_feature_importances.index
mg = mygene.MyGeneInfo()
gene_type_df_pancancer = mg.querymany(pancancer_features, fields=['ensembl.type_of_gene', 'symbol'], as_dataframe=True)
gene_type_df_single_cohort = mg.querymany(single_cohort_features, fields=['ensembl.type_of_gene', 'symbol'], as_dataframe=True)
# for single cohort, remove second (duplicated) entry for ENSG00000229425
gene_type_df_single_cohort = gene_type_df_single_cohort[~gene_type_df_single_cohort.index.duplicated()]
# correct entries where a gene has multiple (identical) gene type entries
gene_type_df_all_measured_genes['ensembl.type_of_gene'] = [g[1]['ensembl.type_of_gene'] if g[1]['ensembl.type_of_gene']!='' else g[1]['ensembl'][0]['type_of_gene'] if g[1]['ensembl']!='' else '' for g in gene_type_df_all_measured_genes.fillna('').iterrows()]
gene_type_df_pancancer['ensembl.type_of_gene'] = [g[1]['ensembl.type_of_gene'] if g[1]['ensembl.type_of_gene']!='' else g[1]['ensembl'][0]['type_of_gene'] if g[1]['ensembl']!='' else '' for g in gene_type_df_pancancer.fillna('').iterrows()]
gene_type_df_single_cohort['ensembl.type_of_gene'] = [g[1]['ensembl.type_of_gene'] if g[1]['ensembl.type_of_gene']!='' else g[1]['ensembl'][0]['type_of_gene'] if g[1]['ensembl']!='' else '' for g in gene_type_df_single_cohort.fillna('').iterrows()]
# write feature importance scores including gene types to files
pancancer_feature_importances_gene_types = pd.concat([pancancer_feature_importances, gene_type_df_pancancer['ensembl.type_of_gene']], axis=1)
pancancer_feature_importances_gene_types.index = [g[1]['symbol'] if g[1]['symbol']!='' else g[0] for g in gene_type_df_pancancer.fillna('').iterrows()]
single_cohort_feature_importances_gene_types = pd.concat([single_cohort_feature_importances, gene_type_df_single_cohort['ensembl.type_of_gene']], axis=1)
single_cohort_feature_importances_gene_types.index = [g[1]['symbol'] if g[1]['symbol']!='' else g[0] for g in gene_type_df_single_cohort.fillna('').iterrows()]
pancancer_feature_importances_gene_types.to_csv("./survival_prediction/result_feature_importance_pancancer_100_replications_sums_with_gene_types.tab", sep='\t', header=False)
single_cohort_feature_importances_gene_types.to_csv("./survival_prediction/result_mean_absolute_deviation_feature_importance_single_cohort_100_replications_sums_with_gene_types.tab", sep='\t', header=False)
# compute gene type frequencies
pancancer_gene_type_frequencies_all = gene_type_df_pancancer['ensembl.type_of_gene'].value_counts(normalize=True, dropna=False)
single_cohort_gene_type_frequencies_all = gene_type_df_single_cohort['ensembl.type_of_gene'].value_counts(normalize=True, dropna=False)
pancancer_gene_type_frequencies_top_100 = gene_type_df_pancancer['ensembl.type_of_gene'].iloc[:100].value_counts(normalize=True, dropna=False)
single_cohort_gene_type_frequencies_top_100 = gene_type_df_single_cohort['ensembl.type_of_gene'].iloc[:100].value_counts(normalize=True, dropna=False)
labels = ['Protein coding', 'lncRNA', 'Processed pseudogene', 'Other']
# plot pie chart for different gene types (single-cohort genes)
frequencies = pd.Series(single_cohort_gene_type_frequencies_all[['protein_coding', 'lncRNA', 'processed_pseudogene']])
frequencies['other'] = single_cohort_gene_type_frequencies_all[~single_cohort_gene_type_frequencies_all.index.isin(['protein_coding', 'lncRNA', 'processed_pseudogene'])].sum()
mm = 1/25.4
fig, ax = plt.subplots(figsize=(120*mm, 66*mm), dpi=500)
ax.pie(frequencies*100, labels=labels, autopct='%1.1f%%', shadow=False)
ax.axis('equal')
plt.savefig("./survival_prediction/gene_types_single_cohort_all.pdf")
plt.close()
# plot pie chart for different gene types (pan-cancer genes)
frequencies = pd.Series(pancancer_gene_type_frequencies_all[['protein_coding', 'lncRNA', 'processed_pseudogene']])
frequencies['other'] = pancancer_gene_type_frequencies_all[~pancancer_gene_type_frequencies_all.index.isin(['protein_coding', 'lncRNA', 'processed_pseudogene'])].sum()
mm = 1/25.4
fig, ax = plt.subplots(figsize=(120*mm, 66*mm), dpi=500)
ax.pie(frequencies*100, labels=['Protein coding', 'lncRNA', 'Processed pseudogene', 'Other'], autopct='%1.1f%%', shadow=False)
ax.axis('equal')
plt.savefig("./survival_prediction/gene_types_pancancer_all.pdf")
plt.close()
# Figure 2C: code for plotting pan-cancer and single-cohort genes Venn-diagram
pancancer_feature_importances_ensembl = pd.read_csv("./survival_prediction/result_feature_importance_pancancer_100_replications_sums.csv", sep='\t', header=None)
single_cohort_feature_importances = pd.read_csv("./survival_prediction/result_mean_absolute_deviation_feature_importance_single_cohort_100_replications_sums.csv", sep=',', header=0, index_col=0)
mm = 1/25.4
plt.figure(figsize=(66*mm, 66*mm), dpi=500)
v = venn2(subsets=[set(single_cohort_feature_importances.index), set(pancancer_feature_importances_ensembl.iloc[:,0])], set_labels = ('Single-cohort', 'Pan-cancer'), set_colors=("#E76BF3", "#619CFF"), alpha = 0.8)
v.get_patch_by_id('11').set_color('#A483F9')
v.get_patch_by_id('11').set_alpha(0.8)
v.get_label_by_id("A").set_position(v.get_circle_center(0) + [0,0.2])
v.get_label_by_id("B").set_position(v.get_circle_center(1) + [-0.15,0.2])
for text in v.set_labels:
text.set_fontsize(6)
for text in v.subset_labels:
text.set_fontsize(6)
plt.savefig("./survival_prediction/single_cohort_vs_pancancer_venn.pdf")
plt.close()
# Code for computing feature entropies in Figure 3B
import numpy as np
import pandas as pd
def entropy(feature_scores):
feature_scores = np.array(feature_scores)
feature_scores = feature_scores[feature_scores!=0]
entropy = - np.sum(feature_scores*np.log2(feature_scores))
return entropy
feature_csv = pd.read_csv("./survival_prediction/result_mean_absolute_deviation_feature_importance_single_cohort_100_replications_sums.csv", index_col=0)
feature_matrix = feature_csv.drop(columns="Symbol").fillna(0)
# convert feature matrix to probability matrix by dividing by row sums
feature_probability_matrix = feature_matrix.divide(feature_matrix.sum(axis=1), axis=0)
# compute entropy for each gene
feature_entropies = feature_probability_matrix.apply(entropy, axis=1).sort_values(ascending=False)
pancancer_feature_csv = pd.read_csv("./survival_prediction/result_feature_importance_pancancer_100_replications_sums.csv", sep='\t', index_col=0, squeeze=True, header=None)
feature_entropies[pancancer_feature_csv.index[pancancer_feature_csv.index.isin(feature_entropies.index)]].to_csv("./survival_prediction/pancancer_feature_entropies.tab", header=False, sep='\t', na_rep="NA")
feature_entropies[feature_matrix.index].to_csv("./survival_prediction/single_cohort_feature_entropies.tab", header=False, sep='\t', na_rep="NA")
# Count how many of the pan-cancer features are also important in single-cohort training per cohort
import numpy as np
import pandas as pd
feature_csv = pd.read_csv("./survival_prediction/result_mean_absolute_deviation_feature_importance_single_cohort_100_replications_sums.csv", index_col=0)
feature_matrix = feature_csv.drop(columns="Symbol").fillna(0)
pancancer_feature_csv = pd.read_csv("./survival_prediction/result_feature_importance_pancancer_100_replications_sums.csv", sep='\t', index_col=0, squeeze=True, header=None)
per_cohort_feature_count = feature_matrix.loc[feature_matrix.index.isin(pancancer_feature_csv.index)].astype(bool).sum(axis=0)
per_cohort_feature_count.to_csv("./survival_prediction/pan_cancer_feature_counts_per_cohort.txt", sep='\t', header=False)
# Figure 4B: plot single-cohort feature importances of module genes only
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
module_genes = pd.read_csv("./survival_prediction/pancancer_correct_features_sums_netcore_p08_module_genes.tab", header=None, sep='\t', squeeze=True)
single_cohort_feature_importance_table = pd.read_csv("./survival_prediction/result_mean_absolute_deviation_feature_importance_single_cohort_100_replications_sums.csv", index_col=1)
single_cohort_feature_importance_table_module_genes = single_cohort_feature_importance_table[single_cohort_feature_importance_table.index.isin(module_genes)]
# create barplot for feature importances of module genes in the different cohorts
mm = 1/25.4
plt.rc('font', size=6)
plt.rc('xtick', labelsize=8)
plt.rc('ytick', labelsize=8)
plt.rc('legend', fontsize=8)
feature_importance_sums = single_cohort_feature_importance_table_module_genes.iloc[:,1:].sum(axis=0).sort_values(ascending=False)
feature_importance_counts = single_cohort_feature_importance_table_module_genes.iloc[:,1:].count(axis=0)
plot_df = pd.concat([feature_importance_sums, feature_importance_counts], axis=1)
plot_df.columns = ['Sum of feature importances', 'Number of important genes']
axes = plot_df.plot.bar(subplots=True, title=['', ''], figsize=(174*mm,80*mm))
axes[1].set_ylim(0,70)
plt.tight_layout()
plt.savefig("./survival_prediction/single_cohorts_module_gene_feature_importances.pdf")
plt.close()
# code for plotting Venn-diagram of important features from pan-cancer model training 100 replications vs. 10 replications
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib_venn import venn3, venn2
pancancer_feature_importances_100_reps = pd.read_csv("../survival_prediction/result_feature_importance_pancancer_100_replications_sums.csv", sep='\t', header=None, index_col=0)
pancancer_feature_importances_10_reps = pd.read_csv("../survival_prediction/result_feature_importance_pancancer_10_replications_sums.csv", sep='\t', header=None, index_col=0)
mm = 1/25.4
plt.figure(figsize=(66*mm, 66*mm), dpi=500)
v = venn2(subsets=[set(pancancer_feature_importances_100_reps.index), set(pancancer_feature_importances_10_reps.index)], set_labels = ('100 Replications', '10 Replications'), set_colors=("#E76BF3", "#619CFF"), alpha = 0.8)
v.get_patch_by_id('11').set_color('#A483F9')
v.get_patch_by_id('11').set_alpha(0.8)
v.get_label_by_id("A").set_position(v.get_circle_center(0) + [0,0.2])
v.get_label_by_id("B").set_position(v.get_circle_center(1) + [-0.15,0.2])
for text in v.set_labels:
text.set_fontsize(6)
for text in v.subset_labels:
text.set_fontsize(6)
plt.savefig("../XGBoostSTARprotocols/survival_network/pancancer_100_vs_10_replications_venn.pdf")
plt.close()
# code for plotting Venn-diagram of important features from pan-cancer model training 100 replications vs. two different sets of 10 replications
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib_venn import venn3, venn2
pancancer_feature_importances_100_reps = pd.read_csv("../survival_prediction/result_feature_importance_pancancer_100_replications_sums.csv", sep='\t', header=None, index_col=0)
pancancer_feature_importances_10_reps_1 = pd.read_csv("../survival_prediction/result_feature_importance_pancancer_10_replications_sums.csv", sep='\t', header=None, index_col=0)
pancancer_feature_importances_10_reps_2 = pd.read_csv("../survival_prediction/result_feature_importance_pancancer_10_replications_sums_reps_11-20.csv", sep='\t', header=None, index_col=0)
mm = 1/25.4
plt.figure(figsize=(66*mm, 66*mm), dpi=500)
v = venn3(subsets=[set(pancancer_feature_importances_100_reps.index), set(pancancer_feature_importances_10_reps_1.index), set(pancancer_feature_importances_10_reps_2.index)], set_labels = ('100 Replications', '10 Replications', '10 Replications'), set_colors=("#E76BF3", "#619CFF", "#E67E22"), alpha = 0.8)
v.get_patch_by_id('11').set_color('#A483F9')
v.get_patch_by_id('11').set_alpha(0.8)
v.get_label_by_id("A").set_position(v.get_circle_center(0) + [0,0.2])
v.get_label_by_id("B").set_position(v.get_circle_center(1) + [-0.15,0.2])
v.get_label_by_id("C").set_position(v.get_circle_center(2) + [0,-0.2])
for text in v.set_labels:
text.set_fontsize(6)
for text in v.subset_labels:
if text is not None:
text.set_fontsize(6)
plt.savefig("../XGBoostSTARprotocols/survival_network/pancancer_100_vs_10_vs_10_replications_venn.pdf")
plt.close()