Permalink
Cannot retrieve contributors at this time
Name already in use
A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
admire/src/enrichAnalysis.py
Go to fileThis commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
executable file
379 lines (327 sloc)
18.3 KB
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
''' | |
@author: jbayer | |
If a ranking file was passed to miRNA, a reduced version of the Gene Set Enrichment Analysis tool is started, analysing the enrichment of the identified MTI sets based on the ranked UniProtAccs. | |
With a running sum statistic, a weighted Enrichment Score (ES) is calculated for each gene set based on position dependant gene matches between the ranked list and the set. | |
The Leading Edge analysis additionally identifies and analyses the core genes of the gene set which mainly affect the ES. | |
At this, the Leading Edge analysis proceeds as follows: Depending on whether the ES of a MTI set is positive or negative, the set of Leading Edge targets either consists of the MTI set targets before | |
or after the peak in the running sum calculation. | |
Based on this, three statistics are calculated, where tags represents the ratio of leading edge targets to all targets in the given set, list starts out from all UniProtAcc in the ranking file either before | |
positive ES) or after (negative ES) the peak and calculates the ratio of these UniProtAccs to all existent within the file and where signal is a combination of the two previous calculations, | |
describing the distribution of the MTI set targets over the ranked dataset, resulting in 100% or more, if all targets can be found at the beginning of the ranked list. | |
To take the set sizes into account, MTI set enrichment analysis calculates in the next step the Normalized Enrichment Score (NES) for each gene set by using permutations of the dataset. | |
Additionally, the False Discovery Rate (FDR) q-value is calculated, representing the estimated probability of a false positive result for each set with a given NES. | |
INPUT: | |
mti_d - dictionaries with sets of targets per miRNA | |
rank_l - list of rows from the target ranking file | |
perm_num - number of permutations for normalization | |
p - power for weighting of ranking values | |
s_path - Path for MTI sets ranked file | |
h_path - Path for MTI LE heatmap | |
e_path - Path for enrichment score plots | |
le_path - Path for MTI set target information file | |
''' | |
import matplotlib | |
matplotlib.use('Agg',warn=False) | |
import matplotlib.pyplot as plt, numpy as np, random | |
from operator import itemgetter | |
from matplotlib.backends.backend_pdf import PdfPages | |
import natural_sort as ns | |
import create_figure | |
import multiprocessing as ms | |
from multiprocessing import Queue | |
def main(mti_d = {}, rank_l = [], perm_num = 1000, p = 1, s_path = '', h_path = '', e_path = '', le_path = ''): | |
# turn values in ranked list around: [value,target] and sort by values in reversed order # | |
rank_l = map(lambda line:[float(line.strip().split('\t')[1].replace(",",".")),line.strip().split('\t')[0]] if line.strip() else "not",rank_l) | |
while "not" in rank_l: | |
rank_l.remove("not") | |
rank_l.sort(reverse=True) # sort high to low | |
n = len(rank_l) # number of entries in ranked_l | |
# all genes from ranked list # | |
rank_gene_l = map(lambda entry: entry[1],rank_l) | |
rank_gene_l = list(set(rank_gene_l)) | |
# gene list for permutation # | |
perm_gene_l = rank_gene_l | |
max_es_d = {} # dict of max ES per set (mir:es) | |
max_es_ind_d = {} # dict of max ES index per set (mir:es_ind) | |
nes_d = {} # dict of NES per set (mir:nes) | |
nes_l = [] | |
perm_es_norm_d = {} # dict of all normalized ES for each set (mir:[]) | |
lead_edge_d = {} # LE values per mir | |
lead_edge_gene_d = {} #LE genes per mir | |
out_str = "Geneset\tSize\tES\tNES\tFDR q-val\tRank at Max\tLeading Edge\n" | |
mir_l = mti_d.keys() | |
mir_l.sort(key=ns.natural_keys) | |
es_q = Queue() | |
proc_l = [] | |
# calculate ES + Leading Edge and NES # | |
for mir in mir_l: | |
es_ms = ms.Process(target=get_es, args=(mir,mti_d[mir], rank_l, perm_gene_l,p, n, perm_num, es_q)) | |
proc_l.append(es_ms) | |
es_ms.start() | |
# Get all results from multiprocess (unordered) # | |
result_l = [] | |
for _ in range(0,len(mir_l)): | |
result_l.append(es_q.get()) | |
for p in proc_l: | |
p.join() | |
# Sort result lists by miRNA # | |
result_l.sort(key=lambda x: ns.natural_keys(x[0])) | |
new_mti_d = {} | |
le_gene_file = open(le_path, "w") | |
le_gene_file.write("Geneset\tGene\tIndex in Ranked List\tRunning ES\tLE Member\n") | |
# Process results and create plots for running ES per miRNA # | |
for e in result_l: | |
mir = e[0] | |
new_mti_d[mir] = e[1] | |
max_es_d[mir] = e[2] | |
max_es_ind_d[mir] = e[3] | |
nes_d[mir] = e[4] | |
lead_edge_d[mir] = e[6] | |
## If Set passed ## | |
if not "No gene" in str(max_es_d[mir]): | |
lead_edge_gene_d[mir] = e[7] | |
# add nes to list # | |
nes_l.append(e[4]) | |
# add list of permuted ES to dict # | |
perm_es_norm_d[mir] = e[5] | |
# Extend MTI-set-genes file # | |
es_genes(mir, e[8], e[10], e[2], e[3], le_gene_file) | |
# ES plot with all main running ESs # | |
es_plot_pages = PdfPages(e_path+"-"+mir+".pdf") | |
es_plot(mir, e[8], e[2], e[3]+1, e[9], e[10], e[11], es_plot_pages) | |
es_plot_pages.close() | |
le_gene_file.close() | |
# calculate FDR q-value# | |
FDR_mean_d, FDR_median_d = get_fdr(mir_l, nes_d,nes_l, perm_es_norm_d, perm_num) | |
# create LE mir overlap heatmap # | |
if lead_edge_gene_d: | |
create_figure.heatmap(lead_edge_gene_d, h_path, le=True) | |
else: | |
es_plot_pages = PdfPages(e_path) | |
empty_es_plot(es_plot_pages) | |
es_plot_pages.close() | |
create_figure.heatmap(mti_d, h_path, le=False) | |
# create output # | |
get_output(new_mti_d, max_es_d, max_es_ind_d, nes_d, FDR_mean_d, FDR_median_d, lead_edge_d, out_str, s_path) | |
def get_es(s, mti_l, rank_l, gene_l, p, n, perm_num, es_q): | |
n = float(n) # number of genes in ranked list | |
es_l = [] # new ES list for set | |
mir_l = [] # " " | |
le_gene_l = [] # leading edge gene list | |
perm_es_norm_l = [] # normalized permuted ES list | |
# define new set, if not all genes in set are in rank list # | |
new_set_l = set(mti_l).intersection(gene_l) | |
mti_l = list(new_set_l) | |
nh = float(len(mti_l)) # number of targets in current set | |
# if at least one gene is in both lists # | |
if nh > 0: | |
empty_set = False | |
# start enrichment analysis calculations # | |
hit_gene_l = map(lambda gene: gene[1] if gene[1] in mti_l else "",rank_l) # list hits (gene name) and fails (empty string) of genes in rank list for current set | |
hit_bin = map(lambda gene: 1 if gene[1] in mti_l else 0,rank_l) # list hits (1) and fails (0) of genes in rank list for current set | |
miss_bin = map(lambda gene: 0 if gene[1] in mti_l else 1,rank_l) # inverted hit_bin list -> hits = 0, fails = 1 | |
calc_l = map(lambda item:abs(float(item[0]))**p, rank_l) # list with weighted ranking values (val^p) | |
nr = abs(sum([hit_bin[i]*calc_l[i] for i in range(0,len(calc_l))])) # absolute (positive) sum of ranking values of genes in current set | |
norm_hit = 1.0/nr | |
norm_miss = 1.0/(n-nh) | |
sumes = 0 | |
rES_l = [] | |
for i in range(0,len(calc_l)): | |
res = ((hit_bin[i]*calc_l[i]*norm_hit)-(miss_bin[i]*norm_miss))+sumes | |
rES_l.append(res) | |
sumes = res | |
es_l = rES_l | |
mir_l = hit_bin | |
## get real ES (max/min) ## | |
max_es = max(es_l) | |
max_es_ind = es_l.index(max_es) # first occurring max | |
min_es = min(es_l) | |
min_es_ind = es_l.index(min_es) # first occurring min | |
# min or max ES as max ES # | |
if max_es > -min_es: | |
max_es = max_es | |
max_es_ind = max_es_ind | |
else: | |
max_es = min_es | |
max_es_ind = min_es_ind | |
# no gene overlaps with rank list # | |
else: | |
# set is empty # | |
empty_set = True | |
## further processes in dependence of the tool's state ## | |
# function was called for main calculation # | |
if perm_num: | |
if not empty_set: | |
## Leading Edge values and genes ## | |
if max_es >= 0: | |
tag_frac = sum(hit_bin[:max_es_ind+1])/nh | |
gene_frac = (max_es_ind+1)/n | |
le_gene_l = hit_gene_l[:max_es_ind+1] | |
else: | |
tag_frac = sum(hit_bin[max_es_ind:])/nh | |
gene_frac = (n-(max_es_ind+1)+1)/n | |
le_gene_l = hit_gene_l[max_es_ind:] | |
signal = tag_frac * (1-gene_frac) * (n/(n-nh)) # gives different results than GSEA, but has the official formular -.- | |
le_gene_l = list(set(le_gene_l)) | |
if "" in le_gene_l: | |
le_gene_l.remove("") | |
## calculate NES by permute gene sets # | |
nes, perm_es_norm_l = get_nes(max_es, int(nh), rank_l, gene_l, perm_num, p, n) | |
es_q.put([s,mti_l, max_es, max_es_ind, nes, perm_es_norm_l, [tag_frac, gene_frac, signal], le_gene_l,es_l, mir_l, hit_gene_l, n]) | |
else: | |
es_q.put([s,[],"No gene in rank list.", "NaN","NaN", perm_es_norm_l, ["NaN","NaN","NaN"], []]) | |
# function was called for permuted calculation - just return max_es # | |
else: | |
return max_es | |
def get_nes(max_es, nh, rank_l, gene_l, perm_num, p, n): | |
''' permutations per set ''' | |
perm_es_l = [] # list of all permuted escores | |
perm_pos_es_l = [] # list of all permuted positive escores | |
perm_neg_es_l = [] # list of all permuted negative escores | |
for _ in range(0,perm_num): | |
tmp_mti_l = random.sample(gene_l,nh) # randomly fill sets with same number of targets | |
# new mti_dict done # | |
tmp_es = get_es('x', tmp_mti_l, rank_l, gene_l, p, n,False, None) | |
perm_es_l.append(tmp_es) | |
if tmp_es >= 0: | |
perm_pos_es_l.append(tmp_es) | |
else: | |
perm_neg_es_l.append(tmp_es) | |
# get mean of p_es for normalization # | |
if len(perm_pos_es_l) > 0: | |
pos_mean = sum(perm_pos_es_l)/len(perm_pos_es_l) | |
else: pos_mean = False | |
if len(perm_neg_es_l) > 0: | |
neg_mean = abs(sum(perm_neg_es_l)/len(perm_neg_es_l)) | |
else: neg_mean = False | |
# normalize all ES scores (permuted, real, max) # | |
perm_es_norm_l = normalize(perm_es_l, pos_mean, neg_mean) # list of all normalized permuted escores | |
max_es_norm = normalize([max_es], pos_mean, neg_mean)[0] # normalized max escore = NES | |
nes = max_es_norm | |
return nes, perm_es_norm_l | |
def get_fdr(mir_l, nes_d, nes_l, perm_es_norm_d, perm_num): | |
FDR_mean = {} | |
FDR_median = {} | |
nes_col = list(nes_l) | |
for mir in mir_l: | |
nes = nes_d[mir] | |
perm_col_sum_l = [] | |
nes_col_sum_l = [] | |
if not nes == "NaN": | |
for i in range(0,perm_num): | |
perm_col = map(lambda es_l:es_l[i],perm_es_norm_d.values()) # all perm ES over all sets from index i | |
if nes >= 0: | |
perm_col_sum = float(sum(map(lambda es: 1 if es >= 0 else 0,perm_col))) # sum up all positives (permuted col) | |
nes_col_sum = float(sum(map(lambda es: 1 if es >= 0 else 0,nes_col))) # " " (real col) | |
perm_col_sum_l.append(sum(map(lambda es: 1 if es >= nes else 0, perm_col))/float(perm_col_sum) if perm_col_sum > 0 else 0)# list of all perm_val where colsum > 0 | |
# sum up all above NES (permuted col) and normalize with colsum | |
nes_col_sum_l.append(sum(map(lambda es: 1 if es >= nes else 0, nes_col))/float(nes_col_sum) if nes_col_sum > 0 else 0) # " " nes_val " " | |
# " " (real col) " " " | |
else: | |
perm_col_sum = sum(map(lambda es: 1 if es < 0 else 0,perm_col)) # number of all negatives (permuted col) | |
nes_col_sum = sum(map(lambda es: 1 if es < 0 else 0,nes_col)) # " " (real col) | |
perm_col_sum_l.append(sum(map(lambda es: 1 if es <= nes else 0, perm_col))/float(perm_col_sum) if perm_col_sum > 0 else 0)# list of all perm_val where colsum > 0: | |
# sum up all below NES (permuted col) and normalize with colsum | |
nes_col_sum_l.append(sum(map(lambda es: 1 if es <= nes else 0, nes_col))/float(nes_col_sum) if nes_col_sum > 0 else 0) # # " " nes_val " " | |
# " " (real col) " " | |
perm_norm_mean = np.mean(perm_col_sum_l) | |
nes_norm_mean = np.mean(nes_col_sum_l) | |
perm_norm_median = np.median(perm_col_sum_l) | |
nes_norm_median = np.median(nes_col_sum_l) | |
FDR_mean[mir] = perm_norm_mean/nes_norm_mean if perm_norm_mean/nes_norm_mean < 1 else 1 # permutated proportional to real | |
FDR_median[mir] = perm_norm_median/nes_norm_median if perm_norm_median/nes_norm_median < 1 else 1 # " " | |
else: | |
FDR_mean[mir], FDR_median[mir] = "NaN", "NaN" | |
return FDR_mean, FDR_median | |
def get_output(mti_d, max_es_d, max_es_ind_d, nes_d, FDR_mean_d, FDR_median_d, lead_edge_d, out_str, outPath): | |
full_val_l = [] | |
# round floating point numbers # | |
for mir in mti_d: | |
size = len(mti_d[mir]) | |
# round floating point numbers - if they are floats # | |
formatted_nes = format(nes_d[mir],".2f") if not nes_d[mir]=="NaN" else nes_d[mir] | |
formatted_fdr = format(FDR_mean_d[mir],".3f") if not FDR_mean_d[mir]=="NaN" else FDR_mean_d[mir] | |
formatted_es = format(max_es_d[mir],".2f") if not type(max_es_d[mir])==str else max_es_d[mir] | |
max_es_ind = max_es_ind_d[mir]+1 if type(max_es_ind_d[mir])==int else max_es_ind_d[mir] | |
# list of all values for output # | |
full_val_l.append([mir, size, formatted_es, formatted_nes, formatted_fdr, max_es_ind, lead_edge_d[mir]]) | |
full_val_l.sort(key=itemgetter(3), reverse=True) # sort values by NES (greater first) | |
ind_le = 6 # index where leading edge values are in list | |
epi_str = "" # epilog string for output ("No gene in rank list" MTIs) | |
for mset in full_val_l: | |
mset_str = map(lambda it: str(it),mset[:ind_le]) # all values except for leading edge values | |
tag_str = str(int(round(100*mset[ind_le][0]))) if not mset[ind_le][0]=="NaN" else mset[ind_le][0] | |
list_str = str(int(round(100*mset[ind_le][1]))) if not mset[ind_le][1]=="NaN" else mset[ind_le][1] | |
sig_str = str(int(round(100*mset[ind_le][2]))) if not mset[ind_le][2]=="NaN" else mset[ind_le][2] | |
le_str = "tags="+tag_str+"%"+",list="+list_str+"%"+",signal="+sig_str+"%" # leading edge values | |
if "No gene" in mset_str[2]: | |
# empty sets at end of list # | |
epi_str += "\t".join(mset_str)+"\t"+le_str+"\n" | |
else: | |
out_str += "\t".join(mset_str)+"\t"+le_str+"\n" # prolog string for output | |
out_str += epi_str | |
ofile = open(outPath,"w") | |
ofile.write(out_str) | |
ofile.close() | |
def es_genes(mir, es_l, tar_l, max_es, max_es_ind, gene_file): | |
'''Extends the MTI-Set-Genes list with a row for each gene in the MTI-Set. | |
Columns are: "MTI-Set Target Index in ranked list Running ES LE Member" ''' | |
for tar in tar_l: | |
if tar: | |
index = tar_l.index(tar) # index of gene in ranked list | |
if max_es >= 0: # positive max ES - all genes before and at index | |
if index <= max_es_ind: le = "Yes" | |
else: le = "No" | |
else: # negative max ES - all genes at and after index | |
if index >= max_es_ind: le = "Yes" | |
else: le = "No" | |
gene_file.write("\t".join([mir, tar, str(index+1), str(format(es_l[index],".2f")), le])+"\n") | |
def es_plot(s, es_l, max_es, max_es_ind, mir_l, tar_l, n, es_plot_pages): | |
''' Plots running ES for each index of ranking list and target occurrence per MTI-set ''' | |
ticks_tar = np.arange(1,n+1,1) # target occurrence ticks - rearranged so they are centered to ES ticks | |
ticks_es = np.arange(1,n+1) # ES ticks (startig from index 1) | |
# One plot per MTI-set # | |
fig = plt.figure(figsize=(8,3)) | |
## targets ## | |
plt.subplot2grid((3,1),(0,0)) # subplot: 3rows,1col - r0,c0 | |
plt.vlines(ticks_tar, [0], mir_l) | |
plt.xlim(xmin=0.5,xmax=n+0.5) # x-axis from 0.5 - num of items + 0.5 | |
plt.tick_params(axis='both',bottom='off',labelbottom='off',left='off', | |
labelleft='off',right='off',top='off') # turns of all axis ticks and labels | |
plt.title("Enrichment Plot: "+s)#, y= fig_height,fontsize=font_s) | |
## ES ## | |
ymin=min(es_l)-0.1 | |
ymax=max(es_l)+0.1 | |
plt.subplot2grid((3,1), (1,0),rowspan=2) # subplot: 3rows,1column - starts from r0c0 and spans over 2 rows (1:3) | |
plt.axhline(0,0,n+0.5,linestyle='--',color='grey') # horizontal line through zero | |
plt.plot((max_es_ind,max_es_ind),(ymin,ymax),'r-',ticks_es,tuple(es_l),'b-') # color points per data point and line through data points | |
plt.xlim(xmin=0.5,xmax=n+0.5) | |
plt.xlabel("Rank in Ordered Dataset") | |
plt.ylabel("Enrichment Score (ES)") | |
plt.ylim(ymin=ymin, ymax=ymax) # reset xlimits | |
plt.subplots_adjust(hspace=0) # no space between subplots | |
es_plot_pages.savefig(fig, bbox_inches='tight') | |
plt.close() | |
def empty_es_plot(es_plot_pages): | |
fig = plt.figure(figsize=(3,3)) | |
plt.tick_params(axis='both',bottom='off',labelbottom='off',left='off', | |
labelleft='off',right='off',top='off') # turns of all axis ticks and labels | |
plt.text(0.5,0.5,"No gene in rank list.", ha="center") | |
es_plot_pages.savefig(fig) | |
plt.close() | |
def normalize(v_list, pos_m, neg_m): | |
norm_list = [] | |
for es in v_list: | |
if es >= 0: | |
if pos_m: | |
norm_list.append(es/pos_m) | |
else: | |
norm_list.append("NaN") | |
else: | |
if neg_m: | |
norm_list.append(es/neg_m) | |
else: | |
norm_list.append("NaN") | |
return norm_list | |
if __name__ == '__main__': | |
main() |