Skip to content
Permalink
5bf0bf30e1
Switch branches/tags

Name already in use

A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
Go to file
 
 
Cannot retrieve contributors at this time
executable file 379 lines (327 sloc) 18.3 KB
'''
@author: jbayer
If a ranking file was passed to miRNA, a reduced version of the Gene Set Enrichment Analysis tool is started, analysing the enrichment of the identified MTI sets based on the ranked UniProtAccs.
With a running sum statistic, a weighted Enrichment Score (ES) is calculated for each gene set based on position dependant gene matches between the ranked list and the set.
The Leading Edge analysis additionally identifies and analyses the core genes of the gene set which mainly affect the ES.
At this, the Leading Edge analysis proceeds as follows: Depending on whether the ES of a MTI set is positive or negative, the set of Leading Edge targets either consists of the MTI set targets before
or after the peak in the running sum calculation.
Based on this, three statistics are calculated, where tags represents the ratio of leading edge targets to all targets in the given set, list starts out from all UniProtAcc in the ranking file either before
positive ES) or after (negative ES) the peak and calculates the ratio of these UniProtAccs to all existent within the file and where signal is a combination of the two previous calculations,
describing the distribution of the MTI set targets over the ranked dataset, resulting in 100% or more, if all targets can be found at the beginning of the ranked list.
To take the set sizes into account, MTI set enrichment analysis calculates in the next step the Normalized Enrichment Score (NES) for each gene set by using permutations of the dataset.
Additionally, the False Discovery Rate (FDR) q-value is calculated, representing the estimated probability of a false positive result for each set with a given NES.
INPUT:
mti_d - dictionaries with sets of targets per miRNA
rank_l - list of rows from the target ranking file
perm_num - number of permutations for normalization
p - power for weighting of ranking values
s_path - Path for MTI sets ranked file
h_path - Path for MTI LE heatmap
e_path - Path for enrichment score plots
le_path - Path for MTI set target information file
'''
import matplotlib
matplotlib.use('Agg',warn=False)
import matplotlib.pyplot as plt, numpy as np, random
from operator import itemgetter
from matplotlib.backends.backend_pdf import PdfPages
import natural_sort as ns
import create_figure
import multiprocessing as ms
from multiprocessing import Queue
def main(mti_d = {}, rank_l = [], perm_num = 1000, p = 1, s_path = '', h_path = '', e_path = '', le_path = ''):
# turn values in ranked list around: [value,target] and sort by values in reversed order #
rank_l = map(lambda line:[float(line.strip().split('\t')[1].replace(",",".")),line.strip().split('\t')[0]] if line.strip() else "not",rank_l)
while "not" in rank_l:
rank_l.remove("not")
rank_l.sort(reverse=True) # sort high to low
n = len(rank_l) # number of entries in ranked_l
# all genes from ranked list #
rank_gene_l = map(lambda entry: entry[1],rank_l)
rank_gene_l = list(set(rank_gene_l))
# gene list for permutation #
perm_gene_l = rank_gene_l
max_es_d = {} # dict of max ES per set (mir:es)
max_es_ind_d = {} # dict of max ES index per set (mir:es_ind)
nes_d = {} # dict of NES per set (mir:nes)
nes_l = []
perm_es_norm_d = {} # dict of all normalized ES for each set (mir:[])
lead_edge_d = {} # LE values per mir
lead_edge_gene_d = {} #LE genes per mir
out_str = "Geneset\tSize\tES\tNES\tFDR q-val\tRank at Max\tLeading Edge\n"
mir_l = mti_d.keys()
mir_l.sort(key=ns.natural_keys)
es_q = Queue()
proc_l = []
# calculate ES + Leading Edge and NES #
for mir in mir_l:
es_ms = ms.Process(target=get_es, args=(mir,mti_d[mir], rank_l, perm_gene_l,p, n, perm_num, es_q))
proc_l.append(es_ms)
es_ms.start()
# Get all results from multiprocess (unordered) #
result_l = []
for _ in range(0,len(mir_l)):
result_l.append(es_q.get())
for p in proc_l:
p.join()
# Sort result lists by miRNA #
result_l.sort(key=lambda x: ns.natural_keys(x[0]))
new_mti_d = {}
le_gene_file = open(le_path, "w")
le_gene_file.write("Geneset\tGene\tIndex in Ranked List\tRunning ES\tLE Member\n")
# Process results and create plots for running ES per miRNA #
for e in result_l:
mir = e[0]
new_mti_d[mir] = e[1]
max_es_d[mir] = e[2]
max_es_ind_d[mir] = e[3]
nes_d[mir] = e[4]
lead_edge_d[mir] = e[6]
## If Set passed ##
if not "No gene" in str(max_es_d[mir]):
lead_edge_gene_d[mir] = e[7]
# add nes to list #
nes_l.append(e[4])
# add list of permuted ES to dict #
perm_es_norm_d[mir] = e[5]
# Extend MTI-set-genes file #
es_genes(mir, e[8], e[10], e[2], e[3], le_gene_file)
# ES plot with all main running ESs #
es_plot_pages = PdfPages(e_path+"-"+mir+".pdf")
es_plot(mir, e[8], e[2], e[3]+1, e[9], e[10], e[11], es_plot_pages)
es_plot_pages.close()
le_gene_file.close()
# calculate FDR q-value#
FDR_mean_d, FDR_median_d = get_fdr(mir_l, nes_d,nes_l, perm_es_norm_d, perm_num)
# create LE mir overlap heatmap #
if lead_edge_gene_d:
create_figure.heatmap(lead_edge_gene_d, h_path, le=True)
else:
es_plot_pages = PdfPages(e_path)
empty_es_plot(es_plot_pages)
es_plot_pages.close()
create_figure.heatmap(mti_d, h_path, le=False)
# create output #
get_output(new_mti_d, max_es_d, max_es_ind_d, nes_d, FDR_mean_d, FDR_median_d, lead_edge_d, out_str, s_path)
def get_es(s, mti_l, rank_l, gene_l, p, n, perm_num, es_q):
n = float(n) # number of genes in ranked list
es_l = [] # new ES list for set
mir_l = [] # " "
le_gene_l = [] # leading edge gene list
perm_es_norm_l = [] # normalized permuted ES list
# define new set, if not all genes in set are in rank list #
new_set_l = set(mti_l).intersection(gene_l)
mti_l = list(new_set_l)
nh = float(len(mti_l)) # number of targets in current set
# if at least one gene is in both lists #
if nh > 0:
empty_set = False
# start enrichment analysis calculations #
hit_gene_l = map(lambda gene: gene[1] if gene[1] in mti_l else "",rank_l) # list hits (gene name) and fails (empty string) of genes in rank list for current set
hit_bin = map(lambda gene: 1 if gene[1] in mti_l else 0,rank_l) # list hits (1) and fails (0) of genes in rank list for current set
miss_bin = map(lambda gene: 0 if gene[1] in mti_l else 1,rank_l) # inverted hit_bin list -> hits = 0, fails = 1
calc_l = map(lambda item:abs(float(item[0]))**p, rank_l) # list with weighted ranking values (val^p)
nr = abs(sum([hit_bin[i]*calc_l[i] for i in range(0,len(calc_l))])) # absolute (positive) sum of ranking values of genes in current set
norm_hit = 1.0/nr
norm_miss = 1.0/(n-nh)
sumes = 0
rES_l = []
for i in range(0,len(calc_l)):
res = ((hit_bin[i]*calc_l[i]*norm_hit)-(miss_bin[i]*norm_miss))+sumes
rES_l.append(res)
sumes = res
es_l = rES_l
mir_l = hit_bin
## get real ES (max/min) ##
max_es = max(es_l)
max_es_ind = es_l.index(max_es) # first occurring max
min_es = min(es_l)
min_es_ind = es_l.index(min_es) # first occurring min
# min or max ES as max ES #
if max_es > -min_es:
max_es = max_es
max_es_ind = max_es_ind
else:
max_es = min_es
max_es_ind = min_es_ind
# no gene overlaps with rank list #
else:
# set is empty #
empty_set = True
## further processes in dependence of the tool's state ##
# function was called for main calculation #
if perm_num:
if not empty_set:
## Leading Edge values and genes ##
if max_es >= 0:
tag_frac = sum(hit_bin[:max_es_ind+1])/nh
gene_frac = (max_es_ind+1)/n
le_gene_l = hit_gene_l[:max_es_ind+1]
else:
tag_frac = sum(hit_bin[max_es_ind:])/nh
gene_frac = (n-(max_es_ind+1)+1)/n
le_gene_l = hit_gene_l[max_es_ind:]
signal = tag_frac * (1-gene_frac) * (n/(n-nh)) # gives different results than GSEA, but has the official formular -.-
le_gene_l = list(set(le_gene_l))
if "" in le_gene_l:
le_gene_l.remove("")
## calculate NES by permute gene sets #
nes, perm_es_norm_l = get_nes(max_es, int(nh), rank_l, gene_l, perm_num, p, n)
es_q.put([s,mti_l, max_es, max_es_ind, nes, perm_es_norm_l, [tag_frac, gene_frac, signal], le_gene_l,es_l, mir_l, hit_gene_l, n])
else:
es_q.put([s,[],"No gene in rank list.", "NaN","NaN", perm_es_norm_l, ["NaN","NaN","NaN"], []])
# function was called for permuted calculation - just return max_es #
else:
return max_es
def get_nes(max_es, nh, rank_l, gene_l, perm_num, p, n):
''' permutations per set '''
perm_es_l = [] # list of all permuted escores
perm_pos_es_l = [] # list of all permuted positive escores
perm_neg_es_l = [] # list of all permuted negative escores
for _ in range(0,perm_num):
tmp_mti_l = random.sample(gene_l,nh) # randomly fill sets with same number of targets
# new mti_dict done #
tmp_es = get_es('x', tmp_mti_l, rank_l, gene_l, p, n,False, None)
perm_es_l.append(tmp_es)
if tmp_es >= 0:
perm_pos_es_l.append(tmp_es)
else:
perm_neg_es_l.append(tmp_es)
# get mean of p_es for normalization #
if len(perm_pos_es_l) > 0:
pos_mean = sum(perm_pos_es_l)/len(perm_pos_es_l)
else: pos_mean = False
if len(perm_neg_es_l) > 0:
neg_mean = abs(sum(perm_neg_es_l)/len(perm_neg_es_l))
else: neg_mean = False
# normalize all ES scores (permuted, real, max) #
perm_es_norm_l = normalize(perm_es_l, pos_mean, neg_mean) # list of all normalized permuted escores
max_es_norm = normalize([max_es], pos_mean, neg_mean)[0] # normalized max escore = NES
nes = max_es_norm
return nes, perm_es_norm_l
def get_fdr(mir_l, nes_d, nes_l, perm_es_norm_d, perm_num):
FDR_mean = {}
FDR_median = {}
nes_col = list(nes_l)
for mir in mir_l:
nes = nes_d[mir]
perm_col_sum_l = []
nes_col_sum_l = []
if not nes == "NaN":
for i in range(0,perm_num):
perm_col = map(lambda es_l:es_l[i],perm_es_norm_d.values()) # all perm ES over all sets from index i
if nes >= 0:
perm_col_sum = float(sum(map(lambda es: 1 if es >= 0 else 0,perm_col))) # sum up all positives (permuted col)
nes_col_sum = float(sum(map(lambda es: 1 if es >= 0 else 0,nes_col))) # " " (real col)
perm_col_sum_l.append(sum(map(lambda es: 1 if es >= nes else 0, perm_col))/float(perm_col_sum) if perm_col_sum > 0 else 0)# list of all perm_val where colsum > 0
# sum up all above NES (permuted col) and normalize with colsum
nes_col_sum_l.append(sum(map(lambda es: 1 if es >= nes else 0, nes_col))/float(nes_col_sum) if nes_col_sum > 0 else 0) # " " nes_val " "
# " " (real col) " " "
else:
perm_col_sum = sum(map(lambda es: 1 if es < 0 else 0,perm_col)) # number of all negatives (permuted col)
nes_col_sum = sum(map(lambda es: 1 if es < 0 else 0,nes_col)) # " " (real col)
perm_col_sum_l.append(sum(map(lambda es: 1 if es <= nes else 0, perm_col))/float(perm_col_sum) if perm_col_sum > 0 else 0)# list of all perm_val where colsum > 0:
# sum up all below NES (permuted col) and normalize with colsum
nes_col_sum_l.append(sum(map(lambda es: 1 if es <= nes else 0, nes_col))/float(nes_col_sum) if nes_col_sum > 0 else 0) # # " " nes_val " "
# " " (real col) " "
perm_norm_mean = np.mean(perm_col_sum_l)
nes_norm_mean = np.mean(nes_col_sum_l)
perm_norm_median = np.median(perm_col_sum_l)
nes_norm_median = np.median(nes_col_sum_l)
FDR_mean[mir] = perm_norm_mean/nes_norm_mean if perm_norm_mean/nes_norm_mean < 1 else 1 # permutated proportional to real
FDR_median[mir] = perm_norm_median/nes_norm_median if perm_norm_median/nes_norm_median < 1 else 1 # " "
else:
FDR_mean[mir], FDR_median[mir] = "NaN", "NaN"
return FDR_mean, FDR_median
def get_output(mti_d, max_es_d, max_es_ind_d, nes_d, FDR_mean_d, FDR_median_d, lead_edge_d, out_str, outPath):
full_val_l = []
# round floating point numbers #
for mir in mti_d:
size = len(mti_d[mir])
# round floating point numbers - if they are floats #
formatted_nes = format(nes_d[mir],".2f") if not nes_d[mir]=="NaN" else nes_d[mir]
formatted_fdr = format(FDR_mean_d[mir],".3f") if not FDR_mean_d[mir]=="NaN" else FDR_mean_d[mir]
formatted_es = format(max_es_d[mir],".2f") if not type(max_es_d[mir])==str else max_es_d[mir]
max_es_ind = max_es_ind_d[mir]+1 if type(max_es_ind_d[mir])==int else max_es_ind_d[mir]
# list of all values for output #
full_val_l.append([mir, size, formatted_es, formatted_nes, formatted_fdr, max_es_ind, lead_edge_d[mir]])
full_val_l.sort(key=itemgetter(3), reverse=True) # sort values by NES (greater first)
ind_le = 6 # index where leading edge values are in list
epi_str = "" # epilog string for output ("No gene in rank list" MTIs)
for mset in full_val_l:
mset_str = map(lambda it: str(it),mset[:ind_le]) # all values except for leading edge values
tag_str = str(int(round(100*mset[ind_le][0]))) if not mset[ind_le][0]=="NaN" else mset[ind_le][0]
list_str = str(int(round(100*mset[ind_le][1]))) if not mset[ind_le][1]=="NaN" else mset[ind_le][1]
sig_str = str(int(round(100*mset[ind_le][2]))) if not mset[ind_le][2]=="NaN" else mset[ind_le][2]
le_str = "tags="+tag_str+"%"+",list="+list_str+"%"+",signal="+sig_str+"%" # leading edge values
if "No gene" in mset_str[2]:
# empty sets at end of list #
epi_str += "\t".join(mset_str)+"\t"+le_str+"\n"
else:
out_str += "\t".join(mset_str)+"\t"+le_str+"\n" # prolog string for output
out_str += epi_str
ofile = open(outPath,"w")
ofile.write(out_str)
ofile.close()
def es_genes(mir, es_l, tar_l, max_es, max_es_ind, gene_file):
'''Extends the MTI-Set-Genes list with a row for each gene in the MTI-Set.
Columns are: "MTI-Set Target Index in ranked list Running ES LE Member" '''
for tar in tar_l:
if tar:
index = tar_l.index(tar) # index of gene in ranked list
if max_es >= 0: # positive max ES - all genes before and at index
if index <= max_es_ind: le = "Yes"
else: le = "No"
else: # negative max ES - all genes at and after index
if index >= max_es_ind: le = "Yes"
else: le = "No"
gene_file.write("\t".join([mir, tar, str(index+1), str(format(es_l[index],".2f")), le])+"\n")
def es_plot(s, es_l, max_es, max_es_ind, mir_l, tar_l, n, es_plot_pages):
''' Plots running ES for each index of ranking list and target occurrence per MTI-set '''
ticks_tar = np.arange(1,n+1,1) # target occurrence ticks - rearranged so they are centered to ES ticks
ticks_es = np.arange(1,n+1) # ES ticks (startig from index 1)
# One plot per MTI-set #
fig = plt.figure(figsize=(8,3))
## targets ##
plt.subplot2grid((3,1),(0,0)) # subplot: 3rows,1col - r0,c0
plt.vlines(ticks_tar, [0], mir_l)
plt.xlim(xmin=0.5,xmax=n+0.5) # x-axis from 0.5 - num of items + 0.5
plt.tick_params(axis='both',bottom='off',labelbottom='off',left='off',
labelleft='off',right='off',top='off') # turns of all axis ticks and labels
plt.title("Enrichment Plot: "+s)#, y= fig_height,fontsize=font_s)
## ES ##
ymin=min(es_l)-0.1
ymax=max(es_l)+0.1
plt.subplot2grid((3,1), (1,0),rowspan=2) # subplot: 3rows,1column - starts from r0c0 and spans over 2 rows (1:3)
plt.axhline(0,0,n+0.5,linestyle='--',color='grey') # horizontal line through zero
plt.plot((max_es_ind,max_es_ind),(ymin,ymax),'r-',ticks_es,tuple(es_l),'b-') # color points per data point and line through data points
plt.xlim(xmin=0.5,xmax=n+0.5)
plt.xlabel("Rank in Ordered Dataset")
plt.ylabel("Enrichment Score (ES)")
plt.ylim(ymin=ymin, ymax=ymax) # reset xlimits
plt.subplots_adjust(hspace=0) # no space between subplots
es_plot_pages.savefig(fig, bbox_inches='tight')
plt.close()
def empty_es_plot(es_plot_pages):
fig = plt.figure(figsize=(3,3))
plt.tick_params(axis='both',bottom='off',labelbottom='off',left='off',
labelleft='off',right='off',top='off') # turns of all axis ticks and labels
plt.text(0.5,0.5,"No gene in rank list.", ha="center")
es_plot_pages.savefig(fig)
plt.close()
def normalize(v_list, pos_m, neg_m):
norm_list = []
for es in v_list:
if es >= 0:
if pos_m:
norm_list.append(es/pos_m)
else:
norm_list.append("NaN")
else:
if neg_m:
norm_list.append(es/neg_m)
else:
norm_list.append("NaN")
return norm_list
if __name__ == '__main__':
main()