enrichAnalysis.py

'''
@author: jbayer

If a ranking file was passed to miRNA, a reduced version of the Gene Set Enrichment Analysis tool is started, analysing the enrichment of the identified MTI sets based on the ranked UniProtAccs.
With a running sum statistic, a weighted Enrichment Score (ES) is calculated for each gene set based on position dependant gene matches between the ranked list and the set.
The Leading Edge analysis additionally identifies and analyses the core genes of the gene set which mainly affect the ES.
At this, the Leading Edge analysis proceeds as follows: Depending on whether the ES of a MTI set is positive or negative, the set of Leading Edge targets either consists of the MTI set targets before
or after the peak in the running sum calculation.
Based on this, three statistics are calculated, where tags represents the ratio of leading edge targets to all targets in the given set, list starts out from all UniProtAcc in the ranking file either before
positive ES) or after (negative ES) the peak and calculates the ratio of these UniProtAccs to all existent within the file and where signal is a combination of the two previous calculations,
describing the distribution of the MTI set targets over the ranked dataset, resulting in 100% or more, if all targets can be found at the beginning of the ranked list.
To take the set sizes into account, MTI set enrichment analysis calculates in the next step the Normalized Enrichment Score (NES) for each gene set by using permutations of the dataset.
Additionally, the False Discovery Rate (FDR) q-value is calculated, representing the estimated probability of a false positive result for each set with a given NES.

INPUT:

mti_d - dictionaries with sets of targets per miRNA
rank_l - list of rows from the target ranking file
perm_num - number of permutations for normalization
p - power for weighting of ranking values
s_path - Path for MTI sets ranked file
h_path - Path for MTI LE heatmap
e_path - Path for enrichment score plots
le_path - Path for MTI set target information file
'''
import matplotlib
matplotlib.use('Agg',warn=False)
import matplotlib.pyplot as plt, numpy as np, random
from operator import itemgetter
from matplotlib.backends.backend_pdf import PdfPages
import natural_sort as ns
import create_figure
import multiprocessing as ms
from multiprocessing import Queue

def main(mti_d = {}, rank_l = [], perm_num = 1000, p = 1, s_path = '', h_path = '', e_path = '', le_path = ''):

# turn values in ranked list around: [value,target] and sort by values in reversed order #
    rank_l = map(lambda line:[float(line.strip().split('\t')[1].replace(",",".")),line.strip().split('\t')[0]] if line.strip() else "not",rank_l)
    while "not" in rank_l:
        rank_l.remove("not")
    rank_l.sort(reverse=True) # sort high to low
    n = len(rank_l)           # number of entries in ranked_l

# all genes from ranked list #
    rank_gene_l = map(lambda entry: entry[1],rank_l)
    rank_gene_l = list(set(rank_gene_l))
# gene list for permutation #
    perm_gene_l = rank_gene_l

    max_es_d = {}       # dict of max ES per set (mir:es)
    max_es_ind_d = {}   # dict of max ES index per set (mir:es_ind)
    nes_d = {}          # dict of NES per set (mir:nes)
    nes_l = []
    perm_es_norm_d = {} # dict of all normalized ES for each set (mir:[])
    lead_edge_d = {}    # LE values per mir
    lead_edge_gene_d = {} #LE genes per mir

    out_str = "Geneset\tSize\tES\tNES\tFDR q-val\tRank at Max\tLeading Edge\n"

    mir_l = mti_d.keys()
    mir_l.sort(key=ns.natural_keys)
    es_q = Queue()
    proc_l = []

# calculate ES + Leading Edge and NES #
    for mir in mir_l:
        es_ms = ms.Process(target=get_es, args=(mir,mti_d[mir], rank_l, perm_gene_l,p, n, perm_num, es_q))
        proc_l.append(es_ms)
        es_ms.start()

# Get all results from multiprocess (unordered) #
    result_l = []
    for _ in range(0,len(mir_l)):
        result_l.append(es_q.get())
    for p in proc_l:
        p.join()
# Sort result lists by miRNA #
    result_l.sort(key=lambda x: ns.natural_keys(x[0]))
    new_mti_d = {}

    le_gene_file = open(le_path, "w")
    le_gene_file.write("Geneset\tGene\tIndex in Ranked List\tRunning ES\tLE Member\n")

# Process results and create plots for running ES per miRNA #
    for e in result_l:
        mir = e[0]
        new_mti_d[mir] = e[1]
        max_es_d[mir] = e[2]
        max_es_ind_d[mir] = e[3]
        nes_d[mir] = e[4]
        lead_edge_d[mir] = e[6]

        ## If Set passed ##
        if not "No gene" in str(max_es_d[mir]):
            lead_edge_gene_d[mir] = e[7]
            # add nes to list #
            nes_l.append(e[4])
            # add list of permuted ES to dict #
            perm_es_norm_d[mir] = e[5]
            # Extend MTI-set-genes file #
            es_genes(mir, e[8], e[10], e[2], e[3], le_gene_file)
            # ES plot with all main running ESs  #
            es_plot_pages = PdfPages(e_path+"-"+mir+".pdf")
            es_plot(mir, e[8], e[2], e[3]+1, e[9], e[10], e[11], es_plot_pages)
            es_plot_pages.close()
    le_gene_file.close()

# calculate FDR q-value#
    FDR_mean_d, FDR_median_d = get_fdr(mir_l, nes_d,nes_l, perm_es_norm_d, perm_num)

# create LE mir overlap heatmap #
    if lead_edge_gene_d:
        create_figure.heatmap(lead_edge_gene_d, h_path, le=True)
    else:
        es_plot_pages = PdfPages(e_path)
        empty_es_plot(es_plot_pages)
        es_plot_pages.close()
        create_figure.heatmap(mti_d, h_path, le=False)

# create output #
    get_output(new_mti_d, max_es_d, max_es_ind_d, nes_d, FDR_mean_d, FDR_median_d, lead_edge_d, out_str, s_path)

def get_es(s, mti_l, rank_l, gene_l, p, n, perm_num, es_q):


    n = float(n)    # number of genes in ranked list
    es_l = []    # new ES list for set
    mir_l = []   #     "        "
    le_gene_l = []  # leading edge gene list
    perm_es_norm_l = [] # normalized permuted ES list

# define new set, if not all genes in set are in rank list #
    new_set_l = set(mti_l).intersection(gene_l)
    mti_l = list(new_set_l)
    nh = float(len(mti_l))  # number of targets in current set

# if at least one gene is in both lists #
    if nh > 0:
        empty_set = False

    # start enrichment analysis calculations #
        hit_gene_l = map(lambda gene: gene[1] if gene[1] in mti_l else "",rank_l) # list hits (gene name) and fails (empty string) of genes in rank list for current set
        hit_bin = map(lambda gene: 1 if gene[1] in mti_l else 0,rank_l)           # list hits (1) and fails (0) of genes in rank list for current set
        miss_bin = map(lambda gene: 0 if gene[1] in mti_l else 1,rank_l)          # inverted hit_bin list -> hits = 0, fails = 1
        calc_l = map(lambda item:abs(float(item[0]))**p, rank_l)                              # list with weighted ranking values (val^p)
        nr = abs(sum([hit_bin[i]*calc_l[i] for i in range(0,len(calc_l))]))       # absolute (positive) sum of ranking values of genes in current set

        norm_hit = 1.0/nr
        norm_miss = 1.0/(n-nh)
        sumes = 0
        rES_l = []
        for i in range(0,len(calc_l)):
            res = ((hit_bin[i]*calc_l[i]*norm_hit)-(miss_bin[i]*norm_miss))+sumes
            rES_l.append(res)
            sumes = res

        es_l = rES_l
        mir_l = hit_bin

    ## get real ES (max/min) ##
        max_es = max(es_l)
        max_es_ind = es_l.index(max_es)  # first occurring max
        min_es = min(es_l)
        min_es_ind =  es_l.index(min_es) # first occurring min

        # min or max ES as max ES #
        if max_es > -min_es:
            max_es = max_es
            max_es_ind = max_es_ind
        else:
            max_es = min_es
            max_es_ind = min_es_ind

# no gene overlaps with rank list #
    else:
    # set is empty #
        empty_set = True

## further processes in dependence of the tool's state ##
    # function was called for main calculation #
    if perm_num:
        if not empty_set:
            ## Leading Edge values and genes ##
            if max_es >= 0:
                tag_frac = sum(hit_bin[:max_es_ind+1])/nh
                gene_frac = (max_es_ind+1)/n
                le_gene_l = hit_gene_l[:max_es_ind+1]
            else:
                tag_frac = sum(hit_bin[max_es_ind:])/nh
                gene_frac = (n-(max_es_ind+1)+1)/n
                le_gene_l = hit_gene_l[max_es_ind:]
            signal = tag_frac * (1-gene_frac) * (n/(n-nh)) # gives different results than GSEA, but has the official formular -.-

            le_gene_l = list(set(le_gene_l))
            if "" in le_gene_l:
                le_gene_l.remove("")

            ## calculate NES by permute gene sets #
            nes, perm_es_norm_l = get_nes(max_es, int(nh), rank_l, gene_l, perm_num, p, n)
            es_q.put([s,mti_l, max_es, max_es_ind, nes, perm_es_norm_l, [tag_frac, gene_frac, signal], le_gene_l,es_l, mir_l, hit_gene_l, n])
        else:
            es_q.put([s,[],"No gene in rank list.", "NaN","NaN", perm_es_norm_l, ["NaN","NaN","NaN"], []])

    # function was called for permuted calculation - just return max_es #
    else:
        return max_es

def get_nes(max_es, nh, rank_l, gene_l, perm_num, p, n):
    ''' permutations per set '''
    perm_es_l = []      # list of all permuted escores
    perm_pos_es_l = []  # list of all permuted positive escores
    perm_neg_es_l = []  # list of all permuted negative escores
    for _ in range(0,perm_num):
        tmp_mti_l = random.sample(gene_l,nh) # randomly fill sets with same number of targets
    # new mti_dict done #
        tmp_es = get_es('x', tmp_mti_l, rank_l, gene_l, p, n,False, None)
        perm_es_l.append(tmp_es)
        if tmp_es >= 0:
            perm_pos_es_l.append(tmp_es)
        else:
            perm_neg_es_l.append(tmp_es)
    # get mean of p_es for normalization #
    if len(perm_pos_es_l) > 0:
        pos_mean = sum(perm_pos_es_l)/len(perm_pos_es_l)
    else: pos_mean = False
    if len(perm_neg_es_l) > 0:
        neg_mean = abs(sum(perm_neg_es_l)/len(perm_neg_es_l))
    else: neg_mean = False

    # normalize all ES scores (permuted, real, max) #
    perm_es_norm_l = normalize(perm_es_l, pos_mean, neg_mean)   # list of all normalized permuted escores
    max_es_norm = normalize([max_es], pos_mean, neg_mean)[0]    # normalized max escore = NES
    nes = max_es_norm

    return nes, perm_es_norm_l

def get_fdr(mir_l, nes_d, nes_l, perm_es_norm_d, perm_num):
    FDR_mean = {}
    FDR_median = {}
    nes_col = list(nes_l)

    for mir in mir_l:
        nes = nes_d[mir]
        perm_col_sum_l = []
        nes_col_sum_l = []
        if not nes == "NaN":
            for i in range(0,perm_num):
                perm_col = map(lambda es_l:es_l[i],perm_es_norm_d.values()) # all perm ES over all sets from index i
                if nes >= 0:
                    perm_col_sum = float(sum(map(lambda es: 1 if es >= 0 else 0,perm_col))) # sum up all positives (permuted col)
                    nes_col_sum = float(sum(map(lambda es: 1 if es >= 0 else 0,nes_col)))   #      "    "          (real col)
                    perm_col_sum_l.append(sum(map(lambda es: 1 if es >= nes else 0, perm_col))/float(perm_col_sum) if perm_col_sum > 0 else 0)# list of all perm_val where colsum > 0
                                                                                                                                        # sum up all above NES (permuted col) and normalize with colsum
                    nes_col_sum_l.append(sum(map(lambda es: 1 if es >= nes else 0, nes_col))/float(nes_col_sum) if nes_col_sum > 0 else 0)    # "    " nes_val    "    "
                                                                                                                                        # "    " (real col) "    " "

                else:
                    perm_col_sum = sum(map(lambda es: 1 if es < 0 else 0,perm_col)) # number of all negatives (permuted col)
                    nes_col_sum = sum(map(lambda es: 1 if es < 0 else 0,nes_col))   #      "    "          (real col)
                    perm_col_sum_l.append(sum(map(lambda es: 1 if es <= nes else 0, perm_col))/float(perm_col_sum) if perm_col_sum > 0 else 0)# list of all perm_val where colsum > 0:
                                                                                                                                        # sum up all below NES (permuted col) and normalize with colsum
                    nes_col_sum_l.append(sum(map(lambda es: 1 if es <= nes else 0, nes_col))/float(nes_col_sum) if nes_col_sum > 0 else 0)    # # "    " nes_val    "    "
                                                                                                                                        # "    " (real col) "    "
            perm_norm_mean = np.mean(perm_col_sum_l)
            nes_norm_mean = np.mean(nes_col_sum_l)
            perm_norm_median = np.median(perm_col_sum_l)
            nes_norm_median = np.median(nes_col_sum_l)

            FDR_mean[mir] = perm_norm_mean/nes_norm_mean if perm_norm_mean/nes_norm_mean < 1 else 1             # permutated proportional to real
            FDR_median[mir] = perm_norm_median/nes_norm_median if perm_norm_median/nes_norm_median < 1 else 1   # "    "
        else:
            FDR_mean[mir], FDR_median[mir] = "NaN", "NaN"
    return FDR_mean, FDR_median

def get_output(mti_d, max_es_d, max_es_ind_d, nes_d, FDR_mean_d, FDR_median_d, lead_edge_d, out_str, outPath):
    full_val_l = []
    # round floating point numbers #
    for mir in mti_d:
        size = len(mti_d[mir])
        # round floating point numbers - if they are floats #
        formatted_nes = format(nes_d[mir],".2f") if not nes_d[mir]=="NaN" else nes_d[mir]
        formatted_fdr = format(FDR_mean_d[mir],".3f") if not FDR_mean_d[mir]=="NaN" else FDR_mean_d[mir]
        formatted_es = format(max_es_d[mir],".2f") if not type(max_es_d[mir])==str else max_es_d[mir]
        max_es_ind = max_es_ind_d[mir]+1 if type(max_es_ind_d[mir])==int else max_es_ind_d[mir]
        # list of all values for output #
        full_val_l.append([mir, size, formatted_es, formatted_nes, formatted_fdr, max_es_ind, lead_edge_d[mir]])

    full_val_l.sort(key=itemgetter(3), reverse=True) # sort values by NES (greater first)
    ind_le = 6 # index where leading edge values are in list
    epi_str = "" # epilog string for output ("No gene in rank list" MTIs)
    for mset in full_val_l:
        mset_str = map(lambda it: str(it),mset[:ind_le])    # all values except for leading edge values
        tag_str = str(int(round(100*mset[ind_le][0]))) if not mset[ind_le][0]=="NaN" else mset[ind_le][0]
        list_str = str(int(round(100*mset[ind_le][1]))) if not mset[ind_le][1]=="NaN" else mset[ind_le][1]
        sig_str = str(int(round(100*mset[ind_le][2]))) if not mset[ind_le][2]=="NaN" else mset[ind_le][2]
        le_str = "tags="+tag_str+"%"+",list="+list_str+"%"+",signal="+sig_str+"%" # leading edge values
        if "No gene" in mset_str[2]:
            # empty sets at end of list #
            epi_str += "\t".join(mset_str)+"\t"+le_str+"\n"
        else:
            out_str += "\t".join(mset_str)+"\t"+le_str+"\n"     # prolog string for output
    out_str += epi_str
    ofile = open(outPath,"w")
    ofile.write(out_str)
    ofile.close()

def es_genes(mir, es_l, tar_l, max_es, max_es_ind, gene_file):
    '''Extends the MTI-Set-Genes list with a row for each gene in the MTI-Set.
    Columns are: "MTI-Set    Target    Index in ranked list    Running ES    LE Member" '''
    for tar in tar_l:
        if tar:
            index = tar_l.index(tar)    # index of gene in ranked list
            if max_es >= 0:              # positive max ES - all genes before and at index
                if index <= max_es_ind: le = "Yes"
                else: le = "No"
            else:                       # negative max ES - all genes at and after index
                if index >= max_es_ind: le = "Yes"
                else: le = "No"
            gene_file.write("\t".join([mir, tar, str(index+1), str(format(es_l[index],".2f")), le])+"\n")

def es_plot(s, es_l, max_es, max_es_ind, mir_l, tar_l, n, es_plot_pages):
    ''' Plots running ES for each index of ranking list and target occurrence per MTI-set '''

    ticks_tar = np.arange(1,n+1,1) # target occurrence ticks - rearranged so they are centered to ES ticks
    ticks_es = np.arange(1,n+1)     # ES ticks (startig from index 1)

# One plot per MTI-set #

    fig = plt.figure(figsize=(8,3))

    ## targets ##
    plt.subplot2grid((3,1),(0,0))      # subplot: 3rows,1col - r0,c0
    plt.vlines(ticks_tar, [0], mir_l)
    plt.xlim(xmin=0.5,xmax=n+0.5)           # x-axis from 0.5 - num of items + 0.5
    plt.tick_params(axis='both',bottom='off',labelbottom='off',left='off',
                    labelleft='off',right='off',top='off')  # turns of all axis ticks and labels
    plt.title("Enrichment Plot: "+s)#, y= fig_height,fontsize=font_s)

    ## ES ##
    ymin=min(es_l)-0.1
    ymax=max(es_l)+0.1
    plt.subplot2grid((3,1), (1,0),rowspan=2) # subplot: 3rows,1column - starts from r0c0 and spans over 2 rows (1:3)
    plt.axhline(0,0,n+0.5,linestyle='--',color='grey') # horizontal line through zero
    plt.plot((max_es_ind,max_es_ind),(ymin,ymax),'r-',ticks_es,tuple(es_l),'b-') # color points per data point and line through data points
    plt.xlim(xmin=0.5,xmax=n+0.5)
    plt.xlabel("Rank in Ordered Dataset")
    plt.ylabel("Enrichment Score (ES)")
    plt.ylim(ymin=ymin, ymax=ymax) # reset xlimits

    plt.subplots_adjust(hspace=0)   # no space between subplots
    es_plot_pages.savefig(fig, bbox_inches='tight')
    plt.close()

def empty_es_plot(es_plot_pages):
    fig = plt.figure(figsize=(3,3))
    plt.tick_params(axis='both',bottom='off',labelbottom='off',left='off',
                    labelleft='off',right='off',top='off')  # turns of all axis ticks and labels
    plt.text(0.5,0.5,"No gene in rank list.", ha="center")
    es_plot_pages.savefig(fig)
    plt.close()

def normalize(v_list, pos_m, neg_m):
    norm_list = []
    for es in v_list:
        if es >= 0:
            if pos_m:
                norm_list.append(es/pos_m)
            else:
                norm_list.append("NaN")
        else:
            if neg_m:
                norm_list.append(es/neg_m)
            else:
                norm_list.append("NaN")
    return norm_list

if __name__ == '__main__':
    main()
	'''
	@author: jbayer

	If a ranking file was passed to miRNA, a reduced version of the Gene Set Enrichment Analysis tool is started, analysing the enrichment of the identified MTI sets based on the ranked UniProtAccs.
	With a running sum statistic, a weighted Enrichment Score (ES) is calculated for each gene set based on position dependant gene matches between the ranked list and the set.
	The Leading Edge analysis additionally identifies and analyses the core genes of the gene set which mainly affect the ES.
	At this, the Leading Edge analysis proceeds as follows: Depending on whether the ES of a MTI set is positive or negative, the set of Leading Edge targets either consists of the MTI set targets before
	or after the peak in the running sum calculation.
	Based on this, three statistics are calculated, where tags represents the ratio of leading edge targets to all targets in the given set, list starts out from all UniProtAcc in the ranking file either before
	positive ES) or after (negative ES) the peak and calculates the ratio of these UniProtAccs to all existent within the file and where signal is a combination of the two previous calculations,
	describing the distribution of the MTI set targets over the ranked dataset, resulting in 100% or more, if all targets can be found at the beginning of the ranked list.
	To take the set sizes into account, MTI set enrichment analysis calculates in the next step the Normalized Enrichment Score (NES) for each gene set by using permutations of the dataset.
	Additionally, the False Discovery Rate (FDR) q-value is calculated, representing the estimated probability of a false positive result for each set with a given NES.

	INPUT:

	mti_d - dictionaries with sets of targets per miRNA
	rank_l - list of rows from the target ranking file
	perm_num - number of permutations for normalization
	p - power for weighting of ranking values
	s_path - Path for MTI sets ranked file
	h_path - Path for MTI LE heatmap
	e_path - Path for enrichment score plots
	le_path - Path for MTI set target information file
	'''
	import matplotlib
	matplotlib.use('Agg',warn=False)
	import matplotlib.pyplot as plt, numpy as np, random
	from operator import itemgetter
	from matplotlib.backends.backend_pdf import PdfPages
	import natural_sort as ns
	import create_figure
	import multiprocessing as ms
	from multiprocessing import Queue

	def main(mti_d = {}, rank_l = [], perm_num = 1000, p = 1, s_path = '', h_path = '', e_path = '', le_path = ''):

	# turn values in ranked list around: [value,target] and sort by values in reversed order #
	rank_l = map(lambda line:[float(line.strip().split('\t')[1].replace(",",".")),line.strip().split('\t')[0]] if line.strip() else "not",rank_l)
	while "not" in rank_l:
	rank_l.remove("not")
	rank_l.sort(reverse=True) # sort high to low
	n = len(rank_l) # number of entries in ranked_l

	# all genes from ranked list #
	rank_gene_l = map(lambda entry: entry[1],rank_l)
	rank_gene_l = list(set(rank_gene_l))
	# gene list for permutation #
	perm_gene_l = rank_gene_l

	max_es_d = {} # dict of max ES per set (mir:es)
	max_es_ind_d = {} # dict of max ES index per set (mir:es_ind)
	nes_d = {} # dict of NES per set (mir:nes)
	nes_l = []
	perm_es_norm_d = {} # dict of all normalized ES for each set (mir:[])
	lead_edge_d = {} # LE values per mir
	lead_edge_gene_d = {} #LE genes per mir

	out_str = "Geneset\tSize\tES\tNES\tFDR q-val\tRank at Max\tLeading Edge\n"

	mir_l = mti_d.keys()
	mir_l.sort(key=ns.natural_keys)
	es_q = Queue()
	proc_l = []

	# calculate ES + Leading Edge and NES #
	for mir in mir_l:
	es_ms = ms.Process(target=get_es, args=(mir,mti_d[mir], rank_l, perm_gene_l,p, n, perm_num, es_q))
	proc_l.append(es_ms)
	es_ms.start()

	# Get all results from multiprocess (unordered) #
	result_l = []
	for _ in range(0,len(mir_l)):
	result_l.append(es_q.get())
	for p in proc_l:
	p.join()
	# Sort result lists by miRNA #
	result_l.sort(key=lambda x: ns.natural_keys(x[0]))
	new_mti_d = {}

	le_gene_file = open(le_path, "w")
	le_gene_file.write("Geneset\tGene\tIndex in Ranked List\tRunning ES\tLE Member\n")

	# Process results and create plots for running ES per miRNA #
	for e in result_l:
	mir = e[0]
	new_mti_d[mir] = e[1]
	max_es_d[mir] = e[2]
	max_es_ind_d[mir] = e[3]
	nes_d[mir] = e[4]
	lead_edge_d[mir] = e[6]

	## If Set passed ##
	if not "No gene" in str(max_es_d[mir]):
	lead_edge_gene_d[mir] = e[7]
	# add nes to list #
	nes_l.append(e[4])
	# add list of permuted ES to dict #
	perm_es_norm_d[mir] = e[5]
	# Extend MTI-set-genes file #
	es_genes(mir, e[8], e[10], e[2], e[3], le_gene_file)
	# ES plot with all main running ESs #
	es_plot_pages = PdfPages(e_path+"-"+mir+".pdf")
	es_plot(mir, e[8], e[2], e[3]+1, e[9], e[10], e[11], es_plot_pages)
	es_plot_pages.close()
	le_gene_file.close()

	# calculate FDR q-value#
	FDR_mean_d, FDR_median_d = get_fdr(mir_l, nes_d,nes_l, perm_es_norm_d, perm_num)

	# create LE mir overlap heatmap #
	if lead_edge_gene_d:
	create_figure.heatmap(lead_edge_gene_d, h_path, le=True)
	else:
	es_plot_pages = PdfPages(e_path)
	empty_es_plot(es_plot_pages)
	es_plot_pages.close()
	create_figure.heatmap(mti_d, h_path, le=False)

	# create output #
	get_output(new_mti_d, max_es_d, max_es_ind_d, nes_d, FDR_mean_d, FDR_median_d, lead_edge_d, out_str, s_path)

	def get_es(s, mti_l, rank_l, gene_l, p, n, perm_num, es_q):


	n = float(n) # number of genes in ranked list
	es_l = [] # new ES list for set
	mir_l = [] # " "
	le_gene_l = [] # leading edge gene list
	perm_es_norm_l = [] # normalized permuted ES list

	# define new set, if not all genes in set are in rank list #
	new_set_l = set(mti_l).intersection(gene_l)
	mti_l = list(new_set_l)
	nh = float(len(mti_l)) # number of targets in current set

	# if at least one gene is in both lists #
	if nh > 0:
	empty_set = False

	# start enrichment analysis calculations #
	hit_gene_l = map(lambda gene: gene[1] if gene[1] in mti_l else "",rank_l) # list hits (gene name) and fails (empty string) of genes in rank list for current set
	hit_bin = map(lambda gene: 1 if gene[1] in mti_l else 0,rank_l) # list hits (1) and fails (0) of genes in rank list for current set
	miss_bin = map(lambda gene: 0 if gene[1] in mti_l else 1,rank_l) # inverted hit_bin list -> hits = 0, fails = 1
	calc_l = map(lambda item:abs(float(item[0]))**p, rank_l) # list with weighted ranking values (val^p)
	nr = abs(sum([hit_bin[i]*calc_l[i] for i in range(0,len(calc_l))])) # absolute (positive) sum of ranking values of genes in current set

	norm_hit = 1.0/nr
	norm_miss = 1.0/(n-nh)
	sumes = 0
	rES_l = []
	for i in range(0,len(calc_l)):
	res = ((hit_bin[i]calc_l[i]norm_hit)-(miss_bin[i]*norm_miss))+sumes
	rES_l.append(res)
	sumes = res

	es_l = rES_l
	mir_l = hit_bin

	## get real ES (max/min) ##
	max_es = max(es_l)
	max_es_ind = es_l.index(max_es) # first occurring max
	min_es = min(es_l)
	min_es_ind = es_l.index(min_es) # first occurring min

	# min or max ES as max ES #
	if max_es > -min_es:
	max_es = max_es
	max_es_ind = max_es_ind
	else:
	max_es = min_es
	max_es_ind = min_es_ind

	# no gene overlaps with rank list #
	else:
	# set is empty #
	empty_set = True

	## further processes in dependence of the tool's state ##
	# function was called for main calculation #
	if perm_num:
	if not empty_set:
	## Leading Edge values and genes ##
	if max_es >= 0:
	tag_frac = sum(hit_bin[:max_es_ind+1])/nh
	gene_frac = (max_es_ind+1)/n
	le_gene_l = hit_gene_l[:max_es_ind+1]
	else:
	tag_frac = sum(hit_bin[max_es_ind:])/nh
	gene_frac = (n-(max_es_ind+1)+1)/n
	le_gene_l = hit_gene_l[max_es_ind:]
	signal = tag_frac * (1-gene_frac) * (n/(n-nh)) # gives different results than GSEA, but has the official formular -.-

	le_gene_l = list(set(le_gene_l))
	if "" in le_gene_l:
	le_gene_l.remove("")

	## calculate NES by permute gene sets #
	nes, perm_es_norm_l = get_nes(max_es, int(nh), rank_l, gene_l, perm_num, p, n)
	es_q.put([s,mti_l, max_es, max_es_ind, nes, perm_es_norm_l, [tag_frac, gene_frac, signal], le_gene_l,es_l, mir_l, hit_gene_l, n])
	else:
	es_q.put([s,[],"No gene in rank list.", "NaN","NaN", perm_es_norm_l, ["NaN","NaN","NaN"], []])

	# function was called for permuted calculation - just return max_es #
	else:
	return max_es

	def get_nes(max_es, nh, rank_l, gene_l, perm_num, p, n):
	''' permutations per set '''
	perm_es_l = [] # list of all permuted escores
	perm_pos_es_l = [] # list of all permuted positive escores
	perm_neg_es_l = [] # list of all permuted negative escores
	for _ in range(0,perm_num):
	tmp_mti_l = random.sample(gene_l,nh) # randomly fill sets with same number of targets
	# new mti_dict done #
	tmp_es = get_es('x', tmp_mti_l, rank_l, gene_l, p, n,False, None)
	perm_es_l.append(tmp_es)
	if tmp_es >= 0:
	perm_pos_es_l.append(tmp_es)
	else:
	perm_neg_es_l.append(tmp_es)
	# get mean of p_es for normalization #
	if len(perm_pos_es_l) > 0:
	pos_mean = sum(perm_pos_es_l)/len(perm_pos_es_l)
	else: pos_mean = False
	if len(perm_neg_es_l) > 0:
	neg_mean = abs(sum(perm_neg_es_l)/len(perm_neg_es_l))
	else: neg_mean = False

	# normalize all ES scores (permuted, real, max) #
	perm_es_norm_l = normalize(perm_es_l, pos_mean, neg_mean) # list of all normalized permuted escores
	max_es_norm = normalize([max_es], pos_mean, neg_mean)[0] # normalized max escore = NES
	nes = max_es_norm

	return nes, perm_es_norm_l

	def get_fdr(mir_l, nes_d, nes_l, perm_es_norm_d, perm_num):
	FDR_mean = {}
	FDR_median = {}
	nes_col = list(nes_l)

	for mir in mir_l:
	nes = nes_d[mir]
	perm_col_sum_l = []
	nes_col_sum_l = []
	if not nes == "NaN":
	for i in range(0,perm_num):
	perm_col = map(lambda es_l:es_l[i],perm_es_norm_d.values()) # all perm ES over all sets from index i
	if nes >= 0:
	perm_col_sum = float(sum(map(lambda es: 1 if es >= 0 else 0,perm_col))) # sum up all positives (permuted col)
	nes_col_sum = float(sum(map(lambda es: 1 if es >= 0 else 0,nes_col))) # " " (real col)
	perm_col_sum_l.append(sum(map(lambda es: 1 if es >= nes else 0, perm_col))/float(perm_col_sum) if perm_col_sum > 0 else 0)# list of all perm_val where colsum > 0
	# sum up all above NES (permuted col) and normalize with colsum
	nes_col_sum_l.append(sum(map(lambda es: 1 if es >= nes else 0, nes_col))/float(nes_col_sum) if nes_col_sum > 0 else 0) # " " nes_val " "
	# " " (real col) " " "

	else:
	perm_col_sum = sum(map(lambda es: 1 if es < 0 else 0,perm_col)) # number of all negatives (permuted col)
	nes_col_sum = sum(map(lambda es: 1 if es < 0 else 0,nes_col)) # " " (real col)
	perm_col_sum_l.append(sum(map(lambda es: 1 if es <= nes else 0, perm_col))/float(perm_col_sum) if perm_col_sum > 0 else 0)# list of all perm_val where colsum > 0:
	# sum up all below NES (permuted col) and normalize with colsum
	nes_col_sum_l.append(sum(map(lambda es: 1 if es <= nes else 0, nes_col))/float(nes_col_sum) if nes_col_sum > 0 else 0) # # " " nes_val " "
	# " " (real col) " "
	perm_norm_mean = np.mean(perm_col_sum_l)
	nes_norm_mean = np.mean(nes_col_sum_l)
	perm_norm_median = np.median(perm_col_sum_l)
	nes_norm_median = np.median(nes_col_sum_l)

	FDR_mean[mir] = perm_norm_mean/nes_norm_mean if perm_norm_mean/nes_norm_mean < 1 else 1 # permutated proportional to real
	FDR_median[mir] = perm_norm_median/nes_norm_median if perm_norm_median/nes_norm_median < 1 else 1 # " "
	else:
	FDR_mean[mir], FDR_median[mir] = "NaN", "NaN"
	return FDR_mean, FDR_median

	def get_output(mti_d, max_es_d, max_es_ind_d, nes_d, FDR_mean_d, FDR_median_d, lead_edge_d, out_str, outPath):
	full_val_l = []
	# round floating point numbers #
	for mir in mti_d:
	size = len(mti_d[mir])
	# round floating point numbers - if they are floats #
	formatted_nes = format(nes_d[mir],".2f") if not nes_d[mir]=="NaN" else nes_d[mir]
	formatted_fdr = format(FDR_mean_d[mir],".3f") if not FDR_mean_d[mir]=="NaN" else FDR_mean_d[mir]
	formatted_es = format(max_es_d[mir],".2f") if not type(max_es_d[mir])==str else max_es_d[mir]
	max_es_ind = max_es_ind_d[mir]+1 if type(max_es_ind_d[mir])==int else max_es_ind_d[mir]
	# list of all values for output #
	full_val_l.append([mir, size, formatted_es, formatted_nes, formatted_fdr, max_es_ind, lead_edge_d[mir]])

	full_val_l.sort(key=itemgetter(3), reverse=True) # sort values by NES (greater first)
	ind_le = 6 # index where leading edge values are in list
	epi_str = "" # epilog string for output ("No gene in rank list" MTIs)
	for mset in full_val_l:
	mset_str = map(lambda it: str(it),mset[:ind_le]) # all values except for leading edge values
	tag_str = str(int(round(100*mset[ind_le][0]))) if not mset[ind_le][0]=="NaN" else mset[ind_le][0]
	list_str = str(int(round(100*mset[ind_le][1]))) if not mset[ind_le][1]=="NaN" else mset[ind_le][1]
	sig_str = str(int(round(100*mset[ind_le][2]))) if not mset[ind_le][2]=="NaN" else mset[ind_le][2]
	le_str = "tags="+tag_str+"%"+",list="+list_str+"%"+",signal="+sig_str+"%" # leading edge values
	if "No gene" in mset_str[2]:
	# empty sets at end of list #
	epi_str += "\t".join(mset_str)+"\t"+le_str+"\n"
	else:
	out_str += "\t".join(mset_str)+"\t"+le_str+"\n" # prolog string for output
	out_str += epi_str
	ofile = open(outPath,"w")
	ofile.write(out_str)
	ofile.close()

	def es_genes(mir, es_l, tar_l, max_es, max_es_ind, gene_file):
	'''Extends the MTI-Set-Genes list with a row for each gene in the MTI-Set.
	Columns are: "MTI-Set Target Index in ranked list Running ES LE Member" '''
	for tar in tar_l:
	if tar:
	index = tar_l.index(tar) # index of gene in ranked list
	if max_es >= 0: # positive max ES - all genes before and at index
	if index <= max_es_ind: le = "Yes"
	else: le = "No"
	else: # negative max ES - all genes at and after index
	if index >= max_es_ind: le = "Yes"
	else: le = "No"
	gene_file.write("\t".join([mir, tar, str(index+1), str(format(es_l[index],".2f")), le])+"\n")

	def es_plot(s, es_l, max_es, max_es_ind, mir_l, tar_l, n, es_plot_pages):
	''' Plots running ES for each index of ranking list and target occurrence per MTI-set '''

	ticks_tar = np.arange(1,n+1,1) # target occurrence ticks - rearranged so they are centered to ES ticks
	ticks_es = np.arange(1,n+1) # ES ticks (startig from index 1)

	# One plot per MTI-set #

	fig = plt.figure(figsize=(8,3))

	## targets ##
	plt.subplot2grid((3,1),(0,0)) # subplot: 3rows,1col - r0,c0
	plt.vlines(ticks_tar, [0], mir_l)
	plt.xlim(xmin=0.5,xmax=n+0.5) # x-axis from 0.5 - num of items + 0.5
	plt.tick_params(axis='both',bottom='off',labelbottom='off',left='off',
	labelleft='off',right='off',top='off') # turns of all axis ticks and labels
	plt.title("Enrichment Plot: "+s)#, y= fig_height,fontsize=font_s)

	## ES ##
	ymin=min(es_l)-0.1
	ymax=max(es_l)+0.1
	plt.subplot2grid((3,1), (1,0),rowspan=2) # subplot: 3rows,1column - starts from r0c0 and spans over 2 rows (1:3)
	plt.axhline(0,0,n+0.5,linestyle='--',color='grey') # horizontal line through zero
	plt.plot((max_es_ind,max_es_ind),(ymin,ymax),'r-',ticks_es,tuple(es_l),'b-') # color points per data point and line through data points
	plt.xlim(xmin=0.5,xmax=n+0.5)
	plt.xlabel("Rank in Ordered Dataset")
	plt.ylabel("Enrichment Score (ES)")
	plt.ylim(ymin=ymin, ymax=ymax) # reset xlimits

	plt.subplots_adjust(hspace=0) # no space between subplots
	es_plot_pages.savefig(fig, bbox_inches='tight')
	plt.close()

	def empty_es_plot(es_plot_pages):
	fig = plt.figure(figsize=(3,3))
	plt.tick_params(axis='both',bottom='off',labelbottom='off',left='off',
	labelleft='off',right='off',top='off') # turns of all axis ticks and labels
	plt.text(0.5,0.5,"No gene in rank list.", ha="center")
	es_plot_pages.savefig(fig)
	plt.close()

	def normalize(v_list, pos_m, neg_m):
	norm_list = []
	for es in v_list:
	if es >= 0:
	if pos_m:
	norm_list.append(es/pos_m)
	else:
	norm_list.append("NaN")
	else:
	if neg_m:
	norm_list.append(es/neg_m)
	else:
	norm_list.append("NaN")
	return norm_list

	if __name__ == '__main__':
	main()