From aa53535785f865b177aec054cb8cf4c0938adf89 Mon Sep 17 00:00:00 2001 From: jenzopr Date: Wed, 22 Jul 2015 16:42:12 +0200 Subject: [PATCH] Implemented gene set enrichment --- .gitignore | 1 + src/__init__.py | 0 src/admire | 16 +- src/create_figure.py | 232 ++++++++++++++++++++++++++ src/enrichAnalysis.py | 379 ++++++++++++++++++++++++++++++++++++++++++ src/gsea.py | 32 ++++ src/natural_sort.py | 14 ++ 7 files changed, 670 insertions(+), 4 deletions(-) create mode 100755 src/__init__.py create mode 100755 src/create_figure.py create mode 100755 src/enrichAnalysis.py create mode 100755 src/gsea.py create mode 100755 src/natural_sort.py diff --git a/.gitignore b/.gitignore index ab1cfb4..eb5b14c 100644 --- a/.gitignore +++ b/.gitignore @@ -1 +1,2 @@ test/* +src/*.pyc diff --git a/src/__init__.py b/src/__init__.py new file mode 100755 index 0000000..e69de29 diff --git a/src/admire b/src/admire index 001eb7f..a478000 100755 --- a/src/admire +++ b/src/admire @@ -260,9 +260,9 @@ EOL color=${color["$sample"]} if [[ "$file" = /* ]]; then - ln -s $file slide${slidecount}/slide${slidecount}_${sample%%-*}_$color.idat + ln -f -s $file slide${slidecount}/slide${slidecount}_${sample%%-*}_$color.idat else - ln -s ../$file slide${slidecount}/slide${slidecount}_${sample%%-*}_$color.idat + ln -f -s ../$file slide${slidecount}/slide${slidecount}_${sample%%-*}_$color.idat fi cat >> SampleSheet.csv_ << EOL @@ -484,6 +484,8 @@ Rscript $DIR/renderAdditionalImages.R >/dev/null 2>&1 echo "Step 8 of 9: Calculating enrichment across gene sets..." GSEA=geneset_enrichment mkdir -p $GSEA +# build up gene sets as serialized data structure +cat ${G[@]} | awk 'BEGIN{FS="\t";OFS="";print "{"}{name=$1;genes="";for(i=3;i<=NF;i++){if(i $GSEA/genesets.dict set -- ${groups[@]} for a; do shift @@ -491,7 +493,14 @@ for a; do g=$a-$b for reg in "${R[@]}"; do r=`basename $reg .bed` - + # build ranked list(s) + tail -n +2 $EXCEL/$g-$r.csv | grep "$a" | cut -f 2,17 -d "," | grep -v '^,' | tr ',' '\t' | sort -k2,2r | uniq > $GSEA/$g-$r-$a.ranks + tail -n +2 $EXCEL/$g-$r.csv | grep "$b" | cut -f 2,17 -d "," | grep -v '^,' | tr ',' '\t' | sort -k2,2r | uniq > $GSEA/$g-$r-$b.ranks + if [[ -s $GSEA/$g-$r-$a.ranks || -s $GSEA/$g-$r-$b.ranks ]]; then + python $DIR/gsea.py $GSEA/genesets.dict $GSEA/$g-$r-$a.ranks $GSEA/$g-$r-$a + python $DIR/gsea.py $GSEA/genesets.dict $GSEA/$g-$r-$b.ranks $GSEA/$g-$r-$b + fi + rm -rf $GSEA/$g-$r-$a.ranks $GSEA/$g-$r-$b.ranks done done done @@ -504,5 +513,4 @@ done #---------------------------------------------------- echo "Step 9 of 9: Wrapping up results..." tar -zcvf $O visualization excel >/dev/null 2>&1 - echo "Done. Your results are compressed into $O." diff --git a/src/create_figure.py b/src/create_figure.py new file mode 100755 index 0000000..6b2652c --- /dev/null +++ b/src/create_figure.py @@ -0,0 +1,232 @@ +''' + +@author: jbayer + +Module with different functions for creating different figures. + +"heatmap" is specialised for plotting MTI-Set overlap ratios within a heatmap build +with pyplot from matplotlib. + INPUT: + mti_d - {mir:[uniProt Accessions]} dictionary + outPath - Path to save heatmap plot + le - True if function compares MTI-sets that have just leading edge genes in it + (-> changes in title) + OUTPUT: + Heatmap pdf file + +"bargraph" creates two graphs on one page with the number of miRNAs/MTIs per +analysis step. + INPUT: + db_num_l - Step 1: number of identified items in MTI DBs + occ_num_l - Step 2: number of items occurring as often as defined over MTI DBs + uni_num_l - Step 3: number of items mapped to UniProt Accessions + annot_num_l - Step 4: number of items overlapping with annotation file + out_path - Path to save bar plot + sel - True if MTIs were just selected (-> no step 4) + OUTPUT: + Bar plot pdf file + +"venn" is specialised to create a venn diagram for maximum four MTI DBs to show the +numerical intersections of their MTIs or miRNAs. + INPUT: + outpath - Path to save venn diagram + base_list - List of used MTI DBs as numerical abbreviation (1-4) + baseDict_list - List with the MTI dictionaries of the MTI DBs + baseName_list - List with the names of the used MTI DBs + mti - True if diagram shows the MTIs (-> changes in title) + OUTPUT: + Venn diagram pdf file +''' +import tempfile, os, matplotlib, natural_sort as ns +matplotlib.use('Agg',warn=False) +import matplotlib.pyplot as plt, numpy as np + +def heatmap(mti_d, outPath, le = False): + label_l = [] + for mir in mti_d.keys(): + if mti_d[mir]: + label_l.append(mir) + label_l.sort(key=ns.natural_keys) + +# calculate ratio of all sets # + matrix = [] # matrix with ratios + done_l = [] + for mir1 in label_l: + row_l = [] + for mir2 in label_l: + union = len(set(mti_d[mir1]+mti_d[mir2])) + overlap = float(len(set(mti_d[mir1]).intersection(mti_d[mir2]))) + try: + row_l.append(overlap/union) + except: + row_l.append(0) + matrix.append(row_l) + done_l.append(mir1) + +# create heatmap # + font_s = 280./len(label_l) + if font_s > 12: + font_s = 12 + greyline = 24./len(label_l) + plt.rc('axes',linewidth = 24./len(label_l)) + + cmapx = plt.cm.get_cmap("GnBu") + hm = plt.pcolormesh(np.array(matrix), cmap=cmapx, edgecolors='lightgrey',linewidth=greyline) + plt.xticks(np.arange(len(label_l))+0.5,label_l, rotation = 90, fontsize=font_s) # mir labels x-axis + plt.yticks(np.arange(len(label_l))+0.5,label_l, fontsize=font_s) # mir labels y-axis + plt.tick_params(axis='both',bottom='off',left='off',right='off',top='off') # turns off all axis ticks + plt.xlim(xmax=len(label_l)) # x-axis length = number of mirs + plt.ylim(ymax=len(label_l)) # y-axis length = number of mirs + plt.colorbar(hm) + #cb.ax.tick_params(labelsize=font_s) + if le: + plt.title("Leading Edge Geneset Overlap Ratio") + else: + plt.title("Geneset Overlap Ratio")#, fontsize=font_s) + plt.savefig(outPath,format='pdf',bbox_inches='tight') + plt.close() + #plt.show() + +def bargraph(db_num_l, occ_num_l, uni_num_l, annot_num_l, out_path, sel): + ''' CREATES + a Multiplot of two Bar-Graphs in one document.''' + + all_num_l = [db_num_l, occ_num_l, uni_num_l, annot_num_l] + if sel: all_num_l = all_num_l[:-1] # solely selecting MTIs: uni = annot number + mir_num_tuple = tuple(map(lambda num_l: num_l[0],all_num_l)) # number of mirnas per tool step in a tuple + tar_num_tuple = tuple(map(lambda num_l: num_l[1],all_num_l)) # number of mir-tar-ias per tool step in a tuple + + xtick_name_l = ['DB search','DB occurrence','UniProt mapping', 'Annotation mapping'] + if sel: xtick_name_l = xtick_name_l[:-1] + +# number of bars and their locations for plot 1 and 2 # + numBar = 4 + width = 0.8 # bar width + if sel: + numBar = 3 + loc1 = np.arange(numBar) # (0 1 2 3 4 5) + loc2 = np.arange(numBar) + +## Subplot miRNAs ## + plt.figure(figsize=(6,9), dpi=80,facecolor='w') + plt.subplot(211) + p1 = plt.bar(loc1,mir_num_tuple,width)#,color=color_list1) +# Properties for plot 1 # + plt.title('Number of miRNAs\nover analysis steps') + #plt.ylabel('Number of miRNAs') + plt.xticks(loc1,xtick_name_l, rotation=50) + # no ticks, no labels (height) on yaxis # + plt.tick_params(axis='both',bottom='off',top='off',left='off',right='off',labelleft='off') +# numbers above bars (set for each) # + for bar in p1: + height = bar.get_height() + num_height = set_numHeight(height,mir_num_tuple[0]) + plt.text(bar.get_x()+bar.get_width()/2.,num_height, '%d'%int(height), + ha='center',va='bottom') +# set height of xaxis a bit higher for numbers above # + if not mir_num_tuple[0] == 0: + plt.axis([-0.3, 4, 0, mir_num_tuple[0]/6.+mir_num_tuple[0]]) + else: + plt.axis([0, 4, -4, 4]) + +## Subplot miRNA-target interactions ## + plt.subplot(212) + p2 = plt.bar(loc2,tar_num_tuple,width)#,color=color_list) +# Properties for plot 2 # + plt.title('Number of miRNA-target interactions\nover analysis steps') + #plt.ylabel('Number of miRNA-target interactions') + plt.xticks(loc2,xtick_name_l, rotation=50) + plt.tick_params(axis='both',bottom='off',top='off',left='off',right='off',labelleft='off') +# numbers above bars (set for each) # + for bar in p2: + max_h = max(tar_num_tuple[0],tar_num_tuple[2]) + height = bar.get_height() + num_height = set_numHeight(height, max_h) + plt.text(bar.get_x()+bar.get_width()/2.,num_height, '%d'%int(height), + ha='center',va='bottom') +# set height of xaxis a bit higher for numbers above # + if not tar_num_tuple[0] == 0: + plt.axis([-0.3, 4, 0, max_h/6.+max_h]) + else: + plt.axis([0, 4, -4, 4]) + +# adjust distance between subplots # + plt.subplots_adjust(hspace=1) +# save figure as pdf, remove white space # + plt.savefig(out_path,format='pdf',bbox_inches='tight') + plt.close() + + +def venn(outpath, base_list, baseDict_list, baseName_list, mti): + ''' CREATES + a venn diagram with the number of miRNAs/ MTIs ''' + + color_list = ["#4F81BD","#9BBB59","#8064A2","#C0504D"] + colors = [] + # R - string # + r_str = '''suppressPackageStartupMessages(library(VennDiagram)) + ''' + # create lists of miRFams and colors for each DB in R # + for base in base_list: + b_dict = getBaseDict(base, baseDict_list, baseName_list) + if not mti: + add = 'miRs.pdf' + main = 'miRNAs' + miRFam_li = list(b_dict.keys()) + else: + add = 'MTIs.pdf' + main = 'miRNA-target interactions' + miRFam_li = b_dict + miRFam_str = str(miRFam_li).replace("[", "").replace("]","") + + if miRFam_str: + r_str += ''+base+' <- c('+miRFam_str+')\n' + colors.append(color_list[base_list.index(base)]) + + color_str = str(colors)[1:-1] + bases = str(base_list)[1:-1] + + # set different circle and label sizes for diff. numbers of sets # + if not len(base_list) == 0: + if len(base_list) == 4 or len(base_list) == 3: + catCex = '1.3' # label size + labelPos = ')' # label position (here: default) + else: + catCex = '2' + labelPos = ',cat.pos=0)' # position (here: top - 0 degrees ) + + # Send lists for each DB to R # + r_str += '''base_data <- list('''+bases.replace("'", '')+''') # list of miRNAs per DB + names(base_data) <- c('''+bases.replace("'", '"')+''') # DB names for miRNA lists + pdf(file="'''+outpath+'Venn_'+add+'''",7,7)\n''' + # Create Venn-Diagrams # + r_str += '''grid.draw(venn.diagram(base_data, filename = NULL, margin=0.1, main="'''+main+'''", + main.fontfamily="sans", main.cex='''+catCex+''',main.pos=c(0.5,1), + scaled=FALSE,euler.d=FALSE, cat.fontfamily=rep("sans",'''+str(len(base_list))+'''), + col=c('''+color_str+'''),cex=2,fontfamily="sans",cat.cex='''+catCex+labelPos+''') + ''' + r_str += "dev.off()" + # create temporary file and open it with R (command line) # + with tempfile.NamedTemporaryFile(suffix='.R') as tmp: + tmp.write(r_str) + tmp.flush() + os.system("R --vanilla < {tmp}".format(tmp=tmp.name)+" >/dev/null") + tmp.close() + + +def getBaseDict(base, baseDict_list, baseName_list): + '''returns the dictionary of the given target base''' + return baseDict_list[baseName_list.index(base)] + +def set_numHeight(height,maxi): + ''' returns the height to write the number ''' + if height < maxi/6: + num_height = 1.5*height + elif height < maxi/4: + num_height = 1.25*height + elif height < maxi/2: + num_height = 1.1*height + else: num_height = 1.05*height + return num_height + + diff --git a/src/enrichAnalysis.py b/src/enrichAnalysis.py new file mode 100755 index 0000000..34b45d9 --- /dev/null +++ b/src/enrichAnalysis.py @@ -0,0 +1,379 @@ +''' +@author: jbayer + +If a ranking file was passed to miRNA, a reduced version of the Gene Set Enrichment Analysis tool is started, analysing the enrichment of the identified MTI sets based on the ranked UniProtAccs. +With a running sum statistic, a weighted Enrichment Score (ES) is calculated for each gene set based on position dependant gene matches between the ranked list and the set. +The Leading Edge analysis additionally identifies and analyses the core genes of the gene set which mainly affect the ES. +At this, the Leading Edge analysis proceeds as follows: Depending on whether the ES of a MTI set is positive or negative, the set of Leading Edge targets either consists of the MTI set targets before +or after the peak in the running sum calculation. +Based on this, three statistics are calculated, where tags represents the ratio of leading edge targets to all targets in the given set, list starts out from all UniProtAcc in the ranking file either before +positive ES) or after (negative ES) the peak and calculates the ratio of these UniProtAccs to all existent within the file and where signal is a combination of the two previous calculations, +describing the distribution of the MTI set targets over the ranked dataset, resulting in 100% or more, if all targets can be found at the beginning of the ranked list. +To take the set sizes into account, MTI set enrichment analysis calculates in the next step the Normalized Enrichment Score (NES) for each gene set by using permutations of the dataset. +Additionally, the False Discovery Rate (FDR) q-value is calculated, representing the estimated probability of a false positive result for each set with a given NES. + +INPUT: + +mti_d - dictionaries with sets of targets per miRNA +rank_l - list of rows from the target ranking file +perm_num - number of permutations for normalization +p - power for weighting of ranking values +s_path - Path for MTI sets ranked file +h_path - Path for MTI LE heatmap +e_path - Path for enrichment score plots +le_path - Path for MTI set target information file +''' +import matplotlib +matplotlib.use('Agg',warn=False) +import matplotlib.pyplot as plt, numpy as np, random +from operator import itemgetter +from matplotlib.backends.backend_pdf import PdfPages +import natural_sort as ns +import create_figure +import multiprocessing as ms +from multiprocessing import Queue + +def main(mti_d = {}, rank_l = [], perm_num = 1000, p = 1, s_path = '', h_path = '', e_path = '', le_path = ''): + +# turn values in ranked list around: [value,target] and sort by values in reversed order # + rank_l = map(lambda line:[float(line.strip().split('\t')[1].replace(",",".")),line.strip().split('\t')[0]] if line.strip() else "not",rank_l) + while "not" in rank_l: + rank_l.remove("not") + rank_l.sort(reverse=True) # sort high to low + n = len(rank_l) # number of entries in ranked_l + +# all genes from ranked list # + rank_gene_l = map(lambda entry: entry[1],rank_l) + rank_gene_l = list(set(rank_gene_l)) +# gene list for permutation # + perm_gene_l = rank_gene_l + + max_es_d = {} # dict of max ES per set (mir:es) + max_es_ind_d = {} # dict of max ES index per set (mir:es_ind) + nes_d = {} # dict of NES per set (mir:nes) + nes_l = [] + perm_es_norm_d = {} # dict of all normalized ES for each set (mir:[]) + lead_edge_d = {} # LE values per mir + lead_edge_gene_d = {} #LE genes per mir + + out_str = "Geneset\tSize\tES\tNES\tFDR q-val\tRank at Max\tLeading Edge\n" + + mir_l = mti_d.keys() + mir_l.sort(key=ns.natural_keys) + es_plot_pages = PdfPages(e_path) + es_q = Queue() + proc_l = [] + +# calculate ES + Leading Edge and NES # + for mir in mir_l: + es_ms = ms.Process(target=get_es, args=(mir,mti_d[mir], rank_l, perm_gene_l,p, n, perm_num, es_plot_pages, es_q)) + proc_l.append(es_ms) + es_ms.start() + +# Get all results from multiprocess (unordered) # + result_l = [] + for _ in range(0,len(mir_l)): + result_l.append(es_q.get()) + for p in proc_l: + p.join() +# Sort result lists by miRNA # + result_l.sort(key=lambda x: ns.natural_keys(x[0])) + new_mti_d = {} + + le_gene_file = open(le_path, "w") + le_gene_file.write("Geneset\tGene\tIndex in Ranked List\tRunning ES\tLE Member\n") + +# Process results and create plots for running ES per miRNA # + for e in result_l: + mir = e[0] + new_mti_d[mir] = e[1] + max_es_d[mir] = e[2] + max_es_ind_d[mir] = e[3] + nes_d[mir] = e[4] + lead_edge_d[mir] = e[6] + + ## If Set passed ## + if not "No gene" in str(max_es_d[mir]): + lead_edge_gene_d[mir] = e[7] + # add nes to list # + nes_l.append(e[4]) + # add list of permuted ES to dict # + perm_es_norm_d[mir] = e[5] + # Extend MTI-set-genes file # + es_genes(mir, e[8], e[10], e[2], e[3], le_gene_file) + # ES plot with all main running ESs # + es_plot(mir, e[8], e[2], e[3]+1, e[9], e[10], e[11], es_plot_pages) + es_plot_pages.close() + le_gene_file.close() + +# calculate FDR q-value# + FDR_mean_d, FDR_median_d = get_fdr(mir_l, nes_d,nes_l, perm_es_norm_d, perm_num) + +# create LE mir overlap heatmap # + if lead_edge_gene_d: + create_figure.heatmap(lead_edge_gene_d, h_path, le=True) + else: + es_plot_pages = PdfPages(e_path) + empty_es_plot(es_plot_pages) + es_plot_pages.close() + create_figure.heatmap(mti_d, h_path, le=False) + +# create output # + get_output(new_mti_d, max_es_d, max_es_ind_d, nes_d, FDR_mean_d, FDR_median_d, lead_edge_d, out_str, s_path) + +def get_es(s, mti_l, rank_l, gene_l, p, n, perm_num, es_plot_pages, es_q): + + + n = float(n) # number of genes in ranked list + es_l = [] # new ES list for set + mir_l = [] # " " + le_gene_l = [] # leading edge gene list + perm_es_norm_l = [] # normalized permuted ES list + +# define new set, if not all genes in set are in rank list # + new_set_l = set(mti_l).intersection(gene_l) + mti_l = list(new_set_l) + nh = float(len(mti_l)) # number of targets in current set + +# if at least one gene is in both lists # + if nh > 0: + empty_set = False + + # start enrichment analysis calculations # + hit_gene_l = map(lambda gene: gene[1] if gene[1] in mti_l else "",rank_l) # list hits (gene name) and fails (empty string) of genes in rank list for current set + hit_bin = map(lambda gene: 1 if gene[1] in mti_l else 0,rank_l) # list hits (1) and fails (0) of genes in rank list for current set + miss_bin = map(lambda gene: 0 if gene[1] in mti_l else 1,rank_l) # inverted hit_bin list -> hits = 0, fails = 1 + calc_l = map(lambda item:item[0]**p, rank_l) # list with weighted ranking values (val^p) + nr = abs(sum([hit_bin[i]*calc_l[i] for i in range(0,len(calc_l))])) # absolute (positive) sum of ranking values of genes in current set + + norm_hit = 1.0/nr + norm_miss = 1.0/(n-nh) + sumes = 0 + rES_l = [] + for i in range(0,len(calc_l)): + res = ((hit_bin[i]*calc_l[i]*norm_hit)-(miss_bin[i]*norm_miss))+sumes + rES_l.append(res) + sumes = res + + es_l = rES_l + mir_l = hit_bin + + ## get real ES (max/min) ## + max_es = max(es_l) + max_es_ind = es_l.index(max_es) # first occurring max + min_es = min(es_l) + min_es_ind = es_l.index(min_es) # first occurring min + + # min or max ES as max ES # + if max_es > -min_es: + max_es = max_es + max_es_ind = max_es_ind + else: + max_es = min_es + max_es_ind = min_es_ind + +# no gene overlaps with rank list # + else: + # set is empty # + empty_set = True + +## further processes in dependence of the tool's state ## + # function was called for main calculation # + if perm_num: + if not empty_set: + ## Leading Edge values and genes ## + if max_es >= 0: + tag_frac = sum(hit_bin[:max_es_ind+1])/nh + gene_frac = (max_es_ind+1)/n + le_gene_l = hit_gene_l[:max_es_ind+1] + else: + tag_frac = sum(hit_bin[max_es_ind:])/nh + gene_frac = (n-(max_es_ind+1)+1)/n + le_gene_l = hit_gene_l[max_es_ind:] + signal = tag_frac * (1-gene_frac) * (n/(n-nh)) # gives different results than GSEA, but has the official formular -.- + + le_gene_l = list(set(le_gene_l)) + if "" in le_gene_l: + le_gene_l.remove("") + + ## calculate NES by permute gene sets # + nes, perm_es_norm_l = get_nes(max_es, int(nh), rank_l, gene_l, perm_num, p, n) + es_q.put([s,mti_l, max_es, max_es_ind, nes, perm_es_norm_l, [tag_frac, gene_frac, signal], le_gene_l,es_l, mir_l, hit_gene_l, n]) + else: + es_q.put([s,[],"No gene in rank list.", "NaN","NaN", perm_es_norm_l, ["NaN","NaN","NaN"], []]) + + # function was called for permuted calculation - just return max_es # + else: + return max_es + +def get_nes(max_es, nh, rank_l, gene_l, perm_num, p, n): + ''' permutations per set ''' + perm_es_l = [] # list of all permuted escores + perm_pos_es_l = [] # list of all permuted positive escores + perm_neg_es_l = [] # list of all permuted negative escores + for _ in range(0,perm_num): + tmp_mti_l = random.sample(gene_l,nh) # randomly fill sets with same number of targets + # new mti_dict done # + tmp_es = get_es('x', tmp_mti_l, rank_l, gene_l, p, n,False,False, None) + perm_es_l.append(tmp_es) + if tmp_es >= 0: + perm_pos_es_l.append(tmp_es) + else: + perm_neg_es_l.append(tmp_es) + # get mean of p_es for normalization # + if len(perm_pos_es_l) > 0: + pos_mean = sum(perm_pos_es_l)/len(perm_pos_es_l) + else: pos_mean = False + if len(perm_neg_es_l) > 0: + neg_mean = abs(sum(perm_neg_es_l)/len(perm_neg_es_l)) + else: neg_mean = False + + # normalize all ES scores (permuted, real, max) # + perm_es_norm_l = normalize(perm_es_l, pos_mean, neg_mean) # list of all normalized permuted escores + max_es_norm = normalize([max_es], pos_mean, neg_mean)[0] # normalized max escore = NES + nes = max_es_norm + + return nes, perm_es_norm_l + +def get_fdr(mir_l, nes_d, nes_l, perm_es_norm_d, perm_num): + FDR_mean = {} + FDR_median = {} + nes_col = list(nes_l) + + for mir in mir_l: + nes = nes_d[mir] + perm_col_sum_l = [] + nes_col_sum_l = [] + if not nes == "NaN": + for i in range(0,perm_num): + perm_col = map(lambda es_l:es_l[i],perm_es_norm_d.values()) # all perm ES over all sets from index i + if nes >= 0: + perm_col_sum = float(sum(map(lambda es: 1 if es >= 0 else 0,perm_col))) # sum up all positives (permuted col) + nes_col_sum = float(sum(map(lambda es: 1 if es >= 0 else 0,nes_col))) # " " (real col) + perm_col_sum_l.append(sum(map(lambda es: 1 if es >= nes else 0, perm_col))/float(perm_col_sum) if perm_col_sum > 0 else 0)# list of all perm_val where colsum > 0 + # sum up all above NES (permuted col) and normalize with colsum + nes_col_sum_l.append(sum(map(lambda es: 1 if es >= nes else 0, nes_col))/float(nes_col_sum) if nes_col_sum > 0 else 0) # " " nes_val " " + # " " (real col) " " " + + else: + perm_col_sum = sum(map(lambda es: 1 if es < 0 else 0,perm_col)) # number of all negatives (permuted col) + nes_col_sum = sum(map(lambda es: 1 if es < 0 else 0,nes_col)) # " " (real col) + perm_col_sum_l.append(sum(map(lambda es: 1 if es <= nes else 0, perm_col))/float(perm_col_sum) if perm_col_sum > 0 else 0)# list of all perm_val where colsum > 0: + # sum up all below NES (permuted col) and normalize with colsum + nes_col_sum_l.append(sum(map(lambda es: 1 if es <= nes else 0, nes_col))/float(nes_col_sum) if nes_col_sum > 0 else 0) # # " " nes_val " " + # " " (real col) " " + perm_norm_mean = np.mean(perm_col_sum_l) + nes_norm_mean = np.mean(nes_col_sum_l) + perm_norm_median = np.median(perm_col_sum_l) + nes_norm_median = np.median(nes_col_sum_l) + + FDR_mean[mir] = perm_norm_mean/nes_norm_mean if perm_norm_mean/nes_norm_mean < 1 else 1 # permutated proportional to real + FDR_median[mir] = perm_norm_median/nes_norm_median if perm_norm_median/nes_norm_median < 1 else 1 # " " + else: + FDR_mean[mir], FDR_median[mir] = "NaN", "NaN" + return FDR_mean, FDR_median + +def get_output(mti_d, max_es_d, max_es_ind_d, nes_d, FDR_mean_d, FDR_median_d, lead_edge_d, out_str, outPath): + full_val_l = [] + # round floating point numbers # + for mir in mti_d: + size = len(mti_d[mir]) + # round floating point numbers - if they are floats # + formatted_nes = format(nes_d[mir],".2f") if not nes_d[mir]=="NaN" else nes_d[mir] + formatted_fdr = format(FDR_mean_d[mir],".3f") if not FDR_mean_d[mir]=="NaN" else FDR_mean_d[mir] + formatted_es = format(max_es_d[mir],".2f") if not type(max_es_d[mir])==str else max_es_d[mir] + max_es_ind = max_es_ind_d[mir]+1 if type(max_es_ind_d[mir])==int else max_es_ind_d[mir] + # list of all values for output # + full_val_l.append([mir, size, formatted_es, formatted_nes, formatted_fdr, max_es_ind, lead_edge_d[mir]]) + + full_val_l.sort(key=itemgetter(3), reverse=True) # sort values by NES (greater first) + ind_le = 6 # index where leading edge values are in list + epi_str = "" # epilog string for output ("No gene in rank list" MTIs) + for mset in full_val_l: + mset_str = map(lambda it: str(it),mset[:ind_le]) # all values except for leading edge values + tag_str = str(int(round(100*mset[ind_le][0]))) if not mset[ind_le][0]=="NaN" else mset[ind_le][0] + list_str = str(int(round(100*mset[ind_le][1]))) if not mset[ind_le][1]=="NaN" else mset[ind_le][1] + sig_str = str(int(round(100*mset[ind_le][2]))) if not mset[ind_le][2]=="NaN" else mset[ind_le][2] + le_str = "tags="+tag_str+"%"+",list="+list_str+"%"+",signal="+sig_str+"%" # leading edge values + if "No gene" in mset_str[2]: + # empty sets at end of list # + epi_str += "\t".join(mset_str)+"\t"+le_str+"\n" + else: + out_str += "\t".join(mset_str)+"\t"+le_str+"\n" # prolog string for output + out_str += epi_str + ofile = open(outPath,"w") + ofile.write(out_str) + ofile.close() + +def es_genes(mir, es_l, tar_l, max_es, max_es_ind, gene_file): + '''Extends the MTI-Set-Genes list with a row for each gene in the MTI-Set. + Columns are: "MTI-Set Target Index in ranked list Running ES LE Member" ''' + for tar in tar_l: + if tar: + index = tar_l.index(tar) # index of gene in ranked list + if max_es >= 0: # positive max ES - all genes before and at index + if index <= max_es_ind: le = "Yes" + else: le = "No" + else: # negative max ES - all genes at and after index + if index >= max_es_ind: le = "Yes" + else: le = "No" + gene_file.write("\t".join([mir, tar, str(index+1), str(format(es_l[index],".2f")), le])+"\n") + +def es_plot(s, es_l, max_es, max_es_ind, mir_l, tar_l, n, es_plot_pages): + ''' Plots running ES for each index of ranking list and target occurrence per MTI-set ''' + + ticks_tar = np.arange(1,n+1,1) # target occurrence ticks - rearranged so they are centered to ES ticks + ticks_es = np.arange(1,n+1) # ES ticks (startig from index 1) + +# One plot per MTI-set # + + fig = plt.figure(figsize=(8,3)) + + ## targets ## + plt.subplot2grid((3,1),(0,0)) # subplot: 3rows,1col - r0,c0 + plt.vlines(ticks_tar, [0], mir_l) + plt.xlim(xmin=0.5,xmax=n+0.5) # x-axis from 0.5 - num of items + 0.5 + plt.tick_params(axis='both',bottom='off',labelbottom='off',left='off', + labelleft='off',right='off',top='off') # turns of all axis ticks and labels + plt.title("Enrichment Plot: "+s)#, y= fig_height,fontsize=font_s) + + ## ES ## + ymin=min(es_l)-0.1 + ymax=max(es_l)+0.1 + plt.subplot2grid((3,1), (1,0),rowspan=2) # subplot: 3rows,1column - starts from r0c0 and spans over 2 rows (1:3) + plt.axhline(0,0,n+0.5,linestyle='--',color='grey') # horizontal line through zero + plt.plot((max_es_ind,max_es_ind),(ymin,ymax),'r-',ticks_es,tuple(es_l),'b-') # color points per data point and line through data points + plt.xlim(xmin=0.5,xmax=n+0.5) + plt.xlabel("Rank in Ordered Dataset") + plt.ylabel("Enrichment Score (ES)") + plt.ylim(ymin=ymin, ymax=ymax) # reset xlimits + + plt.subplots_adjust(hspace=0) # no space between subplots + es_plot_pages.savefig(fig, bbox_inches='tight') + plt.close() + +def empty_es_plot(es_plot_pages): + fig = plt.figure(figsize=(3,3)) + plt.tick_params(axis='both',bottom='off',labelbottom='off',left='off', + labelleft='off',right='off',top='off') # turns of all axis ticks and labels + plt.text(0.5,0.5,"No gene in rank list.", ha="center") + es_plot_pages.savefig(fig) + plt.close() + +def normalize(v_list, pos_m, neg_m): + norm_list = [] + for es in v_list: + if es >= 0: + if pos_m: + norm_list.append(es/pos_m) + else: + norm_list.append("NaN") + else: + if neg_m: + norm_list.append(es/neg_m) + else: + norm_list.append("NaN") + return norm_list + +if __name__ == '__main__': + main() diff --git a/src/gsea.py b/src/gsea.py new file mode 100755 index 0000000..705ac7d --- /dev/null +++ b/src/gsea.py @@ -0,0 +1,32 @@ +#!/usr/bin/env python +''' +@author: Jens Preussner + +This is a wrapper for Julia Bayers geneset enrichment analysis program. + +Input: +sys.argv[1] - geneset dictionary, key=name of geneset, value=list of genes (strings) +sys.argv[2] - tab separated rank file, 1st col genes, 2nd col rank +sys.argv[3] - output base +''' + +import sys +import enrichAnalysis +import json + +try: + with open(sys.argv[1]) as geneset_dict: + genesets = json.load(geneset_dict) +except IOError as e: + print("Unable to open file "+sys.argv[1]+". File might not exist or missing read permissions.") + + +ranks = [] +try: + with open(sys.argv[2]) as rank_file: + for line in rank_file: + ranks.append(line) +except IOError as e: + print("Unable to open file "+sys.argv[2]+". File might not exist or missing read permissions.") + +enrichAnalysis.main(mti_d = genesets, rank_l = ranks, s_path = sys.argv[3]+'-genesets_ranked.txt', h_path = sys.argv[3]+'-geneset_overlap.pdf', e_path = sys.argv[3]+'-genesets.pdf', le_path = sys.argv[3]+'-geneset_genes.txt') diff --git a/src/natural_sort.py b/src/natural_sort.py new file mode 100755 index 0000000..ecf9d29 --- /dev/null +++ b/src/natural_sort.py @@ -0,0 +1,14 @@ +''' +@author: jbayer + +Sorts a list of items in naturla order ( [mir2,mir1,mir10] -> [mir1,mir2,mir10] ) +''' +import re + +def natural_keys(text): + '''alist.sort(key=natural_keys) sorts in human order''' + return [ atoi(c) for c in re.split('(\d+)', text) ] + + +def atoi(text): + return int(text) if text.isdigit() else text