Implemented gene set enrichment

loosolab · Jul 22, 2015 · aa53535 · aa53535
1 parent 916de7e
commit aa53535
Show file tree

Hide file tree

Showing 7 changed files with 670 additions and 4 deletions.
diff --git a/.gitignore b/.gitignore
@@ -1 +1,2 @@
 test/*
+src/*.pyc
diff --git a/src/__init__.py b/src/__init__.py
diff --git a/src/admire b/src/admire
@@ -260,9 +260,9 @@ EOL
 
 		color=${color["$sample"]}
 		if [[ "$file" = /* ]]; then
-			ln -s $file slide${slidecount}/slide${slidecount}_${sample%%-*}_$color.idat
+			ln -f -s $file slide${slidecount}/slide${slidecount}_${sample%%-*}_$color.idat
 		else
-			ln -s ../$file slide${slidecount}/slide${slidecount}_${sample%%-*}_$color.idat
+			ln -f -s ../$file slide${slidecount}/slide${slidecount}_${sample%%-*}_$color.idat
 		fi
 
 		cat >> SampleSheet.csv_ << EOL
@@ -484,14 +484,23 @@ Rscript $DIR/renderAdditionalImages.R >/dev/null 2>&1
 echo "Step 8 of 9: Calculating enrichment across gene sets..."
 GSEA=geneset_enrichment
 mkdir -p $GSEA
+# build up gene sets as serialized data structure
+cat ${G[@]} | awk 'BEGIN{FS="\t";OFS="";print "{"}{name=$1;genes="";for(i=3;i<=NF;i++){if(i<NF){genes=genes"\""$(i)"\","}else{genes=genes"\""$(i)"\""}};if(NR!=1){print ",\""name"\" : ["genes"]"}else{print "\""name"\" : ["genes"]"}}END{print "}"}' | tr -d '\n' > $GSEA/genesets.dict
 set -- ${groups[@]}
 for a; do
         shift
         for b; do
                 g=$a-$b
 		for reg in "${R[@]}"; do
 			r=`basename $reg .bed`
-
+			# build ranked list(s)
+			tail -n +2 $EXCEL/$g-$r.csv | grep "$a" | cut -f 2,17 -d "," | grep -v '^,' |  tr ',' '\t' | sort -k2,2r | uniq > $GSEA/$g-$r-$a.ranks
+			tail -n +2 $EXCEL/$g-$r.csv | grep "$b" | cut -f 2,17 -d "," | grep -v '^,' | tr ',' '\t' | sort -k2,2r | uniq > $GSEA/$g-$r-$b.ranks
+			if [[ -s $GSEA/$g-$r-$a.ranks || -s $GSEA/$g-$r-$b.ranks ]]; then
+				python $DIR/gsea.py $GSEA/genesets.dict $GSEA/$g-$r-$a.ranks $GSEA/$g-$r-$a
+				python $DIR/gsea.py $GSEA/genesets.dict $GSEA/$g-$r-$b.ranks $GSEA/$g-$r-$b
+			fi
+			rm -rf $GSEA/$g-$r-$a.ranks $GSEA/$g-$r-$b.ranks
 		done
 	done
 done
@@ -504,5 +513,4 @@ done
 #----------------------------------------------------
 echo "Step 9 of 9: Wrapping up results..."
 tar -zcvf $O visualization excel >/dev/null 2>&1
-
 echo "Done. Your results are compressed into $O."
diff --git a/src/create_figure.py b/src/create_figure.py
@@ -0,0 +1,232 @@
+'''
+
+@author: jbayer
+
+Module with different functions for creating different figures.
+
+"heatmap" is specialised for plotting MTI-Set overlap ratios within a heatmap build
+with pyplot from matplotlib.
+    INPUT:
+    mti_d - {mir:[uniProt Accessions]} dictionary
+    outPath - Path to save heatmap plot
+    le - True if function compares MTI-sets that have just leading edge genes in it
+            (-> changes in title)
+    OUTPUT:
+    Heatmap pdf file
+    
+"bargraph" creates two graphs on one page with the number of miRNAs/MTIs per
+analysis step.
+    INPUT:
+    db_num_l  -   Step 1: number of identified items in MTI DBs
+    occ_num_l -   Step 2: number of items occurring as often as defined over MTI DBs
+    uni_num_l -   Step 3: number of items mapped to UniProt Accessions
+    annot_num_l - Step 4: number of items overlapping with annotation file
+    out_path - Path to save bar plot
+    sel - True if MTIs were just selected (-> no step 4)
+    OUTPUT:
+    Bar plot pdf file
+    
+"venn" is specialised to create a venn diagram for maximum four MTI DBs to show the
+numerical intersections of their MTIs or miRNAs.
+    INPUT:
+    outpath - Path to save venn diagram
+    base_list - List of used MTI DBs as numerical abbreviation (1-4)
+    baseDict_list - List with the MTI dictionaries of the MTI DBs
+    baseName_list - List with the names of the used MTI DBs
+    mti - True if diagram shows the MTIs (-> changes in title)
+    OUTPUT:
+    Venn diagram pdf file
+'''
+import tempfile, os, matplotlib, natural_sort as ns
+matplotlib.use('Agg',warn=False)
+import matplotlib.pyplot as plt, numpy as np
+
+def heatmap(mti_d, outPath, le = False):
+    label_l = []
+    for mir in mti_d.keys():
+        if mti_d[mir]:
+            label_l.append(mir)
+    label_l.sort(key=ns.natural_keys)
+
+# calculate ratio of all sets #
+    matrix = [] # matrix with ratios
+    done_l = []
+    for mir1 in label_l:
+        row_l = []
+        for mir2 in label_l:
+            union = len(set(mti_d[mir1]+mti_d[mir2]))
+            overlap = float(len(set(mti_d[mir1]).intersection(mti_d[mir2])))
+            try:
+                row_l.append(overlap/union)
+            except:
+                row_l.append(0)
+        matrix.append(row_l)
+        done_l.append(mir1)
+
+# create heatmap #
+    font_s = 280./len(label_l)
+    if font_s > 12:
+        font_s = 12
+    greyline = 24./len(label_l)
+    plt.rc('axes',linewidth = 24./len(label_l))
+
+    cmapx = plt.cm.get_cmap("GnBu")
+    hm = plt.pcolormesh(np.array(matrix), cmap=cmapx, edgecolors='lightgrey',linewidth=greyline)
+    plt.xticks(np.arange(len(label_l))+0.5,label_l, rotation = 90, fontsize=font_s) # mir labels x-axis
+    plt.yticks(np.arange(len(label_l))+0.5,label_l, fontsize=font_s) # mir labels y-axis
+    plt.tick_params(axis='both',bottom='off',left='off',right='off',top='off')  # turns off all axis ticks
+    plt.xlim(xmax=len(label_l))           # x-axis length = number of mirs 
+    plt.ylim(ymax=len(label_l))           # y-axis length = number of mirs 
+    plt.colorbar(hm)
+    #cb.ax.tick_params(labelsize=font_s)
+    if le:
+        plt.title("Leading Edge Geneset Overlap Ratio")
+    else:
+        plt.title("Geneset Overlap Ratio")#, fontsize=font_s)
+    plt.savefig(outPath,format='pdf',bbox_inches='tight')
+    plt.close()        
+    #plt.show() 
+
+def bargraph(db_num_l, occ_num_l, uni_num_l, annot_num_l, out_path, sel):
+    ''' CREATES
+    a Multiplot of two Bar-Graphs in one document.'''
+
+    all_num_l = [db_num_l, occ_num_l, uni_num_l, annot_num_l]
+    if sel: all_num_l = all_num_l[:-1]  # solely selecting MTIs: uni = annot number
+    mir_num_tuple = tuple(map(lambda num_l: num_l[0],all_num_l))    # number of mirnas per tool step in a tuple
+    tar_num_tuple = tuple(map(lambda num_l: num_l[1],all_num_l))    # number of mir-tar-ias per tool step in a tuple
+
+    xtick_name_l = ['DB search','DB occurrence','UniProt mapping', 'Annotation mapping']
+    if sel: xtick_name_l = xtick_name_l[:-1]
+
+# number of bars and their locations for plot 1 and 2 #
+    numBar = 4
+    width = 0.8 # bar width
+    if sel: 
+        numBar = 3
+    loc1 = np.arange(numBar) # (0 1 2 3 4 5)
+    loc2 = np.arange(numBar)
+
+## Subplot miRNAs ##
+    plt.figure(figsize=(6,9), dpi=80,facecolor='w')
+    plt.subplot(211)
+    p1 = plt.bar(loc1,mir_num_tuple,width)#,color=color_list1)
+# Properties for plot 1 #
+    plt.title('Number of miRNAs\nover analysis steps')
+    #plt.ylabel('Number of miRNAs')
+    plt.xticks(loc1,xtick_name_l, rotation=50)
+    # no ticks, no labels (height) on yaxis #
+    plt.tick_params(axis='both',bottom='off',top='off',left='off',right='off',labelleft='off')
+# numbers above bars (set for each) #
+    for bar in p1:
+        height = bar.get_height()
+        num_height = set_numHeight(height,mir_num_tuple[0])
+        plt.text(bar.get_x()+bar.get_width()/2.,num_height, '%d'%int(height),
+                 ha='center',va='bottom')
+# set height of xaxis a bit higher for numbers above #
+    if not mir_num_tuple[0] == 0:
+        plt.axis([-0.3, 4, 0, mir_num_tuple[0]/6.+mir_num_tuple[0]])
+    else:
+        plt.axis([0, 4, -4, 4])
+
+## Subplot miRNA-target interactions ##
+    plt.subplot(212)
+    p2 = plt.bar(loc2,tar_num_tuple,width)#,color=color_list)
+# Properties for plot 2 #
+    plt.title('Number of miRNA-target interactions\nover analysis steps')
+    #plt.ylabel('Number of miRNA-target interactions')
+    plt.xticks(loc2,xtick_name_l, rotation=50)
+    plt.tick_params(axis='both',bottom='off',top='off',left='off',right='off',labelleft='off')
+# numbers above bars (set for each) #
+    for bar in p2:
+        max_h = max(tar_num_tuple[0],tar_num_tuple[2])
+        height = bar.get_height()
+        num_height = set_numHeight(height, max_h)
+        plt.text(bar.get_x()+bar.get_width()/2.,num_height, '%d'%int(height),
+                 ha='center',va='bottom')
+# set height of xaxis a bit higher for numbers above #
+    if not tar_num_tuple[0] == 0:
+        plt.axis([-0.3, 4, 0, max_h/6.+max_h])
+    else:
+        plt.axis([0, 4, -4, 4])
+
+# adjust distance between subplots #
+    plt.subplots_adjust(hspace=1)
+# save figure as pdf, remove white space #
+    plt.savefig(out_path,format='pdf',bbox_inches='tight')
+    plt.close()
+
+
+def venn(outpath, base_list, baseDict_list, baseName_list, mti):
+    ''' CREATES 
+    a venn diagram with the number of miRNAs/ MTIs '''
+
+    color_list = ["#4F81BD","#9BBB59","#8064A2","#C0504D"]
+    colors = []
+    # R - string #
+    r_str = '''suppressPackageStartupMessages(library(VennDiagram))
+    '''                              
+    # create lists of miRFams and colors for each DB in R #                       
+    for base in base_list:
+        b_dict = getBaseDict(base, baseDict_list, baseName_list)
+        if not mti:
+            add = 'miRs.pdf'
+            main = 'miRNAs'
+            miRFam_li = list(b_dict.keys())
+        else:
+            add = 'MTIs.pdf'
+            main = 'miRNA-target interactions'
+            miRFam_li = b_dict
+        miRFam_str = str(miRFam_li).replace("[", "").replace("]","")
+
+        if miRFam_str:
+            r_str += ''+base+' <- c('+miRFam_str+')\n'
+            colors.append(color_list[base_list.index(base)])
+
+    color_str = str(colors)[1:-1]
+    bases = str(base_list)[1:-1]
+
+    # set different circle and label sizes for diff. numbers of sets #
+    if not len(base_list) == 0:
+        if len(base_list) == 4 or len(base_list) == 3:
+            catCex = '1.3' # label size
+            labelPos = ')' # label position (here: default)
+        else:
+            catCex = '2'
+            labelPos = ',cat.pos=0)' # position (here: top - 0 degrees )
+
+        # Send lists for each DB to R #
+        r_str += '''base_data <- list('''+bases.replace("'", '')+''')   # list of miRNAs per DB
+        names(base_data) <- c('''+bases.replace("'", '"')+''')          # DB names for miRNA lists
+        pdf(file="'''+outpath+'Venn_'+add+'''",7,7)\n'''
+        # Create Venn-Diagrams #
+        r_str += '''grid.draw(venn.diagram(base_data, filename = NULL, margin=0.1, main="'''+main+'''",
+        main.fontfamily="sans", main.cex='''+catCex+''',main.pos=c(0.5,1),
+        scaled=FALSE,euler.d=FALSE, cat.fontfamily=rep("sans",'''+str(len(base_list))+'''),
+        col=c('''+color_str+'''),cex=2,fontfamily="sans",cat.cex='''+catCex+labelPos+''')
+        '''
+        r_str += "dev.off()"
+        # create temporary file and open it with R (command line) #
+        with tempfile.NamedTemporaryFile(suffix='.R') as tmp:
+            tmp.write(r_str)
+            tmp.flush()
+            os.system("R --vanilla < {tmp}".format(tmp=tmp.name)+" >/dev/null")
+            tmp.close()
+
+
+def getBaseDict(base, baseDict_list, baseName_list):
+    '''returns the dictionary of the given target base''' 
+    return baseDict_list[baseName_list.index(base)]
+
+def set_numHeight(height,maxi):
+    ''' returns the height to write the number '''
+    if height < maxi/6:
+        num_height = 1.5*height
+    elif height < maxi/4:
+        num_height = 1.25*height
+    elif height < maxi/2:
+        num_height = 1.1*height
+    else: num_height = 1.05*height
+    return num_height
+
+