Skip to content
Permalink
Browse files
Implemented gene set enrichment
  • Loading branch information
jenzopr committed Jul 22, 2015
1 parent 916de7e commit aa53535785f865b177aec054cb8cf4c0938adf89
Showing with 670 additions and 4 deletions.
  1. +1 −0 .gitignore
  2. 0 src/__init__.py
  3. +12 −4 src/admire
  4. +232 −0 src/create_figure.py
  5. +379 −0 src/enrichAnalysis.py
  6. +32 −0 src/gsea.py
  7. +14 −0 src/natural_sort.py
@@ -1 +1,2 @@
test/*
src/*.pyc
Empty file.
@@ -260,9 +260,9 @@ EOL

color=${color["$sample"]}
if [[ "$file" = /* ]]; then
ln -s $file slide${slidecount}/slide${slidecount}_${sample%%-*}_$color.idat
ln -f -s $file slide${slidecount}/slide${slidecount}_${sample%%-*}_$color.idat
else
ln -s ../$file slide${slidecount}/slide${slidecount}_${sample%%-*}_$color.idat
ln -f -s ../$file slide${slidecount}/slide${slidecount}_${sample%%-*}_$color.idat
fi

cat >> SampleSheet.csv_ << EOL
@@ -484,14 +484,23 @@ Rscript $DIR/renderAdditionalImages.R >/dev/null 2>&1
echo "Step 8 of 9: Calculating enrichment across gene sets..."
GSEA=geneset_enrichment
mkdir -p $GSEA
# build up gene sets as serialized data structure
cat ${G[@]} | awk 'BEGIN{FS="\t";OFS="";print "{"}{name=$1;genes="";for(i=3;i<=NF;i++){if(i<NF){genes=genes"\""$(i)"\","}else{genes=genes"\""$(i)"\""}};if(NR!=1){print ",\""name"\" : ["genes"]"}else{print "\""name"\" : ["genes"]"}}END{print "}"}' | tr -d '\n' > $GSEA/genesets.dict
set -- ${groups[@]}
for a; do
shift
for b; do
g=$a-$b
for reg in "${R[@]}"; do
r=`basename $reg .bed`

# build ranked list(s)
tail -n +2 $EXCEL/$g-$r.csv | grep "$a" | cut -f 2,17 -d "," | grep -v '^,' | tr ',' '\t' | sort -k2,2r | uniq > $GSEA/$g-$r-$a.ranks
tail -n +2 $EXCEL/$g-$r.csv | grep "$b" | cut -f 2,17 -d "," | grep -v '^,' | tr ',' '\t' | sort -k2,2r | uniq > $GSEA/$g-$r-$b.ranks
if [[ -s $GSEA/$g-$r-$a.ranks || -s $GSEA/$g-$r-$b.ranks ]]; then
python $DIR/gsea.py $GSEA/genesets.dict $GSEA/$g-$r-$a.ranks $GSEA/$g-$r-$a
python $DIR/gsea.py $GSEA/genesets.dict $GSEA/$g-$r-$b.ranks $GSEA/$g-$r-$b
fi
rm -rf $GSEA/$g-$r-$a.ranks $GSEA/$g-$r-$b.ranks
done
done
done
@@ -504,5 +513,4 @@ done
#----------------------------------------------------
echo "Step 9 of 9: Wrapping up results..."
tar -zcvf $O visualization excel >/dev/null 2>&1

echo "Done. Your results are compressed into $O."
@@ -0,0 +1,232 @@
'''
@author: jbayer
Module with different functions for creating different figures.
"heatmap" is specialised for plotting MTI-Set overlap ratios within a heatmap build
with pyplot from matplotlib.
INPUT:
mti_d - {mir:[uniProt Accessions]} dictionary
outPath - Path to save heatmap plot
le - True if function compares MTI-sets that have just leading edge genes in it
(-> changes in title)
OUTPUT:
Heatmap pdf file
"bargraph" creates two graphs on one page with the number of miRNAs/MTIs per
analysis step.
INPUT:
db_num_l - Step 1: number of identified items in MTI DBs
occ_num_l - Step 2: number of items occurring as often as defined over MTI DBs
uni_num_l - Step 3: number of items mapped to UniProt Accessions
annot_num_l - Step 4: number of items overlapping with annotation file
out_path - Path to save bar plot
sel - True if MTIs were just selected (-> no step 4)
OUTPUT:
Bar plot pdf file
"venn" is specialised to create a venn diagram for maximum four MTI DBs to show the
numerical intersections of their MTIs or miRNAs.
INPUT:
outpath - Path to save venn diagram
base_list - List of used MTI DBs as numerical abbreviation (1-4)
baseDict_list - List with the MTI dictionaries of the MTI DBs
baseName_list - List with the names of the used MTI DBs
mti - True if diagram shows the MTIs (-> changes in title)
OUTPUT:
Venn diagram pdf file
'''
import tempfile, os, matplotlib, natural_sort as ns
matplotlib.use('Agg',warn=False)
import matplotlib.pyplot as plt, numpy as np

def heatmap(mti_d, outPath, le = False):
label_l = []
for mir in mti_d.keys():
if mti_d[mir]:
label_l.append(mir)
label_l.sort(key=ns.natural_keys)

# calculate ratio of all sets #
matrix = [] # matrix with ratios
done_l = []
for mir1 in label_l:
row_l = []
for mir2 in label_l:
union = len(set(mti_d[mir1]+mti_d[mir2]))
overlap = float(len(set(mti_d[mir1]).intersection(mti_d[mir2])))
try:
row_l.append(overlap/union)
except:
row_l.append(0)
matrix.append(row_l)
done_l.append(mir1)

# create heatmap #
font_s = 280./len(label_l)
if font_s > 12:
font_s = 12
greyline = 24./len(label_l)
plt.rc('axes',linewidth = 24./len(label_l))

cmapx = plt.cm.get_cmap("GnBu")
hm = plt.pcolormesh(np.array(matrix), cmap=cmapx, edgecolors='lightgrey',linewidth=greyline)
plt.xticks(np.arange(len(label_l))+0.5,label_l, rotation = 90, fontsize=font_s) # mir labels x-axis
plt.yticks(np.arange(len(label_l))+0.5,label_l, fontsize=font_s) # mir labels y-axis
plt.tick_params(axis='both',bottom='off',left='off',right='off',top='off') # turns off all axis ticks
plt.xlim(xmax=len(label_l)) # x-axis length = number of mirs
plt.ylim(ymax=len(label_l)) # y-axis length = number of mirs
plt.colorbar(hm)
#cb.ax.tick_params(labelsize=font_s)
if le:
plt.title("Leading Edge Geneset Overlap Ratio")
else:
plt.title("Geneset Overlap Ratio")#, fontsize=font_s)
plt.savefig(outPath,format='pdf',bbox_inches='tight')
plt.close()
#plt.show()

def bargraph(db_num_l, occ_num_l, uni_num_l, annot_num_l, out_path, sel):
''' CREATES
a Multiplot of two Bar-Graphs in one document.'''

all_num_l = [db_num_l, occ_num_l, uni_num_l, annot_num_l]
if sel: all_num_l = all_num_l[:-1] # solely selecting MTIs: uni = annot number
mir_num_tuple = tuple(map(lambda num_l: num_l[0],all_num_l)) # number of mirnas per tool step in a tuple
tar_num_tuple = tuple(map(lambda num_l: num_l[1],all_num_l)) # number of mir-tar-ias per tool step in a tuple

xtick_name_l = ['DB search','DB occurrence','UniProt mapping', 'Annotation mapping']
if sel: xtick_name_l = xtick_name_l[:-1]

# number of bars and their locations for plot 1 and 2 #
numBar = 4
width = 0.8 # bar width
if sel:
numBar = 3
loc1 = np.arange(numBar) # (0 1 2 3 4 5)
loc2 = np.arange(numBar)

## Subplot miRNAs ##
plt.figure(figsize=(6,9), dpi=80,facecolor='w')
plt.subplot(211)
p1 = plt.bar(loc1,mir_num_tuple,width)#,color=color_list1)
# Properties for plot 1 #
plt.title('Number of miRNAs\nover analysis steps')
#plt.ylabel('Number of miRNAs')
plt.xticks(loc1,xtick_name_l, rotation=50)
# no ticks, no labels (height) on yaxis #
plt.tick_params(axis='both',bottom='off',top='off',left='off',right='off',labelleft='off')
# numbers above bars (set for each) #
for bar in p1:
height = bar.get_height()
num_height = set_numHeight(height,mir_num_tuple[0])
plt.text(bar.get_x()+bar.get_width()/2.,num_height, '%d'%int(height),
ha='center',va='bottom')
# set height of xaxis a bit higher for numbers above #
if not mir_num_tuple[0] == 0:
plt.axis([-0.3, 4, 0, mir_num_tuple[0]/6.+mir_num_tuple[0]])
else:
plt.axis([0, 4, -4, 4])

## Subplot miRNA-target interactions ##
plt.subplot(212)
p2 = plt.bar(loc2,tar_num_tuple,width)#,color=color_list)
# Properties for plot 2 #
plt.title('Number of miRNA-target interactions\nover analysis steps')
#plt.ylabel('Number of miRNA-target interactions')
plt.xticks(loc2,xtick_name_l, rotation=50)
plt.tick_params(axis='both',bottom='off',top='off',left='off',right='off',labelleft='off')
# numbers above bars (set for each) #
for bar in p2:
max_h = max(tar_num_tuple[0],tar_num_tuple[2])
height = bar.get_height()
num_height = set_numHeight(height, max_h)
plt.text(bar.get_x()+bar.get_width()/2.,num_height, '%d'%int(height),
ha='center',va='bottom')
# set height of xaxis a bit higher for numbers above #
if not tar_num_tuple[0] == 0:
plt.axis([-0.3, 4, 0, max_h/6.+max_h])
else:
plt.axis([0, 4, -4, 4])

# adjust distance between subplots #
plt.subplots_adjust(hspace=1)
# save figure as pdf, remove white space #
plt.savefig(out_path,format='pdf',bbox_inches='tight')
plt.close()


def venn(outpath, base_list, baseDict_list, baseName_list, mti):
''' CREATES
a venn diagram with the number of miRNAs/ MTIs '''

color_list = ["#4F81BD","#9BBB59","#8064A2","#C0504D"]
colors = []
# R - string #
r_str = '''suppressPackageStartupMessages(library(VennDiagram))
'''
# create lists of miRFams and colors for each DB in R #
for base in base_list:
b_dict = getBaseDict(base, baseDict_list, baseName_list)
if not mti:
add = 'miRs.pdf'
main = 'miRNAs'
miRFam_li = list(b_dict.keys())
else:
add = 'MTIs.pdf'
main = 'miRNA-target interactions'
miRFam_li = b_dict
miRFam_str = str(miRFam_li).replace("[", "").replace("]","")

if miRFam_str:
r_str += ''+base+' <- c('+miRFam_str+')\n'
colors.append(color_list[base_list.index(base)])

color_str = str(colors)[1:-1]
bases = str(base_list)[1:-1]

# set different circle and label sizes for diff. numbers of sets #
if not len(base_list) == 0:
if len(base_list) == 4 or len(base_list) == 3:
catCex = '1.3' # label size
labelPos = ')' # label position (here: default)
else:
catCex = '2'
labelPos = ',cat.pos=0)' # position (here: top - 0 degrees )

# Send lists for each DB to R #
r_str += '''base_data <- list('''+bases.replace("'", '')+''') # list of miRNAs per DB
names(base_data) <- c('''+bases.replace("'", '"')+''') # DB names for miRNA lists
pdf(file="'''+outpath+'Venn_'+add+'''",7,7)\n'''
# Create Venn-Diagrams #
r_str += '''grid.draw(venn.diagram(base_data, filename = NULL, margin=0.1, main="'''+main+'''",
main.fontfamily="sans", main.cex='''+catCex+''',main.pos=c(0.5,1),
scaled=FALSE,euler.d=FALSE, cat.fontfamily=rep("sans",'''+str(len(base_list))+'''),
col=c('''+color_str+'''),cex=2,fontfamily="sans",cat.cex='''+catCex+labelPos+''')
'''
r_str += "dev.off()"
# create temporary file and open it with R (command line) #
with tempfile.NamedTemporaryFile(suffix='.R') as tmp:
tmp.write(r_str)
tmp.flush()
os.system("R --vanilla < {tmp}".format(tmp=tmp.name)+" >/dev/null")
tmp.close()


def getBaseDict(base, baseDict_list, baseName_list):
'''returns the dictionary of the given target base'''
return baseDict_list[baseName_list.index(base)]

def set_numHeight(height,maxi):
''' returns the height to write the number '''
if height < maxi/6:
num_height = 1.5*height
elif height < maxi/4:
num_height = 1.25*height
elif height < maxi/2:
num_height = 1.1*height
else: num_height = 1.05*height
return num_height


0 comments on commit aa53535

Please sign in to comment.