create_figure.py

'''

@author: jbayer

Module with different functions for creating different figures.

"heatmap" is specialised for plotting MTI-Set overlap ratios within a heatmap build
with pyplot from matplotlib.
    INPUT:
    mti_d - {mir:[uniProt Accessions]} dictionary
    outPath - Path to save heatmap plot
    le - True if function compares MTI-sets that have just leading edge genes in it
            (-> changes in title)
    OUTPUT:
    Heatmap pdf file

"bargraph" creates two graphs on one page with the number of miRNAs/MTIs per
analysis step.
    INPUT:
    db_num_l  -   Step 1: number of identified items in MTI DBs
    occ_num_l -   Step 2: number of items occurring as often as defined over MTI DBs
    uni_num_l -   Step 3: number of items mapped to UniProt Accessions
    annot_num_l - Step 4: number of items overlapping with annotation file
    out_path - Path to save bar plot
    sel - True if MTIs were just selected (-> no step 4)
    OUTPUT:
    Bar plot pdf file

"venn" is specialised to create a venn diagram for maximum four MTI DBs to show the
numerical intersections of their MTIs or miRNAs.
    INPUT:
    outpath - Path prefix; the diagram is saved as <outpath>Venn_miRs.pdf or <outpath>Venn_MTIs.pdf
    base_list - List of used MTI DBs as numerical abbreviation (1-4)
    baseDict_list - List with the MTI dictionaries of the MTI DBs
    baseName_list - List with the names of the used MTI DBs
    mti - True if diagram shows the MTIs (-> changes in title)
    OUTPUT:
    Venn diagram pdf file
'''
import os
import tempfile

import matplotlib
import natural_sort as ns

# Select the non-interactive Agg backend before pyplot is imported, so the
# figures can be rendered without a display.  The former ``warn=False``
# keyword is not passed any more: newer matplotlib releases do not accept it.
matplotlib.use('Agg')

import matplotlib.pyplot as plt
import numpy as np

def heatmap(mti_d, outPath, le = False):
    '''Plot the pairwise overlap ratio of all non-empty MTI sets as a heatmap.

    For every pair of miRNAs the ratio |intersection| / |union| of their
    target sets is computed; the matrix of ratios is drawn with pcolormesh
    and saved as pdf.

    INPUT:
    mti_d - {mir:[targets]} dictionary; miRNAs without targets are left out
    outPath - Path to save heatmap plot
    le - True  -> title "Leading Edge Geneset Overlap Ratio"
         False -> title "Geneset Overlap Ratio"
    OUTPUT:
    Heatmap pdf file. If no miRNA has any target, nothing is plotted and no
    file is written.
    '''
    label_l = [mir for mir in mti_d if mti_d[mir]]
    if not label_l:
        # nothing to compare; the size calculations below would divide by zero
        return
    label_l.sort(key=ns.natural_keys)
    num_mir = len(label_l)

    # calculate ratio of all sets #
    # build every target set only once instead of once per pair
    target_set_d = dict((mir, set(mti_d[mir])) for mir in label_l)
    matrix = [] # matrix with ratios
    for mir1 in label_l:
        set1 = target_set_d[mir1]
        # all sets are non-empty, so the union can never have size 0
        matrix.append([float(len(set1 & target_set_d[mir2])) / len(set1 | target_set_d[mir2])
                       for mir2 in label_l])

    # create heatmap #
    font_s = min(280./num_mir, 12)  # shrink labels for many miRNAs, 12 at most
    greyline = 24./num_mir          # thinner grid and frame for many miRNAs

    # rc_context restricts the changed frame width to this figure; a plain
    # plt.rc() call would also change every figure that is created later
    with plt.rc_context({'axes.linewidth': greyline}):
        try:
            hm = plt.pcolormesh(np.array(matrix), cmap=plt.get_cmap("GnBu"),
                                edgecolors='lightgrey', linewidth=greyline)
            tick_pos = np.arange(num_mir)+0.5   # centre of each cell
            plt.xticks(tick_pos, label_l, rotation = 90, fontsize=font_s) # mir labels x-axis
            plt.yticks(tick_pos, label_l, fontsize=font_s)                # mir labels y-axis
            # booleans instead of 'off': turns off all axis ticks
            plt.tick_params(axis='both', bottom=False, left=False, right=False, top=False)
            plt.xlim(xmax=num_mir)           # x-axis length = number of mirs
            plt.ylim(ymax=num_mir)           # y-axis length = number of mirs
            plt.colorbar(hm)
            if le:
                plt.title("Leading Edge Geneset Overlap Ratio")
            else:
                plt.title("Geneset Overlap Ratio")
            plt.savefig(outPath, format='pdf', bbox_inches='tight')
        finally:
            # release the figure even if plotting or saving failed
            plt.close()

def bargraph(db_num_l, occ_num_l, uni_num_l, annot_num_l, out_path, sel):
    ''' CREATES
    a Multiplot of two Bar-Graphs in one document.

    The upper graph shows the number of miRNAs, the lower graph the number
    of miRNA-target interactions per analysis step.

    INPUT:
    db_num_l, occ_num_l, uni_num_l, annot_num_l - one sequence per analysis
        step; element 0 is the miRNA count, element 1 the MTI count
    out_path - Path to save bar plot
    sel - True: the last step (annotation mapping) is left out
    OUTPUT:
    Bar plot pdf file
    '''
    all_num_l = [db_num_l, occ_num_l, uni_num_l, annot_num_l]
    xtick_name_l = ['DB search','DB occurrence','UniProt mapping', 'Annotation mapping']
    if sel:
        # solely selecting MTIs: uni = annot number
        all_num_l = all_num_l[:-1]
        xtick_name_l = xtick_name_l[:-1]
    mir_num_tuple = tuple(num_l[0] for num_l in all_num_l)  # number of mirnas per tool step
    tar_num_tuple = tuple(num_l[1] for num_l in all_num_l)  # number of mir-tar-ias per tool step

    plt.figure(figsize=(6,9), dpi=80, facecolor='w')
    try:
        ## Subplot miRNAs ##
        _plot_step_bars(211, mir_num_tuple, mir_num_tuple[0],
                        'Number of miRNAs\nover analysis steps', xtick_name_l)
        ## Subplot miRNA-target interactions ##
        # reference height: the larger one of step 1 and step 3
        _plot_step_bars(212, tar_num_tuple, max(tar_num_tuple[0], tar_num_tuple[2]),
                        'Number of miRNA-target interactions\nover analysis steps',
                        xtick_name_l)
        # adjust distance between subplots #
        plt.subplots_adjust(hspace=1)
        # save figure as pdf, remove white space #
        plt.savefig(out_path, format='pdf', bbox_inches='tight')
    finally:
        # release the figure even if plotting or saving failed
        plt.close()


def _plot_step_bars(subplot_id, num_tuple, ref_height, title, xtick_name_l, width=0.8):
    '''Draws one bar subplot with the count written above each bar.

    subplot_id - position of the subplot (e.g. 211)
    num_tuple - bar heights, one per analysis step
    ref_height - height used to scale the y-axis and to place the numbers
    title - title of the subplot
    xtick_name_l - labels of the bars
    width - bar width
    '''
    loc = np.arange(len(num_tuple))     # one bar per analysis step
    plt.subplot(subplot_id)
    bar_l = plt.bar(loc, num_tuple, width)
    plt.title(title)
    plt.xticks(loc, xtick_name_l, rotation=50)
    # no ticks, no labels (height) on yaxis; booleans instead of 'off' #
    plt.tick_params(axis='both', bottom=False, top=False, left=False, right=False,
                    labelleft=False)
    # numbers above bars (set for each) #
    for bar in bar_l:
        height = bar.get_height()
        plt.text(bar.get_x()+bar.get_width()/2., set_numHeight(height, ref_height),
                 '%d'%int(height), ha='center', va='bottom')
    # set height of the axis a bit higher for numbers above #
    if num_tuple[0] != 0:
        plt.axis([-0.3, 4, 0, ref_height/6.+ref_height])
    else:
        # no items in the first step: fixed axis range
        plt.axis([0, 4, -4, 4])


def venn(outpath, base_list, baseDict_list, baseName_list, mti):
    ''' CREATES
    a venn diagram with the number of miRNAs/ MTIs

    An R script using the VennDiagram package is generated, written to a
    temporary file and run with "R --vanilla".

    INPUT:
    outpath - prefix of the output file; 'Venn_miRs.pdf' or 'Venn_MTIs.pdf'
              is appended
    base_list - names of the MTI DBs to draw (four colors are available)
    baseDict_list - data of the MTI DBs, in the order of baseName_list
    baseName_list - names of the MTI DBs belonging to baseDict_list
    mti - True:  the DB entries themselves are compared (MTIs)
          False: the keys of the DB dictionaries are compared (miRNAs)
    OUTPUT:
    Venn diagram pdf file; nothing is done if no DB contributes a set
    '''
    color_list = ["#4F81BD","#9BBB59","#8064A2","#C0504D"]
    # file name suffix and title depend only on the mode #
    if mti:
        add, main = 'MTIs.pdf', 'miRNA-target interactions'
    else:
        add, main = 'miRs.pdf', 'miRNAs'

    # R - script, one list entry per line #
    r_lines = ['suppressPackageStartupMessages(library(VennDiagram))']
    used_bases = []     # DBs that really get a set in R
    colors = []
    # create lists of miRFams and colors for each DB in R #
    for pos, base in enumerate(base_list):
        b_dict = getBaseDict(base, baseDict_list, baseName_list)
        miRFam_li = b_dict if mti else list(b_dict.keys())
        miRFam_str = str(miRFam_li).replace("[", "").replace("]","")
        if not miRFam_str:
            # empty DB: no R vector is created, so it must not be referenced
            # in base_data / names() either
            continue
        r_lines.append(base+' <- c('+miRFam_str+')')
        used_bases.append(base)
        colors.append(color_list[pos])

    if not used_bases:
        return

    # set different circle and label sizes for diff. numbers of sets #
    if len(used_bases) in (3, 4):
        catCex = '1.3'          # label size
        labelPos = ''           # label position (here: default)
    else:
        catCex = '2'
        labelPos = ',cat.pos=0' # position (here: top - 0 degrees )

    # Send lists for each DB to R #
    r_lines.append('base_data <- list('+', '.join(used_bases)+')')
    r_lines.append('names(base_data) <- c('
                   +', '.join('"'+base+'"' for base in used_bases)+')')
    r_lines.append('pdf(file="'+outpath+'Venn_'+add+'",7,7)')
    # Create Venn-Diagrams #
    r_lines.append('grid.draw(venn.diagram(base_data, filename = NULL, margin=0.1, '
                   'main="'+main+'",')
    r_lines.append('main.fontfamily="sans", main.cex='+catCex+',main.pos=c(0.5,1),')
    r_lines.append('scaled=FALSE,euler.d=FALSE, cat.fontfamily=rep("sans",'
                   +str(len(used_bases))+'),')
    r_lines.append('col=c('+', '.join("'"+color+"'" for color in colors)
                   +'),cex=2,fontfamily="sans",cat.cex='+catCex+labelPos+'))')
    r_lines.append('dev.off()')
    r_str = '\n'.join(r_lines)+'\n'

    # create temporary file and open it with R (command line) #
    # text mode: the script is a str, the default binary mode rejects it
    with tempfile.NamedTemporaryFile(mode='w', suffix='.R') as tmp:
        tmp.write(r_str)
        tmp.flush()     # R reads the file by name, so it has to be on disk
        os.system("R --vanilla < {tmp} >/dev/null".format(tmp=tmp.name))


def getBaseDict(base, baseDict_list, baseName_list):
    '''Looks up the dictionary that belongs to the given target base.

    baseName_list and baseDict_list are used as parallel lists: the position
    of base in baseName_list selects the entry of baseDict_list.
    A base that is not in baseName_list raises ValueError (list.index).
    '''
    position = baseName_list.index(base)
    return baseDict_list[position]

def set_numHeight(height,maxi):
    '''Gives the y-position at which the number of a bar is written.

    The bar height is stretched by a factor; the smaller the bar is
    compared to maxi, the larger the factor.
    '''
    if height < maxi/6:
        factor = 1.5
    elif height < maxi/4:
        factor = 1.25
    elif height < maxi/2:
        factor = 1.1
    else:
        factor = 1.05
    return factor*height
	'''

	@author: jbayer

	Module with different functions for creating different figures.

	"heatmap" is specialised for plotting MTI-Set overlap ratios within a heatmap build
	with pyplot from matplotlib.
	INPUT:
	mti_d - {mir:[uniProt Accessions]} dictionary
	outPath - Path to save heatmap plot
	le - True if function compares MTI-sets that have just leading edge genes in it
	(-> changes in title)
	OUTPUT:
	Heatmap pdf file

	"bargraph" creates two graphs on one page with the number of miRNAs/MTIs per
	analysis step.
	INPUT:
	db_num_l - Step 1: number of identified items in MTI DBs
	occ_num_l - Step 2: number of items occurring as often as defined over MTI DBs
	uni_num_l - Step 3: number of items mapped to UniProt Accessions
	annot_num_l - Step 4: number of items overlapping with annotation file
	out_path - Path to save bar plot
	sel - True if MTIs were just selected (-> no step 4)
	OUTPUT:
	Bar plot pdf file

	"venn" is specialised to create a venn diagram for maximum four MTI DBs to show the
	numerical intersections of their MTIs or miRNAs.
	INPUT:
	outpath - Path prefix; the diagram is saved as <outpath>Venn_miRs.pdf or <outpath>Venn_MTIs.pdf
	base_list - List of used MTI DBs as numerical abbreviation (1-4)
	baseDict_list - List with the MTI dictionaries of the MTI DBs
	baseName_list - List with the names of the used MTI DBs
	mti - True if diagram shows the MTIs (-> changes in title)
	OUTPUT:
	Venn diagram pdf file
	'''
	import os
	import tempfile

	import matplotlib
	import natural_sort as ns

	# Select the non-interactive Agg backend before pyplot is imported, so the
	# figures can be rendered without a display.  The former ``warn=False``
	# keyword is not passed any more: newer matplotlib releases do not accept it.
	matplotlib.use('Agg')

	import matplotlib.pyplot as plt
	import numpy as np

	def heatmap(mti_d, outPath, le = False):
		'''Plot the pairwise overlap ratio of all non-empty MTI sets as a heatmap.

		For every pair of miRNAs the ratio |intersection| / |union| of their
		target sets is computed; the matrix of ratios is drawn with pcolormesh
		and saved as pdf.

		INPUT:
		mti_d - {mir:[targets]} dictionary; miRNAs without targets are left out
		outPath - Path to save heatmap plot
		le - True  -> title "Leading Edge Geneset Overlap Ratio"
		     False -> title "Geneset Overlap Ratio"
		OUTPUT:
		Heatmap pdf file. If no miRNA has any target, nothing is plotted and no
		file is written.
		'''
		label_l = [mir for mir in mti_d if mti_d[mir]]
		if not label_l:
			# nothing to compare; the size calculations below would divide by zero
			return
		label_l.sort(key=ns.natural_keys)
		num_mir = len(label_l)

		# calculate ratio of all sets #
		# build every target set only once instead of once per pair
		target_set_d = dict((mir, set(mti_d[mir])) for mir in label_l)
		matrix = [] # matrix with ratios
		for mir1 in label_l:
			set1 = target_set_d[mir1]
			# all sets are non-empty, so the union can never have size 0
			matrix.append([float(len(set1 & target_set_d[mir2])) / len(set1 | target_set_d[mir2])
				for mir2 in label_l])

		# create heatmap #
		font_s = min(280./num_mir, 12)  # shrink labels for many miRNAs, 12 at most
		greyline = 24./num_mir          # thinner grid and frame for many miRNAs

		# rc_context restricts the changed frame width to this figure; a plain
		# plt.rc() call would also change every figure that is created later
		with plt.rc_context({'axes.linewidth': greyline}):
			try:
				hm = plt.pcolormesh(np.array(matrix), cmap=plt.get_cmap("GnBu"),
					edgecolors='lightgrey', linewidth=greyline)
				tick_pos = np.arange(num_mir)+0.5   # centre of each cell
				plt.xticks(tick_pos, label_l, rotation = 90, fontsize=font_s) # mir labels x-axis
				plt.yticks(tick_pos, label_l, fontsize=font_s)                # mir labels y-axis
				# booleans instead of 'off': turns off all axis ticks
				plt.tick_params(axis='both', bottom=False, left=False, right=False, top=False)
				plt.xlim(xmax=num_mir)           # x-axis length = number of mirs
				plt.ylim(ymax=num_mir)           # y-axis length = number of mirs
				plt.colorbar(hm)
				if le:
					plt.title("Leading Edge Geneset Overlap Ratio")
				else:
					plt.title("Geneset Overlap Ratio")
				plt.savefig(outPath, format='pdf', bbox_inches='tight')
			finally:
				# release the figure even if plotting or saving failed
				plt.close()

	def bargraph(db_num_l, occ_num_l, uni_num_l, annot_num_l, out_path, sel):
		''' CREATES
		a Multiplot of two Bar-Graphs in one document.

		The upper graph shows the number of miRNAs, the lower graph the number
		of miRNA-target interactions per analysis step.

		INPUT:
		db_num_l, occ_num_l, uni_num_l, annot_num_l - one sequence per analysis
			step; element 0 is the miRNA count, element 1 the MTI count
		out_path - Path to save bar plot
		sel - True: the last step (annotation mapping) is left out
		OUTPUT:
		Bar plot pdf file
		'''
		all_num_l = [db_num_l, occ_num_l, uni_num_l, annot_num_l]
		xtick_name_l = ['DB search','DB occurrence','UniProt mapping', 'Annotation mapping']
		if sel:
			# solely selecting MTIs: uni = annot number
			all_num_l = all_num_l[:-1]
			xtick_name_l = xtick_name_l[:-1]
		mir_num_tuple = tuple(num_l[0] for num_l in all_num_l)  # number of mirnas per tool step
		tar_num_tuple = tuple(num_l[1] for num_l in all_num_l)  # number of mir-tar-ias per tool step

		plt.figure(figsize=(6,9), dpi=80, facecolor='w')
		try:
			## Subplot miRNAs ##
			_plot_step_bars(211, mir_num_tuple, mir_num_tuple[0],
				'Number of miRNAs\nover analysis steps', xtick_name_l)
			## Subplot miRNA-target interactions ##
			# reference height: the larger one of step 1 and step 3
			_plot_step_bars(212, tar_num_tuple, max(tar_num_tuple[0], tar_num_tuple[2]),
				'Number of miRNA-target interactions\nover analysis steps',
				xtick_name_l)
			# adjust distance between subplots #
			plt.subplots_adjust(hspace=1)
			# save figure as pdf, remove white space #
			plt.savefig(out_path, format='pdf', bbox_inches='tight')
		finally:
			# release the figure even if plotting or saving failed
			plt.close()


	def _plot_step_bars(subplot_id, num_tuple, ref_height, title, xtick_name_l, width=0.8):
		'''Draws one bar subplot with the count written above each bar.

		subplot_id - position of the subplot (e.g. 211)
		num_tuple - bar heights, one per analysis step
		ref_height - height used to scale the y-axis and to place the numbers
		title - title of the subplot
		xtick_name_l - labels of the bars
		width - bar width
		'''
		loc = np.arange(len(num_tuple))     # one bar per analysis step
		plt.subplot(subplot_id)
		bar_l = plt.bar(loc, num_tuple, width)
		plt.title(title)
		plt.xticks(loc, xtick_name_l, rotation=50)
		# no ticks, no labels (height) on yaxis; booleans instead of 'off' #
		plt.tick_params(axis='both', bottom=False, top=False, left=False, right=False,
			labelleft=False)
		# numbers above bars (set for each) #
		for bar in bar_l:
			height = bar.get_height()
			plt.text(bar.get_x()+bar.get_width()/2., set_numHeight(height, ref_height),
				'%d'%int(height), ha='center', va='bottom')
		# set height of the axis a bit higher for numbers above #
		if num_tuple[0] != 0:
			plt.axis([-0.3, 4, 0, ref_height/6.+ref_height])
		else:
			# no items in the first step: fixed axis range
			plt.axis([0, 4, -4, 4])


	def venn(outpath, base_list, baseDict_list, baseName_list, mti):
		''' CREATES
		a venn diagram with the number of miRNAs/ MTIs

		An R script using the VennDiagram package is generated, written to a
		temporary file and run with "R --vanilla".

		INPUT:
		outpath - prefix of the output file; 'Venn_miRs.pdf' or 'Venn_MTIs.pdf'
			is appended
		base_list - names of the MTI DBs to draw (four colors are available)
		baseDict_list - data of the MTI DBs, in the order of baseName_list
		baseName_list - names of the MTI DBs belonging to baseDict_list
		mti - True:  the DB entries themselves are compared (MTIs)
		      False: the keys of the DB dictionaries are compared (miRNAs)
		OUTPUT:
		Venn diagram pdf file; nothing is done if no DB contributes a set
		'''
		color_list = ["#4F81BD","#9BBB59","#8064A2","#C0504D"]
		# file name suffix and title depend only on the mode #
		if mti:
			add, main = 'MTIs.pdf', 'miRNA-target interactions'
		else:
			add, main = 'miRs.pdf', 'miRNAs'

		# R - script, one list entry per line #
		r_lines = ['suppressPackageStartupMessages(library(VennDiagram))']
		used_bases = []     # DBs that really get a set in R
		colors = []
		# create lists of miRFams and colors for each DB in R #
		for pos, base in enumerate(base_list):
			b_dict = getBaseDict(base, baseDict_list, baseName_list)
			miRFam_li = b_dict if mti else list(b_dict.keys())
			miRFam_str = str(miRFam_li).replace("[", "").replace("]","")
			if not miRFam_str:
				# empty DB: no R vector is created, so it must not be referenced
				# in base_data / names() either
				continue
			r_lines.append(base+' <- c('+miRFam_str+')')
			used_bases.append(base)
			colors.append(color_list[pos])

		if not used_bases:
			return

		# set different circle and label sizes for diff. numbers of sets #
		if len(used_bases) in (3, 4):
			catCex = '1.3'          # label size
			labelPos = ''           # label position (here: default)
		else:
			catCex = '2'
			labelPos = ',cat.pos=0' # position (here: top - 0 degrees )

		# Send lists for each DB to R #
		r_lines.append('base_data <- list('+', '.join(used_bases)+')')
		r_lines.append('names(base_data) <- c('
			+', '.join('"'+base+'"' for base in used_bases)+')')
		r_lines.append('pdf(file="'+outpath+'Venn_'+add+'",7,7)')
		# Create Venn-Diagrams #
		r_lines.append('grid.draw(venn.diagram(base_data, filename = NULL, margin=0.1, '
			'main="'+main+'",')
		r_lines.append('main.fontfamily="sans", main.cex='+catCex+',main.pos=c(0.5,1),')
		r_lines.append('scaled=FALSE,euler.d=FALSE, cat.fontfamily=rep("sans",'
			+str(len(used_bases))+'),')
		r_lines.append('col=c('+', '.join("'"+color+"'" for color in colors)
			+'),cex=2,fontfamily="sans",cat.cex='+catCex+labelPos+'))')
		r_lines.append('dev.off()')
		r_str = '\n'.join(r_lines)+'\n'

		# create temporary file and open it with R (command line) #
		# text mode: the script is a str, the default binary mode rejects it
		with tempfile.NamedTemporaryFile(mode='w', suffix='.R') as tmp:
			tmp.write(r_str)
			tmp.flush()     # R reads the file by name, so it has to be on disk
			os.system("R --vanilla < {tmp} >/dev/null".format(tmp=tmp.name))


	def getBaseDict(base, baseDict_list, baseName_list):
		'''returns the dictionary of the given target base

		baseName_list and baseDict_list are used as parallel lists: the position
		of base in baseName_list selects the entry of baseDict_list.
		A base that is not in baseName_list raises ValueError (list.index).
		'''
		return baseDict_list[baseName_list.index(base)]

	def set_numHeight(height,maxi):
		''' returns the height to write the number

		The bar height is stretched by a factor; the smaller the bar is
		compared to maxi, the larger the factor.
		'''
		if height < maxi/6:
			num_height = 1.5*height
		elif height < maxi/4:
			num_height = 1.25*height
		elif height < maxi/2:
			num_height = 1.1*height
		else:
			num_height = 1.05*height
		return num_height