0.6.2: internal changes to bindetect
msbentsen committed Jun 19, 2019
1 parent 8cad568 · commit 4f346b5
Showing 6 changed files with 61 additions and 36 deletions.
CHANGES: 5 changes (5 additions, 0 deletions)
@@ -1,3 +1,8 @@
+## 0.6.2 (2019-06-19)
+- Added --skip-excel option to skip the per-factor excel overview in BINDetect (writing excel files can slow the run considerably for large outputs)
+- Internal changes to BINDetect for better debug overview
+- Fixed normalization for plotAggregate
+
 ## 0.6.1 (2019-06-05)
 - Fixed motif cluster names to prefix C_
 - Added --filter function to format_motifs to filter output motifs using names/ids
tobias/__init__.py: 2 changes (1 addition, 1 deletion)
@@ -1 +1 @@
__version__ = "0.6.1"
__version__ = "0.6.2"
tobias/footprinting/bindetect.py: 5 changes (4 additions, 1 deletion)
@@ -80,6 +80,7 @@ def add_bindetect_arguments(parser):
     optargs.add_argument('--bound-pvalue', metavar="<float>", type=lambda x: restricted_float(x, 0, 1), help="Set p-value threshold for bound/unbound split (default: 0.001)", default=0.001)
     optargs.add_argument('--pseudo', type=float, metavar="<float>", help="Pseudocount for calculating log2fcs (default: estimated from data)", default=None)
     optargs.add_argument('--time-series', action='store_true', help="Will only compare signals1<->signals2<->signals3 (...) in order of input, and skip all-against-all comparison.")
+    optargs.add_argument('--skip-excel', action='store_true', help="Skip creation of excel files - for large datasets, this will speed up BINDetect considerably")
 
     runargs = parser.add_argument_group("Run arguments")
     runargs.add_argument('--outdir', metavar="<directory>", help="Output directory to place TFBS/plots in (default: bindetect_output)", default="bindetect_output")
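For context: restricted_float (used by --bound-pvalue above) is defined elsewhere in TOBIAS; only its call signature appears in this diff. A minimal sketch of such an argparse range validator, with the function body and error text as assumptions:

```python
import argparse

def restricted_float(x, lower, upper):
    """Parse x as a float and require lower <= x <= upper (sketch, not the TOBIAS implementation)."""
    x = float(x)  # non-numeric input raises ValueError, which argparse reports as invalid
    if not (lower <= x <= upper):
        raise argparse.ArgumentTypeError("{0} is outside the allowed range [{1}, {2}]".format(x, lower, upper))
    return x

parser = argparse.ArgumentParser()
parser.add_argument('--bound-pvalue', type=lambda x: restricted_float(x, 0, 1), default=0.001)
print(parser.parse_args(['--bound-pvalue', '0.05']).bound_pvalue)  # 0.05
```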
@@ -204,6 +205,9 @@ def run_bindetect(args):
     peak_chroms = peaks.get_chroms()
     peak_columns = len(peaks[0]) #number of columns
 
+    if args.debug:
+        peaks = RegionList(peaks[:1000])
+
     #Header
     if args.peak_header != None:
         content = open(args.peak_header, "r").read()
@@ -236,7 +240,6 @@ def run_bindetect(args):
     bg = np.array([(1-args.gc)/2.0, args.gc/2.0, args.gc/2.0, (1-args.gc)/2.0])
     logger.info("- GC content estimated at {0:.2f}%".format(gc_content*100))
 
-
     ################ Get motifs ################
     logger.info("Reading motifs from file")
     motif_list = MotifList()
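The bg line in the hunk above turns a single GC fraction into a symmetric A/C/G/T background distribution: A and T each receive (1-GC)/2, C and G each receive GC/2. A self-contained sketch with an illustrative GC value:

```python
import numpy as np

gc = 0.41  # illustrative GC fraction; TOBIAS estimates this from the peak sequences
# Background frequencies in A, C, G, T order; AT and GC mass are each split evenly
bg = np.array([(1 - gc) / 2.0, gc / 2.0, gc / 2.0, (1 - gc) / 2.0])
assert np.isclose(bg.sum(), 1.0)  # a proper probability distribution
print(bg)  # [0.295 0.205 0.205 0.295]
```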
tobias/footprinting/bindetect_functions.py: 42 changes (26 additions, 16 deletions)
@@ -321,7 +321,7 @@ def process_tfbs(TF_name, args, log2fc_params): #per tf
             outfile = os.path.join(bed_outdir, "{0}_{1}_{2}.bed".format(TF_name, condition, state))
             chosen_bool = 1 if state == "bound" else 0
             bedlines_subset = [bedline for bedline in bedlines if bedline[condition + "_bound"] == chosen_bool]
-            bedlines_subset = sorted(bedlines_subset, key= lambda line: line[condition + "_score"], reverse=True)
+            #bedlines_subset = sorted(bedlines_subset, key= lambda line: line[condition + "_score"], reverse=True)
             dict_to_tab(bedlines_subset, outfile, chosen_columns)
 
     ##### Write overview with scores, bound and log2fcs ####
@@ -331,22 +331,27 @@ def process_tfbs(TF_name, args, log2fc_params): #per tf

     #Write xlsx overview
     bed_table = pd.DataFrame(bedlines)
-    try:
-        overview_excel = os.path.join(args.outdir, TF_name, TF_name + "_overview.xlsx")
-        writer = pd.ExcelWriter(overview_excel, engine='xlsxwriter')
-        bed_table.to_excel(writer, index=False, columns=overview_columns)
-
-        worksheet = writer.sheets['Sheet1']
-        no_rows, no_cols = bed_table.shape
-        worksheet.autofilter(0,0,no_rows, no_cols-1)
-        writer.save()
-
-    except:
-        print("Error writing excelfile for TF {0}".format(TF_name))
-        sys.exit()
-
+    stime_excel = datetime.now()
+    if args.skip_excel == False:
+        try:
+            overview_excel = os.path.join(args.outdir, TF_name, TF_name + "_overview.xlsx")
+            writer = pd.ExcelWriter(overview_excel, engine='xlsxwriter') #, options=dict(constant_memory=True))
+            bed_table.to_excel(writer, index=False, columns=overview_columns)
+
+            #autofilter not possible with constant_memory
+            worksheet = writer.sheets['Sheet1']
+            no_rows, no_cols = bed_table.shape
+            worksheet.autofilter(0,0,no_rows, no_cols)
+            writer.save()
+
+        except Exception as e:
+            print("Error writing excelfile for TF {0}".format(TF_name))
+            print(e)
+            sys.exit()
+
+    etime_excel = datetime.now()
     etime = datetime.now()
-    logger.debug("{0} - Local effects took:\t{1}".format(TF_name, etime - stime))
+    logger.debug("{0} - Local effects took:\t{1} (excel: {2})".format(TF_name, etime - stime, etime_excel - stime_excel))
 
     ############################## Global effects ##############################
 
@@ -402,6 +407,8 @@ def process_tfbs(TF_name, args, log2fc_params): #per tf
         info_table.at[TF_name, base + "_change"] = 0
         info_table.at[TF_name, base + "_pvalue"] = 1
 
+    #stime_plot = datetime.now()
+
     #### Plot comparison ###
     fig, ax = plt.subplots(1,1)
     ax.hist(observed_log2fcs, bins='auto', label="Observed log2fcs", density=True)
@@ -433,6 +440,9 @@ def process_tfbs(TF_name, args, log2fc_params): #per tf
     log2fc_pdf.savefig(fig, bbox_inches='tight')
     plt.close(fig)
 
+    #etime_plot = datetime.now()
+    #logger.debug("{0} - Plotting took:\t{1}".format(TF_name, etime_plot - stime_plot))
+
     log2fc_pdf.close()
 
     etime = datetime.now()
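A minimal, self-contained sketch of the guarded xlsx export introduced above; the skip flag and timing mirror the diff, the DataFrame content is illustrative, and writer.close() replaces the writer.save() of 2019-era pandas:

```python
from datetime import datetime
import pandas as pd

skip_excel = False  # stands in for args.skip_excel; set True to bypass the slow export
bed_table = pd.DataFrame({"TFBS_score": [12.3, 8.1], "cond1_bound": [1, 0]})  # illustrative

stime_excel = datetime.now()
if not skip_excel:
    writer = pd.ExcelWriter("TF_overview.xlsx", engine="xlsxwriter")
    bed_table.to_excel(writer, index=False)

    # Add a filter dropdown across the table; xlsxwriter indices are zero-based and
    # inclusive, and the header occupies row 0, so the last data row index is no_rows
    worksheet = writer.sheets["Sheet1"]
    no_rows, no_cols = bed_table.shape
    worksheet.autofilter(0, 0, no_rows, no_cols - 1)
    writer.close()
print("excel step took:", datetime.now() - stime_excel)
```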
tobias/motifs/format_motifs.py: 15 changes (14 additions, 1 deletion)
@@ -68,20 +68,30 @@ def run_formatmotifs(args):
     motif_list = MotifList()
     converted_content = ""
     for f in motif_files:
-        logger.info("- {0}".format(f))
+        logger.debug("- {0}".format(f))
         motif_list.extend(MotifList().from_file(f))
 
+    logger.info("Read {} motifs\n".format(len(motif_list)))
+
+    #Sort out duplicate motifs
+    all_motif_ids = [motif.id for motif in motif_list]
+    unique_motif_ids = set(all_motif_ids)
+    if len(all_motif_ids) != len(unique_motif_ids):
+        logger.info("Found duplicate motif ids in file - choosing first motif with unique id.")
+        motif_list = MotifList([motif_list[all_motif_ids.index(motifid)] for motifid in unique_motif_ids])
+        logger.info("Reduced to {0} unique motif ids".format(len(motif_list)))
+
     ### Filter motif list ###
     if args.filter != None:
 
         #Read filter
         logger.info("Reading entries in {0}".format(args.filter))
         entries = open(args.filter, "r").read().split()
         logger.info("Read {0} unique filter values".format(len(set(entries))))
 
         #Match to input motifs #print(entries)
         logger.info("Matching motifs to filter")
+        used_filters = []
         filtered_list = MotifList()
         for input_motif in motif_list:
             found_in_filter = 0
@@ -92,13 +102,16 @@ def run_formatmotifs(args):
                 filtered_list.append(input_motif)
                 logger.debug("Selected motif {0} ({1}) due to filter value {2}".format(input_motif.name, input_motif.id, entries[i]))
                 found_in_filter = 1
+                used_filters.append(entries[i])
 
             if found_in_filter == 0:
                 logger.debug("Could not find any match to motif {0} ({1}) in filter".format(input_motif.name, input_motif.id))
 
         logger.info("Filtered number of motifs from {0} to {1}".format(len(motif_list), len(filtered_list)))
         motif_list = filtered_list
 
+        logger.debug("Filters not used: {0}".format(list(set(entries) - set(used_filters))))
+
     #### Write out results ####
     if args.task == "split":
         logger.info("Writing individual files to directory {0}".format(args.output))
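The duplicate handling above looks up the first index of each id from a set, which does not preserve input order (sets are unordered). A hedged sketch of an equivalent first-occurrence dedup that keeps order; the Motif class here is a minimal stand-in for the TOBIAS motif objects:

```python
class Motif:
    """Minimal stand-in for a TOBIAS motif exposing the .id attribute used in the diff."""
    def __init__(self, id):
        self.id = id

def dedup_first_occurrence(motifs):
    """Keep the first motif per id, preserving input order."""
    seen = set()
    unique = []
    for motif in motifs:
        if motif.id not in seen:
            seen.add(motif.id)
            unique.append(motif)
    return unique

motifs = [Motif("MA0139.1"), Motif("MA0139.1"), Motif("MA0003.4")]
print([m.id for m in dedup_first_occurrence(motifs)])  # ['MA0139.1', 'MA0003.4']
```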
tobias/plotting/plot_aggregate.py: 28 changes (11 additions, 17 deletions)
@@ -63,7 +63,7 @@ def add_aggregate_arguments(parser):
     PLOT.add_argument('--share-y', metavar="", help="Share y-axis range across plots (none/signals/sites/both). Use \"--share_y signals\" if bigwig signals have similar ranges. Use \"--share_y sites\" if sites per bigwig are comparable, but bigwigs themselves aren't comparable. (default: none)", choices=["none", "signals", "sites", "both"], default="none")
 
     #Signals / regions
-    PLOT.add_argument('--norm-comparisons', action='store_true', help="Normalize the aggregate signal in comparison column/row to the same range (default: the true range is shown)")
+    PLOT.add_argument('--normalize', action='store_true', help="Normalize the aggregate signal to be in the same range (default: the true range of values is shown)")
     PLOT.add_argument('--negate', action='store_true', help="Negate overlap with regions")
     PLOT.add_argument('--log-transform', help="", action="store_true")
     PLOT.add_argument('--plot-boundaries', help="Plot TFBS boundaries", action='store_true')
@@ -250,8 +250,12 @@ def run_aggregate(args):
             signalmat_log = np.log2(signalmat_abs + 1)
             signalmat_log[signalmat < 0] *= -1 #original negatives back to <0
             signalmat = signalmat_log
 
         aggregate = np.nanmean(signalmat, axis=0)
+
+        if args.normalize:
+            aggregate = preprocessing.minmax_scale(aggregate)
+
         aggregate_dict[signal_name][region_name] = aggregate
 
         signalmat = None
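The hunk above applies a sign-preserving log transform, sign(x)·log2(|x|+1), before averaging over sites, and the new --normalize option then min-max scales the aggregate into [0, 1]. Both steps as a standalone sketch with synthetic values:

```python
import numpy as np
from sklearn import preprocessing

signalmat = np.array([[-3.0, 0.5, 8.0], [1.0, -0.25, 4.0]])  # rows = sites, cols = positions

# Sign-preserving log transform: sign(x) * log2(|x| + 1)
signalmat_log = np.log2(np.abs(signalmat) + 1)
signalmat_log[signalmat < 0] *= -1

aggregate = np.nanmean(signalmat_log, axis=0)            # mean footprint per position
aggregate_norm = preprocessing.minmax_scale(aggregate)   # squeezed into [0, 1] for comparison
print(aggregate_norm)
```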
@@ -263,7 +267,7 @@ def run_aggregate(args):

     logger.comment("")
     logger.info("---- Analysis ----")
 
     #Measure of footprint depth in comparison to baseline
     logger.info("Calculating footprint depth measure")
     logger.info("FPD (signal,regions): footprint_width baseline middle FPD")
@@ -430,28 +434,18 @@ def run_aggregate(args):
             aggregate = aggregate_dict[signal_name][region_name]
             axarr[row, col].plot(xvals, aggregate, color=colors[col+row], linewidth=1, label=signal_name)
 
-            aggregate_norm = preprocessing.minmax_scale(aggregate)
-
             #Compare across rows and cols
-            if col_compare: #compare between different columns by adding one more column
-                if args.norm_comparisons:
-                    aggregate_compare = aggregate_norm
-                else:
-                    aggregate_compare = aggregate
-                axarr[row, -1].plot(xvals, aggregate_compare, color=colors[row+col], linewidth=1, alpha=0.8, label=col_names[col])
+            if col_compare: #compare between different columns by adding one more column
+                axarr[row, -1].plot(xvals, aggregate, color=colors[row+col], linewidth=1, alpha=0.8, label=col_names[col])
                 axarr[row, -1].legend(loc="lower right")
 
             if row_compare: #compare between different rows by adding one more row
-                if args.norm_comparisons:
-                    aggregate_compare = aggregate_norm
-                else:
-                    aggregate_compare = aggregate
-                axarr[-1, col].plot(xvals, aggregate_compare, color=colors[row+col], linewidth=1, alpha=0.8, label=row_names[row])
+                axarr[-1, col].plot(xvals, aggregate, color=colors[row+col], linewidth=1, alpha=0.8, label=row_names[row])
                 axarr[-1, col].legend(loc="lower right")
 
             #Diagonal comparison
             if n_rows == n_cols and col_compare and row_compare and col == row:
-                axarr[-1, -1].plot(xvals, aggregate_compare, color=colors[row+col], linewidth=1, alpha=0.8)
+                axarr[-1, -1].plot(xvals, aggregate, color=colors[row+col], linewidth=1, alpha=0.8)
 
             #Add number of sites to plot
             axarr[row, col].text(0.98,0.98,str(len(regions_dict[region_name])), transform = axarr[row, col].transAxes, fontsize=12, va="top", ha="right")
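The grid logic above overlays each condition's aggregate into an extra comparison column (and row); since 0.6.2 the curves are already normalized upstream, so the panels plot aggregate directly. A reduced sketch of that layout with synthetic data (one row of conditions plus the overlay column):

```python
import numpy as np
import matplotlib.pyplot as plt

xvals = np.arange(-60, 60)
signals = {"cond1": np.exp(-(xvals / 30.0) ** 2), "cond2": 0.7 * np.exp(-(xvals / 20.0) ** 2)}

# One panel per condition plus a final "comparison" panel overlaying all conditions
fig, axarr = plt.subplots(1, len(signals) + 1, figsize=(10, 3), squeeze=False)
for col, (name, aggregate) in enumerate(signals.items()):
    axarr[0, col].plot(xvals, aggregate, linewidth=1)
    axarr[0, col].set_title(name)
    axarr[0, -1].plot(xvals, aggregate, linewidth=1, alpha=0.8, label=name)  # overlay
axarr[0, -1].set_title("comparison")
axarr[0, -1].legend(loc="lower right")
plt.savefig("aggregate_sketch.pdf", bbox_inches="tight")
```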
