Skip to content
This repository has been archived by the owner. It is now read-only.

Commit

Permalink
Merge remote-tracking branch 'origin/master' into network
Browse files Browse the repository at this point in the history
  • Loading branch information
msbentsen committed Jul 16, 2019
2 parents 6a0a33f + 0c8a9dc commit 0222aa0
Show file tree
Hide file tree
Showing 18 changed files with 544 additions and 256 deletions.
34 changes: 33 additions & 1 deletion CHANGES
Original file line number Diff line number Diff line change
@@ -1,4 +1,36 @@
## 0.4.0 (2019-04-26)
## 0.6.3 (2019-07-16)
- Increased size of texts in BINDetect volcano plot and moved label into plot

## 0.6.2 (2019-06-19)
- Added --skip-excel option to skip the excel overview per factor in BINDetect (generating it can slow the run considerably for large outputs)
- Internal changes to BINDetect for better debug overview
- Fixed normalization for plotAggregate

## 0.6.1 (2019-06-05)
- Fixed motif cluster names to prefix C_
- Added --filter function to format_motifs to filter output motifs using names/ids

## 0.6.0 (2019-05-29)
- Added option to flip axes in PlotAggregate via "--signal-on-x"
- Changed all command-line arguments containing "_" to "-" (e.g. --regions_in to --regions-in) (but retaining both options internally)

## 0.5.3 (2019-05-28)
- Improved error messaging from file writers in bindetect

## 0.5.2 (2019-05-16)
- Bugfix for reading meme format files in MotifList().from_file()

## 0.5.1 (2019-05-15)
- Internal changes to OneMotif and MotifList classes
- Bindetect now takes directories/file(s) as input for --motifs

## 0.5.0 (2019-05-02)
- Added sum/mean/none scoring to ScoreBigwig as well as the option to get --absolute of input signal

## 0.4.1 (2019-04-29)
- Fixed weird "can't pickle SwigPyObject objects"-error in bindetect

## 0.4.0 (2019-04-29)
- Added --add_region_columns to TFBScan
- Renamed FootprintScores to ScoreBigwig
- Added normalization of input score distributions in BINDetect
Expand Down
6 changes: 6 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@ TOBIAS - Transcription factor Occupancy prediction By Investigation of ATAC-seq
=======================================

[![PyPI Version](https://img.shields.io/pypi/v/tobias.svg?style=plastic)](https://pypi.org/project/tobias/)
[![install with bioconda](https://img.shields.io/badge/install%20with-bioconda-brightgreen.svg?style=flat)](http://bioconda.github.io/recipes/tobias/README.html)

Introduction
------------
Expand Down Expand Up @@ -113,6 +114,11 @@ Split a motif file containing several motifs:
$ TOBIAS FormatMotifs --input test_data/example_motifs.txt --format pfm --task split --output split_motifs
```

Filter a larger motif file using TF names:
```
$ echo 'MAFK CTCF JUNB' > TF_names.txt
$ TOBIAS FormatMotifs --input test_data/example_motifs.txt --output filtered_motifs.txt --filter TF_names.txt
```

Snakemake pipeline
------------
Expand Down
4 changes: 3 additions & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,11 +52,12 @@ def readme():
version=find_version(os.path.join(setupdir, "tobias", "__init__.py")), #get version from __init__.py
description='Transcription factor Occupancy prediction By Investigation of ATAC-seq Signal',
long_description=readme(),
long_description_content_type='text/markdown',
url='https://github.molgen.mpg.de/loosolab/TOBIAS',
author='Mette Bentsen',
author_email='mette.bentsen@mpi-bn.mpg.de',
license='MIT',
packages=find_packages(), #"tobias"), #['tobias', 'tobias.footprinting', 'tobias.plotting', 'tobias.motifs', 'tobias.misc', 'tobias.utils'],
packages=find_packages(),
entry_points={
'console_scripts': ['TOBIAS=tobias.TOBIAS:main']
},
Expand All @@ -77,6 +78,7 @@ def readme():
'pyBigWig',
'MOODS-python',
],
scripts=["tobias/utils/filter_important_factors.py"],
classifiers=[
'License :: OSI Approved :: MIT License',
'Intended Audience :: Science/Research',
Expand Down
7 changes: 6 additions & 1 deletion tobias/TOBIAS.py
Original file line number Diff line number Diff line change
Expand Up @@ -119,10 +119,15 @@ def main():

#Add version to subparser
subparser.add_argument("--version", action='version', version=TOBIAS_VERSION)
subparser = add_underscore_options(subparser)

#Add parser for old tool names
if "replaces" in info[tool]:
all_tool_parsers[info[tool]["replaces"].lower()] = subparser
replace_tool = info[tool]["replaces"]
subparser = subparsers.add_parser(replace_tool, usage=SUPPRESS)
subparser = info[tool]["add_arguments"](subparser)
subparser.set_defaults(func=info[tool]["function"])
all_tool_parsers[replace_tool.lower()] = subparser

parser.description += "\n"

Expand Down
2 changes: 1 addition & 1 deletion tobias/__init__.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = "0.4.0"
__version__ = "0.6.3"
12 changes: 7 additions & 5 deletions tobias/footprinting/atacorrect.py
Original file line number Diff line number Diff line change
Expand Up @@ -71,13 +71,13 @@ def add_atacorrect_arguments(parser):

#Optional arguments
optargs = parser.add_argument_group('Optional arguments')
optargs.add_argument('--regions_in', metavar="<bed>", help="Input regions for estimating bias (default: regions not in peaks.bed)")
optargs.add_argument('--regions_out', metavar="<bed>", help="Output regions (default: peaks.bed)")
optargs.add_argument('--regions-in', metavar="<bed>", help="Input regions for estimating bias (default: regions not in peaks.bed)")
optargs.add_argument('--regions-out', metavar="<bed>", help="Output regions (default: peaks.bed)")
optargs.add_argument('--blacklist', metavar="<bed>", help="Blacklisted regions in .bed-format (default: None)") #file containing blacklisted regions to be excluded from analysis")
optargs.add_argument('--extend', metavar="<int>", type=int, help="Extend output regions with basepairs upstream/downstream (default: 100)", default=100)
optargs.add_argument('--split_strands', help="Write out tracks per strand", action="store_true")
optargs.add_argument('--norm_off', help="Switches off normalization based on number of reads", action='store_true')
optargs.add_argument('--track_off', metavar="<track>", help="Switch off writing of individual .bigwig-tracks (uncorrected/bias/expected/corrected)", nargs="*", choices=["uncorrected", "bias", "expected", "corrected"], default=[])
optargs.add_argument('--split-strands', help="Write out tracks per strand", action="store_true")
optargs.add_argument('--norm-off', help="Switches off normalization based on number of reads", action='store_true')
optargs.add_argument('--track-off', metavar="<track>", help="Switch off writing of individual .bigwig-tracks (uncorrected/bias/expected/corrected)", nargs="*", choices=["uncorrected", "bias", "expected", "corrected"], default=[])

optargs = parser.add_argument_group('Advanced ATACorrect arguments (no need to touch)')
optargs.add_argument('--k_flank', metavar="<int>", help="Flank +/- of cutsite to estimate bias from (default: 12)", type=int, default=12)
Expand All @@ -94,6 +94,8 @@ def add_atacorrect_arguments(parser):

runargs = add_logger_args(runargs)



return(parser)

#--------------------------------------------------------------------------------------------------------#
Expand Down
41 changes: 23 additions & 18 deletions tobias/footprinting/bindetect.py
Original file line number Diff line number Diff line change
Expand Up @@ -69,17 +69,18 @@ def add_bindetect_arguments(parser):
required = parser.add_argument_group('Required arguments')
required.add_argument('--signals', metavar="<bigwig>", help="Signal per condition (.bigwig format)", nargs="*")
required.add_argument('--peaks', metavar="<bed>", help="Peaks.bed containing open chromatin regions across all conditions")
required.add_argument('--motifs', metavar="<motifs>", help="Motifs in pfm/jaspar format")
required.add_argument('--motifs', metavar="<motifs>", help="Motif file(s) in pfm/jaspar format", nargs="*")
required.add_argument('--genome', metavar="<fasta>", help="Genome .fasta file")

optargs = parser.add_argument_group('Optional arguments')
optargs.add_argument('--cond_names', metavar="<name>", nargs="*", help="Names of conditions fitting to --signals (default: prefix of --signals)")
optargs.add_argument('--peak_header', metavar="<file>", help="File containing the header of --peaks separated by whitespace or newlines (default: peak columns are named \"_additional_<count>\")")
optargs.add_argument('--cond-names', metavar="<name>", nargs="*", help="Names of conditions fitting to --signals (default: prefix of --signals)")
optargs.add_argument('--peak-header', metavar="<file>", help="File containing the header of --peaks separated by whitespace or newlines (default: peak columns are named \"_additional_<count>\")")
#optargs.add_argument('--naming', metavar="<type>", help="Naming convention for TFs ('id', 'name', 'name_id', 'id_name') (default: 'name_id')", choices=["id", "name", "name_id", "id_name"], default="name_id")
optargs.add_argument('--motif_pvalue', metavar="<float>", type=lambda x: restricted_float(x, 0, 1), help="Set p-value threshold for motif scanning (default: 1e-4)", default=0.0001)
optargs.add_argument('--bound_pvalue', metavar="<float>", type=lambda x: restricted_float(x, 0, 1), help="Set p-value threshold for bound/unbound split (default: 0.001)", default=0.001)
optargs.add_argument('--motif-pvalue', metavar="<float>", type=lambda x: restricted_float(x, 0, 1), help="Set p-value threshold for motif scanning (default: 1e-4)", default=0.0001)
optargs.add_argument('--bound-pvalue', metavar="<float>", type=lambda x: restricted_float(x, 0, 1), help="Set p-value threshold for bound/unbound split (default: 0.001)", default=0.001)
optargs.add_argument('--pseudo', type=float, metavar="<float>", help="Pseudocount for calculating log2fcs (default: estimated from data)", default=None)
optargs.add_argument('--time_series', action='store_true', help="Will only compare signals1<->signals2<->signals3 (...) in order of input, and skip all-against-all comparison.")
optargs.add_argument('--time-series', action='store_true', help="Will only compare signals1<->signals2<->signals3 (...) in order of input, and skip all-against-all comparison.")
optargs.add_argument('--skip-excel', action='store_true', help="Skip creation of excel files - for large datasets, this will speed up BINDetect considerably")

runargs = parser.add_argument_group("Run arguments")
runargs.add_argument('--outdir', metavar="<directory>", help="Output directory to place TFBS/plots in (default: bindetect_output)", default="bindetect_output")
Expand Down Expand Up @@ -190,7 +191,6 @@ def run_bindetect(args):
figure_pdf.savefig(bbox_inches='tight')
plt.close()


################# Peaks / GC in peaks ################
#Read peak and peak_header
peaks = RegionList().from_bed(args.peaks)
Expand All @@ -205,6 +205,9 @@ def run_bindetect(args):
peak_chroms = peaks.get_chroms()
peak_columns = len(peaks[0]) #number of columns

if args.debug:
peaks = RegionList(peaks[:1000])

#Header
if args.peak_header != None:
content = open(args.peak_header, "r").read()
Expand Down Expand Up @@ -237,16 +240,15 @@ def run_bindetect(args):
bg = np.array([(1-args.gc)/2.0, args.gc/2.0, args.gc/2.0, (1-args.gc)/2.0])
logger.info("- GC content estimated at {0:.2f}%".format(gc_content*100))


################ Get motifs ################
logger.info("Reading motifs from file")

motif_content = open(args.motifs).read()
converted_content = convert_motif(motif_content, "pfm")
motif_list = pfm_to_motifs(converted_content) #List of OneMotif objects
motif_list = MotifList()
args.motifs = expand_dirs(args.motifs)
for f in args.motifs:
motif_list += MotifList().from_file(f) #List of OneMotif objects
no_pfms = len(motif_list)
logger.info("- Read {0} motifs".format(no_pfms))

logger.info("- Found {0} motifs in file".format(no_pfms))
logger.debug("Getting motifs ready")
motif_list.bg = bg

Expand Down Expand Up @@ -328,7 +330,7 @@ def run_bindetect(args):
results = [task.get() for task in task_list]

logger.info("Done scanning for TFBS across regions!")
logger.stop_logger_queue() #stop the listening process (wait until all was written)
#logger.stop_logger_queue() #stop the listening process (wait until all was written)

#--------------------------------------#
logger.info("Waiting for bedfiles to write")
Expand Down Expand Up @@ -519,15 +521,17 @@ def run_bindetect(args):
cols = len(info_columns)
rows = len(motif_names)
info_table = pd.DataFrame(np.zeros((rows, cols)), columns=info_columns, index=motif_names)

#Starting calculations
results = []
if args.cores == 1:
for name in motif_names:
logger.info("- {0}".format(name))
results.append(process_tfbs(name, args, log2fc_params))
else:
task_list = [pool.apply_async(process_tfbs, (name, args, log2fc_params)) for name in motif_names]
logger.debug("Sending jobs to worker pool")

task_list = [pool.apply_async(process_tfbs, (name, args, log2fc_params, )) for name in motif_names]
monitor_progress(task_list, logger) #will not exit before all jobs are done
results = [task.get() for task in task_list]

Expand All @@ -537,6 +541,8 @@ def run_bindetect(args):
pool.terminate()
pool.join()

logger.stop_logger_queue()

#-------------------------------------------------------------------------------------------------------------#
#------------------------------------------------ Cluster TFBS -----------------------------------------------#
#-------------------------------------------------------------------------------------------------------------#
Expand Down Expand Up @@ -603,7 +609,6 @@ def run_bindetect(args):
n_rows = worksheet.dim_rowmax
n_cols = worksheet.dim_colmax
worksheet.autofilter(0,0,n_rows,n_cols)

writer.save()

#Format comparisons
Expand Down Expand Up @@ -632,7 +637,7 @@ def run_bindetect(args):
base = cond1 + "_" + cond2

#Make copy of motifs and fill in with metadata
comparison_motifs = copy.deepcopy(motif_list)
comparison_motifs = motif_list #copy.deepcopy(motif_list) - swig pickle error, just overwrite motif_list
for motif in comparison_motifs:
name = motif.prefix
motif.change = float(info_table.at[name, base + "_change"])
Expand Down
70 changes: 46 additions & 24 deletions tobias/footprinting/bindetect_functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
import matplotlib
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
import matplotlib.patches as patches
from matplotlib.backends.backend_pdf import PdfPages
from matplotlib.ticker import NullFormatter
from cycler import cycler
Expand Down Expand Up @@ -279,13 +280,19 @@ def process_tfbs(TF_name, args, log2fc_params): #per tf
comparisons = args.comparisons

#Read file to list of dicts
stime = datetime.now()
header = ["TFBS_chr", "TFBS_start", "TFBS_end", "TFBS_name", "TFBS_score", "TFBS_strand"] + args.peak_header_list + ["{0}_score".format(condition) for condition in args.cond_names]
with open(filename) as f:
bedlines = [dict(zip(header, line.rstrip().split("\t"))) for line in f.readlines()]
n_rows = len(bedlines)
etime = datetime.now()
logger.debug("{0} - Reading took:\t{1}".format(TF_name, etime - stime))


############################## Local effects ###############################

stime = datetime.now()

#Sort, scale and calculate log2fc
bedlines = sorted(bedlines, key=lambda line: (line["TFBS_chr"], int(line["TFBS_start"]), int(line["TFBS_end"])))
for line in bedlines:
Expand All @@ -294,7 +301,7 @@ def process_tfbs(TF_name, args, log2fc_params): #per tf
for condition in args.cond_names:
threshold = args.thresholds[condition]
line[condition + "_score"] = float(line[condition + "_score"])
line[condition + "_score"] = np.round(args.norm_objects[condition].normalize(line[condition + "_score"] ), 5)
line[condition + "_score"] = round(args.norm_objects[condition].normalize(line[condition + "_score"] ), 5)
line[condition + "_bound"] = 1 if line[condition + "_score"] > threshold else 0

#Comparison specific
Expand All @@ -315,7 +322,7 @@ def process_tfbs(TF_name, args, log2fc_params): #per tf
outfile = os.path.join(bed_outdir, "{0}_{1}_{2}.bed".format(TF_name, condition, state))
chosen_bool = 1 if state == "bound" else 0
bedlines_subset = [bedline for bedline in bedlines if bedline[condition + "_bound"] == chosen_bool]
bedlines_subset = sorted(bedlines_subset, key= lambda line: line[condition + "_score"], reverse=True)
#bedlines_subset = sorted(bedlines_subset, key= lambda line: line[condition + "_score"], reverse=True)
dict_to_tab(bedlines_subset, outfile, chosen_columns)

##### Write overview with scores, bound and log2fcs ####
Expand All @@ -325,23 +332,32 @@ def process_tfbs(TF_name, args, log2fc_params): #per tf

#Write xlsx overview
bed_table = pd.DataFrame(bedlines)
try:
overview_excel = os.path.join(args.outdir, TF_name, TF_name + "_overview.xlsx")
writer = pd.ExcelWriter(overview_excel, engine='xlsxwriter')
bed_table.to_excel(writer, index=False, columns=overview_columns)

worksheet = writer.sheets['Sheet1']
no_rows, no_cols = bed_table.shape
worksheet.autofilter(0,0,no_rows, no_cols-1)
writer.save()

except:
print("Error writing excelfile for TF {0}".format(TF_name))
sys.exit()

stime_excel = datetime.now()
if args.skip_excel == False:
try:
overview_excel = os.path.join(args.outdir, TF_name, TF_name + "_overview.xlsx")
writer = pd.ExcelWriter(overview_excel, engine='xlsxwriter') #, options=dict(constant_memory=True))
bed_table.to_excel(writer, index=False, columns=overview_columns)

#autfilter not possible with constant_memory
worksheet = writer.sheets['Sheet1']
no_rows, no_cols = bed_table.shape
worksheet.autofilter(0,0,no_rows, no_cols)
writer.save()

except Exception as e:
print("Error writing excelfile for TF {0}".format(TF_name))
print(e)
sys.exit()

etime_excel = datetime.now()
etime = datetime.now()
logger.debug("{0} - Local effects took:\t{1} (excel: {2})".format(TF_name, etime - stime, etime_excel - stime_excel))

############################## Global effects ##############################

stime = datetime.now()

#Get info table ready
info_columns = ["total_tfbs"]
info_columns.extend(["{0}_{1}".format(cond, metric) for (cond, metric) in itertools.product(args.cond_names, ["mean_score", "bound"])])
Expand Down Expand Up @@ -392,6 +408,8 @@ def process_tfbs(TF_name, args, log2fc_params): #per tf
info_table.at[TF_name, base + "_change"] = 0
info_table.at[TF_name, base + "_pvalue"] = 1

#stime_plot = datetime.now()

#### Plot comparison ###
fig, ax = plt.subplots(1,1)
ax.hist(observed_log2fcs, bins='auto', label="Observed log2fcs", density=True)
Expand Down Expand Up @@ -423,8 +441,14 @@ def process_tfbs(TF_name, args, log2fc_params): #per tf
log2fc_pdf.savefig(fig, bbox_inches='tight')
plt.close(fig)

#etime_plot = datetime.now()
#logger.debug("{0} - Plotting took:\t{1}".format(TF_name, etime_plot - stime_plot))

log2fc_pdf.close()

etime = datetime.now()
logger.debug("{0} - Global effects took:\t{1}".format(TF_name, etime - stime))

#################### Remove temporary file ######################
try:
os.remove(filename)
Expand Down Expand Up @@ -592,9 +616,13 @@ def plot_bindetect(motifs, cluster_obj, conditions, args):
ax1.scatter(coord[0], coord[1], color=diff_scores[TF]["color"], s=4.5)

if diff_scores[TF]["show"] == True:
txts.append(ax1.text(coord[0], coord[1], diff_scores[TF]["volcano_label"], fontsize=7))
txts.append(ax1.text(coord[0], coord[1], diff_scores[TF]["volcano_label"], fontsize=8))

adjust_text(txts, ax=ax1, text_from_points=True, arrowprops=dict(arrowstyle='-', color='black', lw=0.5)) #, expand_text=(0.1,1.2), expand_objects=(0.1,0.1))
#Plot custom legend for colors
legend_elements = [Line2D([0],[0], marker='o', color='w', markerfacecolor="red", label="Higher scores in {0}".format(conditions[0])),
Line2D([0],[0], marker='o', color='w', markerfacecolor="blue", label="Higher scores in {0}".format(conditions[1]))]
l = ax1.legend(handles=legend_elements, loc="lower left", framealpha=0.5)
adjust_text(txts, ax=ax1, add_objects=[l], text_from_points=True, arrowprops=dict(arrowstyle='-', color='black', lw=0.5)) #, expand_text=(0.1,1.2), expand_objects=(0.1,0.1))

"""
#Add arrows to other cluster members
Expand All @@ -614,12 +642,6 @@ def plot_bindetect(motifs, cluster_obj, conditions, args):
ax1.arrow(point_x, point_y, len_x, len_y, linestyle="-", color="black", lw=0.5)
"""
#print(txts)

#Plot custom legend for colors
legend_elements = [Line2D([0],[0], marker='o', color='w', markerfacecolor="red", label="Higher scores in {0}".format(conditions[0])),
Line2D([0],[0], marker='o', color='w', markerfacecolor="blue", label="Higher scores in {0}".format(conditions[1]))]
ax1.legend(handles=legend_elements, bbox_to_anchor=(1.05, 1), loc='upper left')

return(fig)

Expand Down
Loading

0 comments on commit 0222aa0

Please sign in to comment.