diff --git a/tobias/TOBIAS.py b/tobias/TOBIAS.py index b04a693..ac7475e 100644 --- a/tobias/TOBIAS.py +++ b/tobias/TOBIAS.py @@ -29,11 +29,50 @@ from tobias.misc.subsample_bam import * from tobias.misc.merge_pdfs import * from tobias.misc.maxpos import * +#from tobias.misc.create_network import * +from tobias.misc.log2table import * TOBIAS_VERSION = "0.2" #<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< Change here :-) def main(): + + #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~# + + all_parser_info = {"Tools for footprinting analysis": + { + "ATACorrect":{"help":"Correct reads with regards to Tn5 sequence bias", "add_arguments": add_atacorrect_arguments, "function":run_atacorrect}, + "FootprintScores":{"help":"Calculate footprint scores from cutsites", "add_arguments": add_footprint_arguments, "function":run_footprinting, "space":"\t"}, + "BINDetect":{"help":"Detect TF binding from footprints", "add_arguments": add_bindetect_arguments, "function":run_bindetect}, + }, + + "Tools for working with motifs/TFBS": + { + "TFBScan": {"help":"Identify positions of TFBS given sequence and motifs", "add_arguments": add_tfbscan_arguments, "function": run_tfbscan}, + "FormatMotifs": {"help": "Utility to deal with motif files", "add_arguments": add_formatmotifs_arguments, "function": run_formatmotifs}, + "ClusterTFBS": {"help": "Cluster TFs based on overlap of sites", "add_arguments": add_clustering_arguments, "function": run_clustering}, + "ScoreBed": {"help":"Score .bed-file with signal from .bigwig-file(s)", "add_arguments": add_scorebed_arguments, "function": run_scorebed}, + }, + + "Visualization tools": + { + "PlotAggregate": {"help": "Aggregate of .bigwig-signal across TF binding sites", "add_arguments": add_aggregate_arguments, "function": run_aggregate, "space":"\t"}, + "PlotHeatmap": {"help": "Heatmap of .bigwig-signal across TF binding sites", "add_arguments": add_heatmap_arguments, "function": run_heatmap}, + "PlotChanges": {"help": "Plot changes in TF binding across multiple conditions (from BINDetect output)", "add_arguments": add_plotchanges_arguments, "function": run_plotchanges} + }, + + "Miscellaneous tools": + { + "MergePDF": {"help": "Merge pdf files to one", "add_arguments":add_mergepdf_arguments, "function":run_mergepdf}, + "MaxPos": {"help": "Get .bed-positions of highest bigwig signal within .bed-regions", "add_arguments": add_maxpos_arguments, "function": run_maxpos}, + "SubsampleBam": {"help": "Subsample a .bam-file using samtools", "add_arguments": add_subsample_arguments, "function": run_subsampling}, + #"CreateNetwork": {"help": "Create TF-gene network from annotated TFBS", "add_arguments": add_network_arguments, "function": run_network, "space":"\t"} + "Log2Table": {"help": "Convert logs from PlotAggregate to tab-delimitered tables of footprint stats", "add_arguments": add_log2table_arguments, "function": run_log2table} + } + } + + #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~# + parser = argparse.ArgumentParser("TOBIAS", usage=SUPPRESS) parser._action_groups.pop() parser.formatter_class = lambda prog: argparse.RawDescriptionHelpFormatter(prog, max_help_position=25, width=90) @@ -41,7 +80,7 @@ def main(): ______________________________________________________________________________ | | | ~ T O B I A S ~ | - | Transcription factor Occupancy prediction | + | Transcription factor Occupancy prediction | | By Investigation of ATAC-seq Signal | |______________________________________________________________________________| @@ -50,133 +89,21 @@ def main(): ''') subparsers = parser.add_subparsers(title=None, metavar="") - all_tool_parsers = {} - - #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~# - #~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Tools for footprinting ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~# - #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~# - - parser.description += "Tools for footprinting analysis:\n" - - name, hlp = "ATACorrect", "Correct reads with regards to Tn5 sequence bias" - parser.description += " {0}\t\t{1}\n".format(name, hlp) - atacorrect_parser = subparsers.add_parser(name, usage=SUPPRESS) - atacorrect_parser = add_atacorrect_arguments(atacorrect_parser) #add atacorrect arguments to the atacorrect subparser - atacorrect_parser.set_defaults(func=run_atacorrect) - all_tool_parsers[name.lower()] = atacorrect_parser - - name, hlp = "FootprintScores", "Calculate footprint scores from cutsites" - parser.description += " {0}\t{1}\n".format(name, hlp) - footprint_parser = subparsers.add_parser(name, usage=SUPPRESS) - footprint_parser = add_footprint_arguments(footprint_parser) - footprint_parser.set_defaults(func=run_footprinting) - all_tool_parsers[name.lower()] = footprint_parser - - name, hlp = "BINDetect", "Detects TF binding from footprints" - parser.description += " {0}\t\t{1}\n".format(name, hlp) - detect_parser = subparsers.add_parser(name, usage=SUPPRESS) - detect_parser = add_bindetect_arguments(detect_parser) - detect_parser.set_defaults(func=run_bindetect) - all_tool_parsers[name.lower()] = detect_parser - - - #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~# - #~~~~~~~~~~~~~~~~~~~~~~~~~~~ Tools for working with TFBS ~~~~~~~~~~~~~~~~~~~~~~~~~# - #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~# - - parser.description += "\nTools for working with motifs/TFBS:\n" - - name, hlp = "TFBScan", "Identify positions of TFBS given sequence and motifs" - parser.description += " {0}\t\t{1}\n".format(name, hlp) - tfbscan_parser = subparsers.add_parser(name, usage=SUPPRESS) - tfbscan_parser = add_tfbscan_arguments(tfbscan_parser) - tfbscan_parser.set_defaults(func=run_tfbscan) - all_tool_parsers[name.lower()] = tfbscan_parser - - name, hlp = "FormatMotifs", "Utility to deal with motif files" - parser.description += " {0}\t\t{1}\n".format(name, hlp) - formatmotifs_parser = subparsers.add_parser(name, usage=SUPPRESS) - formatmotifs_parser = add_formatmotifs_arguments(formatmotifs_parser) - formatmotifs_parser.set_defaults(func=run_formatmotifs) - all_tool_parsers[name.lower()] = formatmotifs_parser - - - name, hlp = "ClusterTFBS", "Cluster TFs based on overlap of sites" - parser.description += " {0}\t\t{1}\n".format(name, hlp) - clustering_parser = subparsers.add_parser(name, usage=SUPPRESS) - clustering_parser = add_clustering_arguments(clustering_parser) - clustering_parser.set_defaults(func=run_clustering) - all_tool_parsers[name.lower()] = clustering_parser - - name, hlp = "ScoreBed", "Score .bed-file with signal from .bigwig-file(s)" - parser.description += " {0}\t\t{1}\n".format(name, hlp) - scorebed_parser = subparsers.add_parser(name, usage=SUPPRESS) - scorebed_parser = add_scorebed_arguments(scorebed_parser) - scorebed_parser.set_defaults(func=run_scorebed) - all_tool_parsers[name.lower()] = scorebed_parser - - - - #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~# - #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Tools for plotting ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~# - #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~# - - parser.description += "\nVisualization tools:\n" - - name, hlp = "PlotAggregate", "Aggregate of .bigwig-signal across TF binding sites" - parser.description += " {0}\t{1}\n".format(name, hlp) - aggregate_parser = subparsers.add_parser(name, usage=SUPPRESS) - aggregate_parser = add_aggregate_arguments(aggregate_parser) - aggregate_parser.set_defaults(func=run_aggregate) - all_tool_parsers[name.lower()] = aggregate_parser - - name, hlp = "PlotHeatmap", "Heatmap of .bigwig-signal across TF binding sites" - parser.description += " {0}\t\t{1}\n".format(name, hlp) - heatmap_parser = subparsers.add_parser(name, usage=SUPPRESS) - heatmap_parser = add_heatmap_arguments(heatmap_parser) - heatmap_parser.set_defaults(func=run_heatmap) - all_tool_parsers[name.lower()] = heatmap_parser - - name, hlp = "PlotChanges", "Plot changes in TF binding across multiple conditions (from BINDetect output)" - parser.description += " {0}\t\t{1}\n".format(name, hlp) - changeplot_parser = subparsers.add_parser(name, usage=SUPPRESS) - changeplot_parser = add_plotchanges_arguments(changeplot_parser) - changeplot_parser.set_defaults(func=run_plotchanges) - all_tool_parsers[name.lower()] = changeplot_parser - - - - #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~# - #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Misc tools ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~# - #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~# - - parser.description += "\nMiscellaneous tools:\n" - - name, hlp = "MergePDF", "Merge pdf files to one" - parser.description += " {0}\t\t{1}\n".format(name, hlp) - mergepdf_parser = subparsers.add_parser(name, usage=SUPPRESS) - mergepdf_parser = add_mergepdf_arguments(mergepdf_parser) - mergepdf_parser.set_defaults(func=run_mergepdf) - all_tool_parsers[name.lower()] = mergepdf_parser - - name, hlp = "MaxPos", "Get .bed-positions of highest bigwig signal within .bed-regions" - parser.description += " {0}\t\t{1}\n".format(name, hlp) - maxpos_parser = subparsers.add_parser(name, usage=SUPPRESS) - maxpos_parser = add_maxpos_arguments(maxpos_parser) - maxpos_parser.set_defaults(func=run_maxpos) - all_tool_parsers[name.lower()] = maxpos_parser - - name, hlp = "SubsampleBam", "Subsample a .bam-file using samtools" - parser.description += " {0}\t\t{1}\n".format(name, hlp) - subsample_parser = subparsers.add_parser(name, usage=SUPPRESS) - subsample_parser = add_subsample_arguments(subsample_parser) - subsample_parser.set_defaults(func=run_subsampling) - all_tool_parsers[name.lower()] = subsample_parser - - parser.description += "\n" - - #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~# + #Add all tools to parser + all_tool_parsers = {} + for group in all_parser_info: + parser.description += group + ":\n" + + info = all_parser_info[group] + for tool in info: + parser.description += " {0}{1}{2}\n".format(tool, info[tool].get("space", "\t\t"), info[tool]["help"]) + subparser = subparsers.add_parser(tool, usage=SUPPRESS) + subparser = info[tool]["add_arguments"](subparser) + subparser.set_defaults(func=info[tool]["function"]) + all_tool_parsers[tool.lower()] = subparser + + parser.description += "\n" parser.description += "For help on each tool, please run: TOBIAS --help\n" diff --git a/tobias/footprinting/ATACorrect_functions.py b/tobias/footprinting/ATACorrect_functions.py index 5f7bb3f..19d9de4 100644 --- a/tobias/footprinting/ATACorrect_functions.py +++ b/tobias/footprinting/ATACorrect_functions.py @@ -133,12 +133,15 @@ def bias_estimation(regions_list, params): #Map reads to positions read_per_pos = {} for read in read_lst_strand[strand]: - read_per_pos[read.cutsite] = read_per_pos.get(read.cutsite, []) + [read] + if read.cigartuples is not None: + first_tuple = read.cigartuples[-1] if read.is_reverse else read.cigartuples[0] + if first_tuple[0] == 0 and first_tuple[1] > params.k_flank + max(np.abs(params.read_shift)): + read_per_pos[read.cutsite] = read_per_pos.get(read.cutsite, []) + [read] for cutsite in read_per_pos: if cutsite > region.start and cutsite < region.end: #only reads within borders read = read_per_pos[cutsite][0] #use first read in list to establish kmer - no_cut = len(read_per_pos[cutsite]) + no_cut = min(len(read_per_pos[cutsite]), 10) #put cap on number of cuts to limit influence of outliers read.get_kmer(sequence_obj, k_flank) diff --git a/tobias/footprinting/BINDetect.py b/tobias/footprinting/BINDetect.py index 722cb20..368c07f 100644 --- a/tobias/footprinting/BINDetect.py +++ b/tobias/footprinting/BINDetect.py @@ -435,9 +435,8 @@ def run_bindetect(args): leftside_pdf = scipy.stats.lognorm.pdf(leftside_x, *log_params) #Flip over - mirrored_x = np.concatenate([leftside_x, np.max(leftside_x) + leftside_x]) - mirrored_pdf = np.concatenate([leftside_pdf, leftside_pdf[::-1]]) - + mirrored_x = np.concatenate([leftside_x, np.max(leftside_x) + leftside_x]).flatten() + mirrored_pdf = np.concatenate([leftside_pdf, leftside_pdf[::-1]]).flatten() popt, cov = curve_fit(lambda x, std, sc: sc * scipy.stats.norm.pdf(x, mode, std), mirrored_x, mirrored_pdf) norm_params = (mode, popt[0]) logger.debug("Theoretical normal parameters: {0}".format(norm_params)) diff --git a/tobias/footprinting/BINDetect_functions.py b/tobias/footprinting/BINDetect_functions.py index 1ecbf53..0193e0d 100644 --- a/tobias/footprinting/BINDetect_functions.py +++ b/tobias/footprinting/BINDetect_functions.py @@ -40,7 +40,8 @@ from tobias.utils.utilities import * from tobias.utils.motifs import * from tobias.utils.signals import * -from tobias.plotting.plot_bindetect import * + +import warnings #------------------------------------------------------------------------------------------------------# #------------------------------------------------------------------------------------------------------# diff --git a/tobias/misc/log2table.py b/tobias/misc/log2table.py new file mode 100644 index 0000000..81fd4c8 --- /dev/null +++ b/tobias/misc/log2table.py @@ -0,0 +1,92 @@ +#!/usr/bin/env python + +""" +Log2Table: Creates a table from PlotAggregate logfiles + +@author: Mette Bentsen +@contact: mette.bentsen (at) mpi-bn.mpg.de +@license: MIT + +""" + + +import os +import sys +import argparse +import re +import pandas as pd + +#Utils from TOBIAS +from tobias.utils.utilities import * +from tobias.utils.logger import * + + + +def add_log2table_arguments(parser): + + parser.formatter_class = lambda prog: argparse.RawDescriptionHelpFormatter(prog, max_help_position=40, width=90) + description = "" + parser.description = format_help_description("Log2Table", description) + + parser._action_groups.pop() #pop -h + + #Required arguments + required = parser.add_argument_group('Required arguments') + required.add_argument('--logfiles', nargs="*", metavar="", help="Logfiles from PlotAggregate") + required.add_argument('--output', metavar="", help="Output directory for tables (default: current dir)", default=".") + required.add_argument('--prefix', metavar="", help="Prefix of output files", default="aggregate") + + return(parser) + +def run_log2table(args): + logger = TobiasLogger("Log2Table") + logger.begin() + + #Test input + check_required(args, ["logfiles"]) + + output_fpd = os.path.join(args.output, args.prefix + "_FPD.txt") + output_corr = os.path.join(args.output, args.prefix + "_CORRELATION.txt") + + FPD_data = [] + CORR_data = [] + + #Read all logfiles + for log_f in args.logfiles: + logger.info("Reading: {0}".format(log_f)) + with open(log_f) as f: + for line in f: + #Match FPD lines + #...... FPD (PWM_uncorrected,MA0073.1_all): 20 0.620 0.602 -0.018 + m = re.match(".*FPD\s\((.+),(.+)\): (.+) (.+) (.+) (.+)", line.rstrip()) + if m: + + elements = m.group(1), m.group(2), m.group(3), m.group(4), m.group(5), m.group(6) + signal, sites, width, baseline, fp, fpd = elements + + columns = [signal, sites, width, baseline, fp, fpd] + FPD_data.append(columns) + + #Match correlation lines + #..... CORRELATION (PWM_uncorrected,MA0002.2_all) VS (PWM_corrected,MA0002.2_all): -0.17252 + m = re.match(".*CORRELATION \((.+),(.+)\) VS \((.+),(.+)\): (.+)", line.rstrip()) + if m: + elements = m.group(1), m.group(2), m.group(3), m.group(4), m.group(5) + signal1, sites1, signal2, sites2, pearsonr = elements + + columns = [signal1, sites1, signal2, sites2, pearsonr] + CORR_data.append(columns) + + #Add reverse comparison as well + if pearsonr != "PEARSONR": + columns = [signal2, sites2, signal1, sites1, pearsonr] + CORR_data.append(columns) + + #All lines from all files read, write out tables + df_fpd = pd.DataFrame(FPD_data) + df_fpd.drop_duplicates(keep="first", inplace=True) + df_fpd.to_csv(output_fpd, sep="\t", index=False, header=False) + + df_corr = pd.DataFrame(CORR_data) + df_corr.drop_duplicates(keep="first", inplace=True) + df_corr.to_csv(output_corr, sep="\t", index=False, header=False) diff --git a/tobias/motifs/tfbscan.py b/tobias/motifs/tfbscan.py index 6196fb4..db82788 100644 --- a/tobias/motifs/tfbscan.py +++ b/tobias/motifs/tfbscan.py @@ -142,8 +142,11 @@ def run_tfbscan(args): logger = TobiasLogger("TFBScan", args.verbosity) logger.begin() parser = add_tfbscan_arguments(argparse.ArgumentParser()) + logger.arguments_overview(parser, args) - logger.output_files([args.outfile]) + + if args.outfile != None: + logger.output_files([args.outfile]) ######## Read sequences from file and estimate background gc ######## diff --git a/tobias/plotting/plot_aggregate.py b/tobias/plotting/plot_aggregate.py index 4e24ff1..fdb6110 100644 --- a/tobias/plotting/plot_aggregate.py +++ b/tobias/plotting/plot_aggregate.py @@ -62,6 +62,7 @@ def add_aggregate_arguments(parser): PLOT.add_argument('--negate', action='store_true', help="Negate overlap with regions") PLOT.add_argument('--log_transform', help="", action="store_true") PLOT.add_argument('--plot_boundaries', help="Plot TFBS boundaries", action='store_true') + #PLOT.add_argument('--outliers', help="") RUN = parser.add_argument_group("Run arguments") RUN = add_logger_args(RUN) @@ -231,10 +232,13 @@ def run_aggregate(args): signalmat = np.array([signal_dict[signal_name][reg.tup()] for reg in regions_dict[region_name]]) - #todo: Option to remove rows with lowest/highest signal (--percentiles?) - #signal_min, signal_max = np.percentile(signalmat, [1,99]) - #signalmat[signalmat < signal_min] = signal_min - #signalmat[signalmat > signal_max] = signal_max + #Exclude outliers from each column + lower_limit, upper_limit = -np.inf, np.inf + #lower_limit, upper_limit = np.percentile(signalmat[signalmat != 0], [1,99]) + + logger.debug("Lower limits: {0}".format(lower_limit)) + logger.debug("Upper limits: {0}".format(upper_limit)) + signalmat[np.logical_or(signalmat < lower_limit, signalmat > upper_limit)] = np.nan if args.log_transform: signalmat_abs = np.abs(signalmat) @@ -259,7 +263,7 @@ def run_aggregate(args): #Baseline level baseline_indices = list(range(0,args.flank-fp_flank)) + list(range(args.flank+fp_flank, len(agg))) - baseline = np.median(agg[baseline_indices]) + baseline = np.mean(agg[baseline_indices]) #Footprint level middle_indices = list(range(args.flank-fp_flank, args.flank+fp_flank)) diff --git a/tobias/utils/ngs.pyx b/tobias/utils/ngs.pyx index 2a9966e..e976ad3 100644 --- a/tobias/utils/ngs.pyx +++ b/tobias/utils/ngs.pyx @@ -51,6 +51,7 @@ class OneRead: self.is_reverse = read.is_reverse self.query_length = read.query_length self.flag = read.flag + self.cigartuples = read.cigartuples return(self)