This repository has been archived by the owner. It is now read-only.

Bug fix of plot_bindetect being imported into bindetect_functions; added utility to convert aggregate logs to tables
msbentsen committed Feb 27, 2019
1 parent 9ee2f62 commit 81ed108
Showing 8 changed files with 169 additions and 139 deletions.
181 changes: 54 additions & 127 deletions tobias/TOBIAS.py
@@ -29,19 +29,58 @@
from tobias.misc.subsample_bam import *
from tobias.misc.merge_pdfs import *
from tobias.misc.maxpos import *
#from tobias.misc.create_network import *
from tobias.misc.log2table import *


TOBIAS_VERSION = "0.2" #<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< Change here :-)

def main():

#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~#

all_parser_info = {"Tools for footprinting analysis":
{
"ATACorrect":{"help":"Correct reads with regards to Tn5 sequence bias", "add_arguments": add_atacorrect_arguments, "function":run_atacorrect},
"FootprintScores":{"help":"Calculate footprint scores from cutsites", "add_arguments": add_footprint_arguments, "function":run_footprinting, "space":"\t"},
"BINDetect":{"help":"Detect TF binding from footprints", "add_arguments": add_bindetect_arguments, "function":run_bindetect},
},

"Tools for working with motifs/TFBS":
{
"TFBScan": {"help":"Identify positions of TFBS given sequence and motifs", "add_arguments": add_tfbscan_arguments, "function": run_tfbscan},
"FormatMotifs": {"help": "Utility to deal with motif files", "add_arguments": add_formatmotifs_arguments, "function": run_formatmotifs},
"ClusterTFBS": {"help": "Cluster TFs based on overlap of sites", "add_arguments": add_clustering_arguments, "function": run_clustering},
"ScoreBed": {"help":"Score .bed-file with signal from .bigwig-file(s)", "add_arguments": add_scorebed_arguments, "function": run_scorebed},
},

"Visualization tools":
{
"PlotAggregate": {"help": "Aggregate of .bigwig-signal across TF binding sites", "add_arguments": add_aggregate_arguments, "function": run_aggregate, "space":"\t"},
"PlotHeatmap": {"help": "Heatmap of .bigwig-signal across TF binding sites", "add_arguments": add_heatmap_arguments, "function": run_heatmap},
"PlotChanges": {"help": "Plot changes in TF binding across multiple conditions (from BINDetect output)", "add_arguments": add_plotchanges_arguments, "function": run_plotchanges}
},

"Miscellaneous tools":
{
"MergePDF": {"help": "Merge pdf files to one", "add_arguments":add_mergepdf_arguments, "function":run_mergepdf},
"MaxPos": {"help": "Get .bed-positions of highest bigwig signal within .bed-regions", "add_arguments": add_maxpos_arguments, "function": run_maxpos},
"SubsampleBam": {"help": "Subsample a .bam-file using samtools", "add_arguments": add_subsample_arguments, "function": run_subsampling},
#"CreateNetwork": {"help": "Create TF-gene network from annotated TFBS", "add_arguments": add_network_arguments, "function": run_network, "space":"\t"}
"Log2Table": {"help": "Convert logs from PlotAggregate to tab-delimitered tables of footprint stats", "add_arguments": add_log2table_arguments, "function": run_log2table}
}
}

#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~#

parser = argparse.ArgumentParser("TOBIAS", usage=SUPPRESS)
parser._action_groups.pop()
parser.formatter_class = lambda prog: argparse.RawDescriptionHelpFormatter(prog, max_help_position=25, width=90)
parser.description = textwrap.dedent('''
______________________________________________________________________________
| |
| ~ T O B I A S ~ |
|                   Transcription factor Occupancy prediction                  |
| By Investigation of ATAC-seq Signal |
|______________________________________________________________________________|
@@ -50,133 +89,21 @@ def main():
''')

subparsers = parser.add_subparsers(title=None, metavar="")
all_tool_parsers = {}

#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~#
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Tools for footprinting ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~#
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~#

parser.description += "Tools for footprinting analysis:\n"

name, hlp = "ATACorrect", "Correct reads with regards to Tn5 sequence bias"
parser.description += " {0}\t\t{1}\n".format(name, hlp)
atacorrect_parser = subparsers.add_parser(name, usage=SUPPRESS)
atacorrect_parser = add_atacorrect_arguments(atacorrect_parser) #add atacorrect arguments to the atacorrect subparser
atacorrect_parser.set_defaults(func=run_atacorrect)
all_tool_parsers[name.lower()] = atacorrect_parser

name, hlp = "FootprintScores", "Calculate footprint scores from cutsites"
parser.description += " {0}\t{1}\n".format(name, hlp)
footprint_parser = subparsers.add_parser(name, usage=SUPPRESS)
footprint_parser = add_footprint_arguments(footprint_parser)
footprint_parser.set_defaults(func=run_footprinting)
all_tool_parsers[name.lower()] = footprint_parser

name, hlp = "BINDetect", "Detects TF binding from footprints"
parser.description += " {0}\t\t{1}\n".format(name, hlp)
detect_parser = subparsers.add_parser(name, usage=SUPPRESS)
detect_parser = add_bindetect_arguments(detect_parser)
detect_parser.set_defaults(func=run_bindetect)
all_tool_parsers[name.lower()] = detect_parser


#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~#
#~~~~~~~~~~~~~~~~~~~~~~~~~~~ Tools for working with TFBS ~~~~~~~~~~~~~~~~~~~~~~~~~#
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~#

parser.description += "\nTools for working with motifs/TFBS:\n"

name, hlp = "TFBScan", "Identify positions of TFBS given sequence and motifs"
parser.description += " {0}\t\t{1}\n".format(name, hlp)
tfbscan_parser = subparsers.add_parser(name, usage=SUPPRESS)
tfbscan_parser = add_tfbscan_arguments(tfbscan_parser)
tfbscan_parser.set_defaults(func=run_tfbscan)
all_tool_parsers[name.lower()] = tfbscan_parser

name, hlp = "FormatMotifs", "Utility to deal with motif files"
parser.description += " {0}\t\t{1}\n".format(name, hlp)
formatmotifs_parser = subparsers.add_parser(name, usage=SUPPRESS)
formatmotifs_parser = add_formatmotifs_arguments(formatmotifs_parser)
formatmotifs_parser.set_defaults(func=run_formatmotifs)
all_tool_parsers[name.lower()] = formatmotifs_parser


name, hlp = "ClusterTFBS", "Cluster TFs based on overlap of sites"
parser.description += " {0}\t\t{1}\n".format(name, hlp)
clustering_parser = subparsers.add_parser(name, usage=SUPPRESS)
clustering_parser = add_clustering_arguments(clustering_parser)
clustering_parser.set_defaults(func=run_clustering)
all_tool_parsers[name.lower()] = clustering_parser


name, hlp = "ScoreBed", "Score .bed-file with signal from .bigwig-file(s)"
parser.description += " {0}\t\t{1}\n".format(name, hlp)
scorebed_parser = subparsers.add_parser(name, usage=SUPPRESS)
scorebed_parser = add_scorebed_arguments(scorebed_parser)
scorebed_parser.set_defaults(func=run_scorebed)
all_tool_parsers[name.lower()] = scorebed_parser



#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~#
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Tools for plotting ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~#
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~#

parser.description += "\nVisualization tools:\n"

name, hlp = "PlotAggregate", "Aggregate of .bigwig-signal across TF binding sites"
parser.description += " {0}\t{1}\n".format(name, hlp)
aggregate_parser = subparsers.add_parser(name, usage=SUPPRESS)
aggregate_parser = add_aggregate_arguments(aggregate_parser)
aggregate_parser.set_defaults(func=run_aggregate)
all_tool_parsers[name.lower()] = aggregate_parser

name, hlp = "PlotHeatmap", "Heatmap of .bigwig-signal across TF binding sites"
parser.description += " {0}\t\t{1}\n".format(name, hlp)
heatmap_parser = subparsers.add_parser(name, usage=SUPPRESS)
heatmap_parser = add_heatmap_arguments(heatmap_parser)
heatmap_parser.set_defaults(func=run_heatmap)
all_tool_parsers[name.lower()] = heatmap_parser

name, hlp = "PlotChanges", "Plot changes in TF binding across multiple conditions (from BINDetect output)"
parser.description += " {0}\t\t{1}\n".format(name, hlp)
changeplot_parser = subparsers.add_parser(name, usage=SUPPRESS)
changeplot_parser = add_plotchanges_arguments(changeplot_parser)
changeplot_parser.set_defaults(func=run_plotchanges)
all_tool_parsers[name.lower()] = changeplot_parser



#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~#
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Misc tools ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~#
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~#

parser.description += "\nMiscellaneous tools:\n"

name, hlp = "MergePDF", "Merge pdf files to one"
parser.description += " {0}\t\t{1}\n".format(name, hlp)
mergepdf_parser = subparsers.add_parser(name, usage=SUPPRESS)
mergepdf_parser = add_mergepdf_arguments(mergepdf_parser)
mergepdf_parser.set_defaults(func=run_mergepdf)
all_tool_parsers[name.lower()] = mergepdf_parser

name, hlp = "MaxPos", "Get .bed-positions of highest bigwig signal within .bed-regions"
parser.description += " {0}\t\t{1}\n".format(name, hlp)
maxpos_parser = subparsers.add_parser(name, usage=SUPPRESS)
maxpos_parser = add_maxpos_arguments(maxpos_parser)
maxpos_parser.set_defaults(func=run_maxpos)
all_tool_parsers[name.lower()] = maxpos_parser

name, hlp = "SubsampleBam", "Subsample a .bam-file using samtools"
parser.description += " {0}\t\t{1}\n".format(name, hlp)
subsample_parser = subparsers.add_parser(name, usage=SUPPRESS)
subsample_parser = add_subsample_arguments(subsample_parser)
subsample_parser.set_defaults(func=run_subsampling)
all_tool_parsers[name.lower()] = subsample_parser

parser.description += "\n"

#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~#
#Add all tools to parser
all_tool_parsers = {}
for group in all_parser_info:
parser.description += group + ":\n"

info = all_parser_info[group]
for tool in info:
parser.description += " {0}{1}{2}\n".format(tool, info[tool].get("space", "\t\t"), info[tool]["help"])
subparser = subparsers.add_parser(tool, usage=SUPPRESS)
subparser = info[tool]["add_arguments"](subparser)
subparser.set_defaults(func=info[tool]["function"])
all_tool_parsers[tool.lower()] = subparser

parser.description += "\n"

parser.description += "For help on each tool, please run: TOBIAS <TOOLNAME> --help\n"

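The refactor above replaces the repetitive per-tool parser setup with a single all_parser_info dict that is looped over once. A minimal sketch of how an additional tool would be registered under this scheme (MyTool, add_mytool_arguments and run_mytool are hypothetical placeholders, not part of TOBIAS):

all_parser_info["Miscellaneous tools"]["MyTool"] = {
	"help": "One-line description shown in the TOBIAS help overview",
	"add_arguments": add_mytool_arguments,	#callable receiving and returning an argparse parser
	"function": run_mytool,			#callable receiving the parsed args
	"space": "\t"				#optional; padding used to align the help column
}
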
7 changes: 5 additions & 2 deletions tobias/footprinting/ATACorrect_functions.py
@@ -133,12 +133,15 @@ def bias_estimation(regions_list, params):
#Map reads to positions
read_per_pos = {}
for read in read_lst_strand[strand]:
read_per_pos[read.cutsite] = read_per_pos.get(read.cutsite, []) + [read]
if read.cigartuples is not None:
first_tuple = read.cigartuples[-1] if read.is_reverse else read.cigartuples[0]
if first_tuple[0] == 0 and first_tuple[1] > params.k_flank + max(np.abs(params.read_shift)):
read_per_pos[read.cutsite] = read_per_pos.get(read.cutsite, []) + [read]

for cutsite in read_per_pos:
if cutsite > region.start and cutsite < region.end: #only reads within borders
read = read_per_pos[cutsite][0] #use first read in list to establish kmer
no_cut = len(read_per_pos[cutsite])
no_cut = min(len(read_per_pos[cutsite]), 10) #put cap on number of cuts to limit influence of outliers

read.get_kmer(sequence_obj, k_flank)

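The filter added above skips reads without CIGAR information and requires the terminal alignment operation to be a match long enough to cover the k-mer flank; the per-position cut count is also capped at 10, so single high-coverage positions cannot dominate the bias estimate. A standalone sketch of the new read filter, assuming pysam-style reads and illustrative parameter values:

import numpy as np

k_flank = 2		#illustrative; TOBIAS takes this from params.k_flank
read_shift = (4, -5)	#illustrative; TOBIAS takes this from params.read_shift

def usable_for_bias(read):
	#Reject reads without CIGAR information (e.g. unmapped reads)
	if read.cigartuples is None:
		return False
	#Inspect the CIGAR operation at the cut end of the read
	end_op = read.cigartuples[-1] if read.is_reverse else read.cigartuples[0]
	#Operation 0 is an alignment match; it must span the flank plus the maximal shift
	return end_op[0] == 0 and end_op[1] > k_flank + max(np.abs(read_shift))
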
5 changes: 2 additions & 3 deletions tobias/footprinting/BINDetect.py
@@ -435,9 +435,8 @@ def run_bindetect(args):
leftside_pdf = scipy.stats.lognorm.pdf(leftside_x, *log_params)

#Flip over
mirrored_x = np.concatenate([leftside_x, np.max(leftside_x) + leftside_x])
mirrored_pdf = np.concatenate([leftside_pdf, leftside_pdf[::-1]])

mirrored_x = np.concatenate([leftside_x, np.max(leftside_x) + leftside_x]).flatten()
mirrored_pdf = np.concatenate([leftside_pdf, leftside_pdf[::-1]]).flatten()
popt, cov = curve_fit(lambda x, std, sc: sc * scipy.stats.norm.pdf(x, mode, std), mirrored_x, mirrored_pdf)
norm_params = (mode, popt[0])
logger.debug("Theoretical normal parameters: {0}".format(norm_params))
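The added .flatten() calls ensure that scipy.optimize.curve_fit receives the 1-D arrays it expects. For context, a self-contained sketch of the mirroring trick used here, with illustrative lognormal parameters: the left half of the skewed score distribution (up to its mode) is mirrored to build a symmetric pseudo-distribution, and a normal with fixed mean at the mode is fitted to it.

import numpy as np
import scipy.stats
from scipy.optimize import curve_fit

mode = 1.0					#illustrative mode of the score distribution
log_params = (0.5, 0.0, 1.0)			#illustrative lognorm (shape, loc, scale)
leftside_x = np.linspace(0.01, mode, 100)	#x-values up to the mode
leftside_pdf = scipy.stats.lognorm.pdf(leftside_x, *log_params)

#Mirror the left half around the mode to create a symmetric pseudo-distribution
mirrored_x = np.concatenate([leftside_x, np.max(leftside_x) + leftside_x]).flatten()
mirrored_pdf = np.concatenate([leftside_pdf, leftside_pdf[::-1]]).flatten()

#Fit the standard deviation and scale of a normal centered on the mode
popt, cov = curve_fit(lambda x, std, sc: sc * scipy.stats.norm.pdf(x, mode, std), mirrored_x, mirrored_pdf)
norm_params = (mode, popt[0])
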
3 changes: 2 additions & 1 deletion tobias/footprinting/BINDetect_functions.py
@@ -40,7 +40,8 @@
from tobias.utils.utilities import *
from tobias.utils.motifs import *
from tobias.utils.signals import *
from tobias.plotting.plot_bindetect import *

import warnings

#------------------------------------------------------------------------------------------------------#
#------------------------------------------------------------------------------------------------------#
92 changes: 92 additions & 0 deletions tobias/misc/log2table.py
@@ -0,0 +1,92 @@
#!/usr/bin/env python

"""
Log2Table: Creates a table from PlotAggregate logfiles
@author: Mette Bentsen
@contact: mette.bentsen (at) mpi-bn.mpg.de
@license: MIT
"""


import os
import sys
import argparse
import re
import pandas as pd

#Utils from TOBIAS
from tobias.utils.utilities import *
from tobias.utils.logger import *



def add_log2table_arguments(parser):

parser.formatter_class = lambda prog: argparse.RawDescriptionHelpFormatter(prog, max_help_position=40, width=90)
description = ""
parser.description = format_help_description("Log2Table", description)

parser._action_groups.pop() #pop -h

#Required arguments
required = parser.add_argument_group('Required arguments')
required.add_argument('--logfiles', nargs="*", metavar="", help="Logfiles from PlotAggregate")
required.add_argument('--output', metavar="", help="Output directory for tables (default: current dir)", default=".")
required.add_argument('--prefix', metavar="", help="Prefix of output files", default="aggregate")

return(parser)

def run_log2table(args):
logger = TobiasLogger("Log2Table")
logger.begin()

#Test input
check_required(args, ["logfiles"])

output_fpd = os.path.join(args.output, args.prefix + "_FPD.txt")
output_corr = os.path.join(args.output, args.prefix + "_CORRELATION.txt")

FPD_data = []
CORR_data = []

#Read all logfiles
for log_f in args.logfiles:
logger.info("Reading: {0}".format(log_f))
with open(log_f) as f:
for line in f:
#Match FPD lines
#...... FPD (PWM_uncorrected,MA0073.1_all): 20 0.620 0.602 -0.018
m = re.match(r".*FPD\s\((.+),(.+)\): (.+) (.+) (.+) (.+)", line.rstrip())
if m:

elements = m.group(1), m.group(2), m.group(3), m.group(4), m.group(5), m.group(6)
signal, sites, width, baseline, fp, fpd = elements

columns = [signal, sites, width, baseline, fp, fpd]
FPD_data.append(columns)

#Match correlation lines
#..... CORRELATION (PWM_uncorrected,MA0002.2_all) VS (PWM_corrected,MA0002.2_all): -0.17252
m = re.match(r".*CORRELATION \((.+),(.+)\) VS \((.+),(.+)\): (.+)", line.rstrip())
if m:
elements = m.group(1), m.group(2), m.group(3), m.group(4), m.group(5)
signal1, sites1, signal2, sites2, pearsonr = elements

columns = [signal1, sites1, signal2, sites2, pearsonr]
CORR_data.append(columns)

#Add reverse comparison as well
if pearsonr != "PEARSONR":
columns = [signal2, sites2, signal1, sites1, pearsonr]
CORR_data.append(columns)

#All lines from all files read, write out tables
df_fpd = pd.DataFrame(FPD_data)
df_fpd.drop_duplicates(keep="first", inplace=True)
df_fpd.to_csv(output_fpd, sep="\t", index=False, header=False)

df_corr = pd.DataFrame(CORR_data)
df_corr.drop_duplicates(keep="first", inplace=True)
df_corr.to_csv(output_corr, sep="\t", index=False, header=False)
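
Assuming the logfiles come from PlotAggregate runs that wrote FPD and CORRELATION lines (as in the examples in the comments above), a typical invocation of the new tool would be:

TOBIAS Log2Table --logfiles aggregate_*.log --output tables --prefix aggregate

This writes two tab-delimited tables: tables/aggregate_FPD.txt with one row per aggregate (signal, sites, width, baseline, footprint, FPD) and tables/aggregate_CORRELATION.txt with one row per pairwise comparison, each comparison also added in the reverse direction.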
5 changes: 4 additions & 1 deletion tobias/motifs/tfbscan.py
@@ -142,8 +142,11 @@ def run_tfbscan(args):
logger = TobiasLogger("TFBScan", args.verbosity)
logger.begin()
parser = add_tfbscan_arguments(argparse.ArgumentParser())

logger.arguments_overview(parser, args)
logger.output_files([args.outfile])

if args.outfile != None:
logger.output_files([args.outfile])


######## Read sequences from file and estimate background gc ########
14 changes: 9 additions & 5 deletions tobias/plotting/plot_aggregate.py
@@ -62,6 +62,7 @@ def add_aggregate_arguments(parser):
PLOT.add_argument('--negate', action='store_true', help="Negate overlap with regions")
PLOT.add_argument('--log_transform', help="", action="store_true")
PLOT.add_argument('--plot_boundaries', help="Plot TFBS boundaries", action='store_true')
#PLOT.add_argument('--outliers', help="")

RUN = parser.add_argument_group("Run arguments")
RUN = add_logger_args(RUN)
@@ -231,10 +232,13 @@ def run_aggregate(args):

signalmat = np.array([signal_dict[signal_name][reg.tup()] for reg in regions_dict[region_name]])

#todo: Option to remove rows with lowest/highest signal (--percentiles?)
#signal_min, signal_max = np.percentile(signalmat, [1,99])
#signalmat[signalmat < signal_min] = signal_min
#signalmat[signalmat > signal_max] = signal_max
#Exclude outliers from each column
lower_limit, upper_limit = -np.inf, np.inf
#lower_limit, upper_limit = np.percentile(signalmat[signalmat != 0], [1,99])

logger.debug("Lower limits: {0}".format(lower_limit))
logger.debug("Upper limits: {0}".format(upper_limit))
signalmat[np.logical_or(signalmat < lower_limit, signalmat > upper_limit)] = np.nan

if args.log_transform:
signalmat_abs = np.abs(signalmat)
@@ -259,7 +263,7 @@

#Baseline level
baseline_indices = list(range(0,args.flank-fp_flank)) + list(range(args.flank+fp_flank, len(agg)))
baseline = np.median(agg[baseline_indices])
baseline = np.mean(agg[baseline_indices])

#Footprint level
middle_indices = list(range(args.flank-fp_flank, args.flank+fp_flank))
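Note that the outlier exclusion added above is currently a no-op, since the limits are set to ±inf and the percentile line is commented out. A sketch of what enabling it would do, with illustrative data (np.nanmean stands in for the downstream aggregation, which is outside this hunk):

import numpy as np

signalmat = np.random.lognormal(size=(100, 60))	#rows: TF binding sites, columns: positions
#Derive limits from the 1st/99th percentiles of the non-zero signal
lower_limit, upper_limit = np.percentile(signalmat[signalmat != 0], [1, 99])
#Mask values outside the limits so they are ignored downstream
signalmat[np.logical_or(signalmat < lower_limit, signalmat > upper_limit)] = np.nan
agg = np.nanmean(signalmat, axis=0)
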
1 change: 1 addition & 0 deletions tobias/utils/ngs.pyx
@@ -51,6 +51,7 @@ class OneRead:
self.is_reverse = read.is_reverse
self.query_length = read.query_length
self.flag = read.flag
self.cigartuples = read.cigartuples

return(self)
