Skip to content
This repository has been archived by the owner. It is now read-only.

Commit

Permalink
Merge remote-tracking branch 'origin/master' into network
Browse files Browse the repository at this point in the history
  • Loading branch information
msbentsen committed Jul 16, 2019
2 parents 6a0a33f + 0c8a9dc commit 0222aa0
Show file tree
Hide file tree
Showing 18 changed files with 544 additions and 256 deletions.
34 changes: 33 additions & 1 deletion CHANGES
Original file line number Diff line number Diff line change
@@ -1,4 +1,36 @@
## 0.4.0 (2019-04-26)
## 0.6.3 (2019-07-16)
- Increased size of texts in BINDetect volcano plot and moved label into plot

## 0.6.2 (2019-06-19)
- Added --skip-excel option to skip the excel overview per factor in BINDetect (generating it can slow the run considerably for large outputs)
- Internal changes to BINDetect for better debug overview
- Fixed normalization for plotAggregate

## 0.6.1 (2019-06-05)
- Fixed motif cluster names to prefix C_
- Added --filter function to format_motifs to filter output motifs using names/ids

## 0.6.0 (2019-05-29)
- Added option to flip axes in PlotAggregate via "--signal-on-x"
- Changed all command-line arguments containing "_" to "-" (e.g. --regions_in to --regions-in) (but retaining both options internally)

## 0.5.3 (2019-05-28)
- Improved error messaging from file writers in bindetect

## 0.5.2 (2019-05-16)
- Bugfix for reading meme format files in MotifList().from_file()

## 0.5.1 (2019-05-15)
- Internal changes to OneMotif and MotifList classes
- Bindetect now takes directories/file(s) as input for --motifs

## 0.5.0 (2019-05-02)
- Added sum/mean/none scoring to ScoreBigwig as well as the option to get --absolute of input signal

## 0.4.1 (2019-04-29)
- Fixed weird "can't pickle SwigPyObject objects"-error in bindetect

## 0.4.0 (2019-04-29)
- Added --add_region_columns to TFBScan
- Renamed FootprintScores to ScoreBigwig
- Added normalization of input score distributions in BINDetect
Expand Down
6 changes: 6 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@ TOBIAS - Transcription factor Occupancy prediction By Investigation of ATAC-seq
=======================================

[![PyPI Version](https://img.shields.io/pypi/v/tobias.svg?style=plastic)](https://pypi.org/project/tobias/)
[![install with bioconda](https://img.shields.io/badge/install%20with-bioconda-brightgreen.svg?style=flat)](http://bioconda.github.io/recipes/tobias/README.html)

Introduction
------------
Expand Down Expand Up @@ -113,6 +114,11 @@ Split a motif file containing several motifs:
$ TOBIAS FormatMotifs --input test_data/example_motifs.txt --format pfm --task split --output split_motifs
```

Filter a larger motif file using TF names:
```
$ echo 'MAFK CTCF JUNB' > TF_names.txt
$ TOBIAS FormatMotifs --input test_data/example_motifs.txt --output filtered_motifs.txt --filter TF_names.txt
```

Snakemake pipeline
------------
Expand Down
4 changes: 3 additions & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,11 +52,12 @@ def readme():
version=find_version(os.path.join(setupdir, "tobias", "__init__.py")), #get version from __init__.py
description='Transcription factor Occupancy prediction By Investigation of ATAC-seq Signal',
long_description=readme(),
long_description_content_type='text/markdown',
url='https://github.molgen.mpg.de/loosolab/TOBIAS',
author='Mette Bentsen',
author_email='mette.bentsen@mpi-bn.mpg.de',
license='MIT',
packages=find_packages(), #"tobias"), #['tobias', 'tobias.footprinting', 'tobias.plotting', 'tobias.motifs', 'tobias.misc', 'tobias.utils'],
packages=find_packages(),
entry_points={
'console_scripts': ['TOBIAS=tobias.TOBIAS:main']
},
Expand All @@ -77,6 +78,7 @@ def readme():
'pyBigWig',
'MOODS-python',
],
scripts=["tobias/utils/filter_important_factors.py"],
classifiers=[
'License :: OSI Approved :: MIT License',
'Intended Audience :: Science/Research',
Expand Down
7 changes: 6 additions & 1 deletion tobias/TOBIAS.py
Original file line number Diff line number Diff line change
Expand Up @@ -119,10 +119,15 @@ def main():

#Add version to subparser
subparser.add_argument("--version", action='version', version=TOBIAS_VERSION)
subparser = add_underscore_options(subparser)

#Add parser for old tool names
if "replaces" in info[tool]:
all_tool_parsers[info[tool]["replaces"].lower()] = subparser
replace_tool = info[tool]["replaces"]
subparser = subparsers.add_parser(replace_tool, usage=SUPPRESS)
subparser = info[tool]["add_arguments"](subparser)
subparser.set_defaults(func=info[tool]["function"])
all_tool_parsers[replace_tool.lower()] = subparser

parser.description += "\n"

Expand Down
2 changes: 1 addition & 1 deletion tobias/__init__.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = "0.4.0"
__version__ = "0.6.3"
12 changes: 7 additions & 5 deletions tobias/footprinting/atacorrect.py
Original file line number Diff line number Diff line change
Expand Up @@ -71,13 +71,13 @@ def add_atacorrect_arguments(parser):

#Optional arguments
optargs = parser.add_argument_group('Optional arguments')
optargs.add_argument('--regions_in', metavar="<bed>", help="Input regions for estimating bias (default: regions not in peaks.bed)")
optargs.add_argument('--regions_out', metavar="<bed>", help="Output regions (default: peaks.bed)")
optargs.add_argument('--regions-in', metavar="<bed>", help="Input regions for estimating bias (default: regions not in peaks.bed)")
optargs.add_argument('--regions-out', metavar="<bed>", help="Output regions (default: peaks.bed)")
optargs.add_argument('--blacklist', metavar="<bed>", help="Blacklisted regions in .bed-format (default: None)") #file containing blacklisted regions to be excluded from analysis")
optargs.add_argument('--extend', metavar="<int>", type=int, help="Extend output regions with basepairs upstream/downstream (default: 100)", default=100)
optargs.add_argument('--split_strands', help="Write out tracks per strand", action="store_true")
optargs.add_argument('--norm_off', help="Switches off normalization based on number of reads", action='store_true')
optargs.add_argument('--track_off', metavar="<track>", help="Switch off writing of individual .bigwig-tracks (uncorrected/bias/expected/corrected)", nargs="*", choices=["uncorrected", "bias", "expected", "corrected"], default=[])
optargs.add_argument('--split-strands', help="Write out tracks per strand", action="store_true")
optargs.add_argument('--norm-off', help="Switches off normalization based on number of reads", action='store_true')
optargs.add_argument('--track-off', metavar="<track>", help="Switch off writing of individual .bigwig-tracks (uncorrected/bias/expected/corrected)", nargs="*", choices=["uncorrected", "bias", "expected", "corrected"], default=[])

optargs = parser.add_argument_group('Advanced ATACorrect arguments (no need to touch)')
optargs.add_argument('--k_flank', metavar="<int>", help="Flank +/- of cutsite to estimate bias from (default: 12)", type=int, default=12)
Expand All @@ -94,6 +94,8 @@ def add_atacorrect_arguments(parser):

runargs = add_logger_args(runargs)



return(parser)

#--------------------------------------------------------------------------------------------------------#
Expand Down
41 changes: 23 additions & 18 deletions tobias/footprinting/bindetect.py
Original file line number Diff line number Diff line change
Expand Up @@ -69,17 +69,18 @@ def add_bindetect_arguments(parser):
required = parser.add_argument_group('Required arguments')
required.add_argument('--signals', metavar="<bigwig>", help="Signal per condition (.bigwig format)", nargs="*")
required.add_argument('--peaks', metavar="<bed>", help="Peaks.bed containing open chromatin regions across all conditions")
required.add_argument('--motifs', metavar="<motifs>", help="Motifs in pfm/jaspar format")
required.add_argument('--motifs', metavar="<motifs>", help="Motif file(s) in pfm/jaspar format", nargs="*")
required.add_argument('--genome', metavar="<fasta>", help="Genome .fasta file")

optargs = parser.add_argument_group('Optional arguments')
optargs.add_argument('--cond_names', metavar="<name>", nargs="*", help="Names of conditions fitting to --signals (default: prefix of --signals)")
optargs.add_argument('--peak_header', metavar="<file>", help="File containing the header of --peaks separated by whitespace or newlines (default: peak columns are named \"_additional_<count>\")")
optargs.add_argument('--cond-names', metavar="<name>", nargs="*", help="Names of conditions fitting to --signals (default: prefix of --signals)")
optargs.add_argument('--peak-header', metavar="<file>", help="File containing the header of --peaks separated by whitespace or newlines (default: peak columns are named \"_additional_<count>\")")
#optargs.add_argument('--naming', metavar="<type>", help="Naming convention for TFs ('id', 'name', 'name_id', 'id_name') (default: 'name_id')", choices=["id", "name", "name_id", "id_name"], default="name_id")
optargs.add_argument('--motif_pvalue', metavar="<float>", type=lambda x: restricted_float(x, 0, 1), help="Set p-value threshold for motif scanning (default: 1e-4)", default=0.0001)
optargs.add_argument('--bound_pvalue', metavar="<float>", type=lambda x: restricted_float(x, 0, 1), help="Set p-value threshold for bound/unbound split (default: 0.001)", default=0.001)
optargs.add_argument('--motif-pvalue', metavar="<float>", type=lambda x: restricted_float(x, 0, 1), help="Set p-value threshold for motif scanning (default: 1e-4)", default=0.0001)
optargs.add_argument('--bound-pvalue', metavar="<float>", type=lambda x: restricted_float(x, 0, 1), help="Set p-value threshold for bound/unbound split (default: 0.001)", default=0.001)
optargs.add_argument('--pseudo', type=float, metavar="<float>", help="Pseudocount for calculating log2fcs (default: estimated from data)", default=None)
optargs.add_argument('--time_series', action='store_true', help="Will only compare signals1<->signals2<->signals3 (...) in order of input, and skip all-against-all comparison.")
optargs.add_argument('--time-series', action='store_true', help="Will only compare signals1<->signals2<->signals3 (...) in order of input, and skip all-against-all comparison.")
optargs.add_argument('--skip-excel', action='store_true', help="Skip creation of excel files - for large datasets, this will speed up BINDetect considerably")

runargs = parser.add_argument_group("Run arguments")
runargs.add_argument('--outdir', metavar="<directory>", help="Output directory to place TFBS/plots in (default: bindetect_output)", default="bindetect_output")
Expand Down Expand Up @@ -190,7 +191,6 @@ def run_bindetect(args):
figure_pdf.savefig(bbox_inches='tight')
plt.close()


################# Peaks / GC in peaks ################
#Read peak and peak_header
peaks = RegionList().from_bed(args.peaks)
Expand All @@ -205,6 +205,9 @@ def run_bindetect(args):
peak_chroms = peaks.get_chroms()
peak_columns = len(peaks[0]) #number of columns

if args.debug:
peaks = RegionList(peaks[:1000])

#Header
if args.peak_header != None:
content = open(args.peak_header, "r").read()
Expand Down Expand Up @@ -237,16 +240,15 @@ def run_bindetect(args):
bg = np.array([(1-args.gc)/2.0, args.gc/2.0, args.gc/2.0, (1-args.gc)/2.0])
logger.info("- GC content estimated at {0:.2f}%".format(gc_content*100))


################ Get motifs ################
logger.info("Reading motifs from file")

motif_content = open(args.motifs).read()
converted_content = convert_motif(motif_content, "pfm")
motif_list = pfm_to_motifs(converted_content) #List of OneMotif objects
motif_list = MotifList()
args.motifs = expand_dirs(args.motifs)
for f in args.motifs:
motif_list += MotifList().from_file(f) #List of OneMotif objects
no_pfms = len(motif_list)
logger.info("- Read {0} motifs".format(no_pfms))

logger.info("- Found {0} motifs in file".format(no_pfms))
logger.debug("Getting motifs ready")
motif_list.bg = bg

Expand Down Expand Up @@ -328,7 +330,7 @@ def run_bindetect(args):
results = [task.get() for task in task_list]

logger.info("Done scanning for TFBS across regions!")
logger.stop_logger_queue() #stop the listening process (wait until all was written)
#logger.stop_logger_queue() #stop the listening process (wait until all was written)

#--------------------------------------#
logger.info("Waiting for bedfiles to write")
Expand Down Expand Up @@ -519,15 +521,17 @@ def run_bindetect(args):
cols = len(info_columns)
rows = len(motif_names)
info_table = pd.DataFrame(np.zeros((rows, cols)), columns=info_columns, index=motif_names)

#Starting calculations
results = []
if args.cores == 1:
for name in motif_names:
logger.info("- {0}".format(name))
results.append(process_tfbs(name, args, log2fc_params))
else:
task_list = [pool.apply_async(process_tfbs, (name, args, log2fc_params)) for name in motif_names]
logger.debug("Sending jobs to worker pool")

task_list = [pool.apply_async(process_tfbs, (name, args, log2fc_params, )) for name in motif_names]
monitor_progress(task_list, logger) #will not exit before all jobs are done
results = [task.get() for task in task_list]

Expand All @@ -537,6 +541,8 @@ def run_bindetect(args):
pool.terminate()
pool.join()

logger.stop_logger_queue()

#-------------------------------------------------------------------------------------------------------------#
#------------------------------------------------ Cluster TFBS -----------------------------------------------#
#-------------------------------------------------------------------------------------------------------------#
Expand Down Expand Up @@ -603,7 +609,6 @@ def run_bindetect(args):
n_rows = worksheet.dim_rowmax
n_cols = worksheet.dim_colmax
worksheet.autofilter(0,0,n_rows,n_cols)

writer.save()

#Format comparisons
Expand Down Expand Up @@ -632,7 +637,7 @@ def run_bindetect(args):
base = cond1 + "_" + cond2

#Make copy of motifs and fill in with metadata
comparison_motifs = copy.deepcopy(motif_list)
comparison_motifs = motif_list #copy.deepcopy(motif_list) - swig pickle error, just overwrite motif_list
for motif in comparison_motifs:
name = motif.prefix
motif.change = float(info_table.at[name, base + "_change"])
Expand Down
70 changes: 46 additions & 24 deletions tobias/footprinting/bindetect_functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
import matplotlib
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
import matplotlib.patches as patches
from matplotlib.backends.backend_pdf import PdfPages
from matplotlib.ticker import NullFormatter
from cycler import cycler
Expand Down Expand Up @@ -279,13 +280,19 @@ def process_tfbs(TF_name, args, log2fc_params): #per tf
comparisons = args.comparisons

#Read file to list of dicts
stime = datetime.now()
header = ["TFBS_chr", "TFBS_start", "TFBS_end", "TFBS_name", "TFBS_score", "TFBS_strand"] + args.peak_header_list + ["{0}_score".format(condition) for condition in args.cond_names]
with open(filename) as f:
bedlines = [dict(zip(header, line.rstrip().split("\t"))) for line in f.readlines()]
n_rows = len(bedlines)
etime = datetime.now()
logger.debug("{0} - Reading took:\t{1}".format(TF_name, etime - stime))


############################## Local effects ###############################

stime = datetime.now()

#Sort, scale and calculate log2fc
bedlines = sorted(bedlines, key=lambda line: (line["TFBS_chr"], int(line["TFBS_start"]), int(line["TFBS_end"])))
for line in bedlines:
Expand All @@ -294,7 +301,7 @@ def process_tfbs(TF_name, args, log2fc_params): #per tf
for condition in args.cond_names:
threshold = args.thresholds[condition]
line[condition + "_score"] = float(line[condition + "_score"])
line[condition + "_score"] = np.round(args.norm_objects[condition].normalize(line[condition + "_score"] ), 5)
line[condition + "_score"] = round(args.norm_objects[condition].normalize(line[condition + "_score"] ), 5)
line[condition + "_bound"] = 1 if line[condition + "_score"] > threshold else 0

#Comparison specific
Expand All @@ -315,7 +322,7 @@ def process_tfbs(TF_name, args, log2fc_params): #per tf
outfile = os.path.join(bed_outdir, "{0}_{1}_{2}.bed".format(TF_name, condition, state))
chosen_bool = 1 if state == "bound" else 0
bedlines_subset = [bedline for bedline in bedlines if bedline[condition + "_bound"] == chosen_bool]
bedlines_subset = sorted(bedlines_subset, key= lambda line: line[condition + "_score"], reverse=True)
#bedlines_subset = sorted(bedlines_subset, key= lambda line: line[condition + "_score"], reverse=True)
dict_to_tab(bedlines_subset, outfile, chosen_columns)

##### Write overview with scores, bound and log2fcs ####
Expand All @@ -325,23 +332,32 @@ def process_tfbs(TF_name, args, log2fc_params): #per tf

#Write xlsx overview
bed_table = pd.DataFrame(bedlines)
try:
overview_excel = os.path.join(args.outdir, TF_name, TF_name + "_overview.xlsx")
writer = pd.ExcelWriter(overview_excel, engine='xlsxwriter')
bed_table.to_excel(writer, index=False, columns=overview_columns)

worksheet = writer.sheets['Sheet1']
no_rows, no_cols = bed_table.shape
worksheet.autofilter(0,0,no_rows, no_cols-1)
writer.save()

except:
print("Error writing excelfile for TF {0}".format(TF_name))
sys.exit()

stime_excel = datetime.now()
if args.skip_excel == False:
try:
overview_excel = os.path.join(args.outdir, TF_name, TF_name + "_overview.xlsx")
writer = pd.ExcelWriter(overview_excel, engine='xlsxwriter') #, options=dict(constant_memory=True))
bed_table.to_excel(writer, index=False, columns=overview_columns)

#autfilter not possible with constant_memory
worksheet = writer.sheets['Sheet1']
no_rows, no_cols = bed_table.shape
worksheet.autofilter(0,0,no_rows, no_cols)
writer.save()

except Exception as e:
print("Error writing excelfile for TF {0}".format(TF_name))
print(e)
sys.exit()

etime_excel = datetime.now()
etime = datetime.now()
logger.debug("{0} - Local effects took:\t{1} (excel: {2})".format(TF_name, etime - stime, etime_excel - stime_excel))

############################## Global effects ##############################

stime = datetime.now()

#Get info table ready
info_columns = ["total_tfbs"]
info_columns.extend(["{0}_{1}".format(cond, metric) for (cond, metric) in itertools.product(args.cond_names, ["mean_score", "bound"])])
Expand Down Expand Up @@ -392,6 +408,8 @@ def process_tfbs(TF_name, args, log2fc_params): #per tf
info_table.at[TF_name, base + "_change"] = 0
info_table.at[TF_name, base + "_pvalue"] = 1

#stime_plot = datetime.now()

#### Plot comparison ###
fig, ax = plt.subplots(1,1)
ax.hist(observed_log2fcs, bins='auto', label="Observed log2fcs", density=True)
Expand Down Expand Up @@ -423,8 +441,14 @@ def process_tfbs(TF_name, args, log2fc_params): #per tf
log2fc_pdf.savefig(fig, bbox_inches='tight')
plt.close(fig)

#etime_plot = datetime.now()
#logger.debug("{0} - Plotting took:\t{1}".format(TF_name, etime_plot - stime_plot))

log2fc_pdf.close()

etime = datetime.now()
logger.debug("{0} - Global effects took:\t{1}".format(TF_name, etime - stime))

#################### Remove temporary file ######################
try:
os.remove(filename)
Expand Down Expand Up @@ -592,9 +616,13 @@ def plot_bindetect(motifs, cluster_obj, conditions, args):
ax1.scatter(coord[0], coord[1], color=diff_scores[TF]["color"], s=4.5)

if diff_scores[TF]["show"] == True:
txts.append(ax1.text(coord[0], coord[1], diff_scores[TF]["volcano_label"], fontsize=7))
txts.append(ax1.text(coord[0], coord[1], diff_scores[TF]["volcano_label"], fontsize=8))

adjust_text(txts, ax=ax1, text_from_points=True, arrowprops=dict(arrowstyle='-', color='black', lw=0.5)) #, expand_text=(0.1,1.2), expand_objects=(0.1,0.1))
#Plot custom legend for colors
legend_elements = [Line2D([0],[0], marker='o', color='w', markerfacecolor="red", label="Higher scores in {0}".format(conditions[0])),
Line2D([0],[0], marker='o', color='w', markerfacecolor="blue", label="Higher scores in {0}".format(conditions[1]))]
l = ax1.legend(handles=legend_elements, loc="lower left", framealpha=0.5)
adjust_text(txts, ax=ax1, add_objects=[l], text_from_points=True, arrowprops=dict(arrowstyle='-', color='black', lw=0.5)) #, expand_text=(0.1,1.2), expand_objects=(0.1,0.1))

"""
#Add arrows to other cluster members
Expand All @@ -614,12 +642,6 @@ def plot_bindetect(motifs, cluster_obj, conditions, args):
ax1.arrow(point_x, point_y, len_x, len_y, linestyle="-", color="black", lw=0.5)
"""
#print(txts)

#Plot custom legend for colors
legend_elements = [Line2D([0],[0], marker='o', color='w', markerfacecolor="red", label="Higher scores in {0}".format(conditions[0])),
Line2D([0],[0], marker='o', color='w', markerfacecolor="blue", label="Higher scores in {0}".format(conditions[1]))]
ax1.legend(handles=legend_elements, bbox_to_anchor=(1.05, 1), loc='upper left')

return(fig)

Expand Down
Loading

0 comments on commit 0222aa0

Please sign in to comment.