Skip to content
This repository has been archived by the owner. It is now read-only.

Commit

Permalink
Added --outfile utility to TFBScan, meaning changes to regions class …
Browse files Browse the repository at this point in the history
…and bedfile writer
  • Loading branch information
msbentsen committed Jan 11, 2019
1 parent 0efd686 commit 26ff437
Show file tree
Hide file tree
Showing 7 changed files with 134 additions and 108 deletions.
3 changes: 1 addition & 2 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ def readme():
Extension("tobias.utils.signals", ["tobias/utils/signals.pyx"], include_dirs=[np.get_include()])]

setup(name='tobias',
version='1.0.0',
version='0.1',
description='Transcription factor Occupancy prediction By Investigation of ATAC-seq Signal',
long_description=readme(),
url='https://github.molgen.mpg.de/loosolab/TOBIAS',
Expand All @@ -32,7 +32,6 @@ def readme():
'scikit-learn',
'pandas',
'pypdf2',
'seaborn',
'xlsxwriter',
'adjustText',
],
Expand Down
10 changes: 9 additions & 1 deletion tobias/TOBIAS.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,8 @@
from tobias.utils.merge_pdfs import *
from tobias.utils.score_bed import *

TOBIAS_VERSION = "0.1"

def main():
parser = argparse.ArgumentParser("TOBIAS", usage=SUPPRESS)
parser._action_groups.pop()
Expand Down Expand Up @@ -176,7 +178,13 @@ def main():

#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~#

parser.description += "For help on each tool, please run: TOBIAS <TOOLNAME> --help"
parser.description += "For help on each tool, please run: TOBIAS <TOOLNAME> --help\n"

#Add version number to upper TOBIAS parser and all subparsers
parser.description += "For version number: TOBIAS --version"
parser.add_argument("--version", action='version', version=TOBIAS_VERSION)
for name in all_tool_parsers:
all_tool_parsers[name].add_argument("--version", action='version', version=TOBIAS_VERSION)

#If no args, print help for top-level TOBIAS
if len(sys.argv[1:]) == 0:
Expand Down
16 changes: 9 additions & 7 deletions tobias/footprinting/BINDetect.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,19 +19,20 @@
import logging
import logging.handlers
import itertools
import pandas as pd

#Machine learning
import sklearn
from sklearn import mixture
from sklearn.neighbors import KernelDensity
from sklearn.model_selection import GridSearchCV
#from sklearn.neighbors import KernelDensity
#from sklearn.model_selection import GridSearchCV
import scipy
from scipy.optimize import curve_fit

#Plotting
import matplotlib.pyplot as plt
from matplotlib.backends.backend_pdf import PdfPages
from matplotlib.ticker import NullFormatter
import pandas as pd
import scipy
from scipy.optimize import curve_fit
from sklearn import preprocessing

#Bio-specific packages
import pyBigWig
Expand Down Expand Up @@ -344,7 +345,8 @@ def run_bindetect(args):

q = manager.Queue()
qs_list.append(q)
writer_pool.apply_async(file_writer, args=(q, TF_names_sub, files, args)) #, callback = lambda x: finished.append(x) print("Writing time: {0}".format(x)))

writer_pool.apply_async(file_writer, args=(q, dict(zip(TF_names_sub,files)), args)) #, callback = lambda x: finished.append(x) print("Writing time: {0}".format(x)))
for TF in TF_names_sub:
qs[TF] = q
writer_pool.close() #no more jobs applied to writer_pool
Expand Down
3 changes: 3 additions & 0 deletions tobias/footprinting/BINDetect_functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@
import MOODS.tools
import MOODS.parsers

#Internal functions and classes
from tobias.utils.regions import *
from tobias.utils.sequences import *
from tobias.utils.utilities import *
Expand Down Expand Up @@ -58,8 +59,10 @@ def find_nearest_idx(array, value):
idx = np.abs(array - value).argmin()
return(idx)

"""
def is_nan(x):
return (x is np.nan or x != x)
"""

### Logger runs in main process
def main_logger_process(queue, logger):
Expand Down
137 changes: 74 additions & 63 deletions tobias/motifs/tfbscan.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,27 +15,19 @@
import itertools

import pysam

from tobias.utils.utilities import *
from tobias.utils.regions import *
from tobias.utils.sequences import *
from tobias.utils.motifs import *


def restricted_float(f, f_min, f_max):
f = float(f)
if f < f_min or f > f_max:
raise argparse.ArgumentTypeError("{0} not in range [0.0, 1.0]".format(f))
return f


#----------------------------------------------------------------------------------------------------------#
def add_tfbscan_arguments(parser):

parser.formatter_class = lambda prog: argparse.RawDescriptionHelpFormatter(prog, max_help_position=35, width=90)
description = "Find positions of Transcription Factor Binding Sites (TFBS) in FASTA sequences by scanning with motifs.\n\n"
description += "Usage:\nTOBIAS TFBScan --motifs <motifs.txt> --fasta <genome.fa> \n\n"
description += "Output files:\n- <outdir>/<TF1>.bed\n- <outdir>/<TF2>.bed\n- (...)\n\n"
description += "By setting --outdir, the output files are:\n- <outdir>/<TF1>.bed\n- <outdir>/<TF2>.bed\n- (...)\n\n"
description += "By setting --outfile, all TFBS are written to one file (with motif specified in the 4th column of the .bed)."
parser.description = format_help_description("TFBScan", description)

parser._action_groups.pop() #pop -h
Expand All @@ -46,24 +38,23 @@ def add_tfbscan_arguments(parser):

#all other arguments are optional
optional_arguments = parser.add_argument_group('Optional arguments')
optional_arguments.add_argument('-o', '--outdir', metavar="", help='Output directory (default: ./motifscan_output/)', default="motifscan_output")
optional_arguments.add_argument('-r', '--regions', metavar="", help='Subset scanning to regions of interest')
optional_arguments.add_argument('--outdir', metavar="", help='Output directory for TFBS sites in one file per motif (default: ./tfbscan_output/). NOTE: Select either --outdir or --outfile.', default=None)
optional_arguments.add_argument('--outfile', metavar="", help='Output file for TFBS sites joined in one bed-file (default: not set). NOTE: Select either --outdir or --outfile.', default=None)

optional_arguments.add_argument('--naming', metavar="", help="Naming convention for bed-ids and output files ('id', 'name', 'name_id', 'id_name') (default: 'name_id')", choices=["id", "name", "name_id", "id_name"], default="name_id")
optional_arguments.add_argument('--gc', metavar="", type=lambda x: restricted_float(x,0,1), help='Set the gc content for background regions (default: will be estimated from fasta)')
optional_arguments.add_argument('--pvalue', metavar="", type=lambda x: restricted_float(x,0,1), help='Set p-value for motif matches (default: 0.0001)', default=0.0001)
##optional_arguments.add_argument('--tool', metavar="", choices=['FIMO', 'MOODS'], help='Choose the tool to perform motif scanning with (default: MOODS)', default='MOODS')
optional_arguments.add_argument('--keep_overlaps', action='store_true', help='Keep overlaps of same motifs (default: overlaps are resolved by keeping best-scoring site)')

optional_arguments.add_argument('--split', metavar="<int>", type=int, help="Split of multiprocessing jobs (default: 100)", default=100)
optional_arguments.add_argument('--cores', metavar="", type=int, help='Number of cores to use (default: 1)', default=1)
optional_arguments.add_argument('--log', metavar="", help="Path to logfile (default: writes to stdout)")
optional_arguments.add_argument('--debug', action="store_true", help=argparse.SUPPRESS)


return(parser)

#----------------------------------------------------------------------------------------------------------#

def motif_scanning(regions, args, motifs_obj):
""" motifs_obj is a MotifList object """

Expand All @@ -78,11 +69,25 @@ def motif_scanning(regions, args, motifs_obj):

seq = fasta_obj.fetch(region.chrom, region.start, region.end)
region_TFBS = motifs_obj.scan_sequence(seq, region) #Scan sequence

#Check format of region chromosome and convert sites if needed
m = re.match("(.+)\:([0-9]+)\-([0-9]+)\s+.+", region.chrom)
if m:
reg_chrom, reg_start, reg_end = m.group(1), m.group(2), m.group(3)
for TFBS in region_TFBS:
TFBS.chrom = region.chrom
TFBS.start = int(reg_start) + TFBS.start
TFBS.end = int(reg_start) + TFBS.end

#Split to single TFs
for TFBS in region_TFBS:
all_TFBS[TFBS.name].append(TFBS)

#Resolve overlaps
if args.keep_overlaps == False: #default
for name in all_TFBS:
all_TFBS[name] = all_TFBS[name].resolve_overlaps()

#Write to queue
for name in all_TFBS:
bed_content = all_TFBS[name].as_bed() #string
Expand All @@ -91,42 +96,23 @@ def motif_scanning(regions, args, motifs_obj):
return(1)


def process_TFBS(file, args):
""" Process TFBS bedfile - remove duplicates, overlaps and sort """

outfile = file.replace(".tmp", ".bed")

#Read sites to regionList
TFBS = RegionList().from_bed(file)

#Remove overlaps
if args.keep_overlaps == False:
TFBS = TFBS.resolve_overlaps() #automatically removes duplicates as well
else:
TFBS = TFBS.remove_duplicates() #

#If chrom looks like subset -> convert to true coordinates
for site in TFBS:
def process_TFBS(infile, args):
""" Process TFBS bedfile - remove duplicates and sort """

m = re.match("(.+)\:([0-9]+)\-([0-9]+)", site.chrom)
if m:
reg_chrom, reg_start, reg_end = m.group(1), m.group(2), m.group(3)
if args.outdir != None:
outfile = infile.replace(".tmp", ".bed") #single files, the names are controlled as motif names
elif args.outfile != None:
outfile = args.outfile

site.chrom = reg_chrom
site.start = int(reg_start) + site.start
site.end = int(reg_start) + site.end
site.update()
#Sort and print unique lines
os.system("sort -k1,1 -k2,2n {0} | uniq > {1}".format(infile, outfile))

#Write out to bed
TFBS.loc_sort()
TFBS.write_bed(outfile)

#Remove the tmp file
if not args.debug:
try:
os.remove(file)
os.remove(infile)
except:
print("Error removing file {0}".format(file))
print("Error removing file {0}".format(infile))

return(1)

Expand All @@ -139,10 +125,21 @@ def run_tfbscan(args):
###### Check input arguments ######
check_required(args, ["motifs", "fasta"]) #Check input arguments
check_files([args.motifs, args.fasta, args.regions]) #Check if files exist
make_directory(args.outdir) #Check and create output directory

##Test input
if args.outdir != None and args.outfile != None: #Error - both set
sys.exit("ERROR: Please choose either --outdir or --outfile")
elif ((args.outdir == None or args.outdir != None) and args.outfile == None): #Separate files
args.outdir = "tfbscan_output/" if args.outdir == None else args.outdir
make_directory(args.outdir) #Check and create output directory
elif args.outdir == None and args.outfile != None: #Joined file
check_files([args.outfile], "w")

###### Create logger and write argument overview ######
logger = create_logger(2, args.log)
if args.debug:
logger = create_logger(3, args.log)
else:
logger = create_logger(2, args.log)

logger.comment("#TOBIAS TFBScan (run started {0})\n".format(begin_time))
logger.comment("#Command line call: {0}\n".format(" ".join(sys.argv)))
Expand All @@ -151,6 +148,8 @@ def run_tfbscan(args):
logger.comment(arguments_overview(parser, args))

######## Read sequences from file and estimate background gc ########

logger.critical("Handling input files")
logger.info("Reading sequences from fasta")

fastafile = pysam.FastaFile(args.fasta)
Expand All @@ -165,21 +164,19 @@ def run_tfbscan(args):
#If subset, setup regions
if args.regions:
regions = RegionList().from_bed(args.regions)


else: #set up regions from fasta references
regions = fasta_regions
regions = regions.apply_method(OneRegion.split_region, 1000000)
regions = regions.apply_method(OneRegion.extend_reg, 50) #extend to overlap at junctions

#Subset regions to maximum length and extend to overlap at junctions
regions = regions.apply_method(OneRegion.split_region, 1000000)
regions = regions.apply_method(OneRegion.extend_reg, 50)
#Clip regions at chromosome boundaries
regions = regions.apply_method(OneRegion.check_boundary, fasta_chrom_info, "cut")

logger.info("- Total of {0} regions (after splitting)".format(len(regions)))

#Background gc
if args.gc == None:
logger.info("Estimating GC content from fasta")
logger.info("Estimating GC content from fasta (set --gc to skip this step)")
args.gc = get_gc_content(regions, args.fasta)

bg = np.array([(1-args.gc)/2.0, args.gc/2.0, args.gc/2.0, (1-args.gc)/2.0])
Expand Down Expand Up @@ -209,7 +206,6 @@ def run_tfbscan(args):

motif_names = list(set([motif.name for motif in motif_list]))


pool = mp.Pool(processes=args.cores)
outlist = pool.starmap(OneMotif.get_threshold, itertools.product(motif_list, [args.pvalue]))
motif_list = MotifList(outlist)
Expand All @@ -224,35 +220,49 @@ def run_tfbscan(args):

manager = mp.Manager()

writer_cores = max(1,int(args.cores*0.1))
worker_cores = max(1,int(args.cores*0.9))
if args.outdir != None:
writer_cores = max(1,int(args.cores*0.1))
worker_cores = max(1,args.cores - writer_cores)

elif args.outfile != None: #Write to one file
writer_cores = 1
worker_cores = max(1,args.cores - writer_cores)

#Setup pools
logger.debug("Writer cores: {0}".format(writer_cores))
logger.debug("Worker cores: {0}".format(worker_cores))
worker_pool = mp.Pool(processes=worker_cores, maxtasksperchild=1)
writer_pool = mp.Pool(processes=writer_cores)

#Setup bed-writers
#Setup bed-writers based on --outdir or --outfile
temp_files = []
qs = {}
TF_names_chunks = [motif_names[i::writer_cores] for i in range(writer_cores)]
for TF_names_sub in TF_names_chunks:
logger.debug("Creating writer queue for {0}".format(TF_names_sub))
files = [os.path.join(args.outdir, TF + ".tmp") for TF in TF_names_sub]
temp_files.extend(files)

if args.outdir != None:
files = [os.path.join(args.outdir, TF + ".tmp") for TF in TF_names_sub]
temp_files.extend(files)
elif args.outfile != None:
files = [args.outfile + ".tmp" for TF in TF_names_sub] #write to the same file for all
temp_files.append(files[0])

q = manager.Queue()
#qs_list.append(q)
writer_pool.apply_async(file_writer, args=(q, TF_names_sub, files, args)) #, callback = lambda x: finished.append(x) print("Writing time: {0}".format(x)))

TF2files = dict(zip(TF_names_sub, files))
logger.debug("TF2files dict: {0}".format(TF2files))
writer_pool.apply_async(file_writer, args=(q, TF2files, args)) #, callback = lambda x: finished.append(x) print("Writing time: {0}".format(x)))
for TF in TF_names_sub:
qs[TF] = q
writer_pool.close() #no more jobs applied to writer_pool
args.qs = qs #qs is a dict
args.qs = qs #qs is a dict

#Setup scanners pool
input_arguments = [(chunk, args, motif_list) for chunk in region_chunks]
task_list = [worker_pool.apply_async(motif_scanning, (chunk, args, motif_list, )) for chunk in region_chunks]
monitor_progress(task_list, logger)
results = [task.get() for task in task_list]
logger.info("Done!")

#Wait for files to write
for TF in qs:
Expand All @@ -261,13 +271,13 @@ def run_tfbscan(args):

#Process each file output and write out
logger.comment("")
logger.critical("Processing results for each TF")
logger.critical("Processing results from scanning")
logger.debug("Running processing for files: {0}".format(temp_files))
task_list = [worker_pool.apply_async(process_TFBS, (file, args)) for file in temp_files]
worker_pool.close()
monitor_progress(task_list, logger)
worker_pool.terminate()
results = [task.get() for task in task_list]
logger.info("Done!")

logger.debug("Joining multiprocessing pools")
worker_pool.join()
Expand All @@ -278,6 +288,7 @@ def run_tfbscan(args):
logger.critical("Finished TFBScan run (total time of {0})".format(end_time - begin_time))



#----------------------------------------------------------------------------------------------------------#
if __name__ == "__main__":

Expand Down
Loading

0 comments on commit 26ff437

Please sign in to comment.