From b71a886380438e39539d885dc99c32cf35163140 Mon Sep 17 00:00:00 2001 From: msbentsen Date: Thu, 16 May 2019 09:49:39 +0200 Subject: [PATCH] tobias 0.5.1; new options for motiflist class --- CHANGES | 4 + setup.py | 2 +- tobias/__init__.py | 2 +- tobias/footprinting/bindetect.py | 12 +-- tobias/motifs/format_motifs.py | 67 ++++++---------- tobias/motifs/tfbscan.py | 3 - tobias/utils/motifs.py | 126 ++++++++++++++++++++++++++++--- tobias/utils/utilities.py | 28 +++++-- 8 files changed, 175 insertions(+), 69 deletions(-) diff --git a/CHANGES b/CHANGES index abe0cd1..2666303 100644 --- a/CHANGES +++ b/CHANGES @@ -1,3 +1,7 @@ +## 0.5.1 (2019-05-15) +- Internal changes to OneMotif and MotifList classes +- Bindetect now takes directories/file(s) as input for --motifs + ## 0.5.0 (2019-05-02) - Added sum/mean/none scoring to ScoreBigwig as well as the option to get --absolute of input signal diff --git a/setup.py b/setup.py index b6225c0..0313771 100644 --- a/setup.py +++ b/setup.py @@ -18,7 +18,6 @@ cmdclass = {'build_ext': build_ext} except ImportError: use_cython = False - else: use_cython = True @@ -53,6 +52,7 @@ def readme(): version=find_version(os.path.join(setupdir, "tobias", "__init__.py")), #get version from __init__.py description='Transcription factor Occupancy prediction By Investigation of ATAC-seq Signal', long_description=readme(), + long_description_content_type='text/markdown', url='https://github.molgen.mpg.de/loosolab/TOBIAS', author='Mette Bentsen', author_email='mette.bentsen@mpi-bn.mpg.de', diff --git a/tobias/__init__.py b/tobias/__init__.py index 3d18726..dd9b22c 100644 --- a/tobias/__init__.py +++ b/tobias/__init__.py @@ -1 +1 @@ -__version__ = "0.5.0" +__version__ = "0.5.1" diff --git a/tobias/footprinting/bindetect.py b/tobias/footprinting/bindetect.py index 7db350d..94e3369 100644 --- a/tobias/footprinting/bindetect.py +++ b/tobias/footprinting/bindetect.py @@ -69,7 +69,7 @@ def add_bindetect_arguments(parser): required = parser.add_argument_group('Required arguments') required.add_argument('--signals', metavar="", help="Signal per condition (.bigwig format)", nargs="*") required.add_argument('--peaks', metavar="", help="Peaks.bed containing open chromatin regions across all conditions") - required.add_argument('--motifs', metavar="", help="Motifs in pfm/jaspar format") + required.add_argument('--motifs', metavar="", help="Motif file(s) in pfm/jaspar format", nargs="*") required.add_argument('--genome', metavar="", help="Genome .fasta file") optargs = parser.add_argument_group('Optional arguments') @@ -239,13 +239,13 @@ def run_bindetect(args): ################ Get motifs ################ logger.info("Reading motifs from file") - - motif_content = open(args.motifs).read() - converted_content = convert_motif(motif_content, "pfm") - motif_list = pfm_to_motifs(converted_content) #List of OneMotif objects + motif_list = MotifList() + args.motifs = expand_dirs(args.motifs) + for f in args.motifs: + motif_list += MotifList().from_file(f) #List of OneMotif objects no_pfms = len(motif_list) + logger.info("- Read {0} motifs".format(no_pfms)) - logger.info("- Found {0} motifs in file".format(no_pfms)) logger.debug("Getting motifs ready") motif_list.bg = bg diff --git a/tobias/motifs/format_motifs.py b/tobias/motifs/format_motifs.py index 085a049..8141d0f 100644 --- a/tobias/motifs/format_motifs.py +++ b/tobias/motifs/format_motifs.py @@ -34,6 +34,7 @@ def add_formatmotifs_arguments(parser): required.add_argument('--input', metavar="", nargs="*", help="One or more input motif files") required.add_argument('--format', metavar="", help="Desired motif output format (pfm, jaspar, meme) (default: \"jaspar\")", choices=["pfm", "jaspar", "meme"], default="jaspar") required.add_argument('--task', metavar="", help="Which task to perform on motif files (join/split) (default: join)", choices=["join", "split"], default="join") + required.add_argument('--filter', metavar="", help="File containing list of motif names/ids to filter on. Only motifs fitting entries in filter will be output.") required.add_argument('--output', metavar="", help="If task == join, output is the joined output file; if task == split, output is a directory") additional = parser.add_argument_group('Additional arguments') @@ -46,7 +47,8 @@ def add_formatmotifs_arguments(parser): def run_formatmotifs(args): check_required(args, ["input", "output"]) #Check input arguments - check_files(args.input) #Check if files exist + motif_files = expand_dirs(args.input) #Expand any dirs in input + check_files(motif_files) #Check if files exist # Create logger and write argument overview logger = TobiasLogger("FormatMotifs", args.verbosity) @@ -58,70 +60,47 @@ def run_formatmotifs(args): ####### Getting ready ####### if args.task == "split": - logger.info("Making directory {0} if not existing".format(args.output)) make_directory(args.output) #Check and create output directory - else: - logger.info("Opening file {0} for writing".format(args.output)) - out_f = open(args.output, "w") #open file - ### Estimate format of input files + ### Read motifs from files ### logger.info("Reading input files...") + motif_list = MotifList() converted_content = "" - for f in args.input: - content = open(f).read() - motif_format = get_motif_format(content) - logger.info("- {0}: {1}".format(f, motif_format)) + for f in motif_files: + logger.info("- {0}".format(f)) + motif_list.extend(MotifList().from_file(f)) - #Convert to output format - converted_content += convert_motif(content, args.format) + logger.info("Read {} motifs\n".format(len(motif_list))) - logger.comment("") + ### Filter motif list ### + if args.filter != None: + + #Read filter + pass - #### Write out results #### - #todo: if meme is input, estimate meme header from input file; else set it to standard - meme_header = "MEME version 4\n\n" - meme_header += "ALPHABET=ACGT\n\n" - meme_header += "strands: + -\n\n" - meme_header += "Background letter frequencies\nA 0.25 C 0.25 G 0.25 T 0.25\n\n" + #### Write out results #### if args.task == "split": logger.info("Writing individual files to directory {0}".format(args.output)) - #Split on ">" - if args.format == "jaspar" or args.format == "pfm": - splitted_content = [ ">" + content for content in converted_content.split(">") if content != ""] - elif args.format == "meme": - splitted_content = [content for content in converted_content.split("MOTIF") if content != ""] - splitted_content = [content if content.startswith("MOTIF") else "MOTIF" + content for content in splitted_content] - - #Write out - for motif in splitted_content: - - #Get name from headerline - elements = motif.replace("MOTIF", "").replace(">", "").split() - motifid, name = elements[0], elements[1] - + for motif in motif_list: + motif_string = MotifList([motif]).as_string(args.format) #Open file and write - out_path = os.path.join(args.output, motifid + "." + args.format) + out_path = os.path.join(args.output, motif.id + "." + args.format) logger.info("- {0}".format(out_path)) f_out = open(out_path, "w") - - if args.format == "meme": - f_out.write(meme_header + motif) #make sure every motif has a header - else: - f_out.write(motif) - + f_out.write(motif_string) f_out.close() elif args.task == "join": logger.info("Writing converted motifs to file {0}".format(args.output)) - if args.format == "meme": - converted_content = meme_header + converted_content - - out_f.write(converted_content + "\n") + f_out = open(args.output, "w") + motif_string = motif_list.as_string(args.format) + f_out.write(motif_string) + f_out.close() logger.end() diff --git a/tobias/motifs/tfbscan.py b/tobias/motifs/tfbscan.py index 4937d40..5ebf036 100644 --- a/tobias/motifs/tfbscan.py +++ b/tobias/motifs/tfbscan.py @@ -154,7 +154,6 @@ def run_tfbscan(args): if args.outfile != None: logger.output_files([args.outfile]) - ######## Read sequences from file and estimate background gc ######## logger.info("Handling input files") @@ -185,7 +184,6 @@ def run_tfbscan(args): sys.exit() logger.info("- Total of {0} regions (after splitting)".format(len(regions))) - #Background gc if args.gc == None: logger.info("Estimating GC content from fasta (set --gc to skip this step)") @@ -210,7 +208,6 @@ def run_tfbscan(args): logger.debug("Getting motifs ready") motif_list.bg = bg - #motif_names = [motif.prefix for motif in motif_list] reverse_motifs = [motif.get_reverse() for motif in motif_list] motif_list.extend(reverse_motifs) diff --git a/tobias/utils/motifs.py b/tobias/utils/motifs.py index 60c5452..26b7399 100644 --- a/tobias/utils/motifs.py +++ b/tobias/utils/motifs.py @@ -25,7 +25,7 @@ #Internal from tobias.utils.regions import * -from tobias.utils.utilities import filafy #filafy for filenames +from tobias.utils.utilities import filafy, num #filafy for filenames #----------------------------------------------------------------------------------------# #List of OneMotif objects @@ -49,6 +49,114 @@ def __init__(self, lst=[]): def __str__(self): return("\n".join([str(onemotif) for onemotif in self])) + def from_file(self, path): + + content = open(path).read() + + #Establish format of motif + file_format = get_motif_format(content) + + lines = content.split("\n") + + #Read motifs + if file_format == "meme": + for idx, line in enumerate(lines): + columns = line.strip().split() + + if line.startswith("MOTIF"): + self.append(OneMotif()) #create new motif + self[-1].input_format = file_format + + #Get id/name of motif + if len(columns) > 2: #MOTIF, ID, NAME + motif_id, name = columns[1], columns[2] + elif len(columns) == 2: # MOTIF, ID + motif_id, name = columns[1], "" #name not given + + self[-1].id = motif_id + self[-1].name = name + + else: + if len(self) > 0: #if there was already one motif header found + + #If line contains counts + if re.match("^[\s]*([\d\.\s]+)$", line): #starts with any number of spaces (or none) followed by numbers + for i, col in enumerate(columns): + self[-1][i].append(num(col)) + + elif file_format in ["pfm", "jaspar"]: + + for line in lines: + m = re.match(".*?([\d]+[\d\.\s]+).*?", line) + + if line.startswith(">"): + self.append(OneMotif(counts=[])) #create new motif + self[-1].input_format = file_format + + columns = line[1:].strip().split() #[1:] to remove > from header + if len(columns) > 1: #ID, NAME + motif_id, name = columns[0], columns[1] + elif len(columns) == 1: #>ID + motif_id, name = columns[0], "" #name not given + + self[-1].id = motif_id + self[-1].name = name + + elif m: + columns = [num(field) for field in m.group(1).rstrip().split()] + self[-1].counts.append(columns) + + #Check correct format of pfms + for motif in self: + nuc, pos = np.array(motif.counts).shape + motif.w = pos + if nuc != 4: + sys.exit("ERROR: Motif {0} has an unexpected format and could not be read".format(motif)) + + #Estimate widths and n_sites + for motif in self: + motif.n = int(round(sum([base_counts[0] for base_counts in motif.counts]))) + + return(self) + + def as_string(self, output_format="pfm"): + + bases = ["A", "C", "G", "T"] + out_string = "" + + #Establish which output format + if output_format in ["pfm", "jaspar"]: + for motif in self: + out_string += ">{0}\t{1}\n".format(motif.id, motif.name) + for i, base_counts in enumerate(motif.counts): + base_counts_string = ["{0:.5f}".format(element) for element in base_counts] + out_string += "{0} [ {1} ] \n".format(bases[i], "\t".join(base_counts_string)) if output_format == "jaspar" else "\t".join(base_counts_string) + "\n" + out_string += "\n" + + elif output_format == "meme": + + meme_header = "MEME version 4\n\n" + meme_header += "ALPHABET=ACGT\n\n" + meme_header += "strands: + -\n\n" + meme_header += "Background letter frequencies\nA 0.25 C 0.25 G 0.25 T 0.25\n\n" + out_string += meme_header + + for motif in self: + out_string += "MOTIF\t{0}\t{1}\n".format(motif.id, motif.name) + out_string += "letter-probability matrix: alength=4 w={0} nsites={1} E=0\n".format(motif.w, motif.n) + + for i in range(motif.w): + row = [float(motif.counts[j][i]) for j in range(4)] #row contains original row from content + n_sites = round(sum(row), 0) + row_freq = ["{0:.5f}".format(num/n_sites) for num in row] + out_string += " ".join(row_freq) + "\n" + + out_string += "\n" + + return(out_string) + + #---------------- Functions for moods scanning ------------------------# + def setup_moods_scanner(self): tups = [(motif.prefix, motif.strand, motif.pssm, motif.threshold) for motif in self] #list of tups @@ -91,13 +199,13 @@ class OneMotif: bases = ["A", "C", "G", "T"] - def __init__(self, motifid, name, counts): + def __init__(self, motifid="", name="", counts=[[] for _ in range(4)]): self.id = motifid #must be unique self.name = name #does not have to be unique self.prefix = "" #output prefix set in set_prefix - self.counts = counts #counts + self.counts = counts #counts, list of 4 lists (each as long as motif) self.strand = "+" #default strand is + #Set later @@ -182,13 +290,13 @@ def calc_bit_score(self): def plot_logo(self): LETTERS = { "T" : TextPath((-0.305, 0), "T", size=1, prop=fp), - "G" : TextPath((-0.384, 0), "G", size=1, prop=fp), - "A" : TextPath((-0.35, 0), "A", size=1, prop=fp), - "C" : TextPath((-0.366, 0), "C", size=1, prop=fp) } + "G" : TextPath((-0.384, 0), "G", size=1, prop=fp), + "A" : TextPath((-0.35, 0), "A", size=1, prop=fp), + "C" : TextPath((-0.366, 0), "C", size=1, prop=fp) } COLOR_SCHEME = {'G': 'orange', - 'A': "#CC0000", - 'C': 'mediumblue', - 'T': 'darkgreen'} + 'A': "#CC0000", + 'C': 'mediumblue', + 'T': 'darkgreen'} def add_letter(base, x, y, scale, ax): """ Add letter to axis at positions x/y""" diff --git a/tobias/utils/utilities.py b/tobias/utils/utilities.py index 57bb242..8814d21 100644 --- a/tobias/utils/utilities.py +++ b/tobias/utils/utilities.py @@ -288,6 +288,11 @@ def check_required(args, required): #---------------------------------------- Misc ---------------------------------------------# #-------------------------------------------------------------------------------------------# +def num(s): + try: + return int(s) + except ValueError: + return float(s) class Progress: """ Class for writing out progress of processes such as multiprocessing """ @@ -316,11 +321,24 @@ def write(self, progress): def flatten_list(lst): - for element in lst: - if isinstance(element, collections.Iterable) and not isinstance(element, (str, bytes)): - yield from flatten_list(element) - else: - yield element + for element in lst: + if isinstance(element, collections.Iterable) and not isinstance(element, (str, bytes)): + yield from flatten_list(element) + else: + yield element + +def expand_dirs(list_of_paths): + """ Expands a list of files and dirs to a list of all files within dirs """ + + all_files = [] + for path in list_of_paths: + if os.path.isdir(path): + files = os.listdir(path) + all_files.extend([os.path.join(path, f) for f in files]) + else: + all_files.append(path) + + return(all_files) def check_files(lst_of_files, action="r"):