From b71a886380438e39539d885dc99c32cf35163140 Mon Sep 17 00:00:00 2001
From: msbentsen <mette.bentsen@mpi-bn.mpg.de>
Date: Thu, 16 May 2019 09:49:39 +0200
Subject: [PATCH] tobias 0.5.1; new options for motiflist class

---
 CHANGES                          |   4 +
 setup.py                         |   2 +-
 tobias/__init__.py               |   2 +-
 tobias/footprinting/bindetect.py |  12 +--
 tobias/motifs/format_motifs.py   |  67 ++++++----------
 tobias/motifs/tfbscan.py         |   3 -
 tobias/utils/motifs.py           | 126 ++++++++++++++++++++++++++++---
 tobias/utils/utilities.py        |  28 +++++--
 8 files changed, 175 insertions(+), 69 deletions(-)
diff --git a/CHANGES b/CHANGES
index abe0cd1..2666303 100644
--- a/CHANGES
+++ b/CHANGES
@@ -1,3 +1,7 @@
+## 0.5.1 (2019-05-15)
+- Internal changes to OneMotif and MotifList classes
+- Bindetect now takes directories/file(s) as input for --motifs
+
 ## 0.5.0 (2019-05-02)
 - Added sum/mean/none scoring to ScoreBigwig as well as the option to get --absolute of input signal
 
diff --git a/setup.py b/setup.py
index b6225c0..0313771 100644
--- a/setup.py
+++ b/setup.py
@@ -18,7 +18,6 @@
 	cmdclass = {'build_ext': build_ext}
 except ImportError:
 	use_cython = False
-
 else:
 	use_cython = True
 
@@ -53,6 +52,7 @@ def readme():
 		version=find_version(os.path.join(setupdir, "tobias", "__init__.py")),	#get version from __init__.py
 		description='Transcription factor Occupancy prediction By Investigation of ATAC-seq Signal',
 		long_description=readme(),
+		long_description_content_type='text/markdown',
 		url='https://github.molgen.mpg.de/loosolab/TOBIAS',
 		author='Mette Bentsen',
 		author_email='mette.bentsen@mpi-bn.mpg.de',
diff --git a/tobias/__init__.py b/tobias/__init__.py
index 3d18726..dd9b22c 100644
--- a/tobias/__init__.py
+++ b/tobias/__init__.py
@@ -1 +1 @@
-__version__ = "0.5.0"
+__version__ = "0.5.1"
diff --git a/tobias/footprinting/bindetect.py b/tobias/footprinting/bindetect.py
index 7db350d..94e3369 100644
--- a/tobias/footprinting/bindetect.py
+++ b/tobias/footprinting/bindetect.py
@@ -69,7 +69,7 @@ def add_bindetect_arguments(parser):
 	required = parser.add_argument_group('Required arguments')
 	required.add_argument('--signals', metavar="<bigwig>", help="Signal per condition (.bigwig format)", nargs="*")
 	required.add_argument('--peaks', metavar="<bed>", help="Peaks.bed containing open chromatin regions across all conditions")
-	required.add_argument('--motifs', metavar="<motifs>", help="Motifs in pfm/jaspar format")
+	required.add_argument('--motifs', metavar="<motifs>", help="Motif file(s) in pfm/jaspar format", nargs="*")
 	required.add_argument('--genome', metavar="<fasta>", help="Genome .fasta file")
 
 	optargs = parser.add_argument_group('Optional arguments')
@@ -239,13 +239,13 @@ def run_bindetect(args):
 
 	################ Get motifs ################
 	logger.info("Reading motifs from file") 
-
-	motif_content = open(args.motifs).read()
-	converted_content = convert_motif(motif_content, "pfm")
-	motif_list = pfm_to_motifs(converted_content) 			#List of OneMotif objects
+	motif_list = MotifList()
+	args.motifs = expand_dirs(args.motifs)
+	for f in args.motifs:
+		motif_list += MotifList().from_file(f)  #List of OneMotif objects
 	no_pfms = len(motif_list)
+	logger.info("- Read {0} motifs".format(no_pfms))
 
-	logger.info("- Found {0} motifs in file".format(no_pfms))
 	logger.debug("Getting motifs ready")
 	motif_list.bg = bg
 
diff --git a/tobias/motifs/format_motifs.py b/tobias/motifs/format_motifs.py
index 085a049..8141d0f 100644
--- a/tobias/motifs/format_motifs.py
+++ b/tobias/motifs/format_motifs.py
@@ -34,6 +34,7 @@ def add_formatmotifs_arguments(parser):
 	required.add_argument('--input', metavar="", nargs="*", help="One or more input motif files")			
 	required.add_argument('--format', metavar="", help="Desired motif output format (pfm, jaspar, meme) (default: \"jaspar\")", choices=["pfm", "jaspar", "meme"], default="jaspar")
 	required.add_argument('--task', metavar="", help="Which task to perform on motif files (join/split) (default: join)", choices=["join", "split"], default="join")
+	required.add_argument('--filter', metavar="", help="File containing list of motif names/ids to filter on. Only motifs fitting entries in filter will be output.")
 	required.add_argument('--output', metavar="", help="If task == join, output is the joined output file; if task == split, output is a directory")
 	
 	additional = parser.add_argument_group('Additional arguments')
@@ -46,7 +47,8 @@ def add_formatmotifs_arguments(parser):
 def run_formatmotifs(args):
 
 	check_required(args, ["input", "output"])	#Check input arguments
-	check_files(args.input) 					#Check if files exist
+	motif_files = expand_dirs(args.input)		#Expand any dirs in input
+	check_files(motif_files) 					#Check if files exist
 
 	# Create logger and write argument overview
 	logger = TobiasLogger("FormatMotifs", args.verbosity)
@@ -58,70 +60,47 @@ def run_formatmotifs(args):
 
 	####### Getting ready #######
 	if args.task == "split":
-	
 		logger.info("Making directory {0} if not existing".format(args.output))
 		make_directory(args.output)		#Check and create output directory
-	else:
-		logger.info("Opening file {0} for writing".format(args.output))
-		out_f = open(args.output, "w")	#open file
 
-	### Estimate format of input files
+	### Read motifs from files ###
 	logger.info("Reading input files...")
+	motif_list = MotifList()
 	converted_content = ""
-	for f in args.input:
-		content = open(f).read()
-		motif_format = get_motif_format(content)
-		logger.info("- {0}: {1}".format(f, motif_format))
+	for f in motif_files:
+		logger.info("- {0}".format(f))
+		motif_list.extend(MotifList().from_file(f))
 
-		#Convert to output format
-		converted_content += convert_motif(content, args.format)
+	logger.info("Read {} motifs\n".format(len(motif_list)))
 
-	logger.comment("")
+	### Filter motif list ###
+	if args.filter != None:
+
+		#Read filter
+		pass
 
-	#### Write out results ####
-	#todo: if meme is input, estimate meme header from input file; else set it to standard
-	meme_header = "MEME version 4\n\n"
-	meme_header += "ALPHABET=ACGT\n\n"
-	meme_header += "strands: + -\n\n"
-	meme_header += "Background letter frequencies\nA 0.25 C 0.25 G 0.25 T 0.25\n\n"
 
+	#### Write out results ####
 	if args.task == "split":
 		logger.info("Writing individual files to directory {0}".format(args.output))
 
-		#Split on ">"
-		if args.format == "jaspar" or args.format == "pfm":
-			splitted_content = [ ">" + content for content in converted_content.split(">") if content != ""]
-		elif args.format == "meme":
-			splitted_content = [content for content in converted_content.split("MOTIF") if content != ""]
-			splitted_content = [content if content.startswith("MOTIF") else "MOTIF" + content for content in splitted_content]
-			
-		#Write out
-		for motif in splitted_content:
-
-			#Get name from headerline
-			elements = motif.replace("MOTIF", "").replace(">", "").split()
-			motifid, name = elements[0], elements[1]
-
+		for motif in motif_list:
+			motif_string = MotifList([motif]).as_string(args.format)
 
 			#Open file and write
-			out_path = os.path.join(args.output, motifid + "." + args.format)
+			out_path = os.path.join(args.output, motif.id + "." + args.format)
 			logger.info("- {0}".format(out_path))
 			f_out = open(out_path, "w")
-
-			if args.format == "meme":
-				f_out.write(meme_header + motif) #make sure every motif has a header
-			else:
-				f_out.write(motif)
-
+			f_out.write(motif_string)
 			f_out.close()
 	
 	elif args.task == "join":
 		logger.info("Writing converted motifs to file {0}".format(args.output))
 
-		if args.format == "meme":
-			converted_content = meme_header + converted_content
-
-		out_f.write(converted_content + "\n")
+		f_out = open(args.output, "w")
+		motif_string = motif_list.as_string(args.format)
+		f_out.write(motif_string)
+		f_out.close()
 
 	logger.end()
 
diff --git a/tobias/motifs/tfbscan.py b/tobias/motifs/tfbscan.py
index 4937d40..5ebf036 100644
--- a/tobias/motifs/tfbscan.py
+++ b/tobias/motifs/tfbscan.py
@@ -154,7 +154,6 @@ def run_tfbscan(args):
 	if args.outfile != None:
 		logger.output_files([args.outfile])
 
-
 	######## Read sequences from file and estimate background gc ########
 	
 	logger.info("Handling input files")
@@ -185,7 +184,6 @@ def run_tfbscan(args):
 		sys.exit()
 	logger.info("- Total of {0} regions (after splitting)".format(len(regions)))
 	
-
 	#Background gc
 	if args.gc == None:
 		logger.info("Estimating GC content from fasta (set --gc to skip this step)")
@@ -210,7 +208,6 @@ def run_tfbscan(args):
 	
 	logger.debug("Getting motifs ready")
 	motif_list.bg = bg
-	#motif_names = [motif.prefix for motif in motif_list]
 
 	reverse_motifs = [motif.get_reverse() for motif in motif_list]
 	motif_list.extend(reverse_motifs)
diff --git a/tobias/utils/motifs.py b/tobias/utils/motifs.py
index 60c5452..26b7399 100644
--- a/tobias/utils/motifs.py
+++ b/tobias/utils/motifs.py
@@ -25,7 +25,7 @@
 
 #Internal 
 from tobias.utils.regions import * 
-from tobias.utils.utilities import filafy 	#filafy for filenames
+from tobias.utils.utilities import filafy, num 	#filafy for filenames
 
 #----------------------------------------------------------------------------------------#
 #List of OneMotif objects
@@ -49,6 +49,114 @@ def __init__(self, lst=[]):
 	def __str__(self):
 		return("\n".join([str(onemotif) for onemotif in self]))
 
+	def from_file(self, path):
+		
+		content = open(path).read()
+
+		#Establish format of motif
+		file_format = get_motif_format(content)
+
+		lines = content.split("\n")
+
+		#Read motifs
+		if file_format == "meme":
+			for idx, line in enumerate(lines):
+				columns = line.strip().split()
+
+				if line.startswith("MOTIF"):
+					self.append(OneMotif()) #create new motif
+					self[-1].input_format = file_format
+
+					#Get id/name of motif
+					if len(columns) > 2: #MOTIF, ID, NAME
+						motif_id, name = columns[1], columns[2]
+					elif len(columns) == 2: # MOTIF, ID
+						motif_id, name = columns[1], ""	#name not given
+
+					self[-1].id = motif_id
+					self[-1].name = name
+				
+				else:
+					if len(self) > 0: #if there was already one motif header found
+					
+						#If line contains counts
+						if re.match("^[\s]*([\d\.\s]+)$", line):	#starts with any number of spaces (or none) followed by numbers
+							for i, col in enumerate(columns):
+								self[-1][i].append(num(col))
+
+		elif file_format in ["pfm", "jaspar"]:
+																				
+			for line in lines:
+				m = re.match(".*?([\d]+[\d\.\s]+).*?", line)
+				
+				if line.startswith(">"):
+					self.append(OneMotif(counts=[])) #create new motif
+					self[-1].input_format = file_format
+
+					columns = line[1:].strip().split()		#[1:] to remove > from header
+					if len(columns) > 1: #ID, NAME
+						motif_id, name = columns[0], columns[1]
+					elif len(columns) == 1: #>ID
+						motif_id, name = columns[0], ""	#name not given
+
+					self[-1].id = motif_id
+					self[-1].name = name	
+						
+				elif m:
+					columns = [num(field) for field in m.group(1).rstrip().split()]
+					self[-1].counts.append(columns)
+
+		#Check correct format of pfms
+		for motif in self:
+			nuc, pos = np.array(motif.counts).shape
+			motif.w = pos
+			if nuc != 4:
+				sys.exit("ERROR: Motif {0} has an unexpected format and could not be read".format(motif))
+
+		#Estimate widths and n_sites
+		for motif in self:
+			motif.n = int(round(sum([base_counts[0] for base_counts in motif.counts])))
+				
+		return(self)
+
+	def as_string(self, output_format="pfm"):
+
+		bases = ["A", "C", "G", "T"]
+		out_string = ""
+
+		#Establish which output format
+		if output_format in ["pfm", "jaspar"]:
+			for motif in self:
+				out_string += ">{0}\t{1}\n".format(motif.id, motif.name)
+				for i, base_counts in enumerate(motif.counts):
+					base_counts_string = ["{0:.5f}".format(element) for element in base_counts]
+					out_string += "{0} [ {1} ] \n".format(bases[i], "\t".join(base_counts_string)) if output_format == "jaspar" else "\t".join(base_counts_string) + "\n"
+				out_string += "\n"
+
+		elif output_format == "meme":
+			
+			meme_header = "MEME version 4\n\n"
+			meme_header += "ALPHABET=ACGT\n\n"
+			meme_header += "strands: + -\n\n"
+			meme_header += "Background letter frequencies\nA 0.25 C 0.25 G 0.25 T 0.25\n\n"
+			out_string += meme_header
+
+			for motif in self:
+				out_string += "MOTIF\t{0}\t{1}\n".format(motif.id, motif.name)
+				out_string += "letter-probability matrix: alength=4 w={0} nsites={1} E=0\n".format(motif.w, motif.n)
+
+				for i in range(motif.w):
+					row = [float(motif.counts[j][i]) for j in range(4)] 	#row contains original row from content
+					n_sites = round(sum(row), 0)
+					row_freq = ["{0:.5f}".format(num/n_sites) for num in row] 
+					out_string += "  ".join(row_freq) + "\n"
+				
+				out_string += "\n"	
+
+		return(out_string)			
+
+	#---------------- Functions for moods scanning ------------------------#
+
 	def setup_moods_scanner(self):
 
 		tups = [(motif.prefix, motif.strand, motif.pssm, motif.threshold) for motif in self] 		#list of tups
@@ -91,13 +199,13 @@ class OneMotif:
 
 	bases = ["A", "C", "G", "T"]
 
-	def __init__(self, motifid, name, counts):
+	def __init__(self, motifid="", name="", counts=[[] for _ in range(4)]):
 		
 		self.id = motifid			#must be unique
 		self.name = name	#does not have to be unique
 
 		self.prefix = "" 	#output prefix set in set_prefix
-		self.counts = counts  	#counts
+		self.counts = counts  	#counts, list of 4 lists (each as long as motif)
 		self.strand = "+"		#default strand is +
 
 		#Set later
@@ -182,13 +290,13 @@ def calc_bit_score(self):
 	def plot_logo(self):
 
 		LETTERS = { "T" : TextPath((-0.305, 0), "T", size=1, prop=fp),
-		            "G" : TextPath((-0.384, 0), "G", size=1, prop=fp),
-		            "A" : TextPath((-0.35, 0), "A", size=1, prop=fp),
-		            "C" : TextPath((-0.366, 0), "C", size=1, prop=fp) }
+						"G" : TextPath((-0.384, 0), "G", size=1, prop=fp),
+						"A" : TextPath((-0.35, 0), "A", size=1, prop=fp),
+						"C" : TextPath((-0.366, 0), "C", size=1, prop=fp) }
 		COLOR_SCHEME = {'G': 'orange', 
-		                'A': "#CC0000", 
-		                'C': 'mediumblue', 
-		                'T': 'darkgreen'}
+							 'A': "#CC0000", 
+							 'C': 'mediumblue', 
+							 'T': 'darkgreen'}
 
 		def add_letter(base, x, y, scale, ax):
 			""" Add letter to axis at positions x/y"""
diff --git a/tobias/utils/utilities.py b/tobias/utils/utilities.py
index 57bb242..8814d21 100644
--- a/tobias/utils/utilities.py
+++ b/tobias/utils/utilities.py
@@ -288,6 +288,11 @@ def check_required(args, required):
 #---------------------------------------- Misc ---------------------------------------------#
 #-------------------------------------------------------------------------------------------#
 
+def num(s):
+	try:
+		return int(s)
+	except ValueError:
+		return float(s)
 
 class Progress:
 	""" Class for writing out progress of processes such as multiprocessing """
@@ -316,11 +321,24 @@ def write(self, progress):
 
 def flatten_list(lst):
 
-    for element in lst:
-        if isinstance(element, collections.Iterable) and not isinstance(element, (str, bytes)):
-            yield from flatten_list(element)
-        else:
-            yield element
+	for element in lst:
+		if isinstance(element, collections.Iterable) and not isinstance(element, (str, bytes)):
+			yield from flatten_list(element)
+		else:
+			yield element
+
+def expand_dirs(list_of_paths):
+	""" Expands a list of files and dirs to a list of all files within dirs """
+
+	all_files = []
+	for path in list_of_paths:
+		if os.path.isdir(path):
+			files = os.listdir(path)
+			all_files.extend([os.path.join(path, f) for f in files])
+		else:
+			all_files.append(path)
+
+	return(all_files)
 
 def check_files(lst_of_files, action="r"):