Skip to content
This repository has been archived by the owner. It is now read-only.

Commit

Permalink
Browse files Browse the repository at this point in the history
tobias 0.5.1; new options for motiflist class
  • Loading branch information
msbentsen committed May 16, 2019
1 parent ce5e13c commit b71a886
Show file tree
Hide file tree
Showing 8 changed files with 175 additions and 69 deletions.
4 changes: 4 additions & 0 deletions CHANGES
@@ -1,3 +1,7 @@
## 0.5.1 (2019-05-15)
- Internal changes to OneMotif and MotifList classes
- Bindetect now takes directories/file(s) as input for --motifs

## 0.5.0 (2019-05-02)
- Added sum/mean/none scoring to ScoreBigwig as well as the option to get --absolute of input signal

Expand Down
2 changes: 1 addition & 1 deletion setup.py
Expand Up @@ -18,7 +18,6 @@
cmdclass = {'build_ext': build_ext}
except ImportError:
use_cython = False

else:
use_cython = True

Expand Down Expand Up @@ -53,6 +52,7 @@ def readme():
version=find_version(os.path.join(setupdir, "tobias", "__init__.py")), #get version from __init__.py
description='Transcription factor Occupancy prediction By Investigation of ATAC-seq Signal',
long_description=readme(),
long_description_content_type='text/markdown',
url='https://github.molgen.mpg.de/loosolab/TOBIAS',
author='Mette Bentsen',
author_email='mette.bentsen@mpi-bn.mpg.de',
Expand Down
2 changes: 1 addition & 1 deletion tobias/__init__.py
@@ -1 +1 @@
__version__ = "0.5.0"
__version__ = "0.5.1"
12 changes: 6 additions & 6 deletions tobias/footprinting/bindetect.py
Expand Up @@ -69,7 +69,7 @@ def add_bindetect_arguments(parser):
required = parser.add_argument_group('Required arguments')
required.add_argument('--signals', metavar="<bigwig>", help="Signal per condition (.bigwig format)", nargs="*")
required.add_argument('--peaks', metavar="<bed>", help="Peaks.bed containing open chromatin regions across all conditions")
required.add_argument('--motifs', metavar="<motifs>", help="Motifs in pfm/jaspar format")
required.add_argument('--motifs', metavar="<motifs>", help="Motif file(s) in pfm/jaspar format", nargs="*")
required.add_argument('--genome', metavar="<fasta>", help="Genome .fasta file")

optargs = parser.add_argument_group('Optional arguments')
Expand Down Expand Up @@ -239,13 +239,13 @@ def run_bindetect(args):

################ Get motifs ################
logger.info("Reading motifs from file")

motif_content = open(args.motifs).read()
converted_content = convert_motif(motif_content, "pfm")
motif_list = pfm_to_motifs(converted_content) #List of OneMotif objects
motif_list = MotifList()
args.motifs = expand_dirs(args.motifs)
for f in args.motifs:
motif_list += MotifList().from_file(f) #List of OneMotif objects
no_pfms = len(motif_list)
logger.info("- Read {0} motifs".format(no_pfms))

logger.info("- Found {0} motifs in file".format(no_pfms))
logger.debug("Getting motifs ready")
motif_list.bg = bg

Expand Down
67 changes: 23 additions & 44 deletions tobias/motifs/format_motifs.py
Expand Up @@ -34,6 +34,7 @@ def add_formatmotifs_arguments(parser):
required.add_argument('--input', metavar="", nargs="*", help="One or more input motif files")
required.add_argument('--format', metavar="", help="Desired motif output format (pfm, jaspar, meme) (default: \"jaspar\")", choices=["pfm", "jaspar", "meme"], default="jaspar")
required.add_argument('--task', metavar="", help="Which task to perform on motif files (join/split) (default: join)", choices=["join", "split"], default="join")
required.add_argument('--filter', metavar="", help="File containing list of motif names/ids to filter on. Only motifs fitting entries in filter will be output.")
required.add_argument('--output', metavar="", help="If task == join, output is the joined output file; if task == split, output is a directory")

additional = parser.add_argument_group('Additional arguments')
Expand All @@ -46,7 +47,8 @@ def add_formatmotifs_arguments(parser):
def run_formatmotifs(args):

check_required(args, ["input", "output"]) #Check input arguments
check_files(args.input) #Check if files exist
motif_files = expand_dirs(args.input) #Expand any dirs in input
check_files(motif_files) #Check if files exist

# Create logger and write argument overview
logger = TobiasLogger("FormatMotifs", args.verbosity)
Expand All @@ -58,70 +60,47 @@ def run_formatmotifs(args):

####### Getting ready #######
if args.task == "split":

logger.info("Making directory {0} if not existing".format(args.output))
make_directory(args.output) #Check and create output directory
else:
logger.info("Opening file {0} for writing".format(args.output))
out_f = open(args.output, "w") #open file

### Estimate format of input files
### Read motifs from files ###
logger.info("Reading input files...")
motif_list = MotifList()
converted_content = ""
for f in args.input:
content = open(f).read()
motif_format = get_motif_format(content)
logger.info("- {0}: {1}".format(f, motif_format))
for f in motif_files:
logger.info("- {0}".format(f))
motif_list.extend(MotifList().from_file(f))

#Convert to output format
converted_content += convert_motif(content, args.format)
logger.info("Read {} motifs\n".format(len(motif_list)))

logger.comment("")
### Filter motif list ###
if args.filter != None:

#Read filter
pass

#### Write out results ####
#todo: if meme is input, estimate meme header from input file; else set it to standard
meme_header = "MEME version 4\n\n"
meme_header += "ALPHABET=ACGT\n\n"
meme_header += "strands: + -\n\n"
meme_header += "Background letter frequencies\nA 0.25 C 0.25 G 0.25 T 0.25\n\n"

#### Write out results ####
if args.task == "split":
logger.info("Writing individual files to directory {0}".format(args.output))

#Split on ">"
if args.format == "jaspar" or args.format == "pfm":
splitted_content = [ ">" + content for content in converted_content.split(">") if content != ""]
elif args.format == "meme":
splitted_content = [content for content in converted_content.split("MOTIF") if content != ""]
splitted_content = [content if content.startswith("MOTIF") else "MOTIF" + content for content in splitted_content]

#Write out
for motif in splitted_content:

#Get name from headerline
elements = motif.replace("MOTIF", "").replace(">", "").split()
motifid, name = elements[0], elements[1]

for motif in motif_list:
motif_string = MotifList([motif]).as_string(args.format)

#Open file and write
out_path = os.path.join(args.output, motifid + "." + args.format)
out_path = os.path.join(args.output, motif.id + "." + args.format)
logger.info("- {0}".format(out_path))
f_out = open(out_path, "w")

if args.format == "meme":
f_out.write(meme_header + motif) #make sure every motif has a header
else:
f_out.write(motif)

f_out.write(motif_string)
f_out.close()

elif args.task == "join":
logger.info("Writing converted motifs to file {0}".format(args.output))

if args.format == "meme":
converted_content = meme_header + converted_content

out_f.write(converted_content + "\n")
f_out = open(args.output, "w")
motif_string = motif_list.as_string(args.format)
f_out.write(motif_string)
f_out.close()

logger.end()

Expand Down
3 changes: 0 additions & 3 deletions tobias/motifs/tfbscan.py
Expand Up @@ -154,7 +154,6 @@ def run_tfbscan(args):
if args.outfile != None:
logger.output_files([args.outfile])


######## Read sequences from file and estimate background gc ########

logger.info("Handling input files")
Expand Down Expand Up @@ -185,7 +184,6 @@ def run_tfbscan(args):
sys.exit()
logger.info("- Total of {0} regions (after splitting)".format(len(regions)))


#Background gc
if args.gc == None:
logger.info("Estimating GC content from fasta (set --gc to skip this step)")
Expand All @@ -210,7 +208,6 @@ def run_tfbscan(args):

logger.debug("Getting motifs ready")
motif_list.bg = bg
#motif_names = [motif.prefix for motif in motif_list]

reverse_motifs = [motif.get_reverse() for motif in motif_list]
motif_list.extend(reverse_motifs)
Expand Down
126 changes: 117 additions & 9 deletions tobias/utils/motifs.py
Expand Up @@ -25,7 +25,7 @@

#Internal
from tobias.utils.regions import *
from tobias.utils.utilities import filafy #filafy for filenames
from tobias.utils.utilities import filafy, num #filafy for filenames

#----------------------------------------------------------------------------------------#
#List of OneMotif objects
Expand All @@ -49,6 +49,114 @@ def __init__(self, lst=[]):
def __str__(self):
return("\n".join([str(onemotif) for onemotif in self]))

def from_file(self, path):

content = open(path).read()

#Establish format of motif
file_format = get_motif_format(content)

lines = content.split("\n")

#Read motifs
if file_format == "meme":
for idx, line in enumerate(lines):
columns = line.strip().split()

if line.startswith("MOTIF"):
self.append(OneMotif()) #create new motif
self[-1].input_format = file_format

#Get id/name of motif
if len(columns) > 2: #MOTIF, ID, NAME
motif_id, name = columns[1], columns[2]
elif len(columns) == 2: # MOTIF, ID
motif_id, name = columns[1], "" #name not given

self[-1].id = motif_id
self[-1].name = name

else:
if len(self) > 0: #if there was already one motif header found

#If line contains counts
if re.match("^[\s]*([\d\.\s]+)$", line): #starts with any number of spaces (or none) followed by numbers
for i, col in enumerate(columns):
self[-1][i].append(num(col))

elif file_format in ["pfm", "jaspar"]:

for line in lines:
m = re.match(".*?([\d]+[\d\.\s]+).*?", line)

if line.startswith(">"):
self.append(OneMotif(counts=[])) #create new motif
self[-1].input_format = file_format

columns = line[1:].strip().split() #[1:] to remove > from header
if len(columns) > 1: #ID, NAME
motif_id, name = columns[0], columns[1]
elif len(columns) == 1: #>ID
motif_id, name = columns[0], "" #name not given

self[-1].id = motif_id
self[-1].name = name

elif m:
columns = [num(field) for field in m.group(1).rstrip().split()]
self[-1].counts.append(columns)

#Check correct format of pfms
for motif in self:
nuc, pos = np.array(motif.counts).shape
motif.w = pos
if nuc != 4:
sys.exit("ERROR: Motif {0} has an unexpected format and could not be read".format(motif))

#Estimate widths and n_sites
for motif in self:
motif.n = int(round(sum([base_counts[0] for base_counts in motif.counts])))

return(self)

def as_string(self, output_format="pfm"):

bases = ["A", "C", "G", "T"]
out_string = ""

#Establish which output format
if output_format in ["pfm", "jaspar"]:
for motif in self:
out_string += ">{0}\t{1}\n".format(motif.id, motif.name)
for i, base_counts in enumerate(motif.counts):
base_counts_string = ["{0:.5f}".format(element) for element in base_counts]
out_string += "{0} [ {1} ] \n".format(bases[i], "\t".join(base_counts_string)) if output_format == "jaspar" else "\t".join(base_counts_string) + "\n"
out_string += "\n"

elif output_format == "meme":

meme_header = "MEME version 4\n\n"
meme_header += "ALPHABET=ACGT\n\n"
meme_header += "strands: + -\n\n"
meme_header += "Background letter frequencies\nA 0.25 C 0.25 G 0.25 T 0.25\n\n"
out_string += meme_header

for motif in self:
out_string += "MOTIF\t{0}\t{1}\n".format(motif.id, motif.name)
out_string += "letter-probability matrix: alength=4 w={0} nsites={1} E=0\n".format(motif.w, motif.n)

for i in range(motif.w):
row = [float(motif.counts[j][i]) for j in range(4)] #row contains original row from content
n_sites = round(sum(row), 0)
row_freq = ["{0:.5f}".format(num/n_sites) for num in row]
out_string += " ".join(row_freq) + "\n"

out_string += "\n"

return(out_string)

#---------------- Functions for moods scanning ------------------------#

def setup_moods_scanner(self):

tups = [(motif.prefix, motif.strand, motif.pssm, motif.threshold) for motif in self] #list of tups
Expand Down Expand Up @@ -91,13 +199,13 @@ class OneMotif:

bases = ["A", "C", "G", "T"]

def __init__(self, motifid, name, counts):
def __init__(self, motifid="", name="", counts=[[] for _ in range(4)]):

self.id = motifid #must be unique
self.name = name #does not have to be unique

self.prefix = "" #output prefix set in set_prefix
self.counts = counts #counts
self.counts = counts #counts, list of 4 lists (each as long as motif)
self.strand = "+" #default strand is +

#Set later
Expand Down Expand Up @@ -182,13 +290,13 @@ def calc_bit_score(self):
def plot_logo(self):

LETTERS = { "T" : TextPath((-0.305, 0), "T", size=1, prop=fp),
"G" : TextPath((-0.384, 0), "G", size=1, prop=fp),
"A" : TextPath((-0.35, 0), "A", size=1, prop=fp),
"C" : TextPath((-0.366, 0), "C", size=1, prop=fp) }
"G" : TextPath((-0.384, 0), "G", size=1, prop=fp),
"A" : TextPath((-0.35, 0), "A", size=1, prop=fp),
"C" : TextPath((-0.366, 0), "C", size=1, prop=fp) }
COLOR_SCHEME = {'G': 'orange',
'A': "#CC0000",
'C': 'mediumblue',
'T': 'darkgreen'}
'A': "#CC0000",
'C': 'mediumblue',
'T': 'darkgreen'}

def add_letter(base, x, y, scale, ax):
""" Add letter to axis at positions x/y"""
Expand Down
28 changes: 23 additions & 5 deletions tobias/utils/utilities.py
Expand Up @@ -288,6 +288,11 @@ def check_required(args, required):
#---------------------------------------- Misc ---------------------------------------------#
#-------------------------------------------------------------------------------------------#

def num(s):
try:
return int(s)
except ValueError:
return float(s)

class Progress:
""" Class for writing out progress of processes such as multiprocessing """
Expand Down Expand Up @@ -316,11 +321,24 @@ def write(self, progress):

def flatten_list(lst):

for element in lst:
if isinstance(element, collections.Iterable) and not isinstance(element, (str, bytes)):
yield from flatten_list(element)
else:
yield element
for element in lst:
if isinstance(element, collections.Iterable) and not isinstance(element, (str, bytes)):
yield from flatten_list(element)
else:
yield element

def expand_dirs(list_of_paths):
""" Expands a list of files and dirs to a list of all files within dirs """

all_files = []
for path in list_of_paths:
if os.path.isdir(path):
files = os.listdir(path)
all_files.extend([os.path.join(path, f) for f in files])
else:
all_files.append(path)

return(all_files)

def check_files(lst_of_files, action="r"):

Expand Down

0 comments on commit b71a886

Please sign in to comment.