#!/usr/bin/env python
"""
Classes for working with motifs and scanning with MOODS

@author: Mette Bentsen
@contact: mette.bentsen (at)
@license: MIT
"""
import copy
import re
import sys

import numpy as np
import MOODS.scan
import MOODS.tools
import MOODS.parsers
import matplotlib as mpl
import matplotlib.pyplot as plt
from matplotlib.text import TextPath
from matplotlib.patches import PathPatch
from matplotlib.font_manager import FontProperties

from tobias.utils.regions import *
from tobias.utils.utilities import filafy, num #filafy for filenames
#List of OneMotif objects
class MotifList(list):
    """A list of OneMotif objects with helpers for file I/O and MOODS scanning.

    Attributes:
        bg: background nucleotide frequencies passed to the MOODS scanner
            (0.25 for each base by default).
        names, matrices, strands, thresholds: per-motif lists filled by
            setup_moods_scanner() from each motif's prefix/pssm/strand/threshold.
        moods_scanner: the scanner object, or None until
            setup_moods_scanner() has been called.
    """

    def __init__(self, lst=None):
        """Create the list from an optional iterable of motif objects."""
        super(MotifList, self).__init__(iter(lst if lst is not None else []))

        self.bg = np.array([0.25, 0.25, 0.25, 0.25])

        # Set by setup_moods_scanner
        self.names = []
        self.matrices = []  # pssms
        self.strands = []
        self.thresholds = []
        self.moods_scanner = None

    def __str__(self):
        """Return the string form of every motif, one per line."""
        return "\n".join(str(onemotif) for onemotif in self)

    def from_file(self, path):
        """Read motifs from a meme/pfm/jaspar file and append them to this list.

        Args:
            path: path of the motif file; the format is detected from the
                content by get_motif_format().

        Returns:
            self, so the call can be chained onto the constructor.

        Exits (sys.exit) if a motif does not consist of four equally long,
        non-empty rows of counts.
        """
        with open(path) as handle:  # close the handle deterministically
            content = handle.read()

        # Establish format of motif
        file_format = get_motif_format(content)
        lines = content.split("\n")

        if file_format == "meme":
            for line in lines:
                columns = line.strip().split()
                if line.startswith("MOTIF"):
                    # Pass fresh lists explicitly: the rows are appended to in
                    # place below and must never be shared between motifs.
                    motif = OneMotif(counts=[[] for _ in range(4)])
                    motif.input_format = file_format

                    if len(columns) > 2:  # MOTIF, ID, NAME
                        motif.id, motif.name = columns[1], columns[2]
                    elif len(columns) == 2:  # MOTIF, ID
                        motif.id, motif.name = columns[1], ""  # name not given
                    else:  # bare "MOTIF" line
                        motif.id, motif.name = "", ""
                    self.append(motif)

                # Rows of numbers only count once a motif header was found.
                # Each meme row is one position with one column per base.
                elif len(self) > 0 and re.match(r"^[\s]*([\d\.\s]+)$", line):
                    for i, col in enumerate(columns):
                        self[-1].counts[i].append(num(col))

        elif file_format in ["pfm", "jaspar"]:
            for line in lines:
                if line.startswith(">"):
                    motif = OneMotif(counts=[])
                    motif.input_format = file_format

                    columns = line[1:].strip().split()  # [1:] to remove > from header
                    if len(columns) > 1:  # >ID NAME
                        motif.id, motif.name = columns[0], columns[1]
                    elif len(columns) == 1:  # >ID
                        motif.id, motif.name = columns[0], ""  # name not given
                    else:  # bare ">" line
                        motif.id, motif.name = "", ""
                    self.append(motif)
                    continue

                m = re.match(r".*?([\d]+[\d\.\s]+).*?", line)
                if m and len(self) > 0:
                    # Each pfm/jaspar line is one base with one value per position
                    self[-1].counts.append([num(field) for field in m.group(1).rstrip().split()])

        # Check correct format of pfms and estimate widths and n_sites
        for motif in self:
            widths = set(len(row) for row in motif.counts)
            if len(motif.counts) != 4 or len(widths) != 1 or 0 in widths:
                sys.exit("ERROR: Motif {0} has an unexpected format and could not be read".format(motif))

            motif.w = widths.pop()
            # Number of sites is estimated from the counts at the first position
            motif.n = int(round(sum(base_counts[0] for base_counts in motif.counts)))

        return self

    def as_string(self, output_format="pfm"):
        """Format all motifs as one string.

        Args:
            output_format: "pfm", "jaspar" or "meme".

        Returns:
            The formatted motifs; an empty string for any other format name.
        """
        bases = ["A", "C", "G", "T"]
        parts = []

        if output_format in ["pfm", "jaspar"]:
            for motif in self:
                parts.append(">{0}\t{1}\n".format(motif.id, motif.name))
                for i, base_counts in enumerate(motif.counts):
                    joined = "\t".join("{0:.5f}".format(element) for element in base_counts)
                    if output_format == "jaspar":
                        parts.append("{0} [ {1} ] \n".format(bases[i], joined))
                    else:
                        parts.append(joined + "\n")
                parts.append("\n")

        elif output_format == "meme":
            parts.append("MEME version 4\n\n")
            parts.append("ALPHABET=ACGT\n\n")
            parts.append("strands: + -\n\n")
            parts.append("Background letter frequencies\nA 0.25 C 0.25 G 0.25 T 0.25\n\n")

            for motif in self:
                parts.append("MOTIF\t{0}\t{1}\n".format(motif.id, motif.name))
                parts.append("letter-probability matrix: alength=4 w={0} nsites={1} E=0\n".format(motif.w, motif.n))
                for i in range(motif.w):
                    # One output row per position: the four base values at column i
                    row = [float(motif.counts[j][i]) for j in range(4)]
                    n_sites = round(sum(row), 0)
                    parts.append(" ".join("{0:.5f}".format(value / n_sites) for value in row) + "\n")
                parts.append("\n")

        return "".join(parts)

    #---------------- Functions for moods scanning ------------------------#

    def setup_moods_scanner(self):
        """Collect per-motif scan settings and build the MOODS scanner.

        Reads prefix, strand, pssm and threshold from every motif, so those
        must have been set on the motifs beforehand.
        """
        self.names = [motif.prefix for motif in self]
        self.strands = [motif.strand for motif in self]
        self.matrices = [motif.pssm for motif in self]
        self.thresholds = [motif.threshold for motif in self]

        scanner = MOODS.scan.Scanner(7)
        # NOTE(review): the middle argument was lost in the source; the
        # background frequencies are assumed here - confirm against MOODS docs.
        scanner.set_motifs(self.matrices, self.bg, self.thresholds)

        self.moods_scanner = scanner

    def scan_sequence(self, seq, region):
        """Scan a sequence with all motifs and return the matches as regions.

        Args:
            seq: sequence passed to the MOODS scanner.
            region: object with .chrom and .start giving the genomic
                location of seq; match positions are offset by region.start.

        Returns:
            RegionList of OneRegion([chrom, start, end, name, score, strand]).
        """
        if self.moods_scanner is None:
            self.setup_moods_scanner()

        # Scan sequence
        results = self.moods_scanner.scan(seq)

        # Convert results to RegionList
        sites = RegionList()
        for (matrix, name, strand, result) in zip(self.matrices, self.names, self.strands, results):
            motif_length = len(matrix[0])
            for match in result:
                # NOTE(review): the original comment states "match pos is 1 based";
                # not verified here - confirm against MOODS docs.
                start = region.start + match.pos
                end = start + motif_length
                score = round(match.score, 5)

                sites.append(OneRegion([region.chrom, start, end, name, score, strand]))

        return sites
#Contains info on one motif formatted for use in moods
class OneMotif:
    """One motif with the matrices derived from its counts.

    Attributes:
        id: motif id (must be unique).
        name: motif name (does not have to be unique).
        prefix: output prefix, set by set_prefix().
        counts: list of 4 lists (A, C, G, T), each as long as the motif.
        strand: "+" by default; get_reverse() returns a copy with "-".
        bg: background base frequencies (0.25 each by default).
        pfm, pssm, threshold, bits: None until calculated by get_pfm(),
            get_pssm(), get_threshold() and calc_bit_score().
    """

    bases = ["A", "C", "G", "T"]

    def __init__(self, motifid="", name="", counts=None):
        """Set up the motif.

        counts defaults to four new empty lists. The default is built per
        instance because the lists are appended to in place by the file
        readers; a shared default would leak counts between motifs.
        """
        self.id = motifid  # must be unique
        self.name = name  # does not have to be unique

        self.prefix = ""  # output prefix set in set_prefix
        self.counts = [[] for _ in range(4)] if counts is None else counts
        self.strand = "+"  # default strand is +

        # Set later
        self.pfm = None
        self.bg = np.array([0.25, 0.25, 0.25, 0.25])  # background set to equal by default
        self.pssm = None  # pssm calculated from get_pssm
        self.threshold = None  # threshold calculated from get_threshold
        self.bits = None

    def __str__(self):
        """Return the instance attributes formatted as a dict string."""
        # NOTE(review): the method body was missing in the source; dumping the
        # attribute dict is a reconstruction - confirm the intended format.
        return "{0}".format(self.__dict__)

    def set_prefix(self, naming="name_id"):
        """Set name to be used in 4th column and as output prefix.

        Args:
            naming: "name", "id", "name_id" or "id_name"; any other value
                gives the prefix "None".

        Returns:
            self.
        """
        if naming == "name":
            prefix = self.name
        elif naming == "id":
            prefix = self.id
        elif naming == "name_id":
            prefix = self.name + "_" + self.id
        elif naming == "id_name":
            prefix = self.id + "_" + self.name
        else:
            prefix = "None"

        self.prefix = filafy(prefix)
        return self

    def get_pfm(self):
        """Calculate the frequency matrix by normalising each position of counts."""
        counts = np.asarray(self.counts, dtype=float)
        self.pfm = counts / np.sum(counts, axis=0)

    def get_reverse(self):
        """Return a deep copy of the motif on the "-" strand with a reverse-complemented pfm."""
        if self.pfm is None:
            self.get_pfm()

        reverse_motif = copy.deepcopy(self)
        reverse_motif.strand = "-"
        # NOTE(review): the callee was lost in the source (only ",4)" remained);
        # MOODS.tools.reverse_complement is assumed - confirm.
        reverse_motif.pfm = MOODS.tools.reverse_complement(self.pfm, 4)
        return reverse_motif  # OneMotif object

    def get_pssm(self, ps=0.01):
        """Calculate the log-odds scoring matrix from the pfm.

        Args:
            ps: pseudocount weight; ps * background is added to every
                position before the frequencies are renormalised.

        Stores the result in self.pssm as a tuple of 4 tuples.
        """
        if self.pfm is None:
            self.get_pfm()

        bg_col = self.bg.reshape((-1, 1))  # column vector, so it broadcasts over positions
        adjusted = np.asarray(self.pfm, dtype=float) + ps * bg_col

        pssm = np.log(np.true_divide(adjusted, np.sum(adjusted, axis=0))) - np.log(bg_col)
        self.pssm = tuple(tuple(row) for row in pssm)

    def get_threshold(self, pvalue):
        """Calculate the score threshold for the given p-value and store it in self.threshold.

        Returns:
            self.
        """
        if self.pssm is None:
            self.get_pssm()

        # NOTE(review): callee and first two arguments were lost in the source;
        # MOODS.tools.threshold_from_p(pssm, bg, pvalue, 4) is assumed - confirm.
        self.threshold = MOODS.tools.threshold_from_p(self.pssm, self.bg, pvalue, 4)
        return self

    def calc_bit_score(self):
        """Calculate per-base bit heights (pfm * information content) into self.bits."""
        if self.pfm is None:
            self.get_pfm()

        pfm = np.asarray(self.pfm, dtype=float)
        pfm_arr = np.copy(pfm)
        pfm_arr[pfm_arr == 0] = np.nan  # avoids log2(0); these terms are zeroed below

        # Info content per pos
        entro = pfm_arr * np.log2(pfm_arr)
        entro[np.isnan(entro)] = 0
        info_content = 2 - (- np.sum(entro, axis=0))  # information content per position in motif
        self.bits = pfm * info_content

    def plot_logo(self):
        """Draw the motif as a sequence logo.

        Returns:
            (fig, ax) of the created matplotlib figure.
        """
        if self.bits is None:
            self.calc_bit_score()
        self.length = self.bits.shape[1]

        # Must exist before LETTERS is built, since every TextPath uses it
        fp = FontProperties(family='sans-serif', weight="bold")
        globscale = 1.35

        LETTERS = {"T": TextPath((-0.305, 0), "T", size=1, prop=fp),
                   "G": TextPath((-0.384, 0), "G", size=1, prop=fp),
                   "A": TextPath((-0.35, 0), "A", size=1, prop=fp),
                   "C": TextPath((-0.366, 0), "C", size=1, prop=fp)}
        COLOR_SCHEME = {'G': 'orange',
                        'A': "#CC0000",
                        'C': 'mediumblue',
                        'T': 'darkgreen'}

        def add_letter(base, x, y, scale, ax):
            """ Add letter to axis at positions x/y"""
            text = LETTERS[base]
            t = mpl.transforms.Affine2D().scale(1*globscale, scale*globscale) + \
                mpl.transforms.Affine2D().translate(x, y) + ax.transData
            p = PathPatch(text, lw=0, fc=COLOR_SCHEME[base], transform=t)
            if ax is not None:
                ax.add_artist(p)
            return p

        # Plot logo
        fig, ax = plt.subplots(figsize=(10, 3))
        max_y = 0
        for position in range(self.length):  # 0-based positions
            base_bit_tups = zip(OneMotif.bases, self.bits[:, position])

            # Go through bases sorted from lowest to highest bit score
            y = 0  # position to place letter
            for (base, score) in sorted(base_bit_tups, key=lambda tup: tup[1]):
                add_letter(base, position+1, y, score, ax)
                y += score
            max_y = max(max_y, y)

        plt.xlim((0.2, self.length+0.8))
        plt.ylim((0, max_y))

        return (fig, ax)
def get_motif_format(content):
    """Estimate the format of a motif file from its content.

    The checks are anchored at the start of content (re.match) and use
    re.DOTALL so that ".+" also spans line breaks.

    Args:
        content: full text of the motif file.

    Returns:
        "meme" if the content starts with "MEME version",
        "jaspar" if it starts with ">" and later contains an "A" followed by "[",
        "pfm" if it starts with ">" otherwise,
        "unknown" if none of the above match.
    """
    if re.match(r"MEME version.+", content, re.DOTALL) is not None:
        return "meme"

    # jaspar must be tested before pfm: both start with a ">" header line
    if re.match(r">.+A.+\[", content, re.DOTALL) is not None:
        return "jaspar"

    if re.match(r">.+", content, re.DOTALL) is not None:
        return "pfm"

    return "unknown"
def convert_motif(content, output_format):
    """Convert the text of a motif file to another motif format.

    The input format is detected with get_motif_format().

    Args:
        content: full text of the input motif file.
        output_format: "pfm", "jaspar" or "meme".

    Returns:
        The converted text. When input and output format are equal, the
        content is returned as is (without any leading MEME header). For an
        unrecognised input format or unsupported combination the result is
        an empty string.
    """
    bases = ["A", "C", "G", "T"]
    plain_formats = ("pfm", "jaspar")
    input_format = get_motif_format(content)
    pieces = []

    def format_row(base_idx, fields):
        """Format one base row (list of strings) for pfm or jaspar output."""
        if output_format == "jaspar":
            return "{0} [ {1} ] \n".format(bases[base_idx], "\t".join(fields))
        return "\t".join(fields) + "\n"

    if input_format == output_format:
        # remove any meme headers
        m = re.match(r"^(MEME.*?)(MOTIF.*)", content, re.DOTALL)
        pieces.append((m.group(2) if m else content) + "\n")

    ################ pfm <-> jaspar ################
    elif input_format in plain_formats and output_format in plain_formats:
        row_idx = None  # None until the first header line has been seen
        for line in content.split("\n"):
            if line.startswith(">"):
                pieces.append(line + "\n")  # header line + \n as this was removed in split
                row_idx = -1
                continue

            m = re.match(r".*?([\d]+[\d\.\s]+).*?", line)
            if m is None or row_idx is None:
                continue

            row_idx += 1  # row_idx is 0 for first pfm line
            pieces.append(format_row(row_idx, m.group(1).rstrip().split()))
            if row_idx == 3:  # last line
                pieces.append("\n")

    ################ meme -> jaspar/pfm ################
    elif input_format == "meme" and output_format in plain_formats:
        motif_content = []  # four lists (ACGT) once a motif header was seen
        lines = content.split("\n") + ["MOTIF"]  # add motif to end to write out motif
        last_idx = len(lines) - 1

        for idx, line in enumerate(lines):
            if line.startswith("MOTIF"):
                # Write any previous motif saved
                for i, column in enumerate(motif_content):  # column = list of values
                    pieces.append(format_row(i, column))

                # Get ready for this motif; the last line is only the sentinel
                if idx < last_idx:
                    columns = line.strip().split()
                    if len(columns) > 2:  # MOTIF, ID, NAME
                        motif_id, name = columns[1], columns[2]
                    elif len(columns) == 2:  # MOTIF, ID
                        motif_id, name = columns[1], columns[1]
                    else:  # bare "MOTIF" line
                        motif_id, name = ".", "."

                    pieces.append(">{0}\t{1}\n".format(motif_id, name))
                    motif_content = [[] for _ in range(4)]  # ACGT

            # starts with any number of spaces (or none) followed by numbers
            elif motif_content and re.match(r"^[\s]*([\d\.\s]+)$", line):
                # A meme row is one position; spread its values over the base rows
                for i, col in enumerate(line.rstrip().split()):
                    motif_content[i].append(col)

    ################ jaspar/pfm -> meme ################
    elif input_format in plain_formats and output_format == "meme":
        motif_content = []  # no motifs found yet, this is empty
        lines = content.split("\n") + [">"]  # add ">" at the end to make sure that the last motif is saved
        last_idx = len(lines) - 1

        for idx, line in enumerate(lines):
            if line.startswith(">"):
                # Write any previous motif saved
                if len(motif_content) > 0:
                    motif_w = len(motif_content[0])
                    n_sites = int(round(sum(float(motif_content[i][0]) for i in range(4)), 0))  # sum of first site freqs

                    pieces.append("letter-probability matrix: alength=4 w={0} nsites={1} E=0\n".format(motif_w, n_sites))
                    for i in range(motif_w):
                        row = [float(motif_content[j][i]) for j in range(4)]  # values of all bases at position i
                        row_sites = round(sum(row), 0)
                        pieces.append(" ".join("{0:.5f}".format(value / row_sites) for value in row) + "\n")
                    pieces.append("\n")

                # Get ready for this motif; the last line is only the sentinel
                if idx < last_idx:
                    columns = line[1:].strip().split()  # [1:] to remove > from header
                    # NOTE(review): the source shows the two assignments below
                    # without their branching; a fallback for headers lacking
                    # both id and name is assumed - confirm.
                    if len(columns) >= 2:
                        motif_id, name = columns[0], columns[1]
                    else:
                        motif_id, name = ".", "."

                    pieces.append("MOTIF {0} {1}\n".format(motif_id, name))
                    motif_content = []  # list of rows from jaspar format motif
                continue

            m = re.match(r".*?([\d]+[\d\.\s]+).*?", line)
            if m:
                motif_content.append(m.group(1).rstrip().split())

    return "".join(pieces)
def pfm_to_motifs(content):
    """Content of a pfm motif file to MotifList format.

    Args:
        content: full text of a pfm file; every motif starts with a
            ">ID NAME" header followed by lines of numbers.

    Returns:
        MotifList with one OneMotif per header; each numeric line after a
        header is appended to that motif's counts as a list of floats.

    Exits (sys.exit) if a motif does not have exactly four rows of counts.
    """
    motiflist = MotifList([])

    for line in content.split("\n"):
        if line.startswith(">"):
            # Read name for this motif
            columns = line.replace(">", "").rstrip().split()
            motifid = columns[0] if len(columns) > 0 else ""
            alt_name = columns[1] if len(columns) > 1 else ""  # name not given

            motiflist.append(OneMotif(motifid, alt_name, []))  # counts start as an empty list

        elif len(motiflist) > 0:  # if at least one header line was found
            m = re.match(r".*?([\d]+[\d\.\s]+).*?", line)
            if m:
                pfm_fields = [float(field) for field in m.group(1).rstrip().split()]
                motiflist[-1].counts.append(pfm_fields)

    # check correct format of pfms
    for motif in motiflist:
        if len(motif.counts) != 4:
            sys.exit("ERROR: Motif {0} has an unexpected format and could not be read".format(motif))

    return motiflist