Skip to content
Permalink
master
Switch branches/tags

Name already in use

A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
Go to file
 
 
Cannot retrieve contributors at this time
#!/bin/env python
import os
import sys
import re
import getopt
def fasta(hin=sys.stdin):
"""
A simple generator to traverse over a FastA file
"""
seqname = None
seq = None
for line in hin:
line = line.rstrip()
if len(line) == 0:
continue
if line[0] == ">":
if seqname is not None:
yield seqname, seq
seqname = line[1:]
seq = ""
else:
seq += line
if seqname is not None:
yield seqname, seq
def iupac_code(seq):
'''
decodes a sequence which is IUPAC and convert
this to a regular expression friendly sequence
'''
# convert IUPAC bases
seq = seq.replace('R', '[A,G]')
seq = seq.replace('Y', '[C,T]')
seq = seq.replace('S', '[G,C]')
seq = seq.replace('W', '[A,T]')
seq = seq.replace('K', '[G,T]')
seq = seq.replace('M', '[A,C]')
seq = seq.replace('B', '[C,G,T]')
seq = seq.replace('D', '[A,G,T]')
seq = seq.replace('H', '[A,C,T]')
seq = seq.replace('V', '[A,C,G]')
seq = seq.replace('N', '[A,C,T,G]')
# returns the sequence
return seq
def main(hin=sys.stdin, hout=sys.stdout, recognition_sequences=None):
"""
the main program loop
"""
regexps = []
for label,site in recognition_sequences:
expr = re.compile(iupac_code(site), flags=re.IGNORECASE)
regexps.append((label, expr))
for name, sequence in fasta(hin):
matches = []
# find the matches for all the recognition sites
for label, expr in regexps:
for match in expr.finditer(sequence):
matches.append((match.start(), match.end(), label))
# sort the matches on coordinate
matches.sort()
# write the output
for start, end, matchseq in matches:
hout.write("%s\t%d\t%d\t%s\n" % (name, start, end, matchseq))
if __name__ == "__main__":
def usage(message="", error=None):
"""
Print the usage information
"""
sys.stdout.write("""
Message %(message)s
Called as %(commandline)s
Usage:
python %(tool)s -f [fasta file] -b [output bed file] -s [sequence]
Options:
-f/--fasta [file] a fasta file with the sequences to search in.
-b/--bed [file] a bed file to which to write the found occurences
-s/--sequence [string] the label, sequence combination to search for (label:IUPAC sequence). This can be added multiple times
-h/--help [] print this message
""" % {
"message": message,
"tool": sys.argv[0],
"commandline": " ".join(sys.argv)
})
#
if error is not None:
sys.exit(error)
# main variables
fin = sys.stdin
fout = sys.stdout
recognition = []
# option parsing
shortopt = "f:b:s:h"
longopt = ["fasta=", "bed=", "sequence=", "help"]
try:
opts, args = getopt.getopt(sys.argv[1:], shortopt, longopt)
for o, a in opts:
if o in ('-f', '--fasta'):
fin = open(a, "r")
elif o in ('-b', '--bed'):
fout = open(a, "w")
elif o in ('-s', '--sequence'):
label = re.sub(":.*$", "", a)
site = re.sub("^.*:", "", a)
recognition.append((label, site))
elif o in ('-h', '--help'):
usage("Help was asked", error=0)
except getopt.GetoptError as err:
usage(str(err), error=2)
# run the main program loop
main(hin=fin, hout=fout, recognition_sequences=recognition)
if not fin.closed and fin is not sys.stdin:
fin.close()
if not fout.closed and fout is not sys.stdout:
fout.close()