Permalink
Cannot retrieve contributors at this time
Name already in use
A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
TOuCAN/bin/match_sequences.py
Go to fileThis commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
139 lines (113 sloc)
3.65 KB
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/env python | |
import os | |
import sys | |
import re | |
import getopt | |
def fasta(hin=sys.stdin): | |
""" | |
A simple generator to traverse over a FastA file | |
""" | |
seqname = None | |
seq = None | |
for line in hin: | |
line = line.rstrip() | |
if len(line) == 0: | |
continue | |
if line[0] == ">": | |
if seqname is not None: | |
yield seqname, seq | |
seqname = line[1:] | |
seq = "" | |
else: | |
seq += line | |
if seqname is not None: | |
yield seqname, seq | |
def iupac_code(seq): | |
''' | |
decodes a sequence which is IUPAC and convert | |
this to a regular expression friendly sequence | |
''' | |
# convert IUPAC bases | |
seq = seq.replace('R', '[A,G]') | |
seq = seq.replace('Y', '[C,T]') | |
seq = seq.replace('S', '[G,C]') | |
seq = seq.replace('W', '[A,T]') | |
seq = seq.replace('K', '[G,T]') | |
seq = seq.replace('M', '[A,C]') | |
seq = seq.replace('B', '[C,G,T]') | |
seq = seq.replace('D', '[A,G,T]') | |
seq = seq.replace('H', '[A,C,T]') | |
seq = seq.replace('V', '[A,C,G]') | |
seq = seq.replace('N', '[A,C,T,G]') | |
# returns the sequence | |
return seq | |
def main(hin=sys.stdin, hout=sys.stdout, recognition_sequences=None): | |
""" | |
the main program loop | |
""" | |
regexps = [] | |
for label,site in recognition_sequences: | |
expr = re.compile(iupac_code(site), flags=re.IGNORECASE) | |
regexps.append((label, expr)) | |
for name, sequence in fasta(hin): | |
matches = [] | |
# find the matches for all the recognition sites | |
for label, expr in regexps: | |
for match in expr.finditer(sequence): | |
matches.append((match.start(), match.end(), label)) | |
# sort the matches on coordinate | |
matches.sort() | |
# write the output | |
for start, end, matchseq in matches: | |
hout.write("%s\t%d\t%d\t%s\n" % (name, start, end, matchseq)) | |
if __name__ == "__main__": | |
def usage(message="", error=None): | |
""" | |
Print the usage information | |
""" | |
sys.stdout.write(""" | |
Message %(message)s | |
Called as %(commandline)s | |
Usage: | |
python %(tool)s -f [fasta file] -b [output bed file] -s [sequence] | |
Options: | |
-f/--fasta [file] a fasta file with the sequences to search in. | |
-b/--bed [file] a bed file to which to write the found occurences | |
-s/--sequence [string] the label, sequence combination to search for (label:IUPAC sequence). This can be added multiple times | |
-h/--help [] print this message | |
""" % { | |
"message": message, | |
"tool": sys.argv[0], | |
"commandline": " ".join(sys.argv) | |
}) | |
# | |
if error is not None: | |
sys.exit(error) | |
# main variables | |
fin = sys.stdin | |
fout = sys.stdout | |
recognition = [] | |
# option parsing | |
shortopt = "f:b:s:h" | |
longopt = ["fasta=", "bed=", "sequence=", "help"] | |
try: | |
opts, args = getopt.getopt(sys.argv[1:], shortopt, longopt) | |
for o, a in opts: | |
if o in ('-f', '--fasta'): | |
fin = open(a, "r") | |
elif o in ('-b', '--bed'): | |
fout = open(a, "w") | |
elif o in ('-s', '--sequence'): | |
label = re.sub(":.*$", "", a) | |
site = re.sub("^.*:", "", a) | |
recognition.append((label, site)) | |
elif o in ('-h', '--help'): | |
usage("Help was asked", error=0) | |
except getopt.GetoptError as err: | |
usage(str(err), error=2) | |
# run the main program loop | |
main(hin=fin, hout=fout, recognition_sequences=recognition) | |
if not fin.closed and fin is not sys.stdin: | |
fin.close() | |
if not fout.closed and fout is not sys.stdout: | |
fout.close() |