Skip to content

Commit

Permalink
added filter
Browse files Browse the repository at this point in the history
  • Loading branch information
proost committed May 22, 2017
1 parent 17f8ff0 commit 556958a
Show file tree
Hide file tree
Showing 3 changed files with 71 additions and 9 deletions.
2 changes: 0 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -56,5 +56,3 @@ element proteins.
# TODO

* Add support for cluster
* Parameters for analyze (desired cutoff) + determine good thresholds
* Script to remove TEs from initial input producing a final *clean* fasta file
12 changes: 6 additions & 6 deletions TE_DB/maize_uniprotkb_TE.fasta
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
>sp|P15268|MOSA_MAIZE Autonomous transposable element EN-1 mosaic protein OS=Zea mays PE=2 SV=1
>sp|P15268|MOSA_MAIZE
MFRMDSSGRRSRSRRSRGSSGAPNMFEGTTTSRSRQEQLLASLEQMRGSSGPSNTEGTTS
RAADLVAPTMAPTAEAAVDAEAAVDAEAEEAAAELDDGEETSGADASTEEAATQAPPRRA
IRYRRSLTLKPSKPFDQRRVIEPKGTRAWKEVSWDGTGHRTPILTELGICLRFAYPAMVT
Expand All @@ -10,7 +10,7 @@ GVVPRRSYTRRSNFSAGSNRPRRPSAREGELLEKMTQMEESMAQYKQQVQQQMQQMQNWM
LHQMYGGAGTQFGMPPFQQPPIITHPVSGQSSDRSTAAADGSQGSATSVQDQLMPLGVIG
GQMMPWAPRQPGIWPPMQTQMPPPMPWGFPPRGQSQSPGLPSHSPGSGSGSHHASPPPDQ
STFMDLLMNTSGGGSNDPPTE
>sp|P15718|POLB_MAIZE Putative Pol polyprotein from transposon element Bs1 OS=Zea mays PE=4 SV=1
>sp|P15718|POLB_MAIZE
MEPTLQSAMEEQLKILREISDRLAAQEARWRSRESTVTQHSRSIHDLEVAMASAPSATLR
AELDAPVAVTYERLDATEAASAARVSALESTTATFDMLFDNSDIVEKFNAEVVADDWGGL
FGQHAHPLDSGGARLLLPSAALPFASATDTAFSSIYGVECGTIHTYTAALTRPAALHKPL
Expand All @@ -24,7 +24,7 @@ SADTRLLEGDPLKIDQSALTGNFCICSIVAGMLVEFIVMYPIQDMVYRPRIDKLLVLLIG
GIPIAMPTVLSVTMSIGAYRLAQQGAITKRMTTIEEMAGMDVPCSDKTGTLPWTKLTVIK
SLVDVFQRGADQDAVILMDARASCTKNQDAIEATIVSMLAAPKEACAGVQEIQFLPFNPN
DKRTAVTYMSLIYALSPGKA
>sp|P08770|TRA1_MAIZE Putative AC transposase OS=Zea mays PE=2 SV=2
>sp|P08770|TRA1_MAIZE
MTPPVGNNPPSGSAIRLAKLMSTTRAPSTRKTNSVFSAYAQGLKRKAEASSSRIQNVRAR
ARGHGCGRTSPSSSTAEAERHFIQSVSSSNANGTATDPSQDDMAIVHEPQPQPQPQPEPQ
PQPQPEPEEEAPQKRAKKCTSDVWQHFTKKEIEVEVDGKKYVQVWGHCNFPNCKAKYRAE
Expand All @@ -39,7 +39,7 @@ GDSYKVHVDDFVRVIRKLYQFYSSCSPSAPKTKTTTNDSMDDTLMENEDDEFQNYLHELK
DYDQVESNELDKYMSEPLLKHSGQFDILSWWRGRVAEYPILTQIARDVLAIQVSTVASES
AFSAGGRVVDPYRNRLGSEIVEALICTKDWVAASRKGATYFPTMIGDLEVLDSVIAAATN
HENHMDEDEDAIEFSKNNEDVASGSSP
>sp|P03010|TRAC9_MAIZE Putative AC9 transposase OS=Zea mays PE=4 SV=1
>sp|P03010|TRAC9_MAIZE
MYVHVRVGMDVAAHHHHHQQLRRERHFIQSVSSSNANGTATDPSQDDMAIVHEPQPQPQP
QPEPQPQPQPEPEEEAPQKRAKKCTSDVWQHFTKKEIEVEVDGKKYVQVWGHCNFPNCKA
KYRAEGHHGTSGFRNHLRTSHSLVKGQLCLKSEKDHGKDINLIEPYKYDEVVSLKKLHLA
Expand All @@ -54,10 +54,10 @@ TKTTTNDSMDDTLMENEDDEFQNYLHELKDYDQVESNELDKYMSEPLLKHSGQFDILSWW
RGRVAEYPILTQIARDVLAIQVSTVASESAFSAGGRVVDPYRNRLGSEIVEALICTKDWV
AASRKGEWHICYNEVPIYSYSTIILLILMHICVIQGATYFPTMIGDLEVLDSVIAAATNH
ENHMDEVFKDYYLLRAWAINLLLFTTVLMHGLFLSPCFDACALLPSRVILPAWLALTDF
>sp|P08771|YAC1_MAIZE Transposable element activator uncharacterized 12 kDa protein OS=Zea mays PE=4 SV=1
>sp|P08771|YAC1_MAIZE
MQMVQLQIRVKMIWLLFMNHNHNHNHNQNHNHSHNLNPKKKHHRRGQRSAHRMYGSISPR
RKLKWRSMERNTFRYGDIATFLIARLSIGLRVIMEQADFEIT
>sp|P03936|YAC9_MAIZE Transposable element activator uncharacterized 23 kDa protein OS=Zea mays PE=4 SV=1
>sp|P03936|YAC9_MAIZE
MTQQLGAMVLTCCTAKCCEPVSSRGDRETADGRMGERRAVEVWRTADRRWRMADGRTADG
RAVEWRSGRMGGPRRDGRVASSGVEGGPWMAASASGVPRHGRHRVWCLVQPSGRPAGRQG
ESERAGESETRVGVGVRLAASGLRRGRVAACECVMLLLVWCLPPGREAEQRSLGISYMGW
Expand Down
66 changes: 65 additions & 1 deletion detector.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@

import subprocess
import shlex

from collections import OrderedDict

BUILD_DB_CMD = 'makeblastdb -dbtype prot -in %s -out %s'
RUN_BLAST_CMD = 'blastp -query %s -db %s -out %s -outfmt 6 -num_threads %d'
Expand All @@ -25,6 +25,45 @@ def read_blast_line(line):
return {h: t(v) for h, v, t in zip(header, line.strip().split(), types)}


def read_fasta(fh):
"""
Reads a fasta file to an ordered dict
:param fh: fasta file to read
:return:
"""
output = OrderedDict()
name = ''
sequence = []

for line in fh:
line = line.rstrip()
if line.startswith(">"):
if not name == '':
output[name] = ''.join(sequence)
name = line.lstrip('>')
sequence = []
else:
sequence.append(line)

# add last gene
output[name] = ''.join(sequence)

return output


def read_list(fh):
"""
Reads list from filehandle
:param fh: filehandle
:return: returns list of elements in the file
"""
return [item.strip() for item in fh.readlines()]




@click.group()
def cli():
"""
Expand Down Expand Up @@ -133,3 +172,28 @@ def analyze(blast_results, output):
click.secho("All done!", fg='green', bold=True)


@cli.command()
@click.argument('fasta', type=click.File())
@click.argument('te', type=click.File())
@click.argument('output', type=click.File(mode='w'))
def filter(fasta, te, output):
"""
Reads a fasta file and list with putative TEs and removes those elements from the fasta file
:param fasta: path to fasta file
:param te: path to list with putative TEs
:param output: path of new filtered fasta file
"""
click.secho("Reading fasta-file %s ... " % fasta.name, fg='green', bold=True)
fasta_data = read_fasta(fasta)

click.secho("Reading TEs from %s ... " % te.name, fg='green', bold=True)
transposable_elements = read_list(te)

for k, v in fasta_data.items():
if k not in transposable_elements:
print('>%s' % k, file=output)
print(v, file=output)

# done
click.secho("All done!", fg='green', bold=True)

0 comments on commit 556958a

Please sign in to comment.