From 556958a469d2d7a36fb277c6452544253f1f04b1 Mon Sep 17 00:00:00 2001 From: sepro Date: Mon, 22 May 2017 10:32:22 +0200 Subject: [PATCH] added filter --- README.md | 2 -- TE_DB/maize_uniprotkb_TE.fasta | 12 +++---- detector.py | 66 +++++++++++++++++++++++++++++++++- 3 files changed, 71 insertions(+), 9 deletions(-) diff --git a/README.md b/README.md index 636dd43..f3a0e94 100644 --- a/README.md +++ b/README.md @@ -56,5 +56,3 @@ element proteins. # TODO * Add support for cluster - * Parameters for analyze (desired cutoff) + determine good thresholds - * Script to remove TEs from initial input producing a final *clean* fasta file diff --git a/TE_DB/maize_uniprotkb_TE.fasta b/TE_DB/maize_uniprotkb_TE.fasta index 1645471..6dc4f76 100644 --- a/TE_DB/maize_uniprotkb_TE.fasta +++ b/TE_DB/maize_uniprotkb_TE.fasta @@ -1,4 +1,4 @@ ->sp|P15268|MOSA_MAIZE Autonomous transposable element EN-1 mosaic protein OS=Zea mays PE=2 SV=1 +>sp|P15268|MOSA_MAIZE MFRMDSSGRRSRSRRSRGSSGAPNMFEGTTTSRSRQEQLLASLEQMRGSSGPSNTEGTTS RAADLVAPTMAPTAEAAVDAEAAVDAEAEEAAAELDDGEETSGADASTEEAATQAPPRRA IRYRRSLTLKPSKPFDQRRVIEPKGTRAWKEVSWDGTGHRTPILTELGICLRFAYPAMVT @@ -10,7 +10,7 @@ GVVPRRSYTRRSNFSAGSNRPRRPSAREGELLEKMTQMEESMAQYKQQVQQQMQQMQNWM LHQMYGGAGTQFGMPPFQQPPIITHPVSGQSSDRSTAAADGSQGSATSVQDQLMPLGVIG GQMMPWAPRQPGIWPPMQTQMPPPMPWGFPPRGQSQSPGLPSHSPGSGSGSHHASPPPDQ STFMDLLMNTSGGGSNDPPTE ->sp|P15718|POLB_MAIZE Putative Pol polyprotein from transposon element Bs1 OS=Zea mays PE=4 SV=1 +>sp|P15718|POLB_MAIZE MEPTLQSAMEEQLKILREISDRLAAQEARWRSRESTVTQHSRSIHDLEVAMASAPSATLR AELDAPVAVTYERLDATEAASAARVSALESTTATFDMLFDNSDIVEKFNAEVVADDWGGL FGQHAHPLDSGGARLLLPSAALPFASATDTAFSSIYGVECGTIHTYTAALTRPAALHKPL @@ -24,7 +24,7 @@ SADTRLLEGDPLKIDQSALTGNFCICSIVAGMLVEFIVMYPIQDMVYRPRIDKLLVLLIG GIPIAMPTVLSVTMSIGAYRLAQQGAITKRMTTIEEMAGMDVPCSDKTGTLPWTKLTVIK SLVDVFQRGADQDAVILMDARASCTKNQDAIEATIVSMLAAPKEACAGVQEIQFLPFNPN DKRTAVTYMSLIYALSPGKA ->sp|P08770|TRA1_MAIZE Putative AC transposase OS=Zea mays PE=2 SV=2 +>sp|P08770|TRA1_MAIZE MTPPVGNNPPSGSAIRLAKLMSTTRAPSTRKTNSVFSAYAQGLKRKAEASSSRIQNVRAR ARGHGCGRTSPSSSTAEAERHFIQSVSSSNANGTATDPSQDDMAIVHEPQPQPQPQPEPQ PQPQPEPEEEAPQKRAKKCTSDVWQHFTKKEIEVEVDGKKYVQVWGHCNFPNCKAKYRAE @@ -39,7 +39,7 @@ GDSYKVHVDDFVRVIRKLYQFYSSCSPSAPKTKTTTNDSMDDTLMENEDDEFQNYLHELK DYDQVESNELDKYMSEPLLKHSGQFDILSWWRGRVAEYPILTQIARDVLAIQVSTVASES AFSAGGRVVDPYRNRLGSEIVEALICTKDWVAASRKGATYFPTMIGDLEVLDSVIAAATN HENHMDEDEDAIEFSKNNEDVASGSSP ->sp|P03010|TRAC9_MAIZE Putative AC9 transposase OS=Zea mays PE=4 SV=1 +>sp|P03010|TRAC9_MAIZE MYVHVRVGMDVAAHHHHHQQLRRERHFIQSVSSSNANGTATDPSQDDMAIVHEPQPQPQP QPEPQPQPQPEPEEEAPQKRAKKCTSDVWQHFTKKEIEVEVDGKKYVQVWGHCNFPNCKA KYRAEGHHGTSGFRNHLRTSHSLVKGQLCLKSEKDHGKDINLIEPYKYDEVVSLKKLHLA @@ -54,10 +54,10 @@ TKTTTNDSMDDTLMENEDDEFQNYLHELKDYDQVESNELDKYMSEPLLKHSGQFDILSWW RGRVAEYPILTQIARDVLAIQVSTVASESAFSAGGRVVDPYRNRLGSEIVEALICTKDWV AASRKGEWHICYNEVPIYSYSTIILLILMHICVIQGATYFPTMIGDLEVLDSVIAAATNH ENHMDEVFKDYYLLRAWAINLLLFTTVLMHGLFLSPCFDACALLPSRVILPAWLALTDF ->sp|P08771|YAC1_MAIZE Transposable element activator uncharacterized 12 kDa protein OS=Zea mays PE=4 SV=1 +>sp|P08771|YAC1_MAIZE MQMVQLQIRVKMIWLLFMNHNHNHNHNQNHNHSHNLNPKKKHHRRGQRSAHRMYGSISPR RKLKWRSMERNTFRYGDIATFLIARLSIGLRVIMEQADFEIT ->sp|P03936|YAC9_MAIZE Transposable element activator uncharacterized 23 kDa protein OS=Zea mays PE=4 SV=1 +>sp|P03936|YAC9_MAIZE MTQQLGAMVLTCCTAKCCEPVSSRGDRETADGRMGERRAVEVWRTADRRWRMADGRTADG RAVEWRSGRMGGPRRDGRVASSGVEGGPWMAASASGVPRHGRHRVWCLVQPSGRPAGRQG ESERAGESETRVGVGVRLAASGLRRGRVAACECVMLLLVWCLPPGREAEQRSLGISYMGW diff --git a/detector.py b/detector.py index 244102c..72598a1 100644 --- a/detector.py +++ b/detector.py @@ -5,7 +5,7 @@ import subprocess import shlex - +from collections import OrderedDict BUILD_DB_CMD = 'makeblastdb -dbtype prot -in %s -out %s' RUN_BLAST_CMD = 'blastp -query %s -db %s -out %s -outfmt 6 -num_threads %d' @@ -25,6 +25,45 @@ def read_blast_line(line): return {h: t(v) for h, v, t in zip(header, line.strip().split(), types)} +def read_fasta(fh): + """ + Reads a fasta file to an ordered dict + + :param fh: fasta file to read + :return: + """ + output = OrderedDict() + name = '' + sequence = [] + + for line in fh: + line = line.rstrip() + if line.startswith(">"): + if not name == '': + output[name] = ''.join(sequence) + name = line.lstrip('>') + sequence = [] + else: + sequence.append(line) + + # add last gene + output[name] = ''.join(sequence) + + return output + + +def read_list(fh): + """ + Reads list from filehandle + + :param fh: filehandle + :return: returns list of elements in the file + """ + return [item.strip() for item in fh.readlines()] + + + + @click.group() def cli(): """ @@ -133,3 +172,28 @@ def analyze(blast_results, output): click.secho("All done!", fg='green', bold=True) +@cli.command() +@click.argument('fasta', type=click.File()) +@click.argument('te', type=click.File()) +@click.argument('output', type=click.File(mode='w')) +def filter(fasta, te, output): + """ + Reads a fasta file and list with putative TEs and removes those elements from the fasta file + + :param fasta: path to fasta file + :param te: path to list with putative TEs + :param output: path of new filtered fasta file + """ + click.secho("Reading fasta-file %s ... " % fasta.name, fg='green', bold=True) + fasta_data = read_fasta(fasta) + + click.secho("Reading TEs from %s ... " % te.name, fg='green', bold=True) + transposable_elements = read_list(te) + + for k, v in fasta_data.items(): + if k not in transposable_elements: + print('>%s' % k, file=output) + print(v, file=output) + + # done + click.secho("All done!", fg='green', bold=True)