Skip to content

Commit

Permalink
updated parse_gff.py, now also works in case the ID attribute is missing for the lowest level features (CDS, exons). An ID will be added.
Browse files Browse the repository at this point in the history
  • Loading branch information
proost committed May 11, 2017
1 parent 9dc8cf5 commit 8de768d
Showing 1 changed file with 11 additions and 1 deletion.
12 changes: 11 additions & 1 deletion helper/parse_gff.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import argparse
import sys
import json
from collections import OrderedDict
from collections import OrderedDict, defaultdict

LOCUS_FEATURES = ['gene']
TRANSCRIPT_FEATURES = ['mRNA', 'transcript']
Expand Down Expand Up @@ -65,12 +65,22 @@ def parse_gff3(filename):
genes = OrderedDict()
transcript_to_locus = {}

count_per_transcript = defaultdict(lambda: 1)

with open(filename) as gff_in:
for line in gff_in:
# Skip comments
if not line.strip()[0] == '#':
line_data = parse_line(line)

# Parts (e.g. CDS or Exon) might not have an ID. One will be added here
if ID_ATTRIBUTE not in line_data['attributes'].keys() and line_data['feature'] in PARTS_FEATURES:
if PARENT_ATTRIBUTE in line_data['attributes'].keys():
counter_id = line_data['attributes'][PARENT_ATTRIBUTE] + '.' + line_data['feature'] + '.'
new_id = counter_id + str(count_per_transcript[counter_id])
count_per_transcript[counter_id] += 1
line_data['attributes'][ID_ATTRIBUTE] = new_id

# Every line needs a valid ID
if ID_ATTRIBUTE in line_data['attributes'].keys():

Expand Down

0 comments on commit 8de768d

Please sign in to comment.