From 118cbfacd4c5e6b6480a66bfbb08a1742d4f7f76 Mon Sep 17 00:00:00 2001
From: sepro <sebastian.proost@gmail.com>
Date: Mon, 13 Mar 2017 17:59:40 +0100
Subject: [PATCH] first implementation of a gff3 parser

---
 helper/parse_gff.py | 100 ++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 100 insertions(+)
 create mode 100644 helper/parse_gff.py

diff --git a/helper/parse_gff.py b/helper/parse_gff.py
new file mode 100644
index 0000000..ed06da7
--- /dev/null
+++ b/helper/parse_gff.py
@@ -0,0 +1,100 @@
+import argparse
+import sys
+import json
+
+LOCUS_FEATURES = ['gene']
+TRANSCRIPT_FEATURES = ['mRNA', 'transcript']
+PARTS_FEATURES = ['exon', 'CDS', 'three_prime_UTR', 'five_prime_UTR']
+
+COLUMNS = ['chr', 'source', 'feature', 'start', 'stop', 'score', 'strand', 'phase', 'attributes']
+
+ID_ATTRIBUTE = 'ID'
+PARENT_ATTRIBUTE = 'Parent'
+
+
+def parse_attributes(attribute_str):
+    attributes = attribute_str.split(';')
+    output = {}
+
+    for attribute in attributes:
+        key, value = attribute.split('=')
+        output[key] = value
+
+    return output
+
+
+def parse_line(line):
+    parts = line.strip().split('\t')
+
+    output = {}
+
+    if len(parts) != len(COLUMNS):
+        raise Exception('Incorrect number of columns in line.', parts, COLUMNS)
+
+    for key, value in zip(COLUMNS, parts):
+        if key == 'attributes':
+            output[key] = parse_attributes(value)
+        elif key == 'start' or key == 'stop':
+            output[key] = int(value)
+        else:
+            output[key] = value
+
+    return output
+
+
+def parse_gff3(filename):
+    genes = {}
+    transcript_to_locus = {}
+
+    with open(filename) as gff_in:
+        for line in gff_in:
+            # Skip comments
+            if not line.strip()[0] == '#':
+                line_data = parse_line(line)
+
+                # Every line needs a valid ID
+                if ID_ATTRIBUTE in line_data['attributes'].keys():
+
+                    if line_data['feature'] in LOCUS_FEATURES:
+                        genes[line_data['attributes'][ID_ATTRIBUTE]] = {
+                            'data': line_data,
+                            'transcripts': {}
+                        }
+
+                    elif line_data['feature'] in TRANSCRIPT_FEATURES:
+                        if PARENT_ATTRIBUTE in line_data['attributes'].keys():
+                            parent_id = line_data['attributes'][PARENT_ATTRIBUTE]
+
+                            if parent_id in genes.keys():
+                                genes[parent_id]['transcripts'][line_data['attributes'][ID_ATTRIBUTE]] = {
+                                        'data': line_data,
+                                        'parts': []
+                                    }
+
+                                transcript_to_locus[line_data['attributes'][ID_ATTRIBUTE]] = \
+                                    line_data['attributes'][PARENT_ATTRIBUTE]
+
+                    elif line_data['feature'] in PARTS_FEATURES:
+
+                        if PARENT_ATTRIBUTE in line_data['attributes'].keys():
+                            parent_id = line_data['attributes'][PARENT_ATTRIBUTE]
+                            grandparent_id = transcript_to_locus[parent_id]
+
+                            genes[grandparent_id]['transcripts'][parent_id]['parts'].append(line_data)
+
+    return genes
+
+
+def filter_genes(genes):
+    return genes
+
+
+def write_gff(genes, output=sys.stdout):
+    print(json.dumps(genes, sort_keys=True, indent=4, separators=(',', ': ')))
+
+if __name__ == "__main__":
+    data = parse_gff3(sys.argv[1])
+
+    filtered_genes = filter_genes(data)
+
+    write_gff(filtered_genes)