From e5f61e717c51234f9ac3f323baa5f732cea1c719 Mon Sep 17 00:00:00 2001 From: sepro Date: Thu, 1 Dec 2016 17:31:12 +0100 Subject: [PATCH] removed some obsolete parsers modified gene family models to support deletes... --- master_build.template.py | 3 +- planet/controllers/admin/controls.py | 8 +- planet/forms/admin/add_family.py | 2 +- planet/models/gene_families.py | 105 +++++------------- .../mamut.families.orthofinder.txt | 2 + utils/parser/plaza/families.py | 16 --- utils/parser/plaza/interpro.py | 21 ---- 7 files changed, 34 insertions(+), 123 deletions(-) create mode 100644 tests/data/comparative_data/mamut.families.orthofinder.txt delete mode 100644 utils/parser/plaza/families.py delete mode 100644 utils/parser/plaza/interpro.py diff --git a/master_build.template.py b/master_build.template.py index 64db4b6..cae28ed 100644 --- a/master_build.template.py +++ b/master_build.template.py @@ -273,7 +273,8 @@ print("Adding Families") print("===============") - families_id = GeneFamily.add_families_from_plaza("data/genefamily_data.hom.csv", "PLAZA 2.5 Homologous gene families") + # TODO OBSOLETE + # families_id = GeneFamily.add_families_from_plaza("data/genefamily_data.hom.csv", "PLAZA 2.5 Homologous gene families") print("Adding Expression Plots") print("=======================") diff --git a/planet/controllers/admin/controls.py b/planet/controllers/admin/controls.py index cbdf81c..bee4aad 100644 --- a/planet/controllers/admin/controls.py +++ b/planet/controllers/admin/controls.py @@ -381,12 +381,12 @@ def add_family(): fd, temp_path = mkstemp() open(temp_path, 'wb').write(family_data) - if source == 'plaza': - GeneFamily.add_families_from_plaza(temp_path, method_description) - flash('Added Gene families from file %s' % form.file.name, 'success') - elif source == 'mcl': + if source == 'mcl': GeneFamily.add_families_from_mcl(temp_path, method_description) flash('Added Gene families from file %s' % form.file.name, 'success') + elif source == 'orthofinder': + 
GeneFamily.add_families_from_orthofinder(temp_path, method_description) + flash('Added Gene families from file %s' % form.file.name, 'success') else: flash('Method not implemented yet', 'danger') os.close(fd) diff --git a/planet/forms/admin/add_family.py b/planet/forms/admin/add_family.py index 28c7f61..06d677a 100644 --- a/planet/forms/admin/add_family.py +++ b/planet/forms/admin/add_family.py @@ -6,6 +6,6 @@ class AddFamiliesForm(FlaskForm): method_description = StringField('Description', [InputRequired]) - source = SelectField('Source', choices=[('plaza', 'PLAZA csv'), ('mcl', 'MCL'), ('orthofinder', 'OrthoFinder')]) + source = SelectField('Source', choices=[('mcl', 'MCL'), ('orthofinder', 'OrthoFinder')]) file = FileField() diff --git a/planet/models/gene_families.py b/planet/models/gene_families.py index 8a680e7..80caf36 100644 --- a/planet/models/gene_families.py +++ b/planet/models/gene_families.py @@ -2,8 +2,6 @@ from planet.models.relationships import sequence_family, family_xref, SequenceSequenceECCAssociation from planet.models.sequences import Sequence -from utils.parser.plaza.families import Parser as FamilyParser - import csv import re @@ -19,7 +17,9 @@ class GeneFamilyMethod(db.Model): method = db.Column(db.Text) family_count = db.Column(db.Integer) - families = db.relationship('GeneFamily', backref=db.backref('method', lazy='joined'), lazy='dynamic') + families = db.relationship('GeneFamily', backref=db.backref('method', lazy='joined'), + lazy='dynamic', + cascade='all, delete-orphan') def __init__(self, method): self.method = method @@ -161,25 +161,17 @@ def add_families_from_mcl(filename, description, handle_isoforms=True, prefix='m return method.id @staticmethod - def add_families_from_tab(filename, description, handle_isoforms=True): + def add_families_from_orthofinder(filename, description, handle_isoforms=True): """ - DEPRICATED IMPORT FROM MCL + Add gene families directly from OrthoFinder output (one line with all genes from one family) - :param 
filename: - :param description: - :param handle_isoforms: - :return: + :param filename: The file to load + :param description: Description of the method to store in the database + :param handle_isoforms: should isoforms (indicated by .1 at the end) be handled + :return: the new method's internal ID + """ - # Create new method for these families - method = GeneFamilyMethod(description) - - try: - db.session.add(method) - db.session.commit() - except Exception as e: - db.session.rollback() - quit() + method = GeneFamilyMethod.add(description) gene_hash = {} all_sequences = Sequence.query.all() @@ -191,71 +183,24 @@ def add_families_from_tab(filename, description, handle_isoforms=True): gene_id = re.sub('\.\d+$', '', sequence.name.lower()) gene_hash[gene_id] = sequence - family_hash = {} - - families = {} - genes = [] - - with open(filename) as csvfile: - reader = csv.DictReader(csvfile, delimiter='\t') - for row in reader: - family = row['family'] - gene = row['gene'] - - genes.append(gene) - - if family not in families.keys(): - families[family] = [] - family_hash[family] = GeneFamily(family) - family_hash[family].method_id = method.id - - families[family].append(gene) - - for name, f in family_hash.items(): - db.session.add(f) - - for name, f in family_hash.items(): - for gene in families[name]: - if gene.lower() in gene_hash.keys(): - gene_hash[gene.lower()].families.append(family_hash[name]) - - try: - db.session.commit() - except Exception as e: - db.session.rollback() - print(e) - - return method.id - - @staticmethod - def add_families_from_plaza(filename, description): - family_parser = FamilyParser() - family_parser.read(filename) - - method = GeneFamilyMethod(description) - - db.session.add(method) - - gene_hash = {} - all_sequences = Sequence.query.all() - - for sequence in all_sequences: - gene_hash[sequence.name] = sequence + with open(filename, "r") as f_in: + for line in f_in: + orthofinder_id, *parts = line.strip().split() - for family, genes in 
family_parser.families.items(): - new_family = GeneFamily(family) - new_family.method_id = method.id + orthofinder_id = orthofinder_id.rstrip(':') - db.session.add(new_family) + new_family = GeneFamily(orthofinder_id.replace('OG', 'OG_%02d_' % method.id)) + new_family.method_id = method.id - for gene in genes: - if gene in gene_hash: - gene_hash[gene].families.append(new_family) + for p in parts: + if p.lower() in gene_hash.keys(): + new_family.sequences.append(gene_hash[p.lower()]) - try: - db.session.commit() - except Exception as e: - db.session.rollback() - print(e) + try: + db.session.add(new_family) + db.session.commit() + except Exception as e: + db.session.rollback() + quit() return method.id \ No newline at end of file diff --git a/tests/data/comparative_data/mamut.families.orthofinder.txt b/tests/data/comparative_data/mamut.families.orthofinder.txt new file mode 100644 index 0000000..6c1307f --- /dev/null +++ b/tests/data/comparative_data/mamut.families.orthofinder.txt @@ -0,0 +1,2 @@ +OG00001: Gene01 Gene02 +OG00002: Gene03 \ No newline at end of file diff --git a/utils/parser/plaza/families.py b/utils/parser/plaza/families.py deleted file mode 100644 index 7b6ec79..0000000 --- a/utils/parser/plaza/families.py +++ /dev/null @@ -1,16 +0,0 @@ -import csv - - -class Parser: - def __init__(self): - self.families = {} - - def read(self, filename): - with open(filename) as csvfile: - reader = csv.DictReader(csvfile, delimiter=';') - for row in reader: - family = row['gf_id'] - genes = row['genes'] - - if family not in self.families.keys(): - self.families[family] = genes.split(',') diff --git a/utils/parser/plaza/interpro.py b/utils/parser/plaza/interpro.py deleted file mode 100644 index 082d957..0000000 --- a/utils/parser/plaza/interpro.py +++ /dev/null @@ -1,21 +0,0 @@ -import csv - - -class Parser: - def __init__(self): - self.annotation = {} - - def read_plaza_interpro(self, filename): - with open(filename) as csvfile: - reader = csv.DictReader(csvfile, 
delimiter=';') - for row in reader: - gene = row['gene_id'] - domain = {"id": row['motif_id'], - "start": row['start'], - "stop": row['stop']} - - if gene not in self.annotation.keys(): - self.annotation[gene] = [] - - if domain not in self.annotation[gene]: - self.annotation[gene].append(domain)