diff --git a/planet/models/clades.py b/planet/models/clades.py index 40e40fe..c334771 100644 --- a/planet/models/clades.py +++ b/planet/models/clades.py @@ -2,6 +2,8 @@ from planet.models.gene_families import GeneFamily from planet.models.interpro import Interpro +from utils.phylo import get_clade + import json SQL_COLLATION = 'NOCASE' if db.engine.name == 'sqlite' else '' @@ -57,6 +59,9 @@ def update_clades(): clades = Clade.query.all() families = GeneFamily.query.all() + clade_to_species = {c.name: json.loads(c.species) for c in clades} + clade_to_id = {c.name: c.id for c in clades} + for f in families: family_species = f.species_codes @@ -66,23 +71,11 @@ def update_clades(): continue # find the clade with the fewest species that contains all the codes - selected_clade = None - for c in clades: - clade_species = json.loads(c.species) - - overlap = set(family_species).intersection(clade_species) - - if len(overlap) == len(family_species): - if selected_clade is None: - selected_clade = c - else: - if selected_clade.species_count > c.species_count: - selected_clade = c + selected_clade, _ = get_clade(family_species, clade_to_species) + if selected_clade is None: + f.clade_id = None else: - if selected_clade is None: - print("An error occurred, no clades found, check the clades in the database!") - else: - f.clade_id = selected_clade.id + f.clade_id = clade_to_id[selected_clade] try: db.session.commit() @@ -98,6 +91,9 @@ def update_clades_interpro(): clades = Clade.query.all() interpro= Interpro.query.all() + clade_to_species = {c.name: json.loads(c.species) for c in clades} + clade_to_id = {c.name: c.id for c in clades} + for i in interpro: interpro_species = i.species_codes @@ -107,26 +103,14 @@ def update_clades_interpro(): continue # find the clade with the fewest species that contains all the codes - selected_clade = None - for c in clades: - clade_species = json.loads(c.species) - - overlap = set(interpro_species).intersection(clade_species) - - if len(overlap) == len(interpro_species): - if selected_clade is None: - selected_clade = c - else: - if selected_clade.species_count > c.species_count: - selected_clade = c + selected_clade, _ = get_clade(interpro_species, clade_to_species) + if selected_clade is None: + i.clade_id = None else: - if selected_clade is None: - print("An error occurred, no clades found, check the clades in the database!") - else: - i.clade_id = selected_clade.id + i.clade_id = clade_to_id[selected_clade] try: db.session.commit() except Exception as e: db.session.rollback() - print(e) \ No newline at end of file + print(e) diff --git a/planet/models/trees.py b/planet/models/trees.py index f39bc5e..4c4be32 100644 --- a/planet/models/trees.py +++ b/planet/models/trees.py @@ -25,8 +25,6 @@ class TreeMethod(db.Model): cascade="all, delete-orphan", passive_deletes=True) - - def reconcile_trees(self): # Fetch required data from the database sequences = Sequence.query.all() @@ -40,12 +38,11 @@ def reconcile_trees(self): tree = newick.loads(t.data_newick)[0] for node in tree.walk(): - if not node.is_binary: - print("[%d, %s] Skipping node... Can only reconcile binary nodes ..." % (tree.id, tree.label)) - continue - if len(node.descendants) != 2: - # no need to reconcile leaf nodes + if not node.is_binary: + # Print warning in case there is a non-binary node + print("[%d, %s] Skipping node... Can only reconcile binary nodes ..." % (tree.id, tree.label)) + # Otherwise it is a leaf node and can be skipped continue branch_one_seq = [l.name for l in node.descendants[0].get_leaves()]