From cda68334d249d90b337aa79918e9cbb8212fadc4 Mon Sep 17 00:00:00 2001 From: proost Date: Thu, 10 Dec 2015 15:57:04 +0100 Subject: [PATCH] started work on cluster similarity --- planet/models/coexpression_clusters.py | 38 ++++++++++++++ planet/models/relationships.py | 69 +++++++++++++++++--------- 2 files changed, 84 insertions(+), 23 deletions(-) diff --git a/planet/models/coexpression_clusters.py b/planet/models/coexpression_clusters.py index feb3a67..5f74aa9 100644 --- a/planet/models/coexpression_clusters.py +++ b/planet/models/coexpression_clusters.py @@ -2,11 +2,14 @@ from planet import db from planet.models.expression_networks import ExpressionNetwork +from planet.models.gene_families import GeneFamily from planet.models.relationships import sequence_coexpression_cluster, SequenceGOAssociation, ClusterGOEnrichment +from planet.models.relationships import CoexpressionClusterSimilarity, SequenceCoexpressionClusterAssociation, SequenceFamilyAssociation from utils.enrichment import hypergeo_sf, fdr_correction from utils.benchmark import benchmark +from sqlalchemy import join from sqlalchemy.orm import joinedload, load_only import json @@ -189,3 +192,38 @@ def calculate_enrichment(empty=True): for i, cluster in enumerate(clusters): print(i, "\t cluster: ", cluster.method_id, cluster.name) cluster.__calculate_enrichment() + + @staticmethod + @benchmark + def calculate_similarities(gene_family_method_id=1): + + # sqlalchemy to fetch cluster associations + fields = [SequenceCoexpressionClusterAssociation.__table__.c.sequence_id, + SequenceCoexpressionClusterAssociation.__table__.c.coexpression_cluster_id] + condition = SequenceCoexpressionClusterAssociation.__table__.c.sequence_id is not None + cluster_associations = db.engine.execute(db.select(fields).where(condition)).fetchall() + + # sqlalchemy to fetch sequence family associations + fields = [SequenceFamilyAssociation.__table__.c.sequence_id, SequenceFamilyAssociation.__table__.c.gene_family_id, GeneFamily.__table__.c.method_id] + condition = GeneFamily.__table__.c.method_id == gene_family_method_id + table = join(SequenceFamilyAssociation.__table__, GeneFamily.__table__, SequenceFamilyAssociation.__table__.c.gene_family_id == GeneFamily.__table__.c.id) + sequence_families = db.engine.execute(db.select(fields).select_from(table).where(condition)).fetchall() + + # convert sqlachemy results into dictionary + sequence_to_family = {seq_id: fam_id for seq_id, fam_id, method_id in sequence_families} + + cluster_to_sequences = {} + cluster_to_families = {} + + for seq_id, cluster_id in cluster_associations: + if cluster_id not in cluster_to_sequences.keys(): + cluster_to_sequences[cluster_id] = [] + cluster_to_sequences[cluster_id].append(seq_id) + + for cluster_id, sequences in cluster_to_sequences.items(): + families = list(set([sequence_to_family[s] for s in sequences if s in sequence_to_family.keys()])) + if len(families) > 0: + cluster_to_families[cluster_id] = families + + for c, f in cluster_to_families.items(): + print(c,f) \ No newline at end of file diff --git a/planet/models/relationships.py b/planet/models/relationships.py index ec28aa9..8a6ef6b 100644 --- a/planet/models/relationships.py +++ b/planet/models/relationships.py @@ -26,6 +26,13 @@ db.Column('coexpression_cluster_id', db.Integer, db.ForeignKey('coexpression_clusters.id'), index=True) ) +coexpression_cluster_similarity = \ + db.Table('coexpression_cluster_similarity', + db.Column('id', db.Integer, primary_key=True), + db.Column('source_id', db.Integer, db.ForeignKey('coexpression_clusters.id'), index=True), + db.Column('target_id', db.Integer, db.ForeignKey('coexpression_clusters.id'), index=True) + ) + sequence_xref = db.Table('sequence_xref', db.Column('id', db.Integer, primary_key=True), db.Column('sequence_id', db.Integer, db.ForeignKey('sequences.id'), index=True), @@ -55,6 +62,22 @@ class SequenceCoexpressionClusterAssociation(db.Model): coexpression_cluster_id = db.Column(db.Integer, db.ForeignKey('coexpression_clusters.id')) +class CoexpressionClusterSimilarity(db.Model): + __tablename__ = 'coexpression_cluster_similarity' + __table_args__ = {'extend_existing': True} + + id = db.Column(db.Integer, primary_key=True) + source_id = db.Column(db.Integer, db.ForeignKey('coexpression_clusters.id')) + target_id = db.Column(db.Integer, db.ForeignKey('coexpression_clusters.id')) + + gene_family_method_id = db.Column('gene_family_method_id', db.Integer, db.ForeignKey('gene_family_methods.id'), + index=True) + + jaccard_index = db.Column(db.Float, index=True) + p_value = db.Column(db.Float, index=True) + corrected_p_value = db.Column(db.Float, index=True) + + class SequenceFamilyAssociation(db.Model): __tablename__ = 'sequence_family' __table_args__ = {'extend_existing': True} @@ -128,26 +151,26 @@ def genome_percentage(self): return self.go_count*100/self.go_size -class ProbeGOEnrichment(db.Model): - __tablename__ = 'probe_go_enrichment' - - id = db.Column(db.Integer, primary_key=True) - probe_id = db.Column(db.Integer, db.ForeignKey('expression_network.id'), index=True) - go_id = db.Column(db.Integer, db.ForeignKey('go.id'), index=True) - - """ - Counts required to calculate the enrichment, - store here for quick access - """ - cluster_count = db.Column(db.Integer) - cluster_size = db.Column(db.Integer) - go_count = db.Column(db.Integer) - go_size = db.Column(db.Integer) - - """ - Enrichment score (log-transformed), p-value and corrected p-value. Calculated using the hypergeometric - distribution and applying FDR correction (aka. BH) - """ - enrichment = db.Column(db.Float) - p_value = db.Column(db.Float) - corrected_p_value = db.Column(db.Float) +# class ProbeGOEnrichment(db.Model): +# __tablename__ = 'probe_go_enrichment' +# +# id = db.Column(db.Integer, primary_key=True) +# probe_id = db.Column(db.Integer, db.ForeignKey('expression_network.id'), index=True) +# go_id = db.Column(db.Integer, db.ForeignKey('go.id'), index=True) +# +# """ +# Counts required to calculate the enrichment, +# store here for quick access +# """ +# cluster_count = db.Column(db.Integer) +# cluster_size = db.Column(db.Integer) +# go_count = db.Column(db.Integer) +# go_size = db.Column(db.Integer) +# +# """ +# Enrichment score (log-transformed), p-value and corrected p-value. Calculated using the hypergeometric +# distribution and applying FDR correction (aka. BH) +# """ +# enrichment = db.Column(db.Float) +# p_value = db.Column(db.Float) +# corrected_p_value = db.Column(db.Float)