From cda68334d249d90b337aa79918e9cbb8212fadc4 Mon Sep 17 00:00:00 2001
From: proost <proost@mpimp-golm.mpg.de>
Date: Thu, 10 Dec 2015 15:57:04 +0100
Subject: [PATCH] started work on cluster similarity

---
 planet/models/coexpression_clusters.py | 38 ++++++++++++++
 planet/models/relationships.py         | 69 +++++++++++++++++---------
 2 files changed, 84 insertions(+), 23 deletions(-)

diff --git a/planet/models/coexpression_clusters.py b/planet/models/coexpression_clusters.py
index feb3a67..5f74aa9 100644
--- a/planet/models/coexpression_clusters.py
+++ b/planet/models/coexpression_clusters.py
@@ -2,11 +2,14 @@
 
 from planet import db
 from planet.models.expression_networks import ExpressionNetwork
+from planet.models.gene_families import GeneFamily
 from planet.models.relationships import sequence_coexpression_cluster, SequenceGOAssociation, ClusterGOEnrichment
+from planet.models.relationships import CoexpressionClusterSimilarity, SequenceCoexpressionClusterAssociation, SequenceFamilyAssociation
 
 from utils.enrichment import hypergeo_sf, fdr_correction
 from utils.benchmark import benchmark
 
+from sqlalchemy import join
 from sqlalchemy.orm import joinedload, load_only
 
 import json
@@ -189,3 +192,38 @@ def calculate_enrichment(empty=True):
                 for i, cluster in enumerate(clusters):
                     print(i, "\t cluster: ", cluster.method_id, cluster.name)
                     cluster.__calculate_enrichment()
+
+    @staticmethod
+    @benchmark
+    def calculate_similarities(gene_family_method_id=1):
+
+        # sqlalchemy to fetch cluster associations
+        fields = [SequenceCoexpressionClusterAssociation.__table__.c.sequence_id,
+                  SequenceCoexpressionClusterAssociation.__table__.c.coexpression_cluster_id]
+        condition = SequenceCoexpressionClusterAssociation.__table__.c.sequence_id is not None
+        cluster_associations = db.engine.execute(db.select(fields).where(condition)).fetchall()
+
+        # sqlalchemy to fetch sequence family associations
+        fields = [SequenceFamilyAssociation.__table__.c.sequence_id, SequenceFamilyAssociation.__table__.c.gene_family_id, GeneFamily.__table__.c.method_id]
+        condition = GeneFamily.__table__.c.method_id == gene_family_method_id
+        table = join(SequenceFamilyAssociation.__table__, GeneFamily.__table__, SequenceFamilyAssociation.__table__.c.gene_family_id == GeneFamily.__table__.c.id)
+        sequence_families = db.engine.execute(db.select(fields).select_from(table).where(condition)).fetchall()
+
+        # convert sqlachemy results into dictionary
+        sequence_to_family = {seq_id: fam_id for seq_id, fam_id, method_id in sequence_families}
+
+        cluster_to_sequences = {}
+        cluster_to_families = {}
+
+        for seq_id, cluster_id in cluster_associations:
+            if cluster_id not in cluster_to_sequences.keys():
+                cluster_to_sequences[cluster_id] = []
+            cluster_to_sequences[cluster_id].append(seq_id)
+
+        for cluster_id, sequences in cluster_to_sequences.items():
+            families = list(set([sequence_to_family[s] for s in sequences if s in sequence_to_family.keys()]))
+            if len(families) > 0:
+                cluster_to_families[cluster_id] = families
+
+        for c, f in cluster_to_families.items():
+            print(c,f)
\ No newline at end of file
diff --git a/planet/models/relationships.py b/planet/models/relationships.py
index ec28aa9..8a6ef6b 100644
--- a/planet/models/relationships.py
+++ b/planet/models/relationships.py
@@ -26,6 +26,13 @@
              db.Column('coexpression_cluster_id', db.Integer, db.ForeignKey('coexpression_clusters.id'), index=True)
              )
 
+coexpression_cluster_similarity = \
+    db.Table('coexpression_cluster_similarity',
+             db.Column('id', db.Integer, primary_key=True),
+             db.Column('source_id', db.Integer, db.ForeignKey('coexpression_clusters.id'), index=True),
+             db.Column('target_id', db.Integer, db.ForeignKey('coexpression_clusters.id'), index=True)
+             )
+
 sequence_xref = db.Table('sequence_xref',
                          db.Column('id', db.Integer, primary_key=True),
                          db.Column('sequence_id', db.Integer, db.ForeignKey('sequences.id'), index=True),
@@ -55,6 +62,22 @@ class SequenceCoexpressionClusterAssociation(db.Model):
     coexpression_cluster_id = db.Column(db.Integer, db.ForeignKey('coexpression_clusters.id'))
 
 
+class CoexpressionClusterSimilarity(db.Model):
+    __tablename__ = 'coexpression_cluster_similarity'
+    __table_args__ = {'extend_existing': True}
+
+    id = db.Column(db.Integer, primary_key=True)
+    source_id = db.Column(db.Integer, db.ForeignKey('coexpression_clusters.id'))
+    target_id = db.Column(db.Integer, db.ForeignKey('coexpression_clusters.id'))
+
+    gene_family_method_id = db.Column('gene_family_method_id', db.Integer, db.ForeignKey('gene_family_methods.id'),
+                                      index=True)
+
+    jaccard_index = db.Column(db.Float, index=True)
+    p_value = db.Column(db.Float, index=True)
+    corrected_p_value = db.Column(db.Float, index=True)
+
+
 class SequenceFamilyAssociation(db.Model):
     __tablename__ = 'sequence_family'
     __table_args__ = {'extend_existing': True}
@@ -128,26 +151,26 @@ def genome_percentage(self):
         return self.go_count*100/self.go_size
 
 
-class ProbeGOEnrichment(db.Model):
-    __tablename__ = 'probe_go_enrichment'
-
-    id = db.Column(db.Integer, primary_key=True)
-    probe_id = db.Column(db.Integer, db.ForeignKey('expression_network.id'), index=True)
-    go_id = db.Column(db.Integer, db.ForeignKey('go.id'), index=True)
-
-    """
-    Counts required to calculate the enrichment,
-    store here for quick access
-    """
-    cluster_count = db.Column(db.Integer)
-    cluster_size = db.Column(db.Integer)
-    go_count = db.Column(db.Integer)
-    go_size = db.Column(db.Integer)
-
-    """
-    Enrichment score (log-transformed), p-value and corrected p-value. Calculated using the hypergeometric
-    distribution and applying FDR correction (aka. BH)
-    """
-    enrichment = db.Column(db.Float)
-    p_value = db.Column(db.Float)
-    corrected_p_value = db.Column(db.Float)
+# class ProbeGOEnrichment(db.Model):
+#     __tablename__ = 'probe_go_enrichment'
+#
+#     id = db.Column(db.Integer, primary_key=True)
+#     probe_id = db.Column(db.Integer, db.ForeignKey('expression_network.id'), index=True)
+#     go_id = db.Column(db.Integer, db.ForeignKey('go.id'), index=True)
+#
+#     """
+#     Counts required to calculate the enrichment,
+#     store here for quick access
+#     """
+#     cluster_count = db.Column(db.Integer)
+#     cluster_size = db.Column(db.Integer)
+#     go_count = db.Column(db.Integer)
+#     go_size = db.Column(db.Integer)
+#
+#     """
+#     Enrichment score (log-transformed), p-value and corrected p-value. Calculated using the hypergeometric
+#     distribution and applying FDR correction (aka. BH)
+#     """
+#     enrichment = db.Column(db.Float)
+#     p_value = db.Column(db.Float)
+#     corrected_p_value = db.Column(db.Float)