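# NOTE: the following lines are residue of the GitHub file-viewer page this file was copied from;
# they are not part of the module and are kept only as comments.
# Skip to content
# Switch branches/tags
#
# Name already in use
#
# A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
# Go to file
# Cannot retrieve contributors at this time
# 188 lines (148 sloc) 7.98 KB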
from planet import db, whooshee
from planet.models.relationships import sequence_go, sequence_interpro, sequence_family, sequence_coexpression_cluster
from planet.models.relationships import sequence_xref, sequence_sequence_ecc
from utils.sequence import translate
from utils.parser.fasta import Fasta
from sqlalchemy.orm import undefer
import operator
import sys
# Collation applied to the `name` column: case-insensitive ('NOCASE') on SQLite, engine default otherwise.
# NOTE(review): the left-hand operand of the comparison was missing from the recovered source;
# `db.engine.name` (the dialect name of the bound engine) is an inferred reconstruction -- confirm.
SQL_COLLATION = 'NOCASE' if db.engine.name == 'sqlite' else ''
class Sequence(db.Model):
    """
    ORM model for one sequence, stored in the table 'sequences'.

    Holds the coding sequence (deferred, only loaded on request), its type and organelle flags, and the
    relationships to expression profiles, network nodes, GO labels, InterPro domains, gene families,
    co-expression clusters, ECC and clade associations and cross-references.

    NOTE(review): this class was reconstructed from a damaged copy of the file in which decorators,
    `*.name` / `*.id` expressions and some trailing lines were lost. Every reconstructed spot is marked with
    a NOTE(review) comment below and should be checked against version control. `whooshee` is imported by the
    module but not used in the recovered text, so a class decorator may have been lost as well.
    """
    __tablename__ = 'sequences'
    id = db.Column(db.Integer, primary_key=True)
    # NOTE(review): the foreign-key target was an empty string in the recovered source; 'species.id' is
    # inferred from the column name -- confirm against the species model.
    species_id = db.Column(db.Integer, db.ForeignKey('species.id', ondelete='CASCADE'), index=True)
    name = db.Column(db.String(50, collation=SQL_COLLATION), index=True)
    description = db.Column(db.Text)
    coding_sequence = db.deferred(db.Column(db.Text))
    type = db.Column(db.Enum('protein_coding', 'TE', 'RNA', name='sequence_type'), default='protein_coding')
    is_mitochondrial = db.Column(db.Boolean, default=False)
    is_chloroplast = db.Column(db.Boolean, default=False)

    # NOTE(review): the relationship() calls below that span several lines were not closed in the recovered
    # source (each ended on a trailing comma), so one or more trailing keyword arguments were lost. They are
    # closed here with only the arguments that were visible; restore the missing ones from version control.
    expression_profiles = db.relationship('ExpressionProfile', backref=db.backref('sequence', lazy='joined'),
                                          cascade="all, delete-orphan")
    network_nodes = db.relationship('ExpressionNetwork',
                                    backref=db.backref('sequence', lazy='joined'),
                                    cascade="all, delete-orphan")

    # Other properties
    #
    # coexpression_cluster_associations declared in 'SequenceCoexpressionClusterAssociation'
    # interpro_associations declared in 'SequenceInterproAssociation'
    # go_associations declared in 'SequenceGOAssociation'
    # family_associations declared in 'SequenceFamilyAssociation'

    go_labels = db.relationship('GO', secondary=sequence_go, lazy='dynamic')
    interpro_domains = db.relationship('Interpro', secondary=sequence_interpro, lazy='dynamic')
    families = db.relationship('GeneFamily', secondary=sequence_family, lazy='dynamic')

    coexpression_clusters = db.relationship('CoexpressionCluster', secondary=sequence_coexpression_cluster,
                                            backref=db.backref('sequences', lazy='dynamic'))

    # NOTE(review): the right-hand side of every primaryjoin below was missing in the recovered source;
    # `Sequence.id` (this model's primary key) is the inferred operand.
    ecc_query_associations = db.relationship('SequenceSequenceECCAssociation',
                                             primaryjoin="SequenceSequenceECCAssociation.query_id == Sequence.id",
                                             backref=db.backref('query_sequence', lazy='joined'))

    ecc_target_associations = db.relationship('SequenceSequenceECCAssociation',
                                              primaryjoin="SequenceSequenceECCAssociation.target_id == Sequence.id",
                                              backref=db.backref('target_sequence', lazy='joined'))

    clade_associations_one = db.relationship('SequenceSequenceCladeAssociation',
                                             primaryjoin="SequenceSequenceCladeAssociation.sequence_one_id == Sequence.id",
                                             backref=db.backref('sequence_one', lazy='joined'))

    clade_associations_two = db.relationship('SequenceSequenceCladeAssociation',
                                             primaryjoin="SequenceSequenceCladeAssociation.sequence_two_id == Sequence.id",
                                             backref=db.backref('sequence_two', lazy='joined'))

    xrefs = db.relationship('XRef', secondary=sequence_xref, lazy='joined')

    def __init__(self, species_id, name, coding_sequence, type='protein_coding', is_chloroplast=False,
                 is_mitochondrial=False, description=None):
        """
        Create a new sequence.

        :param species_id: id of the species this sequence belongs to
        :param name: name of the sequence
        :param coding_sequence: the coding sequence as a string
        :param type: one of 'protein_coding', 'TE' or 'RNA' (default 'protein_coding')
        :param is_chloroplast: flag stored on the sequence (default False)
        :param is_mitochondrial: flag stored on the sequence (default False)
        :param description: optional free-text description
        """
        self.species_id = species_id
        # The recovered source read `self.species_id = species_id = name`, which overwrote the species id
        # with the name and never stored the name itself.
        self.name = name
        self.description = description
        self.coding_sequence = coding_sequence
        self.type = type
        self.is_chloroplast = is_chloroplast
        self.is_mitochondrial = is_mitochondrial

    # Exposed as a property: export_protein() reads `s.protein_sequence` without calling it.
    @property
    def protein_sequence(self):
        """
        Function to translate the coding sequence to the amino acid sequence. Will start at the first start codon and
        break after adding a stop codon (indicated by '*')

        :return: The amino acid sequence based on the coding sequence
        """
        return translate(self.coding_sequence)

    # NOTE(review): decorator lines were lost from the recovered source; @property is assumed here by analogy
    # with protein_sequence -- confirm against callers.
    @property
    def aliases(self):
        """
        Returns a readable string with the aliases or tokens stored for this sequence in the table xrefs

        :return: human readable string with aliases or None
        """
        # NOTE(review): the selected expression was missing in the recovered source; `x.name` is inferred
        # (the items are joined as strings) -- confirm against the XRef model.
        tokens = [x.name for x in self.xrefs if x.platform == 'token']

        return ", ".join(tokens) if tokens else None

    # NOTE(review): @property assumed, see aliases above.
    @property
    def readable_type(self):
        """
        Converts the type table to a readable string

        :return: string with readable version of the sequence type
        """
        conversion = {'protein_coding': 'protein coding',
                      'TE': 'transposable element',
                      'RNA': 'RNA'}

        # Anything that is not a known type is reported as 'other'.
        return conversion.get(self.type, 'other')

    # NOTE(review): takes no `self` and refers to `Sequence` explicitly; @staticmethod is assumed to have been
    # lost together with the other decorators. The same applies to the remaining functions of this class.
    @staticmethod
    def add_from_fasta(filename, species_id, compressed=False):
        """
        Reads a fasta file and inserts every entry as a protein coding sequence of the given species.

        :param filename: path to the fasta file
        :param species_id: id of the species the sequences belong to
        :param compressed: passed on to the fasta reader (default False)
        :return: the number of sequences found in the fasta file
        """
        fasta_data = Fasta()
        fasta_data.readfile(filename, compressed=compressed)

        new_sequences = []

        # Loop over sequences, sorted by name (key here) and add to db
        for name, sequence in sorted(fasta_data.sequences.items(), key=operator.itemgetter(0)):
            new_sequence = {"species_id": species_id,
                            "name": name,
                            "description": None,
                            "coding_sequence": sequence,
                            "type": "protein_coding",
                            "is_mitochondrial": False,
                            "is_chloroplast": False}

            # The recovered source built this dict but never added it to the batch, so nothing was inserted.
            new_sequences.append(new_sequence)

            # add 400 sequences at the time, more can cause problems with some database engines
            if len(new_sequences) > 400:
                db.engine.execute(Sequence.__table__.insert(), new_sequences)
                new_sequences = []

        # add the last set of sequences; skipped when nothing is pending, as executing an INSERT with an
        # empty parameter list is not guaranteed to be a no-op
        if new_sequences:
            db.engine.execute(Sequence.__table__.insert(), new_sequences)

        return len(fasta_data.sequences)

    @staticmethod
    def add_descriptions(filename, species_id):
        """
        Sets the description of sequences of one species from a tab-delimited file (name <tab> description).

        Lines that do not consist of exactly two tab-separated fields are reported on stderr and skipped.
        Names that do not match a sequence of the species are ignored.

        :param filename: path to the tab-delimited file
        :param species_id: id of the species whose sequences should be updated
        """
        sequences = Sequence.query.filter_by(species_id=species_id).all()

        # NOTE(review): the dictionary key was missing in the recovered source; `s.name` is inferred from the
        # lookup by name further down.
        seq_dict = {s.name: s for s in sequences}

        with open(filename, "r") as f_in:
            for i, line in enumerate(f_in):
                try:
                    name, description = line.strip().split('\t')
                except ValueError:
                    print("Cannot parse line %d: \"%s\"" % (i, line), file=sys.stderr)
                else:
                    # Only reached when the line parsed, so a stale name from an earlier line is never used.
                    if name in seq_dict:
                        seq_dict[name].description = description

                if i % 400 == 0:
                    # NOTE(review): the body of this `if` was missing in the recovered source; a periodic
                    # commit of the session is the inferred intent -- confirm.
                    db.session.commit()

        # NOTE(review): final commit inferred, so updates after the last periodic commit are persisted.
        db.session.commit()

    @staticmethod
    def export_cds(filename):
        """
        Writes the coding sequence of every sequence in the database to a fasta file.

        :param filename: path of the file to write
        """
        sequences = Sequence.query.options(undefer('coding_sequence')).all()

        with open(filename, "w") as f_out:
            for s in sequences:
                # NOTE(review): the header expression was missing in the recovered source; `s.name` inferred.
                print(">%s\n%s" % (s.name, s.coding_sequence), file=f_out)

    @staticmethod
    def export_protein(filename):
        """
        Writes the translated (amino acid) sequence of every sequence in the database to a fasta file.

        :param filename: path of the file to write
        """
        sequences = Sequence.query.options(undefer('coding_sequence')).all()

        with open(filename, "w") as f_out:
            for s in sequences:
                # NOTE(review): the header expression was missing in the recovered source; `s.name` inferred.
                print(">%s\n%s" % (s.name, s.protein_sequence), file=f_out)