diff --git a/bin/draw_model_graph b/bin/draw_model_graph new file mode 100755 index 0000000..c8c6be7 --- /dev/null +++ b/bin/draw_model_graph @@ -0,0 +1,23 @@ +#!/usr/bin/env python + +import argparse +import sys + +from sshmm.MyHMM import MyHMMOpenXML + +def parseArguments(args): + """Sets up the command-line parser and calls it on the command-line arguments to the program. + + arguments: + args -- command-line arguments to the program""" + parser = argparse.ArgumentParser(formatter_class=argparse.RawDescriptionHelpFormatter, + description='Takes a ssHMM model file in XML format and produces a model graph in PNG format.') + parser.add_argument('model', type=str, help='model file in XML format') + parser.add_argument('sequence_number', type=int, help='number of training sequences that were used to generate the model. This number can be found in the verbose log file.') + parser.add_argument('output', type=str, help='model graph output') + return parser.parse_args() + +options = parseArguments(sys.argv) +model = MyHMMOpenXML(options.model) +model.printAsGraph(options.output, options.sequence_number) +print "Success: Wrote model graph ", options.output diff --git a/setup.py b/setup.py index 3ffc4e5..0488719 100755 --- a/setup.py +++ b/setup.py @@ -4,7 +4,7 @@ long_description = """RNA-binding proteins (RBPs) play a vital role in the post-transcriptional control of RNAs. They are known to recognize RNA molecules by their nucleotide sequence as well as their three-dimensional structure. ssHMM is an RNA motif finder that combines a hidden Markov model (HMM) with Gibbs sampling to learn the joint sequence and structure binding preferences of RBPs from high-throughput RNA-binding experiments, such as CLIP-Seq. The model can be visualized as an intuitive graph illustrating the interplay between RNA sequence and structure.""" setup(name='sshmm', - version='1.0.4', + version='1.0.5', description='A sequence-structure hidden Markov model for the analysis of RNA-binding protein data.', long_description=long_description, url='https://github.molgen.mpg.de/heller/ssHMM', @@ -24,5 +24,5 @@ package_data={'sshmm': ['img/*.png'],}, zip_safe=False, install_requires=['numpy', 'graphviz', 'pygraphviz', 'weblogo', 'future', 'logging_exceptions', 'forgi'], - scripts=['bin/preprocess_dataset', 'bin/train_seqstructhmm', 'bin/batch_seqstructhmm'], + scripts=['bin/preprocess_dataset', 'bin/train_seqstructhmm', 'bin/batch_seqstructhmm', 'bin/draw_model_graph'], ext_modules = [Extension("cEstimate", ["sshmm/cEstimate.c"])]) diff --git a/sshmm/MyHMM.py b/sshmm/MyHMM.py index 8443773..97a37fa 100644 --- a/sshmm/MyHMM.py +++ b/sshmm/MyHMM.py @@ -679,5 +679,250 @@ def constructSwitchingTransitions(self, cmodel, pi, A): state.in_id = trans[1] state.in_a = trans[2] +GHMM_FILETYPE_SMO = 'smo' #obsolete +GHMM_FILETYPE_XML = 'xml' +GHMM_FILETYPE_HMMER = 'hmm' + +class MyHMMOpenFactory(HMMFactory): + """ Opens a HMM from a file. + + Currently four formats are supported: + HMMer, our smo file format, and two xml formats. + @note the support for smo files and the old xml format will phase out + """ + def __init__(self, defaultFileType=None): + self.defaultFileType = defaultFileType + + def guessFileType(self, filename): + """ guesses the file format from the filename """ + if filename.endswith('.'+GHMM_FILETYPE_XML): + return GHMM_FILETYPE_XML + elif filename.endswith('.'+GHMM_FILETYPE_SMO):#obsolete + return GHMM_FILETYPE_SMO + elif filename.endswith('.'+GHMM_FILETYPE_HMMER):#obsolete + return GHMM_FILETYPE_HMMER + else: + return None + + def __call__(self, fileName, modelIndex=None, filetype=None): + + if not isinstance(fileName,StringIO.StringIO): + if not os.path.exists(fileName): + raise IOError('File ' + str(fileName) + ' not found.') + + if not filetype: + if self.defaultFileType: + log.warning("HMMOpenHMMER, HMMOpenSMO and HMMOpenXML are deprecated. " + + "Use HMMOpen and the filetype parameter if needed.") + filetype = self.defaultFileType + else: + filetype = self.guessFileType(fileName) + if not filetype: + raise WrongFileType("Could not guess the type of file " + str(fileName) + + " and no filetype specified") + + # XML file: both new and old format + if filetype == GHMM_FILETYPE_XML: + # try to validate against ghmm.dtd + if ghmmwrapper.ghmm_xmlfile_validate(fileName): + return self.openNewXML(fileName, modelIndex) + elif filetype == GHMM_FILETYPE_SMO: + return self.openSMO(fileName, modelIndex) + elif filetype == GHMM_FILETYPE_HMMER: + return self.openHMMER(fileName) + else: + raise TypeError("Invalid file type " + str(filetype)) + + + def openNewXML(self, fileName, modelIndex): + """ Open one ore more HMM in the new xml format """ + # opens and parses the file + file = ghmmwrapper.ghmm_xmlfile_parse(fileName) + if file == None: + log.debug( "XML has file format problems!") + raise WrongFileType("file is not in GHMM xml format") + + nrModels = file.noModels + modelType = file.modelType + + # we have a continuous HMM, prepare for hmm creation + if (modelType & ghmmwrapper.kContinuousHMM): + emission_domain = Float() + if (modelType & ghmmwrapper.kMultivariate): + distribution = MultivariateGaussianDistribution + hmmClass = MultivariateGaussianMixtureHMM + else: + distribution = ContinuousMixtureDistribution + hmmClass = ContinuousMixtureHMM + getModel = file.get_cmodel + + # we have a discrete HMM, prepare for hmm creation + elif ((modelType & ghmmwrapper.kDiscreteHMM) + and not (modelType & ghmmwrapper.kTransitionClasses) + and not (modelType & ghmmwrapper.kPairHMM)): + emission_domain = 'd' + distribution = DiscreteDistribution + getModel = file.get_dmodel + if (modelType & ghmmwrapper.kLabeledStates): + hmmClass = StateLabelHMM + else: + hmmClass = DiscreteEmissionHMM + + # currently not supported + else: + raise UnsupportedFeature("Non-supported model type") + + + # read all models to list at first + result = [] + for i in range(nrModels): + cmodel = getModel(i) + if emission_domain is 'd': + emission_domain = Alphabet(cmodel.alphabet) + if modelType & ghmmwrapper.kLabeledStates: + labelDomain = LabelDomain(cmodel.label_alphabet) + m = MyHMM(emission_domain, distribution(emission_domain), labelDomain, cmodel) + else: + m = MyHMM(emission_domain, distribution(emission_domain), cmodel) + + result.append(m) + + # for a single + if modelIndex != None: + if modelIndex < nrModels: + result = result[modelIndex] + else: + raise IndexError("the file %s has only %s models"% fileName, str(nrModels)) + elif nrModels == 1: + result = result[0] + + return result + + + def openSingleHMMER(self, fileName): + # HMMER format models + h = modhmmer.hmmer(fileName) + + if h.m == 4: # DNA model + emission_domain = DNA + elif h.m == 20: # Peptide model + emission_domain = AminoAcids + else: # some other model + emission_domain = IntegerRange(0,h.m) + distribution = DiscreteDistribution(emission_domain) + + # XXX TODO: Probably slow for large matrices (Rewrite for 0.9) + [A,B,pi,modelName] = h.getGHMMmatrices() + return HMMFromMatrices(emission_domain, distribution, A, B, pi, hmmName=modelName) + + + def openHMMER(self, fileName): + """ + Reads a file containing multiple HMMs in HMMER format, returns list of + HMM objects or a single HMM object. + """ + if not os.path.exists(fileName): + raise IOError('File ' + str(fileName) + ' not found.') + + modelList = [] + string = "" + f = open(fileName,"r") + + res = re.compile("^//") + stat = re.compile("^ACC\s+(\w+)") + for line in f.readlines(): + string = string + line + m = stat.match(line) + if m: + name = m.group(1) + log.info( "Reading model " + str(name) + ".") + + match = res.match(line) + if match: + fileLike = StringIO.StringIO(string) + modelList.append(self.openSingleHMMER(fileLike)) + string = "" + match = None + + if len(modelList) == 1: + return modelList[0] + return modelList + + + def determineHMMClass(self, fileName): + # + # smo files. Obsolete + # + file = open(fileName,'r') + + hmmRe = re.compile("^HMM\s*=") + shmmRe = re.compile("^SHMM\s*=") + mvalueRe = re.compile("M\s*=\s*([0-9]+)") + densityvalueRe = re.compile("density\s*=\s*([0-9]+)") + cosvalueRe = re.compile("cos\s*=\s*([0-9]+)") + emission_domain = None + + while 1: + l = file.readline() + if not l: + break + l = l.strip() + if len(l) > 0 and l[0] != '#': # Not a comment line + hmm = hmmRe.search(l) + shmm = shmmRe.search(l) + mvalue = mvalueRe.search(l) + densityvalue = densityvalueRe.search(l) + cosvalue = cosvalueRe.search(l) + + if hmm != None: + if emission_domain != None and emission_domain != 'int': + log.error( "HMMOpenFactory:determineHMMClass: both HMM and SHMM? " + str(emission_domain)) + else: + emission_domain = 'int' + + if shmm != None: + if emission_domain != None and emission_domain != 'double': + log.error( "HMMOpenFactory:determineHMMClass: both HMM and SHMM? " + str(emission_domain)) + else: + emission_domain = 'double' + + if mvalue != None: + M = int(mvalue.group(1)) + + if densityvalue != None: + density = int(densityvalue.group(1)) + + if cosvalue != None: + cos = int(cosvalue.group(1)) + + file.close() + if emission_domain == 'int': + # only integer alphabet + emission_domain = IntegerRange(0,M) + distribution = DiscreteDistribution + hmm_class = DiscreteEmissionHMM + return (hmm_class, emission_domain, distribution) + + elif emission_domain == 'double': + # M number of mixture components + # density component type + # cos number of state transition classes + if M == 1 and density == 0: + emission_domain = Float() + distribution = GaussianDistribution + hmm_class = GaussianEmissionHMM + return (hmm_class, emission_domain, distribution) + + elif M > 1 and density == 0: + emission_domain = Float() + distribution = GaussianMixtureDistribution + hmm_class = GaussianMixtureHMM + return (hmm_class, emission_domain, distribution) + + else: + raise TypeError("Model type can not be determined.") + + return (None, None, None) +MyHMMOpenXML = MyHMMOpenFactory() MyHMMFromMatrices = MyHMMFromMatricesFactory()