From b1938d6c04aa011f48b53f623dfdb8f5a5145869 Mon Sep 17 00:00:00 2001 From: David Heller Date: Thu, 23 Feb 2017 14:47:41 +0100 Subject: [PATCH] improve usability: simplify output directory structure; clarify info messages on stdout and in log; clarify output file names --- bin/batch_seqstructhmm | 42 ++++++++++++++++++++++---------------- bin/train_seqstructhmm | 46 ++++++++++++++++++++++-------------------- sshmm/seqstructhmm.py | 27 +++++++++++-------------- 3 files changed, 61 insertions(+), 54 deletions(-) diff --git a/bin/batch_seqstructhmm b/bin/batch_seqstructhmm index 6709de5..2e6e416 100755 --- a/bin/batch_seqstructhmm +++ b/bin/batch_seqstructhmm @@ -54,7 +54,7 @@ def parseArguments(args): def do_training_for_configuration(data_directory, batch_job_directory, protein, structure_type, motif_length, baum_welch, flexibility, block_size, termination_interval, threshold): # Create output directory for job - job_directory = '{0}/{1}_{2}_ml{3}_fl{4}_bs{5}_ti{6}_tt{7}'.format(batch_job_directory, protein, structure_type, motif_length, flexibility, block_size, termination_interval, threshold) + job_directory = '{0}/{1}_{2}_ml{3}_fl{4}_bs{5}_ti{6}_tt{7}/'.format(batch_job_directory, protein, structure_type, motif_length, flexibility, block_size, termination_interval, threshold) os.mkdir(job_directory) random.seed() #reinitialize the random number generator because else random numbers would repeat @@ -70,29 +70,33 @@ def do_training_for_configuration(data_directory, batch_job_directory, protein, main_logger.info('Read %s training sequences', training_sequence_container.get_length()) # Initialize model - model0 = SeqStructHMM(0, job_directory, main_logger, numbers_logger, training_sequence_container, motif_length, flexibility, block_size) + model = SeqStructHMM(job_directory, main_logger, numbers_logger, training_sequence_container, motif_length, + flexibility, block_size) if baum_welch: best_baumwelch_sequence_model = seq_hmm.find_best_baumwelch_sequence_models(motif_length, training_sequence_container, main_logger) best_viterbi_paths = best_baumwelch_sequence_model[1] - model0.prepare_model_with_viterbi(best_viterbi_paths) + model.prepare_model_with_viterbi(best_viterbi_paths) else: - model0.prepare_model_randomly() + model.prepare_model_randomly() # Train model - key_statistics = model0.do_training(termination_interval, threshold, write_model_state=True)[0] + key_statistics = model.do_training(termination_interval, threshold, write_model_state=True)[0] # Write results - pwm_global = model0.get_pwm_global() - pwm_global.write_to_file(job_directory + 'recovered_pwm_global.txt') - pwm_global.write_weblogo(job_directory + 'recovered_pwm_global.png') - pwm_best_sequences = model0.get_pwm_best_sequences() - pwm_best_sequences.write_to_file(job_directory + 'recovered_pwm_best_sequences.txt') - pwm_best_sequences.write_weblogo(job_directory + 'recovered_pwm_best_sequences.png') - - model0.model.printAsGraph(job_directory + 'model_at_end.png', model0.sequence_container.get_length()) - model0.model.write(job_directory + 'model_at_end.xml') + pwm_global = model.get_pwm_global() + pwm_global.write_to_file(job_directory + 'logo_global.txt') + pwm_global.write_weblogo(job_directory + 'logo_global.png') + pwm_best_sequences = model.get_pwm_best_sequences() + pwm_best_sequences.write_to_file(job_directory + 'logo_best_sequences.txt') + pwm_best_sequences.write_weblogo(job_directory + 'logo_best_sequences.png') + pwm_hairpin = model.get_pwm_hairpin() + pwm_hairpin.write_to_file(job_directory + 'logo_hairpin.txt') + pwm_hairpin.write_weblogo(job_directory + 'logo_hairpin.png') + + model.model.printAsGraph(job_directory + 'final_graph.png', model.sequence_container.get_length()) + model.model.write(job_directory + 'final_model.xml') return [protein] + key_statistics @@ -117,12 +121,15 @@ def main(args): jobs = itertools.product([options.data_directory], [batch_job_directory], proteins, [options.structure_type], [options.motif_length], [True], [options.flexibility], [options.block_size], [options.termination_interval], [options.threshold]) + batch_logger.info('Started batch run on {0} proteins.'.format(len(proteins))) #Determine number of cores to use if options.cores: pool_size = options.cores else: pool_size = min(multiprocessing.cpu_count() - 1, len(proteins)) + + batch_logger.info('Using {0} cores.'.format(pool_size)) pool = multiprocessing.Pool(processes=pool_size, initializer=start_process, initargs=[batch_logger], @@ -134,10 +141,11 @@ def main(args): pool.close() # no more tasks pool.join() # wrap up current tasks - batch_logger.info('All jobs done! Results:') - batch_logger.info('protein,iteration_number,sequence_ll,sequence_structure_ll') + batch_logger.info('Finished all proteins.') + batch_logger.info('Result statistics:') + batch_logger.info('\t'.join(['Protein', 'Number of iterations', 'Sequence loglikelihood', 'Sequence-structure loglikelihood'])) for configuration in results: - batch_logger.info(','.join(map(str, configuration))) + batch_logger.info('\t'.join(map(str, configuration))) if __name__ == "__main__" : sys.exit(main(sys.argv)) diff --git a/bin/train_seqstructhmm b/bin/train_seqstructhmm index 194523c..f3ae53b 100755 --- a/bin/train_seqstructhmm +++ b/bin/train_seqstructhmm @@ -81,38 +81,40 @@ def main(args): main_logger.info('Finished reading %s training sequences', training_sequence_container.get_length()) #Initialize model - model0 = SeqStructHMM(0, job_directory, main_logger, numbers_logger, training_sequence_container, options.motif_length, options.flexibility, options.block_size) + model = SeqStructHMM(job_directory, main_logger, numbers_logger, training_sequence_container, options.motif_length, + options.flexibility, options.block_size) if options.baum_welch: best_baumwelch_sequence_model = seq_hmm.find_best_baumwelch_sequence_models(options.motif_length, training_sequence_container, main_logger) best_viterbi_paths = best_baumwelch_sequence_model[1] - model0.prepare_model_with_viterbi(best_viterbi_paths) + model.prepare_model_with_viterbi(best_viterbi_paths) else: - model0.prepare_model_randomly() - main_logger.info('Completed initialisation. Begin training.') + model.prepare_model_randomly() + main_logger.info('Completed initialisation. Begin training..') #Train model - model0.do_training(options.termination_interval, options.threshold, not options.no_model_state) - main_logger.info('Completed training. Write PWMs.') + model.do_training(options.termination_interval, options.threshold, not options.no_model_state) + main_logger.info('Completed training. Write sequence logos..') #Write results - pwm_global = model0.get_pwm_global() - pwm_global.write_to_file(job_directory + 'recovered_pwm_global.txt') - pwm_global.write_weblogo(job_directory + 'recovered_pwm_global.png') - pwm_best_sequences = model0.get_pwm_best_sequences() - pwm_best_sequences.write_to_file(job_directory + 'recovered_pwm_best_sequences.txt') - pwm_best_sequences.write_weblogo(job_directory + 'recovered_pwm_best_sequences.png') - pwm_hairpin = model0.get_pwm_hairpin() - pwm_hairpin.write_to_file(job_directory + 'recovered_pwm_hairpin.txt') - pwm_hairpin.write_weblogo(job_directory + 'recovered_pwm_hairpin.png') - main_logger.info('Completed writing PWMs. Print model graph.') + pwm_global = model.get_pwm_global() + pwm_global.write_to_file(job_directory + 'logo_global.txt') + pwm_global.write_weblogo(job_directory + 'logo_global.png') + pwm_best_sequences = model.get_pwm_best_sequences() + pwm_best_sequences.write_to_file(job_directory + 'logo_best_sequences.txt') + pwm_best_sequences.write_weblogo(job_directory + 'logo_best_sequences.png') + pwm_hairpin = model.get_pwm_hairpin() + pwm_hairpin.write_to_file(job_directory + 'logo_hairpin.txt') + pwm_hairpin.write_weblogo(job_directory + 'logo_hairpin.png') + main_logger.info('Completed writing sequence logos. Print model graph..') - graph_path = job_directory + 'model_at_end.png' - model0.model.printAsGraph(graph_path, model0.sequence_container.get_length()) - main_logger.info('Printed model graph: {0} Write model XML.'.format(graph_path)) + graph_path = job_directory + 'final_graph.png' + model.model.printAsGraph(graph_path, model.sequence_container.get_length()) + main_logger.info('Printed model graph: {0}. Write model file..'.format(graph_path)) - xml_path = job_directory + 'model_at_end.xml' - model0.model.write(xml_path) - main_logger.info('Wrote model XML:' + xml_path) + xml_path = job_directory + 'final_model.xml' + model.model.write(xml_path) + main_logger.info('Wrote model file: ' + xml_path) + main_logger.info('Finished ssHMM successfully.') if __name__ == "__main__" : sys.exit(main(sys.argv)) diff --git a/sshmm/seqstructhmm.py b/sshmm/seqstructhmm.py index 95df3ea..8979018 100644 --- a/sshmm/seqstructhmm.py +++ b/sshmm/seqstructhmm.py @@ -86,10 +86,10 @@ def generateSequenceStructureHMM(cls, alphabet, motif_length): return m - def __init__(self, identifier, jobDirectory, mainLogger, numbersLogger, sequenceContainer, motifLength, flexibility, sample_size, temperature = 1.0, delta = 0.001): - self.identifier = identifier + def __init__(self, jobDirectory, mainLogger, numbersLogger, sequenceContainer, motifLength, flexibility, + sample_size, temperature=1.0, delta=0.001): self.job_directory = jobDirectory - self.model_directory = '{jobDirectory}/model{identifier}/'.format(jobDirectory=self.job_directory, identifier=self.identifier) + self.model_directory = self.job_directory self.main_logger = mainLogger self.numbers_logger = numbersLogger self.sequence_container = sequenceContainer @@ -103,7 +103,6 @@ def __init__(self, identifier, jobDirectory, mainLogger, numbersLogger, sequence self.iteration = 0 #Create output directory for this model - os.mkdir(self.model_directory) os.mkdir('{modelDirectory}graphs'.format(modelDirectory=self.model_directory)) os.mkdir('{modelDirectory}models'.format(modelDirectory=self.model_directory)) @@ -236,7 +235,7 @@ def do_training(self, output_frequency, termination_threshold, write_model_state progressMade = True if not progressMade: - self.main_logger.info("Break model %s after %s iterations", 0, self.getIteration()) + self.main_logger.info("Terminate model after %s iterations.", self.getIteration()) last_model = last_models.as_list()[-1] return last_model @@ -279,13 +278,11 @@ def write_current_model(self): def output_current_statistics(self): - self.main_logger.info('Generate output for iteration %s', self.iteration) - #output loglikelihoods sequence_likelihood = self.calculate_sequence_loglikelihood() seq_str_likelihood = self.calculateSequenceStructureLoglikelihood() - self.main_logger.info('Current sequence loglikelihood: %s', sequence_likelihood) - self.main_logger.info('Current sequence+structure loglikelihood: %s', seq_str_likelihood) + self.main_logger.info('[Iteration {0}] Sequence loglikelihood: {1}'.format(self.iteration, sequence_likelihood)) + self.main_logger.info('[Iteration {0}] Sequence-structure loglikelihood: {1}'.format(self.iteration, seq_str_likelihood)) #output information contents (ic_1000_sequence, ic_1000_structure, ic_1000_combined) = self.calculate_information_contents_1000_sequences() @@ -293,13 +290,13 @@ def output_current_statistics(self): ic_global_structure = self.model.calculateInformationContentStructure(self.sequence_container.get_length()) ic_global_combined = self.model.calculateInformationContentCombined(self.sequence_container.get_length()) - self.main_logger.info('Average IC of sequence per position (1000 sequences): %s', sum(ic_1000_sequence) / float(self.motif_length)) - self.main_logger.info('Average IC of structure per position (1000 sequences): %s', sum(ic_1000_structure) / float(self.motif_length)) - self.main_logger.info('Average IC of sequence + structure per position (1000 sequences): %s', sum(ic_1000_combined) / float(self.motif_length)) + self.main_logger.info('[Iteration {0}] Avg. IC of sequence per position (top 1000 sequences): {1}'.format(self.iteration, sum(ic_1000_sequence) / float(self.motif_length))) + self.main_logger.info('[Iteration {0}] Avg. IC of structure per position (top 1000 sequences): {1}'.format(self.iteration, sum(ic_1000_structure) / float(self.motif_length))) + self.main_logger.info('[Iteration {0}] Avg. IC of sequence-structure per position (top 1000 sequences): {1}'.format(self.iteration, sum(ic_1000_combined) / float(self.motif_length))) - self.main_logger.info('Average IC of sequence per position (global weighted average from HMM): %s', sum(ic_global_sequence) / float(self.motif_length)) - self.main_logger.info('Average IC of structure per position (global weighted average from HMM): %s', sum(ic_global_structure) / float(self.motif_length)) - self.main_logger.info('Average IC of sequence + structure per position (global weighted average from HMM): %s', sum(ic_global_combined) / float(self.motif_length)) + self.main_logger.info('[Iteration {0}] Avg. IC of sequence per position (model): {1}'.format(self.iteration, sum(ic_global_sequence) / float(self.motif_length))) + self.main_logger.info('[Iteration {0}] Avg. IC of structure per position (model): {1}'.format(self.iteration, sum(ic_global_structure) / float(self.motif_length))) + self.main_logger.info('[Iteration {0}] Avg. IC of sequence-structure per position (model): {1}'.format(self.iteration, sum(ic_global_combined) / float(self.motif_length))) #output key statistics into concise logger self.numbers_logger.info("{0},{1},{2},{3},{4},{5},{6},{7},{8}".format(self.iteration, sequence_likelihood, seq_str_likelihood,