Skip to content

Commit

Permalink
Browse files Browse the repository at this point in the history
improve usability: simplify output directory structure; clarify info messages on stdout and in log; clarify output file names
  • Loading branch information
heller committed Feb 23, 2017
1 parent 4bd34ff commit b1938d6
Show file tree
Hide file tree
Showing 3 changed files with 61 additions and 54 deletions.
42 changes: 25 additions & 17 deletions bin/batch_seqstructhmm
Expand Up @@ -54,7 +54,7 @@ def parseArguments(args):

def do_training_for_configuration(data_directory, batch_job_directory, protein, structure_type, motif_length, baum_welch, flexibility, block_size, termination_interval, threshold):
# Create output directory for job
job_directory = '{0}/{1}_{2}_ml{3}_fl{4}_bs{5}_ti{6}_tt{7}'.format(batch_job_directory, protein, structure_type, motif_length, flexibility, block_size, termination_interval, threshold)
job_directory = '{0}/{1}_{2}_ml{3}_fl{4}_bs{5}_ti{6}_tt{7}/'.format(batch_job_directory, protein, structure_type, motif_length, flexibility, block_size, termination_interval, threshold)
os.mkdir(job_directory)

random.seed() #reinitialize the random number generator because else random numbers would repeat
Expand All @@ -70,29 +70,33 @@ def do_training_for_configuration(data_directory, batch_job_directory, protein,
main_logger.info('Read %s training sequences', training_sequence_container.get_length())

# Initialize model
model0 = SeqStructHMM(0, job_directory, main_logger, numbers_logger, training_sequence_container, motif_length, flexibility, block_size)
model = SeqStructHMM(job_directory, main_logger, numbers_logger, training_sequence_container, motif_length,
flexibility, block_size)
if baum_welch:
best_baumwelch_sequence_model = seq_hmm.find_best_baumwelch_sequence_models(motif_length,
training_sequence_container,
main_logger)
best_viterbi_paths = best_baumwelch_sequence_model[1]
model0.prepare_model_with_viterbi(best_viterbi_paths)
model.prepare_model_with_viterbi(best_viterbi_paths)
else:
model0.prepare_model_randomly()
model.prepare_model_randomly()

# Train model
key_statistics = model0.do_training(termination_interval, threshold, write_model_state=True)[0]
key_statistics = model.do_training(termination_interval, threshold, write_model_state=True)[0]

# Write results
pwm_global = model0.get_pwm_global()
pwm_global.write_to_file(job_directory + 'recovered_pwm_global.txt')
pwm_global.write_weblogo(job_directory + 'recovered_pwm_global.png')
pwm_best_sequences = model0.get_pwm_best_sequences()
pwm_best_sequences.write_to_file(job_directory + 'recovered_pwm_best_sequences.txt')
pwm_best_sequences.write_weblogo(job_directory + 'recovered_pwm_best_sequences.png')

model0.model.printAsGraph(job_directory + 'model_at_end.png', model0.sequence_container.get_length())
model0.model.write(job_directory + 'model_at_end.xml')
pwm_global = model.get_pwm_global()
pwm_global.write_to_file(job_directory + 'logo_global.txt')
pwm_global.write_weblogo(job_directory + 'logo_global.png')
pwm_best_sequences = model.get_pwm_best_sequences()
pwm_best_sequences.write_to_file(job_directory + 'logo_best_sequences.txt')
pwm_best_sequences.write_weblogo(job_directory + 'logo_best_sequences.png')
pwm_hairpin = model.get_pwm_hairpin()
pwm_hairpin.write_to_file(job_directory + 'logo_hairpin.txt')
pwm_hairpin.write_weblogo(job_directory + 'logo_hairpin.png')

model.model.printAsGraph(job_directory + 'final_graph.png', model.sequence_container.get_length())
model.model.write(job_directory + 'final_model.xml')
return [protein] + key_statistics


Expand All @@ -117,12 +121,15 @@ def main(args):
jobs = itertools.product([options.data_directory], [batch_job_directory], proteins, [options.structure_type],
[options.motif_length], [True], [options.flexibility],
[options.block_size], [options.termination_interval], [options.threshold])
batch_logger.info('Started batch run on {0} proteins.'.format(len(proteins)))

#Determine number of cores to use
if options.cores:
pool_size = options.cores
else:
pool_size = min(multiprocessing.cpu_count() - 1, len(proteins))

batch_logger.info('Using {0} cores.'.format(pool_size))
pool = multiprocessing.Pool(processes=pool_size,
initializer=start_process,
initargs=[batch_logger],
Expand All @@ -134,10 +141,11 @@ def main(args):
pool.close() # no more tasks
pool.join() # wrap up current tasks

batch_logger.info('All jobs done! Results:')
batch_logger.info('protein,iteration_number,sequence_ll,sequence_structure_ll')
batch_logger.info('Finished all proteins.')
batch_logger.info('Result statistics:')
batch_logger.info('\t'.join(['Protein', 'Number of iterations', 'Sequence loglikelihood', 'Sequence-structure loglikelihood']))
for configuration in results:
batch_logger.info(','.join(map(str, configuration)))
batch_logger.info('\t'.join(map(str, configuration)))

if __name__ == "__main__" :
sys.exit(main(sys.argv))
46 changes: 24 additions & 22 deletions bin/train_seqstructhmm
Expand Up @@ -81,38 +81,40 @@ def main(args):
main_logger.info('Finished reading %s training sequences', training_sequence_container.get_length())

#Initialize model
model0 = SeqStructHMM(0, job_directory, main_logger, numbers_logger, training_sequence_container, options.motif_length, options.flexibility, options.block_size)
model = SeqStructHMM(job_directory, main_logger, numbers_logger, training_sequence_container, options.motif_length,
options.flexibility, options.block_size)
if options.baum_welch:
best_baumwelch_sequence_model = seq_hmm.find_best_baumwelch_sequence_models(options.motif_length, training_sequence_container, main_logger)
best_viterbi_paths = best_baumwelch_sequence_model[1]
model0.prepare_model_with_viterbi(best_viterbi_paths)
model.prepare_model_with_viterbi(best_viterbi_paths)
else:
model0.prepare_model_randomly()
main_logger.info('Completed initialisation. Begin training.')
model.prepare_model_randomly()
main_logger.info('Completed initialisation. Begin training..')

#Train model
model0.do_training(options.termination_interval, options.threshold, not options.no_model_state)
main_logger.info('Completed training. Write PWMs.')
model.do_training(options.termination_interval, options.threshold, not options.no_model_state)
main_logger.info('Completed training. Write sequence logos..')

#Write results
pwm_global = model0.get_pwm_global()
pwm_global.write_to_file(job_directory + 'recovered_pwm_global.txt')
pwm_global.write_weblogo(job_directory + 'recovered_pwm_global.png')
pwm_best_sequences = model0.get_pwm_best_sequences()
pwm_best_sequences.write_to_file(job_directory + 'recovered_pwm_best_sequences.txt')
pwm_best_sequences.write_weblogo(job_directory + 'recovered_pwm_best_sequences.png')
pwm_hairpin = model0.get_pwm_hairpin()
pwm_hairpin.write_to_file(job_directory + 'recovered_pwm_hairpin.txt')
pwm_hairpin.write_weblogo(job_directory + 'recovered_pwm_hairpin.png')
main_logger.info('Completed writing PWMs. Print model graph.')
pwm_global = model.get_pwm_global()
pwm_global.write_to_file(job_directory + 'logo_global.txt')
pwm_global.write_weblogo(job_directory + 'logo_global.png')
pwm_best_sequences = model.get_pwm_best_sequences()
pwm_best_sequences.write_to_file(job_directory + 'logo_best_sequences.txt')
pwm_best_sequences.write_weblogo(job_directory + 'logo_best_sequences.png')
pwm_hairpin = model.get_pwm_hairpin()
pwm_hairpin.write_to_file(job_directory + 'logo_hairpin.txt')
pwm_hairpin.write_weblogo(job_directory + 'logo_hairpin.png')
main_logger.info('Completed writing sequence logos. Print model graph..')

graph_path = job_directory + 'model_at_end.png'
model0.model.printAsGraph(graph_path, model0.sequence_container.get_length())
main_logger.info('Printed model graph: {0} Write model XML.'.format(graph_path))
graph_path = job_directory + 'final_graph.png'
model.model.printAsGraph(graph_path, model.sequence_container.get_length())
main_logger.info('Printed model graph: {0}. Write model file..'.format(graph_path))

xml_path = job_directory + 'model_at_end.xml'
model0.model.write(xml_path)
main_logger.info('Wrote model XML:' + xml_path)
xml_path = job_directory + 'final_model.xml'
model.model.write(xml_path)
main_logger.info('Wrote model file: ' + xml_path)
main_logger.info('Finished ssHMM successfully.')

if __name__ == "__main__" :
sys.exit(main(sys.argv))
27 changes: 12 additions & 15 deletions sshmm/seqstructhmm.py
Expand Up @@ -86,10 +86,10 @@ def generateSequenceStructureHMM(cls, alphabet, motif_length):
return m


def __init__(self, identifier, jobDirectory, mainLogger, numbersLogger, sequenceContainer, motifLength, flexibility, sample_size, temperature = 1.0, delta = 0.001):
self.identifier = identifier
def __init__(self, jobDirectory, mainLogger, numbersLogger, sequenceContainer, motifLength, flexibility,
sample_size, temperature=1.0, delta=0.001):
self.job_directory = jobDirectory
self.model_directory = '{jobDirectory}/model{identifier}/'.format(jobDirectory=self.job_directory, identifier=self.identifier)
self.model_directory = self.job_directory
self.main_logger = mainLogger
self.numbers_logger = numbersLogger
self.sequence_container = sequenceContainer
Expand All @@ -103,7 +103,6 @@ def __init__(self, identifier, jobDirectory, mainLogger, numbersLogger, sequence
self.iteration = 0

#Create output directory for this model
os.mkdir(self.model_directory)
os.mkdir('{modelDirectory}graphs'.format(modelDirectory=self.model_directory))
os.mkdir('{modelDirectory}models'.format(modelDirectory=self.model_directory))

Expand Down Expand Up @@ -236,7 +235,7 @@ def do_training(self, output_frequency, termination_threshold, write_model_state
progressMade = True

if not progressMade:
self.main_logger.info("Break model %s after %s iterations", 0, self.getIteration())
self.main_logger.info("Terminate model after %s iterations.", self.getIteration())
last_model = last_models.as_list()[-1]
return last_model

Expand Down Expand Up @@ -279,27 +278,25 @@ def write_current_model(self):


def output_current_statistics(self):
self.main_logger.info('Generate output for iteration %s', self.iteration)

#output loglikelihoods
sequence_likelihood = self.calculate_sequence_loglikelihood()
seq_str_likelihood = self.calculateSequenceStructureLoglikelihood()
self.main_logger.info('Current sequence loglikelihood: %s', sequence_likelihood)
self.main_logger.info('Current sequence+structure loglikelihood: %s', seq_str_likelihood)
self.main_logger.info('[Iteration {0}] Sequence loglikelihood: {1}'.format(self.iteration, sequence_likelihood))
self.main_logger.info('[Iteration {0}] Sequence-structure loglikelihood: {1}'.format(self.iteration, seq_str_likelihood))

#output information contents
(ic_1000_sequence, ic_1000_structure, ic_1000_combined) = self.calculate_information_contents_1000_sequences()
ic_global_sequence = self.model.calculateInformationContentSequenceGlobal(self.sequence_container.get_length())
ic_global_structure = self.model.calculateInformationContentStructure(self.sequence_container.get_length())
ic_global_combined = self.model.calculateInformationContentCombined(self.sequence_container.get_length())

self.main_logger.info('Average IC of sequence per position (1000 sequences): %s', sum(ic_1000_sequence) / float(self.motif_length))
self.main_logger.info('Average IC of structure per position (1000 sequences): %s', sum(ic_1000_structure) / float(self.motif_length))
self.main_logger.info('Average IC of sequence + structure per position (1000 sequences): %s', sum(ic_1000_combined) / float(self.motif_length))
self.main_logger.info('[Iteration {0}] Avg. IC of sequence per position (top 1000 sequences): {1}'.format(self.iteration, sum(ic_1000_sequence) / float(self.motif_length)))
self.main_logger.info('[Iteration {0}] Avg. IC of structure per position (top 1000 sequences): {1}'.format(self.iteration, sum(ic_1000_structure) / float(self.motif_length)))
self.main_logger.info('[Iteration {0}] Avg. IC of sequence-structure per position (top 1000 sequences): {1}'.format(self.iteration, sum(ic_1000_combined) / float(self.motif_length)))

self.main_logger.info('Average IC of sequence per position (global weighted average from HMM): %s', sum(ic_global_sequence) / float(self.motif_length))
self.main_logger.info('Average IC of structure per position (global weighted average from HMM): %s', sum(ic_global_structure) / float(self.motif_length))
self.main_logger.info('Average IC of sequence + structure per position (global weighted average from HMM): %s', sum(ic_global_combined) / float(self.motif_length))
self.main_logger.info('[Iteration {0}] Avg. IC of sequence per position (model): {1}'.format(self.iteration, sum(ic_global_sequence) / float(self.motif_length)))
self.main_logger.info('[Iteration {0}] Avg. IC of structure per position (model): {1}'.format(self.iteration, sum(ic_global_structure) / float(self.motif_length)))
self.main_logger.info('[Iteration {0}] Avg. IC of sequence-structure per position (model): {1}'.format(self.iteration, sum(ic_global_combined) / float(self.motif_length)))

#output key statistics into concise logger
self.numbers_logger.info("{0},{1},{2},{3},{4},{5},{6},{7},{8}".format(self.iteration, sequence_likelihood, seq_str_likelihood,
Expand Down

0 comments on commit b1938d6

Please sign in to comment.