diff --git a/bin/preprocess_dataset b/bin/preprocess_dataset index a7b33ff..94bcedd 100755 --- a/bin/preprocess_dataset +++ b/bin/preprocess_dataset @@ -306,7 +306,7 @@ def main(args): print>>sys.stderr, 'INPUT:', fastaPositiveFileName print>>sys.stderr, 'OUTPUT:', shapePositiveFileName - calculate_rna_shapes_from_file(shapePositiveFileName, fastaPositiveFileName) + calculate_rna_shapes_from_file(shapePositiveFileName, fastaPositiveFileName, 10) print>>sys.stderr, 'STEP 6 finished' @@ -317,7 +317,7 @@ def main(args): print>>sys.stderr, 'INPUT:', fastaPositiveFileName print>>sys.stderr, 'OUTPUT:', shapePositiveFileName - calculate_rna_shapes_from_file(shapeNegativeFileName, fastaNegativeFileName) + calculate_rna_shapes_from_file(shapeNegativeFileName, fastaNegativeFileName, 10) print>>sys.stderr, 'STEP 6 finished' diff --git a/sshmm/structure_prediction.py b/sshmm/structure_prediction.py index 580fd14..c0b7fab 100644 --- a/sshmm/structure_prediction.py +++ b/sshmm/structure_prediction.py @@ -13,7 +13,7 @@ def translateIntoContexts(dotBracketString): contextString = contextString1.replace('T', 'E') return contextString -def calculate_rna_shapes_from_file(output, fastaFileName): +def calculate_rna_shapes_from_file(output, fastaFileName, shapesPerSequence): contextsFile = open(output, 'w') proc = Popen(['RNAshapes', '-r', '-o', '1', '-f', fastaFileName], stdout=PIPE, bufsize=-1) @@ -26,12 +26,15 @@ def calculate_rna_shapes_from_file(output, fastaFileName): firstField = fields[0] if firstField.startswith('>'): print>>contextsFile, firstField + num_shapes = 0 else: match1 = dotBracketPattern.search(firstField) if match1 and match1.end() == len(firstField): - contexts = translateIntoContexts(firstField) - prob = float(fields[2][1:-1]) - print>>contextsFile, contexts, prob + num_shapes += 1 + if num_shapes <= shapesPerSequence: + contexts = translateIntoContexts(firstField) + prob = float(fields[2][1:-1]) + print>>contextsFile, contexts, prob def calculate_rna_shapes_from_sequence(nucleotide_sequence):