From c623cb3b99c9670720e4b0f7a6a25d95e667d380 Mon Sep 17 00:00:00 2001 From: David Heller Date: Tue, 18 Jul 2017 15:49:11 +0200 Subject: [PATCH] add elongation span option to preprocess_dataset script --- bin/preprocess_dataset | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/bin/preprocess_dataset b/bin/preprocess_dataset index 2f51efd..d800aec 100755 --- a/bin/preprocess_dataset +++ b/bin/preprocess_dataset @@ -72,9 +72,10 @@ The version of the genome can be given as an optional parameter. It defaults to '3 - enlongate bed, 4 - fetch sequences, 5 - format FASTA, ' '6 - calculate RNA shapes, 7 - calculate RNA structures') parser.add_argument('min_score', type=float, default=0.0, help='minimum score for binding site (default: 0.0)') - parser.add_argument('--genome', type=str, default='hg19', help='genome version to use (default: hg19)') + parser.add_argument('--genome', '-g', type=str, default='hg19', help='genome version to use (default: hg19)') parser.add_argument('--min_length', type=int, default=8, help='minimum binding site length (default: 8)') parser.add_argument('--max_length', type=int, default=75, help='maximum binding site length (default: 75)') + parser.add_argument('--elongation', '-e', type=int, default=20, help='span for up- and downstream elongation of binding sites (default: 20)') return parser.parse_args() def checkPrereqs(): @@ -152,10 +153,10 @@ def shuffleBed(bedPositiveFileName, bedHg19FileName, bedGenesFileName, bedNegati bedNegativeFile.close() print>>sys.stderr, 'Written', bedNegativeFileName -def elongatingBed(bedIntervalLongFileName, bedHg19FileName, bedIntervalFileName): +def elongatingBed(bedIntervalLongFileName, bedHg19FileName, bedIntervalFileName, elongation): bedIntervalLongFile = open(bedIntervalLongFileName, 'w') print>>sys.stderr, '-->Elongating', bedIntervalFileName - call(['bedtools', 'slop', '-i', bedIntervalFileName, '-g', bedHg19FileName, '-b', '20'], stdout=bedIntervalLongFile) + call(['bedtools', 'slop', '-i', bedIntervalFileName, '-g', bedHg19FileName, '-b', str(elongation)], stdout=bedIntervalLongFile) bedIntervalLongFile.close() print>>sys.stderr, 'Written', bedIntervalLongFileName @@ -219,12 +220,12 @@ def main(args): #Elongate positive bed bedPositiveLongFileName = options.directory + '/bed/' + options.dataset_name + '/positive_long.bed' if options.jump_to <= 3: - elongatingBed(bedPositiveLongFileName, bedHgFileName, bedPositiveFileName) + elongatingBed(bedPositiveLongFileName, bedHgFileName, bedPositiveFileName, options.elongation) #Elongate negative bed bedNegativeLongFileName = options.directory + '/bed/' + options.dataset_name + '/negative_long.bed' if options.jump_to <= 3: - elongatingBed(bedNegativeLongFileName, bedHgFileName, bedNegativeFileName) + elongatingBed(bedNegativeLongFileName, bedHgFileName, bedNegativeFileName, options.elongation) #Fetch positive sequences fastaTempPositiveFileName = options.directory + '/temp/' + options.dataset_name + '/positive_long.fasta'