From c623cb3b99c9670720e4b0f7a6a25d95e667d380 Mon Sep 17 00:00:00 2001
From: David Heller <heller_d@molgen.mpg.de>
Date: Tue, 18 Jul 2017 15:49:11 +0200
Subject: [PATCH] add elongation span option to preprocess_dataset script

---
 bin/preprocess_dataset | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/bin/preprocess_dataset b/bin/preprocess_dataset
index 2f51efd..d800aec 100755
--- a/bin/preprocess_dataset
+++ b/bin/preprocess_dataset
@@ -72,9 +72,10 @@ The version of the genome can be given as an optional parameter. It defaults to
                                                              '3 - enlongate bed, 4 - fetch sequences, 5 - format FASTA, '
                                                              '6 - calculate RNA shapes, 7 - calculate RNA structures')
     parser.add_argument('min_score', type=float, default=0.0, help='minimum score for binding site (default: 0.0)')
-    parser.add_argument('--genome', type=str, default='hg19', help='genome version to use (default: hg19)')
+    parser.add_argument('--genome', '-g', type=str, default='hg19', help='genome version to use (default: hg19)')
     parser.add_argument('--min_length', type=int, default=8, help='minimum binding site length (default: 8)')
     parser.add_argument('--max_length', type=int, default=75, help='maximum binding site length (default: 75)')
+    parser.add_argument('--elongation', '-e', type=int, default=20, help='span for up- and downstream elongation of binding sites (default: 20)')
     return parser.parse_args()
 
 def checkPrereqs():
@@ -152,10 +153,10 @@ def shuffleBed(bedPositiveFileName, bedHg19FileName, bedGenesFileName, bedNegati
     bedNegativeFile.close()
     print>>sys.stderr, 'Written', bedNegativeFileName
 
-def elongatingBed(bedIntervalLongFileName, bedHg19FileName, bedIntervalFileName):
+def elongatingBed(bedIntervalLongFileName, bedHg19FileName, bedIntervalFileName, elongation):
     bedIntervalLongFile = open(bedIntervalLongFileName, 'w')
     print>>sys.stderr, '-->Elongating', bedIntervalFileName
-    call(['bedtools', 'slop', '-i', bedIntervalFileName, '-g', bedHg19FileName, '-b', '20'], stdout=bedIntervalLongFile)
+    call(['bedtools', 'slop', '-i', bedIntervalFileName, '-g', bedHg19FileName, '-b', str(elongation)], stdout=bedIntervalLongFile)
     bedIntervalLongFile.close()
     print>>sys.stderr, 'Written', bedIntervalLongFileName
 
@@ -219,12 +220,12 @@ def main(args):
     #Elongate positive bed
     bedPositiveLongFileName = options.directory + '/bed/' + options.dataset_name + '/positive_long.bed'
     if options.jump_to <= 3:
-        elongatingBed(bedPositiveLongFileName, bedHgFileName, bedPositiveFileName)
+        elongatingBed(bedPositiveLongFileName, bedHgFileName, bedPositiveFileName, options.elongation)
 
     #Elongate negative bed
     bedNegativeLongFileName = options.directory + '/bed/' + options.dataset_name + '/negative_long.bed'
     if options.jump_to <= 3:
-        elongatingBed(bedNegativeLongFileName, bedHgFileName, bedNegativeFileName)
+        elongatingBed(bedNegativeLongFileName, bedHgFileName, bedNegativeFileName, options.elongation)
 
     #Fetch positive sequences
     fastaTempPositiveFileName = options.directory + '/temp/' + options.dataset_name + '/positive_long.fasta'