Skip to content

Commit

Permalink
add elongation span option to preprocess_dataset script
Browse files (browse the repository at this point in the history)
  • Loading branch information
heller committed Jul 18, 2017
1 parent 2f99df0 commit c623cb3
Showing 1 changed file with 6 additions and 5 deletions.
11 changes: 6 additions & 5 deletions bin/preprocess_dataset
Original file line number Diff line number Diff line change
Expand Up @@ -72,9 +72,10 @@ The version of the genome can be given as an optional parameter. It defaults to
'3 - enlongate bed, 4 - fetch sequences, 5 - format FASTA, '
'6 - calculate RNA shapes, 7 - calculate RNA structures')
parser.add_argument('min_score', type=float, default=0.0, help='minimum score for binding site (default: 0.0)')
parser.add_argument('--genome', type=str, default='hg19', help='genome version to use (default: hg19)')
parser.add_argument('--genome', '-g', type=str, default='hg19', help='genome version to use (default: hg19)')
parser.add_argument('--min_length', type=int, default=8, help='minimum binding site length (default: 8)')
parser.add_argument('--max_length', type=int, default=75, help='maximum binding site length (default: 75)')
parser.add_argument('--elongation', '-e', type=int, default=20, help='span for up- and downstream elongation of binding sites (default: 20)')
return parser.parse_args()

def checkPrereqs():
Expand Down Expand Up @@ -152,10 +153,10 @@ def shuffleBed(bedPositiveFileName, bedHg19FileName, bedGenesFileName, bedNegati
bedNegativeFile.close()
print>>sys.stderr, 'Written', bedNegativeFileName

def elongatingBed(bedIntervalLongFileName, bedHg19FileName, bedIntervalFileName):
def elongatingBed(bedIntervalLongFileName, bedHg19FileName, bedIntervalFileName, elongation):
bedIntervalLongFile = open(bedIntervalLongFileName, 'w')
print>>sys.stderr, '-->Elongating', bedIntervalFileName
call(['bedtools', 'slop', '-i', bedIntervalFileName, '-g', bedHg19FileName, '-b', '20'], stdout=bedIntervalLongFile)
call(['bedtools', 'slop', '-i', bedIntervalFileName, '-g', bedHg19FileName, '-b', str(elongation)], stdout=bedIntervalLongFile)
bedIntervalLongFile.close()
print>>sys.stderr, 'Written', bedIntervalLongFileName

Expand Down Expand Up @@ -219,12 +220,12 @@ def main(args):
#Elongate positive bed
bedPositiveLongFileName = options.directory + '/bed/' + options.dataset_name + '/positive_long.bed'
if options.jump_to <= 3:
elongatingBed(bedPositiveLongFileName, bedHgFileName, bedPositiveFileName)
elongatingBed(bedPositiveLongFileName, bedHgFileName, bedPositiveFileName, options.elongation)

#Elongate negative bed
bedNegativeLongFileName = options.directory + '/bed/' + options.dataset_name + '/negative_long.bed'
if options.jump_to <= 3:
elongatingBed(bedNegativeLongFileName, bedHgFileName, bedNegativeFileName)
elongatingBed(bedNegativeLongFileName, bedHgFileName, bedNegativeFileName, options.elongation)

#Fetch positive sequences
fastaTempPositiveFileName = options.directory + '/temp/' + options.dataset_name + '/positive_long.fasta'
Expand Down

0 comments on commit c623cb3

Please sign in to comment.