diff --git a/bin/preprocess_dataset b/bin/preprocess_dataset index 236ccaa..2c4dc7f 100755 --- a/bin/preprocess_dataset +++ b/bin/preprocess_dataset @@ -5,6 +5,7 @@ import argparse import threading import os from subprocess import call, check_call, CalledProcessError +from shutil import copyfile from itertools import izip from sshmm.structure_prediction import calculate_rna_shapes_from_file, calculate_rna_structures_from_file @@ -26,56 +27,43 @@ def parseArguments(args): arguments: args -- command-line arguments to the program""" parser = argparse.ArgumentParser(formatter_class=argparse.RawDescriptionHelpFormatter, - description="""Prepare a CLIP-Seq dataset in BED format for the training of an ssHMM. The following preprocessing steps are taken: -1 - Filter (positive) BED file, 2 - Shuffle (positive) BED file to generate negative dataset, 3 - Elongate positive and negative BED files for later structure prediction, 4 - Fetch genomic sequences for elongated BED files, 5 - Produce FASTA files with genomic sequences in viewpoint format, 6 - Calculate RNA shapes, 7 - Calculate RNA structures - -This script requires bedtools (shuffle, slop, getfasta), RNAshapes, and RNAstructures. - -A root directory for the datasets and a dataset name (e.g., the protein name) has to be given. The following files will be created in the root directory and its subdirectories: -- /bed//positive_raw.bed - positive BED file from CLIP-Seq experiment -- /bed//positive.bed - filtered positive BED file -- /bed//negative.bed - filtered negative BED file -- /bed//positive_long.bed - elongated positive BED file -- /bed//negative_long.bed - elongated negative BED file -- /temp//positive_long.fasta - genomic sequences of elongated positive BED file -- /temp//negative_long.fasta - genomic sequences of elongated negative BED file -- /fasta//positive.fasta - positive genomic sequences in viewpoint format -- /fasta//negative.fasta - negative genomic sequences in viewpoint format -- /shapes//positive.txt - secondary structures of positive genomic sequence (predicted by RNAshapes) -- /shapes//negative.txt - secondary structures of negative genomic sequence (predicted by RNAshapes) -- /structures//positive.txt - secondary structures of positive genomic sequence (predicted by RNAstructures) -- /structures//negative.txt - secondary structures of negative genomic sequence (predicted by RNAstructures) - -The preprocessing step to begin with can be chosen. For each step, the files generated by the previous step need to be present. To execute all steps, only the positive_raw.bed must be present. - -For the filtering step, the minimum score and binding site lengths can be defined with parameters. To fetch genomic sequences in step 4, the following file must be present in the genomes/ subdirectory: - -- /genomes/[version]/[version].genome -> BED file defining the size of the chromosomes -- /genomes/[version]/UCSCGenesTrack.bed -> BED file defining the gene intervals -- /genomes/[version]/[version].fa -> FASTA file containing the human genome - -The version of the genome can be given as an optional parameter. It defaults to 'hg19'. The files for the genomes/ directory can be obtained from UCSC: + description="""Pipeline for the preparation of a CLIP-Seq dataset in BED format. The pipeline consists of the following steps: +1 - Filter BED file +2 - Elongate BED file for later structure prediction +3 - Fetch genomic sequences for elongated BED file +4 - Produce FASTA file with genomic sequences in viewpoint format +5 - Secondary structure prediction with RNAshapes +6 - Secondary structure prediction with RNAstructures + +DEPENDENCIES: This script requires bedtools (shuffle, slop, getfasta), RNAshapes, and RNAstructures. + +A working directory and a dataset name (e.g., the protein name) have to be given. The output files can be found in: +- /fasta//positive.fasta - genomic sequences in viewpoint format +- /shapes//positive.txt - secondary structures of genomic sequence (predicted by RNAshapes) +- /structures//positive.txt - secondary structures of genomic sequence (predicted by RNAstructures) + +For classification, a negative set of binding sites with shuffled coordinates can be generated with the --generate_negative option. +For this option, gene boundaries are required and need to be given as --genome_genes. They can be downloaded e.g. from the UCSC table +browser (http://genome.ucsc.edu/cgi-bin/hgTables). Choose the most recent GENCODE track (currently GENCODE Gene V24lift37->Basic (for hg19) +and All GENCODE V24->Basic (for hg38)) and 'BED' as output format. +""") + parser.add_argument('working_dir', type=str, help='working/output directory') + parser.add_argument('dataset_name', type=str, help='dataset name') + parser.add_argument('input', type=str, help='input file in .bed format') + parser.add_argument('genome', type=str, help='reference genome in FASTA format') + parser.add_argument('genome_sizes', type=str, help='chromosome sizes of reference genome (e.g. from http://hgdownload.soe.ucsc.edu/goldenPath/hg19/bigZips/hg19.chrom.sizes)') -/genomes/[version]/[version].genome: --> download from http://hgdownload.soe.ucsc.edu/downloads.html#human (Full data set), e.g. http://hgdownload.soe.ucsc.edu/goldenPath/hg19/bigZips/hg19.chrom.sizes + parser.add_argument('--disable_filtering', '-f', action='store_true', help='skip the filtering step') + parser.add_argument('--disable_RNAshapes', action='store_true', help='skip secondary structure prediction with RNAshapes') + parser.add_argument('--disable_RNAstructure', action='store_true', help='skip secondary structure prediction with RNAstructures') + parser.add_argument('--generate_negative', '-n', action='store_true', help='generate a negative set for classification') -/genomes/[version]/UCSCGenesTrack.bed: --> download in table browser (http://genome.ucsc.edu/cgi-bin/hgTables); choose most recent GENCODE track (currently GENCODE Gene V24lift37->Basic (for hg19) and All GENCODE V24->Basic (for hg38)) and 'BED' as output format + parser.add_argument('--min_score', type=float, default=0.0, help='filtering: minimum score for binding site (default: 0.0)') + parser.add_argument('--min_length', type=int, default=8, help='filtering: minimum binding site length (default: 8)') + parser.add_argument('--max_length', type=int, default=75, help='filtering: maximum binding site length (default: 75)') + parser.add_argument('--elongation', type=int, default=20, help='elongation: span for up- and downstream elongation of binding sites (default: 20)') + parser.add_argument('--genome_genes', type=str, default = "", help='negative set generation: gene boundaries') -/genomes/[version]/[version].fa: --> download chromosomes from http://hgdownload.soe.ucsc.edu/downloads.html; e.g. wget --timestamping 'ftp://hgdownload.cse.ucsc.edu/goldenPath/hg19/chromosomes/*'; concatenate chromosomes with cat and print into .fa file (e.g. with 'zcat chr* > hg19.fa') -""") - parser.add_argument('directory', type=str, help='root directory for data') - parser.add_argument('dataset_name', type=str, help='dataset name') - parser.add_argument('jump_to', type=int, default=1, help='preprocessing step to jump to (as integer): ' - '1 - filter bed, 2 - shuffle bed, ' - '3 - enlongate bed, 4 - fetch sequences, 5 - format FASTA, ' - '6 - calculate RNA shapes, 7 - calculate RNA structures') - parser.add_argument('min_score', type=float, default=0.0, help='minimum score for binding site (default: 0.0)') - parser.add_argument('--genome', '-g', type=str, default='hg19', help='genome version to use (default: hg19)') - parser.add_argument('--min_length', type=int, default=8, help='minimum binding site length (default: 8)') - parser.add_argument('--max_length', type=int, default=75, help='maximum binding site length (default: 75)') - parser.add_argument('--elongation', '-e', type=int, default=20, help='span for up- and downstream elongation of binding sites (default: 20)') parser.add_argument('--skip_check', '-s', action='store_true', help='skip check for installed prerequisites') return parser.parse_args() @@ -87,19 +75,18 @@ def checkPrereqs(): if e.errno == os.errno.ENOENT: print>>sys.stderr, "ERROR: bedtools not found. Tried to execute 'bedtools --version'." return False - else: - # Something else went wrong while trying to run `bedtools` - raise + except CalledProcessError: + print>>sys.stderr, "ERROR: bedtools fails with option --version." + return False try: check_call(["RNAshapes", "-v"], stdout=devnull, stderr=devnull) except OSError as e: - if e.errno == os.errno.ENOENT: - print>>sys.stderr, "ERROR: RNAshapes not found. Tried to execute 'RNAshapes -v'." - return False - else: - # Something else went wrong while trying to run `RNAshapes` - raise + print>>sys.stderr, "ERROR: RNAshapes not found. Tried to execute 'RNAshapes -v'." + return False + except CalledProcessError: + print>>sys.stderr, "ERROR: RNAshapes fails with option -v. Is a wrong version installed?" + return False try: check_call(["which", "Fold"], stdout=devnull, stderr=devnull) @@ -127,7 +114,7 @@ def prepareFolderStructure(directory, proteinName): return True def filteringBed(filteredBedFileName, rawBedFileName, genomeFileName, lowest_score, min_length, max_length): - print>>sys.stderr, '###STEP 1: Filtering positive raw bed-file' + print>>sys.stderr, '###STEP 1: Filter raw bed-file' print>>sys.stderr, 'INPUT:', rawBedFileName print>>sys.stderr, 'OUTPUT:', filteredBedFileName @@ -158,38 +145,38 @@ def filteringBed(filteredBedFileName, rawBedFileName, genomeFileName, lowest_sco print>>sys.stderr, 'STEP 1 finished (filtering resulted in {0} good binding sites)'.format(num_filtered) def shuffleBed(bedPositiveFileName, bedHg19FileName, genesFileName, bedNegativeFileName): - print>>sys.stderr, '###STEP 2: Shuffling positive binding sites to obtain negative binding sites' + print>>sys.stderr, '###STEP 1: Shuffle positive binding sites to obtain negative binding sites' print>>sys.stderr, 'INPUT:', bedPositiveFileName - print>>sys.stderr, 'OUTPUT:', bedNegativeFileName - + print>>sys.stderr, 'OUTPUT:', bedNegativeFileName + bedNegativeFile = open(bedNegativeFileName, 'w') call(['bedtools', 'shuffle', '-i', bedPositiveFileName, '-g', bedHg19FileName, '-incl', genesFileName], stdout=bedNegativeFile) bedNegativeFile.close() - print>>sys.stderr, 'STEP 2 finished' + print>>sys.stderr, 'STEP 1 finished' def elongatingBed(bedIntervalLongFileName, genomeFileName, bedIntervalFileName, elongation): bedIntervalLongFile = open(bedIntervalLongFileName, 'w') - print>>sys.stderr, '###STEP 3: Elongating binding sites by', elongation, 'nt for structure prediction' + print>>sys.stderr, '###STEP 2: Elongate binding sites by', elongation, 'nt for structure prediction' print>>sys.stderr, 'INPUT:', bedIntervalFileName print>>sys.stderr, 'OUTPUT:', bedIntervalLongFileName call(['bedtools', 'slop', '-i', bedIntervalFileName, '-g', genomeFileName, '-b', str(elongation)], stdout=bedIntervalLongFile) bedIntervalLongFile.close() - print>>sys.stderr, 'STEP 3 finished' + print>>sys.stderr, 'STEP 2 finished' def fetchingSequences(genomeFastaFileName, fastaTempFileName, bedIntervalLongFileName): - print>>sys.stderr, '###STEP 4: Fetching nucleotide sequence for elongated binding sites' + print>>sys.stderr, '###STEP 3: Fetch nucleotide sequence for elongated binding sites' print>>sys.stderr, 'INPUT:', bedIntervalLongFileName print>>sys.stderr, 'OUTPUT:', fastaTempFileName call(['fastaFromBed', '-s', '-fi', genomeFastaFileName, '-bed', bedIntervalLongFileName, '-fo', fastaTempFileName]) - print>>sys.stderr, 'STEP 4 finished' + print>>sys.stderr, 'STEP 3 finished' def formatFasta(fastaFormattedFileName, bedIntervalFileName, bedIntervalLongFileName, fastaTempFileName, genome): - print>>sys.stderr, '###STEP 5: Reformatting nucleotide sequences into viewpoint format (binding sites in uppercase, elongation in lowercase)' + print>>sys.stderr, '###STEP 4: Reformat nucleotide sequences into viewpoint format (binding sites in uppercase, elongation in lowercase)' print>>sys.stderr, 'INPUT:', fastaTempFileName print>>sys.stderr, 'OUTPUT:', fastaFormattedFileName @@ -240,7 +227,7 @@ def formatFasta(fastaFormattedFileName, bedIntervalFileName, bedIntervalLongFile fastaFormattedFile.close() fastaTempFile.close() - print>>sys.stderr, 'STEP 5 finished' + print>>sys.stderr, 'STEP 4 finished' return True @@ -252,101 +239,104 @@ def main(args): if not options.skip_check and not checkPrereqs(): return + #Check for gene boundaries when negative set shall be generated + if options.generate_negative and options.genome_genes == "": + print>>sys.stderr, "ERROR: Generating a negative set requires gene boundaries for the genome (see help message)" + return + #Prepare folder structure - if not(prepareFolderStructure(options.directory, options.dataset_name)): + if not(prepareFolderStructure(options.working_dir, options.dataset_name)): return - #Genome paths - genomeFileName = '{0}/genomes/{1}/{1}.genome'.format(options.directory, options.genome) - genesFileName = '{0}/genomes/{1}/UCSCGenesTrack.bed'.format(options.directory, options.genome) - genomeFastaFileName = '{0}/genomes/{1}/{1}.fa'.format(options.directory, options.genome) - - #Filtering bed - bedFileName = options.directory + '/bed/' + options.dataset_name + '/positive_raw.bed' - bedPositiveFileName = options.directory + '/bed/' + options.dataset_name + '/positive.bed' - if options.jump_to <= 1: - filteringBed(bedPositiveFileName, bedFileName, genomeFileName, options.min_score, options.min_length, options.max_length) - - #Shuffle positive bed to get negative bed - bedNegativeFileName = options.directory + '/bed/' + options.dataset_name + '/negative.bed' - if options.jump_to <= 2: - shuffleBed(bedPositiveFileName, genomeFileName, genesFileName, bedNegativeFileName) - - #Elongate positive bed - bedPositiveLongFileName = options.directory + '/bed/' + options.dataset_name + '/positive_long.bed' - if options.jump_to <= 3: - elongatingBed(bedPositiveLongFileName, genomeFileName, bedPositiveFileName, options.elongation) - - #Elongate negative bed - bedNegativeLongFileName = options.directory + '/bed/' + options.dataset_name + '/negative_long.bed' - if options.jump_to <= 3: - elongatingBed(bedNegativeLongFileName, genomeFileName, bedNegativeFileName, options.elongation) - - #Fetch positive sequences - fastaTempPositiveFileName = options.directory + '/temp/' + options.dataset_name + '/positive_long.fasta' - if options.jump_to <= 4: - fetchingSequences(genomeFastaFileName, fastaTempPositiveFileName, bedPositiveLongFileName) - - #Fetch negative sequences - fastaTempNegativeFileName = options.directory + '/temp/' + options.dataset_name + '/negative_long.fasta' - if options.jump_to <= 4: - fetchingSequences(genomeFastaFileName, fastaTempNegativeFileName, bedNegativeLongFileName) - - #Format positive FASTA - fastaPositiveFileName = options.directory + '/fasta/' + options.dataset_name + '/positive.fasta' - if options.jump_to <= 5: - if not formatFasta(fastaPositiveFileName, bedPositiveFileName, bedPositiveLongFileName, fastaTempPositiveFileName, options.genome): - return - - #Format negative FASTA - fastaNegativeFileName = options.directory + '/fasta/' + options.dataset_name + '/negative.fasta' - if options.jump_to <= 5: - if not formatFasta(fastaNegativeFileName, bedNegativeFileName, bedNegativeLongFileName, fastaTempNegativeFileName, options.genome): - return + #Copy input file into working directory + bedFileName = options.working_dir + '/bed/' + options.dataset_name + '/positive_raw.bed' + bedPositiveFileName = options.working_dir + '/bed/' + options.dataset_name + '/positive.bed' + if options.disable_filtering: + copyfile(options.input, bedPositiveFileName) + else: + copyfile(options.input, bedFileName) + + #Filter bed + if not options.disable_filtering: + filteringBed(bedPositiveFileName, bedFileName, options.genome_sizes, options.min_score, options.min_length, options.max_length) + + #Elongate bed + bedPositiveLongFileName = options.working_dir + '/bed/' + options.dataset_name + '/positive_long.bed' + elongatingBed(bedPositiveLongFileName, options.genome_sizes, bedPositiveFileName, options.elongation) + + #Fetch sequences + fastaTempPositiveFileName = options.working_dir + '/temp/' + options.dataset_name + '/positive_long.fasta' + fetchingSequences(options.genome, fastaTempPositiveFileName, bedPositiveLongFileName) + + #Format FASTA + fastaPositiveFileName = options.working_dir + '/fasta/' + options.dataset_name + '/positive.fasta' + if not formatFasta(fastaPositiveFileName, bedPositiveFileName, bedPositiveLongFileName, fastaTempPositiveFileName, options.genome): + return - #Calculate positive RNA shapes - shapePositiveFileName = options.directory + '/shapes/' + options.dataset_name + '/positive.txt' - if options.jump_to <= 6: - print>>sys.stderr, '###STEP 6: Calculating RNAshapes' + #Calculate RNA shapes + if not options.disable_RNAshapes: + shapePositiveFileName = options.working_dir + '/shapes/' + options.dataset_name + '/positive.txt' + print>>sys.stderr, '###STEP 5: Calculate RNAshapes' print>>sys.stderr, 'INPUT:', fastaPositiveFileName print>>sys.stderr, 'OUTPUT:', shapePositiveFileName calculate_rna_shapes_from_file(shapePositiveFileName, fastaPositiveFileName, 10) - print>>sys.stderr, 'STEP 6 finished' + print>>sys.stderr, 'STEP 5 finished' - #Calculate negative RNA shapes - shapeNegativeFileName = options.directory + '/shapes/' + options.dataset_name + '/negative.txt' - if options.jump_to <= 6: - print>>sys.stderr, '###STEP 6: Calculating RNAshapes' - print>>sys.stderr, 'INPUT:', fastaNegativeFileName - print>>sys.stderr, 'OUTPUT:', shapeNegativeFileName - - calculate_rna_shapes_from_file(shapeNegativeFileName, fastaNegativeFileName, 10) - - print>>sys.stderr, 'STEP 6 finished' - - #Calculate positive RNA structures - structuresPositiveFileName = options.directory + '/structures/' + options.dataset_name + '/positive.txt' - if options.jump_to <= 7: - print>>sys.stderr, '###STEP 7: Calculating RNAstructures' + #Calculate RNA structures + if not options.disable_RNAstructure: + structuresPositiveFileName = options.working_dir + '/structures/' + options.dataset_name + '/positive.txt' + print>>sys.stderr, '###STEP 6: Calculate RNAstructures' print>>sys.stderr, 'INPUT:', fastaPositiveFileName print>>sys.stderr, 'OUTPUT:', structuresPositiveFileName calculate_rna_structures_from_file(structuresPositiveFileName, fastaPositiveFileName) - print>>sys.stderr, 'STEP 7 finished' - - #Calculate negative RNA structures - structuresNegativeFileName = options.directory + '/structures/' + options.dataset_name + '/negative.txt' - if options.jump_to <= 7: - print>>sys.stderr, '###STEP 7: Calculating RNAstructures' - print>>sys.stderr, 'INPUT:', fastaNegativeFileName - print>>sys.stderr, 'OUTPUT:', structuresNegativeFileName - - calculate_rna_structures_from_file(structuresNegativeFileName, fastaNegativeFileName) - - print>>sys.stderr, 'STEP 7 finished' + print>>sys.stderr, 'STEP 6 finished' + + + if options.generate_negative: + print>>sys.stderr, '###Generate a negative set for classification' + + #Shuffle positive bed to get negative bed + bedNegativeFileName = options.working_dir + '/bed/' + options.dataset_name + '/negative.bed' + shuffleBed(bedPositiveFileName, options.genome_sizes, options.genome_genes, bedNegativeFileName) + + #Elongate negative bed + bedNegativeLongFileName = options.working_dir + '/bed/' + options.dataset_name + '/negative_long.bed' + elongatingBed(bedNegativeLongFileName, options.genome_sizes, bedNegativeFileName, options.elongation) + + #Fetch negative sequences + fastaTempNegativeFileName = options.working_dir + '/temp/' + options.dataset_name + '/negative_long.fasta' + fetchingSequences(options.genome, fastaTempNegativeFileName, bedNegativeLongFileName) + + #Format negative FASTA + fastaNegativeFileName = options.working_dir + '/fasta/' + options.dataset_name + '/negative.fasta' + if not formatFasta(fastaNegativeFileName, bedNegativeFileName, bedNegativeLongFileName, fastaTempNegativeFileName, options.genome): + return + + #Calculate negative RNA shapes + shapeNegativeFileName = options.working_dir + '/shapes/' + options.dataset_name + '/negative.txt' + if not options.disable_RNAshapes: + print>>sys.stderr, '###STEP 6: Calculating RNAshapes' + print>>sys.stderr, 'INPUT:', fastaNegativeFileName + print>>sys.stderr, 'OUTPUT:', shapeNegativeFileName + + calculate_rna_shapes_from_file(shapeNegativeFileName, fastaNegativeFileName, 10) + + print>>sys.stderr, 'STEP 6 finished' + + #Calculate negative RNA structures + structuresNegativeFileName = options.working_dir + '/structures/' + options.dataset_name + '/negative.txt' + if not options.disable_RNAstructure: + print>>sys.stderr, '###STEP 7: Calculating RNAstructures' + print>>sys.stderr, 'INPUT:', fastaNegativeFileName + print>>sys.stderr, 'OUTPUT:', structuresNegativeFileName + + calculate_rna_structures_from_file(structuresNegativeFileName, fastaNegativeFileName) + + print>>sys.stderr, 'STEP 7 finished' if __name__ == "__main__" : sys.exit(main(sys.argv))