This repository has been archived by the owner. It is now read-only.
Permalink
Cannot retrieve contributors at this time
Name already in use
A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
SBatchGenerator/batchAlignment.pl
Go to fileThis commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
257 lines (217 sloc)
6.97 KB
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/perl | |
use warnings; | |
use strict; | |
use Getopt::Long(); | |
use POSIX qw(strftime); | |
sub usage($); | |
sub version($); | |
sub createBatchJob($$); | |
sub preparePath($$); | |
sub printSBatchHeader($$$$$$); | |
sub printSBatchScript($$$$$); | |
MAIN: | |
{ | |
# define inputs | |
my $version_tag = "version 1.0, October 2018"; | |
my $version; | |
my $help; | |
my $path_fastq; | |
my $path_genome_index; | |
my $file_genome_annotation; | |
my $path_output; | |
my $opts; | |
my $sbatch_nodes = 1; | |
my $sbatch_partition = "cuttlefish"; | |
my $sbatch_time = "100:00:00"; | |
my $sbatch_ntasks = 1; | |
my $sbatch_cpus = 16; | |
my $sbatch_name = "braintest"; | |
# set-up paramters | |
Getopt::Long::GetOptions( | |
"f|fastq=s" => \$path_fastq, | |
"o|output-dir=s" => \$path_output, | |
"g|genome=s" => \$path_genome_index, | |
"a|annotation=s" => \$file_genome_annotation, | |
"x|opts=s" => \$opts, | |
"n|nodes=i" => \$sbatch_nodes, | |
"p|partition=s" => \$sbatch_partition, | |
"t|time=s" => \$sbatch_time, | |
"T|ntasks=i" => \$sbatch_ntasks, | |
"c|cpus=i" => \$sbatch_cpus, | |
"l|name=s" => \$sbatch_name, | |
"h|help" => \$help, | |
"v|version" => \$version | |
) or usage("Error::invalid command line options"); | |
# parse inputs | |
version($version_tag) if($version); | |
usage($version_tag) if($help); | |
usage("Error::path to FastQ files is required") unless defined($path_fastq); | |
usage("Error::path to output location is required") unless defined($path_output); | |
usage("Error::path to genome index is required") unless defined($path_genome_index); | |
usage("Error::genome annotation file is required") unless defined($file_genome_annotation); | |
# create batch job | |
my $batch_job = createBatchJob($path_fastq, $path_output); | |
# print SBatch header | |
printSBatchHeader($sbatch_nodes, | |
$sbatch_partition, | |
$sbatch_time, | |
$sbatch_ntasks, | |
$sbatch_cpus, | |
$sbatch_name); | |
# change working directory | |
print "cd $path_output\n\n"; | |
# create sbatch script | |
printSBatchScript($batch_job, $path_genome_index, $file_genome_annotation, $sbatch_cpus, $opts); | |
} | |
sub printSBatchScript($$$$$) | |
{ | |
my $batch_job = $_[0]; | |
my $path_genome_index = $_[1]; | |
my $file_genome_annotation = $_[2]; | |
my $cpus = $_[3]; | |
my $opts = $_[4]; | |
my $command = "STAR" . | |
" --runMode alignReads" . | |
" --genomeDir $path_genome_index" . | |
" --readFilesCommand zcat" . | |
" --outStd Log" . | |
" --outSAMtype BAM SortedByCoordinate" . | |
" --outSAMstrandField intronMotif" . | |
" --outFilterIntronMotifs RemoveNoncanonical" . | |
" --alignSoftClipAtReferenceEnds No" . | |
" --outFilterScoreMinOverLread 0.25" . | |
" --outFilterMatchNminOverLread 0.25"; | |
foreach my $tag (sort keys %{$batch_job->{"file"}{"fastq"}}) | |
{ | |
my $file_fastq = $batch_job->{"file"}{"fastq"}{$tag}; | |
my $file_bam = $batch_job->{"path"}{"bam"} . "/" . $batch_job->{"file"}{"bam"}{$tag}; | |
my $file_log = $batch_job->{"path"}{"log"} . "/" . $batch_job->{"file"}{"log"}{$tag}; | |
print "echo $tag\n"; | |
print $command, " --readFilesIn ", $file_fastq, "\n"; | |
print "mv Aligned.sortedByCoord.out.bam ", $file_bam,"\n"; | |
print "mv Log.final.out ",$file_log,"\n"; | |
print "samtools index ", $file_bam,"\n"; | |
print "rm -f SJ.out.tab\n"; | |
print "rm -f Log.*out\n"; | |
print "\n"; | |
} | |
# count command | |
my $file_count = $batch_job->{"path"}{"count"} . "/" . $batch_job->{"file"}{"count"}; | |
$command = "featureCounts -T $cpus -t exon -g gene_id -Q 255 -a " . $file_genome_annotation . " -o " . $file_count . " \$(ls " . $batch_job->{"path"}{"bam"} . "/*.bam)"; | |
print "echo featureCounts\n"; | |
print $command,"\n"; | |
} | |
sub printSBatchHeader($$$$$$) | |
{ | |
my $nodes = $_[0]; | |
my $partition = $_[1]; | |
my $time = $_[2]; | |
my $ntasks = $_[3]; | |
my $cpus = $_[4]; | |
my $name = $_[5]; | |
# print header | |
print "#!/bin/bash\n"; | |
print "\n"; | |
print "#SBATCH --nodes=$nodes\n"; | |
print "#SBATCH --partition=$partition\n"; | |
print "#SBATCH --time=$time\n"; | |
print "#SBATCH --ntasks=$ntasks\n"; | |
print "#SBATCH --cpus-per-task=$cpus\n"; | |
print "#SBATCH --job-name=$name\n"; | |
print "#SBATCH --error=error_\%j.out\n"; | |
print "#SBATCH --output=output_\%j.out\n"; | |
print "\n"; | |
print "echo \$SLURM_SUBMIT_DIR\n"; | |
print "echo \"Running on \`hostname\`\"\n"; | |
print "\n"; | |
} | |
sub createBatchJob($$) | |
{ | |
my $path_fastq = $_[0]; | |
my $path_output = $_[1]; | |
my %job = (); | |
# clean trailing separator | |
$path_fastq =~ s/\/$//g; | |
$path_output =~ s/\/$//g; | |
# generate bam and log output paths | |
$job{"path"}{"fastq"} = $path_fastq; | |
$job{"path"}{"bam"} = preparePath($path_output, "bam"); | |
$job{"path"}{"log"} = preparePath($path_output, "log"); | |
$job{"path"}{"count"} = preparePath($path_output, "count"); | |
my @files_fastq = glob($path_fastq . "/*.fa*.gz"); | |
if (!@files_fastq) | |
{ | |
print STDERR "Error::could not retrieve FASTQ files.\n"; | |
die("\n"); | |
} | |
foreach my $file_fastq (@files_fastq) | |
{ | |
my $file_name = ($file_fastq =~ m/.*fastq\/(.*)\.fa.*\.gz$/) ? $1 : "unknown"; | |
if ($file_name eq "unknown") | |
{ | |
print STDERR "Error::could not retrieve FASTQ file name $file_fastq.\n"; | |
die("\n"); | |
} | |
$job{"file"}{"fastq"}{$file_name} = $file_fastq; | |
$job{"file"}{"bam"}{$file_name} = $file_name . ".bam"; | |
$job{"file"}{"log"}{$file_name} = $file_name . "_starlog.txt"; | |
} | |
my $datestring = strftime('%Y%m%d',localtime); | |
$job{"file"}{"count"} = "counts_" . $datestring . ".txt"; | |
return \%job; | |
} | |
sub preparePath($$) | |
{ | |
my $path = $_[0]; | |
my $tag = $_[1]; | |
$path =~ s/\/$//g; | |
my $path_new = $path . '/' . $tag; | |
if (-d $path_new) | |
{ | |
print STDERR "Warning: $path_new already exists.\n"; | |
} | |
else | |
{ | |
mkdir $path_new; | |
} | |
return $path_new; | |
} | |
sub usage($) | |
{ | |
my $message = $_[0]; | |
if (defined $message && length($message)) | |
{ | |
$message .= "\n" unless($message =~/\n$/); | |
} | |
my $command = $0; | |
$command =~ s#^.*/##; | |
print STDERR ( | |
$message, | |
"usage: $command\n" . | |
"-f|--fastq\n" . | |
"\t path to FastQ files (required)\n" . | |
"-o|--output-dir\n" . | |
"\t path to output directory (required)\n" . | |
"-g|--genome\n" . | |
"\t path to genome index (required)\n" . | |
"-p|--opts\n" . | |
"\t additional Cellranger Count parameters\n" . | |
"-h|--help\n" . | |
"\t print help message\n" . | |
"-v|--version\n" . | |
"\t print current version\n" | |
); | |
die("\n"); | |
} | |
sub version($) | |
{ | |
my $message = $_[0]; | |
print STDERR ( | |
$message,"\n", | |
"Scientific Computing Facility\n", | |
"Max-Planck Institute For Brain Research\n", | |
"bug reports to:\n", | |
"\tsciclist\@brain.mpg.de\n" | |
); | |
die("\n"); | |
} |