batchAlignment.pl

#!/usr/bin/perl

use warnings;
use strict;
use Getopt::Long();
use POSIX qw(strftime);

sub usage($);
sub version($);

sub createBatchJob($$);
sub preparePath($$);
sub printSBatchHeader($$$$$$);
sub printSBatchScript($$$$$);


MAIN:
{
    # define inputs
    my $version_tag = "version 1.0, October 2018";
    my $version;
    my $help;
    my $path_fastq;
    my $path_genome_index;
    my $file_genome_annotation;
    my $path_output;
    my $opts;
    my $sbatch_nodes = 1;
    my $sbatch_partition = "cuttlefish";
    my $sbatch_time = "100:00:00";
    my $sbatch_ntasks = 1;
    my $sbatch_cpus = 16;
    my $sbatch_name = "braintest";


    # set-up paramters
    Getopt::Long::GetOptions(
        "f|fastq=s" => \$path_fastq,
        "o|output-dir=s" => \$path_output,
        "g|genome=s" => \$path_genome_index,
        "a|annotation=s" => \$file_genome_annotation,
        "x|opts=s" => \$opts,
        "n|nodes=i" => \$sbatch_nodes,
        "p|partition=s" => \$sbatch_partition,
        "t|time=s" => \$sbatch_time,
        "T|ntasks=i" => \$sbatch_ntasks,
        "c|cpus=i" => \$sbatch_cpus,
        "l|name=s" => \$sbatch_name,
        "h|help" => \$help,
        "v|version" => \$version
    ) or usage("Error::invalid command line options");

    # parse inputs
    version($version_tag) if($version);
    usage($version_tag) if($help);
    usage("Error::path to FastQ files is required") unless defined($path_fastq);
    usage("Error::path to output location is required") unless defined($path_output);
    usage("Error::path to genome index is required") unless defined($path_genome_index);
    usage("Error::genome annotation file is required") unless defined($file_genome_annotation);

    # create batch job
    my $batch_job = createBatchJob($path_fastq, $path_output);

    # print SBatch header
    printSBatchHeader($sbatch_nodes,
                      $sbatch_partition,
                      $sbatch_time,
                      $sbatch_ntasks,
                      $sbatch_cpus,
                      $sbatch_name);

    # change working directory
    print "cd $path_output\n\n";

    # create sbatch script
    printSBatchScript($batch_job, $path_genome_index, $file_genome_annotation, $sbatch_cpus, $opts);

}


sub printSBatchScript($$$$$)
{
    my $batch_job = $_[0];
    my $path_genome_index = $_[1];
    my $file_genome_annotation = $_[2];
    my $cpus = $_[3];
    my $opts = $_[4];

    my $command = "STAR" .
    " --runMode alignReads" .
    " --genomeDir $path_genome_index" .
    " --readFilesCommand zcat" .
    " --outStd Log" .
    " --outSAMtype BAM SortedByCoordinate" .
    " --outSAMstrandField intronMotif" .
    " --outFilterIntronMotifs RemoveNoncanonical" .
    " --alignSoftClipAtReferenceEnds No" .
    " --outFilterScoreMinOverLread 0.25" .
    " --outFilterMatchNminOverLread 0.25";

    foreach my $tag (sort keys %{$batch_job->{"file"}{"fastq"}})
    {
        my $file_fastq = $batch_job->{"file"}{"fastq"}{$tag};
        my $file_bam = $batch_job->{"path"}{"bam"} . "/" . $batch_job->{"file"}{"bam"}{$tag};
        my $file_log = $batch_job->{"path"}{"log"} . "/" . $batch_job->{"file"}{"log"}{$tag};
        print "echo $tag\n";
        print $command, " --readFilesIn ", $file_fastq, "\n";
        print "mv Aligned.sortedByCoord.out.bam ", $file_bam,"\n";
        print "mv Log.final.out ",$file_log,"\n";
        print "samtools index ", $file_bam,"\n";
        print "rm -f SJ.out.tab\n";
        print "rm -f Log.*out\n";
        print "\n";
    }

    # count command
    my $file_count = $batch_job->{"path"}{"count"} . "/" . $batch_job->{"file"}{"count"};
    $command = "featureCounts -T $cpus -t exon -g gene_id -Q 255 -a " . $file_genome_annotation . " -o " . $file_count . " \$(ls " . $batch_job->{"path"}{"bam"} . "/*.bam)";
    print "echo featureCounts\n";
    print $command,"\n";
}


sub printSBatchHeader($$$$$$)
{
    my $nodes = $_[0];
    my $partition = $_[1];
    my $time = $_[2];
    my $ntasks = $_[3];
    my $cpus = $_[4];
    my $name = $_[5];

    # print header
    print "#!/bin/bash\n";
    print "\n";
    print "#SBATCH --nodes=$nodes\n";
    print "#SBATCH --partition=$partition\n";
    print "#SBATCH --time=$time\n";
    print "#SBATCH --ntasks=$ntasks\n";
    print "#SBATCH --cpus-per-task=$cpus\n";
    print "#SBATCH --job-name=$name\n";
    print "#SBATCH --error=error_\%j.out\n";
    print "#SBATCH --output=output_\%j.out\n";
    print "\n";
    print "echo \$SLURM_SUBMIT_DIR\n";
    print "echo \"Running on \`hostname\`\"\n";
    print "\n";
}


sub createBatchJob($$)
{
    my $path_fastq = $_[0];
    my $path_output = $_[1];
    my %job = ();

    # clean trailing separator
    $path_fastq =~ s/\/$//g;
    $path_output =~ s/\/$//g;

    # generate bam and log output paths
    $job{"path"}{"fastq"} = $path_fastq;
    $job{"path"}{"bam"} = preparePath($path_output, "bam");
    $job{"path"}{"log"} = preparePath($path_output, "log");
    $job{"path"}{"count"} = preparePath($path_output, "count");

    my @files_fastq = glob($path_fastq . "/*.fa*.gz");
    if (!@files_fastq)
    {
        print STDERR "Error::could not retrieve FASTQ files.\n";
        die("\n");
    }

    foreach my $file_fastq (@files_fastq)
    {
        my $file_name = ($file_fastq =~ m/.*fastq\/(.*)\.fa.*\.gz$/) ? $1 : "unknown";
        if ($file_name eq "unknown")
        {
            print STDERR "Error::could not retrieve FASTQ file name $file_fastq.\n";
            die("\n");
        }

        $job{"file"}{"fastq"}{$file_name} = $file_fastq;
        $job{"file"}{"bam"}{$file_name} = $file_name . ".bam";
        $job{"file"}{"log"}{$file_name} = $file_name . "_starlog.txt";
    }

    my $datestring = strftime('%Y%m%d',localtime);
    $job{"file"}{"count"} = "counts_" . $datestring . ".txt";
    return \%job;
}


sub preparePath($$)
{
    my $path = $_[0];
    my $tag = $_[1];

    $path =~ s/\/$//g;
    my $path_new = $path . '/' . $tag;
    if (-d $path_new)
    {
        print STDERR "Warning: $path_new already exists.\n";
    }
    else
    {
        mkdir $path_new;
    }

    return $path_new;
}


sub usage($)
{
    my $message = $_[0];
    if (defined $message && length($message))
    {
        $message .= "\n" unless($message =~/\n$/);
    }
    my $command = $0;
    $command =~ s#^.*/##;

    print STDERR (
        $message,
        "usage: $command\n" .
        "-f|--fastq\n" .
        "\t path to FastQ files (required)\n" .
        "-o|--output-dir\n" .
        "\t path to output directory (required)\n" .
        "-g|--genome\n" .
        "\t path to genome index (required)\n" .
        "-p|--opts\n" .
        "\t additional Cellranger Count parameters\n" .
        "-h|--help\n" .
        "\t print help message\n" .
        "-v|--version\n" .
        "\t print current version\n"
    );

    die("\n");
}

sub version($)
{
    my $message = $_[0];
    print STDERR (
    $message,"\n",
    "Scientific Computing Facility\n",
    "Max-Planck Institute For Brain Research\n",
    "bug reports to:\n",
    "\tsciclist\@brain.mpg.de\n"
    );
    die("\n");
}
	#!/usr/bin/perl

	use warnings;
	use strict;
	use Getopt::Long();
	use POSIX qw(strftime);

	sub usage($);
	sub version($);

	sub createBatchJob($$);
	sub preparePath($$);
	sub printSBatchHeader($$$$$$);
	sub printSBatchScript($$$$$);


	MAIN:
	{
	# define inputs
	my $version_tag = "version 1.0, October 2018";
	my $version;
	my $help;
	my $path_fastq;
	my $path_genome_index;
	my $file_genome_annotation;
	my $path_output;
	my $opts;
	my $sbatch_nodes = 1;
	my $sbatch_partition = "cuttlefish";
	my $sbatch_time = "100:00:00";
	my $sbatch_ntasks = 1;
	my $sbatch_cpus = 16;
	my $sbatch_name = "braintest";


	# set-up paramters
	Getopt::Long::GetOptions(
	"f\|fastq=s" => \$path_fastq,
	"o\|output-dir=s" => \$path_output,
	"g\|genome=s" => \$path_genome_index,
	"a\|annotation=s" => \$file_genome_annotation,
	"x\|opts=s" => \$opts,
	"n\|nodes=i" => \$sbatch_nodes,
	"p\|partition=s" => \$sbatch_partition,
	"t\|time=s" => \$sbatch_time,
	"T\|ntasks=i" => \$sbatch_ntasks,
	"c\|cpus=i" => \$sbatch_cpus,
	"l\|name=s" => \$sbatch_name,
	"h\|help" => \$help,
	"v\|version" => \$version
	) or usage("Error::invalid command line options");

	# parse inputs
	version($version_tag) if($version);
	usage($version_tag) if($help);
	usage("Error::path to FastQ files is required") unless defined($path_fastq);
	usage("Error::path to output location is required") unless defined($path_output);
	usage("Error::path to genome index is required") unless defined($path_genome_index);
	usage("Error::genome annotation file is required") unless defined($file_genome_annotation);

	# create batch job
	my $batch_job = createBatchJob($path_fastq, $path_output);

	# print SBatch header
	printSBatchHeader($sbatch_nodes,
	$sbatch_partition,
	$sbatch_time,
	$sbatch_ntasks,
	$sbatch_cpus,
	$sbatch_name);

	# change working directory
	print "cd $path_output\n\n";

	# create sbatch script
	printSBatchScript($batch_job, $path_genome_index, $file_genome_annotation, $sbatch_cpus, $opts);

	}




	sub printSBatchScript($$$$$)
	{
	my $batch_job = $_[0];
	my $path_genome_index = $_[1];
	my $file_genome_annotation = $_[2];
	my $cpus = $_[3];
	my $opts = $_[4];

	my $command = "STAR" .
	" --runMode alignReads" .
	" --genomeDir $path_genome_index" .
	" --readFilesCommand zcat" .
	" --outStd Log" .
	" --outSAMtype BAM SortedByCoordinate" .
	" --outSAMstrandField intronMotif" .
	" --outFilterIntronMotifs RemoveNoncanonical" .
	" --alignSoftClipAtReferenceEnds No" .
	" --outFilterScoreMinOverLread 0.25" .
	" --outFilterMatchNminOverLread 0.25";

	foreach my $tag (sort keys %{$batch_job->{"file"}{"fastq"}})
	{
	my $file_fastq = $batch_job->{"file"}{"fastq"}{$tag};
	my $file_bam = $batch_job->{"path"}{"bam"} . "/" . $batch_job->{"file"}{"bam"}{$tag};
	my $file_log = $batch_job->{"path"}{"log"} . "/" . $batch_job->{"file"}{"log"}{$tag};
	print "echo $tag\n";
	print $command, " --readFilesIn ", $file_fastq, "\n";
	print "mv Aligned.sortedByCoord.out.bam ", $file_bam,"\n";
	print "mv Log.final.out ",$file_log,"\n";
	print "samtools index ", $file_bam,"\n";
	print "rm -f SJ.out.tab\n";
	print "rm -f Log.*out\n";
	print "\n";
	}

	# count command
	my $file_count = $batch_job->{"path"}{"count"} . "/" . $batch_job->{"file"}{"count"};
	$command = "featureCounts -T $cpus -t exon -g gene_id -Q 255 -a " . $file_genome_annotation . " -o " . $file_count . " \$(ls " . $batch_job->{"path"}{"bam"} . "/*.bam)";
	print "echo featureCounts\n";
	print $command,"\n";
	}


	sub printSBatchHeader($$$$$$)
	{
	my $nodes = $_[0];
	my $partition = $_[1];
	my $time = $_[2];
	my $ntasks = $_[3];
	my $cpus = $_[4];
	my $name = $_[5];

	# print header
	print "#!/bin/bash\n";
	print "\n";
	print "#SBATCH --nodes=$nodes\n";
	print "#SBATCH --partition=$partition\n";
	print "#SBATCH --time=$time\n";
	print "#SBATCH --ntasks=$ntasks\n";
	print "#SBATCH --cpus-per-task=$cpus\n";
	print "#SBATCH --job-name=$name\n";
	print "#SBATCH --error=error_\%j.out\n";
	print "#SBATCH --output=output_\%j.out\n";
	print "\n";
	print "echo \$SLURM_SUBMIT_DIR\n";
	print "echo \"Running on \`hostname\`\"\n";
	print "\n";
	}


	sub createBatchJob($$)
	{
	my $path_fastq = $_[0];
	my $path_output = $_[1];
	my %job = ();

	# clean trailing separator
	$path_fastq =~ s/\/$//g;
	$path_output =~ s/\/$//g;

	# generate bam and log output paths
	$job{"path"}{"fastq"} = $path_fastq;
	$job{"path"}{"bam"} = preparePath($path_output, "bam");
	$job{"path"}{"log"} = preparePath($path_output, "log");
	$job{"path"}{"count"} = preparePath($path_output, "count");

	my @files_fastq = glob($path_fastq . "/.fa.gz");
	if (!@files_fastq)
	{
	print STDERR "Error::could not retrieve FASTQ files.\n";
	die("\n");
	}

	foreach my $file_fastq (@files_fastq)
	{
	my $file_name = ($file_fastq =~ m/.fastq\/(.)\.fa.*\.gz$/) ? $1 : "unknown";
	if ($file_name eq "unknown")
	{
	print STDERR "Error::could not retrieve FASTQ file name $file_fastq.\n";
	die("\n");
	}

	$job{"file"}{"fastq"}{$file_name} = $file_fastq;
	$job{"file"}{"bam"}{$file_name} = $file_name . ".bam";
	$job{"file"}{"log"}{$file_name} = $file_name . "_starlog.txt";
	}

	my $datestring = strftime('%Y%m%d',localtime);
	$job{"file"}{"count"} = "counts_" . $datestring . ".txt";
	return \%job;
	}


	sub preparePath($$)
	{
	my $path = $_[0];
	my $tag = $_[1];

	$path =~ s/\/$//g;
	my $path_new = $path . '/' . $tag;
	if (-d $path_new)
	{
	print STDERR "Warning: $path_new already exists.\n";
	}
	else
	{
	mkdir $path_new;
	}

	return $path_new;
	}


	sub usage($)
	{
	my $message = $_[0];
	if (defined $message && length($message))
	{
	$message .= "\n" unless($message =~/\n$/);
	}
	my $command = $0;
	$command =~ s#^.*/##;

	print STDERR (
	$message,
	"usage: $command\n" .
	"-f\|--fastq\n" .
	"\t path to FastQ files (required)\n" .
	"-o\|--output-dir\n" .
	"\t path to output directory (required)\n" .
	"-g\|--genome\n" .
	"\t path to genome index (required)\n" .
	"-p\|--opts\n" .
	"\t additional Cellranger Count parameters\n" .
	"-h\|--help\n" .
	"\t print help message\n" .
	"-v\|--version\n" .
	"\t print current version\n"
	);

	die("\n");
	}

	sub version($)
	{
	my $message = $_[0];
	print STDERR (
	$message,"\n",
	"Scientific Computing Facility\n",
	"Max-Planck Institute For Brain Research\n",
	"bug reports to:\n",
	"\tsciclist\@brain.mpg.de\n"
	);
	die("\n");
	}