Skip to content
Permalink
290e9c18f6
Switch branches/tags

Name already in use

A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
Go to file
 
 
Cannot retrieve contributors at this time
296 lines (243 sloc) 7.93 KB
#!/usr/bin/perl
use warnings;
use strict;
use Getopt::Long();
sub usage($);
sub version($);
sub createOutputDirectory($);
sub useBclToFastq($$);
sub useCutadapt($$);
sub useSTAR($$);
sub useFeatureCounts($$);
MAIN:
{
# define inputs
my $version_tag = "version 1.0, June 2018";
my $version;
my $help;
my $path_illumina;
my $path_output;
my $path_genome_index;
my $path_genome_annotation;
my $adapter_seq;
# set-up paramters
Getopt::Long::GetOptions(
"r|run=s" => \$path_illumina,
"o|output-dir=s" => \$path_output,
"g|genome=s" => \$path_genome_index,
"a|annotation=s" => \$path_genome_annotation,
"h|help" => \$help,
"v|version" => \$version,
"x|adapter=s" => \$adapter_seq
) or usage("Error::invalid command line options");
# parse inputs
version($version_tag) if($version);
usage($version_tag) if($help);
usage("Error::path to illumina run is required") unless defined($path_illumina);
usage("Error::path to output location is required") unless defined($path_output);
usage("Error::path to genome index is required") unless defined($path_genome_index);
usage("Error::path to genome annotation is required") unless defined($path_genome_annotation);
usage("Error::adaptor sequence is required") unless defined($adapter_seq);
# create directory structure
my $path_structure = createOutputDirectory($path_output);
# run BclToFastq
#useBclToFastq($path_illumina, $path_structure);
# run Cutadapt
useCutadapt($adapter_seq, $path_structure);
# run STAR aligner
useSTAR($path_genome_index, $path_structure);
# run FeatureCounts
useFeatureCounts($path_genome_annotation, $path_structure);
}
sub useFeatureCounts($$)
{
my $path_genome_annotation = $_[0];
my $path_structure = $_[1];
my $path_bam = $path_structure->{"bam"} . "/*.bam";
my $command = "featureCounts" .
" -a " . $path_genome_annotation .
" -o " . $path_structure->{"count"} . "/counts.txt" .
" -t exon" .
" -Q 255" .
" -T 12" .
" \$\(ls $path_bam)";
print $command,"\n";
#system($command);
}
sub useSTAR($$)
{
my $path_genome_index = $_[0];
my $path_structure = $_[1];
# loop through all fastq files
my @files_fastq = glob($path_structure->{"trim"} . "/*.fa*.gz");
if (!@files_fastq)
{
print STDERR "Error::could not retrieve FASTQ files.\n";
die("\n");
}
foreach my $file_fastq (@files_fastq)
{
my $file_name = ($file_fastq =~ m/.*fastq\/(.*)\.fa.*\.gz$/) ? $1 : "unknown";
if ($file_name eq "unknown")
{
print STDERR "Error::could not retrieve FASTQ file name $file_fastq.\n";
die("\n");
}
my $file_bam = $path_structure->{"bam"} . "/". $file_name . ".bam";
my $file_log = $path_structure->{"log"} . "/". $file_name . "_starlog.txt";
my $command = "STAR" .
" --runMode alignReads" .
" --runThreadN 6" .
" --genomeDir $path_genome_index" .
" --readFilesIn $file_fastq" .
" --readFilesCommand zcat" .
" --outStd Log" .
" --outSAMtype BAM SortedByCoordinate" .
" --outSAMstrandField intronMotif" .
" --outFilterIntronMotifs RemoveNoncanonical" .
" --alignSoftClipAtReferenceEnds No" .
" --outFilterScoreMinOverLread 0.25" .
" --outFilterMatchNminOverLread 0.25";
print $command,"\n";
system($command);
$command = "mv Aligned.sortedByCoord.out.bam " . $file_bam;
print $command,"\n";
system($command);
$command = "samtools index " . $file_bam;
print $command,"\n";
system($command);
$command = "mv Log.final.out " . $file_log;
print $command,"\n";
system($command);
$command = "rm -f SJ.out.tab";
print $command,"\n";
system($command);
$command = "rm -f Log.*out";
print $command,"\n";
system($command);
}
}
sub useCutadapt($$)
{
my $adapter_seq = $_[0];
my $path_structure = $_[1];
# loop through all fastq files
my @files_fastq = glob($path_structure->{"fastq"} . "/*.fa*.gz");
if (!@files_fastq)
{
print STDERR "Error::could not retrieve FASTQ files.\n";
die("\n");
}
foreach my $file_fastq (@files_fastq)
{
my $file_name = ($file_fastq =~ m/.*fastq\/(.*)\.fa.*\.gz$/) ? $1 : "unknown";
if ($file_name eq "unknown")
{
print STDERR "Error::could not retrieve FASTQ file name $file_fastq.\n";
die("\n");
}
my $file_trim = $path_structure->{"trim"} . "/". $file_name . ".fastq.gz";
my $file_log = $path_structure->{"log"} . "/". $file_name . "_cutadaptlog.txt";
my $command = "cutadapt" .
" --cores=32" .
" --anywhere" . $adapter_seq .
" --minimum-length=32" .
" --cut=1" .
" --nextseq-trim=20" .
" --quality-cutoff=15,10" .
" --trim-n";
print $command,"\n";
system($command);
}
}
sub useBclToFastq($$)
{
my $path_illumina = $_[0];
my $path_structure = $_[1];
# get SampleSheet file
$path_illumina =~ s/\/$//;
my @files_SampleSheet = glob("$path_illumina/SampleSheet*.csv");
if (!@files_SampleSheet)
{
print STDERR "Error:: SampleSheet file is missing from Illumina run folder\n";
die("\n");
}
# bcl2fastq command
my $command = "bcl2fastq" .
" --runfolder-dir " . $path_illumina .
" --output-dir " . $path_structure->{"fastq"} .
" --no-lane-splitting" .
" --loading-threads 8" .
" --demultiplexing-threads 8" .
" --writing-threads 8" .
" --sample-sheet " . $files_SampleSheet[0];
print $command,"\n";
system($command);
}
sub createOutputDirectory($)
{
my $path_output = $_[0];
my $file_separator = "/";
my %file_structure = ();
# define file structure
$path_output =~ s/\/$//;
$file_structure{"fastq"} = $path_output . $file_separator . "fastq";
$file_structure{"trim"} = $path_output . $file_separator . "trim";
$file_structure{"bam"} = $path_output . $file_separator . "bam";
$file_structure{"log"} = $path_output . $file_separator . "log";
#$file_structure{"gbed"} = $path_output . $file_separator . "gbed";
$file_structure{"count"} = $path_output . $file_separator . "count";
# check if file exist otherwise create
foreach my $key (sort keys %file_structure)
{
if (-d $file_structure{$key})
{
print STDERR "Error::output directory $file_structure{$key} already exists\n";
#die("\n");
}
else
{
mkdir $file_structure{$key};
}
}
return \%file_structure;
}
sub usage($)
{
my $message = $_[0];
if (defined $message && length($message))
{
$message .= "\n" unless($message =~/\n$/);
}
my $command = $0;
$command =~ s#^.*/##;
print STDERR (
$message,
"usage: $command " .
"-r|--run\n" .
"\t path to Illumina BCL files (required)\n" .
"-o|--output-dir\n" .
"\t path to output directory (required)\n" .
"-g|--genome\n" .
"\t path to genome index (required)\n" .
"-a|--annotation\n" .
"\t path to genome annotation (required)\n" .
"-h|--help\n" .
"\t print help message\n" .
"-v|--version\n" .
"\t print current version\n"
);
die("\n");
}
sub version($)
{
my $message = $_[0];
print STDERR (
$message,"\n",
"Scientific Computing Facility\n",
"Max-Planck Institute For Brain Research\n",
"bug reports to:\n",
"\tsciclist\@brain.mpg.de\n"
);
die("\n");
}