PASSAnnotate.pl

#!/usr/bin/perl

# PASSAnnotate
#
# link upstream genes to given PolyA supported sites
#
# Input
#   $1 file containing PASS clusters
#   $2 file containing reference gene feature annotation
#   $3 upstream window size
#
#
# Version 1.0
# Date Nov 2016
# Georgi Tushev
# Scientific Computing Facility
# Max-Planck Institute for Brain Research
# send bug reports to sciclist@brain.mpg.de
#

use warnings;
use strict;
use File::Basename;

sub parsePASSClusters($);
sub findClosest($$$);
sub reportClusters($$);

# main
MAIN:
{
    my $file_clusters = shift;
    my $file_annotation = shift;
    my $threshold_distance = shift;

    my $file_temp = parsePASSClusters($file_clusters);
    my ($data_ref, $accum_ref) = findClosest($file_temp, $file_annotation, $threshold_distance);
    reportClusters($data_ref, $accum_ref);

    system("rm -f $file_temp");

}

### --- subroutines --- ###

sub reportClusters($$)
{
    my $data_ref = $_[0];
    my $accum_ref = $_[1];

    foreach my $line (@{$data_ref})
    {
        my $key = $line->[0] . ";" . $line->[5] . ";" . $line->[15];
        my $filter = 0;

        if (exists($accum_ref->{$key}))
        {
            my $fraction = $line->[8] / $accum_ref->{$key};
            $filter = 1 if(($fraction >= 0.01) && ($line->[11] > 0));
        }

        push(@{$line}, $filter);
        print join("\t", @{$line}),"\n";
    }

}


sub findClosest($$$)
{
    my $file_temp = $_[0];
    my $file_annotation = $_[1];
    my $threshold_distance = $_[2];
    my @data = ();
    my %accum = ();


    # DEBUG
    print STDERR "parsing closest ...";
    my $start_stamp = time();

    open(my $fh, "bedtools closest -a $file_temp -b $file_annotation -s -D b -iu -t first |") or die $!;
    while(<$fh>)
    {
        chomp($_);

        my @line = split("\t", $_, 28);
        my ($transcript, $symbol, $feature) = split(";", $line[24], 3);
        my $dist = $line[-1];
        my $upstream_start = -1;
        my $upstream_span = -1;

        # define hit on closest distance
        # (D < 0) || (threshold < D) : intergenic
        # D == 0 : overlap
        # (0 < D) && (D <= threshold) : closest

        if (($dist < 0) || ($threshold_distance < $dist))
        {
            $symbol = "<unknown>";
            $feature = "intergenic";
        }
        elsif ($dist == 0)
        {
            # strand specific
            if ($line[5] eq "+")
            {
                $upstream_start = $line[22];
                $upstream_span = abs($upstream_start - $line[2]);
            }
            else
            {
                $upstream_start = $line[23];
                $upstream_span = abs($upstream_start - $line[2]);
            }
        }
        elsif ((0 < $dist) && ($dist <= $threshold_distance))
        {
            # strand specific
            if ($line[5] eq "+")
            {
                $upstream_start = ($feature eq "3pUTR") ? $line[22] : $line[23];
                $upstream_span = abs($upstream_start - $line[2]);
            }
            else
            {
                $upstream_start = ($feature eq "3pUTR") ? $line[23] : $line[22];
                $upstream_span = abs($upstream_start - $line[2]);
            }

            # update feature to extended
            $feature .= "_extended";
        }

        my @record = @line[6..20];
        push(@record, $symbol, $feature, $upstream_start, $upstream_span);

        push(@data, \@record);
        if ($symbol ne "<unknown>")
        {
            my $key = $line[0] . ";" . $line[5] . ";" . $symbol;
            $accum{$key} = exists($accum{$key}) ? $accum{$key} + $line[14] : $line[14];
        }

        #print join("\t",@line[6..20]),"\t",$symbol,"\t",$feature,"\t",$upstream_start,"\t",$upstream_span,"\n";

    }
    close($fh);

    # DEBUG
    my $stop_stamp = time();
    my $run_time = $stop_stamp - $start_stamp;
    print STDERR " done in $run_time seconds.\n";

    return (\@data, \%accum);

}

sub parsePASSClusters($)
{
    my $file_clusters = $_[0];

    # extract file name
    my $file_name = fileparse($file_clusters,(".bed.gz",".gbed.gz"));
    my $file_temp = "temp_" . $file_name . ".bed";

    # DEBUG
    print STDERR "parsing clusters ...";
    my $start_stamp = time();

    open(my $fh, "gunzip -c $file_clusters |") or die $!;
    open(my $fo, "| sort -k1,1 -k2,2n > $file_temp") or die $!;
    while(<$fh>)
    {
        chomp($_);

        # parse line
        my @line = split("\t", $_, 15);

        # determine current PASS base (0-based coordinates)
        my $base_end = ($line[14] == -1) ? $line[10] : $line[14];
        my $base_start = ($base_end == 0) ? 0 : ($base_end - 1);

        # reformat cluster line for bedtools
        print $fo $line[0],"\t",$base_start,"\t",$base_end,"\t",$line[3],"\t",$line[8],"\t",$line[5],"\t",join("\t", @line),"\n";

    }
    close($fh);
    close($fo);

    # DEBUG
    my $stop_stamp = time();
    my $run_time = $stop_stamp - $start_stamp;
    print STDERR " done in $run_time seconds.\n";

    return $file_temp;

}
	#!/usr/bin/perl

	# PASSAnnotate
	#
	# link upstream genes to given PolyA supported sites
	#
	# Input
	# $1 file containing PASS clusters
	# $2 file containing reference gene feature annotation
	# $3 upstream window size
	#
	#
	# Version 1.0
	# Date Nov 2016
	# Georgi Tushev
	# Scientific Computing Facility
	# Max-Planck Institute for Brain Research
	# send bug reports to sciclist@brain.mpg.de
	#

	use warnings;
	use strict;
	use File::Basename;

	sub parsePASSClusters($);
	sub findClosest($$$);
	sub reportClusters($$);

	# main
	MAIN:
	{
	my $file_clusters = shift;
	my $file_annotation = shift;
	my $threshold_distance = shift;

	my $file_temp = parsePASSClusters($file_clusters);
	my ($data_ref, $accum_ref) = findClosest($file_temp, $file_annotation, $threshold_distance);
	reportClusters($data_ref, $accum_ref);

	system("rm -f $file_temp");

	}

	### --- subroutines --- ###

	sub reportClusters($$)
	{
	my $data_ref = $_[0];
	my $accum_ref = $_[1];

	foreach my $line (@{$data_ref})
	{
	my $key = $line->[0] . ";" . $line->[5] . ";" . $line->[15];
	my $filter = 0;

	if (exists($accum_ref->{$key}))
	{
	my $fraction = $line->[8] / $accum_ref->{$key};
	$filter = 1 if(($fraction >= 0.01) && ($line->[11] > 0));
	}

	push(@{$line}, $filter);
	print join("\t", @{$line}),"\n";
	}

	}


	sub findClosest($$$)
	{
	my $file_temp = $_[0];
	my $file_annotation = $_[1];
	my $threshold_distance = $_[2];
	my @data = ();
	my %accum = ();


	# DEBUG
	print STDERR "parsing closest ...";
	my $start_stamp = time();

	open(my $fh, "bedtools closest -a $file_temp -b $file_annotation -s -D b -iu -t first \|") or die $!;
	while(<$fh>)
	{
	chomp($_);

	my @line = split("\t", $_, 28);
	my ($transcript, $symbol, $feature) = split(";", $line[24], 3);
	my $dist = $line[-1];
	my $upstream_start = -1;
	my $upstream_span = -1;

	# define hit on closest distance
	# (D < 0) \|\| (threshold < D) : intergenic
	# D == 0 : overlap
	# (0 < D) && (D <= threshold) : closest

	if (($dist < 0) \|\| ($threshold_distance < $dist))
	{
	$symbol = "<unknown>";
	$feature = "intergenic";
	}
	elsif ($dist == 0)
	{
	# strand specific
	if ($line[5] eq "+")
	{
	$upstream_start = $line[22];
	$upstream_span = abs($upstream_start - $line[2]);
	}
	else
	{
	$upstream_start = $line[23];
	$upstream_span = abs($upstream_start - $line[2]);
	}
	}
	elsif ((0 < $dist) && ($dist <= $threshold_distance))
	{
	# strand specific
	if ($line[5] eq "+")
	{
	$upstream_start = ($feature eq "3pUTR") ? $line[22] : $line[23];
	$upstream_span = abs($upstream_start - $line[2]);
	}
	else
	{
	$upstream_start = ($feature eq "3pUTR") ? $line[23] : $line[22];
	$upstream_span = abs($upstream_start - $line[2]);
	}

	# update feature to extended
	$feature .= "_extended";
	}

	my @record = @line[6..20];
	push(@record, $symbol, $feature, $upstream_start, $upstream_span);

	push(@data, \@record);
	if ($symbol ne "<unknown>")
	{
	my $key = $line[0] . ";" . $line[5] . ";" . $symbol;
	$accum{$key} = exists($accum{$key}) ? $accum{$key} + $line[14] : $line[14];
	}

	#print join("\t",@line[6..20]),"\t",$symbol,"\t",$feature,"\t",$upstream_start,"\t",$upstream_span,"\n";

	}
	close($fh);

	# DEBUG
	my $stop_stamp = time();
	my $run_time = $stop_stamp - $start_stamp;
	print STDERR " done in $run_time seconds.\n";

	return (\@data, \%accum);

	}

	sub parsePASSClusters($)
	{
	my $file_clusters = $_[0];

	# extract file name
	my $file_name = fileparse($file_clusters,(".bed.gz",".gbed.gz"));
	my $file_temp = "temp_" . $file_name . ".bed";

	# DEBUG
	print STDERR "parsing clusters ...";
	my $start_stamp = time();

	open(my $fh, "gunzip -c $file_clusters \|") or die $!;
	open(my $fo, "\| sort -k1,1 -k2,2n > $file_temp") or die $!;
	while(<$fh>)
	{
	chomp($_);

	# parse line
	my @line = split("\t", $_, 15);

	# determine current PASS base (0-based coordinates)
	my $base_end = ($line[14] == -1) ? $line[10] : $line[14];
	my $base_start = ($base_end == 0) ? 0 : ($base_end - 1);

	# reformat cluster line for bedtools
	print $fo $line[0],"\t",$base_start,"\t",$base_end,"\t",$line[3],"\t",$line[8],"\t",$line[5],"\t",join("\t", @line),"\n";

	}
	close($fh);
	close($fo);

	# DEBUG
	my $stop_stamp = time();
	my $run_time = $stop_stamp - $start_stamp;
	print STDERR " done in $run_time seconds.\n";

	return $file_temp;

	}