Skip to content
This repository has been archived by the owner. It is now read-only.
Permalink
5059372a14
Switch branches/tags

Name already in use

A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
Go to file
 
 
Cannot retrieve contributors at this time
198 lines (158 sloc) 5.09 KB
#!/usr/bin/perl
# PASSAnnotate
#
# link upstream genes to given PolyA supported sites
#
# Input
# $1 file containing PASS clusters
# $2 file containing reference gene feature annotation
# $3 upstream window size
#
#
# Version 1.0
# Date Nov 2016
# Georgi Tushev
# Scientific Computing Facility
# Max-Planck Institute for Brain Research
# send bug reports to sciclist@brain.mpg.de
#
use warnings;
use strict;
use File::Basename;
sub parsePASSClusters($);
sub findClosest($$$);
sub reportClusters($$);
# main
MAIN:
{
my $file_clusters = shift;
my $file_annotation = shift;
my $threshold_distance = shift;
my $file_temp = parsePASSClusters($file_clusters);
my ($data_ref, $accum_ref) = findClosest($file_temp, $file_annotation, $threshold_distance);
reportClusters($data_ref, $accum_ref);
system("rm -f $file_temp");
}
### --- subroutines --- ###
sub reportClusters($$)
{
my $data_ref = $_[0];
my $accum_ref = $_[1];
foreach my $line (@{$data_ref})
{
my $key = $line->[0] . ";" . $line->[5] . ";" . $line->[15];
my $filter = 0;
if (exists($accum_ref->{$key}))
{
my $fraction = $line->[8] / $accum_ref->{$key};
$filter = 1 if(($fraction >= 0.01) && ($line->[11] > 0));
}
push(@{$line}, $filter);
print join("\t", @{$line}),"\n";
}
}
sub findClosest($$$)
{
my $file_temp = $_[0];
my $file_annotation = $_[1];
my $threshold_distance = $_[2];
my @data = ();
my %accum = ();
# DEBUG
print STDERR "parsing closest ...";
my $start_stamp = time();
open(my $fh, "bedtools closest -a $file_temp -b $file_annotation -s -D b -iu -t first |") or die $!;
while(<$fh>)
{
chomp($_);
my @line = split("\t", $_, 28);
my ($transcript, $symbol, $feature) = split(";", $line[24], 3);
my $dist = $line[-1];
my $upstream_start = -1;
my $upstream_span = -1;
# define hit on closest distance
# (D < 0) || (threshold < D) : intergenic
# D == 0 : overlap
# (0 < D) && (D <= threshold) : closest
if (($dist < 0) || ($threshold_distance < $dist))
{
$symbol = "<unknown>";
$feature = "intergenic";
}
elsif ($dist == 0)
{
# strand specific
if ($line[5] eq "+")
{
$upstream_start = $line[22];
$upstream_span = abs($upstream_start - $line[2]);
}
else
{
$upstream_start = $line[23];
$upstream_span = abs($upstream_start - $line[2]);
}
}
elsif ((0 < $dist) && ($dist <= $threshold_distance))
{
# strand specific
if ($line[5] eq "+")
{
$upstream_start = ($feature eq "3pUTR") ? $line[22] : $line[23];
$upstream_span = abs($upstream_start - $line[2]);
}
else
{
$upstream_start = ($feature eq "3pUTR") ? $line[23] : $line[22];
$upstream_span = abs($upstream_start - $line[2]);
}
# update feature to extended
$feature .= "_extended";
}
my @record = @line[6..20];
push(@record, $symbol, $feature, $upstream_start, $upstream_span);
push(@data, \@record);
if ($symbol ne "<unknown>")
{
my $key = $line[0] . ";" . $line[5] . ";" . $symbol;
$accum{$key} = exists($accum{$key}) ? $accum{$key} + $line[14] : $line[14];
}
#print join("\t",@line[6..20]),"\t",$symbol,"\t",$feature,"\t",$upstream_start,"\t",$upstream_span,"\n";
}
close($fh);
# DEBUG
my $stop_stamp = time();
my $run_time = $stop_stamp - $start_stamp;
print STDERR " done in $run_time seconds.\n";
return (\@data, \%accum);
}
sub parsePASSClusters($)
{
my $file_clusters = $_[0];
# extract file name
my $file_name = fileparse($file_clusters,(".bed.gz",".gbed.gz"));
my $file_temp = "temp_" . $file_name . ".bed";
# DEBUG
print STDERR "parsing clusters ...";
my $start_stamp = time();
open(my $fh, "gunzip -c $file_clusters |") or die $!;
open(my $fo, "| sort -k1,1 -k2,2n > $file_temp") or die $!;
while(<$fh>)
{
chomp($_);
# parse line
my @line = split("\t", $_, 15);
# determine current PASS base (0-based coordinates)
my $base_end = ($line[14] == -1) ? $line[10] : $line[14];
my $base_start = ($base_end == 0) ? 0 : ($base_end - 1);
# reformat cluster line for bedtools
print $fo $line[0],"\t",$base_start,"\t",$base_end,"\t",$line[3],"\t",$line[8],"\t",$line[5],"\t",join("\t", @line),"\n";
}
close($fh);
close($fo);
# DEBUG
my $stop_stamp = time();
my $run_time = $stop_stamp - $start_stamp;
print STDERR " done in $run_time seconds.\n";
return $file_temp;
}