This repository has been archived by the owner. It is now read-only.
Permalink
Cannot retrieve contributors at this time
Name already in use
A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
ReptilePallium/Extension_3UTR/scripts/PASSAnnotate.pl
Go to fileThis commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
198 lines (158 sloc)
5.09 KB
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/perl | |
# PASSAnnotate | |
# | |
# link upstream genes to given PolyA supported sites | |
# | |
# Input | |
# $1 file containing PASS clusters | |
# $2 file containing reference gene feature annotation | |
# $3 upstream window size | |
# | |
# | |
# Version 1.0 | |
# Date Nov 2016 | |
# Georgi Tushev | |
# Scientific Computing Facility | |
# Max-Planck Institute for Brain Research | |
# send bug reports to sciclist@brain.mpg.de | |
# | |
use warnings; | |
use strict; | |
use File::Basename; | |
sub parsePASSClusters($); | |
sub findClosest($$$); | |
sub reportClusters($$); | |
# main | |
MAIN: | |
{ | |
my $file_clusters = shift; | |
my $file_annotation = shift; | |
my $threshold_distance = shift; | |
my $file_temp = parsePASSClusters($file_clusters); | |
my ($data_ref, $accum_ref) = findClosest($file_temp, $file_annotation, $threshold_distance); | |
reportClusters($data_ref, $accum_ref); | |
system("rm -f $file_temp"); | |
} | |
### --- subroutines --- ### | |
sub reportClusters($$) | |
{ | |
my $data_ref = $_[0]; | |
my $accum_ref = $_[1]; | |
foreach my $line (@{$data_ref}) | |
{ | |
my $key = $line->[0] . ";" . $line->[5] . ";" . $line->[15]; | |
my $filter = 0; | |
if (exists($accum_ref->{$key})) | |
{ | |
my $fraction = $line->[8] / $accum_ref->{$key}; | |
$filter = 1 if(($fraction >= 0.01) && ($line->[11] > 0)); | |
} | |
push(@{$line}, $filter); | |
print join("\t", @{$line}),"\n"; | |
} | |
} | |
sub findClosest($$$) | |
{ | |
my $file_temp = $_[0]; | |
my $file_annotation = $_[1]; | |
my $threshold_distance = $_[2]; | |
my @data = (); | |
my %accum = (); | |
# DEBUG | |
print STDERR "parsing closest ..."; | |
my $start_stamp = time(); | |
open(my $fh, "bedtools closest -a $file_temp -b $file_annotation -s -D b -iu -t first |") or die $!; | |
while(<$fh>) | |
{ | |
chomp($_); | |
my @line = split("\t", $_, 28); | |
my ($transcript, $symbol, $feature) = split(";", $line[24], 3); | |
my $dist = $line[-1]; | |
my $upstream_start = -1; | |
my $upstream_span = -1; | |
# define hit on closest distance | |
# (D < 0) || (threshold < D) : intergenic | |
# D == 0 : overlap | |
# (0 < D) && (D <= threshold) : closest | |
if (($dist < 0) || ($threshold_distance < $dist)) | |
{ | |
$symbol = "<unknown>"; | |
$feature = "intergenic"; | |
} | |
elsif ($dist == 0) | |
{ | |
# strand specific | |
if ($line[5] eq "+") | |
{ | |
$upstream_start = $line[22]; | |
$upstream_span = abs($upstream_start - $line[2]); | |
} | |
else | |
{ | |
$upstream_start = $line[23]; | |
$upstream_span = abs($upstream_start - $line[2]); | |
} | |
} | |
elsif ((0 < $dist) && ($dist <= $threshold_distance)) | |
{ | |
# strand specific | |
if ($line[5] eq "+") | |
{ | |
$upstream_start = ($feature eq "3pUTR") ? $line[22] : $line[23]; | |
$upstream_span = abs($upstream_start - $line[2]); | |
} | |
else | |
{ | |
$upstream_start = ($feature eq "3pUTR") ? $line[23] : $line[22]; | |
$upstream_span = abs($upstream_start - $line[2]); | |
} | |
# update feature to extended | |
$feature .= "_extended"; | |
} | |
my @record = @line[6..20]; | |
push(@record, $symbol, $feature, $upstream_start, $upstream_span); | |
push(@data, \@record); | |
if ($symbol ne "<unknown>") | |
{ | |
my $key = $line[0] . ";" . $line[5] . ";" . $symbol; | |
$accum{$key} = exists($accum{$key}) ? $accum{$key} + $line[14] : $line[14]; | |
} | |
#print join("\t",@line[6..20]),"\t",$symbol,"\t",$feature,"\t",$upstream_start,"\t",$upstream_span,"\n"; | |
} | |
close($fh); | |
# DEBUG | |
my $stop_stamp = time(); | |
my $run_time = $stop_stamp - $start_stamp; | |
print STDERR " done in $run_time seconds.\n"; | |
return (\@data, \%accum); | |
} | |
sub parsePASSClusters($) | |
{ | |
my $file_clusters = $_[0]; | |
# extract file name | |
my $file_name = fileparse($file_clusters,(".bed.gz",".gbed.gz")); | |
my $file_temp = "temp_" . $file_name . ".bed"; | |
# DEBUG | |
print STDERR "parsing clusters ..."; | |
my $start_stamp = time(); | |
open(my $fh, "gunzip -c $file_clusters |") or die $!; | |
open(my $fo, "| sort -k1,1 -k2,2n > $file_temp") or die $!; | |
while(<$fh>) | |
{ | |
chomp($_); | |
# parse line | |
my @line = split("\t", $_, 15); | |
# determine current PASS base (0-based coordinates) | |
my $base_end = ($line[14] == -1) ? $line[10] : $line[14]; | |
my $base_start = ($base_end == 0) ? 0 : ($base_end - 1); | |
# reformat cluster line for bedtools | |
print $fo $line[0],"\t",$base_start,"\t",$base_end,"\t",$line[3],"\t",$line[8],"\t",$line[5],"\t",join("\t", @line),"\n"; | |
} | |
close($fh); | |
close($fo); | |
# DEBUG | |
my $stop_stamp = time(); | |
my $run_time = $stop_stamp - $start_stamp; | |
print STDERR " done in $run_time seconds.\n"; | |
return $file_temp; | |
} |