From 3d30e778b772466b87d66fa359f27f810e6ec3f5 Mon Sep 17 00:00:00 2001 From: Georgi Tushev Date: Tue, 20 Sep 2016 15:15:32 +0200 Subject: [PATCH] classify proteins step --- classify_proteins/ClassifyProteins.pl | 436 ++++++++++++++++++++++++++ 1 file changed, 436 insertions(+) create mode 100755 classify_proteins/ClassifyProteins.pl diff --git a/classify_proteins/ClassifyProteins.pl b/classify_proteins/ClassifyProteins.pl new file mode 100755 index 0000000..1212a29 --- /dev/null +++ b/classify_proteins/ClassifyProteins.pl @@ -0,0 +1,436 @@ +#!/usr/bin/perl + +# Classifies Proteins Based On: +# +# output of SignalP +# predicts the presence and location of signal peptide cleavage sites in amino acid sequences +# +# output of TMHMM +# prediction of transmembrane helices in proteins +# +# output of NetNGlyc +# prediction of N-glycosylation sites in proteins +# +# Protein Classes are determined based on: +# Nucleic Acids Res. 2006 Jan 1;34(Database issue):D213-7. +# LOCATE: a mouse protein subcellular localization database. +# Fink JL1, Aturaliya RN, Davis MJ, Zhang F, Hanson K, Teasdale MS, Kai C, Kawai J, Carninci P, Hayashizaki Y, Teasdale RD. +# +# created 30 Apr 2015 +# Georgi Tushev +# Scientific Computing Facility +# Max-Planck Institute For Brain Research +# bug report to sciclist@brain.mpg.de +# + +use warnings; +use strict; +use Getopt::Long(); + +# subroutines primitives +sub usage($); +sub readInSignalP($); +sub readInTMHMM($); +sub readInNetNGlyc($$); +sub readInCello($); +sub predictNGlycSites($$$$$); + +# main routine +MAIN: +{ + # define input selection + my $refann_file; + my $signalp_file; + my $tmhmm_file; + my $netnglyc_file; + my $cello_file; + my $netnglyc_thresh = 0.5; + my $help; + + Getopt::Long::GetOptions( + "refann=s" =>\$refann_file, + "signalp=s" => \$signalp_file, + "tmhmm=s" => \$tmhmm_file, + "netnglyc=s" => \$netnglyc_file, + "cello=s" => \$cello_file, + "thresh=f" => \$netnglyc_thresh, + "help" => \$help + ) or usage("Invalid command line options."); + + # define help output + usage("version 2.0, Nov 2015") if($help); + + # check input files + usage("Error :: reference annotation file is missing.") unless defined $refann_file; + usage("Error :: SignalP prediction file is missing.") unless defined $signalp_file; + usage("Error :: TMHMM prediction file is missing.") unless defined $tmhmm_file; + usage("Error :: NetNGlyc prediction file is missing.") unless defined $netnglyc_file; + usage("Error :: Cello prediction file is missing.") unless defined $cello_file; + + # check for correct NetNGlyc motif threshold + $netnglyc_thresh = 0.5 if(($netnglyc_thresh <= 0) | ($netnglyc_thresh >= 1)); + + # read in tools prediction + my $signalp_hash = readInSignalP($signalp_file); + my $tmhmm_hash = readInTMHMM($tmhmm_file); + my $netnglyc_hash = readInNetNGlyc($netnglyc_file, $netnglyc_thresh); + my $cello_hash = readInCello($cello_file); + + # process each record in the reference annotation + open(my $fh, "<", $refann_file) or die("Can't open $refann_file to read!\n"); + while(<$fh>) + { + # skip header line + next if($_ =~ /^#/); + + # remove new line space + chomp($_); + + # split line + my @ref_line = split("\t", $_, 8); + my $key = $ref_line[0] . ";" . $ref_line[1] . ";" . $ref_line[2]; + my $plng = ($ref_line[-1] < 4000) ? $ref_line[-1] : 4000; + + # default category + my $category = "unknown"; + + # hash data + my $signalp = exists($signalp_hash->{$key}) ? $signalp_hash->{$key} : 0; + my $tmhmm = exists($tmhmm_hash->{$key}) ? $tmhmm_hash->{$key}[0] : 0; + my $topo = exists($tmhmm_hash->{$key}) ? $tmhmm_hash->{$key}[1] : ""; + my $netnglyc = exists($netnglyc_hash->{$key}) ? $netnglyc_hash->{$key} : 0; + my $cello_class = exists($cello_hash->{$key}) ? $cello_hash->{$key}[0] : ""; + my $cello_score = exists($cello_hash->{$key}) ? $cello_hash->{$key}[1] : 0; + + # update category + $category = "soluble_intracellular" if(($signalp==0) & ($tmhmm==0)); + $category = "soluble_secreted" if(($signalp==1) & ($tmhmm==0)); + $category = "typeI_membrane" if(($signalp==1) & ($tmhmm==1)); + $category = "typeII_membrane" if(($signalp==0) & ($tmhmm==1)); + $category = "multi-pass_membrane" if($tmhmm>1); + + # Predict N-Glycosylation Sites + my $nglyc = predictNGlycSites($plng,$signalp,$tmhmm,$topo,$netnglyc); + + print $key,"\t",$category,"\t",$nglyc,"\t",$cello_class,"\t",$cello_score,"\n"; + + } + close($fh); + +} + + +### Subroutines ### + +# predictNGlycSites +# +# N-Glycosylation sites are predicted in +# outer transmembrane domains in proteins with either +# predicted signalP or at least one transmembrane domain +# +# inputs (5): +# 1.protein length +# 2.signal peptide presence [0|1] +# 3.number of predicted transmembrane domains +# 4.string depicting protein topology +# 5.hash ref of NetNGlyc prediction for N-Glyc motifs count and position +# +# outpus (1): +# number of predicted N-Glycosylation sites +# +sub predictNGlycSites($$$$$) +{ + my $plng = shift; + my $signalp = shift; + my $tmhmm = shift; + my $topology = shift; + my $netnglyc = shift; + my $nglyc = 0; + + # check if NetNGlyc predicts any motifs + if ($netnglyc == 0) + { + return 0; + } + + # N-Glycosilation is not possible without either a Signal Peptide or at least one TMD + if (($signalp == 0) & ($tmhmm == 0)) + { + return 0; + } + + # topology type + my $topotype = $topology; + $topotype =~ s/[^oi]+/t/g; + + # topology positions + my @topopos = $topology =~ /(\d+)/g; + unshift(@topopos,0); + push(@topopos,$plng); + + # concatenate sequence + my $seqcode = ""; + my $test_sum = 0; + for (my $k = 0; $k < scalar(@topopos)-1; $k++) + { + my $diff_ary = $topopos[$k+1] - $topopos[$k]; + $seqcode .= substr($topotype,$k,1) x $diff_ary; + $test_sum += $diff_ary; + } + + #print $topology,"\t",substr($seqcode,$topopos[1]-1,1),";",substr($seqcode,$topopos[1],1),"\n"; + + # check for N-Glycosylation sites + for (my $k = 0; $k < $netnglyc->{"CNT"}; $k++) + { + my $qry_pos = $netnglyc->{"POS"}[$k] - 1; + $nglyc ++ if(substr($seqcode,$qry_pos,3) eq "ooo"); + } + + # test output + #print $topology,"\t",$netnglyc->{"CNT"},";",$nglyc,"\t",join(";",@{$netnglyc->{"POS"}}),"\n"; + + + return $nglyc; +} + + +# readInSignalP +# +# SignalP prediction from: +# http://www.cbs.dtu.dk/services/SignalP/ +# +# output format "Short(no graphics)", changed to tab-delimited +# +# columns (12): +# name,Cmax,CmaxPos,Ymax,YmaxPos,Smax,SmaxPos,Smean,D,?,Dmax,Networks-used +# +# uses only column 1 & 10 for name and Y/N for predicted Signal +# +sub readInSignalP($) +{ + my $query_file = shift; + my %signalp_hash = (); + my $detected = 0; + open(my $fh, "<", $query_file) or die("Can't open $query_file to read!\n"); + while(<$fh>) + { + # skip header line + next if($_ =~ /^#/); + + # remove new line space + chomp($_); + + # substitute multiple space with tab + $_ =~ s/[\ ]+/\t/g; + + # split line + my @line = split("\t", $_, 12); + + # fill up hash + $signalp_hash{$line[0]} = $line[9] eq "Y" ? 1 : 0; + $detected++ if ($line[9] eq "Y"); + } + close($fh); + + my $hash_size = scalar(keys %signalp_hash); + print STDERR "SignalP (detected/records): $detected / $hash_size\n"; + + return \%signalp_hash; +} + +# readInTMHMM +# +# TMHMM predictions from: +# http://www.cbs.dtu.dk/services/TMHMM-2.0/ +# +# output format "one line per protein" +# +# columns (6): +# name, protein_length, ExpAA, First60, PredHel, Topology +# +sub readInTMHMM($) +{ + my $query_file = shift; + my %tmhmm_hash = (); + my $detected = 0; + open(my $fh, "<", $query_file) or die("Can't open $query_file to read!\n"); + while(<$fh>) + { + # skip header line + next if($_ =~ /^#/); + + # remove new line space + chomp($_); + + # remove column annotations + ($_ =~ s/len=//g); + ($_ =~ s/ExpAA=//g); + ($_ =~ s/First60=//g); + ($_ =~ s/PredHel=//g); + ($_ =~ s/Topology=//g); + + # split line + my @line = split("\t", $_, 6); + + # check detected + $detected++ if($line[4] > 0); + + # fill hash + $tmhmm_hash{$line[0]} = [$line[4],$line[5]]; + } + close($fh); + + my $hash_size = scalar(keys %tmhmm_hash); + print STDERR "TMHMM (detected/records): $detected / $hash_size\n"; + + return \%tmhmm_hash; +} + +# readInNetNGlyc +# +# NetNGlyc predictions from: +# http://www.cbs.dtu.dk/services/NetNGlyc/ +# +# grep from HTML result only one line record +# +# columns(6): +# name, position, motif, threshold, consensus_nodes, consensus_signs +# +sub readInNetNGlyc($$) +{ + my $query_file = shift; + my $threshold = shift; + my %netnglyc_hash = (); + open(my $fh, "<", $query_file) or die("Can't open $query_file to read!\n"); + while(<$fh>) + { + # skip header lines + next if($_ =~ /^#/); + + # remove new line space + chomp($_); + + # correct line spacing + $_ =~ s/[\t\ ]+/\t/g; + + # split line + my @line = split("\t", $_, 6); + + # skip sites below threshold + next if($line[3] < $threshold); + + # fill up hash + push(@{$netnglyc_hash{$line[0]}{"POS"}},$line[1]); + if(exists($netnglyc_hash{$line[0]})) + { + $netnglyc_hash{$line[0]}{"CNT"}++; + } + else + { + $netnglyc_hash{$line[0]}{"CNT"} = 1; + } + } + close($fh); + + my $hash_size = scalar(keys %netnglyc_hash); + print STDERR "NetNGlyc (records): $hash_size\n"; + + return \%netnglyc_hash; +} + +# readInCello +# +# Cello predictions from: +# http://cello.life.nctu.edu.tw/ +# +# grep from HTML result only one line per record +# +# columns(3): +# name, class, score +# +sub readInCello($) +{ + my $query_file = shift; + my %cello_hash = (); + my $detected = 0; + open(my $fh, "<", $query_file) or die("Can't open $query_file to read!\n"); + while(<$fh>) + { + # skip header line + next if($_ =~ /^#/); + + # remove new line space + chomp($_); + + # split line + my @line = split("\t", $_, 3); + + # check detected + $detected++ if($line[2] > 0); + + # fill hash + $cello_hash{$line[0]} = [$line[1],$line[2]]; + } + close($fh); + + my $hash_size = scalar(keys %cello_hash); + print STDERR "CELLO (detected/records): $detected / $hash_size\n"; + + return \%cello_hash; +} + + +# Usage +# +sub usage($) +{ + my $message = shift; + if (defined $message && length $message) { + $message .= "\n" unless $message =~ /\n$/; + } + + my $command = $0; + $command =~ s#^.*/##; + + print STDERR ( + + $message, + "usage: $command -refann refann_file -signalp signalp_file -tmhmm tmhmm_file -netnglyc netnglyc_file\n" . + "description: classifies proteins based on CBS tools prediction\n" . + "parameters: \n" . + "-refann\n" . + "\treference annotation for protein records\n" . + "\tcontains the keys from the fasta file used in the prediction tools\n" . + "\n" . + "-signalp\n" . + "\tprediction of presence and location of signal peptide\n" . + "\tcleavege sites by SignalP 4.1\n" . + "\thttp://www.cbs.dtu.dk/services/SignalP/\n" . + "\tsignalp_file :: tab delimited file with 12 fileds\n" . + "\t1.name, 2.Cmax, 3.pos, 4.Ymax, 5.pos, 6.Smax\n" . + "\t7.pos, 8.Smean, 9.D, 10.?, 11.Dmaxcut, 12.Networks-used\n" . + "\n" . + "-tmhmm\n" . + "\tprediction of transmembrane domains by TMHMM 2.0\n" . + "\thttp://www.cbs.dtu.dk/services/TMHMM/\n" . + "\ttmhmm_file :: tab delimited file with 6 fields\n" . + "\t1.label, 2.len, 3.ExpAA, 4.First60, 5.PredHel, 6.Topology\n" . + "\n" . + "-netnglyc\n" . + "\tprediction of N-glycosilation site by NetNGlyc 1.0\n" . + "\thttp://www.cbs.dtu.dk/services/NetNGlyc/\n" . + "\tnglyc_file :: tab delimited file with 6 fields\n" . + "\t1.label, 2.pos, 3.motif, 4.threshold, 5.perceptrons, 6.jury strength\n" . + "\n" . + "-thresh\n" . + "\tthreshold for the predicted NetNGlyc motifs, default 0.5, range (0,1)\n" . + "\n" . + "-help\n" . + "\tdefine usage\n" + ); + + die("\n"); +} + +