Permalink
Cannot retrieve contributors at this time
Name already in use
A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
GOAnalysis/GOAnalysis.pl
Go to fileThis commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
275 lines (225 sloc)
7.82 KB
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/perl | |
# GOAnalysis | |
# do GO analysis of two unrancked lists | |
# | |
# Input | |
# -obo : Gene Ontology terms annotation in | |
# open biological and biomedical ontology format | |
# -ann : Gene annotation to GO terms | |
# -target : target list of genes (1st column tab delimited file) | |
# -background : background list of genes (1st column tab delimited file) | |
# -threshold : p-value threshold (default 0.001) | |
# -live : gene ontology terms and genes annotation is taken | |
# live from GO Consortium web page: http://geneontology.org/ | |
# | |
# Output | |
# tab-delimited table of overexpressed terms, sorted by increasing p-value | |
# | |
# Version 1.0 | |
# Date Sep 2016 | |
# Georgi Tushev | |
# Scientific Computing Facility | |
# Max-Planck Institute for Brain Research | |
# send bug reports to sciclist@brain.mpg.de | |
use warnings; | |
use strict; | |
use Getopt::Long(); | |
#use Math::GSL::CDF qw/:hypergeometric/; | |
sub usage($); | |
sub ParseQueryList($$$); | |
sub ParseGOAnnotation($$$); | |
sub ParseTermsAnnotation($$$); | |
MAIN: | |
{ | |
# define input selection | |
my $obo_file; | |
my $annotation_file; | |
my $target_file; | |
my $background_file; | |
my $threshold = 0.05; | |
my $live; | |
my $help; | |
my %hash = (); | |
# set-up parameters | |
Getopt::Long::GetOptions( | |
"obo=s" => \$obo_file, | |
"ann=s" => \$annotation_file, | |
"target=s" => \$target_file, | |
"background=s" => \$background_file, | |
"threshold=f" => \$threshold, | |
"live" => \$live, | |
"help" => \$help | |
) or usage("Error :: invalid command line options"); | |
# define help output | |
usage("version 1.0, Sep 2016") if($help); | |
# parse background list | |
ParseQueryList(\%hash, $background_file, 1); | |
# parse target list | |
ParseQueryList(\%hash, $target_file, 2); | |
# parse go annotation | |
ParseGOAnnotation(\%hash, $annotation_file, $live); | |
# parse term annotation | |
ParseTermsAnnotation(\%hash, $obo_file, $live); | |
# count M and n number of genes in background/target AND in ontology | |
my $M = 0; | |
my $n = 0; | |
foreach my $gene (keys %{$hash{"list"}}) | |
{ | |
$M++ if(($hash{"list"}{$gene} & 4) && ($hash{"list"}{$gene} & 1)); | |
$n++ if(($hash{"list"}{$gene} & 4) && ($hash{"list"}{$gene} & 2)); | |
} | |
print STDERR "M= $M\n"; | |
print STDERR "n= $n\n"; | |
my $PASS = 0; | |
my $TOTAL = 0; | |
# count per GO term | |
foreach my $term (keys %{$hash{"go2gene"}}) | |
{ | |
if (exists($hash{"go2gene"}{$term}{"namespace"}) && exists($hash{"go2gene"}{$term}{"name"})) | |
{ | |
if (exists($hash{"go2gene"}{$term}{"genes"})) | |
{ | |
my $x = 0; | |
my $K = 0; | |
my @ref = (); | |
foreach my $gene (keys %{$hash{"go2gene"}{$term}{"genes"}}) | |
{ | |
$K++ if(($hash{"list"}{$gene} & 4) && ($hash{"list"}{$gene} & 1)); | |
if(($hash{"list"}{$gene} & 4) && ($hash{"list"}{$gene} & 2)) | |
{ | |
$x++; | |
push(@ref, $gene); | |
} | |
} | |
# print out result | |
if ($x > 0) | |
{ | |
my $reflist = join(",", sort @ref); | |
my $pval = 1; | |
if ($x < $K) | |
{ | |
#$pval = gsl_cdf_hypergeometric_Q($x, $K, $M - $K, $n); | |
} | |
elsif ($x == $K) | |
{ | |
# one need to change the cumulative upper tail with PDF | |
#$pval = gsl_cdf_hypergeometric_P($x, $K, $M - $K, $n); | |
#print $term,"\t",$pval,"\n"; | |
} | |
my $filter = ($pval <= $threshold) ? 1 : 0; | |
$PASS += $filter; | |
$TOTAL++; | |
print $term,"\t",$hash{"go2gene"}{$term}{"namespace"},"\t",$hash{"go2gene"}{$term}{"name"},"\t",$x,"\t",$M,"\t",$K,"\t",$n,"\t",$pval,"\t",$filter,"\n"; | |
} | |
} | |
} | |
} | |
print STDERR "PASS = $PASS / $TOTAL\n"; | |
} | |
### subroutines ### | |
sub ParseTermsAnnotation($$$) | |
{ | |
my $hash_ref = $_[0]; | |
my $obo_file = $_[1]; | |
my $live = $_[2]; | |
### add live download for obo terms | |
usage("Error :: OBO file is not specified") unless defined ($obo_file); | |
# parse obo file | |
local $/ = ''; | |
open (my $fh, "<", $obo_file) or die $!; | |
while (<$fh>) | |
{ | |
chomp($_); | |
### check if term | |
my $isterm = ($_ =~ m/\[Term\]/) ? "term" : "not-term"; | |
my $isobsolete = ($_ =~ m/is_obsolete\: true/) ? "obsolete" : "valid"; | |
my $goid = ($_ =~ m/id\: (GO\:[0-9]+)/) ? $1 : "<unknown>"; | |
my $name = ($_ =~ m/name\: (.*)/) ? $1 : "<no_name>"; | |
my $namespace = ($_ =~ m/namespace\: (.*)/) ? $1 : "<no_category>"; | |
#my $def = ($_ =~ m/def\: \"(.*)\"/) ? $1 : "<no_definition>"; | |
if (($isterm eq "term") && ($isobsolete eq "valid") && ($goid ne "<unknown>")) | |
{ | |
if (exists($hash_ref->{"go2gene"}{$goid})) | |
{ | |
$hash_ref->{"go2gene"}{$goid}{"name"} = $name; | |
$hash_ref->{"go2gene"}{$goid}{"namespace"} = $namespace; | |
#$hash_ref->{"go2gene"}{$goid}{"def"} = $def; | |
} | |
} | |
} | |
close($fh); | |
} | |
sub ParseGOAnnotation($$$) | |
{ | |
my $hash_ref = $_[0]; | |
my $ann_file = $_[1]; | |
my $live = $_[2]; | |
### add live download for annotation | |
usage("Error :: GO annotation file is not specified") unless defined ($ann_file); | |
# parse annotation file | |
open (my $fh, "gzcat $ann_file |") or die $!; | |
while (<$fh>) | |
{ | |
chomp($_); | |
# skip header | |
next if ($_ =~ m/^\!/); | |
# skip | |
my @line = split('\t', $_, 15); | |
if(exists($hash_ref->{"list"}{$line[2]})) | |
{ | |
$hash_ref->{"list"}{$line[2]} |= 4; | |
$hash_ref->{"gene2go"}{$line[2]}{$line[4]}++; | |
$hash_ref->{"go2gene"}{$line[4]}{"genes"}{$line[2]}++; | |
} | |
else | |
{ | |
next; | |
} | |
} | |
close($fh); | |
} | |
sub ParseQueryList($$$) | |
{ | |
my $hash_ref = $_[0]; | |
my $query_file = $_[1]; | |
my $bit_mask = $_[2]; | |
open(my $fh, "<", $query_file) or die $!; | |
while(<$fh>) | |
{ | |
chomp($_); | |
my ($symbol, $rest) = split('\t', $_, 2); | |
$hash_ref->{"list"}{$symbol} |= $bit_mask; | |
} | |
close($fh); | |
} | |
sub usage($) | |
{ | |
my $message = $_[0]; | |
if (defined $message && length $message) | |
{ | |
$message .= "\n" unless($message =~ /\n$/); | |
} | |
my $command = $0; | |
$command =~ s#^.*/##; | |
print STDERR ( | |
$message, | |
"usage: $command -live -target target_list.txt -background background_list.txt\n" . | |
"description: GOAnalysis performs gene ontology analysis of two unrancked lists\n" . | |
"parameters:\n" . | |
"-obo\n" . | |
"\tGO Terms annotations in OBO file format http://geneontology.org/page/download-ontology\n" . | |
"-ann\n" . | |
"\tGenes to terms annotation in tabulated file format http://geneontology.org/page/download-annotations\n" . | |
"-target\n" . | |
"\tlist of target genes in tab-delimited format, 1st column of official gene symbols is used\n" . | |
"-background\n" . | |
"\tlist pf background genes in tab-delimited format, 1st column of offical gene symbols is used\n" . | |
"-threshold\n" . | |
"\tp-value threshold to determine overexpression\n". | |
"-live\n" . | |
"\tusing current genes and terms annotation from http://geneontology.org/\n" . | |
"-help\n" . | |
"\tdefine usage\n" | |
); | |
die("\n"); | |
} | |