Skip to content
Permalink
master
Switch branches/tags

Name already in use

A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
Go to file
 
 
Cannot retrieve contributors at this time
#!/usr/bin/perl
# GOAnalysis
# do GO analysis of two unrancked lists
#
# Input
# -obo : Gene Ontology terms annotation in
# open biological and biomedical ontology format
# -ann : Gene annotation to GO terms
# -target : target list of genes (1st column tab delimited file)
# -background : background list of genes (1st column tab delimited file)
# -threshold : p-value threshold (default 0.001)
# -live : gene ontology terms and genes annotation is taken
# live from GO Consortium web page: http://geneontology.org/
#
# Output
# tab-delimited table of overexpressed terms, sorted by increasing p-value
#
# Version 1.0
# Date Sep 2016
# Georgi Tushev
# Scientific Computing Facility
# Max-Planck Institute for Brain Research
# send bug reports to sciclist@brain.mpg.de
use warnings;
use strict;
use Getopt::Long();
#use Math::GSL::CDF qw/:hypergeometric/;
sub usage($);
sub ParseQueryList($$$);
sub ParseGOAnnotation($$$);
sub ParseTermsAnnotation($$$);
MAIN:
{
# define input selection
my $obo_file;
my $annotation_file;
my $target_file;
my $background_file;
my $threshold = 0.05;
my $live;
my $help;
my %hash = ();
# set-up parameters
Getopt::Long::GetOptions(
"obo=s" => \$obo_file,
"ann=s" => \$annotation_file,
"target=s" => \$target_file,
"background=s" => \$background_file,
"threshold=f" => \$threshold,
"live" => \$live,
"help" => \$help
) or usage("Error :: invalid command line options");
# define help output
usage("version 1.0, Sep 2016") if($help);
# parse background list
ParseQueryList(\%hash, $background_file, 1);
# parse target list
ParseQueryList(\%hash, $target_file, 2);
# parse go annotation
ParseGOAnnotation(\%hash, $annotation_file, $live);
# parse term annotation
ParseTermsAnnotation(\%hash, $obo_file, $live);
# count M and n number of genes in background/target AND in ontology
my $M = 0;
my $n = 0;
foreach my $gene (keys %{$hash{"list"}})
{
$M++ if(($hash{"list"}{$gene} & 4) && ($hash{"list"}{$gene} & 1));
$n++ if(($hash{"list"}{$gene} & 4) && ($hash{"list"}{$gene} & 2));
}
print STDERR "M= $M\n";
print STDERR "n= $n\n";
my $PASS = 0;
my $TOTAL = 0;
# count per GO term
foreach my $term (keys %{$hash{"go2gene"}})
{
if (exists($hash{"go2gene"}{$term}{"namespace"}) && exists($hash{"go2gene"}{$term}{"name"}))
{
if (exists($hash{"go2gene"}{$term}{"genes"}))
{
my $x = 0;
my $K = 0;
my @ref = ();
foreach my $gene (keys %{$hash{"go2gene"}{$term}{"genes"}})
{
$K++ if(($hash{"list"}{$gene} & 4) && ($hash{"list"}{$gene} & 1));
if(($hash{"list"}{$gene} & 4) && ($hash{"list"}{$gene} & 2))
{
$x++;
push(@ref, $gene);
}
}
# print out result
if ($x > 0)
{
my $reflist = join(",", sort @ref);
my $pval = 1;
if ($x < $K)
{
#$pval = gsl_cdf_hypergeometric_Q($x, $K, $M - $K, $n);
}
elsif ($x == $K)
{
# one need to change the cumulative upper tail with PDF
#$pval = gsl_cdf_hypergeometric_P($x, $K, $M - $K, $n);
#print $term,"\t",$pval,"\n";
}
my $filter = ($pval <= $threshold) ? 1 : 0;
$PASS += $filter;
$TOTAL++;
print $term,"\t",$hash{"go2gene"}{$term}{"namespace"},"\t",$hash{"go2gene"}{$term}{"name"},"\t",$x,"\t",$M,"\t",$K,"\t",$n,"\t",$pval,"\t",$filter,"\n";
}
}
}
}
print STDERR "PASS = $PASS / $TOTAL\n";
}
### subroutines ###
sub ParseTermsAnnotation($$$)
{
my $hash_ref = $_[0];
my $obo_file = $_[1];
my $live = $_[2];
### add live download for obo terms
usage("Error :: OBO file is not specified") unless defined ($obo_file);
# parse obo file
local $/ = '';
open (my $fh, "<", $obo_file) or die $!;
while (<$fh>)
{
chomp($_);
### check if term
my $isterm = ($_ =~ m/\[Term\]/) ? "term" : "not-term";
my $isobsolete = ($_ =~ m/is_obsolete\: true/) ? "obsolete" : "valid";
my $goid = ($_ =~ m/id\: (GO\:[0-9]+)/) ? $1 : "<unknown>";
my $name = ($_ =~ m/name\: (.*)/) ? $1 : "<no_name>";
my $namespace = ($_ =~ m/namespace\: (.*)/) ? $1 : "<no_category>";
#my $def = ($_ =~ m/def\: \"(.*)\"/) ? $1 : "<no_definition>";
if (($isterm eq "term") && ($isobsolete eq "valid") && ($goid ne "<unknown>"))
{
if (exists($hash_ref->{"go2gene"}{$goid}))
{
$hash_ref->{"go2gene"}{$goid}{"name"} = $name;
$hash_ref->{"go2gene"}{$goid}{"namespace"} = $namespace;
#$hash_ref->{"go2gene"}{$goid}{"def"} = $def;
}
}
}
close($fh);
}
sub ParseGOAnnotation($$$)
{
my $hash_ref = $_[0];
my $ann_file = $_[1];
my $live = $_[2];
### add live download for annotation
usage("Error :: GO annotation file is not specified") unless defined ($ann_file);
# parse annotation file
open (my $fh, "gzcat $ann_file |") or die $!;
while (<$fh>)
{
chomp($_);
# skip header
next if ($_ =~ m/^\!/);
# skip
my @line = split('\t', $_, 15);
if(exists($hash_ref->{"list"}{$line[2]}))
{
$hash_ref->{"list"}{$line[2]} |= 4;
$hash_ref->{"gene2go"}{$line[2]}{$line[4]}++;
$hash_ref->{"go2gene"}{$line[4]}{"genes"}{$line[2]}++;
}
else
{
next;
}
}
close($fh);
}
sub ParseQueryList($$$)
{
my $hash_ref = $_[0];
my $query_file = $_[1];
my $bit_mask = $_[2];
open(my $fh, "<", $query_file) or die $!;
while(<$fh>)
{
chomp($_);
my ($symbol, $rest) = split('\t', $_, 2);
$hash_ref->{"list"}{$symbol} |= $bit_mask;
}
close($fh);
}
sub usage($)
{
my $message = $_[0];
if (defined $message && length $message)
{
$message .= "\n" unless($message =~ /\n$/);
}
my $command = $0;
$command =~ s#^.*/##;
print STDERR (
$message,
"usage: $command -live -target target_list.txt -background background_list.txt\n" .
"description: GOAnalysis performs gene ontology analysis of two unrancked lists\n" .
"parameters:\n" .
"-obo\n" .
"\tGO Terms annotations in OBO file format http://geneontology.org/page/download-ontology\n" .
"-ann\n" .
"\tGenes to terms annotation in tabulated file format http://geneontology.org/page/download-annotations\n" .
"-target\n" .
"\tlist of target genes in tab-delimited format, 1st column of official gene symbols is used\n" .
"-background\n" .
"\tlist pf background genes in tab-delimited format, 1st column of offical gene symbols is used\n" .
"-threshold\n" .
"\tp-value threshold to determine overexpression\n".
"-live\n" .
"\tusing current genes and terms annotation from http://geneontology.org/\n" .
"-help\n" .
"\tdefine usage\n"
);
die("\n");
}