Skip to content
Permalink
master
Switch branches/tags

Name already in use

A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
Go to file
 
 
Cannot retrieve contributors at this time
executable file 163 lines (122 sloc) 3.92 KB
#!/usr/bin/perl
# parseGenBankFile
# parse RNA/Protein GenBank file
# extracts symbol and RefSeq IDs information
#
# Input
# -gbk : GenBank compressed file as downloaded for NCBI
# -help : prints a help message
#
# Output
# tab-delimited table of RefSeq IDs, Official Symbols and
# records description
#
# Version 1.0
# Date Sep 2016
# Georgi Tushev
# Scientific Computing Facility
# Max-Planck Institute for Brain Research
# send bug reports to sciclist@brain.mpg.de
use warnings;
use strict;
use Getopt::Long();
sub parseGBKRecord($);
sub printInfo($);
sub usage($);
# main body
MAIN:{
# query file name
my $gbk_file;
my $gbk_prot;
my $gbk_rna;
my $help;
# set-up input parameters
Getopt::Long::GetOptions(
"gbk=s" => \$gbk_file,
"help" => \$help
) or usage("Error :: invalid command line options");
# default help output
usage("version 1.0, Sep 2016") if($help);
# check if genbank file is provided
usage("Error :: gz compressed GenBank is required") unless defined($gbk_file);
# record counter
my $counter = 0;
# open file to read
open(my $fh, "gzcat $gbk_file |") or die("Can't open $gbk_file to read!\n");
# record separator
local $/ = "//\n";
# print header line
print "#Symbol\tRefSeqID\tRefSeqRef\tEntrezID\tGI\tLength\tOrganism\tDescription\n";
# loop through each record
while(my $record = <$fh>)
{
# remove new line
chomp($record);
# parse GBK record
my $info = parseGBKRecord($record);
# print result
printInfo($info);
# increment record counter
$counter++;
}
close($fh);
#print $record,"\n";
print STDERR $counter,"\n";
}
### --- SUBROUTINES --- ###
sub parseGBKRecord($)
{
# get locally a record
my $record = $_[0];
# parse features
my $symbol = ($record =~ m/\/gene=\"(.+)\"/) ? $1 : "<symbol>";
my $refseq_id = ($record =~ m/LOCUS\s+([ANXY][MPR]\_[0-9]+)/) ? $1 : "<refseq_id>";
my $entrez_id = ($record =~ m/\/db_xref=\"GeneID:([0-9]+)\"/) ? $1 : "<entrez_id>";
my $refseq_ref = ($record =~ m/\/protein_id=\"(?:.*)([ANXY][MRP]\_[0-9]+)/) ? $1 : "<refseq_ref>";
if ($refseq_ref eq "<refseq_ref>")
{
$refseq_ref = ($record =~ m/\/coded_by=\"(?:.*)([ANXY][MRP]\_[0-9]+)/) ? $1 : "<refseq_ref>";
}
my $gi_prot = ($record =~ m/GI\:([0-9]+)/) ? $1 : "<gi_prot>";
my $organism = ($record =~ m/ORGANISM\s+(.+)\n/) ? $1 : "<organism>";
my $length = ($record =~ m/LOCUS\s+[ANXY][MPR]\_[0-9]+\s+([0-9]+)\s/) ? $1 : "<length>";
my $description = ($record =~ m/DEFINITION\s+(.+)ACCESSION/s) ? $1 : "<description>";
# clean description
$description =~ s/\n//g;
$description =~ s/ +/ /g;
$description =~ s/ \[$organism\]\.//g;
$description =~ s/\s?$organism//g;
# Return result
return [$symbol, $refseq_id, $refseq_ref, $entrez_id, $gi_prot, $length, $organism, $description];
}
sub printInfo($)
{
my $info_ref = shift;
my $info_siz = scalar(@{$info_ref});
for(my $k = 0; $k < $info_siz; $k++)
{
print $info_ref->[$k],"\t" if ($k < ($info_siz - 1));
print $info_ref->[$k],"\n" if ($k == ($info_siz - 1));
}
}
sub usage($)
{
my $message = $_[0];
if (defined $message && length $message)
{
$message .= "\n" unless($message =~ /\n$/);
}
my $command = $0;
$command =~ s#^.*/##;
print STDERR (
$message,
"usage: $command -gbk genebank_file.gbk.gz\n" .
"description: parse RNA/Protein GenBank file and extracts symbol and RefSeq IDs information\n" .
"parameters:\n" .
"-gbk\n" .
"\tGZIP compressed GenBank file as downloaded from NCBI ftp://ftp.ncbi.nlm.nih.gov/genomes/Mus_musculus/RNA/\n" .
"-help\n" .
"\tdefine usage\n"
);
die("\n");
}