Skip to content
Permalink
master
Switch branches/tags

Name already in use

A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
Go to file
 
 
Cannot retrieve contributors at this time
executable file 266 lines (228 sloc) 7.51 KB
#!/usr/bin/perl
# parseGenBankFile
# parse RNA/Protein GenBank file
# extracts symbol and RefSeq IDs information
#
# Input
# -gbk : GenBank compressed file as downloaded for NCBI
# -help : prints a help message
#
# Output
# tab-delimited table of RefSeq IDs, Official Symbols and
# records description
#
# Version 1.0
# Date Sep 2016
# Georgi Tushev
# Scientific Computing Facility
# Max-Planck Institute for Brain Research
# send bug reports to sciclist@brain.mpg.de
use warnings;
use strict;
use Getopt::Long();
sub parseGenBankFile($$);
sub printCodonTable($$$);
sub usage($);
# main body
MAIN:{
# query file name
my $gbk_file;
my $help;
# set-up input parameters
Getopt::Long::GetOptions(
"gbk=s" => \$gbk_file,
"help" => \$help
) or usage("Error :: invalid command line options");
# default help output
usage("version 1.0, Sep 2016") if($help);
# check if genbank file is provided
usage("Error :: gz compressed GenBank is required") unless defined($gbk_file);
# record counter
my %codon_table = ();
my %genetic_code = (
'TCA' => 'S', # Serine
'TCC' => 'S', # Serine
'TCG' => 'S', # Serine
'TCT' => 'S', # Serine
'TTC' => 'F', # Phenylalanine
'TTT' => 'F', # Phenylalanine
'TTA' => 'L', # Leucine
'TTG' => 'L', # Leucine
'TAC' => 'Y', # Tyrosine
'TAT' => 'Y', # Tyrosine
'TAA' => '*', # Stop
'TAG' => '*', # Stop
'TGC' => 'C', # Cysteine
'TGT' => 'C', # Cysteine
'TGA' => '*', # Stop
'TGG' => 'W', # Tryptophan
'CTA' => 'L', # Leucine
'CTC' => 'L', # Leucine
'CTG' => 'L', # Leucine
'CTT' => 'L', # Leucine
'CCA' => 'P', # Proline
'CCC' => 'P', # Proline
'CCG' => 'P', # Proline
'CCT' => 'P', # Proline
'CAC' => 'H', # Histidine
'CAT' => 'H', # Histidine
'CAA' => 'Q', # Glutamine
'CAG' => 'Q', # Glutamine
'CGA' => 'R', # Arginine
'CGC' => 'R', # Arginine
'CGG' => 'R', # Arginine
'CGT' => 'R', # Arginine
'ATA' => 'I', # Isoleucine
'ATC' => 'I', # Isoleucine
'ATT' => 'I', # Isoleucine
'ATG' => 'M', # Methionine
'ACA' => 'T', # Threonine
'ACC' => 'T', # Threonine
'ACG' => 'T', # Threonine
'ACT' => 'T', # Threonine
'AAC' => 'N', # Asparagine
'AAT' => 'N', # Asparagine
'AAA' => 'K', # Lysine
'AAG' => 'K', # Lysine
'AGC' => 'S', # Serine
'AGT' => 'S', # Serine
'AGA' => 'R', # Arginine
'AGG' => 'R', # Arginine
'GTA' => 'V', # Valine
'GTC' => 'V', # Valine
'GTG' => 'V', # Valine
'GTT' => 'V', # Valine
'GCA' => 'A', # Alanine
'GCC' => 'A', # Alanine
'GCG' => 'A', # Alanine
'GCT' => 'A', # Alanine
'GAC' => 'D', # Aspartic Acid
'GAT' => 'D', # Aspartic Acid
'GAA' => 'E', # Glutamic Acid
'GAG' => 'E', # Glutamic Acid
'GGA' => 'G', # Glycine
'GGC' => 'G', # Glycine
'GGG' => 'G', # Glycine
'GGT' => 'G', # Glycine
);
my %aminoacid_map = (
'A' => 'Ala', 'R' => 'Arg', 'N' => 'Asn', 'D' => 'Asp',
'C' => 'Cys', 'Q' => 'Gln', 'E' => 'Glu', 'G' => 'Gly',
'H' => 'His', 'I' => 'Ile', 'L' => 'Leu', 'K' => 'Lys',
'M' => 'Met', 'F' => 'Phe', 'P' => 'Pro', 'S' => 'Ser',
'T' => 'Thr', 'W' => 'Trp', 'Y' => 'Tyr', 'V' => 'Val',
'B' => 'Asx', 'Z' => 'glx', 'X' => 'Xaa', '*' => 'Stop'
);
# parse GenBank file
parseGenBankFile($gbk_file, \%codon_table);
# print codon table
printCodonTable(\%codon_table, \%genetic_code, \%aminoacid_map);
}
### --- SUBROUTINES --- ###
sub printCodonTable($$$)
{
my $codon_table_ref = $_[0];
my $genetic_code = $_[1];
my $aminoacid_map = $_[2];
# print header
print "# Gene.Symbol\t";
print "Transcript\t";
my @list_codons = sort keys %{$genetic_code};
my @list_aa = ();
my @list_abbreviation = ();
foreach (@list_codons)
{
my $aa = $genetic_code->{$_};
push(@list_aa, $aa);
push(@list_abbreviation, $aminoacid_map->{$aa});
}
# sort based on AminoAcid letter
my @index_sort = sort {$list_aa[$a] cmp $list_aa[$b]} 0..$#list_aa;
@list_codons = @list_codons[@index_sort];
@list_aa = @list_aa[@index_sort];
@list_abbreviation = @list_abbreviation[@index_sort];
print join("\t",@list_codons),"\n";
print "#\t#\t",join("\t", @list_aa),"\n";
print "#\t#\t",join("\t", @list_abbreviation),"\n";
# print counts
foreach my $gene (sort keys %{$codon_table_ref})
{
foreach my $transcript (sort keys %{$codon_table_ref->{$gene}})
{
my @count = ();
foreach my $codon (@list_codons)
{
my $value = exists($codon_table_ref->{$gene}{$transcript}{$codon}) ? $codon_table_ref->{$gene}{$transcript}{$codon} : 0;
push(@count, $value);
}
print $gene,"\t",$transcript,"\t",join("\t", @count),"\n";
}
}
}
sub parseGenBankFile($$)
{
my $gbk_file = $_[0];
my $codon_table_ref = $_[1];
my $count_records = 0;
my $count_protein_coding = 0;
# open file to read
open(my $fh, "gunzip -c $gbk_file |") or die("Can't open $gbk_file to read!\n");
# record separator
local $/ = "//\n";
# loop through each record
while(my $record = <$fh>)
{
# remove new line
chomp($record);
$count_records++;
next if ($record !~ m/\/protein_id=/);
# parse features
my $symbol = ($record =~ m/\/gene=\"(.+)\"/) ? $1 : "<symbol>";
my $refseq_id = ($record =~ m/LOCUS\s+([ANXY][MPR]\_[0-9]+)/) ? $1 : "<refseq_id>";
my $cds_start = ($record =~ m/CDS\s+(\d+)/) ? $1 : -1;
my $cds_end = ($record =~ m/CDS\s+\d+\.\.(\d+)/) ? $1 : -1;
next if (($cds_start == -1) || ($cds_end == -1));
# parse sequence
my $seq_start = index($record, "ORIGIN");
my $seq_record = substr($record, $seq_start, length($record) - $seq_start);
$seq_record =~ s/[^acgtn]//g;
$seq_record =~ tr/[acgtn]/[ACGTN]/;
if (($cds_end - $cds_start) > 0)
{
#print $refseq_id,";",$symbol,"\t",substr($seq_record, $cds_start - 1, $cds_end - $cds_start + 1),"\n";
}
next if (length($seq_record) < $cds_start);
# count codons
for (my $k = $cds_start - 1; $k < $cds_end; $k+=3)
{
my $codon = substr($seq_record, $k, 3);
$codon_table_ref->{$symbol}{$refseq_id}{$codon}++;
}
$count_protein_coding++;
#last if($count_protein_coding == 20000);
}
close($fh);
print STDERR "GenBank records: ", $count_records, "\n";
print STDERR "GenBank protein coding: ", $count_protein_coding, "\n";
}
sub usage($)
{
my $message = $_[0];
if (defined $message && length $message)
{
$message .= "\n" unless($message =~ /\n$/);
}
my $command = $0;
$command =~ s#^.*/##;
print STDERR (
$message,
"usage: $command -gbk genebank_file.gbk.gz\n" .
"description: parse RNA/Protein GenBank file and extracts symbol and RefSeq IDs information\n" .
"parameters:\n" .
"-gbk\n" .
"\tGZIP compressed GenBank file as downloaded from NCBI ftp://ftp.ncbi.nlm.nih.gov/genomes/Mus_musculus/RNA/\n" .
"-help\n" .
"\tdefine usage\n"
);
die("\n");
}