Skip to content
Permalink
master
Switch branches/tags
Go to file
 
 
Cannot retrieve contributors at this time
function [nSeqs, omittedIndices, passFasta] = readFastaSequences(txt, segmentSizeInBps, givenNSeqs, outputFolder, debugLevel, debugMsgLocation)
% READFASTASEQUENCES
%
% READFASTASEQUENCES reads fasta file and extracts sequence to fasta variable.
% Modified version of readfasta shipped with ODH Matlab code.
%
% INPUT PARAMS:
% txt - name of FASTA filename
%
% segmentSizeInBps - segment size in basepairs
%
% givenNSeqs - # sequences as given from the main function (expected in
% the FASTA file). This is needed since the returned number of
% sequences can be different than givenNSeqs
%
% outputFolder - path to the output folder (omitted fasta IDs are written
% at this location
%
% debugLevel and debugMsgLocation - (default) 2, and (default) 1 for stdout
% or filename if to be written to file
%
% OUTPUT PARAMS:
% passfasta - struct containing cell arrays with legends and sequences
% passFasta.legend = cell array with sequence legends (headers)
% passFasta.seqeunce = cell array with sequences
%
% omittedIndices - List of indices that were omittedIndices due to their
% length shorter than the segment size
%
% nSeqs - #sequences read from the given file
%
% ADDITIONAL NOTES
%
% Author: snikumbh@mpi-inf,mpg.de
totalArguments = 6;
if nargin < totalArguments
debugMsgLocation = 1;
end
if nargin < totalArguments - 1
debugMsgLocation = 1;
debugLevel = 2;
end
fasta.legend = cell(1, givenNSeqs);
fasta.sequence = cell(1, givenNSeqs);
i=0;
omittedIndices = [];
fasta.title=[txt,', read: ', date];
% Read Sequences and their FASTA legends
fid=fopen(txt,'r');
ofid=fopen(strcat(outputFolder, '/omittedFastaIds.txt'), 'a');
fprintf(ofid, '----%s---%s----\n\n', char(datetime), txt);
lengths = [];
while i < givenNSeqs % feof(fid)==0
lineRead=fgetl(fid);
if isempty(strfind(lineRead, '#'))
if strfind(lineRead,'>')
fasta.legend{i+1} = lineRead;
else
i=i+1;
lengths = [lengths length(lineRead)];
if lengths(end) >= segmentSizeInBps
fasta.sequence{i}=[fasta.sequence{i},upper(lineRead)];
else % we omit that sequence
omittedIndices = [omittedIndices i];
fprintf(ofid, '%d\t%s', i, fasta.legend{i});
fprintf(ofid, '\n');
end
end
end
end
fclose(ofid);
fclose(fid);
ineligibleLengths = lengths(lengths < segmentSizeInBps);
ineligibleIndices = find(lengths < segmentSizeInBps);
nSeqs = givenNSeqs;
passFasta = fasta;
if size(ineligibleLengths, 2) > 0
logMessages(debugMsgLocation, sprintf('%d sequence(s) have been omitted as they are shorter than the segment-size specified. Check omittedFastaIds.txt\n', length(ineligibleIndices)), debugLevel);
% Change to what would be passed, if it is different than 'fasta' above
nSeqs = length(find(lengths >= segmentSizeInBps));
passFasta.sequence = cell(1, nSeqs);
j = 1;
for i=1:givenNSeqs
if ~isempty(fasta.sequence{i})
passFasta.sequence{j} = fasta.sequence{i};
j = j + 1;
end
end
end
end