Skip to content
Permalink
master
Switch branches/tags
Go to file
 
 
Cannot retrieve contributors at this time
function [ODHFeatureVecInstances, numOfSegments] = getODHFeatureVecInstances(seq, oligoLen, maxDist, segmentSizeInPercentage, segmentSizeInBps, segmentingStart, alphabet, debug)
% GETODHFEATUREVECINSTANCES
%
% INPUT PARAMS
% Param 'seq'
% candidate sequence (length L given in #base pairs)
%
% Param 'oligoLen'
% Oligomer length for ODH (K, default: 2)
%
% Param 'maxDist'
% Maximum distance value to consider for ODH;
% (default: 50)
%
% Param 'segmentSizeInPercentage'
% Percentage value used to segment the complete sequence; set this to 0 if
% percentage-based segmenting is not to be used (default: 0)
%
% Param 'segmentSizeInBps'
% If percentage value for segmenting is 0 this has to be non-zero and less
% than complete sequence length. It is given in number of basepairs.
% (default: 100 bps)
%
% Param 'segmentingStart'
% 'shift' specifies shifting the segmentStart to
% (round(segmentSizeInBps/2) + 1);
% otherwise 'no-shift' (default)
%
% Param 'alphabet'
% DNA/Protein sequence alphabet (default: DNA)
%
%
% OUTPUT PARAMS
% Param 'ODHFeatureVecInstances'
% Representation of (ordered) instances (or segments) of the sequence supplied;
% each in sparse format.
%
% Param numOfSegments
%
% ADDITIONAL NOTES
% -- Order of segments
% . This is only to help the user in determining the where non-shifted
% segments end in the collection, and where shifted segment start
% . The subsequent (set) kernel computation is agnostic to this arrangement
%
% -- maxDist is now as per the original ODH paper, Lmax - K
% . Edits: 06, February, 2017, By: snikumbh
%
% -- Sparsify the end result again? May be not needed.
% . [These can be concatenated to obtain the one final modifiedODHFeatureVec]
%
%
%
% Author: snikumbh@mpi-inf.mpg.de
% Set default values when not supplied
totalArguments = 8;
if nargin < totalArguments
alphabet = 'acgt';
debug = 0;
end
if nargin < totalArguments - 1
debug = 0;
alphabet = 'acgt';
segmentingStart = 'no-shift';
end
if nargin < totalArguments - 2
debug = 0;
alphabet = 'acgt';
segmentingStart = 'no-shift';
segmentSizeInBps = 100;
end
if nargin < totalArguments - 3
debug = 0;
alphabet = 'acgt';
segmentingStart = 'no-shift';
segmentSizeInBps = 100;
segmentSizeInPercentage = 0;
end
if nargin < totalArguments - 4
debug = 0;
alphabet = 'acgt';
segmentingStart = 'no-shift';
segmentSizeInBps = 100;
segmentSizeInPercentage = 0;
maxDist = 50;
end
if nargin < totalArguments - 5
debug = 0;
alphabet = 'acgt';
segmentingStart = 'no-shift';
segmentSizeInBps = 100;
segmentSizeInPercentage = 0;
maxDist = 50;
oligoLen = 2;
end
alphabetLen = length(alphabet);
M = alphabetLen^oligoLen;
% current sequence length
sequenceLen = length(seq);
if debug
fprintf('SequenceLength = %d\n', sequenceLen);
end
% allocate memory for (large) non-sparse feature vector
if strcmp(segmentingStart, 'no-shift')
segmentLength = segmentSizeInBps;
% Segmenting
segmentStarts = [1: segmentLength: sequenceLen-(segmentLength/2)];
segmentBoundaries = [segmentLength: segmentLength: sequenceLen];
if mod(sequenceLen, segmentLength) <= (segmentLength/2)
segmentBoundaries(length(segmentBoundaries)) = sequenceLen;
else
segmentBoundaries = [segmentBoundaries sequenceLen];
end
numOfSegments = length(segmentBoundaries);
elseif strcmp(segmentingStart, 'shift')
segmentLength = segmentSizeInBps;
% Segmenting
segmentStarts = [1+(segmentLength/2): segmentLength: (sequenceLen-(segmentLength/2))];
segmentBoundaries = [segmentLength+(segmentLength/2): segmentLength: sequenceLen];
numOfSegments = length(segmentBoundaries);
end
% Asert same number of segmentStarts and Boundaries
% assert(size(segmentStarts,2) == size(segmentBoundaries,2), 'Check segments!');
if debug
fprintf('Segments with %s\n', segmentingStart);
fprintf('#Segments = %d\n', numOfSegments);
fprintf('segmentStarts = \n');
disp(segmentStarts);
fprintf('segmentBoundaries = \n');
disp(segmentBoundaries);
end
if maxDist == segmentSizeInBps
maxDist = segmentSizeInBps-oligoLen;
end
numDist = maxDist + 1;
% 1. correct for zero modifiedODHFeatureVec
% 2. also check that max_dist is not greater
% than length of the segments. The last segment
% will always be the equal-sized or smallest,
% hence checking against it only.
%if max_dist > (sequenceLen-segmentBoundaries)-oligoLen+1
% max_dist = (sequenceLen-segmentBoundaries)-oligoLen+1; %maximum possible
% fprintf('IMPORTANT: Max. distance has been set to %d\n', max_dist);
% num_dist = max_dist + 1;
%else
% num_dist = max_dist + 1;
%end
%
%columns correspond to different instances; rows to FVs
ODHFeatureVecInstances = zeros((M^2) * numDist, numOfSegments);
% modifiedODHFeatureVec = [];%zeros( modifiedODHFeatureVecLen , 1);
% Refer paper.
% Segmenting already done.
% Set variables before entering loop
singleSegmentODHFeatureVecLen = (M^2) * numDist;
multiplier = alphabetLen.^[oligoLen-1:-1:0];
for segmentIndex = 1:numOfSegments
thisSegment = seq(segmentStarts(segmentIndex) : segmentBoundaries(segmentIndex));
thisSegmentLen = length(thisSegment);
singleSegmentODHFeatureVec = zeros( singleSegmentODHFeatureVecLen , 1);
seq_numerical = zeros(1, thisSegmentLen);
for l = 1:alphabetLen
seq_numerical(lower(thisSegment)==lower(alphabet(l))) = l;
end
seq_numerical(seq_numerical==0) = -Inf;
seq_numerical = seq_numerical - 1;
for posInSegment = 1:thisSegmentLen-oligoLen+1
ind_list(posInSegment) = sum(multiplier .* seq_numerical(posInSegment:posInSegment+oligoLen-1));
end
% ind_list stores the index of oligo-pairs
for first_pos = 1:thisSegmentLen-oligoLen+1
ind_first_pos = (M*numDist) * ind_list(first_pos);
for second_pos = first_pos:min((first_pos+numDist-1), (thisSegmentLen-oligoLen+1))
%for second_pos = first_pos:thisSegmentLen-oligoLen+1
ind_dist = second_pos - first_pos + 1;
% don't exceed vector dimension!
%if ind_dist <= numDist
% this is commented, because the second_pos values
% will never cross num_dist
ind_second_pos = numDist * ind_list(second_pos);
% non-alphabet bases are ignored
if (ind_first_pos < 0 | ind_second_pos < 0)
else
singleSegmentODHFeatureVec(ind_first_pos + ind_second_pos + ind_dist) = ...
singleSegmentODHFeatureVec(ind_first_pos + ind_second_pos + ind_dist) + 1;
end
%else % nothing
%end
end
end
% singleSegmentODHFeatureVec is ready. Normalize by its length and Collect as an instance.
singleSegmentODHFeatureVec = singleSegmentODHFeatureVec ./ norm(singleSegmentODHFeatureVec, 2);
% Individual instances are already stored as space vectors
ODHFeatureVecInstances(:, segmentIndex) = sparse(singleSegmentODHFeatureVec);
end %segment for loop ends
% ensure that the final fv is of right dimensions
%if length(modifiedODHFeatureVec) ~= modifiedODHFeatureVecLen
% fprintf('Oops, somthing went wrong! Vector lengths dont match\n');
%else %nothing, all OK
%end
% Since the individual vectors are already sparse, do we need to sparsify again?
ODHFeatureVecInstances = sparse(ODHFeatureVecInstances);
end %Function ends