% getODHFeatureVecInstances.m

function [ODHFeatureVecInstances, numOfSegments] = getODHFeatureVecInstances(seq, oligoLen, maxDist, segmentSizeInPercentage, segmentSizeInBps, segmentingStart, alphabet, debug)
% GETODHFEATUREVECINSTANCES Segment-wise oligomer distance histograms (ODH).
%
% Splits 'seq' into consecutive segments and computes, for every segment,
% an L2-normalized oligomer distance histogram. Each segment becomes one
% column ("instance") of the returned sparse matrix.
%
% INPUT PARAMS
% Param 'seq'
%   Candidate sequence (character vector; length L given in #base pairs).
%   Matching against the alphabet is case-insensitive. Characters that are
%   not in 'alphabet' are ignored: oligomers containing them do not
%   contribute to any histogram bin.
%
% Param 'oligoLen'
%   Oligomer length for ODH (K, default: 2)
%
% Param 'maxDist'
%   Maximum distance value to consider for ODH (default: 50).
%   If maxDist equals segmentSizeInBps it is replaced by
%   segmentSizeInBps - oligoLen.
%
% Param 'segmentSizeInPercentage'
%   Percentage value intended for segmenting the complete sequence
%   (default: 0).
%   NOTE: this argument is accepted for interface compatibility but is not
%   read anywhere in this function; segmenting is always driven by
%   'segmentSizeInBps'.
%
% Param 'segmentSizeInBps'
%   Segment length in number of base pairs; must be a positive scalar
%   (default: 100 bps)
%
% Param 'segmentingStart'
%   'no-shift' (default): segments start at position 1. A trailing
%       remainder of at most half a segment is merged into the last
%       segment; a longer remainder forms its own (shorter) segment.
%   'shift': segments start at round(segmentSizeInBps/2) + 1. Only
%       full-length segments are produced; a trailing remainder is dropped.
%
% Param 'alphabet'
%   DNA/Protein sequence alphabet (default: 'acgt')
%
% Param 'debug'
%   Non-zero prints sequence length and segment layout (default: 0)
%
% OUTPUT PARAMS
% Param 'ODHFeatureVecInstances'
%   Sparse matrix of size ((M^2) * (maxDist+1)) x numOfSegments with
%   M = length(alphabet)^oligoLen. Column s is the normalized ODH of the
%   s-th segment (segments are in sequence order). For a pair of oligomers
%   with zero-based indices i1 (first) and i2 (second) whose start
%   positions differ by d-1, the row index is
%       (M*(maxDist+1))*i1 + (maxDist+1)*i2 + d.
%   A segment without any valid oligomer pair yields an all-zero column.
%
% Param 'numOfSegments'
%   Number of segments (columns) produced.
%
% ADDITIONAL NOTES
% -- Order of segments
%    . Columns are ordered by segment position. The subsequent (set) kernel
%      computation is agnostic to this arrangement.
%
% -- maxDist is now as per the original ODH paper, Lmax - K
%    . Edits: 06, February, 2017, By: snikumbh
%
% Author: snikumbh@mpi-inf.mpg.de

% Set default values when not supplied. Each argument is defaulted only if
% it is actually missing, so that supplied trailing arguments are kept.
if nargin < 8, debug = 0; end
if nargin < 7, alphabet = 'acgt'; end
if nargin < 6, segmentingStart = 'no-shift'; end
if nargin < 5, segmentSizeInBps = 100; end
if nargin < 4, segmentSizeInPercentage = 0; end %#ok<NASGU> (unused, see help)
if nargin < 3, maxDist = 50; end
if nargin < 2, oligoLen = 2; end

alphabetLen = length(alphabet);
M = alphabetLen^oligoLen;

% current sequence length
sequenceLen = length(seq);
if debug
   fprintf('SequenceLength = %d\n', sequenceLen);
end

segmentLength = segmentSizeInBps;
if ~isscalar(segmentLength) || ~(segmentLength > 0)
    error('getODHFeatureVecInstances:invalidSegmentSize', ...
          'segmentSizeInBps must be a positive scalar.');
end

% Segmenting. Boundaries (segment end positions) are computed first and the
% start positions are derived from them, so both always have equal length.
if strcmp(segmentingStart, 'no-shift')
    segmentBoundaries = segmentLength:segmentLength:sequenceLen;
    if isempty(segmentBoundaries)
        % Sequence shorter than one segment: use it as a single segment.
        if sequenceLen > 0
            segmentBoundaries = sequenceLen;
        end
    elseif mod(sequenceLen, segmentLength) <= (segmentLength/2)
        % Short remainder: extend the last segment up to the sequence end.
        segmentBoundaries(end) = sequenceLen;
    else
        % Long remainder: it becomes an additional (shorter) segment.
        segmentBoundaries = [segmentBoundaries sequenceLen];
    end
    numOfSegments = length(segmentBoundaries);
    segmentStarts = 1 + segmentLength * (0:numOfSegments-1);
elseif strcmp(segmentingStart, 'shift')
    % round() keeps the indices integral for odd segment lengths.
    shiftOffset = round(segmentLength/2);
    segmentBoundaries = (segmentLength+shiftOffset):segmentLength:sequenceLen;
    numOfSegments = length(segmentBoundaries);
    % every shifted segment has full length
    segmentStarts = segmentBoundaries - segmentLength + 1;
else
    error('getODHFeatureVecInstances:invalidSegmentingStart', ...
          'segmentingStart must be ''shift'' or ''no-shift''.');
end

if debug
    fprintf('Segments with %s\n', segmentingStart);
    fprintf('#Segments = %d\n', numOfSegments);
    fprintf('segmentStarts = \n');
    disp(segmentStarts);
    fprintf('segmentBoundaries = \n');
    disp(segmentBoundaries);
end

if maxDist == segmentSizeInBps
    maxDist = segmentSizeInBps-oligoLen;
end

numDist = maxDist + 1;

% columns correspond to different instances; rows to FVs
singleSegmentODHFeatureVecLen = (M^2) * numDist;
ODHFeatureVecInstances = zeros(singleSegmentODHFeatureVecLen, numOfSegments);

% Positional weights that turn an oligomer (as base-alphabetLen digits)
% into its zero-based index.
multiplier = alphabetLen.^(oligoLen-1:-1:0);
lowerAlphabet = lower(alphabet);

for segmentIndex = 1:numOfSegments
    thisSegment = lower(seq(segmentStarts(segmentIndex) : segmentBoundaries(segmentIndex)));
    thisSegmentLen = length(thisSegment);
    numOligos = thisSegmentLen - oligoLen + 1;

    singleSegmentODHFeatureVec = zeros(singleSegmentODHFeatureVecLen, 1);

    % Map characters to zero-based alphabet positions; characters outside
    % the alphabet become -Inf so that any oligomer containing them gets a
    % negative index and is skipped below.
    seq_numerical = zeros(1, thisSegmentLen);
    for l = 1:alphabetLen
        seq_numerical(thisSegment == lowerAlphabet(l)) = l;
    end
    seq_numerical(seq_numerical == 0) = -Inf;
    seq_numerical = seq_numerical - 1;

    % ind_list stores the zero-based index of the oligomer at each position
    % (re-allocated per segment so no entries leak between segments)
    ind_list = zeros(1, max(numOligos, 0));
    for posInSegment = 1:numOligos
        ind_list(posInSegment) = sum(multiplier .* seq_numerical(posInSegment:posInSegment+oligoLen-1));
    end

    for first_pos = 1:numOligos
        ind_first_pos = (M*numDist) * ind_list(first_pos);
        % the upper bound keeps ind_dist within 1..numDist
        for second_pos = first_pos:min((first_pos+numDist-1), numOligos)
            ind_dist       = second_pos - first_pos + 1;
            ind_second_pos = numDist * ind_list(second_pos);
            % non-alphabet bases are ignored
            if ~(ind_first_pos < 0 || ind_second_pos < 0)
                binIndex = ind_first_pos + ind_second_pos + ind_dist;
                singleSegmentODHFeatureVec(binIndex) = singleSegmentODHFeatureVec(binIndex) + 1;
            end
        end
    end

    % Normalize to unit L2 norm; leave an all-zero histogram untouched to
    % avoid 0/0 = NaN entries.
    vecNorm = norm(singleSegmentODHFeatureVec, 2);
    if vecNorm > 0
        singleSegmentODHFeatureVec = singleSegmentODHFeatureVec ./ vecNorm;
    end
    ODHFeatureVecInstances(:, segmentIndex) = singleSegmentODHFeatureVec;
end %segment for loop ends

% Return the collection in sparse format
ODHFeatureVecInstances = sparse(ODHFeatureVecInstances);

end %Function ends