Skip to content
Permalink
master
Switch branches/tags

Name already in use

A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
Go to file
 
 
Cannot retrieve contributors at this time
function [ODHFeatureVecInstances, numOfSegments] = getODHFeatureVecInstances(seq, oligoLen, maxDist, segmentSizeInPercentage, segmentSizeInBps, segmentingStart, alphabet, debug)
% GETODHFEATUREVECINSTANCES
%
% INPUT PARAMS
% Param 'seq'
% candidate sequence (length L given in #base pairs)
%
% Param 'oligoLen'
% Oligomer length for ODH (K, default: 2)
%
% Param 'maxDist'
% Maximum distance value to consider for ODH;
% (default: 50)
%
% Param 'segmentSizeInPercentage'
% Percentage value used to segment the complete sequence; set this to 0 if
% percentage-based segmenting is not to be used (default: 0)
%
% Param 'segmentSizeInBps'
% If percentage value for segmenting is 0 this has to be non-zero and less
% than complete sequence length. It is given in number of basepairs.
% (default: 100 bps)
%
% Param 'segmentingStart'
% 'shift' specifies shifting the segmentStart to
% (round(segmentSizeInBps/2) + 1);
% otherwise 'no-shift' (default)
%
% Param 'alphabet'
% DNA/Protein sequence alphabet (default: DNA)
%
%
% OUTPUT PARAMS
% Param 'ODHFeatureVecInstances'
% Representation of (ordered) instances (or segments) of the sequence supplied;
% each in sparse format.
%
% Param numOfSegments
%
% ADDITIONAL NOTES
% -- Order of segments
% . This is only to help the user in determining the where non-shifted
% segments end in the collection, and where shifted segment start
% . The subsequent (set) kernel computation is agnostic to this arrangement
%
% -- maxDist is now as per the original ODH paper, Lmax - K
% . Edits: 06, February, 2017, By: snikumbh
%
% -- Sparsify the end result again? May be not needed.
% . [These can be concatenated to obtain the one final modifiedODHFeatureVec]
%
%
%
% Author: snikumbh@mpi-inf.mpg.de
% Set default values when not supplied
totalArguments = 8;
if nargin < totalArguments
alphabet = 'acgt';
debug = 0;
end
if nargin < totalArguments - 1
debug = 0;
alphabet = 'acgt';
segmentingStart = 'no-shift';
end
if nargin < totalArguments - 2
debug = 0;
alphabet = 'acgt';
segmentingStart = 'no-shift';
segmentSizeInBps = 100;
end
if nargin < totalArguments - 3
debug = 0;
alphabet = 'acgt';
segmentingStart = 'no-shift';
segmentSizeInBps = 100;
segmentSizeInPercentage = 0;
end
if nargin < totalArguments - 4
debug = 0;
alphabet = 'acgt';
segmentingStart = 'no-shift';
segmentSizeInBps = 100;
segmentSizeInPercentage = 0;
maxDist = 50;
end
if nargin < totalArguments - 5
debug = 0;
alphabet = 'acgt';
segmentingStart = 'no-shift';
segmentSizeInBps = 100;
segmentSizeInPercentage = 0;
maxDist = 50;
oligoLen = 2;
end
alphabetLen = length(alphabet);
M = alphabetLen^oligoLen;
% current sequence length
sequenceLen = length(seq);
if debug
fprintf('SequenceLength = %d\n', sequenceLen);
end
% allocate memory for (large) non-sparse feature vector
if strcmp(segmentingStart, 'no-shift')
segmentLength = segmentSizeInBps;
% Segmenting
segmentStarts = [1: segmentLength: sequenceLen-(segmentLength/2)];
segmentBoundaries = [segmentLength: segmentLength: sequenceLen];
if mod(sequenceLen, segmentLength) <= (segmentLength/2)
segmentBoundaries(length(segmentBoundaries)) = sequenceLen;
else
segmentBoundaries = [segmentBoundaries sequenceLen];
end
numOfSegments = length(segmentBoundaries);
elseif strcmp(segmentingStart, 'shift')
segmentLength = segmentSizeInBps;
% Segmenting
segmentStarts = [1+(segmentLength/2): segmentLength: (sequenceLen-(segmentLength/2))];
segmentBoundaries = [segmentLength+(segmentLength/2): segmentLength: sequenceLen];
numOfSegments = length(segmentBoundaries);
end
% Asert same number of segmentStarts and Boundaries
% assert(size(segmentStarts,2) == size(segmentBoundaries,2), 'Check segments!');
if debug
fprintf('Segments with %s\n', segmentingStart);
fprintf('#Segments = %d\n', numOfSegments);
fprintf('segmentStarts = \n');
disp(segmentStarts);
fprintf('segmentBoundaries = \n');
disp(segmentBoundaries);
end
if maxDist == segmentSizeInBps
maxDist = segmentSizeInBps-oligoLen;
end
numDist = maxDist + 1;
% 1. correct for zero modifiedODHFeatureVec
% 2. also check that max_dist is not greater
% than length of the segments. The last segment
% will always be the equal-sized or smallest,
% hence checking against it only.
%if max_dist > (sequenceLen-segmentBoundaries)-oligoLen+1
% max_dist = (sequenceLen-segmentBoundaries)-oligoLen+1; %maximum possible
% fprintf('IMPORTANT: Max. distance has been set to %d\n', max_dist);
% num_dist = max_dist + 1;
%else
% num_dist = max_dist + 1;
%end
%
%columns correspond to different instances; rows to FVs
ODHFeatureVecInstances = zeros((M^2) * numDist, numOfSegments);
% modifiedODHFeatureVec = [];%zeros( modifiedODHFeatureVecLen , 1);
% Refer paper.
% Segmenting already done.
% Set variables before entering loop
singleSegmentODHFeatureVecLen = (M^2) * numDist;
multiplier = alphabetLen.^[oligoLen-1:-1:0];
for segmentIndex = 1:numOfSegments
thisSegment = seq(segmentStarts(segmentIndex) : segmentBoundaries(segmentIndex));
thisSegmentLen = length(thisSegment);
singleSegmentODHFeatureVec = zeros( singleSegmentODHFeatureVecLen , 1);
seq_numerical = zeros(1, thisSegmentLen);
for l = 1:alphabetLen
seq_numerical(lower(thisSegment)==lower(alphabet(l))) = l;
end
seq_numerical(seq_numerical==0) = -Inf;
seq_numerical = seq_numerical - 1;
for posInSegment = 1:thisSegmentLen-oligoLen+1
ind_list(posInSegment) = sum(multiplier .* seq_numerical(posInSegment:posInSegment+oligoLen-1));
end
% ind_list stores the index of oligo-pairs
for first_pos = 1:thisSegmentLen-oligoLen+1
ind_first_pos = (M*numDist) * ind_list(first_pos);
for second_pos = first_pos:min((first_pos+numDist-1), (thisSegmentLen-oligoLen+1))
%for second_pos = first_pos:thisSegmentLen-oligoLen+1
ind_dist = second_pos - first_pos + 1;
% don't exceed vector dimension!
%if ind_dist <= numDist
% this is commented, because the second_pos values
% will never cross num_dist
ind_second_pos = numDist * ind_list(second_pos);
% non-alphabet bases are ignored
if (ind_first_pos < 0 | ind_second_pos < 0)
else
singleSegmentODHFeatureVec(ind_first_pos + ind_second_pos + ind_dist) = ...
singleSegmentODHFeatureVec(ind_first_pos + ind_second_pos + ind_dist) + 1;
end
%else % nothing
%end
end
end
% singleSegmentODHFeatureVec is ready. Normalize by its length and Collect as an instance.
singleSegmentODHFeatureVec = singleSegmentODHFeatureVec ./ norm(singleSegmentODHFeatureVec, 2);
% Individual instances are already stored as space vectors
ODHFeatureVecInstances(:, segmentIndex) = sparse(singleSegmentODHFeatureVec);
end %segment for loop ends
% ensure that the final fv is of right dimensions
%if length(modifiedODHFeatureVec) ~= modifiedODHFeatureVecLen
% fprintf('Oops, somthing went wrong! Vector lengths dont match\n');
%else %nothing, all OK
%end
% Since the individual vectors are already sparse, do we need to sparsify again?
ODHFeatureVecInstances = sparse(ODHFeatureVecInstances);
end %Function ends