Permalink
Cannot retrieve contributors at this time
Name already in use
A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
comik/getODHFeatureVecInstances.m
Go to fileThis commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
221 lines (203 sloc)
7.28 KB
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
function [ODHFeatureVecInstances, numOfSegments] = getODHFeatureVecInstances(seq, oligoLen, maxDist, segmentSizeInPercentage, segmentSizeInBps, segmentingStart, alphabet, debug) | |
% GETODHFEATUREVECINSTANCES | |
% | |
% INPUT PARAMS | |
% Param 'seq' | |
% candidate sequence (length L given in #base pairs) | |
% | |
% Param 'oligoLen' | |
% Oligomer length for ODH (K, default: 2) | |
% | |
% Param 'maxDist' | |
% Maximum distance value to consider for ODH; | |
% (default: 50) | |
% | |
% Param 'segmentSizeInPercentage' | |
% Percentage value used to segment the complete sequence; set this to 0 if | |
% percentage-based segmenting is not to be used (default: 0) | |
% | |
% Param 'segmentSizeInBps' | |
% If percentage value for segmenting is 0 this has to be non-zero and less | |
% than complete sequence length. It is given in number of basepairs. | |
% (default: 100 bps) | |
% | |
% Param 'segmentingStart' | |
% 'shift' specifies shifting the segmentStart to | |
% (round(segmentSizeInBps/2) + 1); | |
% otherwise 'no-shift' (default) | |
% | |
% Param 'alphabet' | |
% DNA/Protein sequence alphabet (default: DNA) | |
% | |
% | |
% OUTPUT PARAMS | |
% Param 'ODHFeatureVecInstances' | |
% Representation of (ordered) instances (or segments) of the sequence supplied; | |
% each in sparse format. | |
% | |
% Param numOfSegments | |
% | |
% ADDITIONAL NOTES | |
% -- Order of segments | |
% . This is only to help the user in determining the where non-shifted | |
% segments end in the collection, and where shifted segment start | |
% . The subsequent (set) kernel computation is agnostic to this arrangement | |
% | |
% -- maxDist is now as per the original ODH paper, Lmax - K | |
% . Edits: 06, February, 2017, By: snikumbh | |
% | |
% -- Sparsify the end result again? May be not needed. | |
% . [These can be concatenated to obtain the one final modifiedODHFeatureVec] | |
% | |
% | |
% | |
% Author: snikumbh@mpi-inf.mpg.de | |
% Set default values when not supplied | |
totalArguments = 8; | |
if nargin < totalArguments | |
alphabet = 'acgt'; | |
debug = 0; | |
end | |
if nargin < totalArguments - 1 | |
debug = 0; | |
alphabet = 'acgt'; | |
segmentingStart = 'no-shift'; | |
end | |
if nargin < totalArguments - 2 | |
debug = 0; | |
alphabet = 'acgt'; | |
segmentingStart = 'no-shift'; | |
segmentSizeInBps = 100; | |
end | |
if nargin < totalArguments - 3 | |
debug = 0; | |
alphabet = 'acgt'; | |
segmentingStart = 'no-shift'; | |
segmentSizeInBps = 100; | |
segmentSizeInPercentage = 0; | |
end | |
if nargin < totalArguments - 4 | |
debug = 0; | |
alphabet = 'acgt'; | |
segmentingStart = 'no-shift'; | |
segmentSizeInBps = 100; | |
segmentSizeInPercentage = 0; | |
maxDist = 50; | |
end | |
if nargin < totalArguments - 5 | |
debug = 0; | |
alphabet = 'acgt'; | |
segmentingStart = 'no-shift'; | |
segmentSizeInBps = 100; | |
segmentSizeInPercentage = 0; | |
maxDist = 50; | |
oligoLen = 2; | |
end | |
alphabetLen = length(alphabet); | |
M = alphabetLen^oligoLen; | |
% current sequence length | |
sequenceLen = length(seq); | |
if debug | |
fprintf('SequenceLength = %d\n', sequenceLen); | |
end | |
% allocate memory for (large) non-sparse feature vector | |
if strcmp(segmentingStart, 'no-shift') | |
segmentLength = segmentSizeInBps; | |
% Segmenting | |
segmentStarts = [1: segmentLength: sequenceLen-(segmentLength/2)]; | |
segmentBoundaries = [segmentLength: segmentLength: sequenceLen]; | |
if mod(sequenceLen, segmentLength) <= (segmentLength/2) | |
segmentBoundaries(length(segmentBoundaries)) = sequenceLen; | |
else | |
segmentBoundaries = [segmentBoundaries sequenceLen]; | |
end | |
numOfSegments = length(segmentBoundaries); | |
elseif strcmp(segmentingStart, 'shift') | |
segmentLength = segmentSizeInBps; | |
% Segmenting | |
segmentStarts = [1+(segmentLength/2): segmentLength: (sequenceLen-(segmentLength/2))]; | |
segmentBoundaries = [segmentLength+(segmentLength/2): segmentLength: sequenceLen]; | |
numOfSegments = length(segmentBoundaries); | |
end | |
% Asert same number of segmentStarts and Boundaries | |
% assert(size(segmentStarts,2) == size(segmentBoundaries,2), 'Check segments!'); | |
if debug | |
fprintf('Segments with %s\n', segmentingStart); | |
fprintf('#Segments = %d\n', numOfSegments); | |
fprintf('segmentStarts = \n'); | |
disp(segmentStarts); | |
fprintf('segmentBoundaries = \n'); | |
disp(segmentBoundaries); | |
end | |
if maxDist == segmentSizeInBps | |
maxDist = segmentSizeInBps-oligoLen; | |
end | |
numDist = maxDist + 1; | |
% 1. correct for zero modifiedODHFeatureVec | |
% 2. also check that max_dist is not greater | |
% than length of the segments. The last segment | |
% will always be the equal-sized or smallest, | |
% hence checking against it only. | |
%if max_dist > (sequenceLen-segmentBoundaries)-oligoLen+1 | |
% max_dist = (sequenceLen-segmentBoundaries)-oligoLen+1; %maximum possible | |
% fprintf('IMPORTANT: Max. distance has been set to %d\n', max_dist); | |
% num_dist = max_dist + 1; | |
%else | |
% num_dist = max_dist + 1; | |
%end | |
% | |
%columns correspond to different instances; rows to FVs | |
ODHFeatureVecInstances = zeros((M^2) * numDist, numOfSegments); | |
% modifiedODHFeatureVec = [];%zeros( modifiedODHFeatureVecLen , 1); | |
% Refer paper. | |
% Segmenting already done. | |
% Set variables before entering loop | |
singleSegmentODHFeatureVecLen = (M^2) * numDist; | |
multiplier = alphabetLen.^[oligoLen-1:-1:0]; | |
for segmentIndex = 1:numOfSegments | |
thisSegment = seq(segmentStarts(segmentIndex) : segmentBoundaries(segmentIndex)); | |
thisSegmentLen = length(thisSegment); | |
singleSegmentODHFeatureVec = zeros( singleSegmentODHFeatureVecLen , 1); | |
seq_numerical = zeros(1, thisSegmentLen); | |
for l = 1:alphabetLen | |
seq_numerical(lower(thisSegment)==lower(alphabet(l))) = l; | |
end | |
seq_numerical(seq_numerical==0) = -Inf; | |
seq_numerical = seq_numerical - 1; | |
for posInSegment = 1:thisSegmentLen-oligoLen+1 | |
ind_list(posInSegment) = sum(multiplier .* seq_numerical(posInSegment:posInSegment+oligoLen-1)); | |
end | |
% ind_list stores the index of oligo-pairs | |
for first_pos = 1:thisSegmentLen-oligoLen+1 | |
ind_first_pos = (M*numDist) * ind_list(first_pos); | |
for second_pos = first_pos:min((first_pos+numDist-1), (thisSegmentLen-oligoLen+1)) | |
%for second_pos = first_pos:thisSegmentLen-oligoLen+1 | |
ind_dist = second_pos - first_pos + 1; | |
% don't exceed vector dimension! | |
%if ind_dist <= numDist | |
% this is commented, because the second_pos values | |
% will never cross num_dist | |
ind_second_pos = numDist * ind_list(second_pos); | |
% non-alphabet bases are ignored | |
if (ind_first_pos < 0 | ind_second_pos < 0) | |
else | |
singleSegmentODHFeatureVec(ind_first_pos + ind_second_pos + ind_dist) = ... | |
singleSegmentODHFeatureVec(ind_first_pos + ind_second_pos + ind_dist) + 1; | |
end | |
%else % nothing | |
%end | |
end | |
end | |
% singleSegmentODHFeatureVec is ready. Normalize by its length and Collect as an instance. | |
singleSegmentODHFeatureVec = singleSegmentODHFeatureVec ./ norm(singleSegmentODHFeatureVec, 2); | |
% Individual instances are already stored as space vectors | |
ODHFeatureVecInstances(:, segmentIndex) = sparse(singleSegmentODHFeatureVec); | |
end %segment for loop ends | |
% ensure that the final fv is of right dimensions | |
%if length(modifiedODHFeatureVec) ~= modifiedODHFeatureVecLen | |
% fprintf('Oops, somthing went wrong! Vector lengths dont match\n'); | |
%else %nothing, all OK | |
%end | |
% Since the individual vectors are already sparse, do we need to sparsify again? | |
ODHFeatureVecInstances = sparse(ODHFeatureVecInstances); | |
end %Function ends |