Skip to content

Commit

Permalink
chunks to segments -- keyword usage changed
Browse files Browse the repository at this point in the history
  • Loading branch information
snikumbh committed Jul 19, 2017
1 parent c9f8192 commit ecf9401
Showing 1 changed file with 67 additions and 67 deletions.
134 changes: 67 additions & 67 deletions getODHFeatureVecInstances.m
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
function [ODHFeatureVecInstances, numOfChunks] = getODHFeatureVecInstances(seq, oligoLen, maxDist, chunkSizeInPercentage, chunkSizeInBps, chunkingStart, alphabet, debug)
function [ODHFeatureVecInstances, numOfSegments] = getODHFeatureVecInstances(seq, oligoLen, maxDist, segmentSizeInPercentage, segmentSizeInBps, segmentingStart, alphabet, debug)

% Author: snikumbh
%
Expand All @@ -7,24 +7,24 @@
% 2. oligoLen: oligomer length for ODH (K, default: 2)
% 3. maxDist: maximum distance value to consider for ODH;
% (default: 50)
% 4. chunkSizeInPercentage: Percentage value used to chunk
% 4. segmentSizeInPercentage: Percentage value used to segment
% the complete sequence; set this
% to 0 if percentage-based chunking
% to 0 if percentage-based segmenting
% is not to be used (default: 0)
% 5. chunkSizeInBps: If percentage value for chunking is 0
% 5. segmentSizeInBps: If percentage value for segmenting is 0
% this has to be non-zero and less than
% complete sequence length. It is given
% in number of basepairs. (default:
% 100 bps)
% 6. chunkingStart: 'shift' specifies shifting the chunkStart
% to (round(chunkSizeInBps/2) + 1);
% 6. segmentingStart: 'shift' specifies shifting the segmentStart
% to (round(segmentSizeInBps/2) + 1);
% otherwise 'no-shift' (default)
% 7. alphabet: DNA/Protein sequence alphabet (default: DNA)
%
%
% Returns
% the ODH FeatureVec representation of (ordered) instances
% (chunks) of the sequence supplied; each in sparse format.
% (segments) of the sequence supplied; each in sparse format.
% Sparsify the end result again? Maybe not needed.
% [These can be concatenated to obtain the one final modifiedODHFeatureVec]
%
Expand All @@ -41,35 +41,35 @@
if nargin < totalArguments - 1
debug = 0;
alphabet = 'acgt';
chunkingStart = 'no-shift';
segmentingStart = 'no-shift';
end
if nargin < totalArguments - 2
debug = 0;
alphabet = 'acgt';
chunkingStart = 'no-shift';
chunkSizeInBps = 100;
segmentingStart = 'no-shift';
segmentSizeInBps = 100;
end
if nargin < totalArguments - 3
debug = 0;
alphabet = 'acgt';
chunkingStart = 'no-shift';
chunkSizeInBps = 100;
chunkSizeInPercentage = 0;
segmentingStart = 'no-shift';
segmentSizeInBps = 100;
segmentSizeInPercentage = 0;
end
if nargin < totalArguments - 4
debug = 0;
alphabet = 'acgt';
chunkingStart = 'no-shift';
chunkSizeInBps = 100;
chunkSizeInPercentage = 0;
segmentingStart = 'no-shift';
segmentSizeInBps = 100;
segmentSizeInPercentage = 0;
maxDist = 50;
end
if nargin < totalArguments - 5
debug = 0;
alphabet = 'acgt';
chunkingStart = 'no-shift';
chunkSizeInBps = 100;
chunkSizeInPercentage = 0;
segmentingStart = 'no-shift';
segmentSizeInBps = 100;
segmentSizeInPercentage = 0;
maxDist = 50;
oligoLen = 2;
end
Expand All @@ -84,87 +84,87 @@
fprintf('SequenceLength = %d\n', sequenceLen);
end
% allocate memory for (large) non-sparse feature vector
if strcmp(chunkingStart, 'no-shift')
chunkLength = chunkSizeInBps;
% Chunking
chunkStarts = [1: chunkLength: sequenceLen-(chunkLength/2)];
chunkBoundaries = [chunkLength: chunkLength: sequenceLen];
if mod(sequenceLen, chunkLength) <= (chunkLength/2)
chunkBoundaries(length(chunkBoundaries)) = sequenceLen;
if strcmp(segmentingStart, 'no-shift')
segmentLength = segmentSizeInBps;
% Segmenting
segmentStarts = [1: segmentLength: sequenceLen-(segmentLength/2)];
segmentBoundaries = [segmentLength: segmentLength: sequenceLen];
if mod(sequenceLen, segmentLength) <= (segmentLength/2)
segmentBoundaries(length(segmentBoundaries)) = sequenceLen;
else
chunkBoundaries = [chunkBoundaries sequenceLen];
segmentBoundaries = [segmentBoundaries sequenceLen];
end
numOfChunks = length(chunkBoundaries);
elseif strcmp(chunkingStart, 'shift')
chunkLength = chunkSizeInBps;
% Chunking
chunkStarts = [1+(chunkLength/2): chunkLength: (sequenceLen-(chunkLength/2))];
chunkBoundaries = [chunkLength+(chunkLength/2): chunkLength: sequenceLen];
numOfChunks = length(chunkBoundaries);
numOfSegments = length(segmentBoundaries);
elseif strcmp(segmentingStart, 'shift')
segmentLength = segmentSizeInBps;
% Segmenting
segmentStarts = [1+(segmentLength/2): segmentLength: (sequenceLen-(segmentLength/2))];
segmentBoundaries = [segmentLength+(segmentLength/2): segmentLength: sequenceLen];
numOfSegments = length(segmentBoundaries);
end
% Asert same number of chunkStarts and Boundaries
%assert(size(chunkStarts,2) == size(chunkBoundaries,2), 'Check chunks!');
% Assert same number of segmentStarts and segmentBoundaries
%assert(size(segmentStarts,2) == size(segmentBoundaries,2), 'Check segments!');
if debug
fprintf('Chunks with %s\n', chunkingStart);
fprintf('#Chunks = %d\n', numOfChunks);
fprintf('chunkStarts = \n');
disp(chunkStarts);
fprintf('chunkBoundaries = \n');
disp(chunkBoundaries);
fprintf('Segments with %s\n', segmentingStart);
fprintf('#Segments = %d\n', numOfSegments);
fprintf('segmentStarts = \n');
disp(segmentStarts);
fprintf('segmentBoundaries = \n');
disp(segmentBoundaries);
end

if maxDist == chunkSizeInBps
maxDist = chunkSizeInBps-oligoLen;
if maxDist == segmentSizeInBps
maxDist = segmentSizeInBps-oligoLen;
end

numDist = maxDist + 1;

% 1. correct for zero modifiedODHFeatureVec
% 2. also check that max_dist is not greater
% than length of the chunks. The last chunk
% than length of the segments. The last segment
% will always be the equal-sized or smallest,
% hence checking against it only.

%if max_dist > (sequenceLen-chunkBoundaries)-oligoLen+1
% max_dist = (sequenceLen-chunkBoundaries)-oligoLen+1; %maximum possible
%if max_dist > (sequenceLen-segmentBoundaries)-oligoLen+1
% max_dist = (sequenceLen-segmentBoundaries)-oligoLen+1; %maximum possible
% fprintf('IMPORTANT: Max. distance has been set to %d\n', max_dist);
% num_dist = max_dist + 1;
%else
% num_dist = max_dist + 1;
%end
%
%columns correspond to different instances; rows to FVs
ODHFeatureVecInstances = zeros((M^2) * numDist, numOfChunks);
ODHFeatureVecInstances = zeros((M^2) * numDist, numOfSegments);
%modifiedODHFeatureVec = [];%zeros( modifiedODHFeatureVecLen , 1);
% Refer paper.

% Chunking already done.
% Segmenting already done.
% Set variables before entering loop
singleChunkODHFeatureVecLen = (M^2) * numDist;
singleSegmentODHFeatureVecLen = (M^2) * numDist;
multiplier = alphabetLen.^[oligoLen-1:-1:0];

for chunkIndex = 1:numOfChunks
thisChunk = seq(chunkStarts(chunkIndex) : chunkBoundaries(chunkIndex));
thisChunkLen = length(thisChunk);
for segmentIndex = 1:numOfSegments
thisSegment = seq(segmentStarts(segmentIndex) : segmentBoundaries(segmentIndex));
thisSegmentLen = length(thisSegment);

singleChunkODHFeatureVec = zeros( singleChunkODHFeatureVecLen , 1);
singleSegmentODHFeatureVec = zeros( singleSegmentODHFeatureVecLen , 1);

seq_numerical = zeros(1, thisChunkLen);
seq_numerical = zeros(1, thisSegmentLen);
for l = 1:alphabetLen
seq_numerical(lower(thisChunk)==lower(alphabet(l))) = l;
seq_numerical(lower(thisSegment)==lower(alphabet(l))) = l;
end

seq_numerical(seq_numerical==0) = -Inf;
seq_numerical = seq_numerical - 1;

for posInChunk = 1:thisChunkLen-oligoLen+1
ind_list(posInChunk) = sum(multiplier .* seq_numerical(posInChunk:posInChunk+oligoLen-1));
for posInSegment = 1:thisSegmentLen-oligoLen+1
ind_list(posInSegment) = sum(multiplier .* seq_numerical(posInSegment:posInSegment+oligoLen-1));
end
% ind_list stores the index of oligo-pairs
for first_pos = 1:thisChunkLen-oligoLen+1
for first_pos = 1:thisSegmentLen-oligoLen+1
ind_first_pos = (M*numDist) * ind_list(first_pos);
for second_pos = first_pos:min((first_pos+numDist-1), (thisChunkLen-oligoLen+1))
%for second_pos = first_pos:thisChunkLen-oligoLen+1
for second_pos = first_pos:min((first_pos+numDist-1), (thisSegmentLen-oligoLen+1))
%for second_pos = first_pos:thisSegmentLen-oligoLen+1
ind_dist = second_pos - first_pos + 1;
% don't exceed vector dimension!
%if ind_dist <= numDist
Expand All @@ -174,18 +174,18 @@
% non-alphabet bases are ignored
if (ind_first_pos < 0 | ind_second_pos < 0)
else
singleChunkODHFeatureVec(ind_first_pos + ind_second_pos + ind_dist) = ...
singleChunkODHFeatureVec(ind_first_pos + ind_second_pos + ind_dist) + 1;
singleSegmentODHFeatureVec(ind_first_pos + ind_second_pos + ind_dist) = ...
singleSegmentODHFeatureVec(ind_first_pos + ind_second_pos + ind_dist) + 1;
end
%else % nothing
%end
end
end
% singleChunkODHFeatureVec is ready. Normalize by its length and Collect as an instance.
singleChunkODHFeatureVec = singleChunkODHFeatureVec ./ norm(singleChunkODHFeatureVec, 2);
% singleSegmentODHFeatureVec is ready. Normalize it by its L2 norm and collect it as an instance.
singleSegmentODHFeatureVec = singleSegmentODHFeatureVec ./ norm(singleSegmentODHFeatureVec, 2);
% Individual instances are already stored as sparse vectors
ODHFeatureVecInstances(:, chunkIndex) = sparse(singleChunkODHFeatureVec);
end %chunk for loop ends
ODHFeatureVecInstances(:, segmentIndex) = sparse(singleSegmentODHFeatureVec);
end %segment for loop ends

% ensure that the final fv is of right dimensions
%if length(modifiedODHFeatureVec) ~= modifiedODHFeatureVecLen
Expand Down

0 comments on commit ecf9401

Please sign in to comment.