From ecf9401985041294256c532a5ad835aebcd19d6a Mon Sep 17 00:00:00 2001 From: Sarvesh Prakash Nikumbh Date: Wed, 19 Jul 2017 16:02:18 +0200 Subject: [PATCH] chunks to segments -- keyword usage changed --- getODHFeatureVecInstances.m | 134 ++++++++++++++++++------------------ 1 file changed, 67 insertions(+), 67 deletions(-) diff --git a/getODHFeatureVecInstances.m b/getODHFeatureVecInstances.m index f8111f8..70744b1 100644 --- a/getODHFeatureVecInstances.m +++ b/getODHFeatureVecInstances.m @@ -1,4 +1,4 @@ -function [ODHFeatureVecInstances, numOfChunks] = getODHFeatureVecInstances(seq, oligoLen, maxDist, chunkSizeInPercentage, chunkSizeInBps, chunkingStart, alphabet, debug) +function [ODHFeatureVecInstances, numOfSegments] = getODHFeatureVecInstances(seq, oligoLen, maxDist, segmentSizeInPercentage, segmentSizeInBps, segmentingStart, alphabet, debug) % Author: snikumbh % @@ -7,24 +7,24 @@ % 2. oligoLen: oligomer length for ODH (K, default: 2) % 3. maxDist: maximum distance value to consider for ODH; % (default: 50) -% 4. chunkSizeInPercentage: Percentage value used to chunk +% 4. segmentSizeInPercentage: Percentage value used to segment % the complete sequence; set this -% to 0 if percentage-based chunking +% to 0 if percentage-based segmenting % is not to be used (default: 0) -% 5. chunkSizeInBps: If percentage value for chunking is 0 +% 5. segmentSizeInBps: If percentage value for segmenting is 0 % this has to be non-zero and less than % complete sequence length. It is given % in number of basepairs. (default: % 100 bps) -% 6. chunkingStart: 'shift' specifies shifting the chunkStart -% to (round(chunkSizeInBps/2) + 1); +% 6. segmentingStart: 'shift' specifies shifting the segmentStart +% to (round(segmentSizeInBps/2) + 1); % otherwise 'no-shift' (default) % 7. alphabet: DNA/Protein sequence alphabet (default: DNA) % % % Returns % the ODH FeatureVec representation of (ordered) instances -% (chunks) of the sequence supplied; each in sparse format. +% (segments) of the sequence supplied; each in sparse format. % Sparsify the end result again? May be not needed. % [These can be concatenated to obtain the one final modifiedODHFeatureVec] % @@ -41,35 +41,35 @@ if nargin < totalArguments - 1 debug = 0; alphabet = 'acgt'; - chunkingStart = 'no-shift'; + segmentingStart = 'no-shift'; end if nargin < totalArguments - 2 debug = 0; alphabet = 'acgt'; - chunkingStart = 'no-shift'; - chunkSizeInBps = 100; + segmentingStart = 'no-shift'; + segmentSizeInBps = 100; end if nargin < totalArguments - 3 debug = 0; alphabet = 'acgt'; - chunkingStart = 'no-shift'; - chunkSizeInBps = 100; - chunkSizeInPercentage = 0; + segmentingStart = 'no-shift'; + segmentSizeInBps = 100; + segmentSizeInPercentage = 0; end if nargin < totalArguments - 4 debug = 0; alphabet = 'acgt'; - chunkingStart = 'no-shift'; - chunkSizeInBps = 100; - chunkSizeInPercentage = 0; + segmentingStart = 'no-shift'; + segmentSizeInBps = 100; + segmentSizeInPercentage = 0; maxDist = 50; end if nargin < totalArguments - 5 debug = 0; alphabet = 'acgt'; - chunkingStart = 'no-shift'; - chunkSizeInBps = 100; - chunkSizeInPercentage = 0; + segmentingStart = 'no-shift'; + segmentSizeInBps = 100; + segmentSizeInPercentage = 0; maxDist = 50; oligoLen = 2; end @@ -84,49 +84,49 @@ fprintf('SequenceLength = %d\n', sequenceLen); end % allocate memory for (large) non-sparse feature vector -if strcmp(chunkingStart, 'no-shift') - chunkLength = chunkSizeInBps; - % Chunking - chunkStarts = [1: chunkLength: sequenceLen-(chunkLength/2)]; - chunkBoundaries = [chunkLength: chunkLength: sequenceLen]; - if mod(sequenceLen, chunkLength) <= (chunkLength/2) - chunkBoundaries(length(chunkBoundaries)) = sequenceLen; +if strcmp(segmentingStart, 'no-shift') + segmentLength = segmentSizeInBps; + % Segmenting + segmentStarts = [1: segmentLength: sequenceLen-(segmentLength/2)]; + segmentBoundaries = [segmentLength: segmentLength: sequenceLen]; + if mod(sequenceLen, segmentLength) <= (segmentLength/2) + segmentBoundaries(length(segmentBoundaries)) = sequenceLen; else - chunkBoundaries = [chunkBoundaries sequenceLen]; + segmentBoundaries = [segmentBoundaries sequenceLen]; end - numOfChunks = length(chunkBoundaries); -elseif strcmp(chunkingStart, 'shift') - chunkLength = chunkSizeInBps; - % Chunking - chunkStarts = [1+(chunkLength/2): chunkLength: (sequenceLen-(chunkLength/2))]; - chunkBoundaries = [chunkLength+(chunkLength/2): chunkLength: sequenceLen]; - numOfChunks = length(chunkBoundaries); + numOfSegments = length(segmentBoundaries); +elseif strcmp(segmentingStart, 'shift') + segmentLength = segmentSizeInBps; + % Segmenting + segmentStarts = [1+(segmentLength/2): segmentLength: (sequenceLen-(segmentLength/2))]; + segmentBoundaries = [segmentLength+(segmentLength/2): segmentLength: sequenceLen]; + numOfSegments = length(segmentBoundaries); end -% Asert same number of chunkStarts and Boundaries -%assert(size(chunkStarts,2) == size(chunkBoundaries,2), 'Check chunks!'); +% Asert same number of segmentStarts and Boundaries +%assert(size(segmentStarts,2) == size(segmentBoundaries,2), 'Check segments!'); if debug - fprintf('Chunks with %s\n', chunkingStart); - fprintf('#Chunks = %d\n', numOfChunks); - fprintf('chunkStarts = \n'); - disp(chunkStarts); - fprintf('chunkBoundaries = \n'); - disp(chunkBoundaries); + fprintf('Segments with %s\n', segmentingStart); + fprintf('#Segments = %d\n', numOfSegments); + fprintf('segmentStarts = \n'); + disp(segmentStarts); + fprintf('segmentBoundaries = \n'); + disp(segmentBoundaries); end -if maxDist == chunkSizeInBps - maxDist = chunkSizeInBps-oligoLen; +if maxDist == segmentSizeInBps + maxDist = segmentSizeInBps-oligoLen; end numDist = maxDist + 1; % 1. correct for zero modifiedODHFeatureVec % 2. also check that max_dist is not greater -% than length of the chunks. The last chunk +% than length of the segments. The last segment % will always be the equal-sized or smallest, % hence checking against it only. -%if max_dist > (sequenceLen-chunkBoundaries)-oligoLen+1 -% max_dist = (sequenceLen-chunkBoundaries)-oligoLen+1; %maximum possible +%if max_dist > (sequenceLen-segmentBoundaries)-oligoLen+1 +% max_dist = (sequenceLen-segmentBoundaries)-oligoLen+1; %maximum possible % fprintf('IMPORTANT: Max. distance has been set to %d\n', max_dist); % num_dist = max_dist + 1; %else @@ -134,37 +134,37 @@ %end % %columns correspond to different instances; rows to FVs -ODHFeatureVecInstances = zeros((M^2) * numDist, numOfChunks); +ODHFeatureVecInstances = zeros((M^2) * numDist, numOfSegments); %modifiedODHFeatureVec = [];%zeros( modifiedODHFeatureVecLen , 1); % Refer paper. -% Chunking already done. +% Segmenting already done. % Set variables before entering loop -singleChunkODHFeatureVecLen = (M^2) * numDist; +singleSegmentODHFeatureVecLen = (M^2) * numDist; multiplier = alphabetLen.^[oligoLen-1:-1:0]; -for chunkIndex = 1:numOfChunks - thisChunk = seq(chunkStarts(chunkIndex) : chunkBoundaries(chunkIndex)); - thisChunkLen = length(thisChunk); +for segmentIndex = 1:numOfSegments + thisSegment = seq(segmentStarts(segmentIndex) : segmentBoundaries(segmentIndex)); + thisSegmentLen = length(thisSegment); - singleChunkODHFeatureVec = zeros( singleChunkODHFeatureVecLen , 1); + singleSegmentODHFeatureVec = zeros( singleSegmentODHFeatureVecLen , 1); - seq_numerical = zeros(1, thisChunkLen); + seq_numerical = zeros(1, thisSegmentLen); for l = 1:alphabetLen - seq_numerical(lower(thisChunk)==lower(alphabet(l))) = l; + seq_numerical(lower(thisSegment)==lower(alphabet(l))) = l; end seq_numerical(seq_numerical==0) = -Inf; seq_numerical = seq_numerical - 1; - for posInChunk = 1:thisChunkLen-oligoLen+1 - ind_list(posInChunk) = sum(multiplier .* seq_numerical(posInChunk:posInChunk+oligoLen-1)); + for posInSegment = 1:thisSegmentLen-oligoLen+1 + ind_list(posInSegment) = sum(multiplier .* seq_numerical(posInSegment:posInSegment+oligoLen-1)); end % ind_list stores the index of oligo-pairs - for first_pos = 1:thisChunkLen-oligoLen+1 + for first_pos = 1:thisSegmentLen-oligoLen+1 ind_first_pos = (M*numDist) * ind_list(first_pos); - for second_pos = first_pos:min((first_pos+numDist-1), (thisChunkLen-oligoLen+1)) - %for second_pos = first_pos:thisChunkLen-oligoLen+1 + for second_pos = first_pos:min((first_pos+numDist-1), (thisSegmentLen-oligoLen+1)) + %for second_pos = first_pos:thisSegmentLen-oligoLen+1 ind_dist = second_pos - first_pos + 1; % don't exceed vector dimension! %if ind_dist <= numDist @@ -174,18 +174,18 @@ % non-alphabet bases are ignored if (ind_first_pos < 0 | ind_second_pos < 0) else - singleChunkODHFeatureVec(ind_first_pos + ind_second_pos + ind_dist) = ... - singleChunkODHFeatureVec(ind_first_pos + ind_second_pos + ind_dist) + 1; + singleSegmentODHFeatureVec(ind_first_pos + ind_second_pos + ind_dist) = ... + singleSegmentODHFeatureVec(ind_first_pos + ind_second_pos + ind_dist) + 1; end %else % nothing %end end end - % singleChunkODHFeatureVec is ready. Normalize by its length and Collect as an instance. - singleChunkODHFeatureVec = singleChunkODHFeatureVec ./ norm(singleChunkODHFeatureVec, 2); + % singleSegmentODHFeatureVec is ready. Normalize by its length and Collect as an instance. + singleSegmentODHFeatureVec = singleSegmentODHFeatureVec ./ norm(singleSegmentODHFeatureVec, 2); % Individual instances are already stored as space vectors - ODHFeatureVecInstances(:, chunkIndex) = sparse(singleChunkODHFeatureVec); -end %chunk for loop ends + ODHFeatureVecInstances(:, segmentIndex) = sparse(singleSegmentODHFeatureVec); +end %segment for loop ends % ensure that the final fv is of right dimensions %if length(modifiedODHFeatureVec) ~= modifiedODHFeatureVecLen