From 02bed785860ead4ce7350fe5fa03050d59f4b531 Mon Sep 17 00:00:00 2001 From: Sarvesh Prakash Nikumbh Date: Wed, 19 Jul 2017 17:55:23 +0200 Subject: [PATCH] Cleaning up compute ConformalMI kernel implementation. --- computeConformedMultiInstanceKernel.m | 108 ++++++++------------------ 1 file changed, 32 insertions(+), 76 deletions(-) diff --git a/computeConformedMultiInstanceKernel.m b/computeConformedMultiInstanceKernel.m index dd0aacf..2d4b6a2 100644 --- a/computeConformedMultiInstanceKernel.m +++ b/computeConformedMultiInstanceKernel.m @@ -1,14 +1,16 @@ function [conformedMultiInstanceKernel, rawConformedMultiInstanceKernel, instancesTransformationKernel, clusterCentres] = computeConformedMultiInstanceKernel(instanceStarts, instanceEnds, instanceWideKernel, trainIndices, allSeqsAsBags, nClusters, sig, Y, computationVersion, debugLevel, debugMsgLocation) % -% 1. Obtain expansion points +% Obtain expansion points % -% 1.1 Convert allSeqsAsBags in to a matrix X of dimensions nInstances x FVlength +% -- Convert allSeqsAsBags in to a matrix X of dimensions nInstances x FVlength % -% spread out the instances, get the nInstances -% trainIndices are trainingIndices +% -- spread out the instances, get the nInstances +% -- trainIndices are trainingIndices % Edit: We earlier expected trainIndices to be a row vector. Now it can be either % a row or a column vector. 
+ if nargin < 10 + % default computationVersion = 'Looping'; end nBags = length(allSeqsAsBags); @@ -20,10 +22,9 @@ instanceIDVector = zeros(1,instanceEnds(end)); end -%nInstancesPos = 0; -% maintain a count of the instances arising from training indices/bags -% this is used later to define the number of rows for XtoPassFrom +% Maintain a count of the instances arising from training indices/bags +% This is used later to define the number of rows for XtoPassFrom currentTotal = 0; for i=1:nBags % #instances at each index/bag @@ -31,13 +32,15 @@ if strcmp(computationVersion, 'AccumArray') instanceIDVector(instanceStarts(i):instanceEnds(i)) = repmat(i, [1 size(allSeqsAsBags{i}, 2)]); end - % useful to have stratified repressentation, we don't need this now. + %% useful to have stratified representation, we don't need this now. % if i <= posInd % nInstancesPos = nInstancesPos + size(allSeqsAsBags{i}, 2); - % end % useful to have stratified repressentation + % end + %% useful to have stratified representation if any(i==trainIndices) == 1 %check, if a trainingIndex, get instances currentTotal = currentTotal+idxInstances(i); - end % useful to count instances only in training bags + end + %% useful to count instances only in training bags end forIndicesInX = cumsum(idxInstances); nInstances = sum(idxInstances); @@ -49,7 +52,7 @@ %X = zeros( nInstances, size(allSeqsAsBags{1},1) ); logMessages(debugMsgLocation, sprintf('\n--- currentTotal for setting a zeros XtoPassFrom: %d\n', currentTotal), debugLevel); XtoPassFrom = zeros( currentTotal, size(allSeqsAsBags{1},1) ); -% make sure that only instances from train bags are taken +% Make sure that only instances from training bags are taken % trainIndices store the right set of indices as passed to it currentTotal = 0; for i=1:length(trainIndices)%indices of relevant bags/ training bags @@ -59,21 +62,22 @@ logMessages(debugMsgLocation, sprintf('--- allSeqsAsBags is now n x p matrix, ready for kmeans\n'), debugLevel); % % 1.2. 
Apply kmeans -% -- we are using Matlab's algorithm for now. +% -- we are using Matlab's algorithm for kmeans. % -- using the buckshot hueristic means: % - randomly sample sqrt(nClusters * nInstances) % data points from nInstances % - this will give nClusters -% sending complete X for k-means clustering, also perform replicates -% XtoPass = X; +% If sending complete X for k-means clustering set XtoPass = X; +% Replicates nofRep = 1; -% randomly sample without replacement +% Randomly sample without replacement % XtoPassFrom contains only the relevant instances, hence freely select any instances from it nSampled = round(sqrt(nClusters * currentTotal)); logMessages(debugMsgLocation, sprintf('--- for kmeans, Total: %d, nSampled: %d, nClusters: %d\n', currentTotal, nSampled, nClusters), debugLevel); XtoPass = XtoPassFrom( transpose(randsample( currentTotal, round(sqrt(nClusters * currentTotal)) )), :); -%XtoPass = XtoPassFrom; +%% Without the buckshot heuristic, use +% XtoPass = XtoPassFrom clear('XtoPassFrom'); [clusterCentres, matrixOfDistancesFromCentresForPassedVectors] = getExpansionPoints(XtoPass, nClusters, nofRep, debugLevel, debugMsgLocation); clear('XtoPass'); @@ -96,7 +100,7 @@ % instancesTransformationKernel is cell array for number of bags % instancesTransformationKernel{eachBag} is nInstanceInBag-by-nClusters -%initialize kernels +% Initialize kernels for i=1:nClusters conformedMultiInstanceKernel{i} = zeros(nBags); rawConformedMultiInstanceKernel{i} = zeros(nBags); @@ -110,13 +114,12 @@ %%instanceSubs are only the upper-triangular indices including the diagonal %clear('tempInstanceSubs'); instanceWideKernelRepeated = repmat(instanceWideKernel, 1, 1, nClusters); - %allInstancesTransformations = zeros(nInstances, nClusters); allInstancesTransformations = cat(1,instancesTransformationKernel{:}); for i=1:nClusters KernelAsVector = (allInstancesTransformations(:,i) * allInstancesTransformations(:,i)') .* instanceWideKernelRepeated(:,:,i); tempKernel = 
accumarray(instanceSubs, KernelAsVector(:)); assert(size(tempKernel,1) == size(zeros(nBags),1)); - %tempKernel = triu(tempKernel, 1) + tempKernel'; %baecause instanceSubs stored indices for only the upper-triangular part of the matrix + %tempKernel = triu(tempKernel, 1) + tempKernel'; %because instanceSubs stored indices for only the upper-triangular part of the matrix conformedMultiInstanceKernel{i} = normalizeKernel(tempKernel); end clear('instanceWideKernelRepeated'); @@ -126,54 +129,16 @@ logMessages(debugMsgLocation, sprintf('done in %.3f seconds', toc), debugLevel); end if strcmp(computationVersion, 'Looping') - % one can convert the transformationKernels into 3D matrices + %% one can convert the transformationKernels into 3D matrices logMessages(debugMsgLocation, sprintf('\n--- Looping-over-bags version...'), debugLevel); - do_reshaped = 1; - if issparse(allSeqsAsBags{1}) && do_reshaped == 0 - logMessages(debugMsgLocation, sprintf('Doing the reshaped version for sparse vectors..\n'), debugLevel); - % pre-allocate memory, doesn't help in speed - %for i=1:nClusters - % rawConformedMultiInstanceKernel{i} = zeros(nBags); - % conformedMultiInstanceKernel{i} = zeros(nBags); - %end - tic; - tempKernelCollection = zeros(nBags, nBags, nClusters); - for b1=1:nBags - if rem(b1, 100) == 0 - logMessages(debugMsgLocation, sprintf('Bag/Row %d--', b1), debugLevel); - end - b1r = reshape(instancesTransformationKernel{b1}, idxInstances(b1) , [], nClusters); - b1BagAsMat = full(allSeqsAsBags{b1}); - for b2=b1:nBags - b2r = reshape(instancesTransformationKernel{b2}, idxInstances(b2) , [], nClusters); - b2rt = permute(b2r,[2,1,3]); - b1b2_transformationProducts = bsxfun(@times, b1r , b2rt); - b2BagAsMat = full(allSeqsAsBags{b2}); - tempToBeSummed = bsxfun(@times, b1b2_transformationProducts, (b1BagAsMat' * b2BagAsMat)); - %tempToBeSummed = bsxfun(@times, b1b2_transformationProducts, full(allSeqsAsBags{b1}' * allSeqsAsBags{b2})); - %tempToBeSummed = bsxfun(@times, 
b1b2_transformationProducts, instanceWideKernel(instanceStarts(b1):instanceEnds(b1), instanceStarts(b2):instanceEnds(b2)) ); - tempKernelCollection(b1, b2, :) = 1/(idxInstances(b1) * idxInstances(b2)) * sum(sum(tempToBeSummed,1)); - end - end - clear b1BagAsMat; - clear b2BagAsMat; - logMessages(debugMsgLocation, sprintf('Bag-wise kernel done in %.3f seconds\n', toc), debugLevel); - tic;logMessages(debugMsgLocation, sprintf('Making from upper-triangular to full...'), debugLevel); - for i=1:nClusters - tempKernel = zeros(nBags); - tempKernel = triu(tempKernelCollection(:,:,i), 1) + tempKernelCollection(:,:,i)'; - rawConformedMultiInstanceKernel{i} = tempKernel; - %fprintf('***Max.: %.4f --- Min.:%.4f***\n', max(max(tempKernel)), min(min(tempKernel))); - conformedMultiInstanceKernel{i} = normalizeKernel(tempKernel); - end - logMessages(debugMsgLocation, sprintf('done in %.3f seconds\n', toc), debugLevel); - elseif issparse(allSeqsAsBags{1}) && do_reshaped == 1 + %% Performing the reshaped version + if issparse(allSeqsAsBags{1}) logMessages(debugMsgLocation, sprintf('Doing the reshaped version..\n'), debugLevel); - % pre-allocate memory, doesn't help in speed - %for i=1:nClusters - % rawConformedMultiInstanceKernel{i} = zeros(nBags); - % conformedMultiInstanceKernel{i} = zeros(nBags); - %end + %% Pre-allocate memory, doesn't help in speed + % for i=1:nClusters + % rawConformedMultiInstanceKernel{i} = zeros(nBags); + % conformedMultiInstanceKernel{i} = zeros(nBags); + % end tic; tempKernelCollection = zeros(nBags, nBags, nClusters); for b1=1:nBags @@ -182,7 +147,6 @@ b2r = reshape(instancesTransformationKernel{b2}, idxInstances(b2) , [], nClusters); b2rt = permute(b2r,[2,1,3]); b1b2_transformationProducts = bsxfun(@times, b1r , b2rt); - %tempToBeSummed = bsxfun(@times, b1b2_transformationProducts, full(allSeqsAsBags{b1}' * allSeqsAsBags{b2})); tempToBeSummed = bsxfun(@times, b1b2_transformationProducts, instanceWideKernel(instanceStarts(b1):instanceEnds(b1), 
instanceStarts(b2):instanceEnds(b2)) ); term = 1/(idxInstances(b1) * idxInstances(b2)); tempKernelCollection(b1, b2, :) = sum(sum(tempToBeSummed,1)); @@ -192,21 +156,13 @@ tic; for i=1:nClusters tempKernel = zeros(nBags); - %tempKernel = tempKernelCollection(:,:,i)'; tempKernel = triu(tempKernelCollection(:,:,i), 1) + tempKernelCollection(:,:,i)'; - %if issymmetric(tempKernel) - % fprintf('Samarth symmetric\n'); - %else - % fprintf('Samarth.. not sysmmetric.. triu operation is needed\n'); - %end rawConformedMultiInstanceKernel{i} = tempKernel; - %fprintf('***Max.: %.4f --- Min.:%.4f***\n', max(max(tempKernel)), min(min(tempKernel))); conformedMultiInstanceKernel{i} = normalizeKernel(tempKernel); - %fprintf('***Max.: %.4f --- Min.:%.4f***\n', max(max(conformedMultiInstanceKernel{i})), min(min(conformedMultiInstanceKernel{i}))); end logMessages(debugMsgLocation, sprintf('done in %.3f seconds\n', toc), debugLevel); - else % if do-reshaped ends - % % % % % + else % when not sparse + % not tested either for correctness or efficiency. We recommend only using sparse vectors, thus the approach above. tic; for i=1:size(clusterCentres,1) tempKernel = zeros(nBags);