Help for functions added for many smaller functions

snikumbh · Jul 28, 2017 · 4d48120 · 4d48120
1 parent 1ae57b9
commit 4d48120
Showing 14 changed files with 342 additions and 102 deletions.
diff --git a/analyze_wvector.m b/analyze_wvector.m
@@ -1,5 +1,54 @@
 function [run_this_cmd, pdfcrop_lines] = analyse_wvector(folderName, rankId, wvector, oligoLen, maxDist, topN, verbose, max_or_topN, debugLevel, debugMsgLocation)
 
+% ANALYSE_WVECTOR
+% Analyses the weight vector and plots distance-centric k-mer visualizations
+%
+% INPUT PARAMS
+% Param 'folderName'
+% Location/Path on disk of the output folder
+%
+% Param 'rankId'
+% Rank of the weight vector in CoMIK
+%
+% Param 'wvector'
+% The weight vector itself which is to be used here
+%
+% Param 'oligoLen'
+% Oligomer length for the ODH representation
+%
+% Param 'maxDist'
+% Maximum distance value for the ODH representation
+%
+% Param 'topN'
+% Specify a value for topN 
+%
+% Param 'verbose'
+% Set to 1 for verbose output
+%
+% Param 'max_or_topN'
+% Flag to specify if topN visualization or max (Absolute Max Per Distance) visualization is to be performed
+% Refer to Nikumbh and Pfeifer, BMC Bioinformatics, 2017.
+%
+% Param 'debugLevel'
+%
+% Param 'debugMsgLocation'
+%
+% OUTPUT PARAMS
+% Param 'run_this_cmd'
+% This could be used to create a cmd that is to be run after the program ends to 
+% stitch together pages from the separate PDF files
+%
+% Param 'pdfcrop_lines'
+% Cropping the PDF pages/files to cut out the unused area
+%
+%
+% ADDITIONAL NOTES
+%
+%
+% Author: snikumbh@mpi-inf.mpg.de
+%
+
+
 totalArguments = 10;
 if nargin < totalArguments
     debugMsgLocation = 1;

diff --git a/applyTransformation.m b/applyTransformation.m
@@ -1,14 +1,39 @@
-function [transformationKernel, instancesTransformationKernel] = applyTransformation(allSeqsAsBags, clusterCentres, sig, debugLevel, debugMsgLocation)
+function [instancesTransformationKernel] = applyTransformation(allSeqsAsBags, clusterCentres, sig, debugLevel, debugMsgLocation)
 
+% APPLYTRANSFORMATION
+% Transforms the individual segments w.r.t. the cluster centres obtained by k-means
 %
-% we apply Gaussian tranformation
+% INPUT PARAMS
+% Param 'allSeqsAsBags' (Cell array)
+% Collection of all segments per sequence
+% 
+% Param 'clusterCentres' (matrix)
+% A matrix where each cluster centre is represented row-wise; dimensions: nClusters x FVlength
+% 
+% Param 'sig'
+% Gaussian bandwidth value
+%
+% Param 'debugLevel'
+%
+% Param 'debugMsgLocation'
+%
+% OUTPUT PARAMS 
+% Param 'instancesTransformationKernel'
+% Transformation of the instances w.r.t. the cluster centres
+% 
+% ADDITIONAL NOTES
+%
+% Author: snikumbh@mpi-inf.mpg.de
+
+
+% We apply Gaussian transformation
 % 
 
 nBags = size(allSeqsAsBags, 2);
-transformationKernel = zeros(nBags, size(clusterCentres, 1)); % bag level information
+% transformationKernel = zeros(nBags, size(clusterCentres, 1)); % bag level information
 nClusters = size(clusterCentres,1);
 FVDim1 = size(clusterCentres, 2);
-do_reshaped = 1;
+do_reshaped = 1; % by default, we use the reshaped version. This is tested to be the fastest in most experiments.
 if ~issparse(allSeqsAsBags{1}) && do_reshaped == 1
     logMessages(debugMsgLocation, sprintf('Doing reshaped version..\n'), debugLevel);
     for i=1:nBags
@@ -24,15 +49,15 @@
     end
 elseif issparse(allSeqsAsBags{1}) && do_reshaped == 1
     logMessages(debugMsgLocation, sprintf('Doing reshaped version for sparse vectors..\n'), debugLevel);
-    % Because N-D sparse arrays are not possible
-    % Hence, we opt to reshape/repmat the clusterCentres instead of the sparse FVs
+    % Because N-D sparse arrays are not possible, hence, we opt to reshape/repmat 
+    % the clusterCentres instead of the sparse FVs
     for i=1:nBags
         % length of instancesTransformationKernel is nBags
         % dimensions of each instancesTransformationKernel nInstances-by-nClusters
 	numOfInstances = size(allSeqsAsBags{i},2);
         instancesTransformationKernel{i} = zeros(numOfInstances, nClusters);
 	reshapedClusterCentres = reshape(repmat(clusterCentres, numOfInstances,1), FVDim1, numOfInstances, nClusters);
-        %dim are now nClusters-by-weightVectorDim-by-nClusters
+        % dims are now FVDim1-by-numOfInstances-by-nClusters
         temp = bsxfun(@minus, allSeqsAsBags{i}, reshapedClusterCentres);
         temp = sum(temp.^2,1);
         temp = exp(-0.5*temp./sig^2);
@@ -46,7 +71,7 @@
 	    temp = sum(temp'.^2,1);
 	    temp = exp(-0.5*temp./sig^2);
 	    instancesTransformationKernelA{i}(:,j) = temp';
-	    transformationKernel(i,j) = sum(temp(:));
+	    % transformationKernel(i,j) = sum(temp(:));
         end
     end
 end

diff --git a/comik_main_with_weight_vector.m b/comik_main_with_weight_vector.m
@@ -420,7 +420,7 @@
     %
     tic;
     logMessages(debugMsgLocation, sprintf('Applying transformation to the test samples, this might take some time...\n'), debugLevel);
-    [~, allSeqsTransformationKernel_Test] = applyTransformation(allSeqsAsBags_Test, clusterCentres, conformalXformationParam, debugLevel, debugMsgLocation);
+    [allSeqsTransformationKernel_Test] = applyTransformation(allSeqsAsBags_Test, clusterCentres, conformalXformationParam, debugLevel, debugMsgLocation);
     logMessages(debugMsgLocation, sprintf('done in %.3f seconds\n', toc), debugLevel);
     %
     % Getting the instance weightings for the test examples

diff --git a/computeConformedMultiInstanceKernel.m b/computeConformedMultiInstanceKernel.m
@@ -159,9 +159,8 @@
 % transformationKernel is Gaussian RBF
 tic;
 logMessages(debugMsgLocation, sprintf('--- applying transformation...'), debugLevel);
-[transformationKernel, instancesTransformationKernel] = applyTransformation(allSeqsAsBags, clusterCentres, sig, debugLevel, debugMsgLocation);
+[instancesTransformationKernel] = applyTransformation(allSeqsAsBags, clusterCentres, sig, debugLevel, debugMsgLocation);
 logMessages(debugMsgLocation, sprintf('done in %.3f  seconds\n', toc), debugLevel);
-% transformationKernel is nBags-by-nClusters
 % instancesTransformationKernel is cell array for number of bags
 % instancesTransformationKernel{eachBag} is nInstanceInBag-by-nClusters
 

diff --git a/getCompleteSequenceODHFeatureVec.m b/getCompleteSequenceODHFeatureVec.m
diff --git a/getExpansionPoints.m b/getExpansionPoints.m
@@ -1,24 +1,41 @@
 function [expansionPoints, matrixOfDistancesFromCentres] = getExpansionPoints(X, nClusters, rep, debugLevel, debugMsgLocation)
 
-% Params
-% X : All bags opened up into instances
-% nClusters : #clusters
+% GETEXPANSIONPOINTS
+% Performs k-means and returns the clusterCentres as the expansion points
+%
+% INPUT PARAMS
+% Param X (matrix)
+% All bags opened up into instances
+%
+% Param nClusters (scalar)
+% Number of clusters for k-means
 % 
-% Returns
-% a set of expansionPoints
+% OUTPUT PARAMS
+% Param 'expansionPoints' (matrix)
+% a set of expansionPoints, in other words cluster centres
+%
+% Param 'matrixOfDistancesFromCentres' (matrix)
+% As the name suggests, an n-by-k matrix of distances from each point to every cluster centre
 %
+% ADDITIONAL NOTES
+%
+% Author: snikumbh@mpi-inf.mpg.de
+
 logMessages(debugMsgLocation, sprintf('--- Obtaining a set of expansion points '), debugLevel);
-% obtained X is an n x p matrix, ready for kmeans
-% expansionPoints is a k-by-p matrix
-% D is a n-by-k matrix giving distances from each 
-%   point to every centroid. This will be useful
-%   if we use Gaussian form for the transformation 
-%   function
+% Obtained X in the input arguments is an n x p matrix, ready for kmeans
+% expansionPoints, to be returned, is a k-by-p matrix
+
+% Below, matrixOfDistancesFromCentres is a n-by-k matrix giving distances from each point to every 
+% cluster centre.
+%
+% sumD, from the Matlab documentation, is the within-cluster sums of point-to-centroid distances in a k-by-1 vector
+%
+
 tic;
-rng('default'); % or can be set to 'default'
+rng('default');
 logMessages(debugMsgLocation, sprintf('with K-means: \n'), debugLevel);
 if debugLevel == 2
-    [idx, expansionPoints, sumD, matrixOfDistancesFromCentres] = kmeans(X, nClusters, 'Replicates', rep, 'Display','off');%, 'MaxIter',1000);%Display: final
+    [idx, expansionPoints, sumD, matrixOfDistancesFromCentres] = kmeans(X, nClusters, 'Replicates', rep, 'Display','on');%, 'MaxIter',1000);%Display: final
 elseif debugLevel == 0
     [idx, expansionPoints, sumD, matrixOfDistancesFromCentres] = kmeans(X, nClusters, 'Replicates', rep, 'Display','off');%, 'MaxIter',1000);
 end

diff --git a/getInstanceWeights.m b/getInstanceWeights.m
@@ -1,12 +1,33 @@
 function [thetaVals, instanceWeightsInEachBag] = getInstanceWeights(kernelWeights, allSeqsTransformationKernel)
 
+% GETINSTANCEWEIGHTS
+% Computes the theta values and instance weights from the sub-kernel weights and segment 
+% transformations
+% 
+% INPUT PARAMS
+% Param 'kernelWeights'
+% Weights assigned by MKL to each of the sub-kernels in the collection
+%
+% Param 'allSeqsTransformationKernel'
+% Per sequence transformation of segments used to compute the instance weights
+%
+% OUTPUT PARAMS
+% Param 'thetaVals'
+% The theta values corresponding to each cluster centre 
+% 
+% Param 'instanceWeightsInEachBag'
+% Weight for each segment per sequence
+%
+% ADDITIONAL NOTES
+% -- Theta vals and sub-kernel weights follow a squared relation (Refer paper)
+% 
+% -- Order followed is: 
+%    . non-shifted kernels (1 to N) first, then shifted kernels (N+1 to 2N) in total 2N kernels
+%
+% -- Number of kernels is same as number of clusters
 %
-% Return the theta values and instance weights 
-% for all bags
 %
-% following a squared relation
-% order followed is: non-shifted kernels (1 to N) first, then shifted kernels (N+1 to 2N) in total 2N kernels
-% Edit: now number of kernels is same as number of clusters
+% Author: snikumbh@mpi-inf.mpg.de
 
 thetaVals = sqrt(kernelWeights);
 thetaVals = thetaVals';
@@ -23,13 +44,13 @@
 		size(allSeqsTransformationKernel{j},1), 1) ...
                 .* allSeqsTransformationKernel{j}, 2);
     instanceWeightsInEachBag{j} = temp;
-    %% if to be written to file, pass fname to the function
-    %stdTemp = standardizeMatrix(temp');
-    %finalTemp = zeros(size(stdTemp));
-    %% Using rankings
-    %[~, ranking] = sort(stdTemp(1:size(stdTemp,1),1:size(stdTemp,2)), 2, 'descend');
-    %finalTemp(1,ranking) = 1:size(stdTemp,2);
-    %dlmwrite(fname, finalTemp, '-append');
+    % if to be written to file, pass fname to the function
+    % stdTemp = standardizeMatrix(temp');
+    % finalTemp = zeros(size(stdTemp));
+    % Using rankings
+    % [~, ranking] = sort(stdTemp(1:size(stdTemp,1),1:size(stdTemp,2)), 2, 'descend');
+    % finalTemp(1,ranking) = 1:size(stdTemp,2);
+    % dlmwrite(fname, finalTemp, '-append');
 end
 
 

diff --git a/getODHFeatureVecInstances.m b/getODHFeatureVecInstances.m
@@ -1,36 +1,58 @@
 function [ODHFeatureVecInstances, numOfSegments] = getODHFeatureVecInstances(seq, oligoLen, maxDist, segmentSizeInPercentage, segmentSizeInBps, segmentingStart, alphabet, debug)
 
-% Author: snikumbh
+% GETODHFEATUREVECINSTANCES
 %
-% Params
-% 1. seq: candidate sequence (length L given in #base pairs)
-% 2. oligoLen: oligomer length for ODH (K, default: 2)
-% 3. maxDist: maximum distance value to consider for ODH;
+% INPUT PARAMS
+% Param 'seq' 
+%  candidate sequence (length L given in #base pairs)
+%
+% Param 'oligoLen'
+% Oligomer length for ODH (K, default: 2)
+%
+% Param 'maxDist'
+% Maximum distance value to consider for ODH;
 %		(default: 50)
-% 4. segmentSizeInPercentage: Percentage value used to segment 
-%                       the complete sequence; set this  
-%			to 0 if percentage-based segmenting
-%			is not to be used (default: 0)
-% 5. segmentSizeInBps: If percentage value for segmenting is 0
-%			this has to be non-zero and less than
-%			complete sequence length. It is given
-%			in number of basepairs. (default: 
-%			100 bps)
-% 6. segmentingStart: 'shift' specifies shifting the segmentStart
-%			to (round(segmentSizeInBps/2) + 1); 
-%			otherwise 'no-shift' (default)
-% 7. alphabet: DNA/Protein sequence alphabet (default: DNA)
-%
-%
-% Returns 
-% the ODH FeatureVec representation of (ordered) instances
-% (segments) of the sequence supplied; each in sparse format.
-% Sparsify the end result again? May be not needed.
-% [These can be concatenated to obtain the one final modifiedODHFeatureVec]
+%
+% Param 'segmentSizeInPercentage'
+% Percentage value used to segment the complete sequence; set this to 0 if 
+% percentage-based segmenting is not to be used (default: 0)
+%
+% Param 'segmentSizeInBps'
+% If percentage value for segmenting is 0 this has to be non-zero and less 
+% than complete sequence length. It is given in number of basepairs. 
+% (default: 100 bps)
+%
+% Param 'segmentingStart' 
+% 'shift' specifies shifting the segmentStart to 
+%     (round(segmentSizeInBps/2) + 1); 
+% otherwise 'no-shift' (default)
+%
+% Param 'alphabet'
+% DNA/Protein sequence alphabet (default: DNA)
+%
+%
+% OUTPUT PARAMS 
+% Param 'ODHFeatureVecInstances'
+% Representation of (ordered) instances (or segments) of the sequence supplied; 
+% each in sparse format.
+%
+% Param numOfSegments
 % 
-% Edits: 06, February, 2017, By: snikumbh
-% maxDist is now as per the original ODH paper, Lmax - K.
+% ADDITIONAL NOTES
+% -- Order of segments
+%    . This is only to help the user in determining where the non-shifted 
+%      segments end in the collection, and where the shifted segments start
+%    . The subsequent (set) kernel computation is agnostic to this arrangement
+%
+% -- maxDist is now as per the original ODH paper, Lmax - K
+%    . Edits: 06, February, 2017, By: snikumbh 
+%
+% -- Sparsify the end result again? Maybe not needed.
+%    . [These can be concatenated to obtain the one final modifiedODHFeatureVec]
+%
+%
 %
+% Author: snikumbh@mpi-inf.mpg.de
 
 % Set default values when not supplied
 totalArguments = 8;
@@ -103,7 +125,7 @@
     numOfSegments = length(segmentBoundaries);
 end
 % Assert same number of segmentStarts and Boundaries
-%assert(size(segmentStarts,2) == size(segmentBoundaries,2), 'Check segments!');
+% assert(size(segmentStarts,2) == size(segmentBoundaries,2), 'Check segments!');
 if debug
     fprintf('Segments with %s\n', segmentingStart);
     fprintf('#Segments = %d\n', numOfSegments);
@@ -135,7 +157,7 @@
 %
 %columns correspond to different instances; rows to FVs
 ODHFeatureVecInstances = zeros((M^2) * numDist, numOfSegments);
-%modifiedODHFeatureVec = [];%zeros( modifiedODHFeatureVecLen , 1);
+% modifiedODHFeatureVec = [];%zeros( modifiedODHFeatureVecLen , 1);
 % Refer paper. 
 
 % Segmenting already done.