From 4d48120142947f24a744868be8029c40b74196e0 Mon Sep 17 00:00:00 2001
From: Sarvesh Prakash Nikumbh <snikumbh@mpi-inf.mpg.de>
Date: Fri, 28 Jul 2017 17:04:49 +0200
Subject: [PATCH] Help for functions added for many smaller functions

---
 analyze_wvector.m                     | 49 ++++++++++++++++
 applyTransformation.m                 | 41 +++++++++++---
 comik_main_with_weight_vector.m       |  2 +-
 computeConformedMultiInstanceKernel.m |  3 +-
 getCompleteSequenceODHFeatureVec.m    |  5 --
 getExpansionPoints.m                  | 43 +++++++++-----
 getInstanceWeights.m                  | 45 +++++++++++----
 getODHFeatureVecInstances.m           | 80 +++++++++++++++++----------
 get_colored.m                         | 18 ++++++
 mkl.py                                |  1 -
 normalizeKernel.m                     | 16 ++++++
 obtainWeightVector.m                  | 78 +++++++++++++++++---------
 performMKLWithShogunPython.m          | 48 ++++++++++++++++
 readFastaSequences.m                  | 15 +++--
 14 files changed, 342 insertions(+), 102 deletions(-)
 delete mode 100644 getCompleteSequenceODHFeatureVec.m

diff --git a/analyze_wvector.m b/analyze_wvector.m
index 86237cb..35f0d07 100644
--- a/analyze_wvector.m
+++ b/analyze_wvector.m
@@ -1,5 +1,54 @@
 function [run_this_cmd, pdfcrop_lines] = analyse_wvector(folderName, rankId, wvector, oligoLen, maxDist, topN, verbose, max_or_topN, debugLevel, debugMsgLocation)
 
+% ANALYSE_WVECTOR
+% Analyses the weight vector and plot distance-centric k-mer visualizations
+%
+% INPUT PARAMS
+% Param 'folderName'
+% Location/Path on disk of the output folder
+%
+% Param 'rankId'
+% Rank of the weight vector in CoMIK
+%
+% Param 'wvector'
+% The weight vector itself which is to be used here
+%
+% Param 'oligoLen'
+% Oligomer length for the ODH representation
+%
+% Param 'maxDist'
+% Maximum distance value for the ODH representation
+%
+% Param 'topN'
+% Specify a value for topN 
+%
+% Param 'verbose'
+% Set to 1 for verbose output
+%
+% Param 'max_or_topN'
+% Flag to specify if topN vizualization or max (Absolute Max Per Distance) visualization is to be performed
+% Refer to Nikumbh and Pfeifer, BMC Bioinformatics, 2017.
+%
+% Param 'debugLevel'
+%
+% Param 'debugMsgLocation'
+%
+% OUTPUT PARAMS
+% Param 'run_this_cmd'
+% This could be used create a cmd that is to be run after the program ends to 
+% stitch together pages from the separate PDF files
+%
+% Param 'pdfcrop_lines'
+% Cropping the PDF pages/files to cut out the unused area
+%
+%
+% ADDITIONAL NOTES
+%
+%
+% Author: snikumbh@mpi-inf.mpg.de
+%
+
+
 totalArguments = 10;
 if nargin < totalArguments
     debugMsgLocation = 1;
diff --git a/applyTransformation.m b/applyTransformation.m
index 1cb6ab9..9b68482 100644
--- a/applyTransformation.m
+++ b/applyTransformation.m
@@ -1,14 +1,39 @@
-function [transformationKernel, instancesTransformationKernel] = applyTransformation(allSeqsAsBags, clusterCentres, sig, debugLevel, debugMsgLocation)
+function [instancesTransformationKernel] = applyTransformation(allSeqsAsBags, clusterCentres, sig, debugLevel, debugMsgLocation)
 
+% APPLYTRANSFORMATION
+% Transforms the individual segments w.r.t. the cluster centres obtained by k-means
 %
-% we apply Gaussian tranformation
+% INPUT PARAMS
+% Param 'allSeqsAsBags' (Cell array)
+% Collection of all segments per sequence
+% 
+% Param 'clusterCentres' (matrix)
+% A matrix where each cluster centre is represented coulnwise; dimensions: nClusters x FVlength
+% 
+% Param 'sig'
+% Gaussian bandwidth value
+%
+% Param 'debugLevel'
+%
+% Param 'debugMsgLocation'
+%
+% OUTPUT PARAMS 
+% Param 'instancesTransformationKernel'
+% Transformation of the instances w.r.t. the cluster centres
+% 
+% ADDITIONAL NOTES
+%
+% Author: snikumbh@mpi-inif.mpg.de
+
+
+% We apply Gaussian tranformation
 % 
 
 nBags = size(allSeqsAsBags, 2);
-transformationKernel = zeros(nBags, size(clusterCentres, 1)); % bag level information
+% transformationKernel = zeros(nBags, size(clusterCentres, 1)); % bag level information
 nClusters = size(clusterCentres,1);
 FVDim1 = size(clusterCentres, 2);
-do_reshaped = 1;
+do_reshaped = 1; % by default, we use th reshaped version. This is tested to be the fastest is most experiments.
 if ~issparse(allSeqsAsBags{1}) && do_reshaped == 1
     logMessages(debugMsgLocation, sprintf('Doing reshaped version..\n'), debugLevel);
     for i=1:nBags
@@ -24,15 +49,15 @@
     end
 elseif issparse(allSeqsAsBags{1}) && do_reshaped == 1
     logMessages(debugMsgLocation, sprintf('Doing reshaped version for sparse vectors..\n'), debugLevel);
-    % Because N-D sparse arrays are not possible
-    % Hence, we opt to reshape/repmat the clusterCentres instead of the sparse FVs
+    % Because N-D sparse arrays are not possible, hence, we opt to reshape/repmat 
+    % the clusterCentres instead of the sparse FVs
     for i=1:nBags
         % length of instancesTransformationKernel is nBags
         % dimensions of each instancesTransformationKernel nInstances-by-nClusters
 	numOfInstances = size(allSeqsAsBags{i},2);
         instancesTransformationKernel{i} = zeros(numOfInstances, nClusters);
 	reshapedClusterCentres = reshape(repmat(clusterCentres, numOfInstances,1), FVDim1, numOfInstances, nClusters);
-        %dim are now nClusters-by-weightVectorDim-by-nClusters
+        % dim are now nClusters-by-weightVectorDim-by-nClusters
         temp = bsxfun(@minus, allSeqsAsBags{i}, reshapedClusterCentres);
         temp = sum(temp.^2,1);
         temp = exp(-0.5*temp./sig^2);
@@ -46,7 +71,7 @@
 	    temp = sum(temp'.^2,1);
 	    temp = exp(-0.5*temp./sig^2);
 	    instancesTransformationKernelA{i}(:,j) = temp';
-	    transformationKernel(i,j) = sum(temp(:));
+	    % transformationKernel(i,j) = sum(temp(:));
         end
     end
 end
diff --git a/comik_main_with_weight_vector.m b/comik_main_with_weight_vector.m
index 7887379..b35da36 100644
--- a/comik_main_with_weight_vector.m
+++ b/comik_main_with_weight_vector.m
@@ -420,7 +420,7 @@
     %
     tic;
     logMessages(debugMsgLocation, sprintf('Applying transformation to the test samples, this might take some time...\n'), debugLevel);
-    [~, allSeqsTransformationKernel_Test] = applyTransformation(allSeqsAsBags_Test, clusterCentres, conformalXformationParam, debugLevel, debugMsgLocation);
+    [allSeqsTransformationKernel_Test] = applyTransformation(allSeqsAsBags_Test, clusterCentres, conformalXformationParam, debugLevel, debugMsgLocation);
     logMessages(debugMsgLocation, sprintf('done in %.3f seconds\n', toc), debugLevel);
     %
     % Getting the instance weightings for the test examples
diff --git a/computeConformedMultiInstanceKernel.m b/computeConformedMultiInstanceKernel.m
index 0c32ab1..ef4623b 100644
--- a/computeConformedMultiInstanceKernel.m
+++ b/computeConformedMultiInstanceKernel.m
@@ -159,9 +159,8 @@
 % transformationKernel is Gaussian RBF
 tic;
 logMessages(debugMsgLocation, sprintf('--- applying transformation...'), debugLevel);
-[transformationKernel, instancesTransformationKernel] = applyTransformation(allSeqsAsBags, clusterCentres, sig, debugLevel, debugMsgLocation);
+[instancesTransformationKernel] = applyTransformation(allSeqsAsBags, clusterCentres, sig, debugLevel, debugMsgLocation);
 logMessages(debugMsgLocation, sprintf('done in %.3f  seconds\n', toc), debugLevel);
-% transformationKernel is nBags-by-nClusters
 % instancesTransformationKernel is cell array for number of bags
 % instancesTransformationKernel{eachBag} is nInstanceInBag-by-nClusters
 
diff --git a/getCompleteSequenceODHFeatureVec.m b/getCompleteSequenceODHFeatureVec.m
deleted file mode 100644
index a4cd8ff..0000000
--- a/getCompleteSequenceODHFeatureVec.m
+++ /dev/null
@@ -1,5 +0,0 @@
-function getCompleteSequenceODHFeatureVec()
-
-%returns the ODH feature vector using the complete sequence as one
-
-end
diff --git a/getExpansionPoints.m b/getExpansionPoints.m
index 18a70dd..676392c 100644
--- a/getExpansionPoints.m
+++ b/getExpansionPoints.m
@@ -1,24 +1,41 @@
 function [expansionPoints, matrixOfDistancesFromCentres] = getExpansionPoints(X, nClusters, rep, debugLevel, debugMsgLocation)
 
-% Params
-% X : All bags opened up into instances
-% nClusters : #clusters
+% GETEXPANSIONPOINTS
+% Performs k-means and returns the clusterCentres as the expansion points
+%
+% INPUT PARAMS
+% Param X (matrix)
+% All bags opened up into instances
+%
+% Param nClusters (matrix)
+% Number of clusters for k-means
 % 
-% Returns
-% a set of expansionPoints
+% OUTPUT PARAMS
+% Param 'expansionPoints' (matrix)
+% a set of expansionPoints, in other words cluster centres
+%
+% Param 'matrixOfDistancesFromCentres' (matrix)
+% As the name suggests, 
 %
+% ADDITIONAL NOTES
+%
+% Author: snikumbh@mpi-inf.mpg.de
+
 logMessages(debugMsgLocation, sprintf('--- Obtaining a set of expansion points '), debugLevel);
-% obtained X is an n x p matrix, ready for kmeans
-% expansionPoints is a k-by-p matrix
-% D is a n-by-k matrix giving distances from each 
-%   point to every centroid. This will be useful
-%   if we use Gaussian form for the transformation 
-%   function
+% Obtained X in the input arguments is an n x p matrix, ready for kmeans
+% expansionPoints, to be returned, is a k-by-p matrix
+
+% Below, matrixOfDistancesFromCentres is a n-by-k matrix giving distances from each point to every 
+% cluster centre.
+%
+% sumD, from the Matlab documentation, is the within-cluster sums of point-to-centroid distances in a k-by-1 vector
+%
+
 tic;
-rng('default'); % or can be set to 'default'
+rng('default');
 logMessages(debugMsgLocation, sprintf('with K-means: \n'), debugLevel);
 if debugLevel == 2
-    [idx, expansionPoints, sumD, matrixOfDistancesFromCentres] = kmeans(X, nClusters, 'Replicates', rep, 'Display','off');%, 'MaxIter',1000);%Display: final
+    [idx, expansionPoints, sumD, matrixOfDistancesFromCentres] = kmeans(X, nClusters, 'Replicates', rep, 'Display','on');%, 'MaxIter',1000);%Display: final
 elseif debugLevel == 0
     [idx, expansionPoints, sumD, matrixOfDistancesFromCentres] = kmeans(X, nClusters, 'Replicates', rep, 'Display','off');%, 'MaxIter',1000);
 end
diff --git a/getInstanceWeights.m b/getInstanceWeights.m
index 62b49c7..447e83d 100644
--- a/getInstanceWeights.m
+++ b/getInstanceWeights.m
@@ -1,12 +1,33 @@
 function [thetaVals, instanceWeightsInEachBag] = getInstanceWeights(kernelWeights, allSeqsTransformationKernel)
 
+% GETINSTANCEWEIGHTS
+% Computes the theta values and instance weights from the sub-kernel weights and segment 
+% transformations
+% 
+% INPUT PARAMS
+% Param 'kernelWeights'
+% Weights assigned by MKL tf each of the sub-kernels in the collection
+%
+% Param 'allSeqsTransformationKernel'
+% Per sequence transformation of segments used to compute the instance weights
+%
+% OUTPUT PARAMS
+% Param 'thetaVals'
+% The theta values corresponding to each cluster centre 
+% 
+% Param 'instanceWeightsInEachBag'
+% Weight for each segment per sequence
+%
+% ADDITIONAL NOTES
+% -- Theta vals and sub-kernel weights follow a squared relation (Refer paper)
+% 
+% -- Order followed is: 
+%    . non-shifted kernels (1 to N) first, then shifted kernels (N+1 to 2N) in total 2N kernels
+%
+% -- Number of kernels is same as number of clusters
 %
-% Return the theta values and instance weights 
-% for all bags
 %
-% following a squared relation
-% order followed is: non-shifted kernels (1 to N) first, then shifted kernels (N+1 to 2N) in total 2N kernels
-% Edit: now number of kernels is same as number of clusters
+% Author: snikumbh@mpi-inf.mpg.de
 
 thetaVals = sqrt(kernelWeights);
 thetaVals = thetaVals';
@@ -23,13 +44,13 @@
 		size(allSeqsTransformationKernel{j},1), 1) ...
                 .* allSeqsTransformationKernel{j}, 2);
     instanceWeightsInEachBag{j} = temp;
-    %% if to be written to file, pass fname to the function
-    %stdTemp = standardizeMatrix(temp');
-    %finalTemp = zeros(size(stdTemp));
-    %% Using rankings
-    %[~, ranking] = sort(stdTemp(1:size(stdTemp,1),1:size(stdTemp,2)), 2, 'descend');
-    %finalTemp(1,ranking) = 1:size(stdTemp,2);
-    %dlmwrite(fname, finalTemp, '-append');
+    % if to be written to file, pass fname to the function
+    % stdTemp = standardizeMatrix(temp');
+    % finalTemp = zeros(size(stdTemp));
+    % Using rankings
+    % [~, ranking] = sort(stdTemp(1:size(stdTemp,1),1:size(stdTemp,2)), 2, 'descend');
+    % finalTemp(1,ranking) = 1:size(stdTemp,2);
+    % dlmwrite(fname, finalTemp, '-append');
 end
 
 
diff --git a/getODHFeatureVecInstances.m b/getODHFeatureVecInstances.m
index 70744b1..4c7e5fc 100644
--- a/getODHFeatureVecInstances.m
+++ b/getODHFeatureVecInstances.m
@@ -1,36 +1,58 @@
 function [ODHFeatureVecInstances, numOfSegments] = getODHFeatureVecInstances(seq, oligoLen, maxDist, segmentSizeInPercentage, segmentSizeInBps, segmentingStart, alphabet, debug)
 
-% Author: snikumbh
+% GETODHFEATUREVECINSTANCES
 %
-% Params
-% 1. seq: candidate sequence (length L given in #base pairs)
-% 2. oligoLen: oligomer length for ODH (K, default: 2)
-% 3. maxDist: maximum distance value to consider for ODH;
+% INPUT PARAMS
+% Param 'seq' 
+%  candidate sequence (length L given in #base pairs)
+%
+% Param 'oligoLen'
+% Oligomer length for ODH (K, default: 2)
+%
+% Param 'maxDist'
+% Maximum distance value to consider for ODH;
 %		(default: 50)
-% 4. segmentSizeInPercentage: Percentage value used to segment 
-%                       the complete sequence; set this  
-%			to 0 if percentage-based segmenting
-%			is not to be used (default: 0)
-% 5. segmentSizeInBps: If percentage value for segmenting is 0
-%			this has to be non-zero and less than
-%			complete sequence length. It is given
-%			in number of basepairs. (default: 
-%			100 bps)
-% 6. segmentingStart: 'shift' specifies shifting the segmentStart
-%			to (round(segmentSizeInBps/2) + 1); 
-%			otherwise 'no-shift' (default)
-% 7. alphabet: DNA/Protein sequence alphabet (default: DNA)
-%
-%
-% Returns 
-% the ODH FeatureVec representation of (ordered) instances
-% (segments) of the sequence supplied; each in sparse format.
-% Sparsify the end result again? May be not needed.
-% [These can be concatenated to obtain the one final modifiedODHFeatureVec]
+%
+% Param 'segmentSizeInPercentage'
+% Percentage value used to segment the complete sequence; set this to 0 if 
+% percentage-based segmenting is not to be used (default: 0)
+%
+% Param 'segmentSizeInBps'
+% If percentage value for segmenting is 0 this has to be non-zero and less 
+% than complete sequence length. It is given in number of basepairs. 
+% (default: 100 bps)
+%
+% Param 'segmentingStart' 
+% 'shift' specifies shifting the segmentStart to 
+%     (round(segmentSizeInBps/2) + 1); 
+% otherwise 'no-shift' (default)
+%
+% Param 'alphabet'
+% DNA/Protein sequence alphabet (default: DNA)
+%
+%
+% OUTPUT PARAMS 
+% Param 'ODHFeatureVecInstances'
+% Representation of (ordered) instances (or segments) of the sequence supplied; 
+% each in sparse format.
+%
+% Param numOfSegments
 % 
-% Edits: 06, February, 2017, By: snikumbh
-% maxDist is now as per the original ODH paper, Lmax - K.
+% ADDITIONAL NOTES
+% -- Order of segments
+%    . This is only to help the user in determining the where non-shifted 
+%      segments end in the collection, and where shifted segment start
+%    . The subsequent (set) kernel computation is agnostic to this arrangement
+%
+% -- maxDist is now as per the original ODH paper, Lmax - K
+%    . Edits: 06, February, 2017, By: snikumbh 
+%
+% -- Sparsify the end result again? May be not needed.
+%    . [These can be concatenated to obtain the one final modifiedODHFeatureVec]
+%
+%
 %
+% Author: snikumbh@mpi-inf.mpg.de
 
 % Set default values when not supplied
 totalArguments = 8;
@@ -103,7 +125,7 @@
     numOfSegments = length(segmentBoundaries);
 end
 % Asert same number of segmentStarts and Boundaries
-%assert(size(segmentStarts,2) == size(segmentBoundaries,2), 'Check segments!');
+% assert(size(segmentStarts,2) == size(segmentBoundaries,2), 'Check segments!');
 if debug
     fprintf('Segments with %s\n', segmentingStart);
     fprintf('#Segments = %d\n', numOfSegments);
@@ -135,7 +157,7 @@
 %
 %columns correspond to different instances; rows to FVs
 ODHFeatureVecInstances = zeros((M^2) * numDist, numOfSegments);
-%modifiedODHFeatureVec = [];%zeros( modifiedODHFeatureVecLen , 1);
+% modifiedODHFeatureVec = [];%zeros( modifiedODHFeatureVecLen , 1);
 % Refer paper. 
 
 % Segmenting already done.
diff --git a/get_colored.m b/get_colored.m
index 6f09d16..3665c7a 100644
--- a/get_colored.m
+++ b/get_colored.m
@@ -1,11 +1,29 @@
 function [colored_string] = get_colored(given_string, oligoLen)
 
+% GETCOLORED
 % Colors the given string (DNA motif) using the IUPAC color scheme for the nucleotides A, C, G, T.
+%
+% INPUT PARAMS
+% Param 'given_string'
+% The DNA oligomer
+% 
+% Param 'oligoLen'
+% Length of the oligomer
+%
+% OUTPUT PARAMS
+% Param 'colored_string'
+% String with each DNA sequence character appropriately colored according to the IUPAC scheme
+%
+% ADDITIONAL NOTES
+%
+%
+% Author: snikumbh@mpi-inf.mpg.de
 
 given_string = char(given_string);
 alphabet = upper('acgt');
 colored_string = [];
 for i=1:size(given_string, 2)
+    % i iterates over the number of characters in the string
     if isscalar(given_string(1,i))
     if oligoLen~= 0 && i == oligoLen+1
 	colored_string = strcat(colored_string, '\color{black}|');
diff --git a/mkl.py b/mkl.py
index c7b1cd2..dd4793e 100755
--- a/mkl.py
+++ b/mkl.py
@@ -133,7 +133,6 @@
     testkernel = CombinedKernel()
     for k in range(0, len(kernelFilenames)):
         thisFname = kernelFilenames[k]
-        #print thisFname
         if os.path.exists(thisFname):
             testkernel.append_kernel(CustomKernel(numpy.loadtxt(thisFname, delimiter=',')))
         else:
diff --git a/normalizeKernel.m b/normalizeKernel.m
index f801afa..27a7f6b 100644
--- a/normalizeKernel.m
+++ b/normalizeKernel.m
@@ -1,5 +1,21 @@
 function normalizedKernel = normalizeKernel(Kmat)
 
+% NORMALIZEKERNEL
+% Diagonal normalizes a given kernel matrix
+% 
+% INPUT PARAMS
+% Param 'Kmat' (matrix)
+% The input kernel matrix
+% 
+% OUTPUT PARAMS
+% Param 'normalizedKernel'
+% The diagonal-normalized kernel matrix
+%
+% ADDITIONAL NOTES
+%
+%
+% Author: snikumbh@mpi-inf.mpg.de
+
 KdiagVec = sqrt(diag(Kmat));
 normalizedKernel = Kmat ./ (KdiagVec * KdiagVec');
 
diff --git a/obtainWeightVector.m b/obtainWeightVector.m
index 124f742..32914d8 100644
--- a/obtainWeightVector.m
+++ b/obtainWeightVector.m
@@ -1,15 +1,55 @@
 function [overallWeightVector, weightVectors, biasValue] = obtainWeightVector(folderName, oligoLen, maxDist, subkernelWeights, allSeqsAsBags, rawConformedSetKernel, allSeqsTransformationKernel, debugLevel, debugMsgLocation)
+
+% OBTAINWEIGHTVECTOR
 %
-% Author: snikumbh@mpi-inf.mpg.de
+% INPUT PARAMS
+% Param 'folderName'
+% Location/Path on disk
+%
+% Param 'oligoLen'
+% Value of oligomer length for the ODH representation
+%
+% Param 'maxDist'
+% Value of the maximum distance for the ODH representation
+%
+% Param 'subkernelWeights' (vector)
+% Weights assigned to each sub-kernel upon solving MKL
+%
+% Param 'allSeqsAsBags' (cell array)
+% All sequences represented as bags
+%
+% Param 'rawConformedSetKernel' (matrix)
+% The unnormalized conformally transformed MI kernel
+%
+% Param 'allSeqsTransformationKernel' (matrix)
+% Transformations of all segments for each sequence in the collection
 %
-% 1. We have subkernel weights given, all corresponding weight vectors are to be computed
-% 2. Alpha values are given from the trained model
-% 3. Location for alpha values: folderName specified in arguments
-% 4. Output arguments: overallWeightVector and biasValue
+% Param 'debugLevel'
 %
-% Additional comment:
-% Passing a value for 'folderName' is essential
-% Therefore, there are no default set of values of subkernelWeights
+% Param 'debugMsgLocation'
+%
+% OUTPUT PARAMS
+% Param 'overallWeightVector'
+% The overall weight vector which is a linear combination of the individual 
+% weight vectors
+%  
+% Param 'weightVectors'
+% The individual weight vectors corresponding the multiple kernels in CoMIK
+%
+% Param 'biasValue'
+% The bias value obtained solving SVM
+%
+%
+% ADDITIONAL NOTES
+% -- Alpha values are given from the trained model
+% -- Location for alpha values: folderName specified in arguments
+% -- Output arguments: overallWeightVector and biasValue
+% -- We have subkernel weights given, all corresponding weight vectors are to be computed
+% -- Passing a value for 'folderName', is essential, where one can find the subkernel weights
+%    . Therefore, there are no default set of values of subkernelWeights
+%
+% Author: snikumbh@mpi-inf.mpg.de
+
     
 alphaFilename = 'alphas.txt';
 svFilename = 'sv.txt';
@@ -20,12 +60,8 @@
 biasValue = dlmread(strcat(folderName, '/', biasFilename));
 
 % Compute the weight vectors corresponding to all subkernels, keep them in the same order as the kernels
-% % 3. read in the output from the python script and
+% -- Read in the output from the python script and
 %    return back the values.
-%    Order: The kernels are arranged as
-%               Non-shifted conformed kernels (#kernels = nClusters)
-%               Shifted conformed kernels (#kernels  = nClusters)
-%           Total kernels= 2*nCluster
 %
 nClusters = size(subkernelWeights, 1);
 nKernels = nClusters;
@@ -33,19 +69,16 @@
 % prepare bag-level phi, restrict only to support vectorsa
 % bagPhi is nSVBags-by-nClusters
 bagPhi = zeros(  size(allSeqsAsBags{1},1), size(alphaIndices,1), nClusters);
-%bagPhiShifted = zeros( size(allSeqsAsBagsShifted{1},1), size(alphaIndices,1), nClusters);
 
 idx = 0;
 for i=1:length(allSeqsAsBags)% this is nBags
-    %restrict to sv indices
+    % restrict to sv indices
     if any(i == alphaIndices)
-	idx = idx + 1;%this is second dimension of the bagPhi, ranges from 1 to #alphaIndices
+	idx = idx + 1; % this is second dimension of the bagPhi, ranges from 1 to #alphaIndices
         for j=1:nClusters
-	    %allSeqs NonShifted/Shifted TransformationKernel already has nInstances x nClusters
+	    % allSeqs TransformationKernel already has nInstances x nClusters
 	    normFactor = (1/sqrt(rawConformedSetKernel{j}(i,i)));
 	    bagPhi(:, idx, j) = normFactor * sum(allSeqsAsBags{i} .* repmat(allSeqsTransformationKernel{i}(:,j)', size(allSeqsAsBags{i},1), 1), 2);
-	    %normFactor = (1/sqrt(rawShiftedConformedSetKernel{j}(i,i)));
-            %bagPhiShifted(:, idx, j) = normFactor * sum(allSeqsAsBagsShifted{i} .* repmat(allSeqsShiftedTransformationKernel{i}(:,j)', size(allSeqsAsBagsShifted{i},1), 1), 2);
         end
     end
 end
@@ -54,17 +87,10 @@
 % The first dimension of the weight vector comes from the featurevector length which depends on oligoLength param
 weightVectors = zeros(size(allSeqsAsBags{1},1), nClusters);
 
-%disp(size(alphaValues));
-%disp(size(bagPhiNonShifted(:, :, 1)));
 for i=1:nClusters
     % subkernels for non-shifted bags
     weightVectors(:, i) = (subkernelWeights(i,1) * alphaValues' * bagPhi(:, :, i)')';
 end
-%disp(size(weightVectors(:,1)));
-%for i=(nClusters+1):2*nClusters
-%    % subkernels for non-shifted bags
-%    weightVectors(:, i) = (subkernelWeights(i,1) * alphaValues' * bagPhiShifted(:,:, (i-nClusters))')';
-%end
 % Get the ranked order (weight vector corresponding to the subkernel with the highest weight is ranked at the top)
 
 [sortedSubkernelWeights, rankedIndices] = sort(subkernelWeights, 'descend');
diff --git a/performMKLWithShogunPython.m b/performMKLWithShogunPython.m
index 05b3666..4be814b 100644
--- a/performMKLWithShogunPython.m
+++ b/performMKLWithShogunPython.m
@@ -1,5 +1,53 @@
 function [subkernelWeights, predLabels] = performMKLwithShogunPython(Cval, Ytrain, Ytest, mklNorm, imbalance, train_allSeqsConformedSetKernel, test_allSeqsConformedSetKernel, outputFolder, whetherPerformTest, debugLevel, debugMsgLocation)
 
+% PERFORMMKLWITHSHOGUNPYTHON
+% 
+% INPUT PARAMS
+% Param 'Cval'
+% Cost value for SVM
+%
+% Param 'Ytrain'
+% Labels of the training examples
+%
+% Param 'Ytest'
+% Labels of the test examples
+%
+% Param 'mklNorm'
+% p-norm for the MKL
+%
+% Param 'imbalance'
+% Class imbalance, to be used for upweighting the mis-classification costs of the 
+% minority class
+%
+% Param 'train_allSeqsConformedSetKernel'
+% Kernel matrix corresponding to the training sequences (train x train dimension)
+%
+% Param 'test_allSeqsConformedSetKernel'
+% Kernel matrix corresponding to the test sequences (possibly test x train dimension)
+%
+% Param 'outputFolder'
+% Location/path on disk of the output folder
+%
+% Param 'whetherPerformTest'
+% Flag to specify to the python script that is run from the function if test 
+% predictions are to be made using the test kernel
+%
+% Param 'debugLevel'
+%
+% Param 'debugMsgLocation
+%
+% OUTPUT PARAMS
+% Param 'subkernelWeights'
+% Weights assigned to the sub kernel matrices by MKL
+%
+% Param 'predLabels'
+% Prediction labels assigned by the trained model
+%
+% ADDITIONAL NOTES
+%
+% Author: snikumbh@mpi-inf.mpg.de
+
+
 totalArguments = 10;
 if nargin < totalArguments 
     debugLevel = 2;
diff --git a/readFastaSequences.m b/readFastaSequences.m
index 8617896..cea2d72 100644
--- a/readFastaSequences.m
+++ b/readFastaSequences.m
@@ -5,7 +5,7 @@
 % READFASTASEQUENCES reads fasta file and extracts sequence to fasta variable.
 % Modified version of readfasta shipped with ODH Matlab code.
 % 
-% INPUT arguments:
+% INPUT PARAMS:
 % txt - name of FASTA filename
 % 
 % segmentSizeInBps - segment size in basepairs
@@ -20,7 +20,7 @@
 % debugLevel and debugMsgLocation - (default) 2, and (default) 1 for stdout 
 %	or filename if to be written to file
 %
-% RETURNS:
+% OUTPUT PARAMS:
 % passfasta - struct containing cell arrays with legends and sequences
 %	passFasta.legend   = cell array with sequence legends (headers)
 %	passFasta.seqeunce = cell array with sequences
@@ -28,9 +28,11 @@
 % omittedIndices - List of indices that were omittedIndices due to their 
 %	length shorter than the segment size
 % 
-% nSeqs - #sequences read from the given file (
+% nSeqs - #sequences read from the given file
 %
-% author: snikumbh@mpi-inf,mpg.de
+% ADDITIONAL NOTES
+%
+% Author: snikumbh@mpi-inf,mpg.de
 
 totalArguments = 6;
 if nargin < totalArguments
@@ -51,7 +53,7 @@
 ofid=fopen(strcat(outputFolder, '/ommittedFastaIds.txt'), 'a'); 
 fprintf(ofid, '----%s----\n', txt);
 lengths = [];
-while i<givenNSeqs%feof(fid)==0
+while i < givenNSeqs % feof(fid)==0
     lineRead=fgetl(fid);
     if isempty(strfind(lineRead, '#'))
         if strfind(lineRead,'>')
@@ -71,10 +73,13 @@
 end
 fclose(ofid);
 fclose(fid);
+
 ineligibleLengths = lengths(lengths < segmentSizeInBps);
 ineligibleIndices = find(lengths < segmentSizeInBps);
+
 nSeqs = givenNSeqs;
 passFasta = fasta;
+
 if size(ineligibleLengths, 2) > 0
     logMessages(debugMsgLocation, sprintf('%d sequence(s) have been ommitted as they are shorter than the segment-size specified. Check ommittedFastaIds.txt\n', length(ineligibleIndices)), debugLevel);
     % Change to what would be passed, if it is different than 'fasta' above