Cleaning up function definition and comments phase-I.1

snikumbh · Jul 19, 2017 · 88fd7fa · 88fd7fa
1 parent eb8bd56
commit 88fd7fa
Show file tree

Hide file tree

Showing 3 changed files with 77 additions and 87 deletions.
diff --git a/comik_main_with_weight_vector.m b/comik_main_with_weight_vector.m
@@ -1,19 +1,15 @@
-function [allSeqsAsBags, allSeqsConformedSetKernel, subkernelWeights, thetaVals, instanceWeightsInEachBag, resultString, bestParamComb, test_teAUROC, test_teAUPRC, predictions] = comik_main_with_weight_vector(givenPosFastaFilename, givenNegFastaFilename, givenNPos, givenNNeg, oligoLen, maxDist, chunkSizeInBps, nClusterVals, sigmaVals, Cs, mklNorm, nFolds, testIndices, debugLevel, debugMsgLocation, outputFolder, runSummaryFilename, whetherToPlotHeatmap, computationVersion, whetherToVisualizeWVector)
+function [allSeqsAsBags, allSeqsConformedSetKernel, subkernelWeights, thetaVals, instanceWeightsInEachBag, resultString, bestParamComb, test_teAUROC, test_teAUPRC, predictions] = comik_main_with_weight_vector(givenPosFastaFilename, givenNegFastaFilename, givenNPos, givenNNeg, oligoLen, maxDist, segmentSizeInBps, nClusterVals, sigmaVals, Cs, mklNorm, nFolds, testIndices, debugLevel, debugMsgLocation, outputFolder, runSummaryFilename, whetherToPlotHeatmap, computationVersion, whetherToVisualizeWVector)
 
 % Author: snikumbh 
 %
-% Edits: 06, February, 2017, By: snikumbh
-% All segments, shifted and non-shifted in one bag: non-shifted segments followed by shifted segments.
-% End result, #kernels = #clusterCentres
+% All segments, shifted and non-shifted in one bag; order of segments: non-shifted segments followed by shifted segments.
+% This results in #kernels = #clusterCentres
 %
-% Edits: 10 April, 2017, By: snikumbh
 % debugLevel 2	prints all messages
 % debugLevel 1	place holder level, may be used in the future
 % debugLevel 0  silent on most of the messages, prints very few messages for the user
 
-% Set seed for the Matlab session for reproducibility
-% Happens in the wrapper function, already
-% RandStream('mt19937ar','seed',1);
+% Setting of seed for the Matlab session for reproducibility: already done in the wrapper function.
 
 % Handle arguments and values/defaults
 totalArguments = 20;
@@ -33,31 +29,31 @@
 
 % Read positives
 logMessages(debugMsgLocation,sprintf('Positives...'), debugLevel);
-[nPos, ommittedPosIndices, allSeqsRawPosFasta] = readFastaSequences(givenPosFastaFilename, chunkSizeInBps, givenNPos, outputFolder, debugLevel, debugMsgLocation);
-% nPos and nNeg be could smaller than equal to givenNPos and givenNNeg resp. due to ommitting any sequences
-% whose length is less than the chunkSizeInBps
+[nPos, omittedPosIndices, allSeqsRawPosFasta] = readFastaSequences(givenPosFastaFilename, segmentSizeInBps, givenNPos, outputFolder, debugLevel, debugMsgLocation);
+% nPos and nNeg could be smaller than or equal to givenNPos and givenNNeg resp. 
+% due to omitting any sequences whose length is less than the segmentSizeInBps
 %
 % Read Negatives
 logMessages(debugMsgLocation,sprintf('Negatives...'), debugLevel);
-[nNeg, ommittedNegIndices, allSeqsRawNegFasta] = readFastaSequences(givenNegFastaFilename, chunkSizeInBps, givenNNeg, outputFolder, debugLevel, debugMsgLocation);
+[nNeg, omittedNegIndices, allSeqsRawNegFasta] = readFastaSequences(givenNegFastaFilename, segmentSizeInBps, givenNNeg, outputFolder, debugLevel, debugMsgLocation);
+%
 % Combine positives + negatives
 nBags = nPos + nNeg;
-% Check if we need to update the testIndices if some sequences have been ommitted
-% if posIndices and negIndices are empty, then no update required
-if ~isempty(ommittedPosIndices)
-    %nOmmitted = length(find(ismember(testIndices, ommittedPosIndices)));
-    testIndices(find(ismember(testIndices, ommittedPosIndices))) = [];
-    %logMessages(debugMsgLocation,sprintf('%d positive sequences ommitted from test set.\n', nOmmitted), debugLevel);
+% Check if we need to update the testIndices if some sequences have been omitted
+% if posIndices and negIndices are empty, then no updates required
+if ~isempty(omittedPosIndices)
+    %nOmmitted = length(find(ismember(testIndices, omittedPosIndices)));
+    testIndices(find(ismember(testIndices, omittedPosIndices))) = [];
+    %logMessages(debugMsgLocation,sprintf('%d positive sequences omitted from test set.\n', nOmmitted), debugLevel);
 end
-if ~isempty(ommittedNegIndices)
-    %nOmmitted = length(find(ismember(testIndices, ommittedNegIndices)));
-    testIndices(find(ismember(testIndices, nPos+ommittedNegIndices))) = [];%offset by nPos
-    %logMessages(debugMsgLocation,sprintf('%d negative sequences ommitted from test set.\n', nOmmitted), debugLevel);
+if ~isempty(omittedNegIndices)
+    %nOmmitted = length(find(ismember(testIndices, omittedNegIndices)));
+    testIndices(find(ismember(testIndices, nPos+omittedNegIndices))) = [];%offset by nPos
+    %logMessages(debugMsgLocation,sprintf('%d negative sequences omitted from test set.\n', nOmmitted), debugLevel);
 end
 % Additionally, one needs to account for the reduction in the number of sequences affecting the indices. Remove indices > nBags 
-%testIndices = testIndices(testIndices <= nBags);
 for i=1:length(testIndices)
-    testIndices(i) = testIndices(i) - nnz(ommittedPosIndices < testIndices(i)) - nnz( (nPos+ommittedNegIndices) < testIndices(i));
+    testIndices(i) = testIndices(i) - nnz(omittedPosIndices < testIndices(i)) - nnz( (nPos+omittedNegIndices) < testIndices(i));
 end
 % Put them together
 allSeqsRawFasta.sequence = cell(1, nBags);
@@ -69,35 +65,34 @@
     allSeqsRawFasta.sequence{i+nPos} = allSeqsRawNegFasta.sequence{i};%offset by nPos
 end
 clear allSeqsRawNegFasta;
-% Write the revised set test indices to disk for reproducibility
+% Write the revised set of test indices to disk to support reproducibility
 
-if ~isempty(ommittedPosIndices) | ~isempty(ommittedNegIndices)
+if ~isempty(omittedPosIndices) | ~isempty(omittedNegIndices)
     sortedTestIndices = sort(testIndices);
     dlmwrite(strcat(outputFolder, '/testIndices_New.txt'), sortedTestIndices');
 end
 
 % generate labels
-% handle imbalance, multiply this to C for negatives
 Y = [ones(1, nPos) -ones(1, nNeg)];
 logMessages(debugMsgLocation,sprintf('#Bags: %d\n', nBags), debugLevel);
 
 % default for 'acgt'
 alphabet = 'acgt';
-% for the update, numDist = maxDist + 1;
-%fvLength = (4^oligoLen)^2 * numDist;
-chunkSizeInPercentage = 0; %default
-% non-shifted chunks
+% setting default value for segmentSizePercentage. 
+% Currently, this is the only valid value.
+segmentSizeInPercentage = 0; 
 logMessages(debugMsgLocation,sprintf('All bags...'), debugLevel);
 tic;
 idx1 = 0;
 thisInstances = [];
 
 for k=1:nBags
-    %if oligoLen == 3
+    if oligoLen >= 3
+	% because, for oligoLen 3 or larger, this may take some time.
 	if rem(k,100) == 0
             logMessages(debugMsgLocation,sprintf('%d--', k), debugLevel);
 	end
-    %end
+    end
     if any(k == testIndices)
 	% do-nothing now, we collect them later
 	% Collection of test sequences can take place later, 
@@ -106,12 +101,12 @@
 	idx1 = idx1 + 1;
 	%% Non-shifted
         [tempNS, NS] = getODHFeatureVecInstances(allSeqsRawFasta.sequence{k}, ...
-                                        oligoLen, maxDist, chunkSizeInPercentage, ...
-                                        chunkSizeInBps, 'no-shift', alphabet);
+                                        oligoLen, maxDist, segmentSizeInPercentage, ...
+                                        segmentSizeInBps, 'no-shift', alphabet);
 	%% Shifted
 	[tempS, S] = getODHFeatureVecInstances(allSeqsRawFasta.sequence{k}, ...
-                                        oligoLen, maxDist, chunkSizeInPercentage, ...
-                                        chunkSizeInBps, 'shift', alphabet);
+                                        oligoLen, maxDist, segmentSizeInPercentage, ...
+                                        segmentSizeInBps, 'shift', alphabet);
 	allSeqsAsBags{idx1} = [tempNS tempS];
 	thisInstances = [thisInstances (NS + S)];
     end
@@ -131,12 +126,12 @@
 nInstances = instanceEnds(end);
 logMessages(debugMsgLocation,sprintf('Total instances from all bags: %d\n', nInstances), debugLevel);
 
-direct = 0; 
-% Using this flag for direct computation from bags (by setting to 1) instead of the 'sparseComputation' flag.
-% IMP: the 'direct' flag uses parfor for speeding up the instance-wide kernel computation.
+% Using the sparseComputation flag for computing the instance-wide kernel.
+% IMP: setting sparseComputation to '0' uses the brokendown approach to compute the instance-wide kernel.
+% This hasn't been tested for either correctness or efficiency. We recommend using sparseComputation.
 % We are using sparseComputation.
 sparseComputation = 1;
-[instanceStarts, instanceEnds, instanceWideKernel] = computeStdMultiInstanceKernel(allSeqsAsBags, thisInstances, direct, sparseComputation, debugLevel, debugMsgLocation);
+[instanceStarts, instanceEnds, instanceWideKernel] = computeStdMultiInstanceKernel(allSeqsAsBags, thisInstances, sparseComputation, debugLevel, debugMsgLocation);
 %
 % Further are functions that will be recomputed per inner cross-validation iteration
 % Cross-validation loop is also handled simultaneously
@@ -155,12 +150,12 @@
 
 Youtertest = Y(testIndices);
 trainIndicesInY = setdiff([1:nBags], testIndices);
-% trainIndicesInY = [1:nBagsForTrain];
 trainIndicesInBags = [1:nBagsForTrain];
 YwholeTrain = Y(trainIndicesInY);
 
 nPosTr = size(find(YwholeTrain > 0),2);
 nNegTr = size(find(YwholeTrain < 0),2);
+% handle imbalance, multiply this to C for negatives
 imbalance = nPosTr/nNegTr;
 logMessages(debugMsgLocation,sprintf('Imbalance: %.2f\n', imbalance), debugLevel);
 
@@ -174,7 +169,7 @@
     foldIdForIndices = cvpartition(YwholeTrain, 'Kfold', nFolds);
     % ^returns logicals
     %
-    % 5. All kernels ready. Pose this as a multiple kernel learning problem next.
+    % 4. All kernels ready. Pose this as a multiple kernel learning problem next.
     %    - We have used shogun's python modular interface to perform MKL.
     %    - Upon performing this MKL, the kernel weights obtained are squared theta values (see paper for details)
     %
@@ -221,7 +216,7 @@
 						trainConformedKernel, testConformedKernel, outputFolder, whetherPerformTest, debugLevel, debugMsgLocation);
 	    % both subkernelWeights and predictions are zero vectors, if solving MKL failed. Treat this here.
 	    if sum(any(subkernelWeights)) == 0 && sum(any(predictions)) == 0
-	        % set the following variables: this_teAUROC, this_teAUPRC
+	        % set the following variables to 0.0. a) this_teAUROC, b) this_teAUPRC
 	        this_teAUROC = 0.0;
  	        this_teAUPRC = 0.0;
 	    else %solving MKL was fine
@@ -258,7 +253,7 @@
 clear trainConformedKernel;
 clear testConformedKernel;
 %
-% Use the best param combinations to train using the whole trainiing set and predict the test set here
+% 5. Use the best param combinations to train using the whole trainiing set and predict the test set here
 %
 logMessages(debugMsgLocation,sprintf('---Re-training and Test---\n'), debugLevel);
 % Re-train the model with all train instances together
@@ -273,8 +268,7 @@
 						computeConformedMultiInstanceKernel(instanceStarts, instanceEnds, instanceWideKernel, ...
 						trainIndicesInBags, allSeqsAsBags, nClusters, conformalXformationParam, ...
 						YwholeTrain, computationVersion, debugLevel, debugMsgLocation);
-%fprintf('\ndone in %.3f seconds\n', toc);
-
+%
 %
 for k=1:nClusters
     % Conformed kernels
@@ -284,13 +278,14 @@
 end
 %
 whetherPerformTest = 0; 
-% We now set this to 0, since we will make predictions using computed weight vector
+% We now set this to 0, since we will make predictions using computed weight vector and not the kernel
 logMessages(debugMsgLocation,sprintf('Imbalance: %.2f\n', imbalance), debugLevel);
 %
 [subkernelWeights, predictions] = performMKLWithShogunPython(bestParamComb.best_C, YwholeTrain, Youtertest, mklNorm, imbalance, ...
 						VtrainConformedKernel, VtestConformedKernel, ...
 						 outputFolder, whetherPerformTest, debugLevel, debugMsgLocation);
-% Ideally, this is not need here, since such param values won't (or shouldn't) be selected as best values
+% Ideally, this is not needed here, since such param values won't (or shouldn't) be selected as best values
+% If that does happen, something is probably wrong.
 if sum(any(subkernelWeights)) == 0 && sum(any(predictions)) == 0
     % set the following variables: train_AUROC, train_AUPRC
     train_AUROC = 0.0;
@@ -302,7 +297,7 @@
 logMessages(debugMsgLocation,sprintf('train_AUROC with whole training set: %.4f\n', train_AUROC), debugLevel);
 % Below, we make our own predictions for the test examples using the weight vector
 
-% We will now collect the test examples
+% 6.1 We will now collect the test examples
 tic;
 logMessages(debugMsgLocation, sprintf('Now collecting the unseen test examples...'), debugLevel);
 idx1 = 0;
@@ -313,23 +308,24 @@
         idx1 = idx1 + 1;
 	%% Non-shifted
         [tempNS, NS] = getODHFeatureVecInstances(allSeqsRawFasta.sequence{k}, ...
-					oligoLen, maxDist, chunkSizeInPercentage, ...
-					chunkSizeInBps, 'no-shift', alphabet);
+					oligoLen, maxDist, segmentSizeInPercentage, ...
+					segmentSizeInBps, 'no-shift', alphabet);
 	%% Shifted
 	[tempS, S]  = getODHFeatureVecInstances(allSeqsRawFasta.sequence{k}, ...
-                                        oligoLen, maxDist, chunkSizeInPercentage, ...
-                                        chunkSizeInBps, 'shift', alphabet);
+                                        oligoLen, maxDist, segmentSizeInPercentage, ...
+                                        segmentSizeInBps, 'shift', alphabet);
         allSeqsAsBags_Test{idx1} = [tempNS tempS];
     end
 end
 clear allSeqsRawFasta;
 logMessages(debugMsgLocation, sprintf('done in %.3f seconds\n', toc), debugLevel);
-% clears up some space useful to load the many test examples as is typical in problems in biology
+% clears up some space useful to load the many test examples as is typical in problems in computational biology
 if(predictUsingWeightVector == 1)
     logMessages(debugMsgLocation, sprintf('---Prediction using weight vector---\n'), debugLevel);
-    % 6.1 Use the subkernel weights and get transformed instances among the test examples, may be use applyTransformation.m
+    % 6.2 Use the subkernel weights and get transformed instances among the test examples
     % instanceWeights for instances in training bags
-    [thetaVals, instanceWeightsInEachBag] = getInstanceWeights(subkernelWeights, allSeqsTransformationKernel, 'samarth_train1.csv');
+    % filename passed as argument to getInstanceWeights is used to write them to file.
+    [thetaVals, instanceWeightsInEachBag] = getInstanceWeights(subkernelWeights, allSeqsTransformationKernel);
     % Get the transformations for test examples, and the diagonal elements which will be used for normalization further
     % Should one proceed block-wise at this juncture if the number of test examples is extremely large as is typically the case?
     %
@@ -339,18 +335,19 @@
     logMessages(debugMsgLocation, sprintf('done in %.3f seconds\n', toc), debugLevel);
     %
     % Getting the instance weightings for the test examples
-    [thetaVals, instanceWeightsInEachBag_Test] = getInstanceWeights(subkernelWeights, allSeqsTransformationKernel_Test, 'samarth_test1.csv');
+    [thetaVals, instanceWeightsInEachBag_Test] = getInstanceWeights(subkernelWeights, allSeqsTransformationKernel_Test);
     % At this point, all (train + test examples) instance weights have been obtained.
     % 
-    % 6.2 Use function obtainWeightVector and procure the weight vectors and the bias value
+    % 6.3 Use function obtainWeightVector and procure the weight vectors and the bias value
     % The weight vector need not be ordered by rank. Rather, we need it by its original order.
     tic;
     logMessages(debugMsgLocation, sprintf('Computing the weight vector...'), debugLevel);
     [overallWeightVector, weightVectors, biasValue] = obtainWeightVector(char(outputFolder), oligoLen, maxDist, subkernelWeights, allSeqsAsBags, rawConformedSetKernel, allSeqsTransformationKernel, debugLevel, debugMsgLocation);
     logMessages(debugMsgLocation, sprintf('done in %.3f seconds\n', toc), debugLevel);
-    % Don't need allSeqsAsBags, allSeqsTransformationKernel?
-    %clear allSeqsAsBags allSeqsTransformationKernel;
-    % 6.3 Use these to make predictions using y = w*x + b
+    %% Don't need allSeqsAsBags, allSeqsTransformationKernel?
+    % clear allSeqsAsBags allSeqsTransformationKernel;
+    %
+    % 6.4 Use these to make predictions using y = w*x + b
     yhatTemp = 0.0;
     %
     % bagPhi for Test is nFeatures-by-nBagForTest-by-nClusters
@@ -388,15 +385,11 @@
 end
 %
 %
-% Write the instanceWeightInEachBag to file
-%dlmwrite('samarth_test.csv', instanceWeightsInEachBag, '-append');
-%disp(instanceWeightsInEachBag{1});
-%disp(size(instanceWeightsInEachBag));
-%disp(size(instanceWeightsInEachBag{1}));
-%disp(size(instanceWeightsInEachBag{2}));
-%for i=1:size(instanceWeightsInEachBag,2)
-%    dlmwrite('samarth_test.csv', cell2mat(instanceWeightsInEachBag{i})', '-append');
-%end
+%% Write the instanceWeightInEachBag to file
+% dlmwrite('instanceWeights_test.csv', instanceWeightsInEachBag, '-append');
+% for i=1:size(instanceWeightsInEachBag,2)
+%    dlmwrite('instanceWeights_test.csv', cell2mat(instanceWeightsInEachBag{i})', '-append');
+% end
 
 if strcmp(whetherToPlotHeatmap, 'Yes')
     logMessages(debugMsgLocation,sprintf('Plotting heatmaps...'), debugLevel);
@@ -406,14 +399,14 @@
   	normInstanceWeightsInEachBag = standardizeMatrix(cell2mat(instanceWeightsInEachBag)');
     end
     %
-    heatmapFname = strcat(outputFolder,'/Heatmap-InstanceWeights', '_oligoLen', num2str(oligoLen), '_segmentSize', num2str(chunkSizeInBps) , '_maxDist', num2str(maxDist), '_nClusters', num2str(bestParamComb.best_nClusters), '_svmC', num2str(bestParamComb.best_C) ,'_sigma', num2str(bestParamComb.best_sigma), '.pdf');
+    heatmapFname = strcat(outputFolder,'/Heatmap-InstanceWeights', '_oligoLen', num2str(oligoLen), '_segmentSize', num2str(segmentSizeInBps) , '_maxDist', num2str(maxDist), '_nClusters', num2str(bestParamComb.best_nClusters), '_svmC', num2str(bestParamComb.best_C) ,'_sigma', num2str(bestParamComb.best_sigma), '.pdf');
     %
     p = plot_heatmap(normInstanceWeightsInEachBag, heatmapFname, 0, 1);
     logMessages(debugMsgLocation,sprintf('done!\n'), debugLevel);
     %
 end
 %
-% 7. Visualize weight vector as motifs
+% 7. Visualize weight vectors and motifs
 % 
 if strcmp(whetherToVisualizeWVector, 'Yes')
     logMessages(debugMsgLocation,sprintf('Visualizing weight vectors...\n'), debugLevel);

diff --git a/computeInstanceWideKernel.m b/computeInstanceWideKernel.m
@@ -1,4 +1,4 @@
-function  [instanceStarts, instanceEnds, instanceWideKernel] = computeInstanceWideKernel(allSeqsAsBags, thisInstances, direct, sparseComputation, debugLevel, debugMsgLocation)
+function  [instanceStarts, instanceEnds, instanceWideKernel] = computeInstanceWideKernel(allSeqsAsBags, thisInstances, sparseComputation, debugLevel, debugMsgLocation)
 
 % COMPUTEINSTANCEWIDEKERNEL 
 % 

diff --git a/getInstanceWeights.m b/getInstanceWeights.m
@@ -1,10 +1,9 @@
-function [thetaVals, instanceWeightsInEachBag] = getInstanceWeights(kernelWeights, allSeqsTransformationKernel, fname)
+function [thetaVals, instanceWeightsInEachBag] = getInstanceWeights(kernelWeights, allSeqsTransformationKernel)
 
 %
-% Return the thetaals and 
-% instance weights for non-shifted and shifted bags
+% Return the theta values and instance weights 
+% for all bags
 %
-
 % following a squared relation
 % order followed is: non-shifted kernels (1 to N) first, then shifted kernels (N+1 to 2N) in total 2N kernels
 % Edit: now number of kernels is same as number of clusters
@@ -24,15 +23,13 @@
 		size(allSeqsTransformationKernel{j},1), 1) ...
                 .* allSeqsTransformationKernel{j}, 2);
     instanceWeightsInEachBag{j} = temp;
-    % To write to file
-    stdTemp = standardizeMatrix(temp');
-    finalTemp = zeros(size(stdTemp));
-    %using rankings
-    [~, ranking] = sort(stdTemp(1:size(stdTemp,1),1:size(stdTemp,2)), 2, 'descend');
-    %for i=1:size(plotDataM,1)
-    finalTemp(1,ranking) = 1:size(stdTemp,2);
-    %end
-    dlmwrite(fname, finalTemp, '-append');
+    %% if to be written to file, pass fname to the function
+    %stdTemp = standardizeMatrix(temp');
+    %finalTemp = zeros(size(stdTemp));
+    %% Using rankings
+    %[~, ranking] = sort(stdTemp(1:size(stdTemp,1),1:size(stdTemp,2)), 2, 'descend');
+    %finalTemp(1,ranking) = 1:size(stdTemp,2);
+    %dlmwrite(fname, finalTemp, '-append');
 end