Skip to content

Commit

Permalink
Cleaning up function definition and comments phase-I.1
Browse files Browse the repository at this point in the history
  • Loading branch information
snikumbh committed Jul 19, 2017
1 parent eb8bd56 commit 88fd7fa
Show file tree
Hide file tree
Showing 3 changed files with 77 additions and 87 deletions.
139 changes: 66 additions & 73 deletions comik_main_with_weight_vector.m
Original file line number Diff line number Diff line change
@@ -1,19 +1,15 @@
function [allSeqsAsBags, allSeqsConformedSetKernel, subkernelWeights, thetaVals, instanceWeightsInEachBag, resultString, bestParamComb, test_teAUROC, test_teAUPRC, predictions] = comik_main_with_weight_vector(givenPosFastaFilename, givenNegFastaFilename, givenNPos, givenNNeg, oligoLen, maxDist, chunkSizeInBps, nClusterVals, sigmaVals, Cs, mklNorm, nFolds, testIndices, debugLevel, debugMsgLocation, outputFolder, runSummaryFilename, whetherToPlotHeatmap, computationVersion, whetherToVisualizeWVector)
function [allSeqsAsBags, allSeqsConformedSetKernel, subkernelWeights, thetaVals, instanceWeightsInEachBag, resultString, bestParamComb, test_teAUROC, test_teAUPRC, predictions] = comik_main_with_weight_vector(givenPosFastaFilename, givenNegFastaFilename, givenNPos, givenNNeg, oligoLen, maxDist, segmentSizeInBps, nClusterVals, sigmaVals, Cs, mklNorm, nFolds, testIndices, debugLevel, debugMsgLocation, outputFolder, runSummaryFilename, whetherToPlotHeatmap, computationVersion, whetherToVisualizeWVector)

% Author: snikumbh
%
% Edits: 06, February, 2017, By: snikumbh
% All segments, shifted and non-shifted in one bag: non-shifted segments followed by shifted segments.
% End result, #kernels = #clusterCentres
% All segments, shifted and non-shifted in one bag; order of segments: non-shifted segments followed by shifted segments.
% This results in #kernels = #clusterCentres
%
% Edits: 10 April, 2017, By: snikumbh
% debugLevel 2 prints all messages
% debugLevel 1 place holder level, may be used in the future
% debugLevel 0 silent on most of the messages, prints very few messages for the user

% Set seed for the Matlab session for reproducibility
% Happens in the wrapper function, already
% RandStream('mt19937ar','seed',1);
% Setting of seed for the Matlab session for reproducibility: already done in the wrapper function.

% Handle arguments and values/defaults
totalArguments = 20;
Expand All @@ -33,31 +29,31 @@

% Read positives
logMessages(debugMsgLocation,sprintf('Positives...'), debugLevel);
[nPos, ommittedPosIndices, allSeqsRawPosFasta] = readFastaSequences(givenPosFastaFilename, chunkSizeInBps, givenNPos, outputFolder, debugLevel, debugMsgLocation);
% nPos and nNeg be could smaller than equal to givenNPos and givenNNeg resp. due to ommitting any sequences
% whose length is less than the chunkSizeInBps
[nPos, omittedPosIndices, allSeqsRawPosFasta] = readFastaSequences(givenPosFastaFilename, segmentSizeInBps, givenNPos, outputFolder, debugLevel, debugMsgLocation);
% nPos and nNeg could be smaller than or equal to givenNPos and givenNNeg resp.
% due to omitting any sequences whose length is less than the segmentSizeInBps
%
% Read Negatives
logMessages(debugMsgLocation,sprintf('Negatives...'), debugLevel);
[nNeg, ommittedNegIndices, allSeqsRawNegFasta] = readFastaSequences(givenNegFastaFilename, chunkSizeInBps, givenNNeg, outputFolder, debugLevel, debugMsgLocation);
[nNeg, omittedNegIndices, allSeqsRawNegFasta] = readFastaSequences(givenNegFastaFilename, segmentSizeInBps, givenNNeg, outputFolder, debugLevel, debugMsgLocation);
%
% Combine positives + negatives
nBags = nPos + nNeg;
% Check if we need to update the testIndices if some sequences have been ommitted
% if posIndices and negIndices are empty, then no update required
if ~isempty(ommittedPosIndices)
%nOmmitted = length(find(ismember(testIndices, ommittedPosIndices)));
testIndices(find(ismember(testIndices, ommittedPosIndices))) = [];
%logMessages(debugMsgLocation,sprintf('%d positive sequences ommitted from test set.\n', nOmmitted), debugLevel);
% Check if we need to update the testIndices if some sequences have been omitted
% if posIndices and negIndices are empty, then no updates required
if ~isempty(omittedPosIndices)
%nOmmitted = length(find(ismember(testIndices, omittedPosIndices)));
testIndices(find(ismember(testIndices, omittedPosIndices))) = [];
%logMessages(debugMsgLocation,sprintf('%d positive sequences omitted from test set.\n', nOmmitted), debugLevel);
end
if ~isempty(ommittedNegIndices)
%nOmmitted = length(find(ismember(testIndices, ommittedNegIndices)));
testIndices(find(ismember(testIndices, nPos+ommittedNegIndices))) = [];%offset by nPos
%logMessages(debugMsgLocation,sprintf('%d negative sequences ommitted from test set.\n', nOmmitted), debugLevel);
if ~isempty(omittedNegIndices)
%nOmmitted = length(find(ismember(testIndices, omittedNegIndices)));
testIndices(find(ismember(testIndices, nPos+omittedNegIndices))) = [];%offset by nPos
%logMessages(debugMsgLocation,sprintf('%d negative sequences omitted from test set.\n', nOmmitted), debugLevel);
end
% Additionally, one needs to account for the reduction in the number of sequences affecting the indices. Remove indices > nBags
%testIndices = testIndices(testIndices <= nBags);
for i=1:length(testIndices)
testIndices(i) = testIndices(i) - nnz(ommittedPosIndices < testIndices(i)) - nnz( (nPos+ommittedNegIndices) < testIndices(i));
testIndices(i) = testIndices(i) - nnz(omittedPosIndices < testIndices(i)) - nnz( (nPos+omittedNegIndices) < testIndices(i));
end
% Put them together
allSeqsRawFasta.sequence = cell(1, nBags);
Expand All @@ -69,35 +65,34 @@
allSeqsRawFasta.sequence{i+nPos} = allSeqsRawNegFasta.sequence{i};%offset by nPos
end
clear allSeqsRawNegFasta;
% Write the revised set test indices to disk for reproducibility
% Write the revised set of test indices to disk to support reproducibility

if ~isempty(ommittedPosIndices) | ~isempty(ommittedNegIndices)
if ~isempty(omittedPosIndices) | ~isempty(omittedNegIndices)
sortedTestIndices = sort(testIndices);
dlmwrite(strcat(outputFolder, '/testIndices_New.txt'), sortedTestIndices');
end

% generate labels
% handle imbalance, multiply this to C for negatives
Y = [ones(1, nPos) -ones(1, nNeg)];
logMessages(debugMsgLocation,sprintf('#Bags: %d\n', nBags), debugLevel);

% default for 'acgt'
alphabet = 'acgt';
% for the update, numDist = maxDist + 1;
%fvLength = (4^oligoLen)^2 * numDist;
chunkSizeInPercentage = 0; %default
% non-shifted chunks
% setting default value for segmentSizePercentage.
% Currently, this is the only valid value.
segmentSizeInPercentage = 0;
logMessages(debugMsgLocation,sprintf('All bags...'), debugLevel);
tic;
idx1 = 0;
thisInstances = [];

for k=1:nBags
%if oligoLen == 3
if oligoLen >= 3
% because, for oligoLen 3 or larger, this may take some time.
if rem(k,100) == 0
logMessages(debugMsgLocation,sprintf('%d--', k), debugLevel);
end
%end
end
if any(k == testIndices)
% do-nothing now, we collect them later
% Collection of test sequences can take place later,
Expand All @@ -106,12 +101,12 @@
idx1 = idx1 + 1;
%% Non-shifted
[tempNS, NS] = getODHFeatureVecInstances(allSeqsRawFasta.sequence{k}, ...
oligoLen, maxDist, chunkSizeInPercentage, ...
chunkSizeInBps, 'no-shift', alphabet);
oligoLen, maxDist, segmentSizeInPercentage, ...
segmentSizeInBps, 'no-shift', alphabet);
%% Shifted
[tempS, S] = getODHFeatureVecInstances(allSeqsRawFasta.sequence{k}, ...
oligoLen, maxDist, chunkSizeInPercentage, ...
chunkSizeInBps, 'shift', alphabet);
oligoLen, maxDist, segmentSizeInPercentage, ...
segmentSizeInBps, 'shift', alphabet);
allSeqsAsBags{idx1} = [tempNS tempS];
thisInstances = [thisInstances (NS + S)];
end
Expand All @@ -131,12 +126,12 @@
nInstances = instanceEnds(end);
logMessages(debugMsgLocation,sprintf('Total instances from all bags: %d\n', nInstances), debugLevel);

direct = 0;
% Using this flag for direct computation from bags (by setting to 1) instead of the 'sparseComputation' flag.
% IMP: the 'direct' flag uses parfor for speeding up the instance-wide kernel computation.
% Using the sparseComputation flag for computing the instance-wide kernel.
% IMP: setting sparseComputation to '0' uses the brokendown approach to compute the instance-wide kernel.
% This hasn't been tested for either correctness or efficiency. We recommend using sparseComputation.
% We are using sparseComputation.
sparseComputation = 1;
[instanceStarts, instanceEnds, instanceWideKernel] = computeStdMultiInstanceKernel(allSeqsAsBags, thisInstances, direct, sparseComputation, debugLevel, debugMsgLocation);
[instanceStarts, instanceEnds, instanceWideKernel] = computeStdMultiInstanceKernel(allSeqsAsBags, thisInstances, sparseComputation, debugLevel, debugMsgLocation);
%
% Further are functions that will be recomputed per inner cross-validation iteration
% Cross-validation loop is also handled simultaneously
Expand All @@ -155,12 +150,12 @@

Youtertest = Y(testIndices);
trainIndicesInY = setdiff([1:nBags], testIndices);
% trainIndicesInY = [1:nBagsForTrain];
trainIndicesInBags = [1:nBagsForTrain];
YwholeTrain = Y(trainIndicesInY);

nPosTr = size(find(YwholeTrain > 0),2);
nNegTr = size(find(YwholeTrain < 0),2);
% handle imbalance, multiply this to C for negatives
imbalance = nPosTr/nNegTr;
logMessages(debugMsgLocation,sprintf('Imbalance: %.2f\n', imbalance), debugLevel);

Expand All @@ -174,7 +169,7 @@
foldIdForIndices = cvpartition(YwholeTrain, 'Kfold', nFolds);
% ^returns logicals
%
% 5. All kernels ready. Pose this as a multiple kernel learning problem next.
% 4. All kernels ready. Pose this as a multiple kernel learning problem next.
% - We have used shogun's python modular interface to perform MKL.
% - Upon performing this MKL, the kernel weights obtained are squared theta values (see paper for details)
%
Expand Down Expand Up @@ -221,7 +216,7 @@
trainConformedKernel, testConformedKernel, outputFolder, whetherPerformTest, debugLevel, debugMsgLocation);
% both subkernelWeights and predictions are zero vectors, if solving MKL failed. Treat this here.
if sum(any(subkernelWeights)) == 0 && sum(any(predictions)) == 0
% set the following variables: this_teAUROC, this_teAUPRC
% set the following variables to 0.0. a) this_teAUROC, b) this_teAUPRC
this_teAUROC = 0.0;
this_teAUPRC = 0.0;
else %solving MKL was fine
Expand Down Expand Up @@ -258,7 +253,7 @@
clear trainConformedKernel;
clear testConformedKernel;
%
% Use the best param combinations to train using the whole trainiing set and predict the test set here
% 5. Use the best param combinations to train using the whole trainiing set and predict the test set here
%
logMessages(debugMsgLocation,sprintf('---Re-training and Test---\n'), debugLevel);
% Re-train the model with all train instances together
Expand All @@ -273,8 +268,7 @@
computeConformedMultiInstanceKernel(instanceStarts, instanceEnds, instanceWideKernel, ...
trainIndicesInBags, allSeqsAsBags, nClusters, conformalXformationParam, ...
YwholeTrain, computationVersion, debugLevel, debugMsgLocation);
%fprintf('\ndone in %.3f seconds\n', toc);

%
%
for k=1:nClusters
% Conformed kernels
Expand All @@ -284,13 +278,14 @@
end
%
whetherPerformTest = 0;
% We now set this to 0, since we will make predictions using computed weight vector
% We now set this to 0, since we will make predictions using computed weight vector and not the kernel
logMessages(debugMsgLocation,sprintf('Imbalance: %.2f\n', imbalance), debugLevel);
%
[subkernelWeights, predictions] = performMKLWithShogunPython(bestParamComb.best_C, YwholeTrain, Youtertest, mklNorm, imbalance, ...
VtrainConformedKernel, VtestConformedKernel, ...
outputFolder, whetherPerformTest, debugLevel, debugMsgLocation);
% Ideally, this is not need here, since such param values won't (or shouldn't) be selected as best values
% Ideally, this is not needed here, since such param values won't (or shouldn't) be selected as best values
% If that does happen, something is probably wrong.
if sum(any(subkernelWeights)) == 0 && sum(any(predictions)) == 0
% set the following variables: train_AUROC, train_AUPRC
train_AUROC = 0.0;
Expand All @@ -302,7 +297,7 @@
logMessages(debugMsgLocation,sprintf('train_AUROC with whole training set: %.4f\n', train_AUROC), debugLevel);
% Below, we make our own predictions for the test examples using the weight vector

% We will now collect the test examples
% 6.1 We will now collect the test examples
tic;
logMessages(debugMsgLocation, sprintf('Now collecting the unseen test examples...'), debugLevel);
idx1 = 0;
Expand All @@ -313,23 +308,24 @@
idx1 = idx1 + 1;
%% Non-shifted
[tempNS, NS] = getODHFeatureVecInstances(allSeqsRawFasta.sequence{k}, ...
oligoLen, maxDist, chunkSizeInPercentage, ...
chunkSizeInBps, 'no-shift', alphabet);
oligoLen, maxDist, segmentSizeInPercentage, ...
segmentSizeInBps, 'no-shift', alphabet);
%% Shifted
[tempS, S] = getODHFeatureVecInstances(allSeqsRawFasta.sequence{k}, ...
oligoLen, maxDist, chunkSizeInPercentage, ...
chunkSizeInBps, 'shift', alphabet);
oligoLen, maxDist, segmentSizeInPercentage, ...
segmentSizeInBps, 'shift', alphabet);
allSeqsAsBags_Test{idx1} = [tempNS tempS];
end
end
clear allSeqsRawFasta;
logMessages(debugMsgLocation, sprintf('done in %.3f seconds\n', toc), debugLevel);
% clears up some space useful to load the many test examples as is typical in problems in biology
% clears up some space useful to load the many test examples as is typical in problems in computational biology
if(predictUsingWeightVector == 1)
logMessages(debugMsgLocation, sprintf('---Prediction using weight vector---\n'), debugLevel);
% 6.1 Use the subkernel weights and get transformed instances among the test examples, may be use applyTransformation.m
% 6.2 Use the subkernel weights and get transformed instances among the test examples
% instanceWeights for instances in training bags
[thetaVals, instanceWeightsInEachBag] = getInstanceWeights(subkernelWeights, allSeqsTransformationKernel, 'samarth_train1.csv');
% filename passed as argument to getInstanceWeights is used to write them to file.
[thetaVals, instanceWeightsInEachBag] = getInstanceWeights(subkernelWeights, allSeqsTransformationKernel);
% Get the transformations for test examples, and the diagonal elements which will be used for normalization further
% Should one proceed block-wise at this juncture if the number of test examples is extremely large as is typically the case?
%
Expand All @@ -339,18 +335,19 @@
logMessages(debugMsgLocation, sprintf('done in %.3f seconds\n', toc), debugLevel);
%
% Getting the instance weightings for the test examples
[thetaVals, instanceWeightsInEachBag_Test] = getInstanceWeights(subkernelWeights, allSeqsTransformationKernel_Test, 'samarth_test1.csv');
[thetaVals, instanceWeightsInEachBag_Test] = getInstanceWeights(subkernelWeights, allSeqsTransformationKernel_Test);
% At this point, all (train + test examples) instance weights have been obtained.
%
% 6.2 Use function obtainWeightVector and procure the weight vectors and the bias value
% 6.3 Use function obtainWeightVector and procure the weight vectors and the bias value
% The weight vector need not be ordered by rank. Rather, we need it by its original order.
tic;
logMessages(debugMsgLocation, sprintf('Computing the weight vector...'), debugLevel);
[overallWeightVector, weightVectors, biasValue] = obtainWeightVector(char(outputFolder), oligoLen, maxDist, subkernelWeights, allSeqsAsBags, rawConformedSetKernel, allSeqsTransformationKernel, debugLevel, debugMsgLocation);
logMessages(debugMsgLocation, sprintf('done in %.3f seconds\n', toc), debugLevel);
% Don't need allSeqsAsBags, allSeqsTransformationKernel?
%clear allSeqsAsBags allSeqsTransformationKernel;
% 6.3 Use these to make predictions using y = w*x + b
%% Don't need allSeqsAsBags, allSeqsTransformationKernel?
% clear allSeqsAsBags allSeqsTransformationKernel;
%
% 6.4 Use these to make predictions using y = w*x + b
yhatTemp = 0.0;
%
% bagPhi for Test is nFeatures-by-nBagForTest-by-nClusters
Expand Down Expand Up @@ -388,15 +385,11 @@
end
%
%
% Write the instanceWeightInEachBag to file
%dlmwrite('samarth_test.csv', instanceWeightsInEachBag, '-append');
%disp(instanceWeightsInEachBag{1});
%disp(size(instanceWeightsInEachBag));
%disp(size(instanceWeightsInEachBag{1}));
%disp(size(instanceWeightsInEachBag{2}));
%for i=1:size(instanceWeightsInEachBag,2)
% dlmwrite('samarth_test.csv', cell2mat(instanceWeightsInEachBag{i})', '-append');
%end
%% Write the instanceWeightInEachBag to file
% dlmwrite('instanceWeights_test.csv', instanceWeightsInEachBag, '-append');
% for i=1:size(instanceWeightsInEachBag,2)
% dlmwrite('instanceWeights_test.csv', cell2mat(instanceWeightsInEachBag{i})', '-append');
% end

if strcmp(whetherToPlotHeatmap, 'Yes')
logMessages(debugMsgLocation,sprintf('Plotting heatmaps...'), debugLevel);
Expand All @@ -406,14 +399,14 @@
normInstanceWeightsInEachBag = standardizeMatrix(cell2mat(instanceWeightsInEachBag)');
end
%
heatmapFname = strcat(outputFolder,'/Heatmap-InstanceWeights', '_oligoLen', num2str(oligoLen), '_segmentSize', num2str(chunkSizeInBps) , '_maxDist', num2str(maxDist), '_nClusters', num2str(bestParamComb.best_nClusters), '_svmC', num2str(bestParamComb.best_C) ,'_sigma', num2str(bestParamComb.best_sigma), '.pdf');
heatmapFname = strcat(outputFolder,'/Heatmap-InstanceWeights', '_oligoLen', num2str(oligoLen), '_segmentSize', num2str(segmentSizeInBps) , '_maxDist', num2str(maxDist), '_nClusters', num2str(bestParamComb.best_nClusters), '_svmC', num2str(bestParamComb.best_C) ,'_sigma', num2str(bestParamComb.best_sigma), '.pdf');
%
p = plot_heatmap(normInstanceWeightsInEachBag, heatmapFname, 0, 1);
logMessages(debugMsgLocation,sprintf('done!\n'), debugLevel);
%
end
%
% 7. Visualize weight vector as motifs
% 7. Visualize weight vectors and motifs
%
if strcmp(whetherToVisualizeWVector, 'Yes')
logMessages(debugMsgLocation,sprintf('Visualizing weight vectors...\n'), debugLevel);
Expand Down
2 changes: 1 addition & 1 deletion computeInstanceWideKernel.m
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
function [instanceStarts, instanceEnds, instanceWideKernel] = computeInstanceWideKernel(allSeqsAsBags, thisInstances, direct, sparseComputation, debugLevel, debugMsgLocation)
function [instanceStarts, instanceEnds, instanceWideKernel] = computeInstanceWideKernel(allSeqsAsBags, thisInstances, sparseComputation, debugLevel, debugMsgLocation)

% COMPUTEINSTANCEWIDEKERNEL
%
Expand Down
23 changes: 10 additions & 13 deletions getInstanceWeights.m
Original file line number Diff line number Diff line change
@@ -1,10 +1,9 @@
function [thetaVals, instanceWeightsInEachBag] = getInstanceWeights(kernelWeights, allSeqsTransformationKernel, fname)
function [thetaVals, instanceWeightsInEachBag] = getInstanceWeights(kernelWeights, allSeqsTransformationKernel)

%
% Return the thetaals and
% instance weights for non-shifted and shifted bags
% Return the theta values and instance weights
% for all bags
%

% following a squared relation
% order followed is: non-shifted kernels (1 to N) first, then shifted kernels (N+1 to 2N) in total 2N kernels
% Edit: now number of kernels is same as number of clusters
Expand All @@ -24,15 +23,13 @@
size(allSeqsTransformationKernel{j},1), 1) ...
.* allSeqsTransformationKernel{j}, 2);
instanceWeightsInEachBag{j} = temp;
% To write to file
stdTemp = standardizeMatrix(temp');
finalTemp = zeros(size(stdTemp));
%using rankings
[~, ranking] = sort(stdTemp(1:size(stdTemp,1),1:size(stdTemp,2)), 2, 'descend');
%for i=1:size(plotDataM,1)
finalTemp(1,ranking) = 1:size(stdTemp,2);
%end
dlmwrite(fname, finalTemp, '-append');
%% if to be written to file, pass fname to the function
%stdTemp = standardizeMatrix(temp');
%finalTemp = zeros(size(stdTemp));
%% Using rankings
%[~, ranking] = sort(stdTemp(1:size(stdTemp,1),1:size(stdTemp,2)), 2, 'descend');
%finalTemp(1,ranking) = 1:size(stdTemp,2);
%dlmwrite(fname, finalTemp, '-append');
end


Expand Down

0 comments on commit 88fd7fa

Please sign in to comment.