diff --git a/comik_main_with_weight_vector.m b/comik_main_with_weight_vector.m index 31cf9a6..192c8f7 100644 --- a/comik_main_with_weight_vector.m +++ b/comik_main_with_weight_vector.m @@ -1,19 +1,15 @@ -function [allSeqsAsBags, allSeqsConformedSetKernel, subkernelWeights, thetaVals, instanceWeightsInEachBag, resultString, bestParamComb, test_teAUROC, test_teAUPRC, predictions] = comik_main_with_weight_vector(givenPosFastaFilename, givenNegFastaFilename, givenNPos, givenNNeg, oligoLen, maxDist, chunkSizeInBps, nClusterVals, sigmaVals, Cs, mklNorm, nFolds, testIndices, debugLevel, debugMsgLocation, outputFolder, runSummaryFilename, whetherToPlotHeatmap, computationVersion, whetherToVisualizeWVector) +function [allSeqsAsBags, allSeqsConformedSetKernel, subkernelWeights, thetaVals, instanceWeightsInEachBag, resultString, bestParamComb, test_teAUROC, test_teAUPRC, predictions] = comik_main_with_weight_vector(givenPosFastaFilename, givenNegFastaFilename, givenNPos, givenNNeg, oligoLen, maxDist, segmentSizeInBps, nClusterVals, sigmaVals, Cs, mklNorm, nFolds, testIndices, debugLevel, debugMsgLocation, outputFolder, runSummaryFilename, whetherToPlotHeatmap, computationVersion, whetherToVisualizeWVector) % Author: snikumbh % -% Edits: 06, February, 2017, By: snikumbh -% All segments, shifted and non-shifted in one bag: non-shifted segments followed by shifted segments. -% End result, #kernels = #clusterCentres +% All segments, shifted and non-shifted in one bag; order of segments: non-shifted segments followed by shifted segments. +% This results in #kernels = #clusterCentres % -% Edits: 10 April, 2017, By: snikumbh % debugLevel 2 prints all messages % debugLevel 1 place holder level, may be used in the future % debugLevel 0 silent on most of the messages, prints very few messages for the user -% Set seed for the Matlab session for reproducibility -% Happens in the wrapper function, already -% RandStream('mt19937ar','seed',1); +% Setting of seed for the Matlab session for reproducibility: already done in the wrapper function. % Handle arguments and values/defaults totalArguments = 20; @@ -33,31 +29,31 @@ % Read positives logMessages(debugMsgLocation,sprintf('Positives...'), debugLevel); -[nPos, ommittedPosIndices, allSeqsRawPosFasta] = readFastaSequences(givenPosFastaFilename, chunkSizeInBps, givenNPos, outputFolder, debugLevel, debugMsgLocation); -% nPos and nNeg be could smaller than equal to givenNPos and givenNNeg resp. due to ommitting any sequences -% whose length is less than the chunkSizeInBps +[nPos, omittedPosIndices, allSeqsRawPosFasta] = readFastaSequences(givenPosFastaFilename, segmentSizeInBps, givenNPos, outputFolder, debugLevel, debugMsgLocation); +% nPos and nNeg could be smaller than or equal to givenNPos and givenNNeg resp. +% due to omitting any sequences whose length is less than the segmentSizeInBps % % Read Negatives logMessages(debugMsgLocation,sprintf('Negatives...'), debugLevel); -[nNeg, ommittedNegIndices, allSeqsRawNegFasta] = readFastaSequences(givenNegFastaFilename, chunkSizeInBps, givenNNeg, outputFolder, debugLevel, debugMsgLocation); +[nNeg, omittedNegIndices, allSeqsRawNegFasta] = readFastaSequences(givenNegFastaFilename, segmentSizeInBps, givenNNeg, outputFolder, debugLevel, debugMsgLocation); +% % Combine positives + negatives nBags = nPos + nNeg; -% Check if we need to update the testIndices if some sequences have been ommitted -% if posIndices and negIndices are empty, then no update required -if ~isempty(ommittedPosIndices) - %nOmmitted = length(find(ismember(testIndices, ommittedPosIndices))); - testIndices(find(ismember(testIndices, ommittedPosIndices))) = []; - %logMessages(debugMsgLocation,sprintf('%d positive sequences ommitted from test set.\n', nOmmitted), debugLevel); +% Check if we need to update the testIndices if some sequences have been omitted +% if posIndices and negIndices are empty, then no updates required +if ~isempty(omittedPosIndices) + %nOmmitted = length(find(ismember(testIndices, omittedPosIndices))); + testIndices(find(ismember(testIndices, omittedPosIndices))) = []; + %logMessages(debugMsgLocation,sprintf('%d positive sequences omitted from test set.\n', nOmmitted), debugLevel); end -if ~isempty(ommittedNegIndices) - %nOmmitted = length(find(ismember(testIndices, ommittedNegIndices))); - testIndices(find(ismember(testIndices, nPos+ommittedNegIndices))) = [];%offset by nPos - %logMessages(debugMsgLocation,sprintf('%d negative sequences ommitted from test set.\n', nOmmitted), debugLevel); +if ~isempty(omittedNegIndices) + %nOmmitted = length(find(ismember(testIndices, omittedNegIndices))); + testIndices(find(ismember(testIndices, nPos+omittedNegIndices))) = [];%offset by nPos + %logMessages(debugMsgLocation,sprintf('%d negative sequences omitted from test set.\n', nOmmitted), debugLevel); end % Additionally, one needs to account for the reduction in the number of sequences affecting the indices. Remove indices > nBags -%testIndices = testIndices(testIndices <= nBags); for i=1:length(testIndices) - testIndices(i) = testIndices(i) - nnz(ommittedPosIndices < testIndices(i)) - nnz( (nPos+ommittedNegIndices) < testIndices(i)); + testIndices(i) = testIndices(i) - nnz(omittedPosIndices < testIndices(i)) - nnz( (nPos+omittedNegIndices) < testIndices(i)); end % Put them together allSeqsRawFasta.sequence = cell(1, nBags); @@ -69,35 +65,34 @@ allSeqsRawFasta.sequence{i+nPos} = allSeqsRawNegFasta.sequence{i};%offset by nPos end clear allSeqsRawNegFasta; -% Write the revised set test indices to disk for reproducibility +% Write the revised set of test indices to disk to support reproducibility -if ~isempty(ommittedPosIndices) | ~isempty(ommittedNegIndices) +if ~isempty(omittedPosIndices) | ~isempty(omittedNegIndices) sortedTestIndices = sort(testIndices); dlmwrite(strcat(outputFolder, '/testIndices_New.txt'), sortedTestIndices'); end % generate labels -% handle imbalance, multiply this to C for negatives Y = [ones(1, nPos) -ones(1, nNeg)]; logMessages(debugMsgLocation,sprintf('#Bags: %d\n', nBags), debugLevel); % default for 'acgt' alphabet = 'acgt'; -% for the update, numDist = maxDist + 1; -%fvLength = (4^oligoLen)^2 * numDist; -chunkSizeInPercentage = 0; %default -% non-shifted chunks +% setting default value for segmentSizePercentage. +% Currently, this is the only valid value. +segmentSizeInPercentage = 0; logMessages(debugMsgLocation,sprintf('All bags...'), debugLevel); tic; idx1 = 0; thisInstances = []; for k=1:nBags - %if oligoLen == 3 + if oligoLen >= 3 + % because, for oligoLen 3 or larger, this may take some time. if rem(k,100) == 0 logMessages(debugMsgLocation,sprintf('%d--', k), debugLevel); end - %end + end if any(k == testIndices) % do-nothing now, we collect them later % Collection of test sequences can take place later, @@ -106,12 +101,12 @@ idx1 = idx1 + 1; %% Non-shifted [tempNS, NS] = getODHFeatureVecInstances(allSeqsRawFasta.sequence{k}, ... - oligoLen, maxDist, chunkSizeInPercentage, ... - chunkSizeInBps, 'no-shift', alphabet); + oligoLen, maxDist, segmentSizeInPercentage, ... + segmentSizeInBps, 'no-shift', alphabet); %% Shifted [tempS, S] = getODHFeatureVecInstances(allSeqsRawFasta.sequence{k}, ... - oligoLen, maxDist, chunkSizeInPercentage, ... - chunkSizeInBps, 'shift', alphabet); + oligoLen, maxDist, segmentSizeInPercentage, ... + segmentSizeInBps, 'shift', alphabet); allSeqsAsBags{idx1} = [tempNS tempS]; thisInstances = [thisInstances (NS + S)]; end @@ -131,12 +126,12 @@ nInstances = instanceEnds(end); logMessages(debugMsgLocation,sprintf('Total instances from all bags: %d\n', nInstances), debugLevel); -direct = 0; -% Using this flag for direct computation from bags (by setting to 1) instead of the 'sparseComputation' flag. -% IMP: the 'direct' flag uses parfor for speeding up the instance-wide kernel computation. +% Using the sparseComputation flag for computing the instance-wide kernel. +% IMP: setting sparseComputation to '0' uses the brokendown approach to compute the instance-wide kernel. +% This hasn't been tested for either correctness or efficiency. We recommend using sparseComputation. % We are using sparseComputation. sparseComputation = 1; -[instanceStarts, instanceEnds, instanceWideKernel] = computeStdMultiInstanceKernel(allSeqsAsBags, thisInstances, direct, sparseComputation, debugLevel, debugMsgLocation); +[instanceStarts, instanceEnds, instanceWideKernel] = computeStdMultiInstanceKernel(allSeqsAsBags, thisInstances, sparseComputation, debugLevel, debugMsgLocation); % % Further are functions that will be recomputed per inner cross-validation iteration % Cross-validation loop is also handled simultaneously @@ -155,12 +150,12 @@ Youtertest = Y(testIndices); trainIndicesInY = setdiff([1:nBags], testIndices); -% trainIndicesInY = [1:nBagsForTrain]; trainIndicesInBags = [1:nBagsForTrain]; YwholeTrain = Y(trainIndicesInY); nPosTr = size(find(YwholeTrain > 0),2); nNegTr = size(find(YwholeTrain < 0),2); +% handle imbalance, multiply this to C for negatives imbalance = nPosTr/nNegTr; logMessages(debugMsgLocation,sprintf('Imbalance: %.2f\n', imbalance), debugLevel); @@ -174,7 +169,7 @@ foldIdForIndices = cvpartition(YwholeTrain, 'Kfold', nFolds); % ^returns logicals % - % 5. All kernels ready. Pose this as a multiple kernel learning problem next. + % 4. All kernels ready. Pose this as a multiple kernel learning problem next. % - We have used shogun's python modular interface to perform MKL. % - Upon performing this MKL, the kernel weights obtained are squared theta values (see paper for details) % @@ -221,7 +216,7 @@ trainConformedKernel, testConformedKernel, outputFolder, whetherPerformTest, debugLevel, debugMsgLocation); % both subkernelWeights and predictions are zero vectors, if solving MKL failed. Treat this here. if sum(any(subkernelWeights)) == 0 && sum(any(predictions)) == 0 - % set the following variables: this_teAUROC, this_teAUPRC + % set the following variables to 0.0. a) this_teAUROC, b) this_teAUPRC this_teAUROC = 0.0; this_teAUPRC = 0.0; else %solving MKL was fine @@ -258,7 +253,7 @@ clear trainConformedKernel; clear testConformedKernel; % -% Use the best param combinations to train using the whole trainiing set and predict the test set here +% 5. Use the best param combinations to train using the whole trainiing set and predict the test set here % logMessages(debugMsgLocation,sprintf('---Re-training and Test---\n'), debugLevel); % Re-train the model with all train instances together @@ -273,8 +268,7 @@ computeConformedMultiInstanceKernel(instanceStarts, instanceEnds, instanceWideKernel, ... trainIndicesInBags, allSeqsAsBags, nClusters, conformalXformationParam, ... YwholeTrain, computationVersion, debugLevel, debugMsgLocation); -%fprintf('\ndone in %.3f seconds\n', toc); - +% % for k=1:nClusters % Conformed kernels @@ -284,13 +278,14 @@ end % whetherPerformTest = 0; -% We now set this to 0, since we will make predictions using computed weight vector +% We now set this to 0, since we will make predictions using computed weight vector and not the kernel logMessages(debugMsgLocation,sprintf('Imbalance: %.2f\n', imbalance), debugLevel); % [subkernelWeights, predictions] = performMKLWithShogunPython(bestParamComb.best_C, YwholeTrain, Youtertest, mklNorm, imbalance, ... VtrainConformedKernel, VtestConformedKernel, ... outputFolder, whetherPerformTest, debugLevel, debugMsgLocation); -% Ideally, this is not need here, since such param values won't (or shouldn't) be selected as best values +% Ideally, this is not needed here, since such param values won't (or shouldn't) be selected as best values +% If that does happen, something is probably wrong. if sum(any(subkernelWeights)) == 0 && sum(any(predictions)) == 0 % set the following variables: train_AUROC, train_AUPRC train_AUROC = 0.0; @@ -302,7 +297,7 @@ logMessages(debugMsgLocation,sprintf('train_AUROC with whole training set: %.4f\n', train_AUROC), debugLevel); % Below, we make our own predictions for the test examples using the weight vector -% We will now collect the test examples +% 6.1 We will now collect the test examples tic; logMessages(debugMsgLocation, sprintf('Now collecting the unseen test examples...'), debugLevel); idx1 = 0; @@ -313,23 +308,24 @@ idx1 = idx1 + 1; %% Non-shifted [tempNS, NS] = getODHFeatureVecInstances(allSeqsRawFasta.sequence{k}, ... - oligoLen, maxDist, chunkSizeInPercentage, ... - chunkSizeInBps, 'no-shift', alphabet); + oligoLen, maxDist, segmentSizeInPercentage, ... + segmentSizeInBps, 'no-shift', alphabet); %% Shifted [tempS, S] = getODHFeatureVecInstances(allSeqsRawFasta.sequence{k}, ... - oligoLen, maxDist, chunkSizeInPercentage, ... - chunkSizeInBps, 'shift', alphabet); + oligoLen, maxDist, segmentSizeInPercentage, ... + segmentSizeInBps, 'shift', alphabet); allSeqsAsBags_Test{idx1} = [tempNS tempS]; end end clear allSeqsRawFasta; logMessages(debugMsgLocation, sprintf('done in %.3f seconds\n', toc), debugLevel); -% clears up some space useful to load the many test examples as is typical in problems in biology +% clears up some space useful to load the many test examples as is typical in problems in computational biology if(predictUsingWeightVector == 1) logMessages(debugMsgLocation, sprintf('---Prediction using weight vector---\n'), debugLevel); - % 6.1 Use the subkernel weights and get transformed instances among the test examples, may be use applyTransformation.m + % 6.2 Use the subkernel weights and get transformed instances among the test examples % instanceWeights for instances in training bags - [thetaVals, instanceWeightsInEachBag] = getInstanceWeights(subkernelWeights, allSeqsTransformationKernel, 'samarth_train1.csv'); + % filename passed as argument to getInstanceWeights is used to write them to file. + [thetaVals, instanceWeightsInEachBag] = getInstanceWeights(subkernelWeights, allSeqsTransformationKernel); % Get the transformations for test examples, and the diagonal elements which will be used for normalization further % Should one proceed block-wise at this juncture if the number of test examples is extremely large as is typically the case? % @@ -339,18 +335,19 @@ logMessages(debugMsgLocation, sprintf('done in %.3f seconds\n', toc), debugLevel); % % Getting the instance weightings for the test examples - [thetaVals, instanceWeightsInEachBag_Test] = getInstanceWeights(subkernelWeights, allSeqsTransformationKernel_Test, 'samarth_test1.csv'); + [thetaVals, instanceWeightsInEachBag_Test] = getInstanceWeights(subkernelWeights, allSeqsTransformationKernel_Test); % At this point, all (train + test examples) instance weights have been obtained. % - % 6.2 Use function obtainWeightVector and procure the weight vectors and the bias value + % 6.3 Use function obtainWeightVector and procure the weight vectors and the bias value % The weight vector need not be ordered by rank. Rather, we need it by its original order. tic; logMessages(debugMsgLocation, sprintf('Computing the weight vector...'), debugLevel); [overallWeightVector, weightVectors, biasValue] = obtainWeightVector(char(outputFolder), oligoLen, maxDist, subkernelWeights, allSeqsAsBags, rawConformedSetKernel, allSeqsTransformationKernel, debugLevel, debugMsgLocation); logMessages(debugMsgLocation, sprintf('done in %.3f seconds\n', toc), debugLevel); - % Don't need allSeqsAsBags, allSeqsTransformationKernel? - %clear allSeqsAsBags allSeqsTransformationKernel; - % 6.3 Use these to make predictions using y = w*x + b + %% Don't need allSeqsAsBags, allSeqsTransformationKernel? + % clear allSeqsAsBags allSeqsTransformationKernel; + % + % 6.4 Use these to make predictions using y = w*x + b yhatTemp = 0.0; % % bagPhi for Test is nFeatures-by-nBagForTest-by-nClusters @@ -388,15 +385,11 @@ end % % -% Write the instanceWeightInEachBag to file -%dlmwrite('samarth_test.csv', instanceWeightsInEachBag, '-append'); -%disp(instanceWeightsInEachBag{1}); -%disp(size(instanceWeightsInEachBag)); -%disp(size(instanceWeightsInEachBag{1})); -%disp(size(instanceWeightsInEachBag{2})); -%for i=1:size(instanceWeightsInEachBag,2) -% dlmwrite('samarth_test.csv', cell2mat(instanceWeightsInEachBag{i})', '-append'); -%end +%% Write the instanceWeightInEachBag to file +% dlmwrite('instanceWeights_test.csv', instanceWeightsInEachBag, '-append'); +% for i=1:size(instanceWeightsInEachBag,2) +% dlmwrite('instanceWeights_test.csv', cell2mat(instanceWeightsInEachBag{i})', '-append'); +% end if strcmp(whetherToPlotHeatmap, 'Yes') logMessages(debugMsgLocation,sprintf('Plotting heatmaps...'), debugLevel); @@ -406,14 +399,14 @@ normInstanceWeightsInEachBag = standardizeMatrix(cell2mat(instanceWeightsInEachBag)'); end % - heatmapFname = strcat(outputFolder,'/Heatmap-InstanceWeights', '_oligoLen', num2str(oligoLen), '_segmentSize', num2str(chunkSizeInBps) , '_maxDist', num2str(maxDist), '_nClusters', num2str(bestParamComb.best_nClusters), '_svmC', num2str(bestParamComb.best_C) ,'_sigma', num2str(bestParamComb.best_sigma), '.pdf'); + heatmapFname = strcat(outputFolder,'/Heatmap-InstanceWeights', '_oligoLen', num2str(oligoLen), '_segmentSize', num2str(segmentSizeInBps) , '_maxDist', num2str(maxDist), '_nClusters', num2str(bestParamComb.best_nClusters), '_svmC', num2str(bestParamComb.best_C) ,'_sigma', num2str(bestParamComb.best_sigma), '.pdf'); % p = plot_heatmap(normInstanceWeightsInEachBag, heatmapFname, 0, 1); logMessages(debugMsgLocation,sprintf('done!\n'), debugLevel); % end % -% 7. Visualize weight vector as motifs +% 7. Visualize weight vectors and motifs % if strcmp(whetherToVisualizeWVector, 'Yes') logMessages(debugMsgLocation,sprintf('Visualizing weight vectors...\n'), debugLevel); diff --git a/computeInstanceWideKernel.m b/computeInstanceWideKernel.m index c2086f4..01e1330 100644 --- a/computeInstanceWideKernel.m +++ b/computeInstanceWideKernel.m @@ -1,4 +1,4 @@ -function [instanceStarts, instanceEnds, instanceWideKernel] = computeInstanceWideKernel(allSeqsAsBags, thisInstances, direct, sparseComputation, debugLevel, debugMsgLocation) +function [instanceStarts, instanceEnds, instanceWideKernel] = computeInstanceWideKernel(allSeqsAsBags, thisInstances, sparseComputation, debugLevel, debugMsgLocation) % COMPUTEINSTANCEWIDEKERNEL % diff --git a/getInstanceWeights.m b/getInstanceWeights.m index 234abbc..62b49c7 100644 --- a/getInstanceWeights.m +++ b/getInstanceWeights.m @@ -1,10 +1,9 @@ -function [thetaVals, instanceWeightsInEachBag] = getInstanceWeights(kernelWeights, allSeqsTransformationKernel, fname) +function [thetaVals, instanceWeightsInEachBag] = getInstanceWeights(kernelWeights, allSeqsTransformationKernel) % -% Return the thetaals and -% instance weights for non-shifted and shifted bags +% Return the theta values and instance weights +% for all bags % - % following a squared relation % order followed is: non-shifted kernels (1 to N) first, then shifted kernels (N+1 to 2N) in total 2N kernels % Edit: now number of kernels is same as number of clusters @@ -24,15 +23,13 @@ size(allSeqsTransformationKernel{j},1), 1) ... .* allSeqsTransformationKernel{j}, 2); instanceWeightsInEachBag{j} = temp; - % To write to file - stdTemp = standardizeMatrix(temp'); - finalTemp = zeros(size(stdTemp)); - %using rankings - [~, ranking] = sort(stdTemp(1:size(stdTemp,1),1:size(stdTemp,2)), 2, 'descend'); - %for i=1:size(plotDataM,1) - finalTemp(1,ranking) = 1:size(stdTemp,2); - %end - dlmwrite(fname, finalTemp, '-append'); + %% if to be written to file, pass fname to the function + %stdTemp = standardizeMatrix(temp'); + %finalTemp = zeros(size(stdTemp)); + %% Using rankings + %[~, ranking] = sort(stdTemp(1:size(stdTemp,1),1:size(stdTemp,2)), 2, 'descend'); + %finalTemp(1,ranking) = 1:size(stdTemp,2); + %dlmwrite(fname, finalTemp, '-append'); end