Permalink
Cannot retrieve contributors at this time
Name already in use
A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
comik/comik_wrapper.m
Go to fileThis commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
240 lines (211 sloc)
10.7 KB
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
function [] = comik_wrapper(configFile)
% COMIK_WRAPPER
% Usage:
%   comik_wrapper(configFile)
%
% Reads a CoMIK config file, derives the test indices for every outer fold
% and runs the outer folds in parallel (one parfor iteration per fold).
%
% INPUT PARAMS
% Param 'configFile'
% Config filename with the param values set. The first line of the file must
% contain the marker '## CoMIK CONFIG FILE'; otherwise an error is raised.
% More information on the params given below
%
% Only the following params have default values:
%
%   param_name                  default_val
%   ----------                  -----------
%   mklNorm                     2.0
%   nFolds                      10
%   nOuterFolds                 5
%   whetherToPlotHeatmap        'No'
%   whetherToVisualizeWVector   'Yes'
%   debugLevel                  2
%   debugMsgLocation            1
%   computationVersion          'Looping'
%
% Rest, params that are required to be specified:
% -----------------------------------------------
% Params 'givenPosFastaFilename' and 'givenNegFastaFilename' (strings)
% FASTA filenames for the positive and the negative sequences. These are the
% fields this wrapper reads from the parsed config and forwards to every
% outer fold.
% NOTE(review): an earlier version of this header described a single param
% 'givenFastFilename' (all sequences together, positives followed by
% negatives) -- confirm the exact config key names against readConfigFile.
%
% Params 'nPosSequences' and 'nNegSequences'
% Used to specify the number of positive and negative sequences in the dataset.
%
% Param 'testIndices' (vector)
% Indices of the sequences which are to be considered as unseen test
% examples. Positives are numbered 1..nPosSequences and negatives follow as
% nPosSequences+1..nPosSequences+nNegSequences. For example, with a total of
% 100 positive sequences followed by 100 negative sequences, the test indices
% are given as
%   testIndices = [81:100 181:200]
% for the corresponding 20 positives and 20 negatives to be treated as test
% examples. These indices are used as-is for outer fold 1; the test sets of
% the remaining outer folds have the same sizes but are drawn from seeded
% random permutations.
%
% Param 'outputFolder' (string)
% Specifies the path on disk where output can be written
%
% Params 'oligoLen' and 'maxDist'
% Specify the oligomer length and the maximum distance for the ODH
% representation. Caution: A combination of large values can be memory intensive!
%
% Param 'segmentSizeInBps'
% Specify the segment-size in basepairs for CoMIK
%
% Param 'nClusterVals'
% Specify the number of clusters for CoMIK
%
% Param 'sigmaVals'
% Specify the sigma values for the Gaussian transformation
%
% Param 'Cs'
% Cost values for SVM
%
% ADDITIONAL NOTES
%
%
% Author: snikumbh@mpi-inf.mpg.de

% ---- Validate the input argument -------------------------------------
if nargin ~= 1
    error('Specify only the config filename!');
end
% default name: config-comik.txt
if ~ischar(configFile)
    error('Error.\nInput must be a char, not a %s.', class(configFile));
end
if exist(configFile, 'file') ~= 2
    eMsg = 'Given config file does not exist!';
    error(eMsg);
end

% ---- Check the header line, then parse the config ----------------------
fid = fopen(configFile, 'r');
if fid == -1
    error('Could not open config file %s for reading.', configFile);
end
firstLine = fgetl(fid);
fclose(fid);

firstLineConfigFile = '## CoMIK CONFIG FILE';
% fgetl returns -1 (numeric) for an empty file, hence the ischar guard.
% strfind looks for the marker inside the line only; the deprecated findstr
% searched the shorter argument in the longer one and so also accepted a
% truncated first line.
if ~ischar(firstLine) || isempty(strfind(firstLine, firstLineConfigFile))
    % Fail here with a clear message instead of continuing with an
    % undefined configParams.
    error('Config file %s does not start with the expected header line ''%s''.', ...
        configFile, firstLineConfigFile);
end
% configParams is a struct
configParams = readConfigFile(configFile);

% ---- Make sure the output folder exists --------------------------------
if exist(configParams.outputFolder, 'dir') ~= 7
    [mkdirOk, mkdirMsg] = mkdir(configParams.outputFolder);
    if ~mkdirOk
        error('Could not create output folder %s: %s', configParams.outputFolder, mkdirMsg);
    end
end

% -- Perform different outer folds with parfor, asynchronously
% -- Use the given testIndices to know the percentages of examples used for
%    training and test
% Set seed for the random number generator
rng(21, 'twister');
posIndices = randperm(configParams.nPosSequences);
negIndices = randperm(configParams.nNegSequences);
% Split the given test indices into positives and negatives; negatives are
% re-based to 1..nNegSequences by subtracting the number of positives.
posTestIndices = intersect(configParams.testIndices, posIndices);
negTestIndices = setdiff(configParams.testIndices, posIndices) - configParams.nPosSequences;

% Outer fold 1 uses the test indices exactly as given in the config.
thisTestIndices = cell(1, configParams.nOuterFolds);
thisTestIndices{1} = configParams.testIndices;
% Every further fold circularly shifts the permutations by a multiple of the
% test-set size and takes the trailing block as its test set, so each fold
% has as many positive/negative test examples as fold 1.
for i = 2:configParams.nOuterFolds
    thisPosTestIndices = circshift(posIndices, [0, (i-1)*length(posTestIndices)]);
    thisNegTestIndices = circshift(negIndices, [0, (i-1)*length(negTestIndices)]) + length(posIndices);
    thisTestIndices{i} = [thisPosTestIndices(end-(length(posTestIndices))+1:end) ...
                          thisNegTestIndices(end-(length(negTestIndices))+1:end)];
end

%% Conditionally create a new pool; delete any existing pool of workers if NumWorkers is less than required, else use the same poolobj
poolobj = gcp('nocreate');
if isempty(poolobj)
    parpool('local', configParams.nOuterFolds);
else
    if poolobj.NumWorkers >= configParams.nOuterFolds
        logMessages(1, sprintf('Reusing existing parallel pool: %d out of %d workers\n', configParams.nOuterFolds, poolobj.NumWorkers), configParams.debugLevel);
    else
        logMessages(1, sprintf('Not enough workers in the existing parallel pool.\nDeleting existing pool with %d workers\n', poolobj.NumWorkers), configParams.debugLevel);
        delete(poolobj);
        parpool('local', configParams.nOuterFolds);
    end
end

% One parfor iteration per outer fold; each fold writes into its own
% sub-folder '<outputFolder>/outer_fold_<i>'.
parfor (i=1:configParams.nOuterFolds, configParams.nOuterFolds)
    thisFoldDirectory = strcat(configParams.outputFolder, '/outer_fold_', num2str(i));
    returnStatusVal = perform_one_outer_fold_for_comik(i, configParams.givenPosFastaFilename, configParams.givenNegFastaFilename, configParams.nPosSequences, ...
        configParams.nNegSequences, configParams.oligoLen, configParams.maxDist, configParams.segmentSizeInBps, ...
        configParams.nClusterVals, configParams.sigmaVals, configParams.Cs, configParams.mklNorm, configParams.nFolds, ...
        thisTestIndices{i}, configParams.debugLevel, thisFoldDirectory, configParams.whetherToPlotHeatmap, ...
        configParams.computationVersion, configParams.whetherToVisualizeWVector, configParams.debugMsgLocation);
    logMessages(1, sprintf('\nOuter fold %d: Status %s\n', i, returnStatusVal), configParams.debugLevel);
end
logMessages(1, sprintf('All outer folds completed!\n'), configParams.debugLevel);

end % comik_wrapper function ends
function statusVal = perform_one_outer_fold_for_comik(outerFoldID, givenPosFastaFilename, givenNegFastaFilename, nPosSequences, nNegSequences, oligoLen, maxDist, segmentSizeInBps, nClusterVals, sigmaVals, Cs, mklNorm, nFolds, testIndices, debugLevel, outputFolder, whetherToPlotHeatmap, computationVersion, whetherToVisualizeWVector, debugMsgLocation)
% Runs one outer fold for CoMIK
%
% For every oligomer length in 'oligoLen' this function
%   1. appends a banner with the run parameters to
%      '<outputFolder>/runSummary_segment-size<S>_oligoLen<K>.txt' and sends
%      the same text to logMessages,
%   2. calls comik_main_with_weight_vector, and
%   3. appends the validation results (AUROC/AUPRC and best values) to the
%      same run-summary file.
% The test indices of this fold are additionally written to
% '<outputFolder>/testIndicesOuterFold<outerFoldID>.txt'.
%
% INPUT PARAMS
%   outerFoldID       ID of this outer fold; used in filenames and messages
%   outputFolder      folder for all output of this fold; created if missing
%   oligoLen          vector of oligomer lengths; one run per element
%   testIndices       vector of test indices for this fold
%   debugMsgLocation  1 for the MATLAB command prompt; anything else is
%                     treated as a filename inside outputFolder that is
%                     opened for writing and closed when this function
%                     exits, also when an error is thrown
%   All remaining params are passed through unchanged to
%   comik_main_with_weight_vector and/or reported in the banner.
%
% OUTPUT
%   statusVal         'OK' when all runs completed
%
% ERRORS
%   Raises an error if the output folder cannot be created or if any of the
%   output files cannot be opened.

% Make the outputFolder for this outer fold
if exist(outputFolder, 'dir') ~= 7
    [mkdirOk, mkdirMsg] = mkdir(outputFolder);
    if ~mkdirOk
        error('Could not create output folder %s: %s', outputFolder, mkdirMsg);
    end
end

% if debugMsgLocation is 1, it is the MATLAB command prompt, else a file which is to be closed at the end.
% isequal is used because debugMsgLocation may be a char filename, for which
% '~= 1' would be an element-wise comparison.
if ~isequal(debugMsgLocation, 1)
    debugLogFilename = strcat(outputFolder, '/', debugMsgLocation);
    debugMsgLocation = fopen(debugLogFilename, 'w');
    if debugMsgLocation == -1
        error('Could not open debug log file %s for writing.', debugLogFilename);
    end
    % Close the log file on every exit path, including errors thrown by the
    % main computation below.
    closeDebugLog = onCleanup(@() fclose(debugMsgLocation)); %#ok<NASGU>
end

% Write the testIndices to disk so that it is reproducible
testIndicesFilename = strcat(outputFolder, '/testIndicesOuterFold', num2str(outerFoldID), '.txt');
testFid = fopen(testIndicesFilename, 'a');
if testFid == -1
    error('Could not open %s for writing the test indices.', testIndicesFilename);
end
fprintf(testFid, 'testIndices for this outer fold:\n');
fclose(testFid);
if exist(testIndicesFilename, 'file') == 2
    % One sorted index per row, appended after one blank row.
    dlmwrite(testIndicesFilename, sort(testIndices(:)), '-append', 'roffset', 1);
    logMessages(debugMsgLocation, sprintf('Test indices for this outer fold written to disk'), debugLevel);
end

% numel (instead of size(.,2)) so that a column vector of oligomer lengths
% is also processed completely.
for k = 1:numel(oligoLen)
    filenameSuffix = strcat('_segment-size', num2str(segmentSizeInBps), '_oligoLen', num2str(oligoLen(k)));
    runSummaryFilename = strcat(outputFolder, '/runSummary', filenameSuffix, '.txt');

    % ---- Banner: same text to the run-summary file and to the log ----
    fid = fopen(runSummaryFilename, 'a');
    if fid == -1
        error('Could not open run summary file %s for writing.', runSummaryFilename);
    end
    emit = @(msg) writeSummaryAndLog(fid, debugMsgLocation, debugLevel, msg);
    emit(sprintf('\n==================================================================================\n'));
    emit(sprintf(' Conformal Multi-Instance Kernels for Handling Varying Length Sequences In A \n Discriminative Setting\n'));
    emit(sprintf(' Authors: Sarvesh Nikumbh, Peter Ebert, Nico Pfeifer\n\n'));
    emit(sprintf(' PosFasta: %s \n', givenPosFastaFilename));
    emit(sprintf(' NegFasta: %s \n', givenNegFastaFilename));
    emit(sprintf(' oligoLength: %d, maxDist: %d, segmentSize: %d bps \n', oligoLen(k), maxDist, segmentSizeInBps));
    emit(sprintf(' mklNorm: %.2f\n', mklNorm));
    % The log previously labelled this line 'Outer folds'; the value is the
    % ID of this fold, so file and log now both say 'Outer Fold'.
    emit(sprintf(' Outer Fold: %d\n', outerFoldID));
    emit(sprintf(' Cross-validation folds: %d\n', nFolds));
    % For vector-valued params sprintf repeats the format once per element.
    emit(sprintf(' #Clusters: %d \n', nClusterVals));
    emit(sprintf(' Sigma for Gaussian RBF Xformation: %.3f\n', sigmaVals));
    emit(sprintf(' C for SVM: %.3f\n', Cs));
    emit(sprintf(' Timestamp: %s\n', datestr(now)));
    %
    % These two lines differ by a leading space, so they are written separately.
    logMessages(debugMsgLocation, sprintf('Using weight vector \n'), debugLevel);
    fprintf(fid, ' Using weight vector \n');
    %
    emit(sprintf('===================================================================================\n'));
    % Closed before the main call, which is handed the filename itself.
    fclose(fid);

    % ---- Main computation ----
    % Only bestVals and the two test metrics are used here; ten outputs are
    % still requested so that nargout seen by the callee is unchanged.
    [~, ~, ~, ~, ~, ~, bestVals, test_teAUROC, test_teAUPRC, ~] = comik_main_with_weight_vector(givenPosFastaFilename, givenNegFastaFilename, nPosSequences, nNegSequences, oligoLen(k), maxDist, segmentSizeInBps, nClusterVals, sigmaVals, Cs, mklNorm, nFolds, testIndices, debugLevel, debugMsgLocation, outputFolder, runSummaryFilename, whetherToPlotHeatmap, computationVersion, whetherToVisualizeWVector);

    % ---- Validation results ----
    fid = fopen(runSummaryFilename, 'a');
    if fid == -1
        error('Could not open run summary file %s for writing.', runSummaryFilename);
    end
    fprintf(fid, '\n\n');
    fprintf(fid, '===== Validation results =====\n');
    fprintf(fid, 'OligoLen: %d, maxDist: %d, segment size: %d\n', oligoLen(k), maxDist, segmentSizeInBps);
    fprintf(fid, 'Validation teAUPRC: %.4f\n', test_teAUPRC);
    fprintf(fid, 'Validation teAUROC: %.4f\n', test_teAUROC);
    fprintf(fid, 'Best C: %.3f\n', bestVals.best_C);
    fprintf(fid, 'Best sigma: %.3f\n', bestVals.best_sigma);
    fprintf(fid, 'Best nClusters: %d\n', bestVals.best_nClusters);
    fprintf(fid, 'Best teAUROC: %.4f\n', bestVals.best_teAUROC);
    fprintf(fid, '====== END ======\n\n');
    fclose(fid);
end

% The debug log file (if one was opened) is closed by the onCleanup object
% when this function returns.
statusVal = 'OK';
end

function writeSummaryAndLog(summaryFid, debugMsgLocation, debugLevel, msg)
% Writes the already formatted text 'msg' verbatim to the run-summary file
% and forwards the same text to logMessages.
fprintf(summaryFid, '%s', msg);
logMessages(debugMsgLocation, msg, debugLevel);
end