Skip to content
Permalink
a056a1c421
Switch branches/tags

Name already in use

A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
Go to file
 
 
Cannot retrieve contributors at this time
240 lines (211 sloc) 10.7 KB
function [] = comik_wrapper(configFile)
% COMIK_WRAPPER
% Reads a CoMIK config file, derives the test indices for every outer fold
% and runs all outer folds in parallel (one parfor iteration per fold).
%
% Usage:
% comik_wrapper(configFile)
%
% INPUT PARAMS
% Param 'configFile' (char)
% Config filename with the param values set. The first line of the file
% must contain the header '## CoMIK CONFIG FILE'; otherwise an error is
% raised. More information on the params is given below.
%
% Only the following params have default values:
%
% param_name                 default_val
% ----------                 -----------
% mklNorm                    2.0
% nFolds                     10
% nOuterFolds                5
% whetherToPlotHeatmap       'No'
% whetherToVisualizeWVector  'Yes'
% debugLevel                 2
% debugMsgLocation           1
% computationVersion         'Looping'
%
% Rest, params that are required to be specified:
% -----------------------------------------------
% Params 'givenPosFastaFilename' and 'givenNegFastaFilename' (strings)
% These are the config fields this wrapper reads and forwards to every
% outer fold; presumably the FASTA files holding the positive and the
% negative sequences, respectively (verify against readConfigFile).
%
% Params 'nPosSequences' and 'nNegSequences'
% Used to specify the number of positive and negative sequences in the dataset.
%
% Param 'testIndices' (vector)
% Indices of the sequences which are to be considered as unseen test
% examples. Positives are numbered 1..nPosSequences and negatives follow as
% nPosSequences+1..nPosSequences+nNegSequences. For example, with a total
% of 100 positive sequences followed by 100 negative sequences, the test
% indices are given as
% testIndices = [81:100 181:200]
% for the corresponding 20 positives and 20 negatives to be treated as test
% examples. These indices are used as-is for outer fold 1; the remaining
% outer folds draw test sets of the same sizes from a seeded random
% permutation.
%
% Param 'outputFolder' (string)
% Specifies the path on disk where output can be written. It is created if
% it does not exist; each outer fold writes to '<outputFolder>/outer_fold_<i>'.
%
% Params 'oligoLen' and 'maxDist'
% Specify the oligomer length and the maximum distance for the ODH
% representation. Caution: A combination of large values can be memory intensive!
%
% Param 'segmentSizeInBps'
% Specify the segment-size in basepairs for CoMIK
%
% Param 'nClusterVals'
% Specify the number of clusters for CoMIK
%
% Param 'sigmaVals'
% Specify the sigma values for the Gaussian transformation
%
% Param 'Cs'
% Cost values for SVM
%
% ERRORS
% Raises an error when the argument is missing or not a char, when the
% config file does not exist or cannot be opened, when its header line is
% missing, or when the output folder cannot be created.
%
% Author: snikumbh@mpi-inf.mpg.de

% ---- Validate the argument and load the configuration -------------------
if nargin ~= 1
    error('Specify only the config filename!');
end
if ~ischar(configFile)
    error('Error.\nInput must be a char, not a %s.', class(configFile));
end
if exist(configFile, 'file') ~= 2
    eMsg = 'Given config file does not exist!';
    error(eMsg);
end

fid = fopen(configFile, 'r');
if fid == -1
    error('Could not open config file %s for reading.', configFile);
end
firstLine = fgetl(fid);
fclose(fid);

% fgetl returns a numeric -1 on an empty file, hence the ischar guard.
% Without this check a file lacking the header would leave configParams
% undefined and fail much later with an unrelated message.
firstLineConfigFile = '## CoMIK CONFIG FILE';
if ~ischar(firstLine) || isempty(strfind(firstLine, firstLineConfigFile))
    error('Config file %s does not start with the expected header line ''%s''.', ...
        configFile, firstLineConfigFile);
end
% configParams is a struct
configParams = readConfigFile(configFile);

% ---- Make sure the top-level output folder exists -----------------------
if exist(configParams.outputFolder, 'dir') ~= 7
    [mkdirOk, mkdirMsg] = mkdir(configParams.outputFolder);
    if ~mkdirOk
        error('Could not create output folder %s: %s', configParams.outputFolder, mkdirMsg);
    end
end

% ---- Derive the test indices of every outer fold ------------------------
% The given testIndices fix how many positives/negatives are held out per
% fold. Fold 1 uses them verbatim; folds 2..nOuterFolds take equally sized
% tails of circularly shifted random permutations.
% Set seed for the random number generator so the folds are reproducible.
rng(21, 'twister');
posIndices = randperm(configParams.nPosSequences);
negIndices = randperm(configParams.nNegSequences);
posTestIndices = intersect(configParams.testIndices, posIndices);
% Negative test indices, re-based so that they start at 1
negTestIndices = setdiff(configParams.testIndices, posIndices) - configParams.nPosSequences;
nPosTest = length(posTestIndices);
nNegTest = length(negTestIndices);

thisTestIndices = cell(1, configParams.nOuterFolds);
thisTestIndices{1} = configParams.testIndices;
for foldIdx = 2:configParams.nOuterFolds
    shiftedPos = circshift(posIndices, [0, (foldIdx-1)*nPosTest]);
    % Shift negatives back into the joint (positives-first) index space
    shiftedNeg = circshift(negIndices, [0, (foldIdx-1)*nNegTest]) + length(posIndices);
    thisTestIndices{foldIdx} = [shiftedPos(end-nPosTest+1:end) shiftedNeg(end-nNegTest+1:end)];
end

%% Conditionally create a new pool; delete any existing pool of workers if NumWorkers is less than required, else use the same poolobj
poolobj = gcp('nocreate');
if isempty(poolobj)
    parpool('local', configParams.nOuterFolds);
elseif poolobj.NumWorkers >= configParams.nOuterFolds
    logMessages(1, sprintf('Reusing existing parallel pool: %d out of %d workers\n', configParams.nOuterFolds, poolobj.NumWorkers), configParams.debugLevel);
else
    logMessages(1, sprintf('Not enough workers in the existing parallel pool.\nDeleting existing pool with %d workers\n', poolobj.NumWorkers), configParams.debugLevel);
    delete(poolobj);
    parpool('local', configParams.nOuterFolds);
end

% ---- Perform the outer folds with parfor, asynchronously ----------------
parfor (foldIdx = 1:configParams.nOuterFolds, configParams.nOuterFolds)
    thisFoldDirectory = strcat(configParams.outputFolder, '/outer_fold_', num2str(foldIdx));
    returnStatusVal = perform_one_outer_fold_for_comik(foldIdx, configParams.givenPosFastaFilename, configParams.givenNegFastaFilename, configParams.nPosSequences, ...
        configParams.nNegSequences, configParams.oligoLen, configParams.maxDist, configParams.segmentSizeInBps, ...
        configParams.nClusterVals, configParams.sigmaVals, configParams.Cs, configParams.mklNorm, configParams.nFolds, ...
        thisTestIndices{foldIdx}, configParams.debugLevel, thisFoldDirectory, configParams.whetherToPlotHeatmap, ...
        configParams.computationVersion, configParams.whetherToVisualizeWVector, configParams.debugMsgLocation);
    logMessages(1, sprintf('\nOuter fold %d: Status %s\n', foldIdx, returnStatusVal), configParams.debugLevel);
end
logMessages(1, sprintf('All outer folds completed!\n'), configParams.debugLevel);
end % comik_wrapper function ends
function statusVal = perform_one_outer_fold_for_comik(outerFoldID, givenPosFastaFilename, givenNegFastaFilename, nPosSequences, nNegSequences, oligoLen, maxDist, segmentSizeInBps, nClusterVals, sigmaVals, Cs, mklNorm, nFolds, testIndices, debugLevel, outputFolder, whetherToPlotHeatmap, computationVersion, whetherToVisualizeWVector, debugMsgLocation)
% PERFORM_ONE_OUTER_FOLD_FOR_COMIK
% Runs one outer fold for CoMIK.
%
% For the given outer fold this function
%   1. creates 'outputFolder' if needed,
%   2. writes the (sorted) test indices to
%      '<outputFolder>/testIndicesOuterFold<outerFoldID>.txt',
%   3. for every entry in oligoLen (iterating over its columns) appends a
%      run header to '<outputFolder>/runSummary_segment-size<S>_oligoLen<K>.txt',
%      calls comik_main_with_weight_vector, and appends the reported
%      results to the same summary file.
%
% INPUT PARAMS
% outerFoldID       ID of this outer fold (used in filenames and messages)
% debugMsgLocation  Numeric values are handed to logMessages unchanged
%                   (1 is documented as the MATLAB command prompt). Any
%                   non-numeric value is treated as a filename inside
%                   'outputFolder'; that file is opened for writing and
%                   closed when this function exits, also on error.
% outputFolder      Folder for all output of this outer fold
% All remaining params are forwarded to comik_main_with_weight_vector
% and/or echoed into the run summary.
%
% OUTPUT
% statusVal         'OK' when the fold ran to completion
%
% ERRORS
% Raises an error when the output folder cannot be created or when one of
% the output files cannot be opened.

% Make the outputFolder for this outer fold
if exist(outputFolder, 'dir') ~= 7
    [mkdirOk, mkdirMsg] = mkdir(outputFolder);
    if ~mkdirOk
        error('Could not create output folder %s: %s', outputFolder, mkdirMsg);
    end
end

% A non-numeric debugMsgLocation is a log filename; replace it by a file ID.
% A type check is used instead of '~= 1', which would compare a filename
% character by character.
% NOTE(review): numeric values other than 1 are now passed to logMessages
% as-is instead of being turned into a filename -- confirm logMessages
% accepts them.
if ~isnumeric(debugMsgLocation)
    debugLogPath = strcat(outputFolder, '/', debugMsgLocation);
    debugMsgLocation = fopen(debugLogPath, 'w');
    if debugMsgLocation == -1
        error('Could not open debug log file %s for writing.', debugLogPath);
    end
    % Closes the log when this function exits, normally or through an error
    closeDebugLog = onCleanup(@() fclose(debugMsgLocation)); %#ok<NASGU>
end

% Write the testIndices to disk so that it is reproducible
testIndicesFilename = strcat(outputFolder, '/testIndicesOuterFold', num2str(outerFoldID), '.txt');
testFid = fopen(testIndicesFilename, 'a');
if testFid == -1
    error('Could not open %s for writing.', testIndicesFilename);
end
fprintf(testFid, 'testIndices for this outer fold:\n');
fclose(testFid);
dlmwrite(testIndicesFilename, sort(testIndices'), '-append', 'roffset', 1);
logMessages(debugMsgLocation, sprintf('Test indices for this outer fold written to disk'), debugLevel);

for k = 1:size(oligoLen, 2)
    filenameSuffix = strcat('_segment-size', num2str(segmentSizeInBps), '_oligoLen', num2str(oligoLen(k)));
    runSummaryFilename = strcat(outputFolder, '/runSummary', filenameSuffix, '.txt');

    % ---- Run header: written to the summary file and to the debug log ---
    fid = fopen(runSummaryFilename, 'a');
    if fid == -1
        error('Could not open run summary file %s for writing.', runSummaryFilename);
    end
    try
        logToSummaryAndDebug(fid, debugMsgLocation, debugLevel, '\n==================================================================================\n');
        logToSummaryAndDebug(fid, debugMsgLocation, debugLevel, ' Conformal Multi-Instance Kernels for Handling Varying Length Sequences In A \n Discriminative Setting\n');
        logToSummaryAndDebug(fid, debugMsgLocation, debugLevel, ' Authors: Sarvesh Nikumbh, Peter Ebert, Nico Pfeifer\n\n');
        logToSummaryAndDebug(fid, debugMsgLocation, debugLevel, ' PosFasta: %s \n', givenPosFastaFilename);
        logToSummaryAndDebug(fid, debugMsgLocation, debugLevel, ' NegFasta: %s \n', givenNegFastaFilename);
        logToSummaryAndDebug(fid, debugMsgLocation, debugLevel, ' oligoLength: %d, maxDist: %d, segmentSize: %d bps \n', oligoLen(k), maxDist, segmentSizeInBps);
        logToSummaryAndDebug(fid, debugMsgLocation, debugLevel, ' mklNorm: %.2f\n', mklNorm);
        logToSummaryAndDebug(fid, debugMsgLocation, debugLevel, ' Outer Fold: %d\n', outerFoldID);
        logToSummaryAndDebug(fid, debugMsgLocation, debugLevel, ' Cross-validation folds: %d\n', nFolds);
        % For vector-valued params the format is repeated once per element
        logToSummaryAndDebug(fid, debugMsgLocation, debugLevel, ' #Clusters: %d \n', nClusterVals);
        logToSummaryAndDebug(fid, debugMsgLocation, debugLevel, ' Sigma for Gaussian RBF Xformation: %.3f\n', sigmaVals);
        logToSummaryAndDebug(fid, debugMsgLocation, debugLevel, ' C for SVM: %.3f\n', Cs);
        logToSummaryAndDebug(fid, debugMsgLocation, debugLevel, ' Timestamp: %s\n', datestr(now));
        %
        logMessages(debugMsgLocation, sprintf('Using weight vector \n'), debugLevel);
        fprintf(fid, ' Using weight vector \n');
        %
        logToSummaryAndDebug(fid, debugMsgLocation, debugLevel, '===================================================================================\n');
    catch headerErr
        % Do not leak the summary file handle when writing/logging fails
        fclose(fid);
        rethrow(headerErr);
    end
    % Closed before the main call, which also receives runSummaryFilename
    fclose(fid);

    % ---- Main computation ------------------------------------------------
    % All ten outputs are still requested (so the callee sees the same
    % nargout); only the three used below are kept.
    [~, ~, ~, ~, ~, ~, bestVals, test_teAUROC, test_teAUPRC, ~] = comik_main_with_weight_vector(givenPosFastaFilename, givenNegFastaFilename, nPosSequences, nNegSequences, oligoLen(k), maxDist, segmentSizeInBps, nClusterVals, sigmaVals, Cs, mklNorm, nFolds, testIndices, debugLevel, debugMsgLocation, outputFolder, runSummaryFilename, whetherToPlotHeatmap, computationVersion, whetherToVisualizeWVector);

    % ---- Append the results to the run summary ---------------------------
    fid = fopen(runSummaryFilename, 'a');
    if fid == -1
        error('Could not open run summary file %s for writing.', runSummaryFilename);
    end
    try
        fprintf(fid, '\n\n');
        fprintf(fid, '===== Validation results =====\n');
        fprintf(fid, 'OligoLen: %d, maxDist: %d, segment size: %d\n', oligoLen(k), maxDist, segmentSizeInBps);
        fprintf(fid, 'Validation teAUPRC: %.4f\n', test_teAUPRC);
        fprintf(fid, 'Validation teAUROC: %.4f\n', test_teAUROC);
        fprintf(fid, 'Best C: %.3f\n', bestVals.best_C);
        fprintf(fid, 'Best sigma: %.3f\n', bestVals.best_sigma);
        fprintf(fid, 'Best nClusters: %d\n', bestVals.best_nClusters);
        fprintf(fid, 'Best teAUROC: %.4f\n', bestVals.best_teAUROC);
        fprintf(fid, '====== END ======\n\n');
    catch resultsErr
        fclose(fid);
        rethrow(resultsErr);
    end
    fclose(fid);
end
% The debug log file (if one was opened) is closed by the onCleanup object.
statusVal = 'OK';
end

function logToSummaryAndDebug(summaryFid, debugMsgLocation, debugLevel, fmt, varargin)
% Formats one message and writes it first to the run summary file and then
% to the debug log via logMessages.
msg = sprintf(fmt, varargin{:});
% '%s' keeps the already formatted text verbatim (no second escape pass)
fprintf(summaryFid, '%s', msg);
logMessages(debugMsgLocation, msg, debugLevel);
end