function [] = comik_wrapper(configFile)
% COMIK_WRAPPER Run all outer folds of CoMIK as described by a config file.
%
% Usage:
%   comik_wrapper(configFile)
%
% Param 'configFile' (char)
%   Config filename with the param values set. The first line of the file
%   must contain the marker '## CoMIK CONFIG FILE'. The file is parsed by
%   readConfigFile, which returns a struct of params.
%
% Only the following params have default values:
%   param_name                  default_val
%   ----------                  -----------
%   mklNorm                     2.0
%   nFolds                      10
%   nOuterFolds                 5
%   whetherToPlotHeatmap        'No'
%   whetherToVisualizeWVector   'Yes'
%   debugLevel                  2
%   debugMsgLocation            1
%   computationVersion          'Looping'
%
% Rest, params that are required to be specified:
% -----------------------------------------------
% Params 'givenPosFastaFilename' and 'givenNegFastaFilename' (strings)
%   FASTA filenames handed to every outer fold.
%   NOTE(review): older documentation named a single 'givenFastFilename'
%   holding positives followed by negatives; this function reads the two
%   fields named above -- confirm against readConfigFile.
% Params 'nPosSequences' and 'nNegSequences'
%   Used to specify the number of positive and negative sequences in the
%   dataset.
% Param 'testIndices' (vector)
%   Indices of the sequences which are to be considered as unseen test
%   examples. Positives are numbered 1..nPosSequences and negatives follow
%   as nPosSequences+1..nPosSequences+nNegSequences. For example, with 100
%   positive sequences followed by 100 negative sequences, the test indices
%   are given as
%       testIndices = [81:100 181:200]
%   for the corresponding 20 positives and 20 negatives to be treated as
%   test examples.
% Param 'outputFolder' (string)
%   Specifies the path on disk where output can be written.
% Params 'oligoLen' and 'maxDist'
%   Specify the oligomer length and the maximum distance for the ODH
%   representation. Caution: A combination of large values can be memory
%   intensive!
% Param 'segmentSizeInBps'
%   Specify the segment-size in basepairs for CoMIK.
% Param 'nClusterVals'
%   Specify the number of clusters for CoMIK.
% Param 'sigmaVals'
%   Specify the sigma values for the Gaussian transformation.
% Param 'Cs'
%   Cost values for SVM.
%
% Author:

% ---- Validate the argument and load the configuration -------------------
if nargin ~= 1
    error('Specify only the config filename!');
end
if ~ischar(configFile)
    error('Error.\nInput must be a char, not a %s.', class(configFile));
end
if exist(configFile, 'file') ~= 2
    eMsg = 'Given config file does not exist!';
    error(eMsg);
end

% Peek at the first line only; close the handle straight away so it does not
% leak (readConfigFile is given the filename, not the handle).
fid = fopen(configFile, 'r');
if fid == -1
    error('Could not open config file %s for reading.', configFile);
end
firstLine = fgetl(fid);
fclose(fid);

firstLineConfigFile = '## CoMIK CONFIG FILE';
% fgetl returns -1 (numeric) for an empty file, hence the ischar guard.
hasHeader = ischar(firstLine) && ~isempty(strfind(firstLine, firstLineConfigFile)); %#ok<STREMP>
if ~hasHeader
    % Fail here with a clear message instead of hitting an undefined
    % configParams further down.
    error('Given file is not a CoMIK config file: its first line must contain ''%s''.', firstLineConfigFile);
end
% configParams is a struct
configParams = readConfigFile(configFile);

% ---- Make sure the output folder exists ----------------------------------
if exist(configParams.outputFolder, 'dir') ~= 7
    [status, mkdirMsg] = mkdir(configParams.outputFolder);
    if ~status
        error('Could not create output folder %s: %s', configParams.outputFolder, mkdirMsg);
    end
end

% ---- Derive the test indices of every outer fold -------------------------
% -- Perform different outer folds with parfor, asynchronously
% -- Use the given testIndices to know the percentages of examples used for
%    training and test
nOuterFolds = configParams.nOuterFolds;

% Set seed for the random number generator so the folds are reproducible.
rng(21, 'twister');
posIndices = randperm(configParams.nPosSequences);
negIndices = randperm(configParams.nNegSequences);

% Split the user-supplied test indices into positives (<= nPosSequences) and
% negatives (re-based to 1..nNegSequences) to learn how many of each to draw.
posTestIndices = intersect(configParams.testIndices, posIndices);
negTestIndices = setdiff(configParams.testIndices, posIndices) - configParams.nPosSequences;
nPosTest = length(posTestIndices);
nNegTest = length(negTestIndices);

% Fold 1 uses the indices exactly as given; folds 2..n take the tail of the
% shuffled index lists after rotating them by a whole number of test-set sizes.
thisTestIndices = cell(1, nOuterFolds);
thisTestIndices{1} = configParams.testIndices;
for i = 2:nOuterFolds
    thisPosTestIndices = circshift(posIndices, [0, (i-1)*nPosTest]);
    % Shift negatives back into the combined numbering (after the positives).
    thisNegTestIndices = circshift(negIndices, [0, (i-1)*nNegTest]) + length(posIndices);
    thisTestIndices{i} = [thisPosTestIndices(end-nPosTest+1:end) thisNegTestIndices(end-nNegTest+1:end)];
end

%% Conditionally create a new pool; delete any existing pool of workers if NumWorkers is less than required, else use the same poolobj
poolobj = gcp('nocreate');
if isempty(poolobj)
    parpool('local', nOuterFolds);
elseif poolobj.NumWorkers >= nOuterFolds
    logMessages(1, sprintf('Reusing existing parallel pool: %d out of %d workers\n', nOuterFolds, poolobj.NumWorkers), configParams.debugLevel);
else
    logMessages(1, sprintf('Not enough workers in the existing parallel pool.\nDeleting existing pool with %d workers\n', poolobj.NumWorkers), configParams.debugLevel);
    % The old pool has to be shut down before a new one can be started.
    delete(poolobj);
    parpool('local', nOuterFolds);
end

% ---- Run the outer folds, one worker per fold ----------------------------
parfor (i = 1:nOuterFolds, nOuterFolds)
    thisFoldDirectory = strcat(configParams.outputFolder, '/outer_fold_', num2str(i));
    returnStatusVal = perform_one_outer_fold_for_comik(i, configParams.givenPosFastaFilename, configParams.givenNegFastaFilename, configParams.nPosSequences, ...
        configParams.nNegSequences, configParams.oligoLen, configParams.maxDist, configParams.segmentSizeInBps, ...
        configParams.nClusterVals, configParams.sigmaVals, configParams.Cs, configParams.mklNorm, configParams.nFolds, ...
        thisTestIndices{i}, configParams.debugLevel, thisFoldDirectory, configParams.whetherToPlotHeatmap, ...
        configParams.computationVersion, configParams.whetherToVisualizeWVector, configParams.debugMsgLocation);
    logMessages(1, sprintf('\nOuter fold %d: Status %s\n', i, returnStatusVal), configParams.debugLevel);
end

logMessages(1, sprintf('All outer folds completed!\n'), configParams.debugLevel);
end % comik_wrapper function ends
function statusVal = perform_one_outer_fold_for_comik(outerFoldID, givenPosFastaFilename, givenNegFastaFilename, nPosSequences, nNegSequences, oligoLen, maxDist, segmentSizeInBps, nClusterVals, sigmaVals, Cs, mklNorm, nFolds, testIndices, debugLevel, outputFolder, whetherToPlotHeatmap, computationVersion, whetherToVisualizeWVector, debugMsgLocation)
% Runs one outer fold for CoMIK
%
% For every entry of oligoLen this writes a banner with the run settings to
% a run-summary file inside outputFolder, calls
% comik_main_with_weight_vector, and appends the returned validation
% results to the same file. The test indices of the fold are also written
% to disk.
%
% Inputs used directly here:
%   outerFoldID       - number of this outer fold (used in file names/logs)
%   testIndices       - test indices for this fold, written to
%                       <outputFolder>/testIndicesOuterFold<ID>.txt
%   oligoLen          - one run is performed per column of this array
%   outputFolder      - folder for this fold; created when missing
%   debugLevel        - forwarded to logMessages
%   debugMsgLocation  - 1 means log to the MATLAB command prompt; anything
%                       else is taken as a file name (relative to
%                       outputFolder) that is opened for writing and closed
%                       when this function exits
% All remaining inputs are only printed in the banner and forwarded to
% comik_main_with_weight_vector.
%
% Output:
%   statusVal         - 'OK' when the fold ran to completion

% Make the outputFolder for this outer fold (only when it is not there yet)
if exist(outputFolder, 'dir') ~= 7
    [status, mkdirMsg] = mkdir(outputFolder);
    if ~status
        error('Could not create output folder %s: %s', outputFolder, mkdirMsg);
    end
end

% if debugMsgLocation is 1, it is the MATLAB command prompt, else a file which is to be closed at the end.
% isequal gives a scalar answer even when debugMsgLocation is a char filename.
logToFile = ~isequal(debugMsgLocation, 1);
if logToFile
    debugLogPath = strcat(outputFolder, '/', debugMsgLocation);
    debugMsgLocation = fopen(debugLogPath, 'w');
    if debugMsgLocation == -1
        error('Could not open debug log file %s for writing.', debugLogPath);
    end
    % Closes the log when the function exits, whether normally or via an error.
    closeDebugLog = onCleanup(@() fclose(debugMsgLocation)); %#ok<NASGU>
end

% Write the testIndices to disk so that it is reproducible
testIndicesFilename = strcat(outputFolder, '/testIndicesOuterFold', num2str(outerFoldID), '.txt');
testFid = fopen(testIndicesFilename, 'a');
if testFid == -1
    error('Could not open %s for writing.', testIndicesFilename);
end
fprintf(testFid, 'testIndices for this outer fold:\n');
% Close before dlmwrite appends, so the header line is flushed first.
fclose(testFid);
dlmwrite(testIndicesFilename, sort(testIndices'), '-append', 'roffset', 1);
logMessages(debugMsgLocation, sprintf('Test indices for this outer fold written to disk'), debugLevel);

for k = 1:size(oligoLen, 2)
    thisOligoLen = oligoLen(k);
    filenameSuffix = strcat('_segment-size', num2str(segmentSizeInBps), '_oligoLen', num2str(thisOligoLen));
    runSummaryFilename = strcat(outputFolder, '/runSummary', filenameSuffix, '.txt');

    % ---- Banner: every line goes to the run summary and to the log ------
    fid = fopen(runSummaryFilename, 'a');
    if fid == -1
        error('Could not open run summary file %s for writing.', runSummaryFilename);
    end
    fprintf(fid, '\n==================================================================================\n');
    logMessages(debugMsgLocation, sprintf('\n==================================================================================\n'), debugLevel);
    fprintf(fid, ' Conformal Multi-Instance Kernels for Handling Varying Length Sequences In A \n Discriminative Setting\n');
    logMessages(debugMsgLocation, sprintf(' Conformal Multi-Instance Kernels for Handling Varying Length Sequences In A \n Discriminative Setting\n'), debugLevel);
    fprintf(fid, ' Authors: Sarvesh Nikumbh, Peter Ebert, Nico Pfeifer\n\n');
    logMessages(debugMsgLocation, sprintf(' Authors: Sarvesh Nikumbh, Peter Ebert, Nico Pfeifer\n\n'), debugLevel);
    fprintf(fid, ' PosFasta: %s \n', givenPosFastaFilename);
    logMessages(debugMsgLocation, sprintf(' PosFasta: %s \n', givenPosFastaFilename), debugLevel);
    fprintf(fid, ' NegFasta: %s \n', givenNegFastaFilename);
    logMessages(debugMsgLocation, sprintf(' NegFasta: %s \n', givenNegFastaFilename), debugLevel);
    fprintf(fid, ' oligoLength: %d, maxDist: %d, segmentSize: %d bps \n', thisOligoLen, maxDist, segmentSizeInBps);
    logMessages(debugMsgLocation, sprintf(' oligoLength: %d, maxDist: %d, segmentSize: %d bps \n', thisOligoLen, maxDist, segmentSizeInBps), debugLevel);
    fprintf(fid, ' mklNorm: %.2f\n', mklNorm);
    logMessages(debugMsgLocation, sprintf(' mklNorm: %.2f\n', mklNorm), debugLevel);
    fprintf(fid, ' Outer Fold: %d\n', outerFoldID);
    logMessages(debugMsgLocation, sprintf(' Outer folds: %d\n', outerFoldID), debugLevel);
    fprintf(fid, ' Cross-validation folds: %d\n', nFolds);
    logMessages(debugMsgLocation, sprintf(' Cross-validation folds: %d\n', nFolds), debugLevel);
    % The next three values may be arrays; fprintf/sprintf then repeat the
    % format once per element.
    fprintf(fid, ' #Clusters: %d \n', nClusterVals);
    logMessages(debugMsgLocation, sprintf(' #Clusters: %d \n', nClusterVals), debugLevel);
    fprintf(fid, ' Sigma for Gaussian RBF Xformation: %.3f\n', sigmaVals);
    logMessages(debugMsgLocation, sprintf(' Sigma for Gaussian RBF Xformation: %.3f\n', sigmaVals), debugLevel);
    fprintf(fid, ' C for SVM: %.3f\n', Cs);
    logMessages(debugMsgLocation, sprintf(' C for SVM: %.3f\n', Cs), debugLevel);
    fprintf(fid, ' Timestamp: %s\n', datestr(now));
    logMessages(debugMsgLocation, sprintf(' Timestamp: %s\n', datestr(now)), debugLevel);
    logMessages(debugMsgLocation, sprintf('Using weight vector \n'), debugLevel);
    fprintf(fid, ' Using weight vector \n');
    fprintf(fid, '===================================================================================\n');
    logMessages(debugMsgLocation, sprintf('===================================================================================\n'), debugLevel);
    % Release the handle before the main routine runs: it is handed the file
    % name, and the file is reopened below for the results.
    fclose(fid);

    % ---- Main computation -------------------------------------------------
    % All ten outputs are still requested (so nargout inside the callee is
    % unchanged); only the ones reported below are kept.
    [~, ~, ~, ~, ~, ~, bestVals, test_teAUROC, test_teAUPRC, ~] = comik_main_with_weight_vector(givenPosFastaFilename, givenNegFastaFilename, nPosSequences, nNegSequences, thisOligoLen, maxDist, segmentSizeInBps, nClusterVals, sigmaVals, Cs, mklNorm, nFolds, testIndices, debugLevel, debugMsgLocation, outputFolder, runSummaryFilename, whetherToPlotHeatmap, computationVersion, whetherToVisualizeWVector);

    % ---- Append the results to the run summary ----------------------------
    fid = fopen(runSummaryFilename, 'a');
    if fid == -1
        error('Could not open run summary file %s for writing.', runSummaryFilename);
    end
    fprintf(fid, '\n\n');
    fprintf(fid, '===== Validation results =====\n');
    fprintf(fid, 'OligoLen: %d, maxDist: %d, segment size: %d\n', thisOligoLen, maxDist, segmentSizeInBps);
    fprintf(fid, 'Validation teAUPRC: %.4f\n', test_teAUPRC);
    fprintf(fid, 'Validation teAUROC: %.4f\n', test_teAUROC);
    fprintf(fid, 'Best C: %.3f\n', bestVals.best_C);
    fprintf(fid, 'Best sigma: %.3f\n', bestVals.best_sigma);
    fprintf(fid, 'Best nClusters: %d\n', bestVals.best_nClusters);
    fprintf(fid, 'Best teAUROC: %.4f\n', bestVals.best_teAUROC);
    fprintf(fid, '====== END ======\n\n');
    fclose(fid);
end

% debugMsgLocation, when it is a fileID, is closed by the onCleanup object
% created above as soon as this function returns.
statusVal = 'OK';
end % perform_one_outer_fold_for_comik function ends