Skip to content

Commit

Permalink
Browse files Browse the repository at this point in the history
config file added. comik_wrapper now accepts a config file directly. …
…Minor edit in getExpansionPoints
  • Loading branch information
snikumbh committed Jul 31, 2017
1 parent 0166c98 commit 982dce4
Show file tree
Hide file tree
Showing 7 changed files with 2,083 additions and 4,563 deletions.
215 changes: 53 additions & 162 deletions comik_wrapper.m
@@ -1,11 +1,13 @@
function [] = comik_wrapper(givenPosFastaFilename, givenNegFastaFilename, nPosSequences, nNegSequences, testIndices, outputFolder, oligoLen, maxDist, segmentSizeInBps, nClusterVals, sigmaVals, Cs, mklNorm, nFolds, nOuterFolds, whetherToPlotHeatmap, whetherToVisualizeWVector, debugLevel, debugMsgLocation, computationVersion)
function [] = comik_wrapper(configFile)

% COMIK_WRAPPER
% Usage:
% comik_wrapper(givenFastaFilename, nPosSequences, nNegSequences, ...
% outputFolder, oligoLen, maxDist, segmentSizeInBps, nClusterVals, ...
% sigmaVals, Cs, mklNorm, nFolds, nOuterFolds, whetherToPlotHeatmap, ...
% computationVersion, whetherToVisualizeWVector)
% comik_wrapper(configFile)
%
% INPUT PARAMS
% Param 'configFile' (string)
% Name of the config file in which the param values are set. More
% information on the individual params is given below.
%
% Only the following params have default values:
%
Expand All @@ -20,9 +22,8 @@
% debugMsgLocation 1
% computationVersion 'Looping'
%
% Required params:
% ----------------
% INPUT PARAMS
% The remaining params are required and must be specified:
% --------------------------------------------------------
% Param 'givenFastaFilename' (string)
% The input FASTA file should contain all sequences together, positives
% followed by negatives.
Expand Down Expand Up @@ -63,146 +64,35 @@
%
% Author: snikumbh@mpi-inf.mpg.de

totalArguments = 20;
if nargin < totalArguments
computationVersion = 'Looping';
end
if nargin < totalArguments - 1
computationVersion = 'Looping';
debugMsgLocation = 1;
end
if nargin < totalArguments - 2
computationVersion = 'Looping';
debugMsgLocation = 1;
debugLevel = 2;
end
if nargin < totalArguments - 3
computationVersion = 'Looping';
debugMsgLocation = 1;
debugLevel = 2;
whetherToVisualizeWVector = 'Yes';
end
if nargin < totalArguments - 4
computationVersion = 'Looping';
debugMsgLocation = 1;
debugLevel = 2;
whetherToVisualizeWVector = 'Yes';
whetherToPlotHeatmap = 'No';
end
if nargin < totalArguments - 5
computationVersion = 'Looping';
debugMsgLocation = 1;
debugLevel = 2;
whetherToVisualizeWVector = 'Yes';
whetherToPlotHeatmap = 'No';
nOuterFolds = 5;
end
if nargin < totalArguments - 6
computationVersion = 'Looping';
debugMsgLocation = 1;
debugLevel = 2;
whetherToVisualizeWVector = 'Yes';
whetherToPlotHeatmap = 'No';
nOuterFolds = 5;
nFolds = 10;
end
if nargin < totalArguments - 7
computationVersion = 'Looping';
debugMsgLocation = 1;
debugLevel = 2;
whetherToVisualizeWVector = 'Yes';
whetherToPlotHeatmap = 'No';
nOuterFolds = 5;
nFolds = 10;
mklNorm = 2.0;
end
if nargin < totalArguments - 8
computationVersion = 'Looping';
debugMsgLocation = 1;
debugLevel = 2;
whetherToVisualizeWVector = 'Yes';
whetherToPlotHeatmap = 'No';
nOuterFolds = 5;
nFolds = 10;
mklNorm = 2.0;
Cs = 10.^[-3:1:3];
end
if nargin < totalArguments - 9
computationVersion = 'Looping';
debugMsgLocation = 1;
debugLevel = 2;
whetherToVisualizeWVector = 'Yes';
whetherToPlotHeatmap = 'No';
nOuterFolds = 5;
nFolds = 10;
mklNorm = 2.0;
Cs = 10.^[-3:1:3];
sigmaVals = 10.^[-1:1:2];
end
if nargin < totalArguments - 10
computationVersion = 'Looping';
debugMsgLocation = 1;
debugLevel = 2;
whetherToVisualizeWVector = 'Yes';
whetherToPlotHeatmap = 'No';
nOuterFolds = 5;
nFolds = 10;
mklNorm = 2.0;
Cs = 10.^[-3:1:3];
sigmaVals = 10.^[-1:1:2];
nClusterVals = [2 5];
end
if nargin < totalArguments - 11
computationVersion = 'Looping';
debugMsgLocation = 1;
debugLevel = 2;
whetherToVisualizeWVector = 'Yes';
whetherToPlotHeatmap = 'No';
nOuterFolds = 5;
nFolds = 10;
mklNorm = 2.0;
Cs = 10.^[-3:1:3];
sigmaVals = 10.^[-1:1:2];
nClusterVals = [2 5];
segmentSizeInBps = 100;
end
if nargin < totalArguments - 12
computationVersion = 'Looping';
debugMsgLocation = 1;
debugLevel = 2;
whetherToVisualizeWVector = 'Yes';
whetherToPlotHeatmap = 'No';
nOuterFolds = 5;
nFolds = 10;
mklNorm = 2.0;
Cs = 10.^[-3:1:3];
sigmaVals = 10.^[-1:1:2];
nClusterVals = [2 5];
segmentSizeInBps = 100;
maxDist = 50;
end
if nargin < totalArguments - 13
computationVersion = 'Looping';
debugMsgLocation = 1;
debugLevel = 2;
whetherToVisualizeWVector = 'Yes';
whetherToPlotHeatmap = 'No';
nOuterFolds = 5;
nFolds = 10;
mklNorm = 2.0;
Cs = 10.^[-3:1:3];
sigmaVals = 10.^[-1:1:2];
nClusterVals = [2 5];
segmentSizeInBps = 100;
maxDist = 50;
oligoLen = [2];
end
% Load all run parameters from the config file (default name: config-comik.txt).
% The wrapper accepts exactly one argument: the config filename. Every check
% below fails fast with an explicit error, so that configParams is guaranteed
% to be defined once this section completes.
if nargin ~= 1
    error('Specify only the config filename!');
end
if ~ischar(configFile)
    error('Error.\nInput must be a char, not a %s.', class(configFile));
end
if exist(configFile, 'file') ~= 2
    eMsg = 'Given config file does not exist!';
    error(eMsg);
end

fid = fopen(configFile, 'r');
if fid == -1
    % exist() can succeed for a file that is on the path but not readable
    error('Could not open config file %s for reading.', configFile);
end
% Only the first line is needed here, to verify the header; close the
% handle right away so it is not left open if a later step errors out.
l = fgetl(fid);
fclose(fid);

firstLineConfigFile = '## CoMIK CONFIG FILE';
% fgetl returns -1 (numeric) for an empty file, hence the ischar guard.
if ~ischar(l) || isempty(strfind(l, firstLineConfigFile))
    error('Config file %s does not start with the header line ''%s''.', ...
        configFile, firstLineConfigFile);
end

% configParams is a struct
configParams = readConfigFile(configFile);


if exist(outputFolder, 'dir') == 7
if exist(configParams.outputFolder, 'dir') == 7
%do-nothing
else
status = mkdir(outputFolder);
status = mkdir(configParams.outputFolder);
end

% -- Perform different outer folds with parfor, asynchronously
Expand All @@ -211,13 +101,13 @@

% Set seed for the random number generator
rng(21, 'twister');
posIndices = randperm(nPosSequences);
negIndices = randperm(nNegSequences);
posIndices = randperm(configParams.nPosSequences);
negIndices = randperm(configParams.nNegSequences);

posTestIndices = intersect(testIndices, posIndices);
negTestIndices = setdiff(testIndices, posIndices) - nPosSequences;
posTestIndices = intersect(configParams.testIndices, posIndices);
negTestIndices = setdiff(configParams.testIndices, posIndices) - configParams.nPosSequences;

thisTestIndices{1} = testIndices;
thisTestIndices{1} = configParams.testIndices;
% nOuterFolds is not a function argument; it must be read from the config struct
for i=2:configParams.nOuterFolds
thisPosTestIndices = circshift(posIndices, [0, (i-1)*length(posTestIndices)]);
thisNegTestIndices = circshift(negIndices, [0, (i-1)*length(negTestIndices)]) + length(posIndices);
Expand All @@ -226,25 +116,26 @@
%% Conditionally create a new pool; delete any existing pool of workers if NumWorkers is less than required, else use the same poolobj
poolobj = gcp('nocreate');
if isempty(poolobj)
parpool('local', nOuterFolds);
parpool('local', configParams.nOuterFolds);
else
if poolobj.NumWorkers >= nOuterFolds
logMessages(1, sprintf('Reusing existing parallel pool: %d out of %d workers\n', nOuterFolds, poolobj.NumWorkers), debugLevel);
if poolobj.NumWorkers >= configParams.nOuterFolds
logMessages(1, sprintf('Reusing existing parallel pool: %d out of %d workers\n', configParams.nOuterFolds, poolobj.NumWorkers), configParams.debugLevel);
else
logMessages(1, sprintf('Not enough workers in the existing parallel pool.\nDeleting existing pool with %d workers\n', poolobj.NumWorkers), debugLevel);
logMessages(1, sprintf('Not enough workers in the existing parallel pool.\nDeleting existing pool with %d workers\n', poolobj.NumWorkers), configParams.debugLevel);
delete(poolobj);
parpool('local', nOuterFolds);
parpool('local', configParams.nOuterFolds);
end
end
parfor (i=1:nOuterFolds, nOuterFolds)
thisFoldDirectory = strcat(outputFolder, '/outer_fold_', num2str(i));
returnStatusVal = perform_one_outer_fold_for_comik(i, givenPosFastaFilename, givenNegFastaFilename, nPosSequences, nNegSequences, oligoLen, ...
maxDist, segmentSizeInBps, nClusterVals, sigmaVals, Cs, mklNorm, nFolds, ...
thisTestIndices{i}, debugLevel, thisFoldDirectory, whetherToPlotHeatmap, ...
computationVersion, whetherToVisualizeWVector, debugMsgLocation);
logMessages(1, sprintf('\nOuter fold %d: Status %s\n', i, returnStatusVal), debugLevel);
parfor (i=1:configParams.nOuterFolds, configParams.nOuterFolds)
thisFoldDirectory = strcat(configParams.outputFolder, '/outer_fold_', num2str(i));
returnStatusVal = perform_one_outer_fold_for_comik(i, configParams.givenPosFastaFilename, configParams.givenNegFastaFilename, configParams.nPosSequences, ...
configParams.nNegSequences, configParams.oligoLen, configParams.maxDist, configParams.segmentSizeInBps, ...
configParams.nClusterVals, configParams.sigmaVals, configParams.Cs, configParams.mklNorm, configParams.nFolds, ...
thisTestIndices{i}, configParams.debugLevel, thisFoldDirectory, configParams.whetherToPlotHeatmap, ...
configParams.computationVersion, configParams.whetherToVisualizeWVector, configParams.debugMsgLocation);
logMessages(1, sprintf('\nOuter fold %d: Status %s\n', i, returnStatusVal), configParams.debugLevel);
end
logMessages(1, sprintf('All outer folds completed!\n'), debugLevel);
logMessages(1, sprintf('All outer folds completed!\n'), configParams.debugLevel);


end % comik_wrapper function ends
Expand Down
29 changes: 29 additions & 0 deletions config-comik.txt
@@ -0,0 +1,29 @@
## CoMIK CONFIG FILE

## Required Input
POSITIVE_FASTA_FILE=./sample_data/simulated_dataset1/pos.fasta
NEGATIVE_FASTA_FILE=./sample_data/simulated_dataset1/neg.fasta
NUMBER_OF_POSITIVES=600
NUMBER_OF_NEGATIVES=600
TEST_INDICES=[501:600 1101:1200]
OUTPUT_FOLDER=comik_run_simulated_dataset1

## ODH requirements
OLIGO_LEN=[2 3]
MAX_DIST=10

## For CoMIK
SEGMENT_SIZE_IN_BPS=10
NUMBER_OF_CLUSTERS=[2 5 7]
SIGMA_VALUES=10.^[1:1:2]
COST_VALUES=10.^[-3:1:3]
## Above are params which are required
## Further, params with otherwise default values
MKL_NORM=2.0
NUMBER_OF_INNER_FOLDS=10
NUMBER_OF_OUTER_FOLDS=5
WHETHER_TO_PLOT_HEATMAP=No
WHETHER_TO_VISUALIZE_WEIGHT_VECTOR=Yes
DEBUG_LEVEL=0
DEBUG_MSG_LOCATION=runLog.txt
COMPUTATION_VERSION=Looping
2 changes: 1 addition & 1 deletion getExpansionPoints.m
Expand Up @@ -35,7 +35,7 @@
rng('default');
logMessages(debugMsgLocation, sprintf('with K-means: \n'), debugLevel);
if debugLevel == 2
[idx, expansionPoints, sumD, matrixOfDistancesFromCentres] = kmeans(X, nClusters, 'Replicates', rep, 'Display','on');%, 'MaxIter',1000);%Display: final
[idx, expansionPoints, sumD, matrixOfDistancesFromCentres] = kmeans(X, nClusters, 'Replicates', rep, 'Display','Iter');%, 'MaxIter',1000);%Display: final
elseif debugLevel == 0
[idx, expansionPoints, sumD, matrixOfDistancesFromCentres] = kmeans(X, nClusters, 'Replicates', rep, 'Display','off');%, 'MaxIter',1000);
end
Expand Down

0 comments on commit 982dce4

Please sign in to comment.