diff --git a/comik_main_with_weight_vector.m b/comik_main_with_weight_vector.m index 9f35df4..7887379 100644 --- a/comik_main_with_weight_vector.m +++ b/comik_main_with_weight_vector.m @@ -1,9 +1,97 @@ function [allSeqsAsBags, allSeqsConformedSetKernel, subkernelWeights, thetaVals, instanceWeightsInEachBag, resultString, bestParamComb, test_teAUROC, test_teAUPRC, predictions] = comik_main_with_weight_vector(givenPosFastaFilename, givenNegFastaFilename, givenNPos, givenNNeg, oligoLen, maxDist, segmentSizeInBps, nClusterVals, sigmaVals, Cs, mklNorm, nFolds, testIndices, debugLevel, debugMsgLocation, outputFolder, runSummaryFilename, whetherToPlotHeatmap, computationVersion, whetherToVisualizeWVector) -% Author: snikumbh +% COMIK_MAIN_WITH_WEIGHT_VECTOR +% Main function for CoMIK +% +% INPUT PARAMS +% Param 'givenPosFastaFilename' +% Name of FASTA file with positive examples +% +% Param 'givenNegFastaFilename' +% Name of FASTA file with negative examples +% +% Param 'givenNPos' +% Number of positive examples to use (can be less than those provided in file) +% +% Param 'givenNNeg' +% Number of negative examples to use (can be less than those provided in file) +% +% Params 'oligoLen' and 'maxDist' +% Specify the oligomer length and the maximum distance for the ODH +% representation. Caution: A combination of large values can be memory intensive! +% +% Param 'segmentSizeInBps' +% Specify the segment-size in basepairs for CoMIK +% +% Param 'nClusterVals' (vector) +% Specify the number of clusters for CoMIK +% +% Param 'sigmaVals' (vector) +% Specify the sigma values for the Gaussian transformation +% +% Param 'Cs' (vector) +% Cost values for SVM +% +% Param 'mklNorm' +% Specify p-norm value for MKL +% +% Param 'nFolds' +% Specify number of inner cross-validation folds +% +% Param 'testIndices' (vector) +% Indices of the sequences in the FASTA file which are to be considered as unseen +% test examples.
For example, with a FASTA file containing a total of 100 +% positive sequences followed by 100 negative sequences, the test indices +% are given as +% testIndices = [81:100 181:200] +% for the corresponding 20 positives and 20 negatives to be treated as test +% examples. +% +% Param 'debugLevel' (0/1/2) % -% All segments, shifted and non-shifted in one bag; Order of segments: non-shifted segments followed by shifted segments. -% This results in #kernels = #clusterCentres +% Param 'debugMsgLocation' (1/fileID) +% +% Param 'outputFolder' +% Specify name of folder to write output results to +% +% OUTPUT PARAMS +% Param 'allSeqsAsBags' +% All sequences represented as bags +% +% Param 'allSeqsConformedSetKernel' (matrix) +% The conformally transformed MI kernel +% +% Param 'subkernelWeights' (vector) +% Weights assigned to each sub-kernel upon solving MKL +% +% Param 'thetaVals' (vector) +% Theta values used for obtaining the instance weights +% +% Param 'instanceWeightsInEachBag' +% Weights assigned to instances in each bag using the final model +% +% Param 'resultString' +% A combined string of per iteration results (as printed in the resultSummary file) +% +% Param 'bestParamComb' (Struct) +% Best performing values for various params +% +% Param 'test_teAUROC' +% AUROC value for the test sequences +% +% Param 'test_teAUPRC' +% AUPRC value for the test sequences +% +% Param 'predictions' +% Prediction vector +% +% ADDITIONAL NOTES +% -- Segmentation +% . All segments, shifted and non-shifted in one bag; Order of segments: non-shifted segments followed by shifted segments. 
+% This results in #kernels = #clusterCentres +% -- +% +% Author: snikumbh % % debugLevel 2 prints all messages % debugLevel 1 place holder level, may be used in the future diff --git a/comik_wrapper.m b/comik_wrapper.m index 7a50157..9f3f3fe 100644 --- a/comik_wrapper.m +++ b/comik_wrapper.m @@ -22,37 +22,46 @@ % % Required params: % ---------------- +% INPUT PARAMS +% Param 'givenFastFilename' (string) +% The input FASTA file should contain all sequences together, positives +% followed by negatives. % -% Param 'givenFastFilename: The input FASTA file should contain all sequences -% together -- positives followed by negatives. +% Params 'nPosSequences' and 'nNegSequences' +% Used to specify the number of positive and negative sequences in the dataset. % -% Params 'nPosSequences' and 'nNegSequences' are used to specify the number of -% positive and negative sequences in the dataset. -% -% Param 'testIndices' is a Matlab vector specifying the indices of the -% sequences in athe FASTA file which are to be considered as unseen -% test examples. For example, with a FASTA file containing a total of 100 -% positive sequences followed by 100 negative sequences, the test indices -% are given as +% Param 'testIndices' (vector) +% Indices of the sequences in the FASTA file which are to be considered as unseen +% test examples. For example, with a FASTA file containing a total of 100 +% positive sequences followed by 100 negative sequences, the test indices +% are given as % testIndices = [81:100 181:200] -% for the corresponding 20 positives and 20 negatives to be treated as -% test examples. +% for the corresponding 20 positives and 20 negatives to be treated as test +% examples. % -% Param 'outputFolder': Specifies the path on disk where output can be written +% Param 'outputFolder' (string) +% Specifies the path on disk where output can be written % -% Params 'oligoLen' and 'maxDist': Specify the oligomer length and the maximum -% distance for the ODH representation. 
Caution: A combination of large -% values can be memory intensive! +% Params 'oligoLen' and 'maxDist' +% Specify the oligomer length and the maximum distance for the ODH +% representation. Caution: A combination of large values can be memory intensive! % -% Param 'SegmentSizeInBps': Specify the segment-size in basepairs for CoMIKL +% Param 'segmentSizeInBps' +% Specify the segment-size in basepairs for CoMIK % -% Param 'nClusterVals': Specify the number of clusters for CoMIKL +% Param 'nClusterVals' +% Specify the number of clusters for CoMIK % -% Param 'sigmaVals': Specify the sigma values for the Gaussian transformation +% Param 'sigmaVals' +% Specify the sigma values for the Gaussian transformation +% +% Param 'Cs' +% Cost values for SVM % -% Param 'Cs': Cost values for SVM +% ADDITIONAL NOTES +% % -% Author: snikumbh +% Author: snikumbh@mpi-inf.mpg.de totalArguments = 20; if nargin < totalArguments diff --git a/computeConformedMultiInstanceKernel.m b/computeConformedMultiInstanceKernel.m index 383a289..0c32ab1 100644 --- a/computeConformedMultiInstanceKernel.m +++ b/computeConformedMultiInstanceKernel.m @@ -4,59 +4,59 @@ % Computes the conformally transformed multi-instance kernel % INPUT PARAMS -% param 'instanceStarts' (vector) +% Param 'instanceStarts' (vector) % Indices where instances (segments) of any sequence begin % -% param 'instanceEnds' (vector) +% Param 'instanceEnds' (vector) % Indices where instances (segments) of any sequence end % -% param 'instanceWideKernel' (matrix) +% Param 'instanceWideKernel' (matrix) % The instance-wide kernel matrix of dimension #Instances x #Instances % -% param 'trainIndices' (vector) +% Param 'trainIndices' (vector) % Indices of the training sequences % -% param 'allSeqsAsBags' (cell array) +% Param 'allSeqsAsBags' (cell array) % Collection of all segments per sequence % -% param 'nClusters' +% Param 'nClusters' % number of clusters for k-means % -% param 'sig' +% Param 'sig' % Gaussian bandwidth (sigma) value for the 
Gaussian transformation % -% param 'Y' (vector) +% Param 'Y' (vector) % Vector of labels % -% param 'computationVersion' ('AccumArray'/'Looping') +% Param 'computationVersion' ('AccumArray'/'Looping') % Flag specifying what approach to use for computing the kernel % -% param 'debugLevel' (0/1/2) +% Param 'debugLevel' (0/1/2) % -% param 'debugMsgLocation' (1/fileID) +% Param 'debugMsgLocation' (1/fileID) % % % OUTPUT PARAMS -% param 'conformedMultiInstanceKernel' (matrix) +% Param 'conformedMultiInstanceKernel' (matrix) % Normalized conformally transformed MI kernel % -% param 'rawConformedMultiInstanceKernel' (matrix) +% Param 'rawConformedMultiInstanceKernel' (matrix) % Unnormalized conformally transformed MI kernel % -% param 'instancesTransformationKernel' (matrix) +% Param 'instancesTransformationKernel' (matrix) % Transformation of the training instances % -% param 'clusterCentres' (matrix) +% Param 'clusterCentres' (matrix) % Cluster centres obtained upon k-means % -% ADDITIONAL NOTES: +% ADDITIONAL NOTES % -- Stratification % . If stratification is required, see below, set stratified = 1 % -- Replicates for k-means % . currently set to 1 % . 
can be increased % -% author: snikumbh@mpi-inf.mpg.de +% Author: snikumbh@mpi-inf.mpg.de % 1.1 Obtain expansion points diff --git a/computeInstanceWideKernel.m b/computeInstanceWideKernel.m index a6ae7f4..5126b25 100644 --- a/computeInstanceWideKernel.m +++ b/computeInstanceWideKernel.m @@ -4,33 +4,33 @@ % Computes the instance-wide kernel matrix given a bag of instances % % INPUT PARAMS -% param 'allSeqsAsBags' (cell array) +% Param 'allSeqsAsBags' (cell array) % Collection of all segments per sequence % -% param 'thisInstances' (vector) +% Param 'thisInstances' (vector) % #instances in each bag (in other words, #segments per sequence) % -% param 'sparseComputation' (0/1) +% Param 'sparseComputation' (0/1) % Flag specifying whether sparseComputation is to be performed (0/1) % -% param 'debugLevel' (0/1/2) +% Param 'debugLevel' (0/1/2) % -% param 'debugMsgLocation' (1/fileID) +% Param 'debugMsgLocation' (1/fileID) % % % OUTPUT PARAMS -% param 'instanceStarts' (vector) +% Param 'instanceStarts' (vector) % Indexes in the vector where instances (segments) of any sequence begin % -% param 'instanceEnds' (vector) +% Param 'instanceEnds' (vector) % Indexes in the vector where istances (segments) of any sequence end % -% param 'instanceWideKernel' (matrix) +% Param 'instanceWideKernel' (matrix) % The instance-wide kernel matrix of dimension #Instances x #Instances % -% ADDITIONAL NOTES: +% ADDITIONAL NOTES % -% author: snikumbh@mpi-inf.mpg.de +% Author: snikumbh@mpi-inf.mpg.de if nargin < 3 debugMsgLocation = 1;